1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.SearchableAttachment;
4
5 import java.io.InputStream;
6 import java.io.IOException;
7
8 import org.textmining.text.extraction.WordExtractor;
9 import org.apache.log4j.Category;
10
11 public class MsWordContentExtractor extends BaseAttachmentContentExtractor
12 {
13 public static final Category log = Category.getInstance(MsWordContentExtractor.class);
14
15 private static final String[] CONTENT_TYPES = { "application/msword" };
16 private static final String[] EXTENSIONS = { "doc" };
17
18 protected String[] getMatchingContentTypes()
19 {
20 return CONTENT_TYPES;
21 }
22
23 protected String[] getMatchingFileExtensions()
24 {
25 return EXTENSIONS;
26 }
27
28 protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
29 {
30 WordExtractor extractor = new WordExtractor();
31 try
32 {
33 return extractor.extractText(is);
34 }
35 catch (Exception e)
36 {
37 throw new ExtractorException("Error reading content of Word document: " + e.getMessage(), e);
38 }
39 }
40 }