View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   
5   import java.io.InputStream;
6   import java.io.IOException;
7   
8   import org.textmining.text.extraction.WordExtractor;
9   import org.apache.log4j.Category;
10  
11  public class MsWordContentExtractor extends BaseAttachmentContentExtractor
12  {
13      public static final Category log = Category.getInstance(MsWordContentExtractor.class);
14  
15      private static final String[] CONTENT_TYPES = { "application/msword" };
16      private static final String[] EXTENSIONS = { "doc" };
17  
18      protected String[] getMatchingContentTypes()
19      {
20          return CONTENT_TYPES;
21      }
22  
23      protected String[] getMatchingFileExtensions()
24      {
25          return EXTENSIONS;
26      }
27  
28      protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
29      {
30          WordExtractor extractor = new WordExtractor();
31          try
32          {
33              return extractor.extractText(is);
34          }
35          catch (Exception e)
36          {
37              throw new ExtractorException("Error reading content of Word document: " + e.getMessage(), e);
38          }
39      }
40  }