View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.Extractor;
4   import com.atlassian.bonnie.search.SearchableAttachment;
5   import com.atlassian.bonnie.Searchable;
6   import org.apache.lucene.document.Document;
7   import org.apache.log4j.Category;
8   import org.apache.commons.io.IOUtils;
9   
10  import java.io.InputStream;
11  import java.io.IOException;
12  
13  public abstract class BaseAttachmentContentExtractor implements Extractor
14  {
15      public static final Category log = Category.getInstance(BaseAttachmentContentExtractor.class);
16  
17      public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable)
18      {
19          if (!(searchable instanceof SearchableAttachment)) return;
20          if (document.getField("contentBody") != null) return;
21          if (defaultSearchableText.length() != 0) return;
22  
23          SearchableAttachment attachment = (SearchableAttachment) searchable;
24          String fileName = (attachment.getFileName() == null) ? "" : attachment.getFileName().toLowerCase();
25          String contentType = attachment.getContentType();
26          if (!shouldExtractFrom(fileName, contentType)) return;
27  
28          InputStream is = null;
29          try
30          {
31              is = attachment.getContentsAsStream();
32              defaultSearchableText.append(extractText(is, attachment));
33          }
34          catch (IOException e)
35          {
36              log.warn("Error reading attachment ("  + attachment + ")" , e);
37          }
38          catch (ExtractorException e)
39          {
40              log.warn("Error indexing attachment ("  + attachment + ")" , e);
41          }
42          catch (RuntimeException e)
43          {
44              log.warn("Error indexing attachment ("  + attachment + ")" , e);
45          }
46          finally
47          {
48              IOUtils.closeQuietly(is);
49          }
50  
51      }
52  
53      protected boolean shouldExtractFrom(String fileName, String contentType)
54      {
55          for (int i = 0; i < getMatchingFileExtensions().length; i++)
56          {
57              if (fileName.endsWith(getMatchingFileExtensions()[i]))
58                  return true;
59          }
60  
61          for (int i = 0; i < getMatchingContentTypes().length; i++)
62          {
63              String validType = getMatchingContentTypes()[i];
64              if (validType.equalsIgnoreCase(contentType))
65                  return true;
66          }
67  
68          return false;
69      }
70  
71      protected String[] getMatchingContentTypes()
72      {
73          return new String[0];
74      }
75  
76      protected String[] getMatchingFileExtensions()
77      {
78          return new String[0];
79      }
80  
81      /**
82       * Package access for unit testing only. Do not use this method directly. Use #addFields().
83       *
84       * @param is         a stream containing the attachment contents
85       * @param attachment contains useful attachment metadata, e.g. filename
86       * @return a String with a textual representation of the attachment's contents
87       * @throws ExtractorException if there is a problem with converting the attachment content into text. A wrapper
88       *                            around the original exception.
89       */
90      protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException;
91  }