1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.Searchable;
4   import com.atlassian.bonnie.search.Extractor;
5   import com.atlassian.bonnie.search.SearchableAttachment;
6   import org.apache.commons.io.IOUtils;
7   import org.apache.lucene.document.Document;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.IOException;
12  import java.io.InputStream;
13  
14  public abstract class BaseAttachmentContentExtractor implements Extractor
15  {
16      private static final Logger log = LoggerFactory.getLogger(BaseAttachmentContentExtractor.class);
17  
18      public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable)
19      {
20          if (!(searchable instanceof SearchableAttachment)) return;
21          if (document.getField("contentBody") != null) return;
22          if (defaultSearchableText.length() != 0) return;
23  
24          SearchableAttachment attachment = (SearchableAttachment) searchable;
25          String fileName = (attachment.getFileName() == null) ? "" : attachment.getFileName().toLowerCase();
26          String contentType = attachment.getContentType();
27          if (!shouldExtractFrom(fileName, contentType)) return;
28  
29          InputStream is = null;
30          try
31          {
32              is = attachment.getContentsAsStream();
33  
34  			if (is == null)
35  			{
36  				log.warn("Encountered attachment with null stream: " + attachment.getFileName());
37  				return; 
38  			}
39  
40  			defaultSearchableText.append(extractText(is, attachment));
41          }
42          catch (IOException e)
43          {
44              log.warn("Error reading attachment ("  + attachment + ")" , e);
45          }
46          catch (ExtractorException e)
47          {
48              log.warn("Error indexing attachment ("  + attachment + ")" , e);
49          }
50          catch (RuntimeException e)
51          {
52              log.warn("Error indexing attachment ("  + attachment + ")" , e);
53          }
54          finally
55          {
56              IOUtils.closeQuietly(is);
57          }
58  
59      }
60  
61      protected boolean shouldExtractFrom(String fileName, String contentType)
62      {
63          for (int i = 0; i < getMatchingFileExtensions().length; i++)
64          {
65              if (fileName.endsWith(getMatchingFileExtensions()[i]))
66                  return true;
67          }
68  
69          for (int i = 0; i < getMatchingContentTypes().length; i++)
70          {
71              String validType = getMatchingContentTypes()[i];
72              if (validType.equalsIgnoreCase(contentType))
73                  return true;
74          }
75  
76          return false;
77      }
78  
79      protected String[] getMatchingContentTypes()
80      {
81          return new String[0];
82      }
83  
84      protected String[] getMatchingFileExtensions()
85      {
86          return new String[0];
87      }
88  
89      /**
90       * Package access for unit testing only. Do not use this method directly. Use #addFields().
91       *
92       * @param is         a stream containing the attachment contents
93       * @param attachment contains useful attachment metadata, e.g. filename
94       * @return a String with a textual representation of the attachment's contents
95       * @throws ExtractorException if there is a problem with converting the attachment content into text. A wrapper
96       *                            around the original exception.
97       */
98      protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException;
99  }