1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.Searchable;
4 import com.atlassian.bonnie.search.Extractor;
5 import com.atlassian.bonnie.search.SearchableAttachment;
6 import org.apache.commons.io.IOUtils;
7 import org.apache.lucene.document.Document;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10
11 import java.io.IOException;
12 import java.io.InputStream;
13
14 public abstract class BaseAttachmentContentExtractor implements Extractor
15 {
16 private static final Logger log = LoggerFactory.getLogger(BaseAttachmentContentExtractor.class);
17
18 public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable)
19 {
20 if (!(searchable instanceof SearchableAttachment)) return;
21 if (document.getField("contentBody") != null) return;
22 if (defaultSearchableText.length() != 0) return;
23
24 SearchableAttachment attachment = (SearchableAttachment) searchable;
25 String fileName = (attachment.getFileName() == null) ? "" : attachment.getFileName().toLowerCase();
26 String contentType = attachment.getContentType();
27 if (!shouldExtractFrom(fileName, contentType)) return;
28
29 InputStream is = null;
30 try
31 {
32 is = attachment.getContentsAsStream();
33
34 if (is == null)
35 {
36 log.warn("Encountered attachment with null stream: " + attachment.getFileName());
37 return;
38 }
39
40 defaultSearchableText.append(extractText(is, attachment));
41 }
42 catch (IOException e)
43 {
44 log.warn("Error reading attachment (" + attachment + ")" , e);
45 }
46 catch (ExtractorException e)
47 {
48 log.warn("Error indexing attachment (" + attachment + ")" , e);
49 }
50 catch (RuntimeException e)
51 {
52 log.warn("Error indexing attachment (" + attachment + ")" , e);
53 }
54 finally
55 {
56 IOUtils.closeQuietly(is);
57 }
58
59 }
60
61 protected boolean shouldExtractFrom(String fileName, String contentType)
62 {
63 for (int i = 0; i < getMatchingFileExtensions().length; i++)
64 {
65 if (fileName.endsWith(getMatchingFileExtensions()[i]))
66 return true;
67 }
68
69 for (int i = 0; i < getMatchingContentTypes().length; i++)
70 {
71 String validType = getMatchingContentTypes()[i];
72 if (validType.equalsIgnoreCase(contentType))
73 return true;
74 }
75
76 return false;
77 }
78
79 protected String[] getMatchingContentTypes()
80 {
81 return new String[0];
82 }
83
84 protected String[] getMatchingFileExtensions()
85 {
86 return new String[0];
87 }
88
89
90
91
92
93
94
95
96
97
98 protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException;
99 }