1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.Extractor;
4 import com.atlassian.bonnie.search.SearchableAttachment;
5 import com.atlassian.bonnie.Searchable;
6 import org.apache.lucene.document.Document;
7 import org.apache.log4j.Category;
8 import org.apache.commons.io.IOUtils;
9
10 import java.io.InputStream;
11 import java.io.IOException;
12
13 public abstract class BaseAttachmentContentExtractor implements Extractor
14 {
15 public static final Category log = Category.getInstance(BaseAttachmentContentExtractor.class);
16
17 public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable)
18 {
19 if (!(searchable instanceof SearchableAttachment)) return;
20 if (document.getField("contentBody") != null) return;
21 if (defaultSearchableText.length() != 0) return;
22
23 SearchableAttachment attachment = (SearchableAttachment) searchable;
24 String fileName = (attachment.getFileName() == null) ? "" : attachment.getFileName().toLowerCase();
25 String contentType = attachment.getContentType();
26 if (!shouldExtractFrom(fileName, contentType)) return;
27
28 InputStream is = null;
29 try
30 {
31 is = attachment.getContentsAsStream();
32 defaultSearchableText.append(extractText(is, attachment));
33 }
34 catch (IOException e)
35 {
36 log.warn("Error reading attachment (" + attachment + ")" , e);
37 }
38 catch (ExtractorException e)
39 {
40 log.warn("Error indexing attachment (" + attachment + ")" , e);
41 }
42 catch (RuntimeException e)
43 {
44 log.warn("Error indexing attachment (" + attachment + ")" , e);
45 }
46 finally
47 {
48 IOUtils.closeQuietly(is);
49 }
50
51 }
52
53 protected boolean shouldExtractFrom(String fileName, String contentType)
54 {
55 for (int i = 0; i < getMatchingFileExtensions().length; i++)
56 {
57 if (fileName.endsWith(getMatchingFileExtensions()[i]))
58 return true;
59 }
60
61 for (int i = 0; i < getMatchingContentTypes().length; i++)
62 {
63 String validType = getMatchingContentTypes()[i];
64 if (validType.equalsIgnoreCase(contentType))
65 return true;
66 }
67
68 return false;
69 }
70
71 protected String[] getMatchingContentTypes()
72 {
73 return new String[0];
74 }
75
76 protected String[] getMatchingFileExtensions()
77 {
78 return new String[0];
79 }
80
81
82
83
84
85
86
87
88
89
90 protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException;
91 }