1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.SearchableAttachment;
4 import com.atlassian.bonnie.search.extractor.BaseAttachmentContentExtractor;
5
6 import java.io.InputStream;
7 import java.io.IOException;
8
9 import org.apache.log4j.Category;
10 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
11 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
12 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
13 import org.apache.poi.hssf.eventusermodel.HSSFListener;
14 import org.apache.poi.hssf.record.SSTRecord;
15 import org.apache.poi.hssf.record.Record;
16 import org.apache.poi.hssf.record.NumberRecord;
17 import org.apache.poi.hssf.record.LabelSSTRecord;
18 import org.apache.commons.io.IOUtils;
19
20 public class MsExcelContentExtractor extends BaseAttachmentContentExtractor
21 {
22 public static final Category log = Category.getInstance(MsExcelContentExtractor.class);
23
24 private static final String[] CONTENT_TYPES = {"application/excel", "application/x-excel",
25 "application/x-msexcel", "application/vnd.ms-excel"};
26 private static final String[] EXTENSIONS = {"xls"};
27
28 private static class EventCatcher implements HSSFListener
29 {
30 private final StringBuffer buff;
31 private SSTRecord sstrec;
32 private static final char SPACE = ' ';
33
34 public EventCatcher(StringBuffer buff)
35 {
36 this.buff = buff;
37 }
38
39 public void processRecord(Record record)
40 {
41 switch (record.getSid())
42 {
43 case NumberRecord.sid:
44 NumberRecord numrec = (NumberRecord) record;
45 buff.append(numrec.getValue() + SPACE);
46 break;
47
48 case SSTRecord.sid:
49 sstrec = (SSTRecord) record;
50 for (int k = 0; k < sstrec.getNumUniqueStrings(); k++)
51 {
52 buff.append(sstrec.getString(k)).append(SPACE);
53 }
54 break;
55 case LabelSSTRecord.sid:
56 LabelSSTRecord lrec = (LabelSSTRecord) record;
57 buff.append(sstrec.getString(lrec.getSSTIndex())).append(SPACE);
58 break;
59 }
60 }
61 }
62
63 protected String[] getMatchingContentTypes()
64 {
65 return CONTENT_TYPES;
66 }
67
68 protected String[] getMatchingFileExtensions()
69 {
70 return EXTENSIONS;
71 }
72
73 protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
74 {
75 StringBuffer content = new StringBuffer();
76
77 InputStream din = null;
78
79 try
80 {
81 POIFSFileSystem poifs = new POIFSFileSystem(is);
82
83
84 din = poifs.createDocumentInputStream("Workbook");
85
86 HSSFRequest req = new HSSFRequest();
87
88 req.addListenerForAllRecords(new EventCatcher(content));
89
90 HSSFEventFactory factory = new HSSFEventFactory();
91 factory.processEvents(req, din);
92 }
93 catch (IOException e)
94 {
95 throw new ExtractorException("Error reading content of Excel document: " + e.getMessage(), e);
96 }
97 finally
98 {
99 IOUtils.closeQuietly(din);
100 }
101
102
103 return content.toString();
104 }
105 }