View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   import com.atlassian.bonnie.search.extractor.BaseAttachmentContentExtractor;
5   
6   import java.io.InputStream;
7   import java.io.IOException;
8   
9   import org.apache.log4j.Category;
10  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
11  import org.apache.poi.hssf.eventusermodel.HSSFRequest;
12  import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
13  import org.apache.poi.hssf.eventusermodel.HSSFListener;
14  import org.apache.poi.hssf.record.SSTRecord;
15  import org.apache.poi.hssf.record.Record;
16  import org.apache.poi.hssf.record.NumberRecord;
17  import org.apache.poi.hssf.record.LabelSSTRecord;
18  import org.apache.commons.io.IOUtils;
19  
20  public class MsExcelContentExtractor extends BaseAttachmentContentExtractor
21  {
22      public static final Category log = Category.getInstance(MsExcelContentExtractor.class);
23  
24      private static final String[] CONTENT_TYPES = {"application/excel", "application/x-excel",
25              "application/x-msexcel", "application/vnd.ms-excel"};
26      private static final String[] EXTENSIONS = {"xls"};
27  
28      private static class EventCatcher implements HSSFListener
29      {
30          private final StringBuffer buff;
31          private SSTRecord sstrec;
32          private static final char SPACE = ' ';
33  
34          public EventCatcher(StringBuffer buff)
35          {
36              this.buff = buff;
37          }
38  
39          public void processRecord(Record record)
40          {
41              switch (record.getSid())
42              {
43                  case NumberRecord.sid:
44                      NumberRecord numrec = (NumberRecord) record;
45                      buff.append(numrec.getValue() + SPACE);
46                      break;
47                      // SSTRecords store a array of unique strings used in Excel.
48                  case SSTRecord.sid:
49                      sstrec = (SSTRecord) record;
50                      for (int k = 0; k < sstrec.getNumUniqueStrings(); k++)
51                      {
52                          buff.append(sstrec.getString(k)).append(SPACE);
53                      }
54                      break;
55                  case LabelSSTRecord.sid:
56                      LabelSSTRecord lrec = (LabelSSTRecord) record;
57                      buff.append(sstrec.getString(lrec.getSSTIndex())).append(SPACE);
58                      break;
59              }
60          }
61      }
62  
63      protected String[] getMatchingContentTypes()
64      {
65          return CONTENT_TYPES;
66      }
67  
68      protected String[] getMatchingFileExtensions()
69      {
70          return EXTENSIONS;
71      }
72  
73      protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
74      {
75          StringBuffer content = new StringBuffer();
76  
77          InputStream din = null;
78  
79          try
80          {
81              POIFSFileSystem poifs = new POIFSFileSystem(is);
82  
83              // get the Workbook (excel part) stream in a InputStream
84              din = poifs.createDocumentInputStream("Workbook");
85  
86              HSSFRequest req = new HSSFRequest();
87              // lazy listen for ALL records with the listener shown above
88              req.addListenerForAllRecords(new EventCatcher(content));
89  
90              HSSFEventFactory factory = new HSSFEventFactory();
91              factory.processEvents(req, din);
92          }
93          catch (IOException e)
94          {
95              throw new ExtractorException("Error reading content of Excel document: " + e.getMessage(), e);
96          }
97          finally
98          {
99              IOUtils.closeQuietly(din);
100         }
101 
102 
103         return content.toString();
104     }
105 }