View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   import org.apache.commons.io.IOUtils;
5   import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
6   import org.apache.poi.hssf.eventusermodel.HSSFListener;
7   import org.apache.poi.hssf.eventusermodel.HSSFRequest;
8   import org.apache.poi.hssf.record.LabelSSTRecord;
9   import org.apache.poi.hssf.record.NumberRecord;
10  import org.apache.poi.hssf.record.Record;
11  import org.apache.poi.hssf.record.SSTRecord;
12  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
13  
14  import java.io.IOException;
15  import java.io.InputStream;
16  
17  @Deprecated
18  public class MsExcelContentExtractor extends BaseAttachmentContentExtractor
19  {
20      private static final String[] CONTENT_TYPES = {"application/excel", "application/x-excel",
21              "application/x-msexcel", "application/vnd.ms-excel"};
22      private static final String[] EXTENSIONS = {"xls"};
23  
24      private static class ExcelEventListener implements HSSFListener
25      {
26          private final StringBuffer buff;
27          private SSTRecord sstrec;
28          private static final char SPACE = ' ';
29  
30          public ExcelEventListener(StringBuffer buff)
31          {
32              this.buff = buff;
33          }
34  
35          public void processRecord(Record record)
36          {
37              switch (record.getSid())
38              {
39  				case NumberRecord.sid:
40                      NumberRecord numrec = (NumberRecord) record;
41  					final double numberValue = numrec.getValue();
42  					if (isInteger(numberValue))
43  						buff.append((int) numberValue).append(SPACE);
44  					else
45  						buff.append(numberValue).append(SPACE);
46  					break;
47                  case SSTRecord.sid: // the SSTRecord stores a set of all the strings that appear in an Excel document (that is, all strings stored in this record are unique, though they may appear multiple times in the document)
48                      sstrec = (SSTRecord) record; // store a reference to the SSTRecord so we can get at these unique strings later by index
49                      break;
50                  case LabelSSTRecord.sid:
51                      LabelSSTRecord lrec = (LabelSSTRecord) record;
52                      buff.append(sstrec.getString(lrec.getSSTIndex())).append(SPACE);
53                      break;
54  			}
55          }
56  
57  		/**
58  		 * Determines if the double is actually an integer (and hence whether it is safe to cast to an int without loss of precision)
59  		 */
60  		private boolean isInteger(double doubleValue)
61  		{
62  			double floored = Math.floor(doubleValue);
63  			return doubleValue - floored == 0;
64  		}
65  	}
66  
67      protected String[] getMatchingContentTypes()
68      {
69          return CONTENT_TYPES;
70      }
71  
72      protected String[] getMatchingFileExtensions()
73      {
74          return EXTENSIONS;
75      }
76  
77      protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
78      {
79          StringBuffer content = new StringBuffer();
80  
81          InputStream din = null;
82  
83          try
84          {
85              POIFSFileSystem poifs = new POIFSFileSystem(is);
86  
87              // get the Workbook (excel part) stream in a InputStream
88              din = poifs.createDocumentInputStream("Workbook");
89  
90              HSSFRequest req = new HSSFRequest();
91              // lazy listen for ALL records with the listener shown above
92              req.addListenerForAllRecords(new ExcelEventListener(content));
93  
94              HSSFEventFactory factory = new HSSFEventFactory();
95              factory.processEvents(req, din);
96          }
97          catch (IOException e)
98          {
99              throw new ExtractorException("Error reading content of Excel document: " + e.getMessage(), e);
100         }
101         finally
102         {
103             IOUtils.closeQuietly(din);
104         }
105 
106 
107         return content.toString();
108     }
109 }