View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   
5   import java.io.InputStream;
6   import java.io.IOException;
7   import java.io.ByteArrayOutputStream;
8   
9   import org.apache.log4j.Category;
10  import org.apache.poi.poifs.eventfilesystem.POIFSReader;
11  import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
12  import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
13  import org.apache.poi.util.LittleEndian;
14  
15  public class MsPowerpointContentExtractor extends BaseAttachmentContentExtractor {
16      static final String MASTER_TITLE_STYLE = "Click to edit Master title style";
17      static final String MASTER_TEXT_STYLE = "Click to edit Master text styles";
18  
19      public static final Category log = Category.getInstance(MsPowerpointContentExtractor.class);
20  
21      private static final String[] CONTENT_TYPES = {"application/powerpoint", "application/mspowerpoint",
22              "application/x-mspowerpoint", "application/vnd.ms-powerpoint"};
23      private static final String[] EXTENSIONS = {"ppt"};
24  
25      private static class PowerPointListener implements POIFSReaderListener {
26          private final StringBuffer buff;
27          private static final String SPACE = " ";
28  
29          public PowerPointListener(StringBuffer buff) {
30              this.buff = buff;
31          }
32  
33          public void processPOIFSReaderEvent(POIFSReaderEvent event) {
34              try {
35  
36                  if ("PowerPoint Document".equals(event.getName())) {
37                      org.apache.poi.poifs.filesystem.DocumentInputStream dis = event.getStream();
38                      ByteArrayOutputStream bos = new ByteArrayOutputStream();
39                      try {
40                          byte[] btoWrite = new byte[dis.available()];
41                          int bytesRead = dis.read(btoWrite, 0, dis.available());
42  
43                          if (log.isDebugEnabled() && bytesRead != btoWrite.length)
44                              log.debug("Bytes read from powerpoint file != bytes available");
45                          processContent(btoWrite, 0, btoWrite.length, false, false);
46                      }
47  
48  
49                      finally {
50                          dis.close();
51                          bos.close();
52                      }
53                  }
54              }
55              catch (Exception e)
56              {
57                  log.error("Error extracting PowerPoint text: " + e, e);
58              }
59          }
60  
61          private void processContent(byte[] buffer, int beginIndex, int endIndex, boolean masterTitleFound,
62                                      boolean masterStylesFound) {
63              while (beginIndex < endIndex) {
64                  int containerFlag = LittleEndian.getUShort(buffer, beginIndex);
65                  int recordType = LittleEndian.getUShort(buffer, beginIndex + 2);
66                  long recordLength = LittleEndian.getUInt(buffer, beginIndex + 4);
67                  beginIndex += 8;
68                  if ((containerFlag & 0x0f) == 0x0f) {
69                      processContent(buffer, beginIndex, beginIndex + (int) recordLength, masterTitleFound,
70                              masterStylesFound);
71                  } else if (recordType == 4008) {
72                      if (recordLength > 10000) {
73                          log.warn("Document appears to have invalid record length of  " +
74                                  recordLength + " for data segment. Document may be corrupted");
75                          continue;
76                      }
77                      String str = new String(buffer, beginIndex, (int) recordLength);
78                      boolean outputText = true;
79                      if (!masterTitleFound && str.startsWith(MASTER_TITLE_STYLE)) {
80                          masterTitleFound = true;
81                          outputText = false;
82                      }
83                      // don't include the master styles slide
84                      if (!masterStylesFound && str.startsWith(MASTER_TEXT_STYLE)) {
85                          masterStylesFound = true;
86                          outputText = false;
87                      }
88                      if (outputText) {
89                          buff.append(str);
90                          buff.append(SPACE);
91                      }
92                  }
93                  beginIndex += (int) recordLength;
94              }
95          }
96      }
97  
98      protected String[] getMatchingContentTypes() {
99          return CONTENT_TYPES;
100     }
101 
102     protected String[] getMatchingFileExtensions() {
103         return EXTENSIONS;
104     }
105 
106     protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException {
107         StringBuffer content = new StringBuffer();
108 
109         POIFSReader r = new POIFSReader();
110 
111         /* Register a listener for *all* documents. */
112         r.registerListener(new PowerPointListener(content));
113 
114         try
115         {
116             r.read(is);
117         }
118         catch (IOException e)
119         {
120             throw new ExtractorException("Error reading content of Powerpoint document: " + e.getMessage(), e);
121 
122         }
123 
124         return content.toString();
125     }
126 }