1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.SearchableAttachment;
4
5 import java.io.InputStream;
6 import java.io.IOException;
7 import java.io.ByteArrayOutputStream;
8
9 import org.apache.log4j.Category;
10 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
11 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
12 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
13 import org.apache.poi.util.LittleEndian;
14
15 public class MsPowerpointContentExtractor extends BaseAttachmentContentExtractor {
16 static final String MASTER_TITLE_STYLE = "Click to edit Master title style";
17 static final String MASTER_TEXT_STYLE = "Click to edit Master text styles";
18
19 public static final Category log = Category.getInstance(MsPowerpointContentExtractor.class);
20
21 private static final String[] CONTENT_TYPES = {"application/powerpoint", "application/mspowerpoint",
22 "application/x-mspowerpoint", "application/vnd.ms-powerpoint"};
23 private static final String[] EXTENSIONS = {"ppt"};
24
25 private static class PowerPointListener implements POIFSReaderListener {
26 private final StringBuffer buff;
27 private static final String SPACE = " ";
28
29 public PowerPointListener(StringBuffer buff) {
30 this.buff = buff;
31 }
32
33 public void processPOIFSReaderEvent(POIFSReaderEvent event) {
34 try {
35
36 if ("PowerPoint Document".equals(event.getName())) {
37 org.apache.poi.poifs.filesystem.DocumentInputStream dis = event.getStream();
38 ByteArrayOutputStream bos = new ByteArrayOutputStream();
39 try {
40 byte[] btoWrite = new byte[dis.available()];
41 int bytesRead = dis.read(btoWrite, 0, dis.available());
42
43 if (log.isDebugEnabled() && bytesRead != btoWrite.length)
44 log.debug("Bytes read from powerpoint file != bytes available");
45 processContent(btoWrite, 0, btoWrite.length, false, false);
46 }
47
48
49 finally {
50 dis.close();
51 bos.close();
52 }
53 }
54 }
55 catch (Exception e)
56 {
57 log.error("Error extracting PowerPoint text: " + e, e);
58 }
59 }
60
61 private void processContent(byte[] buffer, int beginIndex, int endIndex, boolean masterTitleFound,
62 boolean masterStylesFound) {
63 while (beginIndex < endIndex) {
64 int containerFlag = LittleEndian.getUShort(buffer, beginIndex);
65 int recordType = LittleEndian.getUShort(buffer, beginIndex + 2);
66 long recordLength = LittleEndian.getUInt(buffer, beginIndex + 4);
67 beginIndex += 8;
68 if ((containerFlag & 0x0f) == 0x0f) {
69 processContent(buffer, beginIndex, beginIndex + (int) recordLength, masterTitleFound,
70 masterStylesFound);
71 } else if (recordType == 4008) {
72 if (recordLength > 10000) {
73 log.warn("Document appears to have invalid record length of " +
74 recordLength + " for data segment. Document may be corrupted");
75 continue;
76 }
77 String str = new String(buffer, beginIndex, (int) recordLength);
78 boolean outputText = true;
79 if (!masterTitleFound && str.startsWith(MASTER_TITLE_STYLE)) {
80 masterTitleFound = true;
81 outputText = false;
82 }
83
84 if (!masterStylesFound && str.startsWith(MASTER_TEXT_STYLE)) {
85 masterStylesFound = true;
86 outputText = false;
87 }
88 if (outputText) {
89 buff.append(str);
90 buff.append(SPACE);
91 }
92 }
93 beginIndex += (int) recordLength;
94 }
95 }
96 }
97
98 protected String[] getMatchingContentTypes() {
99 return CONTENT_TYPES;
100 }
101
102 protected String[] getMatchingFileExtensions() {
103 return EXTENSIONS;
104 }
105
106 protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException {
107 StringBuffer content = new StringBuffer();
108
109 POIFSReader r = new POIFSReader();
110
111
112 r.registerListener(new PowerPointListener(content));
113
114 try
115 {
116 r.read(is);
117 }
118 catch (IOException e)
119 {
120 throw new ExtractorException("Error reading content of Powerpoint document: " + e.getMessage(), e);
121
122 }
123
124 return content.toString();
125 }
126 }