1 package com.atlassian.bonnie.search.extractor;
2
3 import org.apache.poi.hslf.model.TextRun;
4 import org.apache.poi.hslf.record.*;
5 import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
6 import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
7 import org.apache.poi.poifs.filesystem.DocumentInputStream;
8 import org.apache.poi.util.LittleEndian;
9 import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory;
11
12 import java.io.ByteArrayOutputStream;
13 import java.util.Collections;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.Set;
17
18
19
20
21
22
23 @Deprecated
24 public class PowerPointListener implements POIFSReaderListener
25 {
26 private static Logger log = LoggerFactory.getLogger(PowerPointListener.class);
27 private final StringBuffer buffer;
28
29 public static final Set
30 public static final Set
31
32 static
33 {
34 Set
35 tempIgnoredText.add("*");
36 tempIgnoredText.add("Microsoft PowerPoint Presentation");
37 tempIgnoredText.add("Click to edit Master title style");
38 tempIgnoredText.add("Click to edit Master subtitle style");
39 tempIgnoredText.add("Click to edit Master text styles\nSecond level\nThird level\nFourth level\nFifth level");
40 tempIgnoredText.add("Default Design");
41 tempIgnoredText.add("Microsoft Excel Worksheet");
42 tempIgnoredText.add("Worksheet");
43 tempIgnoredText.add("MS Org Chart");
44
45 IGNORED_TEXT = Collections.unmodifiableSet(tempIgnoredText);
46
47 Set
48 tempIgnoredPrefixes.add("PowerPoint.Show.");
49 tempIgnoredPrefixes.add("Excel.Sheet.");
50 tempIgnoredPrefixes.add("___PPT");
51 tempIgnoredPrefixes.add("MS Organization Chart ");
52 tempIgnoredPrefixes.add("WordArt ");
53 tempIgnoredPrefixes.add("MSWordArt.");
54 tempIgnoredPrefixes.add("Microsoft WordArt ");
55 tempIgnoredPrefixes.add("Microsoft WordArt ");
56 tempIgnoredPrefixes.add("OrgPlusWOPX.");
57
58 IGNORED_PREFIXES = Collections.unmodifiableSet(tempIgnoredPrefixes);
59 }
60
61 public PowerPointListener(StringBuffer buff)
62 {
63 this.buffer = buff;
64 }
65
66 public void processPOIFSReaderEvent(POIFSReaderEvent event)
67 {
68 try
69 {
70 if ("PowerPoint Document".equals(event.getName()))
71 {
72 DocumentInputStream dis = event.getStream();
73 ByteArrayOutputStream bos = new ByteArrayOutputStream();
74 try
75 {
76 byte[] documentBytes = new byte[dis.available()];
77 int bytesRead = dis.read(documentBytes, 0, dis.available());
78
79 if (log.isDebugEnabled() && bytesRead != documentBytes.length)
80 log.debug("Bytes read from powerpoint file != bytes available");
81
82 int pos = 0;
83
84 do
85 {
86 pos = findTextRecords(pos, documentBytes);
87 }
88 while (pos != -1);
89 }
90
91
92 finally
93 {
94 dis.close();
95 bos.close();
96 }
97 }
98 }
99 catch (Exception e)
100 {
101 log.error("Error extracting PowerPoint text: " + e, e);
102 }
103 }
104
105
106 public int findTextRecords(int startPos, byte[] documentBytes)
107 {
108
109
110 int len = (int) LittleEndian.getUInt(documentBytes, startPos + 4);
111 byte opt = documentBytes[startPos];
112
113
114
115 int container = (int) opt & 0x0f;
116 if (container == 0x0f)
117 {
118 return (startPos + 8);
119 }
120
121
122 long type = LittleEndian.getUShort(documentBytes, startPos + 2);
123 TextRun textRun = null;
124
125
126 if (type == RecordTypes.TextBytesAtom.typeID)
127 {
128 TextBytesAtom tba = (TextBytesAtom) Record.createRecordForType(type, documentBytes, startPos, len + 8);
129 textRun = new TextRun((TextHeaderAtom) null, tba, (StyleTextPropAtom) null);
130 }
131
132 if (type == RecordTypes.TextCharsAtom.typeID)
133 {
134 TextCharsAtom tca = (TextCharsAtom) Record.createRecordForType(type, documentBytes, startPos, len + 8);
135 textRun = new TextRun((TextHeaderAtom) null, tca, (StyleTextPropAtom) null);
136 }
137
138
139 if (type == RecordTypes.CString.typeID)
140 {
141 CString cs = (CString) Record.createRecordForType(type, documentBytes, startPos, len + 8);
142 appendText(cs.getText());
143 }
144
145
146 if (textRun != null)
147 {
148 String textRunAsText = textRun.getText();
149 appendText(textRunAsText);
150 }
151
152
153 int newPos = (startPos + 8 + len);
154 if (newPos > (documentBytes.length - 8))
155 {
156 newPos = -1;
157 }
158 return newPos;
159 }
160
161 private void appendText(String textRunAsText)
162 {
163 if (log.isDebugEnabled()) log.debug(textRunAsText);
164
165 if (IGNORED_TEXT.contains(textRunAsText))
166 return;
167
168 for (Iterator iterator = IGNORED_PREFIXES.iterator(); iterator.hasNext();)
169 {
170 String prefix = (String) iterator.next();
171 if (textRunAsText.startsWith(prefix))
172 return;
173 }
174
175 buffer.append(textRunAsText);
176 buffer.append(" ");
177 }
178
179 }