View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import org.apache.poi.hslf.model.TextRun;
4   import org.apache.poi.hslf.record.*;
5   import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
6   import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
7   import org.apache.poi.poifs.filesystem.DocumentInputStream;
8   import org.apache.poi.util.LittleEndian;
9   import org.slf4j.Logger;
10  import org.slf4j.LoggerFactory;
11  
12  import java.io.ByteArrayOutputStream;
13  import java.util.Collections;
14  import java.util.HashSet;
15  import java.util.Iterator;
16  import java.util.Set;
17  
18  /**
19   * Listener for responding to read events thrown when reading a powerpoint document. This implementation is largely
20   * based on {@link org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor} (the {@link #findTextRecords(int,byte[])}
21   * has been copied and adapted). This listener is able to extract unicode text.
22   */
23  @Deprecated
24  public class PowerPointListener implements POIFSReaderListener
25  {
26  	private static Logger log = LoggerFactory.getLogger(PowerPointListener.class);
27  	private final StringBuffer buffer;
28  
29  	public static final Set/*<String>*/ IGNORED_TEXT;
30  	public static final Set/*<String>*/ IGNORED_PREFIXES;
31  
32  	static
33  	{
34  		Set/*<String>*/ tempIgnoredText = new HashSet/*<String>*/(9);
35  		tempIgnoredText.add("*");
36  		tempIgnoredText.add("Microsoft PowerPoint Presentation");
37  		tempIgnoredText.add("Click to edit Master title style");
38  		tempIgnoredText.add("Click to edit Master subtitle style");
39  		tempIgnoredText.add("Click to edit Master text styles\nSecond level\nThird level\nFourth level\nFifth level");
40  		tempIgnoredText.add("Default Design");
41  		tempIgnoredText.add("Microsoft Excel Worksheet");
42  		tempIgnoredText.add("Worksheet");
43  		tempIgnoredText.add("MS Org Chart");
44  
45  		IGNORED_TEXT = Collections.unmodifiableSet(tempIgnoredText);
46  
47  		Set/*<String>*/ tempIgnoredPrefixes = new HashSet/*<String>*/(9);
48  		tempIgnoredPrefixes.add("PowerPoint.Show.");
49  		tempIgnoredPrefixes.add("Excel.Sheet.");
50  		tempIgnoredPrefixes.add("___PPT");
51  		tempIgnoredPrefixes.add("MS Organization Chart ");
52  		tempIgnoredPrefixes.add("WordArt ");
53  		tempIgnoredPrefixes.add("MSWordArt.");
54  		tempIgnoredPrefixes.add("Microsoft WordArt ");
55  		tempIgnoredPrefixes.add("Microsoft WordArt ");
56  		tempIgnoredPrefixes.add("OrgPlusWOPX.");
57  
58  		IGNORED_PREFIXES = Collections.unmodifiableSet(tempIgnoredPrefixes);
59  	}
60  
61  	public PowerPointListener(StringBuffer buff)
62  	{
63  		this.buffer = buff;
64  	}
65  
66  	public void processPOIFSReaderEvent(POIFSReaderEvent event)
67  	{
68  		try
69  		{
70  			if ("PowerPoint Document".equals(event.getName()))
71  			{
72  				DocumentInputStream dis = event.getStream();
73  				ByteArrayOutputStream bos = new ByteArrayOutputStream();
74  				try
75  				{
76  					byte[] documentBytes = new byte[dis.available()];
77  					int bytesRead = dis.read(documentBytes, 0, dis.available());
78  
79  					if (log.isDebugEnabled() && bytesRead != documentBytes.length)
80  						log.debug("Bytes read from powerpoint file != bytes available");
81  
82  					int pos = 0;
83  
84  					do
85  					{
86  						pos = findTextRecords(pos, documentBytes);
87  					}
88  					while (pos != -1);
89  				}
90  
91  
92  				finally
93  				{
94  					dis.close();
95  					bos.close();
96  				}
97  			}
98  		}
99  		catch (Exception e)
100 		{
101 			log.error("Error extracting PowerPoint text: " + e, e);
102 		}
103 	}
104 
105 
106 	public int findTextRecords(int startPos, byte[] documentBytes)
107 	{
108 		// Grab the length, and the first option byte
109 		// Note that the length doesn't include the 8 byte atom header
110 		int len = (int) LittleEndian.getUInt(documentBytes, startPos + 4);
111 		byte opt = documentBytes[startPos];
112 
113 		// If it's a container, step into it and return
114 		// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
115 		int container = (int) opt & 0x0f;
116 		if (container == 0x0f)
117 		{
118 			return (startPos + 8);
119 		}
120 
121 		// Otherwise, check the type to see if it's text
122 		long type = LittleEndian.getUShort(documentBytes, startPos + 2);
123 		TextRun textRun = null;
124 
125 		// TextBytesAtom
126 		if (type == RecordTypes.TextBytesAtom.typeID)
127 		{
128 			TextBytesAtom tba = (TextBytesAtom) Record.createRecordForType(type, documentBytes, startPos, len + 8);
129 			textRun = new TextRun((TextHeaderAtom) null, tba, (StyleTextPropAtom) null);
130 		}
131 		// TextCharsAtom
132 		if (type == RecordTypes.TextCharsAtom.typeID)
133 		{
134 			TextCharsAtom tca = (TextCharsAtom) Record.createRecordForType(type, documentBytes, startPos, len + 8);
135 			textRun = new TextRun((TextHeaderAtom) null, tca, (StyleTextPropAtom) null);
136 		}
137 
138 		// CString (doesn't go via a TextRun)
139 		if (type == RecordTypes.CString.typeID)
140 		{
141 			CString cs = (CString) Record.createRecordForType(type, documentBytes, startPos, len + 8);
142 			appendText(cs.getText());
143 		}
144 
145 		// If we found text via a TextRun, save it in the vector
146 		if (textRun != null)
147 		{
148 			String textRunAsText = textRun.getText();
149 			appendText(textRunAsText);
150 		}
151 
152 		// Wind on by the atom length, and check we're not at the end
153 		int newPos = (startPos + 8 + len);
154 		if (newPos > (documentBytes.length - 8))
155 		{
156 			newPos = -1;
157 		}
158 		return newPos;
159 	}
160 
161 	private void appendText(String textRunAsText)
162 	{
163 		if (log.isDebugEnabled()) log.debug(textRunAsText);
164 
165 		if (IGNORED_TEXT.contains(textRunAsText))
166 			return;
167 
168 		for (Iterator iterator = IGNORED_PREFIXES.iterator(); iterator.hasNext();)
169 		{
170 			String prefix = (String) iterator.next();
171 			if (textRunAsText.startsWith(prefix))
172 				return;
173 		}
174 
175 		buffer.append(textRunAsText);
176 		buffer.append(" ");
177 	}
178 
179 }