1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.SearchableAttachment;
4 import org.apache.pdfbox.pdmodel.PDDocument;
5 import org.apache.pdfbox.util.PDFTextStripper;
6 import org.apache.pdfbox.exceptions.CryptographyException;
7 import org.apache.pdfbox.exceptions.InvalidPasswordException;
8 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory;
10
11 import java.io.InputStream;
12 import java.io.StringWriter;
13
14 public class PdfContentExtractor extends BaseAttachmentContentExtractor
15 {
16 private static final Logger log = LoggerFactory.getLogger(PdfContentExtractor.class);
17
18 private static final String[] EXTENSIONS = { "pdf" };
19 private static final String[] CONTENT_TYPES = { "application/pdf" };
20
21 protected String[] getMatchingContentTypes()
22 {
23 return CONTENT_TYPES;
24 }
25
26 protected String[] getMatchingFileExtensions()
27 {
28 return EXTENSIONS;
29 }
30
31
32
33
34 protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
35
36 {
37 PDDocument pdfDocument = null;
38 try
39 {
40 pdfDocument = PDDocument.load(is);
41
42 if (pdfDocument.isEncrypted())
43 {
44
45 pdfDocument.decrypt("");
46 }
47
48 StringWriter writer = new StringWriter();
49 PDFTextStripper stripper = new PDFTextStripper();
50 stripper.writeText(pdfDocument, writer);
51 writer.close();
52
53 return writer.getBuffer().toString();
54 }
55 catch (CryptographyException e)
56 {
57 throw new ExtractorException("Could not decrypt PDF document: " + e.getMessage(), e);
58 }
59 catch (InvalidPasswordException e)
60 {
61
62 throw new ExtractorException("Password required for encrypted PDF document", e);
63 }
64 catch (Exception e)
65 {
66 throw new ExtractorException("Error getting content of PDF document", e);
67 }
68 finally
69 {
70 if (pdfDocument != null) try { pdfDocument.close(); } catch (Exception e) {}
71 }
72 }
73 }