1 package com.atlassian.bonnie.search.extractor;
2
3 import com.atlassian.bonnie.search.SearchableAttachment;
4 import org.apache.log4j.Category;
5 import org.pdfbox.pdmodel.PDDocument;
6 import org.pdfbox.util.PDFTextStripper;
7 import org.pdfbox.exceptions.CryptographyException;
8 import org.pdfbox.exceptions.InvalidPasswordException;
9
10 import java.io.InputStream;
11 import java.io.StringWriter;
12
13 public class PdfContentExtractor extends BaseAttachmentContentExtractor
14 {
15 public static final Category log = Category.getInstance(PdfContentExtractor.class);
16
17 private static final String[] EXTENSIONS = { "pdf" };
18 private static final String[] CONTENT_TYPES = { "application/pdf" };
19
20 protected String[] getMatchingContentTypes()
21 {
22 return CONTENT_TYPES;
23 }
24
25 protected String[] getMatchingFileExtensions()
26 {
27 return EXTENSIONS;
28 }
29
30
31
32
33 protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
34
35 {
36 PDDocument pdfDocument = null;
37 try
38 {
39 pdfDocument = PDDocument.load(is);
40
41 if (pdfDocument.isEncrypted())
42 {
43
44 pdfDocument.decrypt("");
45 }
46
47 StringWriter writer = new StringWriter();
48 PDFTextStripper stripper = new PDFTextStripper();
49 stripper.writeText(pdfDocument, writer);
50 writer.close();
51
52 return writer.getBuffer().toString();
53 }
54 catch (CryptographyException e)
55 {
56 throw new ExtractorException("Could not decrypt PDF document: " + e.getMessage(), e);
57 }
58 catch (InvalidPasswordException e)
59 {
60
61 throw new ExtractorException("Password required for encrypted PDF document", e);
62 }
63 catch (Exception e)
64 {
65 throw new ExtractorException("Error getting content of PDF document", e);
66 }
67 finally
68 {
69 if (pdfDocument != null) try { pdfDocument.close(); } catch (Exception e) {}
70 }
71 }
72 }