View Javadoc

1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   import org.apache.log4j.Category;
5   import org.pdfbox.pdmodel.PDDocument;
6   import org.pdfbox.util.PDFTextStripper;
7   import org.pdfbox.exceptions.CryptographyException;
8   import org.pdfbox.exceptions.InvalidPasswordException;
9   
10  import java.io.InputStream;
11  import java.io.StringWriter;
12  
13  public class PdfContentExtractor extends BaseAttachmentContentExtractor
14  {
15      public static final Category log = Category.getInstance(PdfContentExtractor.class);
16  
17      private static final String[] EXTENSIONS = { "pdf" };
18      private static final String[] CONTENT_TYPES = { "application/pdf" };
19  
20      protected String[] getMatchingContentTypes()
21      {
22          return CONTENT_TYPES;
23      }
24  
25      protected String[] getMatchingFileExtensions()
26      {
27          return EXTENSIONS;
28      }
29  
30      /*
31       * customized copy of the LucenePDFDocument.addContent() in PDFBox
32       */
33      protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
34  
35      {
36          PDDocument pdfDocument = null;
37          try
38          {
39              pdfDocument = PDDocument.load(is);
40  
41              if (pdfDocument.isEncrypted())
42              {
43                  //Just try using the default password and move on
44                  pdfDocument.decrypt("");
45              }
46  
47              StringWriter writer = new StringWriter();
48              PDFTextStripper stripper = new PDFTextStripper();
49              stripper.writeText(pdfDocument, writer);
50              writer.close();
51  
52              return writer.getBuffer().toString();
53          }
54          catch (CryptographyException e)
55          {
56              throw new ExtractorException("Could not decrypt PDF document: " + e.getMessage(), e);
57          }
58          catch (InvalidPasswordException e)
59          {
60              //they didn't suppply a password and the default of "" was wrong.
61              throw new ExtractorException("Password required for encrypted PDF document", e);
62          }
63          catch (Exception e)
64          {
65              throw new ExtractorException("Error getting content of PDF document", e);
66          }
67          finally
68          {
69              if (pdfDocument != null) try { pdfDocument.close(); } catch (Exception e) {}
70          }
71      }
72  }