1   package com.atlassian.bonnie.search.extractor;
2   
3   import com.atlassian.bonnie.search.SearchableAttachment;
4   import org.apache.pdfbox.pdmodel.PDDocument;
5   import org.apache.pdfbox.util.PDFTextStripper;
6   import org.apache.pdfbox.exceptions.CryptographyException;
7   import org.apache.pdfbox.exceptions.InvalidPasswordException;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.InputStream;
12  import java.io.StringWriter;
13  
14  public class PdfContentExtractor extends BaseAttachmentContentExtractor
15  {
16      private static final Logger log = LoggerFactory.getLogger(PdfContentExtractor.class);
17  
18      private static final String[] EXTENSIONS = { "pdf" };
19      private static final String[] CONTENT_TYPES = { "application/pdf" };
20  
21      protected String[] getMatchingContentTypes()
22      {
23          return CONTENT_TYPES;
24      }
25  
26      protected String[] getMatchingFileExtensions()
27      {
28          return EXTENSIONS;
29      }
30  
31      /*
32       * customized copy of the LucenePDFDocument.addContent() in PDFBox
33       */
34      protected String extractText(InputStream is, SearchableAttachment attachment) throws ExtractorException
35  
36      {
37          PDDocument pdfDocument = null;
38          try
39          {
40              pdfDocument = PDDocument.load(is);
41  
42              if (pdfDocument.isEncrypted())
43              {
44                  //Just try using the default password and move on
45                  pdfDocument.decrypt("");
46              }
47  
48              StringWriter writer = new StringWriter();
49              PDFTextStripper stripper = new PDFTextStripper();
50              stripper.writeText(pdfDocument, writer);
51              writer.close();
52  
53              return writer.getBuffer().toString();
54          }
55          catch (CryptographyException e)
56          {
57              throw new ExtractorException("Could not decrypt PDF document: " + e.getMessage(), e);
58          }
59          catch (InvalidPasswordException e)
60          {
61              //they didn't suppply a password and the default of "" was wrong.
62              throw new ExtractorException("Password required for encrypted PDF document", e);
63          }
64          catch (Exception e)
65          {
66              throw new ExtractorException("Error getting content of PDF document", e);
67          }
68          finally
69          {
70              if (pdfDocument != null) try { pdfDocument.close(); } catch (Exception e) {}
71          }
72      }
73  }