1   package com.atlassian.bonnie.search.extractor;
2   
import com.atlassian.bonnie.search.Extractor;
import com.atlassian.bonnie.search.MockSearchableAttachment;

import java.io.IOException;
import java.io.InputStream;
7   
8   public class TestPdfContentExtractor extends BaseAttachmentContentExtractorTest
9   {
10      public Extractor getExtractor()
11      {
12          return new PdfContentExtractor();
13      }
14  
15  	public void testSimplePdf()
16  	{
17  		assertOnExtractedTextOf(createSearchableAttachment("test-attachment-search.pdf", "application/pdf"), new String[]{"feature"}, new String[]{"apples"});
18  	}
19  
20      //BONNIE-43 - Testing that extraction works with PDFs with different PDF versions and content creators
21      //Files are actual problem files that customers reported having issues with
22      public void testPdfWithDifferentContentCreators()
23      {
24          //PDF version 1.3
25          assertOnExtractedTextOf(createSearchableAttachment("test-v1_3.pdf", "application/pdf"), new String[]{"grasses", "colleagues"}, new String[]{});
26          //PDF version 1.4
27          assertOnExtractedTextOf(createSearchableAttachment("test-v1_4.pdf", "application/pdf"), new String[]{}, new String[]{});
28  
29      }
30  
31      public void testInternationalisedPdf()
32      {
33          //Non-English characters
34          assertOnExtractedTextOf(createSearchableAttachment("chinese-characters.pdf", "application/pdf"), new String[]{"\u5c0f\u96de"}, new String[]{});
35          //Right-to-left languages
36          assertOnExtractedTextOf(createSearchableAttachment("arabic-characters.pdf", "application/pdf"), new String[]{"Romanization", "\u0637\u0648\u064a\u0644\u0629"}, new String[]{});
37      }
38  
39      public void testExtractorExceptionThrownOnError() throws IOException
40      {
41          MockSearchableAttachment attachment = createSearchableAttachment("test-attachment-search.txt", "text/plain");
42          attachment.setContentType("application/pdf");
43          try
44          {
45              ((PdfContentExtractor) extractor).extractText(attachment.getContentsAsStream(), attachment);
46              fail("Exception expected");
47          }
48          catch (ExtractorException e)
49          {
50              // pass - exception expected
51          }
52      }
53  }