package com.atlassian.bonnie.search.summary;

import com.atlassian.bonnie.ILuceneConnection;
import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
import com.atlassian.bonnie.search.BaseDocumentBuilder;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Originally from org.apache.nutch.searcher.Summarizer v 0.7 (Revision: <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=179640&view=markup">179640</a>)
 * <p/>
 * Implements hit summarization by sliding a window of context terms across the
 * document and assembling the best-scoring fragments around each query-term hit.
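 * <p/>
 * Typical usage (illustrative only; the analyzer and text are assumed to come
 * from the caller):
 * <pre>
 * Summarizer summarizer = new Summarizer(analyzer);
 * Summary summary = summarizer.getSummary(documentText, "wiki markup");
 * </pre>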
 */
public class Summarizer
{
    private static final Logger log = LoggerFactory.getLogger(Summarizer.class);

    /**
     * The default number of context terms to display preceding and following matches.
     */
    private static final int DEFAULT_SUM_CONTEXT = 10;

    /**
     * The default total number of terms to display in a summary.
     */
    private static final int DEFAULT_SUM_LENGTH = 30;

    private Analyzer analyzer;
    private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
    private int sumContext = DEFAULT_SUM_CONTEXT;
    private int sumLength = DEFAULT_SUM_LENGTH;
    private ILuceneConnection luceneConnection;

    public Summarizer() {
    }

    public Summarizer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public Summarizer(Analyzer analyzer, int sumContext, int sumLength, ILuceneConnection luceneConnection) {
        this.analyzer = analyzer;
        this.sumContext = sumContext;
        this.sumLength = sumLength;
        this.luceneConnection = luceneConnection;
    }

    public Summary getSummary(String text) throws IOException {
        return this.getSummary(text, null);
    }

    /**
     * Returns a summary of the given text, highlighting any terms that also occur in the query.
     */
    public Summary getSummary(String text, String query) throws IOException {
        // Simplistic implementation: finds the first fragments in the document
        // containing any query terms.
        //
        // TODO: check that phrases in the query are matched in the fragment

        log.debug("text = {}", text);
        log.debug("query = {}", query);

        Token[] tokens = parseText(text);           // parse the text into a token array

        if (log.isDebugEnabled()) {
            StringBuffer buf = new StringBuffer();
            for (int i = 0; i < tokens.length; i++) {
                buf.append(tokens[i].term());
                if (i != (tokens.length - 1))
                    buf.append(", ");
            }
            log.debug("tokens = {}", buf);
        }

        if (tokens.length == 0)
            return new Summary();

        Set highlight = getTerms(query);            // collect the query terms

        log.debug("highlight = {}", highlight);

        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a Vector full of Fragments and Highlights
        SortedSet excerptSet = new TreeSet(new Comparator() {
            public int compare(Object o1, Object o2) {
                Excerpt excerpt1 = (Excerpt) o1;
                Excerpt excerpt2 = (Excerpt) o2;

                if (excerpt1 == null && excerpt2 != null) {
                    return -1;
                } else if (excerpt1 != null && excerpt2 == null) {
                    return 1;
                } else if (excerpt1 == null && excerpt2 == null) {
                    return 0;
                }

                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });

        int lastExcerptPos = 0;

        if (highlight.size() > 0) // if we have any query terms
        {
            // Iterate through all terms in the document
            for (int i = 0; i < tokens.length; i++) {
                // If we find a term that's in the query...
                if (highlight.contains(tokens[i].term())) {
                    // Start searching at a point sumContext terms back,
                    // and move sumContext terms into the future.
                    int startToken = (i > sumContext) ? i - sumContext : 0;
                    int endToken = Math.min(i + sumContext, tokens.length);
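                    // Example (illustrative numbers): with sumContext = 10, a hit at
                    // token i = 25 opens a window over tokens 15..34; the window end
                    // is pushed out again whenever another hit falls inside it.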
                    int startOffset = tokens[startToken].startOffset();
                    int currentToken = startToken;

                    // Iterate from the start point to the finish, adding
                    // terms all the way.  The end of the passage is always
                    // sumContext beyond the last query-term.
                    Excerpt excerpt = new Excerpt();
                    if (startOffset != 0) {
                        excerpt.add(new Summary.Ellipsis());
                    }

                    // Iterate through as long as we're before the end of
                    // the document and we haven't hit the max number of
                    // items in a summary.
                    while ((currentToken < endToken) && (currentToken - startToken < sumLength)) {
                        // Now grab the hit-element, if present
                        Token t = tokens[currentToken];
                        if (highlight.contains(t.term())) {
                            excerpt.addToken(t.term());
                            excerpt.add(new Summary.Fragment(text.substring(startOffset, t.startOffset())));
                            excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                            startOffset = t.endOffset();
                            endToken = Math.min(currentToken + sumContext, tokens.length);
                        }

                        currentToken++;
                    }

                    lastExcerptPos = endToken;

                    // We found the series of search-term hits and added
                    // them (with intervening text) to the excerpt.  Now
                    // we need to add the trailing edge of text.
                    //
                    // So if (currentToken < tokens.length) there is still trailing
                    // text to add.  (We haven't hit the end of the source doc.)
                    // Add the words since the last hit-term insert.
                    if (currentToken < tokens.length) {
                        excerpt.add(new Summary.Fragment(text.substring(startOffset, tokens[currentToken].endOffset())));
                    } else {
                        // This else block is the fix for JST-884 (search results truncated after keyword).
                        int endOffset = tokens[tokens.length - 1].endOffset();
                        String trailingFragment = text.substring(startOffset, endOffset);
                        if (!StringUtils.isEmpty(trailingFragment)) {
                            excerpt.add(new Summary.Fragment(trailingFragment));
                        }
                    }

                    // Remember how many terms are in this excerpt
                    excerpt.setNumTerms(currentToken - startToken);

                    // Store the excerpt for later sorting
                    excerptSet.add(excerpt);

                    // Skip ahead: the next scan for hits resumes sumContext
                    // tokens past the end of this excerpt (the loop's i++
                    // supplies the extra step).
                    i = currentToken + sumContext;
                }
            }
        }

        // If none of the query terms appear, just excerpt
        // the first sumLength words from the document.
        if (excerptSet.size() == 0) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(sumLength, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        log.debug("Found excerpts = {}", excerptSet.size());

        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= sumLength && excerptSet.size() > 0) {
            Excerpt excerpt = (Excerpt) excerptSet.last();
            excerptSet.remove(excerpt);

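            // Each fragment is charged an equal share of the excerpt's terms,
            // so adding a fragment advances the running token count by
            // numTerms / numFragments rather than by its literal word count.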
            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
                Summary.Fragment f = (Summary.Fragment) e.nextElement();
                // Don't add fragments if it takes us over the max limit
                if (tokenCount + tokenFraction <= sumLength) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length)
            s.add(new Summary.Ellipsis());
        return s;
    }

    /**
     * We use Lucene queries, not Nutch ones, so getting terms is a little different for us.
     * <p/>
     * Right now this just does simple string manipulation.
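     * <p/>
     * For example (hypothetical query), "quick brown*" yields the analyzed term
     * "quick" plus, when a Lucene connection is available, every indexed term
     * matching the regular expression "brown.*".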
     *
     * @param query the raw query string; may be null or empty
     * @return a Set of the individual terms in the query
     */
    private Set getTerms(final String query) {
        if (StringUtils.isNotEmpty(query)) {
            try {
                Set tokens = new HashSet();
                if (luceneConnection != null && query.indexOf('*') > -1)            // only expand wildcard queries
                {
                    QueryParser qp = new QueryParser(
                            BaseDocumentBuilder.FieldName.CONTENT_BODY, standardAnalyzer);  // use the StandardAnalyzer to avoid potentially double-stemming the query
                    try {
                        qp.parse(query);    // parse purely to validate the query; a ParseException skips wildcard expansion
                        Set<String> set = (Set<String>) luceneConnection.withReader(new ILuceneConnection.ReaderAction() {
                            public Object perform(IndexReader reader) throws IOException {
                                // TODO lucene upgrade: this is horrible, but it seems to be the
                                // only way to achieve what we want. Translate the wildcard query
                                // into a regular expression: escape literal dots, then map '*'
                                // to ".*" and '?' to ".".
                                String transformedQuery = query.replaceAll("\\.", "\\\\.");
                                transformedQuery = transformedQuery.replaceAll("\\*", ".*");
                                transformedQuery = transformedQuery.replaceAll("\\?", ".");
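                                // Illustration (hypothetical input): the query "site.admin*"
                                // becomes the regex "site\.admin.*", which is matched against
                                // every term in the index below.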
                                Set<String> set = new HashSet<String>();
                                TermEnum termEnum = reader.terms();
                                String[] patterns = transformedQuery.split(" ");

                                // Walk every term in the index, keeping those that
                                // match any of the expanded wildcard patterns.
                                while (termEnum.next())
                                {
                                    Term t = termEnum.term();
                                    for (int i = 0; i < patterns.length; i++)
                                    {
                                        if (Pattern.matches(patterns[i], t.text()))
                                        {
                                            set.add(t.text());
                                        }
                                    }
                                }
                                termEnum.close();
                                return set;
                            }
                        });
                        tokens.addAll(set);
                    }
                    catch (ParseException e) {
                        log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
                    }
                }
                TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(query));
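                // Token-at-a-time TokenStream API (pre-Lucene 2.9): next()
                // returns null once the stream is exhausted.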
                for (Token token = ts.next(); token != null; token = ts.next()) {
                    tokens.add(token.term());
                }
                return tokens;
            }
            catch (IOException e) {
                log.error(e.getMessage(), e);
            }
        }

        return Collections.EMPTY_SET;
    }

    private Token[] parseText(String text) throws IOException {
        if (text == null || text.trim().equals(""))
            return new Token[0];

        ArrayList result = new ArrayList();
        TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
        for (Token token = ts.next(); token != null; token = ts.next()) {
            result.add(token);
        }
        return (Token[]) result.toArray(new Token[result.size()]);
    }

    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public void setSumContext(int sumContext) {
        this.sumContext = sumContext;
    }

    public void setSumLength(int sumLength) {
        this.sumLength = sumLength;
    }

    public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
        this.analyzer = f.createAnalyzer();
    }

    public void setLuceneConnection(ILuceneConnection luceneConnection) {
        this.luceneConnection = luceneConnection;
    }
}