package com.atlassian.bonnie.search.summary;

import com.atlassian.bonnie.ILuceneConnection;
import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
import com.atlassian.bonnie.search.BaseDocumentBuilder;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

/**
 * Originally from org.apache.nutch.searcher.Summarizer v 0.7 (Revision: <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=179640&view=markup">179640</a>)
 * <p/>
 * Implements hit summarization using a sliding window and various document fragments.
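 * <p/>
 * A minimal usage sketch (the analyzer and query shown are illustrative, not required by this class):
 * <pre>
 * Summarizer summarizer = new Summarizer(new StandardAnalyzer());
 * Summary summary = summarizer.getSummary(documentText, "lucene index");
 * // Summary holds the ordered Fragment/Highlight/Ellipsis parts of the excerpt;
 * // render it however the caller prefers (the Nutch original exposes toString()).
 * </pre>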
 */
public class Summarizer
{
    private static final Logger log = LoggerFactory.getLogger(Summarizer.class);

    /**
     * The default number of context terms to display preceding and following matches.
     */
    private static final int DEFAULT_SUM_CONTEXT = 10;

    /**
     * The default total number of terms to display in a summary.
     */
    private static final int DEFAULT_SUM_LENGTH = 30;

    private Analyzer analyzer;
    private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
    private int sumContext = DEFAULT_SUM_CONTEXT;
    private int sumLength = DEFAULT_SUM_LENGTH;
    private ILuceneConnection luceneConnection;

    public Summarizer() {
    }

    public Summarizer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public Summarizer(Analyzer analyzer, int sumContext, int sumLength, ILuceneConnection luceneConnection) {
        this.analyzer = analyzer;
        this.sumContext = sumContext;
        this.sumLength = sumLength;
        this.luceneConnection = luceneConnection;
    }

    public Summary getSummary(String text) throws IOException {
        return this.getSummary(text, null);
    }

    /**
     * Returns a summary of the given text, highlighting any terms that also appear in the query.
     */
    public Summary getSummary(String text, String query) throws IOException {
        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // TODO: check that phrases in the query are matched in the fragment

        log.debug("\n\ntext = " + text);
        log.debug("query = " + query);

        Token[] tokens = parseText(text);             // parse text to token array

        if (log.isDebugEnabled()) {
            StringBuffer buf = new StringBuffer();
            for (int i = 0; i < tokens.length; i++) {
                buf.append(tokens[i].termText());
                if (i != (tokens.length - 1))
                    buf.append(", ");
            }
            log.debug("tokens = " + buf);
        }

        if (tokens.length == 0)
            return new Summary();

        Set highlight = getTerms(query);            // put the query terms in a set

        log.debug("highlight = " + highlight);

        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a Vector full of Fragments and Highlights
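        // e.g. an excerpt that matches the unique query tokens {apache, lucene}
        // outranks one that matches only {lucene}, no matter how often that
        // single term repeats (illustrative tokens).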
        SortedSet excerptSet = new TreeSet(new Comparator() {
            public int compare(Object o1, Object o2) {
                Excerpt excerpt1 = (Excerpt) o1;
                Excerpt excerpt2 = (Excerpt) o2;

                if (excerpt1 == null && excerpt2 != null) {
                    return -1;
                } else if (excerpt1 != null && excerpt2 == null) {
                    return 1;
                } else if (excerpt1 == null && excerpt2 == null) {
                    return 0;
                }

                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });

        int lastExcerptPos = 0;

        if (highlight.size() > 0) // if we have any query terms
        {
            // Iterate through all terms in the document
            for (int i = 0; i < tokens.length; i++) {
                // If we find a term that's in the query...
                if (highlight.contains(tokens[i].termText())) {
                    // Start searching at a point sumContext terms back,
                    // and move sumContext terms into the future.
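                    // e.g. with the default sumContext of 10 and a hit at token 25,
                    // the window starts at token 15 and initially extends to token 35
                    // (exclusive); each further hit pushes the end out (illustrative positions).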
                    int startToken = (i > sumContext) ? i - sumContext : 0;
                    int endToken = Math.min(i + sumContext, tokens.length);
                    int startOffset = tokens[startToken].startOffset();
                    int currentToken = startToken;

                    // Iterate from the start point to the finish, adding
                    // terms all the way.  The end of the passage is always
                    // sumContext beyond the last query-term.
                    Excerpt excerpt = new Excerpt();
                    if (startOffset != 0) {
                        excerpt.add(new Summary.Ellipsis());
                    }

                    // Iterate for as long as we're before the end of the document
                    // and haven't hit the maximum number of items in a summary.
                    while ((currentToken < endToken) && (currentToken - startToken < sumLength)) {
                        // Now grab the hit-element, if present
                        Token t = tokens[currentToken];
                        if (highlight.contains(t.termText())) {
                            excerpt.addToken(t.termText());
                            excerpt.add(new Summary.Fragment(text.substring(startOffset, t.startOffset())));
                            excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                            startOffset = t.endOffset();
                            endToken = Math.min(currentToken + sumContext, tokens.length);
                        }

                        currentToken++;
                    }

                    lastExcerptPos = endToken;

                    // We found the series of search-term hits and added
                    // them (with intervening text) to the excerpt.  Now
                    // we need to add the trailing edge of text.
                    //
                    // So if (currentToken < tokens.length) then there is still trailing
                    // text to add.  (We haven't hit the end of the source doc.)
                    // Add the words since the last hit-term insert.
                    if (currentToken < tokens.length) {
                        excerpt.add(new Summary.Fragment(text.substring(startOffset, tokens[currentToken].endOffset())));
                    } else {
                        // This else block is the fix for JST-884 (search results truncated after the keyword):
                        // we ran off the end of the token array, so emit everything up to the last token.
                        int endOffset = tokens[(tokens.length - 1)].endOffset();
                        String trailingFragment = text.substring(startOffset, endOffset);
                        if (!StringUtils.isEmpty(trailingFragment))
                        {
                            excerpt.add(new Summary.Fragment(trailingFragment));
                        }
                    }

                    // Remember how many terms are in this excerpt
                    excerpt.setNumTerms(currentToken - startToken);

                    // Store the excerpt for later sorting
                    excerptSet.add(excerpt);

                    // Resume the scan sumContext terms past the end of this
                    // excerpt so the next excerpt won't overlap it.
                    i = currentToken + sumContext;
                }
            }
        }

        // If none of the query terms appear, just excerpt
        // the first sumLength words from the document.
        if (excerptSet.size() == 0) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(sumLength, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        log.debug("Found excerpts = " + excerptSet.size());

        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
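        // Each fragment charges numTerms/numFragments tokens against the sumLength
        // budget: e.g. an excerpt of 20 terms in 4 fragments costs 5.0 per fragment
        // (illustrative figures).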
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= sumLength && excerptSet.size() > 0) {
            Excerpt excerpt = (Excerpt) excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
                Summary.Fragment f = (Summary.Fragment) e.nextElement();
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= sumLength) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length)
            s.add(new Summary.Ellipsis());
        return s;
    }

    /**
     * We use Lucene queries, not Nutch ones - so getting terms is a little different for us.
     * <p/>
     * Right now this just does simple string manipulation.
     *
     * @param query the raw query string; may be null or empty
     * @return a Set of the individual terms in the query, or an empty set if the query is blank or tokenization fails
     */
    private Set getTerms(String query) {
        if (StringUtils.isNotEmpty(query)) {
            try {
                Set tokens = new HashSet();
                if (luceneConnection != null && query.indexOf('*') > -1)            // only expand wildcard queries
                {
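                    // e.g. a wildcard query like "sum*" is rewritten against the index
                    // into the concrete terms it matches ("summary", "summarizer", ...),
                    // so each expansion can be highlighted (illustrative terms).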
                    QueryParser qp = new QueryParser(
                            BaseDocumentBuilder.FieldName.CONTENT_BODY, standardAnalyzer);  // use StandardAnalyzer to avoid a potential double-stem
                    try {
                        final Query parsed = qp.parse(query);
                        final String[] queryTemp = new String[1];
                        luceneConnection.withReader(new ILuceneConnection.ReaderAction() {
                            public Object perform(IndexReader reader) throws IOException {
                                Query q = parsed.rewrite(reader);
                                queryTemp[0] = q.toString().replaceAll(BaseDocumentBuilder.FieldName.CONTENT_BODY + ":", "");
                                return null;
                            }
                        });
                        String[] terms = queryTemp[0].split(" ");
                        for (int i = 0; i < terms.length; ++i) tokens.add(terms[i]);
                    }
                    catch (ParseException e) {
                        log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
                    }
                }
                TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(query));
                for (Token token = ts.next(); token != null; token = ts.next()) {
                    tokens.add(token.termText());
                }
                return tokens;
            }
            catch (IOException e) {
                log.error(e.getMessage(), e);
            }
        }

        return Collections.EMPTY_SET;
    }

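    /** Tokenizes the text with the configured analyzer, returning its tokens (with offsets) in document order. */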
    private Token[] parseText(String text) throws IOException {
        if (text == null || text.trim().equals(""))
            return new Token[0];

        ArrayList result = new ArrayList();
        TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
        for (Token token = ts.next(); token != null; token = ts.next()) {
            result.add(token);
        }
        return (Token[]) result.toArray(new Token[result.size()]);
    }

    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public void setSumContext(int sumContext) {
        this.sumContext = sumContext;
    }

    public void setSumLength(int sumLength) {
        this.sumLength = sumLength;
    }

    public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
        this.analyzer = f.createAnalyzer();
    }

    public void setLuceneConnection(ILuceneConnection luceneConnection) {
        this.luceneConnection = luceneConnection;
    }
}