package com.atlassian.bonnie.search.summary;

import com.atlassian.bonnie.LuceneConnection;
import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
import com.atlassian.bonnie.search.DocumentBuilder;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Category;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

/**
 * Originally from org.apache.nutch.searcher.Summarizer v 0.7 (Revision: <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=179640&view=markup">179640</a>)
 * <p/>
 * Implements hit summarization: slides a window of context terms over the document's
 * token stream and stitches the best query-term-bearing fragments into a summary.
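 * <p/>
 * A minimal usage sketch (the analyzer, input text and rendering via toString() below
 * are illustrative assumptions, not mandated by this class):
 * <pre>
 * Summarizer summarizer = new Summarizer(new StandardAnalyzer());
 * Summary summary = summarizer.getSummary(documentText, "lucene index");
 * String excerpt = summary.toString();
 * </pre>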
 */
public class Summarizer
{
    private static final Category log = Category.getInstance(Summarizer.class);

    /**
     * The default number of context terms to display preceding and following matches.
     */
    private static final int DEFAULT_SUM_CONTEXT = 10;

    /**
     * The default total number of terms to display in a summary.
     */
    private static final int DEFAULT_SUM_LENGTH = 30;

    private Analyzer analyzer;
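    // StandardAnalyzer is kept separately from the configured analyzer: it is used only to
    // parse wildcard queries in getTerms(), avoiding double-stemming of already-stemmed terms.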
    private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
    private int sumContext = DEFAULT_SUM_CONTEXT;
    private int sumLength = DEFAULT_SUM_LENGTH;
    private LuceneConnection luceneConnection;

    public Summarizer() {
    }

    public Summarizer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public Summarizer(Analyzer analyzer, int sumContext, int sumLength, LuceneConnection luceneConnection) {
        this.analyzer = analyzer;
        this.sumContext = sumContext;
        this.sumLength = sumLength;
        this.luceneConnection = luceneConnection;
    }

    public Summary getSummary(String text) throws IOException {
        return this.getSummary(text, null);
    }

    /**
     * Returns a summary of the given text, highlighting occurrences of the query terms.
     *
     * @param text  the document text to summarize
     * @param query the query whose terms should be highlighted, or null for a plain leading excerpt
     */
    public Summary getSummary(String text, String query) throws IOException {
        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // TODO: check that phrases in the query are matched in the fragment

        log.debug("\n\ntext = " + text);
        log.debug("query = " + query);

        Token[] tokens = parseText(text);             // parse text to token array

        if (log.isDebugEnabled()) {
            StringBuffer buf = new StringBuffer();
            for (int i = 0; i < tokens.length; i++) {
                buf.append(tokens[i].termText());
                if (i != (tokens.length - 1))
                    buf.append(", ");
            }
            log.debug("tokens = " + buf);
        }

        if (tokens.length == 0)
            return new Summary();

        Set highlight = getTerms(query);            // collect the query terms into a set

        log.debug("highlight = " + highlight);

        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a Vector full of Fragments and Highlights.
        SortedSet excerptSet = new TreeSet(new Comparator() {
            public int compare(Object o1, Object o2) {
                Excerpt excerpt1 = (Excerpt) o1;
                Excerpt excerpt2 = (Excerpt) o2;

                if (excerpt1 == null && excerpt2 != null) {
                    return -1;
                } else if (excerpt1 != null && excerpt2 == null) {
                    return 1;
                } else if (excerpt1 == null && excerpt2 == null) {
                    return 0;
                }

                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });
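        // Note: the comparator sorts ascending by unique-token count (ties broken by fragment
        // count), so excerptSet.last() below always yields the best remaining excerpt.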

        int lastExcerptPos = 0;

        if (highlight.size() > 0) // if we have any query terms
        {
            // Iterate through all terms in the document
            for (int i = 0; i < tokens.length; i++) {
                // If we find a term that's in the query...
                if (highlight.contains(tokens[i].termText())) {
                    // Start searching at a point sumContext terms back,
                    // and move sumContext terms into the future.
                    int startToken = (i > sumContext) ? i - sumContext : 0;
                    int endToken = Math.min(i + sumContext, tokens.length);
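                    // e.g. with sumContext = 10, a hit at i = 25 in a 100-token document
                    // gives startToken = 15 and endToken = 35 (illustrative figures only)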
                    int offset = tokens[startToken].startOffset();
                    int j = startToken;

                    // Iterate from the start point to the finish, adding
                    // terms all the way.  The end of the passage is always
                    // sumContext beyond the last query-term.
                    Excerpt excerpt = new Excerpt();
                    if (offset != 0) {
                        excerpt.add(new Summary.Ellipsis());
                    }

                    // Iterate through as long as we're before the end of
                    // the document and we haven't hit the max-number-of-items
                    // -in-a-summary.
                    while ((j < endToken) && (j - startToken < sumLength)) {
                        // Now grab the hit-element, if present
                        Token t = tokens[j];
                        if (highlight.contains(t.termText())) {
                            excerpt.addToken(t.termText());
                            excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
                            excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
                            offset = t.endOffset();
                            endToken = Math.min(j + sumContext, tokens.length);
                        }

                        j++;
                    }

                    lastExcerptPos = endToken;

                    // We found the series of search-term hits and added
                    // them (with intervening text) to the excerpt.  Now
                    // we need to add the trailing edge of text.
                    //
                    // So if (j < tokens.length) then there is still trailing
                    // text to add.  (We haven't hit the end of the source doc.)
                    // Add the words since the last hit-term insert.
                    if (j < tokens.length) {
                        excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
                    }

                    // Remember how many terms are in this excerpt
                    excerpt.setNumTerms(j - startToken);

                    // Store the excerpt for later sorting
                    excerptSet.add(excerpt);

                    // Skip ahead: the next pass of the loop resumes searching
                    // for hits sumContext terms past the end of this excerpt.
                    i = j + sumContext;
                }
            }
        }

        // If none of the query terms appear in the document, just
        // excerpt the first sumLength words from the document.
        if (excerptSet.size() == 0) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(sumLength, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        log.debug("Found excerpts = " + excerptSet.size());

        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= sumLength && excerptSet.size() > 0) {
            Excerpt excerpt = (Excerpt) excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
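            // Each fragment is charged an equal share of the excerpt's term count,
            // so every fragment appended below consumes tokenFraction of the sumLength budget.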
            for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
                Summary.Fragment f = (Summary.Fragment) e.nextElement();
                // Don't add fragments if it takes us over the max-limit
                if (tokenCount + tokenFraction <= sumLength) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length)
            s.add(new Summary.Ellipsis());
        return s;
    }

    /**
     * We use Lucene queries, not Nutch ones - so getting terms is a little different for us.
     * <p/>
     * Right now this just does simple string manipulation.
     *
     * @param query the query string to extract terms from
     * @return a Set of the individual terms in the query
     */
    private Set getTerms(String query) {
        if (StringUtils.isNotEmpty(query)) {
            try {
                Set tokens = new HashSet();
                if (luceneConnection != null && query.indexOf('*') > -1)            // only expand wildcard queries
                {
                    QueryParser qp = new QueryParser(
                            DocumentBuilder.CONTENT_FIELD_NAME, standardAnalyzer);  // use StandardAnalyzer to avoid potential double-stemming
                    try {
                        final Query parsed = qp.parse(query);
                        final String[] queryTemp = new String[1];
                        luceneConnection.withReader(new LuceneConnection.ReaderAction() {
                            public Object perform(IndexReader reader) throws IOException {
                                Query q = parsed.rewrite(reader);
                                queryTemp[0] = q.toString().replaceAll(DocumentBuilder.CONTENT_FIELD_NAME + ":", "");
                                return null;
                            }
                        });
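                        // e.g. rewriting "summar*" against the index might yield terms like
                        // "summary summarizer summarization" (illustrative; depends on index contents)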
                        String[] terms = queryTemp[0].split(" ");
                        for (int i = 0; i < terms.length; ++i) tokens.add(terms[i]);
                    }
                    catch (ParseException e) {
                        log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
                    }
                }
                TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(query));
                for (Token token = ts.next(); token != null; token = ts.next()) {
                    tokens.add(token.termText());
                }
                return tokens;
            }
            catch (IOException e) {
                log.error(e, e);
            }
        }

        return Collections.EMPTY_SET;
    }

    private Token[] parseText(String text) throws IOException {
        if (text == null || text.trim().equals(""))
            return new Token[0];

        ArrayList result = new ArrayList();
        TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text));
        for (Token token = ts.next(); token != null; token = ts.next()) {
            result.add(token);
        }
        return (Token[]) result.toArray(new Token[result.size()]);
    }

    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    public void setSumContext(int sumContext) {
        this.sumContext = sumContext;
    }

    public void setSumLength(int sumLength) {
        this.sumLength = sumLength;
    }

    public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
        this.analyzer = f.createAnalyzer();
    }

    public void setLuceneConnection(LuceneConnection luceneConnection) {
        this.luceneConnection = luceneConnection;
    }
}