1 package com.atlassian.bonnie.search.summary;
2
3 import com.atlassian.bonnie.LuceneConnection;
4 import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
5 import com.atlassian.bonnie.search.DocumentBuilder;
6 import org.apache.commons.lang.StringUtils;
7 import org.apache.log4j.Category;
8 import org.apache.lucene.analysis.Analyzer;
9 import org.apache.lucene.analysis.Token;
10 import org.apache.lucene.analysis.TokenStream;
11 import org.apache.lucene.analysis.standard.StandardAnalyzer;
12 import org.apache.lucene.index.IndexReader;
13 import org.apache.lucene.queryParser.ParseException;
14 import org.apache.lucene.queryParser.QueryParser;
15 import org.apache.lucene.search.Query;
16
17 import java.io.IOException;
18 import java.io.StringReader;
19 import java.util.*;
20
21
22
23
24
25
26
27 public class Summarizer
28 {
29 private static final Category log = Category.getInstance(Summarizer.class);
30
31
32
33
34 private static final int DEFAULT_SUM_CONTEXT = 10;
35
36
37
38
39 private static final int DEFAULT_SUM_LENGTH = 30;
40
41 private Analyzer analyzer;
42 private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
43 private int sumContext = DEFAULT_SUM_CONTEXT;
44 private int sumLength = DEFAULT_SUM_LENGTH;
45 private LuceneConnection luceneConnection;
46
47 public Summarizer() {
48 }
49
50 public Summarizer(Analyzer analyzer) {
51 this.analyzer = analyzer;
52 }
53
54 public Summarizer(Analyzer analyzer, int sumContext, int sumLength, LuceneConnection luceneConnection) {
55 this.analyzer = analyzer;
56 this.sumContext = sumContext;
57 this.sumLength = sumLength;
58 this.luceneConnection = luceneConnection;
59 }
60
61 public Summary getSummary(String text) throws IOException {
62 return this.getSummary(text, null);
63 }
64
65
66
67
68 public Summary getSummary(String text, String query) throws IOException {
69
70
71
72
73
74 log.debug("\n\ntext = " + text);
75 log.debug("query = " + query);
76
77 Token[] tokens = parseText(text);
78
79
80 if (log.isDebugEnabled()) {
81 StringBuffer buf = new StringBuffer();
82 for (int i = 0; i < tokens.length; i++) {
83 buf.append(tokens[i].termText());
84 if (i != (tokens.length - 1))
85 buf.append(", ");
86 }
87 log.debug("tokens = ");
88 }
89
90 if (tokens.length == 0)
91 return new Summary();
92
93 Set highlight = getTerms(query);
94
95 log.debug("highlight = " + highlight);
96
97
98
99
100 SortedSet excerptSet = new TreeSet(new Comparator() {
101 public int compare(Object o1, Object o2) {
102 Excerpt excerpt1 = (Excerpt) o1;
103 Excerpt excerpt2 = (Excerpt) o2;
104
105 if (excerpt1 == null && excerpt2 != null) {
106 return -1;
107 } else if (excerpt1 != null && excerpt2 == null) {
108 return 1;
109 } else if (excerpt1 == null && excerpt2 == null) {
110 return 0;
111 }
112
113 int numToks1 = excerpt1.numUniqueTokens();
114 int numToks2 = excerpt2.numUniqueTokens();
115
116 if (numToks1 < numToks2) {
117 return -1;
118 } else if (numToks1 == numToks2) {
119 return excerpt1.numFragments() - excerpt2.numFragments();
120 } else {
121 return 1;
122 }
123 }
124 }
125 );
126
127 int lastExcerptPos = 0;
128
129 if (highlight.size() > 0)
130 {
131
132 for (int i = 0; i < tokens.length; i++) {
133
134 if (highlight.contains(tokens[i].termText())) {
135
136
137 int startToken = (i > sumContext) ? i - sumContext : 0;
138 int endToken = Math.min(i + sumContext, tokens.length);
139 int offset = tokens[startToken].startOffset();
140 int j = startToken;
141
142
143
144
145 Excerpt excerpt = new Excerpt();
146 if (offset != 0) {
147 excerpt.add(new Summary.Ellipsis());
148 }
149
150
151
152
153 while ((j < endToken) && (j - startToken < sumLength)) {
154
155 Token t = tokens[j];
156 if (highlight.contains(t.termText())) {
157 excerpt.addToken(t.termText());
158 excerpt.add(new Summary.Fragment(text.substring(offset, t.startOffset())));
159 excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
160 offset = t.endOffset();
161 endToken = Math.min(j + sumContext, tokens.length);
162 }
163
164 j++;
165 }
166
167 lastExcerptPos = endToken;
168
169
170
171
172
173
174
175
176 if (j < tokens.length) {
177 excerpt.add(new Summary.Fragment(text.substring(offset, tokens[j].endOffset())));
178 }
179
180
181 excerpt.setNumTerms(j - startToken);
182
183
184 excerptSet.add(excerpt);
185
186
187
188 i = j + sumContext;
189 }
190 }
191 }
192
193
194
195 if (excerptSet.size() == 0) {
196 Excerpt excerpt = new Excerpt();
197 int excerptLen = Math.min(sumLength, tokens.length);
198 lastExcerptPos = excerptLen;
199
200 excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
201 excerpt.setNumTerms(excerptLen);
202 excerptSet.add(excerpt);
203 }
204
205 log.debug("Found excerpts = " + excerptSet.size());
206
207
208
209 double tokenCount = 0;
210 Summary s = new Summary();
211 while (tokenCount <= sumLength && excerptSet.size() > 0) {
212 Excerpt excerpt = (Excerpt) excerptSet.last();
213 excerptSet.remove(excerpt);
214
215 double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
216 for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
217 Summary.Fragment f = (Summary.Fragment) e.nextElement();
218
219 if (tokenCount + tokenFraction <= sumLength) {
220 s.add(f);
221 }
222 tokenCount += tokenFraction;
223 }
224 }
225
226 if (tokenCount > 0 && lastExcerptPos < tokens.length)
227 s.add(new Summary.Ellipsis());
228 return s;
229 }
230
231
232
233
234
235
236
237
238
239 private Set getTerms(String query) {
240 if (StringUtils.isNotEmpty(query)) {
241 try {
242 Set tokens = new HashSet();
243 if (luceneConnection != null && query.indexOf('*') > -1)
244 {
245 QueryParser qp = new QueryParser(
246 DocumentBuilder.CONTENT_FIELD_NAME, standardAnalyzer);
247 try {
248 final Query parsed = qp.parse(query);
249 final String[] queryTemp = new String[1];
250 luceneConnection.withReader(new LuceneConnection.ReaderAction() {
251 public Object perform(IndexReader reader) throws IOException {
252 Query q = parsed.rewrite(reader);
253 queryTemp[0] = q.toString().replaceAll(DocumentBuilder.CONTENT_FIELD_NAME + ":", "");
254 return null;
255 }
256 });
257 String[] terms = queryTemp[0].split(" ");
258 for (int i = 0; i < terms.length; ++i) tokens.add(terms[i]);
259 }
260 catch (ParseException e) {
261 log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
262 }
263 }
264 TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(query));
265 for (Token token = ts.next(); token != null; token = ts.next()) {
266 tokens.add(token.termText());
267 }
268 return tokens;
269 }
270 catch (IOException e) {
271 log.error(e, e);
272 }
273 }
274
275 return Collections.EMPTY_SET;
276 }
277
278 private Token[] parseText(String text) throws IOException {
279 if (text == null || text.trim().equals(""))
280 return new Token[0];
281
282 ArrayList result = new ArrayList();
283 TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text));
284 for (Token token = ts.next(); token != null; token = ts.next()) {
285 result.add(token);
286 }
287 return (Token[]) result.toArray(new Token[result.size()]);
288 }
289
290 public void setAnalyzer(Analyzer analyzer) {
291 this.analyzer = analyzer;
292 }
293
294 public void setSumContext(int sumContext) {
295 this.sumContext = sumContext;
296 }
297
298 public void setSumLength(int sumLength) {
299 this.sumLength = sumLength;
300 }
301
302 public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
303 this.analyzer = f.createAnalyzer();
304 }
305
306 public void setLuceneConnection(LuceneConnection luceneConnection) {
307 this.luceneConnection = luceneConnection;
308 }
309 }