1 package com.atlassian.bonnie.search.summary;
2
3 import com.atlassian.bonnie.ILuceneConnection;
4 import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
5 import com.atlassian.bonnie.search.BaseDocumentBuilder;
6 import org.apache.commons.lang.StringUtils;
7 import org.apache.lucene.analysis.Analyzer;
8 import org.apache.lucene.analysis.Token;
9 import org.apache.lucene.analysis.TokenStream;
10 import org.apache.lucene.analysis.standard.StandardAnalyzer;
11 import org.apache.lucene.index.IndexReader;
12 import org.apache.lucene.queryParser.ParseException;
13 import org.apache.lucene.queryParser.QueryParser;
14 import org.apache.lucene.search.Query;
15 import org.slf4j.LoggerFactory;
16 import org.slf4j.Logger;
17
18 import java.io.IOException;
19 import java.io.StringReader;
20 import java.util.*;
21
22
23
24
25
26
27
28 public class Summarizer
29 {
30 private static final Logger log = LoggerFactory.getLogger(Summarizer.class);
31
32
33
34
35 private static final int DEFAULT_SUM_CONTEXT = 10;
36
37
38
39
40 private static final int DEFAULT_SUM_LENGTH = 30;
41
42 private Analyzer analyzer;
43 private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
44 private int sumContext = DEFAULT_SUM_CONTEXT;
45 private int sumLength = DEFAULT_SUM_LENGTH;
46 private ILuceneConnection luceneConnection;
47
48 public Summarizer() {
49 }
50
51 public Summarizer(Analyzer analyzer) {
52 this.analyzer = analyzer;
53 }
54
55 public Summarizer(Analyzer analyzer, int sumContext, int sumLength, ILuceneConnection luceneConnection) {
56 this.analyzer = analyzer;
57 this.sumContext = sumContext;
58 this.sumLength = sumLength;
59 this.luceneConnection = luceneConnection;
60 }
61
62 public Summary getSummary(String text) throws IOException {
63 return this.getSummary(text, null);
64 }
65
66
67
68
69 public Summary getSummary(String text, String query) throws IOException {
70
71
72
73
74
75 log.debug("\n\ntext = " + text);
76 log.debug("query = " + query);
77
78 Token[] tokens = parseText(text);
79
80
81 if (log.isDebugEnabled()) {
82 StringBuffer buf = new StringBuffer();
83 for (int i = 0; i < tokens.length; i++) {
84 buf.append(tokens[i].termText());
85 if (i != (tokens.length - 1))
86 buf.append(", ");
87 }
88 log.debug("tokens = ");
89 }
90
91 if (tokens.length == 0)
92 return new Summary();
93
94 Set highlight = getTerms(query);
95
96 log.debug("highlight = " + highlight);
97
98
99
100
101 SortedSet excerptSet = new TreeSet(new Comparator() {
102 public int compare(Object o1, Object o2) {
103 Excerpt excerpt1 = (Excerpt) o1;
104 Excerpt excerpt2 = (Excerpt) o2;
105
106 if (excerpt1 == null && excerpt2 != null) {
107 return -1;
108 } else if (excerpt1 != null && excerpt2 == null) {
109 return 1;
110 } else if (excerpt1 == null && excerpt2 == null) {
111 return 0;
112 }
113
114 int numToks1 = excerpt1.numUniqueTokens();
115 int numToks2 = excerpt2.numUniqueTokens();
116
117 if (numToks1 < numToks2) {
118 return -1;
119 } else if (numToks1 == numToks2) {
120 return excerpt1.numFragments() - excerpt2.numFragments();
121 } else {
122 return 1;
123 }
124 }
125 }
126 );
127
128 int lastExcerptPos = 0;
129
130 if (highlight.size() > 0)
131 {
132
133 for (int i = 0; i < tokens.length; i++) {
134
135 if (highlight.contains(tokens[i].termText())) {
136
137
138 int startToken = (i > sumContext) ? i - sumContext : 0;
139 int endToken = Math.min(i + sumContext, tokens.length);
140 int startOffset = tokens[startToken].startOffset();
141 int currentToken = startToken;
142
143
144
145
146 Excerpt excerpt = new Excerpt();
147 if (startOffset != 0) {
148 excerpt.add(new Summary.Ellipsis());
149 }
150
151
152
153
154 while ((currentToken < endToken) && (currentToken - startToken < sumLength)) {
155
156 Token t = tokens[currentToken];
157 if (highlight.contains(t.termText())) {
158 excerpt.addToken(t.termText());
159 excerpt.add(new Summary.Fragment(text.substring(startOffset, t.startOffset())));
160 excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
161 startOffset = t.endOffset();
162 endToken = Math.min(currentToken + sumContext, tokens.length);
163 }
164
165 currentToken++;
166 }
167
168 lastExcerptPos = endToken;
169
170
171
172
173
174
175
176
177 if (currentToken < tokens.length) {
178 excerpt.add(new Summary.Fragment(text.substring(startOffset, tokens[currentToken].endOffset())));
179 } else {
180
181 int endOffset = tokens[(tokens.length - 1)].endOffset();
182 String trailingFragment = text.substring(startOffset, endOffset);
183 if(!StringUtils.isEmpty(trailingFragment))
184 {
185 excerpt.add(new Summary.Fragment(trailingFragment));
186 }
187 }
188
189
190 excerpt.setNumTerms(currentToken - startToken);
191
192
193 excerptSet.add(excerpt);
194
195
196
197 i = currentToken + sumContext;
198 }
199 }
200 }
201
202
203
204 if (excerptSet.size() == 0) {
205 Excerpt excerpt = new Excerpt();
206 int excerptLen = Math.min(sumLength, tokens.length);
207 lastExcerptPos = excerptLen;
208
209 excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
210 excerpt.setNumTerms(excerptLen);
211 excerptSet.add(excerpt);
212 }
213
214 log.debug("Found excerpts = " + excerptSet.size());
215
216
217
218 double tokenCount = 0;
219 Summary s = new Summary();
220 while (tokenCount <= sumLength && excerptSet.size() > 0) {
221 Excerpt excerpt = (Excerpt) excerptSet.last();
222 excerptSet.remove(excerpt);
223
224 double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
225 for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
226 Summary.Fragment f = (Summary.Fragment) e.nextElement();
227
228 if (tokenCount + tokenFraction <= sumLength) {
229 s.add(f);
230 }
231 tokenCount += tokenFraction;
232 }
233 }
234
235 if (tokenCount > 0 && lastExcerptPos < tokens.length)
236 s.add(new Summary.Ellipsis());
237 return s;
238 }
239
240
241
242
243
244
245
246
247
248 private Set getTerms(String query) {
249 if (StringUtils.isNotEmpty(query)) {
250 try {
251 Set tokens = new HashSet();
252 if (luceneConnection != null && query.indexOf('*') > -1)
253 {
254 QueryParser qp = new QueryParser(
255 BaseDocumentBuilder.FieldName.CONTENT_BODY, standardAnalyzer);
256 try {
257 final Query parsed = qp.parse(query);
258 final String[] queryTemp = new String[1];
259 luceneConnection.withReader(new ILuceneConnection.ReaderAction() {
260 public Object perform(IndexReader reader) throws IOException {
261 Query q = parsed.rewrite(reader);
262 queryTemp[0] = q.toString().replaceAll(BaseDocumentBuilder.FieldName.CONTENT_BODY + ":", "");
263 return null;
264 }
265 });
266 String[] terms = queryTemp[0].split(" ");
267 for (int i = 0; i < terms.length; ++i) tokens.add(terms[i]);
268 }
269 catch (ParseException e) {
270 log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
271 }
272 }
273 TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(query));
274 for (Token token = ts.next(); token != null; token = ts.next()) {
275 tokens.add(token.termText());
276 }
277 return tokens;
278 }
279 catch (IOException e) {
280 log.error(e.getMessage(), e);
281 }
282 }
283
284 return Collections.EMPTY_SET;
285 }
286
287 private Token[] parseText(String text) throws IOException {
288 if (text == null || text.trim().equals(""))
289 return new Token[0];
290
291 ArrayList result = new ArrayList();
292 TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
293 for (Token token = ts.next(); token != null; token = ts.next()) {
294 result.add(token);
295 }
296 return (Token[]) result.toArray(new Token[result.size()]);
297 }
298
299 public void setAnalyzer(Analyzer analyzer) {
300 this.analyzer = analyzer;
301 }
302
303 public void setSumContext(int sumContext) {
304 this.sumContext = sumContext;
305 }
306
307 public void setSumLength(int sumLength) {
308 this.sumLength = sumLength;
309 }
310
311 public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
312 this.analyzer = f.createAnalyzer();
313 }
314
315 public void setLuceneConnection(ILuceneConnection luceneConnection) {
316 this.luceneConnection = luceneConnection;
317 }
318 }