1 package com.atlassian.bonnie.search.summary;
2
3 import com.atlassian.bonnie.ILuceneConnection;
4 import com.atlassian.bonnie.analyzer.LuceneAnalyzerFactory;
5 import com.atlassian.bonnie.search.BaseDocumentBuilder;
6 import org.apache.commons.lang.StringUtils;
7 import org.apache.lucene.analysis.Analyzer;
8 import org.apache.lucene.analysis.Token;
9 import org.apache.lucene.analysis.TokenStream;
10 import org.apache.lucene.analysis.standard.StandardAnalyzer;
11 import org.apache.lucene.index.IndexReader;
12 import org.apache.lucene.index.Term;
13 import org.apache.lucene.index.TermEnum;
14 import org.apache.lucene.queryParser.ParseException;
15 import org.apache.lucene.queryParser.QueryParser;
16 import org.apache.lucene.search.Query;
17 import org.apache.lucene.util.Version;
18 import org.slf4j.LoggerFactory;
19 import org.slf4j.Logger;
20
21 import java.io.IOException;
22 import java.io.StringReader;
23 import java.util.*;
24 import java.util.regex.Pattern;
25
26
27
28
29
30
31
32 public class Summarizer
33 {
34 private static final Logger log = LoggerFactory.getLogger(Summarizer.class);
35
36
37
38
39 private static final int DEFAULT_SUM_CONTEXT = 10;
40
41
42
43
44 private static final int DEFAULT_SUM_LENGTH = 30;
45
46 private Analyzer analyzer;
47 private StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
48 private int sumContext = DEFAULT_SUM_CONTEXT;
49 private int sumLength = DEFAULT_SUM_LENGTH;
50 private ILuceneConnection luceneConnection;
51
52 public Summarizer() {
53 }
54
55 public Summarizer(Analyzer analyzer) {
56 this.analyzer = analyzer;
57 }
58
59 public Summarizer(Analyzer analyzer, int sumContext, int sumLength, ILuceneConnection luceneConnection) {
60 this.analyzer = analyzer;
61 this.sumContext = sumContext;
62 this.sumLength = sumLength;
63 this.luceneConnection = luceneConnection;
64 }
65
66 public Summary getSummary(String text) throws IOException {
67 return this.getSummary(text, null);
68 }
69
70
71
72
73 public Summary getSummary(String text, String query) throws IOException {
74
75
76
77
78
79 log.debug("\n\ntext = " + text);
80 log.debug("query = " + query);
81
82 Token[] tokens = parseText(text);
83
84
85 if (log.isDebugEnabled()) {
86 StringBuffer buf = new StringBuffer();
87 for (int i = 0; i < tokens.length; i++) {
88 buf.append(tokens[i].termText());
89 if (i != (tokens.length - 1))
90 buf.append(", ");
91 }
92 log.debug("tokens = ");
93 }
94
95 if (tokens.length == 0)
96 return new Summary();
97
98 Set highlight = getTerms(query);
99
100 log.debug("highlight = " + highlight);
101
102
103
104
105 SortedSet excerptSet = new TreeSet(new Comparator() {
106 public int compare(Object o1, Object o2) {
107 Excerpt excerpt1 = (Excerpt) o1;
108 Excerpt excerpt2 = (Excerpt) o2;
109
110 if (excerpt1 == null && excerpt2 != null) {
111 return -1;
112 } else if (excerpt1 != null && excerpt2 == null) {
113 return 1;
114 } else if (excerpt1 == null && excerpt2 == null) {
115 return 0;
116 }
117
118 int numToks1 = excerpt1.numUniqueTokens();
119 int numToks2 = excerpt2.numUniqueTokens();
120
121 if (numToks1 < numToks2) {
122 return -1;
123 } else if (numToks1 == numToks2) {
124 return excerpt1.numFragments() - excerpt2.numFragments();
125 } else {
126 return 1;
127 }
128 }
129 }
130 );
131
132 int lastExcerptPos = 0;
133
134 if (highlight.size() > 0)
135 {
136
137 for (int i = 0; i < tokens.length; i++) {
138
139 if (highlight.contains(tokens[i].term())) {
140
141
142 int startToken = (i > sumContext) ? i - sumContext : 0;
143 int endToken = Math.min(i + sumContext, tokens.length);
144 int startOffset = tokens[startToken].startOffset();
145 int currentToken = startToken;
146
147
148
149
150 Excerpt excerpt = new Excerpt();
151 if (startOffset != 0) {
152 excerpt.add(new Summary.Ellipsis());
153 }
154
155
156
157
158 while ((currentToken < endToken) && (currentToken - startToken < sumLength)) {
159
160 Token t = tokens[currentToken];
161 if (highlight.contains(t.termText())) {
162 excerpt.addToken(t.termText());
163 excerpt.add(new Summary.Fragment(text.substring(startOffset, t.startOffset())));
164 excerpt.add(new Summary.Highlight(text.substring(t.startOffset(), t.endOffset())));
165 startOffset = t.endOffset();
166 endToken = Math.min(currentToken + sumContext, tokens.length);
167 }
168
169 currentToken++;
170 }
171
172 lastExcerptPos = endToken;
173
174
175
176
177
178
179
180
181 if (currentToken < tokens.length) {
182 excerpt.add(new Summary.Fragment(text.substring(startOffset, tokens[currentToken].endOffset())));
183 } else {
184
185 int endOffset = tokens[(tokens.length - 1)].endOffset();
186 String trailingFragment = text.substring(startOffset, endOffset);
187 if(!StringUtils.isEmpty(trailingFragment))
188 {
189 excerpt.add(new Summary.Fragment(trailingFragment));
190 }
191 }
192
193
194 excerpt.setNumTerms(currentToken - startToken);
195
196
197 excerptSet.add(excerpt);
198
199
200
201 i = currentToken + sumContext;
202 }
203 }
204 }
205
206
207
208 if (excerptSet.size() == 0) {
209 Excerpt excerpt = new Excerpt();
210 int excerptLen = Math.min(sumLength, tokens.length);
211 lastExcerptPos = excerptLen;
212
213 excerpt.add(new Summary.Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen - 1].endOffset())));
214 excerpt.setNumTerms(excerptLen);
215 excerptSet.add(excerpt);
216 }
217
218 log.debug("Found excerpts = " + excerptSet.size());
219
220
221
222 double tokenCount = 0;
223 Summary s = new Summary();
224 while (tokenCount <= sumLength && excerptSet.size() > 0) {
225 Excerpt excerpt = (Excerpt) excerptSet.last();
226 excerptSet.remove(excerpt);
227
228 double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
229 for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
230 Summary.Fragment f = (Summary.Fragment) e.nextElement();
231
232 if (tokenCount + tokenFraction <= sumLength) {
233 s.add(f);
234 }
235 tokenCount += tokenFraction;
236 }
237 }
238
239 if (tokenCount > 0 && lastExcerptPos < tokens.length)
240 s.add(new Summary.Ellipsis());
241 return s;
242 }
243
244
245
246
247
248
249
250
251
252 private Set getTerms(final String query) {
253 if (StringUtils.isNotEmpty(query)) {
254 try {
255 Set tokens = new HashSet();
256 if (luceneConnection != null && query.indexOf('*') > -1)
257 {
258 QueryParser qp = new QueryParser(
259 BaseDocumentBuilder.FieldName.CONTENT_BODY, analyzer);
260 try {
261 final Query parsed = qp.parse(query);
262 final String[] queryTemp = new String[1];
263 Set<String> set = (Set<String>)luceneConnection.withReader(new ILuceneConnection.ReaderAction() {
264 public Object perform(IndexReader reader) throws IOException {
265
266 String transformedQuery = query.replaceAll("\\.","\\.");
267 transformedQuery = transformedQuery.replaceAll("\\*",".*");
268 transformedQuery = transformedQuery.replaceAll("\\?",".");
269 Set<String> set = new HashSet<String>();
270 TermEnum termEnum = reader.terms();
271 String[] tokens = transformedQuery.split(" ");
272
273 while(termEnum.next())
274 {
275 Term t = termEnum.term();
276 for(int i=0;i<tokens.length;i++)
277 {
278 if( Pattern.matches(tokens[i],t.text()))
279 {
280 set.add(t.text());
281 }
282 }
283 }
284 return set;
285 }
286 });
287 tokens.addAll(set);
288 }
289 catch (ParseException e) {
290 log.warn("Error encountered parsing query: " + query + " for wildcard match.", e);
291 }
292 }
293 TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(query));
294 for (Token token = ts.next(); token != null; token = ts.next()) {
295 tokens.add(token.term());
296 }
297 return tokens;
298 }
299 catch (IOException e) {
300 log.error(e.getMessage(), e);
301 }
302 }
303
304 return Collections.EMPTY_SET;
305 }
306
307 private Token[] parseText(String text) throws IOException {
308 if (text == null || text.trim().equals(""))
309 return new Token[0];
310
311 ArrayList result = new ArrayList();
312 TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
313 for (Token token = ts.next(); token != null; token = ts.next()) {
314 result.add(token);
315 }
316 return (Token[]) result.toArray(new Token[result.size()]);
317 }
318
319 public void setAnalyzer(Analyzer analyzer) {
320 this.analyzer = analyzer;
321 }
322
323 public void setSumContext(int sumContext) {
324 this.sumContext = sumContext;
325 }
326
327 public void setSumLength(int sumLength) {
328 this.sumLength = sumLength;
329 }
330
331 public void setAnalyzerFactory(LuceneAnalyzerFactory f) {
332 this.analyzer = f.createAnalyzer();
333 }
334
335 public void setLuceneConnection(ILuceneConnection luceneConnection) {
336 this.luceneConnection = luceneConnection;
337 }
338 }