View Javadoc

1   /*
2    * Copyright (c) 2003 by Atlassian Software Systems Pty. Ltd.
3    * All rights reserved.
4    */
5   
6   package com.atlassian.bonnie.search;
7   
8   import org.apache.lucene.analysis.Analyzer;
9   import org.apache.lucene.analysis.Token;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.standard.StandardAnalyzer;
12  
13  import java.io.IOException;
14  import java.io.StringReader;
15  import java.util.ArrayList;
16  import java.util.List;
17  
18  /**
19   * The <code>SearchWordLister</code> is like a reverse builder that takes a query string and decomposes it into
20   * words to be search on and ignored words
21   * 
22   * @author ROSS
23   */
24  public class SearchWordsLister
25  {
26      private static final String KEYWORDS_REGEX = "AND|NOT|OR";
27  
28      private List ignoredWords;
29      private List searchWords;
30      private Analyzer referenceAnalyzer = null;
31      private Analyzer queryAnalyzer = null;
32  
33      public SearchWordsLister()
34      {
35          ignoredWords = new ArrayList();
36          searchWords = new ArrayList();
37          referenceAnalyzer = new StandardAnalyzer(new String[]{});
38          queryAnalyzer = new StandardAnalyzer();
39      }
40  
41      public List getIgnoredWords()
42      {
43          return ignoredWords;
44      }
45  
46      public List getSearchWords()
47      {
48          return searchWords;
49      }
50  
51      public String getIgnoredWordsAsString()
52      {
53          return listToDelimitedString(ignoredWords, ", ");
54      }
55  
56      public String getSearchWordsAsString()
57      {
58          return listToDelimitedString(searchWords, ", ");
59      }
60  
61      private String listToDelimitedString(List list, String delimiter)
62      {
63          StringBuffer buffer = new StringBuffer();
64  
65          for (int i = 0; i < list.size(); i++)
66          {
67              buffer.append(list.get(i));
68              if (i + 1 < list.size())
69              {
70                  buffer.append(delimiter);
71              }
72          }
73          return buffer.toString();
74      }
75  
76      /**
77       * generates a comma separated string of words that would be removed from the search query
78       * 
79       * @param query the query being invoked
80       * @throws java.io.IOException 
81       */
82      public void parseQuery(String query) throws IOException
83      {
84          //strip out any search keywords
85          query = query.replaceAll(KEYWORDS_REGEX, "");
86  
87          //the token stream used to search with
88          TokenStream queryStream = queryAnalyzer.tokenStream(null, new StringReader(query));
89          //the token stream without StopFilter processing.  Other filters will be applied
90          TokenStream referenceStream = referenceAnalyzer.tokenStream(null, new StringReader(query));
91  
92          Token token = queryStream.next();
93          Token refToken = referenceStream.next();
94  
95          while (refToken != null && token != null)
96          {
97  
98              String tokenText;
99              tokenText = refToken.termText();
100             if (tokenText.equals(token.termText()))
101             {
102                 searchWords.add(tokenText);
103                 token = queryStream.next();
104 
105                 //avoid putting duplicates in
106             }
107             else if (!ignoredWords.contains(tokenText))
108             {
109                 ignoredWords.add(tokenText);
110             }
111 
112             refToken = referenceStream.next();
113             //need to check that the last words in our query are not stop words
114             if (token == null && refToken != null)
115             {
116                 while (refToken != null)
117                 {
118                     if (ignoredWords.contains(refToken.termText()))
119                     {
120                         ignoredWords.add(refToken.termText());
121                     }
122                     refToken = referenceStream.next();
123                 }
124             }
125         }
126     }
127 
128 }