View Javadoc

1   /*
2    * Copyright (c) 2003 by Atlassian Software Systems Pty. Ltd.
3    * All rights reserved.
4    */
5   
6   package com.atlassian.bonnie.search;
7   
8   import org.apache.lucene.analysis.Analyzer;
9   import org.apache.lucene.analysis.Token;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.standard.StandardAnalyzer;
12  
13  import java.io.IOException;
14  import java.io.StringReader;
15  import java.util.ArrayList;
16  import java.util.List;
17  
18  /**
19   * The <code>SearchWordLister</code> is like a reverse builder that takes a query string and decomposes it into
20   * words to be search on and ignored words
21   * 
22   * @author ROSS
23   */
24  public class SearchWordsLister
25  {
26      private static final String KEYWORDS_REGEX = "AND|NOT|OR";
27  
28      private List ignoredWords;
29      private List searchWords;
30      private Analyzer referenceAnalyzer = null;
31      private Analyzer queryAnalyzer = null;
32  
33  	/**
34  	 * Default Constructor uses the StandardAnalyzer to parse queries.
35  	 */
36  	public SearchWordsLister()
37      {
38          ignoredWords = new ArrayList();
39          searchWords = new ArrayList();
40          referenceAnalyzer = new StandardAnalyzer(new String[]{});
41          queryAnalyzer = new StandardAnalyzer();
42      }
43  
44  	/**
45  	 * Construct a SearchWordsLister that uses the supplied analyzers. The ignored words are the difference between
46  	 * the output of referenceAnalyzer and queryAnalyzer.
47  	 *
48  	 * @param referenceAnalyzer a query analyzer which will not remove any stop words.
49  	 * @param queryAnalyzer a query Analyzer which may remove stop words from the query.
50  	 */
51  	public SearchWordsLister(Analyzer referenceAnalyzer, Analyzer queryAnalyzer)
52  	{
53  		this.referenceAnalyzer = referenceAnalyzer;
54  		this.queryAnalyzer = queryAnalyzer;
55  	}
56  
57  	public List getIgnoredWords()
58      {
59          return ignoredWords;
60      }
61  
62      public List getSearchWords()
63      {
64          return searchWords;
65      }
66  
67      public String getIgnoredWordsAsString()
68      {
69          return listToDelimitedString(ignoredWords, ", ");
70      }
71  
72      public String getSearchWordsAsString()
73      {
74          return listToDelimitedString(searchWords, ", ");
75      }
76  
77      private String listToDelimitedString(List list, String delimiter)
78      {
79          StringBuffer buffer = new StringBuffer();
80  
81          for (int i = 0; i < list.size(); i++)
82          {
83              buffer.append(list.get(i));
84              if (i + 1 < list.size())
85              {
86                  buffer.append(delimiter);
87              }
88          }
89          return buffer.toString();
90      }
91  
92      /**
93       * generates a comma separated string of words that would be removed from the search query
94       * 
95       * @param query the query being invoked
96       * @throws java.io.IOException 
97       */
98      public void parseQuery(String query) throws IOException
99      {
100         //strip out any search keywords
101         query = query.replaceAll(KEYWORDS_REGEX, "");
102 
103         //the token stream used to search with
104         TokenStream queryStream = queryAnalyzer.tokenStream(null, new StringReader(query));
105         //the token stream without StopFilter processing.  Other filters will be applied
106         TokenStream referenceStream = referenceAnalyzer.tokenStream(null, new StringReader(query));
107 
108         Token token = queryStream.next();
109         Token refToken = referenceStream.next();
110 
111         while (refToken != null && token != null)
112         {
113 
114             String tokenText;
115             tokenText = refToken.termText();
116             if (tokenText.equals(token.termText()))
117             {
118                 searchWords.add(tokenText);
119                 token = queryStream.next();
120 
121                 //avoid putting duplicates in
122             }
123             else if (!ignoredWords.contains(tokenText))
124             {
125                 ignoredWords.add(tokenText);
126             }
127 
128             refToken = referenceStream.next();
129             //need to check that the last words in our query are not stop words
130             if (token == null && refToken != null)
131             {
132                 while (refToken != null)
133                 {
134                     if (ignoredWords.contains(refToken.termText()))
135                     {
136                         ignoredWords.add(refToken.termText());
137                     }
138                     refToken = referenceStream.next();
139                 }
140             }
141         }
142     }
143 
144 }