1   package com.atlassian.core.filters;
2   
3   import javax.servlet.FilterChain;
4   import javax.servlet.ServletRequest;
5   import javax.servlet.ServletResponse;
6   
7   import junit.framework.TestCase;
8   
9   import java.util.Map;
10  import java.util.HashMap;
11  
12  public class TestAbstractEncodingFilterPunctuationReplacement extends TestCase
13  {
14      private static final String TEST_STRING = getUnicodePunctuationCharacters() + getLatin1PunctuationCharacters();
15      private static final String TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED = replaceUnicodePunctuationWithAscii(TEST_STRING);
16      private static final String TEST_STRING_WITH_WINDOWS_1252_PUNCTUATION_REPLACED = replaceWindows1252PunctuationWithAscii(TEST_STRING);
17  
18      private StubEncodingFilter encodingFilter;
19      private ServletStubs.Request request;
20      private ServletStubs.Response response;
21      private FilterChain filterChain;
22      private Map<String, Object> result;
23  
24      protected void setUp() throws Exception
25      {
26          super.setUp();
27  
28          request = ServletStubs.getRequestInstance();
29          request.setParameter("param", TEST_STRING);
30          request.addParameter("paramValues", TEST_STRING);
31          request.addParameter("paramValues", TEST_STRING);
32          request.setParameterMap(new HashMap<String, String[]>() {{
33              put("param", new String[]{ "not used -- filter calls getParameterValues" });
34              put("paramValues", new String[]{ "not used -- filter calls getParameterValues" });
35          }});
36  
37          response = ServletStubs.getResponseInstance();
38  
39          encodingFilter = new StubEncodingFilter();
40  
41          result = new HashMap<String, Object>();
42          filterChain = new FilterChain() {
43              public void doFilter(ServletRequest request, ServletResponse response)
44              {
45                  result.put("param", request.getParameter("param"));
46                  result.put("paramValues", request.getParameterValues("paramValues"));
47                  result.put("paramMap", request.getParameterMap());
48              }
49          };
50      }
51  
52      public void testUnicodePunctuationRemovedFromParametersInUtf8() throws Exception
53      {
54          encodingFilter.setEncoding("UTF-8");
55          encodingFilter.doFilter(request, response, filterChain);
56  
57          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, result.get("param"));
58      }
59  
60      public void testPunctuationRemovedByGetParameterValues() throws Exception
61      {
62          encodingFilter.setEncoding("UTF-8");
63          encodingFilter.doFilter(request, response, filterChain);
64  
65          String[] values = (String[]) result.get("paramValues");
66          assertEquals(2, values.length);
67          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, values[0]);
68          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, values[1]);
69      }
70  
71      public void testPunctuationRemovedByGetParameterMap() throws Exception
72      {
73          encodingFilter.setEncoding("UTF-8");
74          encodingFilter.doFilter(request, response, filterChain);
75  
76          //noinspection unchecked
77          Map<String, String[]> map = (Map<String, String[]>) result.get("paramMap");
78          assertEquals(2, map.size());
79          assertEquals(1, map.get("param").length);
80          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, map.get("param")[0]);
81          assertEquals(2, map.get("paramValues").length);
82          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, map.get("paramValues")[0]);
83          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, map.get("paramValues")[1]);
84      }
85  
86      public void testWindows1252PunctuationRemovedFromParametersInLatin1() throws Exception
87      {
88          encodingFilter.setEncoding("ISO-8859-1");
89          encodingFilter.doFilter(request, response, filterChain);
90  
91          assertEquals(TEST_STRING_WITH_WINDOWS_1252_PUNCTUATION_REPLACED, result.get("param"));
92      }
93  
94      public void testUnicodePunctuationRemovedFromParametersInWindows1252() throws Exception
95      {
96          encodingFilter.setEncoding("Windows-1252");
97          encodingFilter.doFilter(request, response, filterChain);
98  
99          assertEquals(TEST_STRING_WITH_UNICODE_PUNCTUATION_REPLACED, result.get("param"));
100     }
101 
102     public void testParametersUnchangedInUtf16() throws Exception
103     {
104         encodingFilter.setEncoding("UTF-16");
105         encodingFilter.doFilter(request, response, filterChain);
106 
107         assertEquals(TEST_STRING, result.get("param"));
108     }
109 
110     /**
111      * Returns the "General Punctuation" range of characters in Unicode as a String.
112      */
113     private static String getUnicodePunctuationCharacters()
114     {
115         StringBuffer result = new StringBuffer(200);
116         for (char c = '\u2000'; c < '\u2070'; c++) // General Punctuation range
117             result.append(c);
118         return result.toString();
119     }
120 
121     /**
122      * Returns punctuation characters in Latin-1 and Windows-1252 as a String.
123      */
124     private static String getLatin1PunctuationCharacters()
125     {
126         StringBuffer result = new StringBuffer(200);
127         for (char c = '\u0020'; c <= '\u0040'; c++) // ASCII punctuation and digits
128             result.append(c);
129         for (char c = '\u005b'; c <= '\u0060'; c++) // more ASCII punctuation
130             result.append(c);
131         for (char c = '\u007b'; c <= '\u007e'; c++) // more ASCII punctuation
132             result.append(c);
133         for (char c = '\u0080'; c < '\u0100'; c++) // high-bit Latin-1, includes Windows-1252 punctuation
134             result.append(c);
135         return result.toString();
136     }
137 
138     /**
139      * Replaces the Unicode punctuation with a rough ASCII equivalent. The same replacements
140      * as the encoding filter, but not as fast.
141      */
142     private static String replaceUnicodePunctuationWithAscii(String input)
143     {
144         StringBuffer output = new StringBuffer(input.length() + 50); // room for ellipsis
145         for (int i=0; i<input.length(); i++)
146         {
147             char c = input.charAt(i);
148             switch (c) {
149                 case '\u00b7': // MIDDLE DOT
150                     output.append("- ");
151                     break;
152                 case '\u2013': // EN DASH
153                     output.append("-");
154                     break;
155                 case '\u2018': // LEFT SINGLE QUOTATION MARK
156                 case '\u2019': // RIGHT SINGLE QUOTATION MARK
157                     output.append('\'');
158                     break;
159                 case '\u201c': // LEFT DOUBLE QUOTATION MARK
160                 case '\u201d': // RIGHT DOUBLE QUOTATION MARK
161                     output.append('"');
162                     break;
163                 case '\u2026': // HORIZONTAL ELLIPSIS
164                     output.append("...");
165                     break;
166                 default:
167                     output.append(c);
168             }
169         }
170         return output.toString();
171     }
172 
173     /**
174      * Replaces the Windows-1252 punctuation with a rough ASCII equivalent. The same replacements
175      * as the encoding filter, but not as fast.
176      */
177     private static String replaceWindows1252PunctuationWithAscii(String input)
178     {
179         StringBuffer output = new StringBuffer(input.length() + 50); // room for ellipsis
180         for (int i=0; i<input.length(); i++)
181         {
182             char c = input.charAt(i);
183             switch (c) {
184                 case 133: // HORIZONTAL ELLIPSIS
185                     output.append("...");
186                     break;
187                 case 145: // LEFT SINGLE QUOTATION MARK
188                 case 146: // RIGHT SINGLE QUOTATION MARK
189                     output.append('\'');
190                     break;
191                 case 147: // LEFT DOUBLE QUOTATION MARK
192                 case 148: // RIGHT DOUBLE QUOTATION MARK
193                     output.append('"');
194                     break;
195                 case 150: // EN DASH
196                     output.append("-");
197                     break;
198                 case 183: // MIDDLE DOT
199                     output.append("- ");
200                     break;
201                 default:
202                     output.append(c);
203             }
204         }
205         return output.toString();
206     }
207 }