1   package com.atlassian.mail;
2   
3   import org.apache.log4j.Category;
4   
5   import javax.swing.text.html.HTML;
6   import javax.swing.text.html.HTMLEditorKit;
7   import javax.swing.text.html.parser.ParserDelegator;
8   import java.io.*;
9   import java.util.ArrayList;
10  
11  /**
12   * Helper class to convert arbitrary HTML documents into text. Conversion is very basic, intended
13   * to be used to strip markup from HTML-only emails for inclusion within JIRA issue reports or
14   * comments.
15   */
16  public class HtmlToTextConverter
17  {
18      private static final Category log = Category.getInstance(HtmlToTextConverter.class);
19  
20      private class HTMLCallbackHandler extends HTMLEditorKit.ParserCallback {
21  
22          Writer out;
23          boolean started = false;
24          boolean inBody = false;
25          boolean inList = false;
26          boolean firstTD = true;
27          int listCount = -1;
28          ArrayList links = new ArrayList();
29  
30          private static final String NEWLINE = "\n";
31          private static final String TAB = "\t";
32          private static final String STAR = "*";
33          private static final String SPACE = " ";
34          private static final String PERIOD = ".";
35          private static final String OPEN_BRACKET = "[";
36          private static final String CLOSE_BRACKET = "]";
37          private static final String DASH_LINE = "----------------------------------------------------------------------------------------";
38  
39          // Note: the "position" parameter for all the methods below denotes our
40          // character position in the source document. Thus, we ignore it a lot.
41          public HTMLCallbackHandler(Writer writer) {
42              out = writer;
43          }
44  
45          public void handleStartTag(HTML.Tag tag, javax.swing.text.MutableAttributeSet set, int position) {
46              try
47              {
48                  if (inBody && started && tag.equals(HTML.Tag.P))
49                  {
50                      out.write(NEWLINE + NEWLINE);
51                  }
52                  else if (inBody && started && tag.equals(HTML.Tag.OL) || tag.equals(HTML.Tag.UL))
53                  {
54                      inList = true;
55                      out.write(NEWLINE + NEWLINE);
56                      if(tag.equals(HTML.Tag.OL))
57                          listCount = 1;
58                  }
59                  else if (inBody && started && inList && tag.equals(HTML.Tag.LI))
60                  {
61                      out.write(NEWLINE);
62                      if(listCount != -1)
63                      {
64                          out.write(listCount + PERIOD + SPACE);
65                          listCount++;
66                      }
67                      else
68                          out.write(STAR);
69                  }
70                  else if (inBody && started && tag.equals(HTML.Tag.TABLE))
71                  {
72                      out.write(NEWLINE);
73                  }
74                  else if (inBody && started && tag.equals(HTML.Tag.TR))
75                  {
76                      out.write(NEWLINE);
77                      firstTD = true;
78                  }
79                  else if (inBody && started && tag.equals(HTML.Tag.TD) || tag.equals(HTML.Tag.TH))
80                  {
81                      if(!firstTD)
82                      {
83                          out.write(TAB);
84                      }
85                      else
86                      {
87                          firstTD = false;
88                      }
89                  }
90                  else if (inBody && started && tag.equals(HTML.Tag.PRE))
91                  {
92                      out.write(NEWLINE);
93                  }
94                  else if (inBody && started && tag.equals(HTML.Tag.IMG))
95                  {
96                      // Check if the img has a src attribute
97                      handleLink((String)set.getAttribute(HTML.Attribute.SRC));
98                  }
99                  else if (inBody && started && tag.equals(HTML.Tag.A))
100                 {
101                     // Check if the img has a src attribute
102                     handleLink((String)set.getAttribute(HTML.Attribute.HREF));
103                 }
104                 else if (inBody && started && tag.equals(HTML.Tag.HR))
105                 {
106                     out.write(NEWLINE + DASH_LINE);
107                 }
108                 else if (inBody && started && tag.equals(HTML.Tag.H1) || tag.equals(HTML.Tag.H2) || tag.equals(HTML.Tag.H3) || tag.equals(HTML.Tag.H4) || tag.equals(HTML.Tag.H5) || tag.equals(HTML.Tag.H6))
109                 {
110                     out.write(NEWLINE);
111                 }
112                 else if (tag.equals(HTML.Tag.BODY))
113                 {
114                     inBody = true;
115                 }
116             }
117             catch (IOException e)
118             {
119                 log.warn("IO error converting HTML to text", e);
120             }
121 
122         }
123 
124         private void handleLink(String src) throws IOException
125         {
126             if(src != null)
127             {
128                 links.add(src);
129                 out.write(OPEN_BRACKET + links.size() + CLOSE_BRACKET);
130             }
131         }
132 
133         public void handleEndTag(HTML.Tag tag, int position) {
134             if (inBody && started && tag.equals(HTML.Tag.OL) || tag.equals(HTML.Tag.UL))
135             {
136                 inList = false;
137                 if(tag.equals(HTML.Tag.OL))
138                     listCount = -1;
139             }
140             else if (tag.equals(HTML.Tag.BODY))
141             {
142                 if(links.size() != 0)
143                 {
144                     // write out the links
145                     try
146                     {
147                         out.write(NEWLINE + DASH_LINE + NEWLINE);
148                         for (int i = 0; i < links.size(); i++)
149                         {
150                             String src = (String)links.get(i);
151                             out.write(OPEN_BRACKET + (i + 1) + CLOSE_BRACKET + SPACE + src);
152                             if((i + 1) < links.size())
153                             {
154                                 out.write(NEWLINE);
155                             }
156                         }
157                     }
158                     catch (IOException e)
159                     {
160                         e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
161                     }
162                 }
163                 inBody = false;
164             }
165         }
166 
167         public void handleText(char[] aChar, int position) {
168             try
169             {
170                 if (inBody)
171                 {
172                     out.write(aChar);
173                     started = true;
174                 }
175             }
176             catch (IOException e)
177             {
178                 log.warn("IO error converting HTML to text", e);
179             }
180         }
181 
182         public void handleSimpleTag(HTML.Tag tag, javax.swing.text.MutableAttributeSet a, int pos) {
183             try
184             {
185                 if (inBody && started && tag.equals(HTML.Tag.BR))
186                     out.write(NEWLINE);
187             }
188             catch (IOException e)
189             {
190                 log.warn("IO error converting HTML to text", e);
191             }
192 
193         }
194     }
195 
196 
197     public String convert(String html) throws IOException
198     {
199         StringWriter out = new StringWriter();
200         convert(new StringReader(html), out);
201         out.close();
202         return out.toString();
203     }
204 
205     private void convert(Reader reader, Writer writer) throws IOException
206     {
207         HTMLCallbackHandler handler = new HTMLCallbackHandler(writer);
208         new ParserDelegator().parse(reader, handler, true);
209     }
210 }