/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code>
 * implementations, which provides some often used utility functions.
 * <p>
 *
 * This base implementation is only a "pass through" class, that is the content is parsed, but the
 * generated result is exactly identical to the input.
 * <p>
 *
 * The visit methods of this class only write to the result buffer while "echo" mode is on;
 * with echo mode off they do nothing.
 * <p>
 *
 * @since 6.2.0
 */
public class CmsHtmlParser extends NodeVisitor implements I_CmsHtmlNodeVisitor {

    /** List of upper case tag name strings of tags that should not be auto-corrected if closing tags are missing. */
    protected List<String> m_noAutoCloseTags;

    /** The array of supported tag names. */
    // important: don't change the order of these tags in the source, subclasses may expect the tags
    // at the exact indices given here
    // if you want to add tags, add them at the end
    protected static final String[] TAG_ARRAY = new String[] {
        "H1",
        "H2",
        "H3",
        "H4",
        "H5",
        "H6",
        "P",
        "DIV",
        "SPAN",
        "BR",
        "OL",
        "UL",
        "LI",
        "TABLE",
        "TD",
        "TR",
        "TH",
        "THEAD",
        "TBODY",
        "TFOOT"};

    /** The list of supported tag names. */
    protected static final List<String> TAG_LIST = Arrays.asList(TAG_ARRAY);

    /** Indicates if "echo" mode is on, that is all content is written to the result by default. */
    protected boolean m_echo;

    /** The buffer to write the output to. */
    protected StringBuffer m_result;

    /** The providable configuration - never null by contract of interface. */
    private String m_configuration = "";

    /**
     * Creates a new instance of the html converter with echo mode set to <code>false</code>.
     * <p>
     */
    public CmsHtmlParser() {

        this(false);
    }

    /**
     * Creates a new instance of the html converter.
     * <p>
     *
     * @param echo indicates if "echo" mode is on, that is all content is written to the result
     */
    public CmsHtmlParser(boolean echo) {

        m_result = new StringBuffer(1024);
        m_echo = echo;
        m_noAutoCloseTags = new ArrayList<String>(32);
    }

    /**
     * Internally degrades Composite tags that do have children in the DOM tree
     * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
     *
     * For every configured tag name a <code>CmsNoAutoCloseTag</code> is created, any tag already
     * registered for it is unregistered, and the new tag is registered in its place. If no tag names
     * are configured, an unmodified default factory is returned.<p>
     *
     * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code>
     */
    protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {

        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        if (m_noAutoCloseTags == null) {
            // the field is protected, so a subclass may have reset it; nothing to configure then
            return factory;
        }
        for (String tagName : m_noAutoCloseTags) {
            CmsNoAutoCloseTag noAutoCloseTag = new CmsNoAutoCloseTag(new String[] {tagName});
            // TODO: This might break in case registering / unregistering will change from name based to tag-type based approach:
            factory.unregisterTag(noAutoCloseTag);
            factory.registerTag(noAutoCloseTag);
        }
        return factory;
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration()
     */
    public String getConfiguration() {

        return m_configuration;
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult()
     */
    public String getResult() {

        return m_result.toString();
    }

    /**
     * Returns the HTML for the given tag itself (not the tag content).
     * <p>
     *
     * The result is the text of the tag wrapped in angle brackets.
     * <p>
     *
     * @param tag the tag to create the HTML for
     *
     * @return the HTML for the given tag
     */
    public String getTagHtml(Tag tag) {

        // method-local buffer, so the unsynchronized StringBuilder is sufficient
        StringBuilder result = new StringBuilder(32);
        result.append('<');
        result.append(tag.getText());
        result.append('>');
        return result.toString();
    }

    /**
     * Parses the given HTML with this instance as visitor and returns the collected result.
     * <p>
     *
     * The result buffer is replaced by a fresh one on every call, so output of earlier
     * calls is not carried over.
     * <p>
     *
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
     */
    public String process(String html, String encoding) throws ParserException {

        m_result = new StringBuffer();
        Parser parser = new Parser();
        Lexer lexer = new Lexer();

        // initialize the page with the given char set
        Page page = new Page(html, encoding);
        lexer.setPage(page);
        parser.setLexer(lexer);

        if ((m_noAutoCloseTags != null) && !m_noAutoCloseTags.isEmpty()) {
            // Degrade Composite tags that do have children in the DOM tree
            // to simple single tags: This allows to finish this tag with opened HTML tags without the effect
            // that html parser will generate the closing tags.
            PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
            lexer.setNodeFactory(factory);
        }

        // process the page using the given visitor
        parser.visitAllNodesWith(this);
        // return the result
        return getResult();
    }

    /**
     * Sets the configuration; the current value is kept if the given one is rejected by
     * <code>CmsStringUtil.isNotEmpty</code>.
     * <p>
     *
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String)
     */
    public void setConfiguration(String configuration) {

        if (CmsStringUtil.isNotEmpty(configuration)) {
            m_configuration = configuration;
        }
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag)
     */
    @Override
    public void visitEndTag(Tag tag) {

        if (m_echo) {
            m_result.append(getTagHtml(tag));
        }
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark)
     */
    @Override
    public void visitRemarkNode(Remark remark) {

        if (m_echo) {
            m_result.append(remark.toHtml(true));
        }
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text)
     */
    @Override
    public void visitStringNode(Text text) {

        if (m_echo) {
            m_result.append(text.getText());
        }
    }

    /**
     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag)
     */
    @Override
    public void visitTag(Tag tag) {

        if (m_echo) {
            m_result.append(getTagHtml(tag));
        }
    }

    /**
     * Collapse HTML whitespace in the given String.<p>
     *
     * Leading and trailing whitespace is dropped, and every run of whitespace between two
     * non-whitespace characters is replaced by a single space character.<p>
     *
     * @param string the string to collapse, must not be <code>null</code>
     *
     * @return the input String with all HTML whitespace collapsed
     */
    protected String collapse(String string) {

        int len = string.length();
        StringBuilder result = new StringBuilder(len);
        // 0: nothing written yet (leading whitespace is skipped)
        // 1: whitespace seen after content, a single space is pending
        // 2: last character seen was content
        int state = 0;
        for (int i = 0; i < len; i++) {
            char c = string.charAt(i);
            switch (c) {
                // see HTML specification section 9.1 White space
                // http://www.w3.org/TR/html4/struct/text.html#h-9.1
                case '\u0020':
                case '\u0009':
                case '\u000C':
                case '\u200B':
                case '\r':
                case '\n':
                    if (0 != state) {
                        state = 1;
                    }
                    break;
                default:
                    if (1 == state) {
                        // emit the pending space only now, so trailing whitespace never reaches the result
                        result.append(' ');
                    }
                    state = 2;
                    result.append(c);
            }
        }
        return result.toString();
    }

    /**
     * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p>
     *
     * The returned list is the internal list of this instance, not a copy.<p>
     *
     * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags
     */
    public List<String> getNoAutoCloseTags() {

        return m_noAutoCloseTags;
    }

    /**
     * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p>
     *
     * The given names are converted to upper case independently of the default locale of the JVM.
     * The internal list is only modified after all names have been converted, so passing the list
     * obtained from <code>{@link #getNoAutoCloseTags()}</code> is safe, and a <code>null</code> element
     * (which causes a <code>NullPointerException</code>) leaves the previous configuration untouched.<p>
     *
     * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting
     *      should not correct missing closing tags to set, <code>null</code> clears the list.
     */
    public void setNoAutoCloseTags(List<String> noAutoCloseTagList) {

        // normalize into a temporary list first: the argument may be the internal list itself
        List<String> upperCaseTags = new ArrayList<String>(32);
        if (noAutoCloseTagList != null) {
            for (String tagName : noAutoCloseTagList) {
                // fixed locale: with e.g. a Turkish default locale "div" would become "D\u0130V"
                upperCaseTags.add(tagName.toUpperCase(Locale.ENGLISH));
            }
        }
        if (m_noAutoCloseTags == null) {
            // the field is protected, so a subclass may have reset it
            m_noAutoCloseTags = new ArrayList<String>(32);
        } else {
            // keep the same list instance, callers may hold a reference obtained from the getter
            m_noAutoCloseTags.clear();
        }
        m_noAutoCloseTags.addAll(upperCaseTags);
    }
}