001/*
002 * This library is part of OpenCms -
003 * the Open Source Content Management System
004 *
005 * Copyright (c) Alkacon Software GmbH (http://www.alkacon.com)
006 *
007 * This library is free software; you can redistribute it and/or
008 * modify it under the terms of the GNU Lesser General Public
009 * License as published by the Free Software Foundation; either
010 * version 2.1 of the License, or (at your option) any later version.
011 *
012 * This library is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015 * Lesser General Public License for more details.
016 *
017 * For further information about Alkacon Software GmbH, please see the
018 * company website: http://www.alkacon.com
019 *
020 * For further information about OpenCms, please see the
021 * project website: http://www.opencms.org
022 * 
023 * You should have received a copy of the GNU Lesser General Public
024 * License along with this library; if not, write to the Free Software
025 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
026 */
027
028package org.opencms.util;
029
030
031import java.util.ArrayList;
032import java.util.Arrays;
033import java.util.Iterator;
034import java.util.List;
035
036import org.htmlparser.Parser;
037import org.htmlparser.PrototypicalNodeFactory;
038import org.htmlparser.Remark;
039import org.htmlparser.Tag;
040import org.htmlparser.Text;
041import org.htmlparser.lexer.Lexer;
042import org.htmlparser.lexer.Page;
043import org.htmlparser.util.ParserException;
044import org.htmlparser.visitors.NodeVisitor;
045
046/**
047 * Base utility class for OpenCms <code>{@link org.htmlparser.visitors.NodeVisitor}</code>
048 * implementations, which provides some often used utility functions.
049 * <p>
050 * 
051 * This base implementation is only a "pass through" class, that is the content is parsed, but the
052 * generated result is exactly identical to the input.
053 * <p>
054 * 
055 * @since 6.2.0
056 */
057public class CmsHtmlParser extends NodeVisitor implements I_CmsHtmlNodeVisitor {
058
059    /** List of upper case tag name strings of tags that should not be auto-corrected if closing divs are missing. */
060    protected List<String> m_noAutoCloseTags;
061
062    /** The array of supported tag names. */
063    // important: don't change the order of these tags in the source, subclasses may expect the tags
064    // at the exact indices give here
065    // if you want to add tags, add them at the end
066    protected static final String[] TAG_ARRAY = new String[] {
067        "H1",
068        "H2",
069        "H3",
070        "H4",
071        "H5",
072        "H6",
073        "P",
074        "DIV",
075        "SPAN",
076        "BR",
077        "OL",
078        "UL",
079        "LI",
080        "TABLE",
081        "TD",
082        "TR",
083        "TH",
084        "THEAD",
085        "TBODY",
086        "TFOOT"};
087
088    /** The list of supported tag names. */
089    protected static final List<String> TAG_LIST = Arrays.asList(TAG_ARRAY);
090
091    /** Indicates if "echo" mode is on, that is all content is written to the result by default. */
092    protected boolean m_echo;
093
094    /** The buffer to write the out to. */
095    protected StringBuffer m_result;
096
097    /** The providable configuration - never null by contract of interface. */
098    private String m_configuration = "";
099
100    /**
101     * Creates a new instance of the html converter with echo mode set to <code>false</code>.
102     * <p>
103     */
104    public CmsHtmlParser() {
105
106        this(false);
107    }
108
109    /**
110     * Creates a new instance of the html converter.
111     * <p>
112     * 
113     * @param echo indicates if "echo" mode is on, that is all content is written to the result
114     */
115    public CmsHtmlParser(boolean echo) {
116
117        m_result = new StringBuffer(1024);
118        m_echo = echo;
119        m_noAutoCloseTags = new ArrayList<String>(32);
120    }
121
122    /**
123     * Internally degrades Composite tags that do have children in the DOM tree 
124     * to simple single tags. This allows to avoid auto correction of unclosed HTML tags.<p>
125     * 
126     * @return A node factory that will not autocorrect open tags specified via <code>{@link #setNoAutoCloseTags(List)}</code>
127     */
128    protected PrototypicalNodeFactory configureNoAutoCorrectionTags() {
129
130        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
131
132        String tagName;
133        Iterator<String> it = m_noAutoCloseTags.iterator();
134        CmsNoAutoCloseTag noAutoCloseTag;
135        while (it.hasNext()) {
136            tagName = it.next();
137            noAutoCloseTag = new CmsNoAutoCloseTag(new String[] {tagName});
138            // TODO: This might break in case registering / unregistering  will change from name based to tag-type based approach:
139            factory.unregisterTag(noAutoCloseTag);
140            factory.registerTag(noAutoCloseTag);
141        }
142        return factory;
143    }
144
145    /**
146     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getConfiguration()
147     */
148    public String getConfiguration() {
149
150        return m_configuration;
151    }
152
153    /**
154     * @see org.opencms.util.I_CmsHtmlNodeVisitor#getResult()
155     */
156    public String getResult() {
157
158        return m_result.toString();
159    }
160
161    /**
162     * Returns the HTML for the given tag itself (not the tag content).
163     * <p>
164     * 
165     * @param tag the tag to create the HTML for
166     * 
167     * @return the HTML for the given tag
168     */
169    public String getTagHtml(Tag tag) {
170
171        StringBuffer result = new StringBuffer(32);
172        result.append('<');
173        result.append(tag.getText());
174        result.append('>');
175        return result.toString();
176    }
177
178    /**
179     * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
180     */
181    public String process(String html, String encoding) throws ParserException {
182
183        m_result = new StringBuffer();
184        Parser parser = new Parser();
185        Lexer lexer = new Lexer();
186
187        // initialize the page with the given char set
188        Page page = new Page(html, encoding);
189        lexer.setPage(page);
190        parser.setLexer(lexer);
191
192        if (m_noAutoCloseTags != null && m_noAutoCloseTags.size() > 0) {
193            // Degrade Composite tags that do have children in the DOM tree 
194            // to simple single tags: This allows to finish this tag with opened HTML tags without the effect 
195            // that html parser will generate the closing tags. 
196            PrototypicalNodeFactory factory = configureNoAutoCorrectionTags();
197            lexer.setNodeFactory(factory);
198        }
199
200        // process the page using the given visitor
201        parser.visitAllNodesWith(this);
202        // return the result
203        return getResult();
204    }
205
206    /**
207     * 
208     * @see org.opencms.util.I_CmsHtmlNodeVisitor#setConfiguration(java.lang.String)
209     */
210    public void setConfiguration(String configuration) {
211
212        if (CmsStringUtil.isNotEmpty(configuration)) {
213            m_configuration = configuration;
214        }
215
216    }
217
218    /**
219     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitEndTag(org.htmlparser.Tag)
220     */
221    @Override
222    public void visitEndTag(Tag tag) {
223
224        if (m_echo) {
225            m_result.append(getTagHtml(tag));
226        }
227    }
228
229    /**
230     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitRemarkNode(org.htmlparser.Remark)
231     */
232    @Override
233    public void visitRemarkNode(Remark remark) {
234
235        if (m_echo) {
236            m_result.append(remark.toHtml(true));
237        }
238    }
239
240    /**
241     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitStringNode(org.htmlparser.Text)
242     */
243    @Override
244    public void visitStringNode(Text text) {
245
246        if (m_echo) {
247            m_result.append(text.getText());
248        }
249    }
250
251    /**
252     * @see org.opencms.util.I_CmsHtmlNodeVisitor#visitTag(org.htmlparser.Tag)
253     */
254    @Override
255    public void visitTag(Tag tag) {
256
257        if (m_echo) {
258            m_result.append(getTagHtml(tag));
259        }
260    }
261
262    /**
263     * Collapse HTML whitespace in the given String.<p>
264     * 
265     * @param string the string to collapse
266     * 
267     * @return the input String with all HTML whitespace collapsed
268     */
269    protected String collapse(String string) {
270
271        int len = string.length();
272        StringBuffer result = new StringBuffer(len);
273        int state = 0;
274        for (int i = 0; i < len; i++) {
275            char c = string.charAt(i);
276            switch (c) {
277                // see HTML specification section 9.1 White space
278                // http://www.w3.org/TR/html4/struct/text.html#h-9.1
279                case '\u0020':
280                case '\u0009':
281                case '\u000C':
282                case '\u200B':
283                case '\r':
284                case '\n':
285                    if (0 != state) {
286                        state = 1;
287                    }
288                    break;
289                default:
290                    if (1 == state) {
291                        result.append(' ');
292                    }
293                    state = 2;
294                    result.append(c);
295            }
296        }
297        return result.toString();
298    }
299
300    /**
301     * Returns a list of upper case tag names for which parsing / visiting will not correct missing closing tags.<p>
302     * 
303     * @return a List of upper case tag names for which parsing / visiting will not correct missing closing tags
304     */
305    public List<String> getNoAutoCloseTags() {
306
307        return m_noAutoCloseTags;
308    }
309
310    /**
311     * Sets a list of upper case tag names for which parsing / visiting should not correct missing closing tags.<p> 
312     * 
313     * @param noAutoCloseTagList a list of upper case tag names for which parsing / visiting 
314     *      should not correct missing closing tags to set.
315     */
316    public void setNoAutoCloseTags(List<String> noAutoCloseTagList) {
317
318        // ensuring upper case
319        m_noAutoCloseTags.clear();
320        if (noAutoCloseTagList != null) {
321            Iterator<String> it = noAutoCloseTagList.iterator();
322            while (it.hasNext()) {
323                m_noAutoCloseTags.add((it.next()).toUpperCase());
324            }
325        }
326    }
327}