Source code

001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.StringUtil;
007import org.jsoup.nodes.Comment;
008import org.jsoup.nodes.Document;
009import org.jsoup.nodes.Element;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.XmlDeclaration;
012import org.jsoup.parser.Parser;
013import org.jsoup.parser.StreamParser;
014import org.jsoup.select.Elements;
015import org.jspecify.annotations.Nullable;
016
017import java.io.BufferedReader;
018import java.io.CharArrayReader;
019import java.io.File;
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.InputStreamReader;
023import java.io.OutputStream;
024import java.io.Reader;
025import java.io.UncheckedIOException;
026import java.nio.Buffer;
027import java.nio.ByteBuffer;
028import java.nio.CharBuffer;
029import java.nio.channels.Channels;
030import java.nio.channels.SeekableByteChannel;
031import java.nio.charset.Charset;
032import java.nio.charset.IllegalCharsetNameException;
033import java.nio.file.Files;
034import java.nio.file.Path;
035import java.util.Locale;
036import java.util.Random;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039import java.util.zip.GZIPInputStream;
040
041import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
042
043/**
044 * Internal static utilities for handling data.
045 *
046 */
047@SuppressWarnings("CharsetObjectCanBeUsed")
048public final class DataUtil {
049    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
050    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
051    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
052    private static final int firstReadBufferSize = 1024 * 5;
053    private static final char[] mimeBoundaryChars =
054            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
055    static final int boundaryLength = 32;
056
057    private DataUtil() {}
058
059    /**
060     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
061     * are supported in addition to uncompressed files.
062     *
063     * @param file file to load
064     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
065     *     the file will always override this setting.
066     * @param baseUri base URI of document, to resolve relative links against
067     * @return Document
068     * @throws IOException on IO error
069     */
070    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
071        return load(file.toPath(), charsetName, baseUri);
072    }
073
074    /**
075     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
076     * are supported in addition to uncompressed files.
077     *
078     * @param file file to load
079     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
080     *     the file will always override this setting.
081     * @param baseUri base URI of document, to resolve relative links against
082     * @param parser alternate {@link Parser#xmlParser() parser} to use.
083
084     * @return Document
085     * @throws IOException on IO error
086     * @since 1.14.2
087     */
088    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
089        return load(file.toPath(), charsetName, baseUri, parser);
090    }
091
092    /**
093     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
094     * are supported in addition to uncompressed files.
095     *
096     * @param path file to load
097     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
098     *     the file will always override this setting.
099     * @param baseUri base URI of document, to resolve relative links against
100     * @return Document
101     * @throws IOException on IO error
102     */
103    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
104        return load(path, charsetName, baseUri, Parser.htmlParser());
105    }
106
107    /**
108     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
109     * are supported in addition to uncompressed files.
110     *
111     * @param path file to load
112     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
113     * the file will always override this setting.
114     * @param baseUri base URI of document, to resolve relative links against
115     * @param parser alternate {@link Parser#xmlParser() parser} to use.
116
117     * @return Document
118     * @throws IOException on IO error
119     * @since 1.17.2
120     */
121    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
122        InputStream stream = openStream(path);
123        return parseInputStream(stream, charsetName, baseUri, parser);
124    }
125
126    /**
127     * Returns a {@link StreamParser} that will parse the supplied file progressively.
128     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
129     * are supported in addition to uncompressed files.
130     *
131     * @param path file to load
132     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
133     * A BOM in the file will always override this setting.
134     * @param baseUri base URI of document, to resolve relative links against
135     * @param parser alternate {@link Parser#xmlParser() parser} to use.
136
137     * @return Document
138     * @throws IOException on IO error
139     * @since 1.18.2
140     * @see Connection.Response#streamParser()
141     */
142    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
143        StreamParser streamer = new StreamParser(parser);
144        String charsetName = charset != null? charset.name() : null;
145        DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
146        BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
147        maybeSkipBom(reader, charsetDoc);
148        streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
149
150        return streamer;
151    }
152
153    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
154    private static InputStream openStream(Path path) throws IOException {
155        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
156        InputStream stream = Channels.newInputStream(byteChannel);
157        String name = Normalizer.lowerCase(path.getFileName().toString());
158        if (name.endsWith(".gz") || name.endsWith(".z")) {
159            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
160            byteChannel.position(0); // reset to start of file
161            if (zipped) {
162                stream = new GZIPInputStream(stream);
163            }
164        }
165        return stream;
166    }
167
168    /**
169     * Parses a Document from an input steam.
170     * @param in input stream to parse. The stream will be closed after reading.
171     * @param charsetName character set of input (optional)
172     * @param baseUri base URI of document, to resolve relative links against
173     * @return Document
174     * @throws IOException on IO error
175     */
176    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
177        return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
178    }
179
180    /**
181     * Parses a Document from an input steam, using the provided Parser.
182     * @param in input stream to parse. The stream will be closed after reading.
183     * @param charsetName character set of input (optional)
184     * @param baseUri base URI of document, to resolve relative links against
185     * @param parser alternate {@link Parser#xmlParser() parser} to use.
186     * @return Document
187     * @throws IOException on IO error
188     */
189    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
190        return parseInputStream(in, charsetName, baseUri, parser);
191    }
192
193    /**
194     * Writes the input stream to the output stream. Doesn't close them.
195     * @param in input stream to read from
196     * @param out output stream to write to
197     * @throws IOException on IO error
198     */
199    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
200        final byte[] buffer = new byte[DefaultBufferSize];
201        int len;
202        while ((len = in.read(buffer)) != -1) {
203            out.write(buffer, 0, len);
204        }
205    }
206
207    /** A struct to return a detected charset, and a document (if fully read). */
208    static class CharsetDoc {
209        Charset charset;
210        InputStream input;
211        @Nullable Document doc;
212        boolean skip;
213
214        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
215            this.charset = charset;
216            this.input = input;
217            this.doc = doc;
218            this.skip = skip;
219        }
220    }
221
222    static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
223        if (input == null) // empty body // todo reconsider?
224            return new Document(baseUri);
225
226        final Document doc;
227        CharsetDoc charsetDoc = null;
228        try {
229            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
230            doc = parseInputStream(charsetDoc, baseUri, parser);
231        } finally {
232            if (charsetDoc != null)
233                charsetDoc.input.close();
234        }
235        return doc;
236    }
237
238    static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
239        Document doc = null;
240
241        // read the start of the stream and look for a BOM or meta charset
242        InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
243        wrappedInputStream.mark(DefaultBufferSize);
244        ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
245        boolean fullyRead = (wrappedInputStream.read() == -1);
246        wrappedInputStream.reset();
247
248        // look for BOM - overrides any other header or input
249        BomCharset bomCharset = detectCharsetFromBom(firstBytes);
250        if (bomCharset != null)
251            charsetName = bomCharset.charset;
252
253        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
254            try {
255                CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
256                if (defaultDecoded.hasArray())
257                    doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
258                else
259                    doc = parser.parseInput(defaultDecoded.toString(), baseUri);
260            } catch (UncheckedIOException e) {
261                throw e.getCause();
262            }
263
264            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
265            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
266            String foundCharset = null; // if not found, will keep utf-8 as best attempt
267            for (Element meta : metaElements) {
268                if (meta.hasAttr("http-equiv"))
269                    foundCharset = getCharsetFromContentType(meta.attr("content"));
270                if (foundCharset == null && meta.hasAttr("charset"))
271                    foundCharset = meta.attr("charset");
272                if (foundCharset != null)
273                    break;
274            }
275
276            // look for <?xml encoding='ISO-8859-1'?>
277            if (foundCharset == null && doc.childNodeSize() > 0) {
278                Node first = doc.childNode(0);
279                XmlDeclaration decl = null;
280                if (first instanceof XmlDeclaration)
281                    decl = (XmlDeclaration) first;
282                else if (first instanceof Comment) {
283                    Comment comment = (Comment) first;
284                    if (comment.isXmlDeclaration())
285                        decl = comment.asXmlDeclaration();
286                }
287                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
288                    foundCharset = decl.attr("encoding");
289                }
290            }
291            foundCharset = validateCharset(foundCharset);
292            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
293                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
294                charsetName = foundCharset;
295                doc = null;
296            } else if (!fullyRead) {
297                doc = null;
298            }
299        } else { // specified by content type header (or by user on file load)
300            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
301        }
302
303        // finally: prepare the return struct
304        if (charsetName == null)
305            charsetName = defaultCharsetName;
306        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
307        boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
308        // if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
309        return new CharsetDoc(charset, doc, wrappedInputStream, skip);
310    }
311
312    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
313        // if doc != null it was fully parsed during charset detection; so just return that
314        if (charsetDoc.doc != null)
315            return charsetDoc.doc;
316
317        final InputStream input = charsetDoc.input;
318        Validate.notNull(input);
319        final Document doc;
320        final Charset charset = charsetDoc.charset;
321        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) {
322            maybeSkipBom(reader, charsetDoc);
323            try {
324                doc = parser.parseInput(reader, baseUri);
325            } catch (UncheckedIOException e) {
326                // io exception when parsing (not seen before because reading the stream as we go)
327                throw e.getCause();
328            }
329            doc.outputSettings().charset(charset);
330            if (!charset.canEncode()) {
331                // some charsets can read but not encode; switch to an encodable charset and update the meta el
332                doc.charset(UTF_8);
333            }
334        }
335        return doc;
336    }
337
338    static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
339        if (charsetDoc.skip) {
340            long skipped = reader.skip(1);
341            Validate.isTrue(skipped == 1); // WTF if this fails.
342        }
343    }
344
345    /**
346     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
347     * method is executing on. The data read until being interrupted will be available.
348     * @param inStream the input stream to read from
349     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
350     * @return the filled byte buffer
351     * @throws IOException if an exception occurs whilst reading from the input stream.
352     */
353    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
354        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
355    }
356
357    static ByteBuffer emptyByteBuffer() {
358        return ByteBuffer.allocate(0);
359    }
360
361    /**
362     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
363     * will kick in.)
364     * @param contentType e.g. "text/html; charset=EUC-JP"
365     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
366     */
367    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
368        if (contentType == null) return null;
369        Matcher m = charsetPattern.matcher(contentType);
370        if (m.find()) {
371            String charset = m.group(1).trim();
372            charset = charset.replace("charset=", "");
373            return validateCharset(charset);
374        }
375        return null;
376    }
377
378    private @Nullable static String validateCharset(@Nullable String cs) {
379        if (cs == null || cs.length() == 0) return null;
380        cs = cs.trim().replaceAll("[\"']", "");
381        try {
382            if (Charset.isSupported(cs)) return cs;
383            cs = cs.toUpperCase(Locale.ENGLISH);
384            if (Charset.isSupported(cs)) return cs;
385        } catch (IllegalCharsetNameException e) {
386            // if all this charset matching fails.... we just take the default
387        }
388        return null;
389    }
390
391    /**
392     * Creates a random string, suitable for use as a mime boundary
393     */
394    static String mimeBoundary() {
395        final StringBuilder mime = StringUtil.borrowBuilder();
396        final Random rand = new Random();
397        for (int i = 0; i < boundaryLength; i++) {
398            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
399        }
400        return StringUtil.releaseBuilder(mime);
401    }
402
403    private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
404        @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
405        buffer.mark();
406        byte[] bom = new byte[4];
407        if (byteData.remaining() >= bom.length) {
408            byteData.get(bom);
409            buffer.rewind();
410        }
411        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
412            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
413            return new BomCharset("UTF-32", false); // and I hope it's on your system
414        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
415            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
416            return new BomCharset("UTF-16", false); // in all Javas
417        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
418            return new BomCharset("UTF-8", true); // in all Javas
419            // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
420        }
421        return null;
422    }
423
424    private static class BomCharset {
425        private final String charset;
426        private final boolean offset;
427
428        public BomCharset(String charset, boolean offset) {
429            this.charset = charset;
430            this.offset = offset;
431        }
432    }
433}