package org.jsoup.helper;

import org.jsoup.Connection;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jsoup.select.Elements;
import org.jspecify.annotations.Nullable;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
 * Internal static utilities for handling data.
 */
@SuppressWarnings("CharsetObjectCanBeUsed")
public final class DataUtil {
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    private static final int firstReadBufferSize = 1024 * 5;
    private static final char[] mimeBoundaryChars =
        "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    static final int boundaryLength = 32;

    private DataUtil() {}

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in
     * {@code .gz} or {@code .z}) are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
        return load(file.toPath(), charsetName, baseUri);
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or
     * {@code .z}) are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.14.2
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return load(file.toPath(), charsetName, baseUri, parser);
    }

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in
     * {@code .gz} or {@code .z}) are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
        return load(path, charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or
     * {@code .z}) are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.17.2
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        // parseInputStream takes ownership of the stream and closes it on both success and failure
        final InputStream stream = openStream(path);
        return parseInputStream(stream, charsetName, baseUri, parser);
    }

    /**
     * Returns a {@link StreamParser} that will parse the supplied file progressively.
     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     * <p>On success the underlying file stream is left open, as the returned parser reads from it. If setting up
     * the parse fails, the file stream is closed before the exception propagates.</p>
     *
     * @param path file to load
     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
     *     A BOM in the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return a StreamParser, initialized on the file's content but not yet stepped
     * @throws IOException on IO error
     * @since 1.18.2
     * @see Connection.Response#streamParser()
     */
    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
        final StreamParser streamer = new StreamParser(parser);
        final String charsetName = charset != null ? charset.name() : null;
        final InputStream stream = openStream(path);
        try {
            final CharsetDoc charsetDoc = detectCharset(stream, charsetName, baseUri, parser);
            final BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
            maybeSkipBom(reader, charsetDoc);
            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
        } catch (IOException | RuntimeException e) {
            // nobody else holds a reference to the stream yet, so release the file handle here
            closeSuppressed(stream, e);
            throw e;
        }
        return streamer;
    }

    /**
     * Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. A file is only
     * treated as gzip if its name ends in {@code .gz} or {@code .z} and it starts with the gzip magic bytes.
     * If sniffing or setting up decompression fails, the file channel is closed before the exception propagates.
     */
    private static InputStream openStream(Path path) throws IOException {
        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
        final InputStream channelStream = Channels.newInputStream(byteChannel);
        try {
            final Path fileName = path.getFileName(); // may be null (a path with no name elements)
            final String name = fileName != null ? Normalizer.lowerCase(fileName.toString()) : "";
            if (name.endsWith(".gz") || name.endsWith(".z")) {
                final boolean zipped = (channelStream.read() == 0x1f && channelStream.read() == 0x8b); // gzip magic bytes
                byteChannel.position(0); // reset to start of file
                if (zipped)
                    return new GZIPInputStream(channelStream);
            }
            return channelStream;
        } catch (IOException | RuntimeException e) {
            closeSuppressed(channelStream, e); // closing the channel-backed stream closes the channel
            throw e;
        }
    }

    /**
     * Parses a Document from an input stream.
     *
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
        return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Parses a Document from an input stream, using the provided Parser.
     *
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(in, charsetName, baseUri, parser);
    }

    /**
     * Writes the input stream to the output stream. Doesn't close them.
     *
     * @param in input stream to read from
     * @param out output stream to write to
     * @throws IOException on IO error
     */
    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[DefaultBufferSize];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /** A struct to return a detected charset, and a document (if fully read). */
    static class CharsetDoc {
        Charset charset;
        InputStream input;
        @Nullable Document doc;
        boolean skip; // true if the reader must skip one leading char (a decoded UTF-8 BOM)

        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input, boolean skip) {
            this.charset = charset;
            this.input = input;
            this.doc = doc;
            this.skip = skip;
        }
    }

    /**
     * Detects the charset of the input and parses it to a Document. The input is closed when this method returns,
     * whether it completes normally or throws.
     *
     * @param input stream to parse; if {@code null}, an empty Document is returned
     * @param charsetName (optional) charset to decode with; {@code null} to detect from the content
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        if (input == null) // empty body // todo reconsider?
            return new Document(baseUri);

        final CharsetDoc charsetDoc;
        try {
            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
        } catch (IOException | RuntimeException e) {
            // detection failed before a CharsetDoc was handed back, so the raw stream must be closed here
            closeSuppressed(input, e);
            throw e;
        }
        try {
            return parseInputStream(charsetDoc, baseUri, parser);
        } finally {
            charsetDoc.input.close();
        }
    }

    /**
     * Determines the charset to decode the input with. Precedence: a BOM in the content, then the supplied
     * {@code charsetName}, then a charset declared in an HTML meta tag or XML declaration, then UTF-8.
     * <p>When no charset is supplied, the start of the input is parsed as UTF-8 to look for a declaration. If that
     * first read covered the whole input and no re-decode is needed, the parsed document is returned in the result
     * so that the caller does not need to parse again.</p>
     * <p>This method does not close the input, including on failure; that is the caller's responsibility.</p>
     *
     * @param input stream to inspect
     * @param charsetName (optional) charset from a content-type header or supplied by the user
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser parser used for the detection parse
     * @return the detected charset, the wrapped (and reset) input, and the document if already fully parsed
     * @throws IOException on IO error
     */
    static CharsetDoc detectCharset(InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        Document doc = null;

        // read the start of the stream and look for a BOM or meta charset
        InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
        wrappedInputStream.mark(DefaultBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
        boolean fullyRead = (wrappedInputStream.read() == -1);
        wrappedInputStream.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes);
        if (bomCharset != null)
            charsetName = bomCharset.charset;

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            try {
                CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
                if (defaultDecoded.hasArray())
                    doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
                else
                    doc = parser.parseInput(defaultDecoded.toString(), baseUri);
            } catch (UncheckedIOException e) {
                throw e.getCause();
            }

            String foundCharset = charsetFromMeta(doc); // if not found, will keep utf-8 as best attempt
            if (foundCharset == null)
                foundCharset = charsetFromXmlDeclaration(doc);

            foundCharset = validateCharset(foundCharset); // also trims and strips quotes
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                doc = null; // only the start was parsed; the caller must parse the full stream
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        // finally: prepare the return struct
        if (charsetName == null)
            charsetName = defaultCharsetName;
        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
        boolean skip = bomCharset != null && bomCharset.offset; // skip 1 if the BOM is there and needs offset
        // if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos
        return new CharsetDoc(charset, doc, wrappedInputStream, skip);
    }

    /**
     * Looks for a charset declared in {@code <meta http-equiv="Content-Type" content="text/html;charset=gb2312">} or
     * HTML5 {@code <meta charset="gb2312">}. Stops at the first meta element that yields a non-null value (which may
     * be an empty string if the {@code charset} attribute is present but empty).
     */
    private static @Nullable String charsetFromMeta(Document doc) {
        Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
        String found = null;
        for (Element meta : metaElements) {
            if (meta.hasAttr("http-equiv"))
                found = getCharsetFromContentType(meta.attr("content"));
            if (found == null && meta.hasAttr("charset"))
                found = meta.attr("charset");
            if (found != null)
                break;
        }
        return found;
    }

    /**
     * Looks for an encoding declared in a leading {@code <?xml encoding='ISO-8859-1'?>} declaration, which may have
     * been parsed as either an XmlDeclaration or a Comment depending on the parser in use.
     */
    private static @Nullable String charsetFromXmlDeclaration(Document doc) {
        if (doc.childNodeSize() == 0)
            return null;

        Node first = doc.childNode(0);
        XmlDeclaration decl = null;
        if (first instanceof XmlDeclaration)
            decl = (XmlDeclaration) first;
        else if (first instanceof Comment) {
            Comment comment = (Comment) first;
            if (comment.isXmlDeclaration())
                decl = comment.asXmlDeclaration();
        }
        if (decl != null && decl.name().equalsIgnoreCase("xml"))
            return decl.attr("encoding");
        return null;
    }

    /**
     * Completes the parse for a detection result: returns the already-parsed document if there is one, otherwise
     * decodes the input with the detected charset and parses it. In the latter case the input is closed when done.
     *
     * @param charsetDoc the result of {@link #detectCharset}
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
        // if doc != null it was fully parsed during charset detection; so just return that
        if (charsetDoc.doc != null)
            return charsetDoc.doc;

        final InputStream input = charsetDoc.input;
        Validate.notNull(input);
        final Document doc;
        final Charset charset = charsetDoc.charset;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset), DefaultBufferSize)) {
            maybeSkipBom(reader, charsetDoc);
            try {
                doc = parser.parseInput(reader, baseUri);
            } catch (UncheckedIOException e) {
                // io exception when parsing (not seen before because reading the stream as we go)
                throw e.getCause();
            }
            doc.outputSettings().charset(charset);
            if (!charset.canEncode()) {
                // some charsets can read but not encode; switch to an encodable charset and update the meta el
                doc.charset(UTF_8);
            }
        }
        return doc;
    }

    /** Skips the first character of the reader if the detection result flagged a BOM that the decoder will not consume. */
    static void maybeSkipBom(Reader reader, CharsetDoc charsetDoc) throws IOException {
        if (charsetDoc.skip) {
            long skipped = reader.skip(1);
            Validate.isTrue(skipped == 1); // WTF if this fails.
        }
    }

    /**
     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
     * method is executing on. The data read until being interrupted will be available.
     *
     * @param inStream the input stream to read from
     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
     * @return the filled byte buffer
     * @throws IOException if an exception occurs whilst reading from the input stream.
     */
    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
    }

    static ByteBuffer emptyByteBuffer() {
        return ByteBuffer.allocate(0);
    }

    /**
     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
     * will kick in.)
     *
     * @param contentType e.g. "text/html; charset=EUC-JP"
     * @return "EUC-JP", or null if not found or not supported. The charset is trimmed and stripped of quotes; it is
     *     returned as given if supported in that form, otherwise uppercased if the uppercase form is supported.
     */
    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
        if (contentType == null) return null;
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            String charset = m.group(1).trim();
            charset = charset.replace("charset=", "");
            return validateCharset(charset);
        }
        return null;
    }

    /**
     * Cleans a candidate charset name (trims, removes quotes) and checks that this JVM supports it, trying the name as
     * given and then uppercased.
     *
     * @return the supported name, or null if the input is null, empty, illegal, or unsupported
     */
    private @Nullable static String validateCharset(@Nullable String cs) {
        if (cs == null || cs.length() == 0) return null;
        cs = cs.trim().replaceAll("[\"']", "");
        try {
            if (Charset.isSupported(cs)) return cs;
            cs = cs.toUpperCase(Locale.ENGLISH);
            if (Charset.isSupported(cs)) return cs;
        } catch (IllegalCharsetNameException ignored) {
            // if all this charset matching fails.... we just take the default
        }
        return null;
    }

    /**
     * Creates a random string, suitable for use as a mime boundary
     */
    static String mimeBoundary() {
        final StringBuilder mime = StringUtil.borrowBuilder();
        final Random rand = new Random();
        for (int i = 0; i < boundaryLength; i++) {
            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
        }
        return StringUtil.releaseBuilder(mime);
    }

    /**
     * Inspects the first bytes of the buffer for a UTF-32, UTF-16, or UTF-8 byte order mark. Inputs shorter than four
     * bytes are still checked against the BOMs that fit in the bytes available.
     *
     * @param byteData the start of the input; rewound after the bytes have been examined
     * @return the charset implied by the BOM, or null if no BOM was found
     */
    private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
        @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
        buffer.mark();
        final byte[] bom = new byte[4];
        final int available = Math.min(byteData.remaining(), bom.length);
        if (available > 0) {
            byteData.get(bom, 0, available);
            buffer.rewind();
        }
        // each test is gated on the bytes actually read, so that zero padding in bom[] can't fake a match
        if (available >= 4 && (
            bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00)) { // LE
            return new BomCharset("UTF-32", false); // and I hope it's on your system
        } else if (available >= 2 && (
            bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE)) { // LE
            return new BomCharset("UTF-16", false); // in all Javas
        } else if (available >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            return new BomCharset("UTF-8", true); // in all Javas
            // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
        }
        return null;
    }

    /** Closes the stream after a failure, attaching any close error to the primary exception rather than masking it. */
    private static void closeSuppressed(InputStream stream, Throwable primary) {
        try {
            stream.close();
        } catch (IOException closeFailure) {
            primary.addSuppressed(closeFailure);
        }
    }

    /** A charset name implied by a BOM, and whether the decoded BOM character must be skipped by the reader. */
    private static class BomCharset {
        private final String charset;
        private final boolean offset;

        public BomCharset(String charset, boolean offset) {
            this.charset = charset;
            this.offset = offset;
        }
    }
}