001package org.jsoup.select; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.helper.Validate; 005import org.jsoup.nodes.CDataNode; 006import org.jsoup.nodes.Comment; 007import org.jsoup.nodes.DataNode; 008import org.jsoup.nodes.LeafNode; 009import org.jsoup.nodes.Node; 010import org.jsoup.nodes.TextNode; 011import org.jsoup.parser.TokenQueue; 012import org.jspecify.annotations.Nullable; 013 014import java.util.function.Function; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun; 019import static org.jsoup.internal.Normalizer.normalize; 020 021/** 022 * Parses a CSS selector into an Evaluator tree. 023 */ 024public class QueryParser implements AutoCloseable { 025 private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly 026 private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="}; 027 private final static char[] SequenceEnders = {',', ')'}; 028 029 private final TokenQueue tq; 030 private final String query; 031 private boolean inNodeContext; // ::comment:contains should act on node value, vs element text 032 033 /** 034 * Create a new QueryParser. 035 * @param query CSS query 036 */ 037 private QueryParser(String query) { 038 Validate.notEmpty(query); 039 query = query.trim(); 040 this.query = query; 041 this.tq = new TokenQueue(query); 042 } 043 044 /** 045 Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to 046 parse it once and reuse the Evaluator. 047 048 @param query CSS query 049 @return Evaluator 050 @see Selector selector query syntax 051 @throws Selector.SelectorParseException if the CSS query is invalid 052 */ 053 public static Evaluator parse(String query) { 054 try (QueryParser p = new QueryParser(query)) { 055 return p.parse(); 056 } catch (IllegalArgumentException e) { 057 throw new Selector.SelectorParseException(e.getMessage()); 058 } 059 } 060 061 /** 062 Parse the query. We use this simplified expression of the grammar: 063 <pre> 064 SelectorGroup ::= Selector (',' Selector)* 065 Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )* 066 SimpleSequence ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )* 067 Pseudo ::= ':' Name [ '(' SelectorGroup ')' ] 068 Combinator ::= S+ // descendant (whitespace) 069 | '>' // child 070 | '+' // adjacent sibling 071 | '~' // general sibling 072 </pre> 073 074 See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing 075 */ 076 Evaluator parse() { 077 Evaluator eval = parseSelectorGroup(); 078 tq.consumeWhitespace(); 079 if (!tq.isEmpty()) 080 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 081 return eval; 082 } 083 084 Evaluator parseSelectorGroup() { 085 // SelectorGroup. Into an Or if > 1 Selector 086 Evaluator left = parseSelector(); 087 while (tq.matchChomp(',')) { 088 Evaluator right = parseSelector(); 089 left = or(left, right); 090 } 091 return left; 092 } 093 094 Evaluator parseSelector() { 095 // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )* 096 tq.consumeWhitespace(); 097 098 Evaluator left; 099 if (tq.matchesAny(Combinators)) { 100 // e.g. query is "> div"; left side is root element 101 left = new StructuralEvaluator.Root(); 102 } else { 103 left = parseSimpleSequence(); 104 } 105 106 while (true) { 107 char combinator = 0; 108 if (tq.consumeWhitespace()) 109 combinator = ' '; // maybe descendant? 110 if (tq.matchesAny(Combinators)) // no, explicit 111 combinator = tq.consume(); 112 else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has() 113 break; 114 115 if (combinator != 0) { 116 Evaluator right = parseSimpleSequence(); 117 left = combinator(left, combinator, right); 118 } else { 119 break; 120 } 121 } 122 return left; 123 } 124 125 Evaluator parseSimpleSequence() { 126 // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )* 127 Evaluator left = null; 128 tq.consumeWhitespace(); 129 130 // one optional type selector 131 if (tq.matchesWord() || tq.matches("*|")) 132 left = byTag(); 133 else if (tq.matchChomp('*')) 134 left = new Evaluator.AllElements(); 135 136 // zero or more subclasses (#, ., [) 137 while(true) { 138 Evaluator right = parseSubclass(); 139 if (right != null) { 140 left = and(left, right); 141 } 142 else break; // no more simple tokens 143 } 144 145 if (left == null) 146 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 147 return left; 148 } 149 150 static Evaluator combinator(Evaluator left, char combinator, Evaluator right) { 151 switch (combinator) { 152 case '>': 153 ImmediateParentRun run = left instanceof ImmediateParentRun ? 154 (ImmediateParentRun) left : new ImmediateParentRun(left); 155 run.add(right); 156 return run; 157 case ' ': 158 return and(new StructuralEvaluator.Ancestor(left), right); 159 case '+': 160 return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right); 161 case '~': 162 return and(new StructuralEvaluator.PreviousSibling(left), right); 163 default: 164 throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator); 165 } 166 } 167 168 @Nullable Evaluator parseSubclass() { 169 // Subclass: ID | Class | Attribute | Pseudo 170 if (tq.matchChomp('#')) return byId(); 171 else if (tq.matchChomp('.')) return byClass(); 172 else if (tq.matches('[')) return byAttribute(); 173 else if (tq.matchChomp("::")) return parseNodeSelector(); // ::comment etc 174 else if (tq.matchChomp(':')) return parsePseudoSelector(); 175 else return null; 176 } 177 178 /** Merge two evals into an Or. */ 179 static Evaluator or(Evaluator left, Evaluator right) { 180 if (left instanceof CombiningEvaluator.Or) { 181 ((CombiningEvaluator.Or) left).add(right); 182 return left; 183 } 184 return new CombiningEvaluator.Or(left, right); 185 } 186 187 /** Merge two evals into an And. */ 188 static Evaluator and(@Nullable Evaluator left, Evaluator right) { 189 if (left == null) return right; 190 if (left instanceof CombiningEvaluator.And) { 191 ((CombiningEvaluator.And) left).add(right); 192 return left; 193 } 194 return new CombiningEvaluator.And(left, right); 195 } 196 197 private Evaluator parsePseudoSelector() { 198 final String pseudo = tq.consumeCssIdentifier(); 199 switch (pseudo) { 200 case "lt": 201 return new Evaluator.IndexLessThan(consumeIndex()); 202 case "gt": 203 return new Evaluator.IndexGreaterThan(consumeIndex()); 204 case "eq": 205 return new Evaluator.IndexEquals(consumeIndex()); 206 case "has": 207 return has(); 208 case "is": 209 return is(); 210 case "contains": 211 return contains(false); 212 case "containsOwn": 213 return contains(true); 214 case "containsWholeText": 215 return containsWholeText(false); 216 case "containsWholeOwnText": 217 return containsWholeText(true); 218 case "containsData": 219 return containsData(); 220 case "matches": 221 return matches(false); 222 case "matchesOwn": 223 return matches(true); 224 case "matchesWholeText": 225 return matchesWholeText(false); 226 case "matchesWholeOwnText": 227 return matchesWholeText(true); 228 case "not": 229 return not(); 230 case "nth-child": 231 return cssNthChild(false, false); 232 case "nth-last-child": 233 return cssNthChild(true, false); 234 case "nth-of-type": 235 return cssNthChild(false, true); 236 case "nth-last-of-type": 237 return cssNthChild(true, true); 238 case "first-child": 239 return new Evaluator.IsFirstChild(); 240 case "last-child": 241 return new Evaluator.IsLastChild(); 242 case "first-of-type": 243 return new Evaluator.IsFirstOfType(); 244 case "last-of-type": 245 return new Evaluator.IsLastOfType(); 246 case "only-child": 247 return new Evaluator.IsOnlyChild(); 248 case "only-of-type": 249 return new Evaluator.IsOnlyOfType(); 250 case "empty": 251 return new Evaluator.IsEmpty(); 252 case "blank": 253 return new NodeEvaluator.BlankValue(); 254 case "root": 255 return new Evaluator.IsRoot(); 256 case "matchText": 257 return new Evaluator.MatchText(); 258 default: 259 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 260 } 261 } 262 263 // ::comment etc 264 private Evaluator parseNodeSelector() { 265 final String pseudo = tq.consumeCssIdentifier(); 266 inNodeContext = true; // Enter node context 267 268 Evaluator left; 269 switch (pseudo) { 270 case "node": 271 left = new NodeEvaluator.InstanceType(Node.class, pseudo); 272 break; 273 case "leafnode": 274 left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo); 275 break; 276 case "text": 277 left = new NodeEvaluator.InstanceType(TextNode.class, pseudo); 278 break; 279 case "comment": 280 left = new NodeEvaluator.InstanceType(Comment.class, pseudo); 281 break; 282 case "data": 283 left = new NodeEvaluator.InstanceType(DataNode.class, pseudo); 284 break; 285 case "cdata": 286 left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo); 287 break; 288 default: 289 throw new Selector.SelectorParseException( 290 "Could not parse query '%s': unknown node type '::%s'", query, pseudo); 291 } 292 293 // Handle following subclasses in node context (like ::comment:contains()) 294 Evaluator right; 295 while ((right = parseSubclass()) != null) { 296 left = and(left, right); 297 } 298 299 inNodeContext = false; 300 return left; 301 } 302 303 private Evaluator byId() { 304 String id = tq.consumeCssIdentifier(); 305 Validate.notEmpty(id); 306 return new Evaluator.Id(id); 307 } 308 309 private Evaluator byClass() { 310 String className = tq.consumeCssIdentifier(); 311 Validate.notEmpty(className); 312 return new Evaluator.Class(className.trim()); 313 } 314 315 private Evaluator byTag() { 316 // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make 317 // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for 318 // consistency - both the selector and the element tag 319 String tagName = normalize(tq.consumeElementSelector()); 320 Validate.notEmpty(tagName); 321 322 // namespaces: 323 if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName 324 String plainTag = tagName.substring(2); // strip *| 325 return new CombiningEvaluator.Or( 326 new Evaluator.Tag(plainTag), 327 new Evaluator.TagEndsWith(":" + plainTag) 328 ); 329 } else if (tagName.endsWith("|*")) { // ns|* 330 String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns: 331 return new Evaluator.TagStartsWith(ns); 332 } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def" 333 tagName = tagName.replace("|", ":"); 334 } 335 336 return new Evaluator.Tag(tagName); 337 } 338 339 private Evaluator byAttribute() { 340 try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) { 341 return evaluatorForAttribute(cq); 342 } 343 } 344 345 private Evaluator evaluatorForAttribute(TokenQueue cq) { 346 String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val) 347 Validate.notEmpty(key); 348 cq.consumeWhitespace(); 349 final Evaluator eval; 350 351 if (cq.isEmpty()) { 352 if (key.startsWith("^")) 353 eval = new Evaluator.AttributeStarting(key.substring(1)); 354 else if (key.equals("*")) // any attribute 355 eval = new Evaluator.AttributeStarting(""); 356 else 357 eval = new Evaluator.Attribute(key); 358 } else { 359 if (cq.matchChomp('=')) 360 eval = new Evaluator.AttributeWithValue(key, cq.remainder()); 361 else if (cq.matchChomp("!=")) 362 eval = new Evaluator.AttributeWithValueNot(key, cq.remainder()); 363 else if (cq.matchChomp("^=")) 364 eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder()); 365 else if (cq.matchChomp("$=")) 366 eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder()); 367 else if (cq.matchChomp("*=")) 368 eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder()); 369 else if (cq.matchChomp("~=")) 370 eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder())); 371 else 372 throw new Selector.SelectorParseException( 373 "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); 374 } 375 return eval; 376 } 377 378 //pseudo selectors :first-child, :last-child, :nth-child, ... 379 private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE); 380 private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)"); 381 382 private Evaluator cssNthChild(boolean last, boolean ofType) { 383 String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd) 384 final int step, offset; 385 if ("odd".equals(arg)) { 386 step = 2; 387 offset = 1; 388 } else if ("even".equals(arg)) { 389 step = 2; 390 offset = 0; 391 } else { 392 Matcher stepOffsetM, stepM; 393 if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) { 394 if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2 395 step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", "")); 396 else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1; 397 step = "-".equals(stepOffsetM.group(2)) ? -1 : 1; 398 offset = 399 stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0; 400 } else if ((stepM = NthOffset.matcher(arg)).matches()) { 401 step = 0; 402 offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", "")); 403 } else { 404 throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg); 405 } 406 } 407 408 return ofType 409 ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset)) 410 : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset)); 411 } 412 413 private String consumeParens() { 414 return tq.chompBalanced('(', ')'); 415 } 416 417 private int consumeIndex() { 418 String index = consumeParens().trim(); 419 Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric"); 420 return Integer.parseInt(index); 421 } 422 423 // pseudo selector :has(el) 424 private Evaluator has() { 425 return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector"); 426 } 427 428 // pseudo selector :is() 429 private Evaluator is() { 430 return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector"); 431 } 432 433 private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) { 434 Validate.isTrue(tq.matchChomp('('), err); 435 Evaluator eval = parseSelectorGroup(); 436 Validate.isTrue(tq.matchChomp(')'), err); 437 return func.apply(eval); 438 } 439 440 // pseudo selector :contains(text), containsOwn(text) 441 private Evaluator contains(boolean own) { 442 String query = own ? ":containsOwn" : ":contains"; 443 String searchText = TokenQueue.unescape(consumeParens()); 444 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 445 446 if (inNodeContext) 447 return new NodeEvaluator.ContainsValue(searchText); 448 449 return own 450 ? new Evaluator.ContainsOwnText(searchText) 451 : new Evaluator.ContainsText(searchText); 452 } 453 454 private Evaluator containsWholeText(boolean own) { 455 String query = own ? ":containsWholeOwnText" : ":containsWholeText"; 456 String searchText = TokenQueue.unescape(consumeParens()); 457 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 458 return own 459 ? new Evaluator.ContainsWholeOwnText(searchText) 460 : new Evaluator.ContainsWholeText(searchText); 461 } 462 463 // pseudo selector :containsData(data) 464 private Evaluator containsData() { 465 String searchText = TokenQueue.unescape(consumeParens()); 466 Validate.notEmpty(searchText, ":containsData(text) query must not be empty"); 467 return new Evaluator.ContainsData(searchText); 468 } 469 470 // :matches(regex), matchesOwn(regex) 471 private Evaluator matches(boolean own) { 472 String query = own ? ":matchesOwn" : ":matches"; 473 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 474 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 475 Pattern pattern = Pattern.compile(regex); 476 477 if (inNodeContext) 478 return new NodeEvaluator.MatchesValue(pattern); 479 480 return own 481 ? new Evaluator.MatchesOwn(pattern) 482 : new Evaluator.Matches(pattern); 483 } 484 485 // :matches(regex), matchesOwn(regex) 486 private Evaluator matchesWholeText(boolean own) { 487 String query = own ? ":matchesWholeOwnText" : ":matchesWholeText"; 488 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 489 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 490 491 return own 492 ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex)) 493 : new Evaluator.MatchesWholeText(Pattern.compile(regex)); 494 } 495 496 // :not(selector) 497 private Evaluator not() { 498 String subQuery = consumeParens(); 499 Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); 500 501 return new StructuralEvaluator.Not(parse(subQuery)); 502 } 503 504 @Override 505 public String toString() { 506 return query; 507 } 508 509 @Override 510 public void close() { 511 tq.close(); 512 } 513}