Source code

001package org.jsoup.select;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.CDataNode;
006import org.jsoup.nodes.Comment;
007import org.jsoup.nodes.DataNode;
008import org.jsoup.nodes.LeafNode;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.TokenQueue;
012import org.jspecify.annotations.Nullable;
013
014import java.util.function.Function;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
019import static org.jsoup.internal.Normalizer.normalize;
020
021/**
022 * Parses a CSS selector into an Evaluator tree.
023 */
024public class QueryParser implements AutoCloseable {
025    private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
026    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
027    private final static char[] SequenceEnders = {',', ')'};
028
029    private final TokenQueue tq;
030    private final String query;
031    private boolean inNodeContext; // ::comment:contains should act on node value, vs element text
032
033    /**
034     * Create a new QueryParser.
035     * @param query CSS query
036     */
037    private QueryParser(String query) {
038        Validate.notEmpty(query);
039        query = query.trim();
040        this.query = query;
041        this.tq = new TokenQueue(query);
042    }
043
044    /**
045     Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
046     parse it once and reuse the Evaluator.
047
048     @param query CSS query
049     @return Evaluator
050     @see Selector selector query syntax
051     @throws Selector.SelectorParseException if the CSS query is invalid
052     */
053    public static Evaluator parse(String query) {
054        try (QueryParser p = new QueryParser(query)) {
055            return p.parse();
056        } catch (IllegalArgumentException e) {
057            throw new Selector.SelectorParseException(e.getMessage());
058        }
059    }
060
061    /**
062     Parse the query. We use this simplified expression of the grammar:
063     <pre>
064     SelectorGroup   ::= Selector (',' Selector)*
065     Selector        ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
066     SimpleSequence  ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
067     Pseudo           ::= ':' Name [ '(' SelectorGroup ')' ]
068     Combinator      ::= S+         // descendant (whitespace)
069     | '>'       // child
070     | '+'       // adjacent sibling
071     | '~'       // general sibling
072     </pre>
073
074     See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing
075     */
076    Evaluator parse() {
077        Evaluator eval = parseSelectorGroup();
078        tq.consumeWhitespace();
079        if (!tq.isEmpty())
080            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
081        return eval;
082    }
083
084    Evaluator parseSelectorGroup() {
085        // SelectorGroup. Into an Or if > 1 Selector
086        Evaluator left = parseSelector();
087        while (tq.matchChomp(',')) {
088            Evaluator right = parseSelector();
089            left = or(left, right);
090        }
091        return left;
092    }
093
094    Evaluator parseSelector() {
095        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
096        tq.consumeWhitespace();
097
098        Evaluator left;
099        if (tq.matchesAny(Combinators)) {
100            // e.g. query is "> div"; left side is root element
101            left = new StructuralEvaluator.Root();
102        } else {
103            left = parseSimpleSequence();
104        }
105
106        while (true) {
107            char combinator = 0;
108            if (tq.consumeWhitespace())
109                combinator = ' ';            // maybe descendant?
110            if (tq.matchesAny(Combinators)) // no, explicit
111                combinator = tq.consume();
112            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
113                break;
114
115            if (combinator != 0) {
116                Evaluator right = parseSimpleSequence();
117                left = combinator(left, combinator, right);
118            } else {
119                break;
120            }
121        }
122        return left;
123    }
124
125    Evaluator parseSimpleSequence() {
126        // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )*
127        Evaluator left = null;
128        tq.consumeWhitespace();
129
130        // one optional type selector
131        if (tq.matchesWord() || tq.matches("*|"))
132            left = byTag();
133        else if (tq.matchChomp('*'))
134            left = new Evaluator.AllElements();
135
136        // zero or more subclasses (#, ., [)
137        while(true) {
138            Evaluator right = parseSubclass();
139            if (right != null) {
140                left = and(left, right);
141            }
142            else break; // no more simple tokens
143        }
144
145        if (left == null)
146            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
147        return left;
148    }
149
150    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
151        switch (combinator) {
152            case '>':
153                ImmediateParentRun run = left instanceof ImmediateParentRun ?
154                    (ImmediateParentRun) left : new ImmediateParentRun(left);
155                run.add(right);
156                return run;
157            case ' ':
158                return and(new StructuralEvaluator.Ancestor(left), right);
159            case '+':
160                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
161            case '~':
162                return and(new StructuralEvaluator.PreviousSibling(left), right);
163            default:
164                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
165        }
166    }
167
168    @Nullable Evaluator parseSubclass() {
169        //  Subclass: ID | Class | Attribute | Pseudo
170        if      (tq.matchChomp('#'))    return byId();
171        else if (tq.matchChomp('.'))    return byClass();
172        else if (tq.matches('['))       return byAttribute();
173        else if (tq.matchChomp("::"))   return parseNodeSelector(); // ::comment etc
174        else if (tq.matchChomp(':'))    return parsePseudoSelector();
175        else                            return null;
176    }
177
178    /** Merge two evals into an Or. */
179    static Evaluator or(Evaluator left, Evaluator right) {
180        if (left instanceof CombiningEvaluator.Or) {
181            ((CombiningEvaluator.Or) left).add(right);
182            return left;
183        }
184        return new CombiningEvaluator.Or(left, right);
185    }
186
187    /** Merge two evals into an And. */
188    static Evaluator and(@Nullable Evaluator left, Evaluator right) {
189        if (left == null) return right;
190        if (left instanceof CombiningEvaluator.And) {
191            ((CombiningEvaluator.And) left).add(right);
192            return left;
193        }
194        return new CombiningEvaluator.And(left, right);
195    }
196
197    private Evaluator parsePseudoSelector() {
198        final String pseudo = tq.consumeCssIdentifier();
199        switch (pseudo) {
200            case "lt":
201                return new Evaluator.IndexLessThan(consumeIndex());
202            case "gt":
203                return new Evaluator.IndexGreaterThan(consumeIndex());
204            case "eq":
205                return new Evaluator.IndexEquals(consumeIndex());
206            case "has":
207                return has();
208            case "is":
209                return is();
210            case "contains":
211                return contains(false);
212            case "containsOwn":
213                return contains(true);
214            case "containsWholeText":
215                return containsWholeText(false);
216            case "containsWholeOwnText":
217                return containsWholeText(true);
218            case "containsData":
219                return containsData();
220            case "matches":
221                return matches(false);
222            case "matchesOwn":
223                return matches(true);
224            case "matchesWholeText":
225                return matchesWholeText(false);
226            case "matchesWholeOwnText":
227                return matchesWholeText(true);
228            case "not":
229                return not();
230            case "nth-child":
231                return cssNthChild(false, false);
232            case "nth-last-child":
233                return cssNthChild(true, false);
234            case "nth-of-type":
235                return cssNthChild(false, true);
236            case "nth-last-of-type":
237                return cssNthChild(true, true);
238            case "first-child":
239                return new Evaluator.IsFirstChild();
240            case "last-child":
241                return new Evaluator.IsLastChild();
242            case "first-of-type":
243                return new Evaluator.IsFirstOfType();
244            case "last-of-type":
245                return new Evaluator.IsLastOfType();
246            case "only-child":
247                return new Evaluator.IsOnlyChild();
248            case "only-of-type":
249                return new Evaluator.IsOnlyOfType();
250            case "empty":
251                return new Evaluator.IsEmpty();
252            case "blank":
253                return new NodeEvaluator.BlankValue();
254            case "root":
255                return new Evaluator.IsRoot();
256            case "matchText":
257                return new Evaluator.MatchText();
258            default:
259                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
260        }
261    }
262
263    // ::comment etc
264    private Evaluator parseNodeSelector() {
265        final String pseudo = tq.consumeCssIdentifier();
266        inNodeContext = true;  // Enter node context
267
268        Evaluator left;
269        switch (pseudo) {
270            case "node":
271                left = new NodeEvaluator.InstanceType(Node.class, pseudo);
272                break;
273            case "leafnode":
274                left = new NodeEvaluator.InstanceType(LeafNode.class, pseudo);
275                break;
276            case "text":
277                left = new NodeEvaluator.InstanceType(TextNode.class, pseudo);
278                break;
279            case "comment":
280                left = new NodeEvaluator.InstanceType(Comment.class, pseudo);
281                break;
282            case "data":
283                left = new NodeEvaluator.InstanceType(DataNode.class, pseudo);
284                break;
285            case "cdata":
286                left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo);
287                break;
288            default:
289                throw new Selector.SelectorParseException(
290                    "Could not parse query '%s': unknown node type '::%s'", query, pseudo);
291        }
292
293        // Handle following subclasses in node context (like ::comment:contains())
294        Evaluator right;
295        while ((right = parseSubclass()) != null) {
296            left = and(left, right);
297        }
298
299        inNodeContext = false;
300        return left;
301    }
302
303    private Evaluator byId() {
304        String id = tq.consumeCssIdentifier();
305        Validate.notEmpty(id);
306        return new Evaluator.Id(id);
307    }
308
309    private Evaluator byClass() {
310        String className = tq.consumeCssIdentifier();
311        Validate.notEmpty(className);
312        return new Evaluator.Class(className.trim());
313    }
314
315    private Evaluator byTag() {
316        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
317        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
318        // consistency - both the selector and the element tag
319        String tagName = normalize(tq.consumeElementSelector());
320        Validate.notEmpty(tagName);
321
322        // namespaces:
323        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
324            String plainTag = tagName.substring(2); // strip *|
325            return new CombiningEvaluator.Or(
326                new Evaluator.Tag(plainTag),
327                new Evaluator.TagEndsWith(":" + plainTag)
328            );
329        } else if (tagName.endsWith("|*")) { // ns|*
330            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
331            return new Evaluator.TagStartsWith(ns);
332        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
333            tagName = tagName.replace("|", ":");
334        }
335
336        return new Evaluator.Tag(tagName);
337    }
338
339    private Evaluator byAttribute() {
340        try (TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']'))) {
341            return evaluatorForAttribute(cq);
342        }
343    }
344
345    private Evaluator evaluatorForAttribute(TokenQueue cq) {
346        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
347        Validate.notEmpty(key);
348        cq.consumeWhitespace();
349        final Evaluator eval;
350
351        if (cq.isEmpty()) {
352            if (key.startsWith("^"))
353                eval = new Evaluator.AttributeStarting(key.substring(1));
354            else if (key.equals("*")) // any attribute
355                eval = new Evaluator.AttributeStarting("");
356            else
357                eval = new Evaluator.Attribute(key);
358        } else {
359            if (cq.matchChomp('='))
360                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
361            else if (cq.matchChomp("!="))
362                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
363            else if (cq.matchChomp("^="))
364                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
365            else if (cq.matchChomp("$="))
366                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
367            else if (cq.matchChomp("*="))
368                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
369            else if (cq.matchChomp("~="))
370                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
371            else
372                throw new Selector.SelectorParseException(
373                    "Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
374        }
375        return eval;
376    }
377
378    //pseudo selectors :first-child, :last-child, :nth-child, ...
379    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
380    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");
381
382    private Evaluator cssNthChild(boolean last, boolean ofType) {
383        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
384        final int step, offset;
385        if ("odd".equals(arg)) {
386            step = 2;
387            offset = 1;
388        } else if ("even".equals(arg)) {
389            step = 2;
390            offset = 0;
391        } else {
392            Matcher stepOffsetM, stepM;
393            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
394                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
395                    step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
396                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1;
397                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
398                offset =
399                    stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0;
400            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
401                step = 0;
402                offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
403            } else {
404                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
405            }
406        }
407
408        return ofType
409            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
410            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
411    }
412
413    private String consumeParens() {
414        return tq.chompBalanced('(', ')');
415    }
416
417    private int consumeIndex() {
418        String index = consumeParens().trim();
419        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
420        return Integer.parseInt(index);
421    }
422
423    // pseudo selector :has(el)
424    private Evaluator has() {
425        return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector");
426    }
427
428    // pseudo selector :is()
429    private Evaluator is() {
430        return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector");
431    }
432
433    private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
434        Validate.isTrue(tq.matchChomp('('), err);
435        Evaluator eval = parseSelectorGroup();
436        Validate.isTrue(tq.matchChomp(')'), err);
437        return func.apply(eval);
438    }
439
440    // pseudo selector :contains(text), containsOwn(text)
441    private Evaluator contains(boolean own) {
442        String query = own ? ":containsOwn" : ":contains";
443        String searchText = TokenQueue.unescape(consumeParens());
444        Validate.notEmpty(searchText, query + "(text) query must not be empty");
445
446        if (inNodeContext)
447            return new NodeEvaluator.ContainsValue(searchText);
448
449        return own
450            ? new Evaluator.ContainsOwnText(searchText)
451            : new Evaluator.ContainsText(searchText);
452    }
453
454    private Evaluator containsWholeText(boolean own) {
455        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
456        String searchText = TokenQueue.unescape(consumeParens());
457        Validate.notEmpty(searchText, query + "(text) query must not be empty");
458        return own
459            ? new Evaluator.ContainsWholeOwnText(searchText)
460            : new Evaluator.ContainsWholeText(searchText);
461    }
462
463    // pseudo selector :containsData(data)
464    private Evaluator containsData() {
465        String searchText = TokenQueue.unescape(consumeParens());
466        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
467        return new Evaluator.ContainsData(searchText);
468    }
469
470    // :matches(regex), matchesOwn(regex)
471    private Evaluator matches(boolean own) {
472        String query = own ? ":matchesOwn" : ":matches";
473        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
474        Validate.notEmpty(regex, query + "(regex) query must not be empty");
475        Pattern pattern = Pattern.compile(regex);
476
477        if (inNodeContext)
478            return new NodeEvaluator.MatchesValue(pattern);
479
480        return own
481            ? new Evaluator.MatchesOwn(pattern)
482            : new Evaluator.Matches(pattern);
483    }
484
485    // :matches(regex), matchesOwn(regex)
486    private Evaluator matchesWholeText(boolean own) {
487        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
488        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
489        Validate.notEmpty(regex, query + "(regex) query must not be empty");
490
491        return own
492            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
493            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
494    }
495
496    // :not(selector)
497    private Evaluator not() {
498        String subQuery = consumeParens();
499        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
500
501        return new StructuralEvaluator.Not(parse(subQuery));
502    }
503
504    @Override
505    public String toString() {
506        return query;
507    }
508
509    @Override
510    public void close() {
511        tq.close();
512    }
513}