// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query.parser; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.query.*; import com.yahoo.search.query.parser.ParserEnvironment; import java.util.ArrayList; import java.util.List; import java.util.Set; import static com.yahoo.prelude.query.parser.Token.Kind.*; /** * Base class for parsers of the query languages which can be used * for structured queries (types ANY, ALL and ADVANCED). * * @author Steinar Knutsen */ abstract class StructuredParser extends AbstractParser { protected StructuredParser(ParserEnvironment environment) { super(environment); } protected abstract Item handleComposite(boolean topLevel); protected Item compositeItem() { int position = tokens.getPosition(); Item item = null; try { tokens.skipMultiple(PLUS); if (!tokens.skip(LBRACE)) { return null; } item = handleComposite(false); tokens.skip(RBRACE); return item; } finally { if (item == null) { tokens.setPosition(position); } } } /** Sets the submodes used for url parsing. Override this to influence when such submodes are used. */ protected void setSubmodeFromIndex(String indexName, IndexFacts.Session indexFacts) { submodes.setFromIndex(indexName, indexFacts); } protected Item indexableItem() { int position = tokens.getPosition(); Item item = null; try { String indexName = indexPrefix(); setSubmodeFromIndex(indexName, indexFacts); item = number(indexName != null); if (item == null) { item = phrase(); } if (item == null && indexName != null) { if (wordsAhead()) { item = phrase(); } } submodes.reset(); int weight = -1; if (item != null) { weight = weightSuffix(); } if (indexName != null && item != null) { item.setIndexName(indexName); } if (weight != -1 && item != null) { item.setWeight(weight); } return item; } finally { if (item == null) { tokens.setPosition(position); } } } // scan forward for terms while ignoring noise private boolean wordsAhead() { while (tokens.hasNext()) { if (tokens.currentIsNoIgnore(SPACE)) { return false; } if (tokens.currentIsNoIgnore(NUMBER) || tokens.currentIsNoIgnore(WORD)) { return true; } tokens.skipNoIgnore(); } return false; } // wordsAhead and nothingAhead... uhm... so similar... private boolean nothingAhead(boolean skip) { int position = tokens.getPosition(); try { boolean quoted = false; while (tokens.hasNext()) { if (tokens.currentIsNoIgnore(QUOTE)) { tokens.skipMultiple(QUOTE); quoted = !quoted; } else { if (!quoted && tokens.currentIsNoIgnore(SPACE)) { return true; } if (tokens.currentIsNoIgnore(NUMBER) || tokens.currentIsNoIgnore(WORD)) { return false; } tokens.skipNoIgnore(); } } return true; } finally { if (!skip) { tokens.setPosition(position); } } } private String indexPrefix() { int position = tokens.getPosition(); String item = null; try { List firstWord = new ArrayList<>(); List secondWord = new ArrayList<>(); tokens.skip(LSQUAREBRACKET); // For test 93 and 60 if (!tokens.currentIs(WORD) && !tokens.currentIs(NUMBER) && !tokens.currentIs(UNDERSCORE)) { return null; } firstWord.add(tokens.next()); while (tokens.currentIsNoIgnore(UNDERSCORE) || tokens.currentIsNoIgnore(WORD) || tokens.currentIsNoIgnore(NUMBER)) { firstWord.add(tokens.next()); } if (tokens.currentIsNoIgnore(DOT)) { tokens.skip(); if (tokens.currentIsNoIgnore(WORD) || tokens.currentIsNoIgnore(NUMBER)) { secondWord.add(tokens.next()); } else { return null; } while (tokens.currentIsNoIgnore(UNDERSCORE) || tokens.currentIsNoIgnore(WORD) || tokens.currentIsNoIgnore(NUMBER)) { secondWord.add(tokens.next()); } } if (!tokens.skipNoIgnore(COLON)) { return null; } if (secondWord.size() == 0) { item = concatenate(firstWord); } else { item = concatenate(firstWord) + "." + concatenate(secondWord); } item = indexFacts.getCanonicName(item); if ( ! indexFacts.isIndex(item)) { // Only if this really is an index // Marker for the finally block item = null; return null; } else { if (nothingAhead(false)) { // correct index syntax, correct name, but followed // by noise. Let's skip this. nothingAhead(true); position = tokens.getPosition(); item = indexPrefix(); } } return item; } finally { if (item == null) { tokens.setPosition(position); } } } private String concatenate(List tokens) { StringBuilder s = new StringBuilder(); for (Token t : tokens) { s.append(t.toString()); } return s.toString(); } /** Returns the specified term weight, or -1 if there is no weight suffix */ private int weightSuffix() { int position = tokens.getPosition(); int item = -1; try { if (!tokens.skipNoIgnore(EXCLAMATION)) { return -1; } item = 150; if (tokens.currentIsNoIgnore(NUMBER)) { try { item = Integer.parseInt(tokens.next().toString()); } catch (NumberFormatException e) { item = -1; } } else { while (tokens.currentIsNoIgnore(EXCLAMATION)) { item += 50; tokens.skipNoIgnore(); } } return item; } finally { if (item == -1) { tokens.setPosition(position); } } } private boolean endOfNumber() { return tokens.currentIsNoIgnore(SPACE) || tokens.currentIsNoIgnore(RSQUAREBRACKET) || tokens.currentIsNoIgnore(SEMICOLON) || tokens.currentIsNoIgnore(RBRACE) || tokens.currentIsNoIgnore(EOF) || tokens.currentIsNoIgnore(EXCLAMATION); } private String decimalPart() { int position = tokens.getPosition(); boolean consumed = false; try { if (!tokens.skipNoIgnore(DOT)) return ""; if (tokens.currentIsNoIgnore(NUMBER)) { consumed = true; return "." + tokens.next().toString(); } return ""; } finally { if ( ! consumed) tokens.setPosition(position); } } private IntItem number(boolean hasIndex) { int position = tokens.getPosition(); IntItem item = null; try { if (item == null) { item = numberRange(); } tokens.skip(LSQUAREBRACKET); // For test 93 and 60 // TODO: Better definition of start and end of numeric items if (item == null && hasIndex && tokens.currentIsNoIgnore(MINUS) && (tokens.currentNoIgnore(1).kind == NUMBER)) { tokens.skipNoIgnore(); Token t = tokens.next(); item = new IntItem("-" + t.toString() + decimalPart(), true); item.setOrigin(t.substring); } else if (item == null && tokens.currentIs(NUMBER)) { Token t = tokens.next(); item = new IntItem(t.toString() + decimalPart(), true); item.setOrigin(t.substring); } if (item == null) { item = numberSmaller(); } if (item == null) { item = numberGreater(); } if (item != null && !endOfNumber()) { item = null; } return item; } finally { if (item == null) { tokens.setPosition(position); } } } private IntItem numberRange() { int position = tokens.getPosition(); IntItem item = null; boolean negative = false; try { Token initial = tokens.next(); if (initial.kind != LSQUAREBRACKET) { return null; } String rangeStart = ""; negative = tokens.skip(MINUS); if (tokens.currentIs(NUMBER)) { rangeStart = (negative ? "-" : "") + tokens.next().toString() + decimalPart(); } if (!tokens.skip(SEMICOLON)) { return null; } String rangeEnd = ""; negative = tokens.skip(MINUS); if (tokens.currentIs(NUMBER)) { rangeEnd = (negative ? "-" : "") + tokens.next().toString() + decimalPart(); } String range = "[" + rangeStart + ";" + rangeEnd; if (tokens.skip(SEMICOLON)) { negative = tokens.skip(MINUS); if (tokens.currentIs(NUMBER)) { String rangeLimit = (negative ? "-" : "") + tokens.next().toString(); range += ";" + rangeLimit; } } tokens.skip(RSQUAREBRACKET); item = new IntItem(range + "]", true); item.setOrigin(new Substring(initial.substring.start, tokens.currentNoIgnore().substring.start, initial.getSubstring().getSuperstring())); // XXX: Unsafe end? return item; } finally { if (item == null) { tokens.setPosition(position); } } } private IntItem numberSmaller() { int position = tokens.getPosition(); IntItem item = null; boolean negative = false; try { Token initial = tokens.next(); if (initial.kind != SMALLER) { return null; } negative = tokens.skipNoIgnore(MINUS); if (!tokens.currentIs(NUMBER)) { return null; } item = new IntItem("<" + (negative ? "-" : "") + tokens.next() + decimalPart(), true); item.setOrigin(new Substring(initial.substring.start, tokens.currentNoIgnore().substring.start, initial.getSubstring().getSuperstring())); // XXX: Unsafe end? return item; } finally { if (item == null) { tokens.setPosition(position); } } } private IntItem numberGreater() { int position = tokens.getPosition(); IntItem item = null; boolean negative = false; try { Token t = tokens.next(); if (t.kind != GREATER) { return null; } negative = tokens.skipNoIgnore(MINUS); if (!tokens.currentIs(NUMBER)) { return null; } Token number = tokens.next(); item = new IntItem(">" + (negative ? "-" : "") + number + decimalPart(), true); item.setOrigin(new Substring(t.substring.start, tokens.currentNoIgnore().substring.start, t.getSubstring().getSuperstring())); // XXX: Unsafe end? return item; } finally { if (item == null) { tokens.setPosition(position); } } } /** Words for phrases also permits numerals as words */ private Item phraseWord(boolean insidePhrase) { int position = tokens.getPosition(); Item item = null; try { item = word(); if (item == null && tokens.currentIs(NUMBER)) { Token t = tokens.next(); if (insidePhrase) { item = new WordItem(t, true); } else { item = new IntItem(t.toString(), true); ((TermItem) item).setOrigin(t.substring); } } return item; } finally { if (item == null) { tokens.setPosition(position); } } } /** * Returns a WordItem if this is a non CJK query, * a WordItem or PhraseSegmentItem if this is a CJK query, * null if the current item is not a word */ private Item word() { int position = tokens.getPosition(); Item item = null; try { if (!tokens.currentIs(WORD) && ((!tokens.currentIs(NUMBER) && !tokens.currentIs(MINUS) && !tokens.currentIs(UNDERSCORE)) || (!submodes.url && !submodes.site))) { return null; } Token word = tokens.next(); if (submodes.url) { item = new WordItem(word, true); } else { item = segment(word); } if (submodes.url || submodes.site) { StringBuilder buffer = null; Token token = tokens.currentNoIgnore(); while (token.kind == WORD || token.kind == NUMBER || token.kind == MINUS || token.kind == UNDERSCORE) { if (buffer == null) { buffer = getStringContents(item); } buffer.append(token.toString()); tokens.skipNoIgnore(); token = tokens.currentNoIgnore(); } if (buffer != null) { Substring termSubstring = ((BlockItem) item).getOrigin(); Substring substring = new Substring(termSubstring.start, token.substring.start, termSubstring.getSuperstring()); // XXX: Unsafe end? String str = buffer.toString(); item = new WordItem(str, "", true, substring); } } return item; } finally { if (item == null) { tokens.setPosition(position); } } } private StringBuilder getStringContents(Item item) { if (item instanceof TermItem) { return new StringBuilder( ((TermItem) item).stringValue()); } else if (item instanceof SegmentItem) { return new StringBuilder( ((SegmentItem) item).getRawWord()); } else { throw new RuntimeException("Parser bug. Unexpected item type, send stack trace in a bug ticket to the Vespa team."); } } /** * An phrase or word, either marked by quotes or by non-spaces between * words or by a combination. * * @return a word if there's only one word, a phrase if there is * several quoted or non-space-separated words, or null otherwise */ private Item phrase() { int position = tokens.getPosition(); Item item = null; try { item = phraseBody(); return item; } finally { if (item == null) { tokens.setPosition(position); } } } /** Returns a word, a phrase or another composite */ private Item phraseBody() { boolean quoted = false; PhraseItem phrase = null; Item firstWord = null; boolean starAfterFirst = false; boolean starBeforeFirst; if (tokens.skipMultiple(QUOTE)) { quoted = !quoted; } boolean addStartOfHostMarker = addStartMarking(); braceLevelURL = 0; do { starBeforeFirst = tokens.skip(STAR); if (tokens.skipMultiple(QUOTE)) { quoted = !quoted; } Item word = phraseWord((firstWord != null) || (phrase != null)); if (word == null) { if (tokens.skipMultiple(QUOTE)) { quoted = !quoted; } if (quoted && tokens.hasNext()) { tokens.skipNoIgnore(); continue; } else { break; } } else if (quoted && word instanceof PhraseSegmentItem) { ((PhraseSegmentItem) word).setExplicit(true); } if (phrase != null) { phrase.addItem(word); } else if (firstWord != null) { phrase = new PhraseItem(); if (quoted || submodes.site || submodes.url) { phrase.setExplicit(true); } if (addStartOfHostMarker) { phrase.addItem(MarkerWordItem.createStartOfHost()); } if (firstWord instanceof IntItem) { IntItem asInt = (IntItem) firstWord; firstWord = new WordItem(asInt.stringValue(), asInt.getIndexName(), true, asInt.getOrigin()); } phrase.addItem(firstWord); phrase.addItem(word); } else if (word instanceof PhraseItem) { phrase = (PhraseItem) word; } else { firstWord = word; starAfterFirst = tokens.skipNoIgnore(STAR); } if (!quoted && tokens.currentIs(QUOTE)) { break; } boolean atWord = skipToNextPhraseWord(quoted); if (!atWord && tokens.skipMultipleNoIgnore(QUOTE)) { quoted = !quoted; } if (!atWord && !quoted) { break; } if (quoted && tokens.skipMultiple(QUOTE)) { break; } } while (tokens.hasNext()); braceLevelURL = 0; if (phrase != null) { if (addEndMarking()) { phrase.addItem(MarkerWordItem.createEndOfHost()); } return phrase; } else if (firstWord != null && submodes.site) { if (starAfterFirst && !addStartOfHostMarker) { return firstWord; } else { phrase = new PhraseItem(); if (addStartOfHostMarker) { phrase.addItem(MarkerWordItem.createStartOfHost()); } if (firstWord instanceof IntItem) { IntItem asInt = (IntItem) firstWord; firstWord = new WordItem(asInt.stringValue(), asInt.getIndexName(), true, asInt.getOrigin()); } phrase.addItem(firstWord); if (!starAfterFirst) { phrase.addItem(MarkerWordItem.createEndOfHost()); } phrase.setExplicit(true); return phrase; } } else { if (firstWord != null && firstWord instanceof TermItem && (starAfterFirst || starBeforeFirst)) { // prefix, suffix or substring TermItem firstTerm = (TermItem) firstWord; if (starAfterFirst) { if (starBeforeFirst) { return new SubstringItem(firstTerm.stringValue(), true); } else { return new PrefixItem(firstTerm.stringValue(), true); } } else { return new SuffixItem(firstTerm.stringValue(), true); } } return firstWord; } } private boolean addStartMarking() { if (submodes.explicitAnchoring() && tokens.currentIs(HAT)) { tokens.skip(); return true; } return false; } private boolean addEndMarking() { if (submodes.explicitAnchoring() && tokens.currentIs(DOLLAR)) { tokens.skip(); return true; } else if (submodes.site && tokens.currentIs(STAR)) { tokens.skip(); return false; } else if (submodes.site && !tokens.currentIs(DOT)) { return true; } return false; } /** * Skips one or multiple phrase separators * * @return true if the item we land at after skipping zero or more is * a phrase word */ private boolean skipToNextPhraseWord(boolean quoted) { boolean skipped = false; do { skipped = false; if (submodes.url) { if (tokens.currentIsNoIgnore(RBRACE)) { braceLevelURL--; } if (tokens.currentIsNoIgnore(LBRACE)) { braceLevelURL++; } if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) && braceLevelURL >= 0) { tokens.skip(); skipped = true; } } else if (submodes.site) { if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) && !tokens.currentIsNoIgnore(STAR) && !tokens.currentIsNoIgnore(HAT) && !tokens.currentIsNoIgnore(DOLLAR) && !tokens.currentIsNoIgnore(RBRACE)) { tokens.skip(); skipped = true; } } else { if (tokens.skipMultipleNoIgnore(DOT)) { skipped = true; } if (tokens.skipMultipleNoIgnore(COMMA)) { skipped = true; } if (tokens.skipMultipleNoIgnore(PLUS)) { skipped = true; } if (tokens.skipMultipleNoIgnore(MINUS)) { skipped = true; } if (tokens.skipMultipleNoIgnore(UNDERSCORE)) { skipped = true; } if (tokens.skipMultipleNoIgnore(HAT)) { skipped = true; } if (tokens.skipMultipleNoIgnore(DOLLAR)) { skipped = true; } ; if (tokens.skipMultipleNoIgnore(STAR)) { skipped = true; } if (tokens.skipMultipleNoIgnore(COLON)) { skipped = true; } if (quoted) { if (tokens.skipMultipleNoIgnore(RBRACE)) { skipped = true; } if (tokens.skipMultipleNoIgnore(LBRACE)) { skipped = true; } } if (tokens.skipMultipleNoIgnore(NOISE)) { skipped = true; } } } while (skipped && !tokens.currentIsNoIgnore(WORD) && !tokens.currentIsNoIgnore(NUMBER) && !URLModeWordChar()); return tokens.currentIsNoIgnore(WORD) || tokens.currentIsNoIgnore(NUMBER) || URLModePhraseChar(); } private boolean URLModeWordChar() { if (!submodes.url) { return false; } return tokens.currentIsNoIgnore(UNDERSCORE) || tokens.currentIsNoIgnore(MINUS); } private boolean URLModePhraseChar() { if (!submodes.url) { return false; } return !(tokens.currentIsNoIgnore(RBRACE) || tokens.currentIsNoIgnore(SPACE)); } }