// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query.parser; import com.yahoo.language.Language; import com.yahoo.language.process.Segmenter; import com.yahoo.log.event.*; import com.yahoo.prelude.Index; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.query.*; import com.yahoo.search.query.QueryTree; import com.yahoo.search.query.parser.Parsable; import com.yahoo.search.query.parser.ParserEnvironment; import java.util.*; /** * The Vespa query parser. * * @author bratseth * @author Steinar Knutsen */ @SuppressWarnings("deprecation") public abstract class AbstractParser implements CustomParser { /** The current submodes of this parser */ protected Submodes submodes = new Submodes(); /** * The current language of this parser. Used to decide whether and how to * use the CJKSegmenter */ protected Language language = Language.UNKNOWN; /** The IndexFacts.Session of this query */ protected IndexFacts.Session indexFacts; /** * The counter for braces in URLs, braces in URLs are accepted so long as * they are balanced. */ protected int braceLevelURL = 0; protected final ParserEnvironment environment; protected final TokenPosition tokens = new TokenPosition(); /** * An enumeration of the parser index-controlled submodes. Any combination * of these may be active at the same time. SubModes are activated or * deactivated by specifying special indexes in the query. */ final class Submodes { /** * Url mode allows "_" and "-" as word characters. Default is false */ public boolean url = false; /** * Site mode - host names get start of host and end of host markers. * Default is false */ public boolean site = false; /** * Sets submodes from an index. * * @param indexName the index name which should decide the submodes, or null to do nothing. * @param session the session used to look up information about this index */ @SuppressWarnings({"deprecation"}) // To avoid this we need to pass an IndexFacts.session down instead - easily done but not without breaking API's public void setFromIndex(final String indexName, IndexFacts.Session session) { if (indexName == null) { return; } reset(); final Index current = session.getIndex(indexName); if (current.isUriIndex()) { url = true; } else if (current.isHostIndex()) { site = true; } } /** Sets default values for all submodes */ public void reset() { url = false; site = false; } /** * Returns whether we are in a mode which allows explicit anchoring * markers, ^ and $ * * @return True if we are doing explicit anchoring. */ public boolean explicitAnchoring() { return site; } } /** *
Creates a new instance of this class, storing the given {@link ParserEnvironment} for parse-time access to the * environment.
* * @param environment The environment settings to attach to the Parser. */ protected AbstractParser(ParserEnvironment environment) { this.environment = ParserEnvironment.fromParserEnvironment(environment); if (this.environment.getIndexFacts() == null) { this.environment.setIndexFacts(new IndexFacts()); } } // TODO: Deprecate the unwanted method signatures below @Override public final QueryTree parse(Parsable query) { Item root = null; if (query != null) { root = parse(query.getQuery(), query.getFilter(), query.getExplicitLanguage().orElse(query.getLanguage()), environment.getIndexFacts().newSession(query.getSources(), query.getRestrict()), query.getDefaultIndexName(), query); } if (root == null) { root = new NullItem(); } return new QueryTree(root); } @Override public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage, IndexFacts.Session indexFacts, String defaultIndexName) { return parse(queryToParse, filterToParse, parsingLanguage, indexFacts, defaultIndexName, null); } private Item parse(String queryToParse, String filterToParse, Language parsingLanguage, IndexFacts.Session indexFacts, String defaultIndexName, Parsable parsable) { if (queryToParse == null) return null; tokenize(queryToParse, defaultIndexName, indexFacts, parsingLanguage); if (parsingLanguage == null && parsable != null) { String detectionText = generateLanguageDetectionTextFrom(tokens, indexFacts, defaultIndexName); if (detectionText.isEmpty()) // heuristic detection text extraction is fallible detectionText = queryToParse; parsingLanguage = parsable.getOrDetectLanguage(detectionText); } setState(parsingLanguage, indexFacts); Item root = parseItems(); if (filterToParse != null) { AnyParser filterParser = new AnyParser(environment); if (root == null) { root = filterParser.parseFilter(filterToParse, parsingLanguage, indexFacts); } else { root = filterParser.applyFilter(root, filterToParse, parsingLanguage, indexFacts); } } root = simplifyPhrases(root); if (defaultIndexName != null) { assignDefaultIndex(indexFacts.getCanonicName(defaultIndexName), root); } return root; } /** * Do a best-effort attempt at creating a single string for language detection from only the relevant * subset of tokens. * The relevant tokens are text tokens which follows names of indexes which are tokenized. * * This method does not modify the position of the given token stream. */ private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) { StringBuilder detectionText = new StringBuilder(); int initialPosition = tokens.getPosition(); while (tokens.hasNext()) { // look for occurrences of text and text:text while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) // skip nonwords tokens.next(); if (!tokens.hasNext()) break; String queryText; Index index; Token word1 = tokens.next(); if (is(Token.Kind.COLON, tokens.currentNoIgnore())) { tokens.next(); // colon Token word2 = tokens.next(); if ( is(Token.Kind.WORD, word2)) queryText = word2.image; else queryText = ""; index = indexFacts.getIndex(word1.image); if (index.isNull()) { // interpret both as words index = indexFacts.getIndex(defaultIndex); queryText = word1.image + " " + queryText; } } else if (is(Token.Kind.COLON, tokens.currentNoIgnore()) && is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) { tokens.next(); // colon tokens.next(); // quote StringBuilder quotedContent = new StringBuilder(); while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) { Token token = tokens.next(); if (is(Token.Kind.WORD, token)) quotedContent.append(token.image).append(" "); } tokens.next(); queryText = quotedContent.toString(); index = indexFacts.getIndex(word1.image); if (index.isNull()) { // interpret both as words index = indexFacts.getIndex(defaultIndex); queryText = word1.image + " " + queryText; } } else { index = indexFacts.getIndex(defaultIndex); queryText = word1.image; } if (queryText != null && index.hasPlainTokens()) detectionText.append(queryText).append(" "); } tokens.setPosition(initialPosition); return detectionText.toString(); } private boolean is(Token.Kind kind, Token tokenOrNull) { if (tokenOrNull == null) return false; return kind.equals(tokenOrNull.kind); } protected abstract Item parseItems(); /** * Assigns the default index to query terms having no default index The * parser _should_ have done this, for some reason it doesn't * * @param defaultIndex The default index to assign. * @param item The item to check. */ private static void assignDefaultIndex(final String defaultIndex, Item item) { if (defaultIndex == null || item == null) return; if (item instanceof IndexedItem) { IndexedItem indexName = (IndexedItem) item; if ("".equals(indexName.getIndexName())) indexName.setIndexName(defaultIndex); } else if (item instanceof CompositeItem) { Iterator