// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query.parser;

import com.yahoo.language.Language;
import com.yahoo.language.process.Segmenter;
import com.yahoo.prelude.Index;
import com.yahoo.prelude.IndexFacts;
import com.yahoo.prelude.query.AndSegmentItem;
import com.yahoo.prelude.query.CompositeItem;
import com.yahoo.prelude.query.IndexedItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.NullItem;
import com.yahoo.prelude.query.PhraseItem;
import com.yahoo.prelude.query.PhraseSegmentItem;
import com.yahoo.prelude.query.WordItem;
import com.yahoo.search.query.QueryTree;
import com.yahoo.search.query.parser.Parsable;
import com.yahoo.search.query.parser.ParserEnvironment;

import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;

/**
 * The Vespa query parser.
 *
 * @author bratseth
 * @author Steinar Knutsen
 */
public abstract class AbstractParser implements CustomParser {

    /** The current submodes of this parser */
    protected Submodes submodes = new Submodes();

    /**
     * The current language of this parser. Used to decide whether and how to
     * use the CJKSegmenter.
     */
    protected Language language = Language.UNKNOWN;

    /** The IndexFacts.Session of this query */
    protected IndexFacts.Session indexFacts;

    protected String defaultIndex;

    /**
     * The counter for braces in URLs. Braces in URLs are accepted as long as
     * they are balanced.
     */
    protected int braceLevelURL = 0;

    protected final ParserEnvironment environment;
    protected final TokenPosition tokens = new TokenPosition();

    /**
     * An enumeration of the parser index-controlled submodes. Any combination
     * of these may be active at the same time. Submodes are activated or
     * deactivated by specifying special indexes in the query.
     */
    static final class Submodes {

        /** Url mode allows "_" and "-" as word characters. Default is false. */
        public boolean url = false;

        /**
         * Site mode - host names get start of host and end of host markers.
         * Default is false.
         */
        public boolean site = false;

        /**
         * Sets submodes from an index.
         *
         * @param indexName the index name which should decide the submodes, or null to do nothing
         * @param session the session used to look up information about this index
         */
        // To avoid this we need to pass an IndexFacts.Session down instead - easily done but not without breaking APIs
        public void setFromIndex(String indexName, IndexFacts.Session session) {
            if (indexName == null) return;

            reset();
            Index current = session.getIndex(indexName);

            if (current.isUriIndex())
                url = true;
            else if (current.isHostIndex())
                site = true;
        }

        /** Sets default values for all submodes */
        public void reset() {
            url = false;
            site = false;
        }

        /** Returns whether we are in a mode which allows explicit anchoring markers, ^ and $ */
        public boolean explicitAnchoring() { return site; }

    }
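    // For illustration (the index name "myhost" below is hypothetical): if "myhost" is configured as a
    // host index, a query term such as myhost:docs.vespa.ai makes setFromIndex() activate site mode,
    // which also enables the explicit anchoring markers ^ and $. A term targeting a URI index activates
    // url mode instead, allowing "_" and "-" as word characters for that term.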
    /**
     * Creates a new instance of this class, storing the given {@link ParserEnvironment} for parse-time access to the
     * environment.
     *
     * @param environment the environment settings to attach to the Parser
     */
    protected AbstractParser(ParserEnvironment environment) {
        this.environment = ParserEnvironment.fromParserEnvironment(environment);
        if (this.environment.getIndexFacts() == null) {
            this.environment.setIndexFacts(new IndexFacts());
        }
    }

    // TODO: Deprecate the unwanted method signatures below

    @Override
    public final QueryTree parse(Parsable query) {
        Item root = null;
        if (query != null) {
            root = parse(query.getQuery(),
                         query.getFilter(),
                         query.getExplicitLanguage().orElse(query.getLanguage()),
                         environment.getIndexFacts().newSession(query.getSources(), query.getRestrict()),
                         query.getDefaultIndexName(),
                         query);
        }
        if (root == null) {
            root = new NullItem();
        }
        return new QueryTree(root);
    }

    @Override
    public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage,
                            IndexFacts.Session indexFacts, String defaultIndex) {
        return parse(queryToParse, filterToParse, parsingLanguage, indexFacts, defaultIndex, null);
    }

    private Item parse(String queryToParse, String filterToParse, Language parsingLanguage,
                       IndexFacts.Session indexFacts, String defaultIndex, Parsable parsable) {
        if (queryToParse == null) return null;

        if (defaultIndex != null)
            defaultIndex = indexFacts.getCanonicName(defaultIndex);

        tokenize(queryToParse, defaultIndex, indexFacts, parsingLanguage);

        if (parsingLanguage == null && parsable != null) {
            String detectionText = generateLanguageDetectionTextFrom(tokens, indexFacts, defaultIndex);
            if (detectionText.isEmpty()) // heuristic detection text extraction is fallible
                detectionText = queryToParse;
            parsingLanguage = parsable.getOrDetectLanguage(detectionText);
        }

        setState(parsingLanguage, indexFacts, defaultIndex);
        Item root = parseItems();

        if (filterToParse != null) {
            AnyParser filterParser = new AnyParser(environment);
            if (root == null) {
                root = filterParser.parseFilter(filterToParse, parsingLanguage, indexFacts);
            } else {
                root = filterParser.applyFilter(root, filterToParse, parsingLanguage, indexFacts);
            }
        }
        if (defaultIndex != null)
            assignDefaultIndex(indexFacts.getCanonicName(defaultIndex), root);
        return simplifyPhrases(root);
    }
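    // The private parse(...) above is the common pipeline for all entry points: tokenize the input,
    // optionally detect the language from a relevant subset of the tokens, build the item tree with
    // parseItems(), apply any filter expression, assign the default index to terms which lack one,
    // and finally collapse single-item phrases.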
    /**
     * Does a best-effort attempt at creating a single string for language detection from only the relevant
     * subset of tokens. The relevant tokens are text tokens that follow the names of indexes that are tokenized.
     *
     * This method does not modify the position of the given token stream.
     */
    private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) {
        StringBuilder detectionText = new StringBuilder();
        int initialPosition = tokens.getPosition();
        while (tokens.hasNext()) { // look for occurrences of text and text:text

            while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) // skip nonwords
                tokens.next();
            if (!tokens.hasNext()) break;

            String queryText;
            Index index;

            Token word1 = tokens.next();
            // Check the more specific "index:"quoted text"" form before the plain "index:word" form,
            // otherwise the quoted branch is never reached
            if (is(Token.Kind.COLON, tokens.currentNoIgnore()) && is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) {
                tokens.next(); // colon
                tokens.next(); // quote
                StringBuilder quotedContent = new StringBuilder();
                while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) {
                    Token token = tokens.next();
                    if (is(Token.Kind.WORD, token))
                        quotedContent.append(token.image).append(" ");
                }
                tokens.next();
                queryText = quotedContent.toString();

                index = indexFacts.getIndex(word1.image);
                if (index.isNull()) { // interpret both as words
                    index = indexFacts.getIndex(defaultIndex);
                    queryText = word1.image + " " + queryText;
                }
            }
            else if (is(Token.Kind.COLON, tokens.currentNoIgnore())) {
                tokens.next(); // colon
                Token word2 = tokens.next();
                if (is(Token.Kind.WORD, word2))
                    queryText = word2.image;
                else
                    queryText = "";

                index = indexFacts.getIndex(word1.image);
                if (index.isNull()) { // interpret both as words
                    index = indexFacts.getIndex(defaultIndex);
                    queryText = word1.image + " " + queryText;
                }
            }
            else {
                index = indexFacts.getIndex(defaultIndex);
                queryText = word1.image;
            }

            if (queryText != null && index.hasPlainTokens())
                detectionText.append(queryText).append(" ");
        }
        tokens.setPosition(initialPosition);
        return detectionText.toString();
    }
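    // For illustration (the index name "title" is hypothetical): given a tokenized index "title",
    // the query 'title:hello world' yields the detection text "hello world", while a prefix which is
    // not a known index, as in 'notanindex:hello', is treated as plain words against the default index
    // and yields "notanindex hello" (assuming the default index has plain tokens).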
    /**
     * Assigns the default index to query terms having no default index.
     *
     * This applies the default index to terms without one, including terms added through the filter
     * parameter, where setting defaultIndex in the parser state would cause incorrect parsing.
     *
     * @param defaultIndex the default index to assign
     * @param item the item to check
     */
    private static void assignDefaultIndex(String defaultIndex, Item item) {
        if (defaultIndex == null || item == null) return;

        if (item instanceof IndexedItem indexName) {
            if ("".equals(indexName.getIndexName()))
                indexName.setIndexName(defaultIndex);
        } else if (item instanceof CompositeItem) {
            Iterator<Item> items = ((CompositeItem)item).getItemIterator();
            while (items.hasNext())
                assignDefaultIndex(defaultIndex, items.next());
        }
    }

    private boolean is(Token.Kind kind, Token tokenOrNull) {
        if (tokenOrNull == null) return false;
        return kind.equals(tokenOrNull.kind);
    }

    protected abstract Item parseItems();

    /**
     * Unicode normalizes some piece of natural language text. The chosen form
     * is compatibility decomposition, canonical composition (NFKC).
     */
    protected String normalize(String input) {
        if (input == null || input.length() == 0) return input;
        return environment.getLinguistics().getNormalizer().normalize(input);
    }

    protected void setState(Language queryLanguage, IndexFacts.Session indexFacts, String defaultIndex) {
        this.indexFacts = indexFacts;
        this.defaultIndex = defaultIndex;
        this.language = queryLanguage;
        this.submodes.reset();
    }

    /**
     * Tokenizes the given string and initializes tokens with the found tokens.
     *
     * @param query the string to tokenize
     * @param defaultIndexName the name of the index to use as default
     * @param indexFacts resolved information about the index we are searching
     * @param language the language set for this query, or null if none
     */
    protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts, Language language) {
        Tokenizer tokenizer = new Tokenizer(environment.getLinguistics());
        tokenizer.setSubstringSpecialTokens(language != null && language.isCjk());
        tokenizer.setSpecialTokens(environment.getSpecialTokens());
        tokens.initialize(tokenizer.tokenize(query, defaultIndexName, indexFacts));
    }

    /**
     * Collapses single item phrases in the tree to the contained item.
     *
     * @param unwashed the item whose phrases to simplify
     * @return the simplified item
     */
    public static Item simplifyPhrases(Item unwashed) {
        if (unwashed == null) {
            return unwashed;
        } else if (unwashed instanceof PhraseItem) {
            return collapsePhrase((PhraseItem) unwashed);
        } else if (unwashed instanceof CompositeItem composite) {
            ListIterator<Item> i = composite.getItemIterator();
            while (i.hasNext()) {
                Item original = i.next();
                Item transformed = simplifyPhrases(original);
                if (original != transformed) {
                    i.set(transformed);
                }
            }
            return unwashed;
        } else {
            return unwashed;
        }
    }

    private static Item collapsePhrase(PhraseItem phrase) {
        if (phrase.getItemCount() == 1 && phrase.getItem(0) instanceof WordItem word) {
            // TODO: Other stuff which needs propagation?
            word.setWeight(phrase.getWeight());
            return word;
        } else {
            return phrase;
        }
    }
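    // For illustration (the word is arbitrary): a PhraseItem wrapping the single WordItem "madonna"
    // is collapsed by simplifyPhrases/collapsePhrase into just that WordItem, with the phrase's weight
    // carried over to the word; phrases with two or more items are left untouched.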
    /**
     * Segments a token.
     *
     * @param indexName the index name which preceded this token, or null if none
     * @param token the token to segment
     * @param quoted whether this segment is within quoted text
     * @return the resulting item
     */
    // TODO: The segmenting stuff is a mess now, this will fix it:
    // - Make Segmenter a class which is instantiated per parsing
    // - Make the instance know the language, etc and do all dispatching internally
    // -bratseth
    // TODO: Use segmenting for forced phrase searches?
    //
    // Language detection currently depends on tokenization (see generateLanguageDetectionTextFrom), but
    // - the APIs were originally not constructed for that, so a careful and somewhat unsatisfactory dance
    //   must be carried out to make it work
    // - it should really depend on parsing
    // This can be solved by making the segment method language independent by
    // always producing a query item containing the token text and resolving it to a WordItem or
    // SegmentItem after parsing and language detection.
    protected Item segment(String indexName, Token token, boolean quoted) {
        String normalizedToken = normalize(token.toString());

        if (token.isSpecial()) {
            WordItem w = new WordItem(token.toString(), true, token.substring);
            w.setWords(false);
            w.setFromSpecialToken(true);
            return w;
        }

        if (language == Language.UNKNOWN) {
            return new WordItem(normalizedToken, true, token.substring);
        }

        Segmenter segmenter = environment.getLinguistics().getSegmenter();
        List<String> segments = segmenter.segment(normalizedToken, language);
        if (segments.size() == 0) {
            return null;
        }
        if (segments.size() == 1) {
            return new WordItem(segments.get(0), "", true, token.substring);
        }

        CompositeItem composite;
        if (indexFacts.getIndex(indexName).getPhraseSegmenting() || quoted) {
            composite = new PhraseSegmentItem(token.toString(), normalizedToken, true, false, token.substring);
        } else {
            composite = new AndSegmentItem(token.toString(), true, false);
        }
        int n = 0;
        WordItem previous = null;
        for (String segment : segments) {
            WordItem w = new WordItem(segment, "", true, token.substring);
            w.setFromSegmented(true);
            w.setSegmentIndex(n++);
            w.setStemmed(false);
            if (previous != null)
                previous.setConnectivity(w, 1.0);
            previous = w;
            composite.addItem(w);
        }
        composite.lock();
        return composite;
    }

}