diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java |
Publish
Diffstat (limited to 'container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java')
-rw-r--r-- | container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java | 311 |
1 files changed, 311 insertions, 0 deletions
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query.parser;

import com.yahoo.language.Language;
import com.yahoo.language.process.Segmenter;
import com.yahoo.log.event.*;
import com.yahoo.prelude.Index;
import com.yahoo.prelude.IndexFacts;
import com.yahoo.prelude.query.*;
import com.yahoo.search.query.QueryTree;
import com.yahoo.search.query.parser.Parsable;
import com.yahoo.search.query.parser.ParserEnvironment;

import java.util.*;

/**
 * The Vespa query parser.
 *
 * <p>This base class drives a parse: it records the language and index session, tokenizes the
 * query string, delegates the actual grammar to {@link #parseItems()}, optionally applies a
 * filter string, collapses single-word phrases and finally assigns the default index to terms
 * which have none.</p>
 *
 * @author bratseth
 * @author Steinar Knutsen
 */
@SuppressWarnings("deprecation")
public abstract class AbstractParser implements CustomParser {

    /** The current submodes of this parser */
    protected Submodes submodes = new Submodes();

    /**
     * The current language of this parser. Used to decide whether and how to
     * segment tokens (see {@link #segment(Token)}) and whether the tokenizer
     * should match special tokens as substrings (see {@link #tokenize}).
     */
    protected Language language = Language.UNKNOWN;

    /** The IndexFacts.Session of this query */
    protected IndexFacts.Session indexFacts;

    /**
     * The counter for braces in URLs, braces in URLs are accepted so long as
     * they are balanced.
     */
    protected int braceLevelURL = 0;

    /** The environment this parser reads index facts, linguistics and special tokens from */
    protected final ParserEnvironment environment;

    /** The token stream of the query currently being parsed, (re)initialized by {@link #tokenize} */
    protected final TokenPosition tokens = new TokenPosition();

    /**
     * The parser index-controlled submodes. Submodes are activated or
     * deactivated by specifying special indexes in the query.
     */
    final class Submodes {

        /**
         * Url mode allows "_" and "-" as word characters. Default is false
         */
        public boolean url = false;

        /**
         * Site mode - host names get start of host and end of host markers.
         * Default is false
         */
        public boolean site = false;

        /**
         * Sets submodes from an index. All submodes are first reset, then url mode is
         * enabled if the index is a uri index, otherwise site mode is enabled if it is
         * a host index.
         *
         * @param indexName the index name which should decide the submodes, or null to do nothing.
         * @param session the session used to look up information about this index
         */
        // NOTE(review): this suppression looks stale now that an IndexFacts.Session is passed
        // in - confirm that nothing deprecated is used here before removing it.
        @SuppressWarnings({"deprecation"})
        public void setFromIndex(final String indexName, IndexFacts.Session session) {
            if (indexName == null) {
                return;
            }

            reset();

            final Index current = session.getIndex(indexName);
            if (current.isUriIndex()) {
                url = true;
            } else if (current.isHostIndex()) {
                site = true;
            }
        }

        /** Sets default values for all submodes */
        public void reset() {
            url = false;
            site = false;
        }

        /**
         * Returns whether we are in a mode which allows explicit anchoring
         * markers, ^ and $
         *
         * @return true if we are doing explicit anchoring, which is the case in site mode.
         */
        public boolean explicitAnchoring() {
            return site;
        }
    }

    /**
     * <p>Creates a new instance of this class, keeping the environment obtained from
     * {@link ParserEnvironment#fromParserEnvironment} for parse-time access. If that
     * environment has no index facts, an empty {@link IndexFacts} is installed in it.</p>
     *
     * @param environment The environment settings to attach to the Parser.
     */
    protected AbstractParser(ParserEnvironment environment) {
        this.environment = ParserEnvironment.fromParserEnvironment(environment);
        if (this.environment.getIndexFacts() == null) {
            this.environment.setIndexFacts(new IndexFacts());
        }
    }

    /**
     * Parses the given parsable into a query tree.
     *
     * @param query the query to parse, may be null
     * @return a query tree, whose root is a {@link NullItem} if the query was null
     *         or parsing produced no root item
     */
    @Override
    public final QueryTree parse(Parsable query) {
        Item root = null;
        if (query != null) {
            IndexFacts.Session session =
                    environment.getIndexFacts().newSession(query.getSources(), query.getRestrict());
            root = parse(query.getQuery(),
                         query.getFilter(),
                         query.getLanguage(),
                         session,
                         query.getDefaultIndexName());
        }
        if (root == null) {
            root = new NullItem();
        }
        return new QueryTree(root);
    }

    /**
     * Parses a query string and an optional filter string into an item tree.
     *
     * @param queryToParse the query string, if null this returns null
     * @param filterToParse the filter string to apply to the parsed query, or null for none
     * @param parsingLanguage the language of the query, if null it is detected from the query string
     * @param indexFacts the index session to resolve index information from
     * @param defaultIndexName the default index, assigned to terms without an index, or null for none
     * @return the root of the parsed tree, or null if there is no query string or nothing was produced
     */
    @Override
    public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage,
                            IndexFacts.Session indexFacts, String defaultIndexName) {
        if (queryToParse == null) {
            return null;
        }
        if (parsingLanguage == null) {
            parsingLanguage = environment.getLinguistics().getDetector().detect(queryToParse, null).getLanguage();
        }

        // State must be set before tokenizing, as tokenization depends on the language
        setState(parsingLanguage, indexFacts);
        tokenize(queryToParse, defaultIndexName, indexFacts);
        Item root = parseItems();

        if (filterToParse != null) {
            AnyParser filterParser = new AnyParser(environment);
            if (root == null) {
                root = filterParser.parseFilter(filterToParse, parsingLanguage, indexFacts);
            } else {
                root = filterParser.applyFilter(root, filterToParse, parsingLanguage, indexFacts);
            }
        }

        root = simplifyPhrases(root);
        if (defaultIndexName != null) {
            assignDefaultIndex(indexFacts.getCanonicName(defaultIndexName), root);
        }
        return root;
    }

    /**
     * Parses the tokens currently held in {@link #tokens} into an item tree.
     *
     * @return the root of the parsed tree; callers in this class handle a null return
     */
    protected abstract Item parseItems();

    /**
     * Assigns the default index to indexed items whose index name is the empty string.
     * Composite items which are not themselves indexed items are traversed recursively.
     * The parser _should_ have done this, for some reason it doesn't.
     *
     * @param defaultIndex The default index to assign, if null nothing is done.
     * @param item The item to check, if null nothing is done.
     */
    private static void assignDefaultIndex(final String defaultIndex, final Item item) {
        if (defaultIndex == null || item == null) {
            return;
        }

        if (item instanceof IndexedItem) {
            final IndexedItem indexed = (IndexedItem) item;
            if ("".equals(indexed.getIndexName())) {
                indexed.setIndexName(defaultIndex);
            }
        } else if (item instanceof CompositeItem) {
            final Iterator<Item> children = ((CompositeItem) item).getItemIterator();
            while (children.hasNext()) {
                assignDefaultIndex(defaultIndex, children.next());
            }
        }
    }

    /**
     * Unicode normalizes some piece of natural language text using the normalizer
     * of the environment's linguistics. The chosen form
     * is compatibility decomposition, canonical composition (NFKC).
     *
     * @param input The string to normalize.
     * @return The normalized string, or the input itself if it is null or empty.
     */
    protected String normalize(final String input) {
        if (input == null || input.isEmpty()) {
            return input;
        }
        return environment.getLinguistics().getNormalizer().normalize(input);
    }

    /**
     * Prepares this parser for a new parse: stores the index session and
     * language, and resets all submodes.
     *
     * @param queryLanguage the language of the query to be parsed
     * @param indexFacts the index session of the query to be parsed
     */
    protected void setState(final Language queryLanguage, IndexFacts.Session indexFacts) {
        this.indexFacts = indexFacts;
        language = queryLanguage;
        submodes.reset();
    }

    /**
     * Tokenizes the given string and initializes tokens with the found tokens.
     * Special tokens are matched as substrings when the current language is CJK.
     *
     * @param query the string to tokenize.
     * @param defaultIndexName the name of the index to use as default.
     * @param indexFacts resolved information about the index we are searching
     */
    protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts) {
        Tokenizer tokenizer = new Tokenizer(environment.getLinguistics());
        tokenizer.setSubstringSpecialTokens(language.isCjk());
        tokenizer.setSpecialTokens(environment.getSpecialTokens());
        tokens.initialize(tokenizer.tokenize(query, defaultIndexName, indexFacts));
    }

    /**
     * Collapses single item phrases in the tree to the contained item.
     * Composite items are modified in place: each child which is simplified to a
     * different item is replaced by it.
     *
     * @param unwashed The item whose phrases to simplify.
     * @return The simplified item, which is the argument itself unless it is a
     *         phrase which was collapsed. Null if the argument is null.
     */
    public static Item simplifyPhrases(final Item unwashed) {
        if (unwashed == null) {
            return null;
        }
        if (unwashed instanceof PhraseItem) {
            return collapsePhrase((PhraseItem) unwashed);
        }
        if (unwashed instanceof CompositeItem) {
            final ListIterator<Item> children = ((CompositeItem) unwashed).getItemIterator();
            while (children.hasNext()) {
                final Item original = children.next();
                final Item simplified = simplifyPhrases(original);
                if (simplified != original) {
                    children.set(simplified);
                }
            }
        }
        return unwashed;
    }

    /**
     * Returns the single word of a phrase containing exactly one {@link WordItem},
     * after giving that word the weight of the phrase. Any other phrase is
     * returned unchanged.
     */
    private static Item collapsePhrase(final PhraseItem phrase) {
        if (phrase.getItemCount() != 1 || !(phrase.getItem(0) instanceof WordItem)) {
            return phrase;
        }
        // TODO: Other stuff which needs propagation?
        final WordItem word = (WordItem) phrase.getItem(0);
        word.setWeight(phrase.getWeight());
        return word;
    }

    // TODO: The segmenting stuff is a mess now, this will fix it:
    // - Make Segmenter a class which is instantiated per parsing
    // - Make the instance know the language, etc and do all dispatching
    // internally
    // -JSB
    // TODO: Use segmenting for forced phrase searches?
    /**
     * Converts a token to an item, segmenting it if the current language is known.
     *
     * <ul>
     * <li>A special token becomes a single word item holding the token text as-is.</li>
     * <li>With {@link Language#UNKNOWN}, the normalized token becomes a single word item.</li>
     * <li>Otherwise the normalized token is segmented: no segments yields null, one
     * segment yields a word item, and several segments yield a locked
     * {@link PhraseSegmentItem} with one word item per segment.</li>
     * </ul>
     *
     * @param token the token to convert
     * @return the resulting item, or null if segmentation produced no segments
     */
    protected Item segment(final Token token) {
        if (token.isSpecial()) {
            // Special tokens use the raw token text, so normalization is not needed for them
            final WordItem special = new WordItem(token.toString(), true, token.substring);
            special.setWords(false);
            special.setFromSpecialToken(true);
            return special;
        }

        final String normalizedToken = normalize(token.toString());
        if (language == Language.UNKNOWN) {
            return new WordItem(normalizedToken, true, token.substring);
        }

        final Segmenter segmenter = environment.getLinguistics().getSegmenter();
        final List<String> segments = segmenter.segment(normalizedToken, language);
        if (segments.isEmpty()) {
            return null;
        }
        if (segments.size() == 1) {
            return new WordItem(segments.get(0), "", true, token.substring);
        }

        final CompositeItem composite = new PhraseSegmentItem(token.toString(),
                                                              normalizedToken, true, false, token.substring);
        int segmentIndex = 0;
        for (final String segment : segments) {
            final WordItem word = new WordItem(segment, "", true, token.substring);
            word.setFromSegmented(true);
            word.setSegmentIndex(segmentIndex++);
            word.setStemmed(false);
            composite.addItem(word);
        }
        composite.lock();
        return composite;
    }

}