diff options
author | Jon Bratseth <jonbratseth@yahoo.com> | 2017-01-25 10:36:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-01-25 10:36:59 +0100 |
commit | c44f6774a9c1d045e0f9a70a068a07ebd07ebb4c (patch) | |
tree | 64e5177f3e443a3192d8d95c0e454cde78cca02c /container-search | |
parent | d5190086e8a46e40164c7a32c67e7e88b864c217 (diff) | |
parent | 30cd11107f4e00331dfe8a4c7d1fa2b0e9ef7d94 (diff) |
Merge pull request #1564 from yahoo/bratseth/multiple-languages-in-a-query
Bratseth/multiple languages in a query
Diffstat (limited to 'container-search')
20 files changed, 257 insertions, 97 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/Index.java b/container-search/src/main/java/com/yahoo/prelude/Index.java index 4f596cefa95..27bcc77dee8 100644 --- a/container-search/src/main/java/com/yahoo/prelude/Index.java +++ b/container-search/src/main/java/com/yahoo/prelude/Index.java @@ -17,8 +17,8 @@ import java.util.Set; * </ul> * addCommand sets both types. * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> - * @author bratseth + * @author Steinar Knutsen + * @author bratseth */ public class Index { @@ -282,9 +282,7 @@ public class Index { this.isAttribute = isAttribute; } - public boolean hasPlainTokens() { - return plainTokens; - } + public boolean hasPlainTokens() { return plainTokens; } public void setPlainTokens(boolean plainTokens) { this.plainTokens = plainTokens; diff --git a/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java b/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java index 3631dedeffc..3f931c92489 100644 --- a/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java +++ b/container-search/src/main/java/com/yahoo/prelude/IndexFacts.java @@ -18,11 +18,11 @@ import static com.yahoo.text.Lowercase.toLowerCase; * session.getIndex(indexName).[get index info] * </code></pre> * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ // TODO: We should replace this with a better representation of search definitions // which is immutable, models clusters and search definitions inside clusters properly, -// and uses better names. +// and uses better names. 
-bratseth public class IndexFacts { private Map<String, List<String>> clusterByDocument; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java index 7defe67eede..5ae07dd617e 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java @@ -11,7 +11,7 @@ import java.util.Iterator; * A term which contains a fixed length phrase, a collection of word terms, * resulting from a single segmentation operation. * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class PhraseSegmentItem extends IndexedSegmentItem { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java index 0dd8e1c36cc..38e2b82ea35 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java @@ -10,7 +10,7 @@ import com.yahoo.prelude.query.textualrepresentation.Discloser; * extend AndItem to avoid code using instanceof handling it as an * AndItem. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public abstract class SegmentItem extends CompositeItem implements BlockItem { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java index fb56e10445a..0a76e5fb939 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java @@ -115,6 +115,8 @@ public abstract class AbstractParser implements CustomParser { } } + // TODO: Deprecate the unwanted method signatures below + @Override public final QueryTree parse(Parsable query) { Item root = null; @@ -123,7 +125,8 @@ public abstract class AbstractParser implements CustomParser { query.getFilter(), query.getLanguage(), environment.getIndexFacts().newSession(query.getSources(), query.getRestrict()), - query.getDefaultIndexName()); + query.getDefaultIndexName(), + query); } if (root == null) { root = new NullItem(); @@ -134,14 +137,23 @@ public abstract class AbstractParser implements CustomParser { @Override public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage, IndexFacts.Session indexFacts, String defaultIndexName) { - if (queryToParse == null) { - return null; - } - if (parsingLanguage == null) { - parsingLanguage = environment.getLinguistics().getDetector().detect(queryToParse, null).getLanguage(); + return parse(queryToParse, filterToParse, parsingLanguage, indexFacts, defaultIndexName, null); + } + + private Item parse(String queryToParse, String filterToParse, Language parsingLanguage, + IndexFacts.Session indexFacts, String defaultIndexName, Parsable parsable) { + if (queryToParse == null) return null; + + tokenize(queryToParse, defaultIndexName, indexFacts, parsingLanguage); + + if (parsingLanguage == null && parsable != null) { + 
String detectionText = generateLanguageDetectionTextFrom(tokens, indexFacts, defaultIndexName); + if (detectionText.isEmpty()) // heuristic detection text extraction is fallible + detectionText = queryToParse; + parsingLanguage = parsable.getOrDetectLanguage(detectionText); } setState(parsingLanguage, indexFacts); - tokenize(queryToParse, defaultIndexName, indexFacts); + Item root = parseItems(); if (filterToParse != null) { AnyParser filterParser = new AnyParser(environment); @@ -158,6 +170,70 @@ public abstract class AbstractParser implements CustomParser { return root; } + /** + * Do a best-effort attempt at creating a single string for language detection from only the relevant + * subset of tokens. + * The relevant tokens are text tokens which follows names of indexes which are tokenized. + * + * This method does not modify the position of the given token stream. + */ + private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) { + StringBuilder detectionText = new StringBuilder(); + int initialPosition = tokens.getPosition(); + while (tokens.hasNext()) { // look for occurrences of text and text:text + while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) // skip nonwords + tokens.next(); + if (!tokens.hasNext()) break; + + String queryText; + Index index; + + Token word1 = tokens.next(); + if (is(Token.Kind.COLON, tokens.currentNoIgnore())) { + tokens.next(); // colon + Token word2 = tokens.next(); + if ( is(Token.Kind.WORD, word2)) + queryText = word2.image; + else + queryText = ""; + index = indexFacts.getIndex(word1.image); + if (index.isNull()) { // interpret both as words + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image + " " + queryText; + } + } else if (is(Token.Kind.COLON, tokens.currentNoIgnore()) && is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) { + tokens.next(); // colon + tokens.next(); // quote + StringBuilder quotedContent = new StringBuilder(); + 
while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) { + Token token = tokens.next(); + if (is(Token.Kind.WORD, token)) + quotedContent.append(token.image).append(" "); + } + tokens.next(); + queryText = quotedContent.toString(); + index = indexFacts.getIndex(word1.image); + if (index.isNull()) { // interpret both as words + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image + " " + queryText; + } + } else { + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image; + } + + if (queryText != null && index.hasPlainTokens()) + detectionText.append(queryText).append(" "); + } + tokens.setPosition(initialPosition); + return detectionText.toString(); + } + + private boolean is(Token.Kind kind, Token tokenOrNull) { + if (tokenOrNull == null) return false; + return kind.equals(tokenOrNull.kind); + } + protected abstract Item parseItems(); /** @@ -167,25 +243,19 @@ public abstract class AbstractParser implements CustomParser { * @param defaultIndex The default index to assign. * @param item The item to check. 
*/ - private static void assignDefaultIndex(final String defaultIndex, - final Item item) { - if (defaultIndex == null || item == null) { - return; - } + private static void assignDefaultIndex(final String defaultIndex, Item item) { + if (defaultIndex == null || item == null) return; if (item instanceof IndexedItem) { - final IndexedItem indexName = (IndexedItem) item; + IndexedItem indexName = (IndexedItem) item; - if ("".equals(indexName.getIndexName())) { + if ("".equals(indexName.getIndexName())) indexName.setIndexName(defaultIndex); - } - } else if (item instanceof CompositeItem) { - final Iterator<Item> items = ((CompositeItem) item) - .getItemIterator(); - while (items.hasNext()) { - final Item i = items.next(); - assignDefaultIndex(defaultIndex, i); - } + } + else if (item instanceof CompositeItem) { + Iterator<Item> items = ((CompositeItem)item).getItemIterator(); + while (items.hasNext()) + assignDefaultIndex(defaultIndex, items.next()); } } @@ -196,14 +266,14 @@ public abstract class AbstractParser implements CustomParser { * @param input The string to normalize. * @return The normalized string. */ - protected String normalize(final String input) { + protected String normalize(String input) { if (input == null || input.length() == 0) { return input; } return environment.getLinguistics().getNormalizer().normalize(input); } - protected void setState(final Language queryLanguage, IndexFacts.Session indexFacts) { + protected void setState(Language queryLanguage, IndexFacts.Session indexFacts) { this.indexFacts = indexFacts; language = queryLanguage; submodes.reset(); @@ -215,10 +285,11 @@ public abstract class AbstractParser implements CustomParser { * @param query the string to tokenize. * @param defaultIndexName the name of the index to use as default. 
* @param indexFacts resolved information about the index we are searching + * @param language the language set for this query, or null if none */ - protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts) { + protected void tokenize(String query, String defaultIndexName, IndexFacts.Session indexFacts, Language language) { Tokenizer tokenizer = new Tokenizer(environment.getLinguistics()); - tokenizer.setSubstringSpecialTokens(language.isCjk()); + tokenizer.setSubstringSpecialTokens(language != null && language.isCjk()); tokenizer.setSpecialTokens(environment.getSpecialTokens()); tokens.initialize(tokenizer.tokenize(query, defaultIndexName, indexFacts)); } @@ -229,18 +300,18 @@ public abstract class AbstractParser implements CustomParser { * @param unwashed The item whose phrases to simplify. * @return The simplified item. */ - public static Item simplifyPhrases(final Item unwashed) { + public static Item simplifyPhrases(Item unwashed) { if (unwashed == null) { return unwashed; } else if (unwashed instanceof PhraseItem) { return collapsePhrase((PhraseItem) unwashed); } else if (unwashed instanceof CompositeItem) { - final CompositeItem composite = (CompositeItem) unwashed; - final ListIterator<Item> i = composite.getItemIterator(); + CompositeItem composite = (CompositeItem) unwashed; + ListIterator<Item> i = composite.getItemIterator(); while (i.hasNext()) { - final Item original = i.next(); - final Item transformed = simplifyPhrases(original); + Item original = i.next(); + Item transformed = simplifyPhrases(original); if (original != transformed) { i.set(transformed); @@ -252,11 +323,10 @@ public abstract class AbstractParser implements CustomParser { } } - private static Item collapsePhrase(final PhraseItem phrase) { + private static Item collapsePhrase(PhraseItem phrase) { if (phrase.getItemCount() == 1 && phrase.getItem(0) instanceof WordItem) { // TODO: Other stuff which needs propagation? 
- final WordItem word = (WordItem) phrase.getItem(0); - + WordItem word = (WordItem) phrase.getItem(0); word.setWeight(phrase.getWeight()); return word; } else { @@ -266,15 +336,22 @@ public abstract class AbstractParser implements CustomParser { // TODO: The segmenting stuff is a mess now, this will fix it: // - Make Segmenter a class which is instantiated per parsing - // - Make the instance know the language, etc and do all dispatching - // internally - // -JSB + // - Make the instance know the language, etc and do all dispatching internally + // -bratseth // TODO: Use segmenting for forced phrase searches? - protected Item segment(final Token token) { - final String normalizedToken = normalize(token.toString()); + // + // Language detection currently depends on tokenization (see generateLanguageDetectionTextFrom), but + // - the API was originally not constructed for that, so a careful and somewhat unsatisfactory dance + // must be carried out to make it work + // - it should really depend on parsing + // This can be solved by making the segment method language independent by + // always producing a query item containing the token text and resolve it to a WordItem or + // SegmentItem after parsing and language detection. 
+ protected Item segment(Token token) { + String normalizedToken = normalize(token.toString()); if (token.isSpecial()) { - final WordItem w = new WordItem(token.toString(), true, token.substring); + WordItem w = new WordItem(token.toString(), true, token.substring); w.setWords(false); w.setFromSpecialToken(true); return w; @@ -294,11 +371,10 @@ public abstract class AbstractParser implements CustomParser { return new WordItem(segments.get(0), "", true, token.substring); } - final CompositeItem composite = new PhraseSegmentItem(token.toString(), - normalizedToken, true, false, token.substring); + CompositeItem composite = new PhraseSegmentItem(token.toString(), normalizedToken, true, false, token.substring); int n = 0; - for (final String segment : segments) { - final WordItem w = new WordItem(segment, "", true, token.substring); + for (String segment : segments) { + WordItem w = new WordItem(segment, "", true, token.substring); w.setFromSegmented(true); w.setSegmentIndex(n++); w.setStemmed(false); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java index 3043cb27247..95cce001469 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AnyParser.java @@ -15,7 +15,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.*; /** * Parser for queries of type any. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class AnyParser extends SimpleParser { @@ -35,7 +35,7 @@ public class AnyParser extends SimpleParser { Item filterRoot; setState(queryLanguage, indexFacts); - tokenize(filter, null, indexFacts); + tokenize(filter, null, indexFacts, queryLanguage); filterRoot = anyItems(true); @@ -134,7 +134,7 @@ public class AnyParser extends SimpleParser { Item applyFilter(Item root, String filter, Language queryLanguage, IndexFacts.Session indexFacts) { setState(queryLanguage, indexFacts); - tokenize(filter, null, indexFacts); + tokenize(filter, null, indexFacts, queryLanguage); return filterItems(root); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java index a658d35e6de..376d68add0a 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java @@ -11,7 +11,7 @@ import java.util.Objects; import java.util.Set; /** - * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + * @author Simon Thoresen * @since 5.1.4 */ public interface CustomParser extends Parser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java index ba10b7b6ee1..dfd05ca0da5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/PhraseParser.java @@ -8,7 +8,7 @@ import com.yahoo.search.query.parser.ParserEnvironment; /** * Parser for queries of type phrase. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class PhraseParser extends AbstractParser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java index 6117e8e29ed..d1df74fcfa5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/SimpleParser.java @@ -13,7 +13,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.SPACE; * Base class for parsers of the "simple" query languages (query types * ANY and ALL). * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ abstract class SimpleParser extends StructuredParser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java index eb35655e4ca..99ac14bcc55 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java @@ -15,7 +15,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.*; * Base class for parsers of the query languages which can be used * for structured queries (types ANY, ALL and ADVANCED). 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ abstract class StructuredParser extends AbstractParser { @@ -430,9 +430,7 @@ abstract class StructuredParser extends AbstractParser { Item item = null; try { - if (item == null) { - item = word(); - } + item = word(); if (item == null && tokens.currentIs(NUMBER)) { Token t = tokens.next(); @@ -542,7 +540,7 @@ abstract class StructuredParser extends AbstractParser { PhraseItem phrase = null; Item firstWord = null; boolean starAfterFirst = false; - boolean starBeforeFirst = false; + boolean starBeforeFirst; if (tokens.skipMultiple(QUOTE)) { quoted = !quoted; @@ -700,8 +698,7 @@ abstract class StructuredParser extends AbstractParser { if (tokens.currentIsNoIgnore(LBRACE)) { braceLevelURL++; } - if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) - && braceLevelURL >= 0) { + if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) && braceLevelURL >= 0) { tokens.skip(); skipped = true; } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java index cfc22f038c0..e1aa83c7d37 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java @@ -6,10 +6,10 @@ import java.util.List; /** - * An iterator-like view of a list, but typed, random-accessible + * An iterator-like view of a list of tokens, but typed, random-accessible * and with more convenience methods * - * @author bratseth + * @author bratseth */ final class TokenPosition { @@ -214,5 +214,10 @@ final class TokenPosition { skip(); return true; } + + @Override + public String toString() { + return "token " + current(); + } } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java 
b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java index 582395bc738..ed0af8d5060 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java @@ -16,7 +16,7 @@ import static com.yahoo.prelude.query.parser.Token.Kind.*; /** * Query tokenizer. Singlethreaded. * - * @author bratseth + * @author bratseth */ public final class Tokenizer { @@ -28,7 +28,7 @@ public final class Tokenizer { private SpecialTokens specialTokens = null; /** Whether to recognize tokens also as substrings of other tokens, needed for cjk */ - private boolean substringSpecialTokens=false; + private boolean substringSpecialTokens = false; private final CharacterClasses characterClasses; @@ -53,7 +53,7 @@ public final class Tokenizer { /** Sets whether to recognize tokens also as substrings of other tokens, needed for cjk. Default false. */ public void setSubstringSpecialTokens(boolean substringSpecialTokens) { - this.substringSpecialTokens=substringSpecialTokens; + this.substringSpecialTokens = substringSpecialTokens; } /** diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java index 009c11ab1fd..b04ac2fcec5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/CJKSearcher.java @@ -26,21 +26,21 @@ import com.yahoo.search.searchchain.Execution; import com.yahoo.search.searchchain.PhaseNames; /** - * Search to do necessary transforms if the query is in segmented in - * a "CJK language". + * Search to do necessary transforms if the query is segmented in a CJK language. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ @After(PhaseNames.UNBLENDED_RESULT) @Before(STEMMING) @Provides(CJKSearcher.TERM_ORDER_RELAXATION) public class CJKSearcher extends Searcher { + public static final String TERM_ORDER_RELAXATION = "TermOrderRelaxation"; @Override public Result search(Query query, Execution execution) { - Language l = query.getModel().getParsingLanguage(); - if (!l.isCjk()) return execution.search(query); + Language language = query.getModel().getParsingLanguage(); + if ( ! language.isCjk()) return execution.search(query); QueryTree tree = query.getModel().getQueryTree(); tree.setRoot(transform(tree.getRoot())); @@ -82,7 +82,6 @@ public class CJKSearcher extends Searcher { return root; } - private boolean hasOverlappingTokens(PhraseItem phrase) { boolean has = false; for (Iterator<Item> i = phrase.getItemIterator(); i.hasNext(); ) { @@ -108,4 +107,5 @@ public class CJKSearcher extends Searcher { } return segmentsLength > segments.getRawWord().length(); } + } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index 59baeb143d4..142c4455930 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -78,11 +78,11 @@ public class StemmingSearcher extends Searcher { public String getFunctionName() { return "Stemming"; } private Item replaceTerms(Query q, IndexFacts.Session indexFacts) { - Language l = q.getModel().getParsingLanguage(); - if (l == Language.UNKNOWN) { + Language language = q.getModel().getParsingLanguage(); + if (language == Language.UNKNOWN) { return q.getModel().getQueryTree().getRoot(); } - return scan(q.getModel().getQueryTree().getRoot(), l.isCjk(), l, indexFacts, + return 
scan(q.getModel().getQueryTree().getRoot(), language.isCjk(), language, indexFacts, createReverseConnectivities(q.getModel().getQueryTree().getRoot())); } diff --git a/container-search/src/main/java/com/yahoo/search/query/Model.java b/container-search/src/main/java/com/yahoo/search/query/Model.java index bf0939d17c1..5b2694b6dfd 100644 --- a/container-search/src/main/java/com/yahoo/search/query/Model.java +++ b/container-search/src/main/java/com/yahoo/search/query/Model.java @@ -74,7 +74,7 @@ public class Model implements Cloneable { private String filter = null; private Language language = null; private Locale locale = null; - private QueryTree queryTree = null; // The actual query. This is lazily created from the program + private QueryTree queryTree = null; // The query tree to execute. This is lazily created from the program private String defaultIndex = null; private Query.Type type = Query.Type.ALL; private Query parent; @@ -115,6 +115,10 @@ public class Model implements Cloneable { } } + public Language getParsingLanguage() { + return getParsingLanguage(queryString); + } + /** * Gets the language to use for parsing. If this is explicitly set in the model, that language is returned. 
* Otherwise, if a query tree is already produced and any node in it specifies a language the first such @@ -125,9 +129,9 @@ public class Model implements Cloneable { * @return the language determined, never null */ // TODO: We can support multiple languages per query by changing searchers which call this - // to look up the query to use at each point form item.getLanguage + // to look up the query to use at each point from item.getLanguage // with this as fallback for query branches where no parent item specifies language - public Language getParsingLanguage() { + public Language getParsingLanguage(String languageDetectionText) { Language language = getLanguage(); if (language != null) return language; @@ -140,7 +144,7 @@ public class Model implements Cloneable { Linguistics linguistics = execution.context().getLinguistics(); if (linguistics != null) - language = linguistics.getDetector().detect(queryString, null).getLanguage(); + language = linguistics.getDetector().detect(languageDetectionText, null).getLanguage(); // TODO: Set language if detected if (language != Language.UNKNOWN) return language; return Language.ENGLISH; @@ -431,7 +435,8 @@ public class Model implements Cloneable { return (Model)q.properties().get(argumentTypeName); } - public @Override String toString() { + @Override + public String toString() { return "query representation [queryTree: " + queryTree + ", filter: " + filter + "]"; } diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java index f0126b3e866..80194bcccf4 100644 --- a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java +++ b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java @@ -25,8 +25,7 @@ import java.util.Set; * <p>In case you are parsing the content of a {@link Model}, you can use the {@link #fromQueryModel(Model)} factory for * convenience.</p> * - * @author <a 
href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> - * @since 5.1.4 + * @author Simon Thoresen */ public final class Parsable { @@ -35,9 +34,12 @@ public final class Parsable { private String query; private String filter; private String defaultIndexName; - private Language language; + private Language language; // TODO: Initialize to UNKNOWN private Optional<Language> explicitLanguage = Optional.empty(); + /** If this is set it will be used to determine the language, if not set explicitly */ + private Optional<Model> model = Optional.empty(); + public String getQuery() { return query; } @@ -69,14 +71,26 @@ public final class Parsable { * Returns the language to use when parsing, * if not decided by the item under parsing. This is never null or UNKNOWN */ - public Language getLanguage() { return language; } + public Language getLanguage() { + return language; + } + + /** + * Returns the language to use when parsing, with a text to use for detection if necessary. + * if not decided by the item under parsing. This is never null or UNKNOWN + */ + public Language getOrDetectLanguage(String languageDetectionText) { + if (language != null && language != Language.UNKNOWN) return language; + if (model.isPresent()) return model.get().getParsingLanguage(languageDetectionText); + return Language.UNKNOWN; // against the promise in the JavaDoc, but it is not locally ensured + } public Parsable setLanguage(Language language) { Objects.requireNonNull(language, "Language cannot be null"); this.language = language; return this; } - + /** Returns the language explicitly set to be used when parsing, or empty if none is set. 
*/ public Optional<Language> getExplicitLanguage() { return explicitLanguage; } @@ -86,6 +100,12 @@ public final class Parsable { return this; } + public Parsable setModel(Model model) { + Objects.requireNonNull(model, "Model cannot be null"); + this.model = Optional.of(model); + return this; + } + public Set<String> getSources() { return sourceList; } @@ -116,9 +136,9 @@ public final class Parsable { public static Parsable fromQueryModel(Model model) { return new Parsable() + .setModel(model) .setQuery(model.getQueryString()) .setFilter(model.getFilter()) - .setLanguage(model.getParsingLanguage()) .setExplicitLanguage(Optional.ofNullable(model.getLanguage())) .setDefaultIndexName(model.getDefaultIndex()) .addSources(model.getSources()) diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java index 1257f2e2746..819684ad9a5 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java @@ -1916,7 +1916,7 @@ public class ParseTestCase { @Test public void testChineseSpecialTokens() { tester.assertParsed("AND \"cat tcp/ip zu\" \"foo dotnet bar dotnet dotnet c# c++ bar dotnet dotnet wiz\"", - "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","",Query.Type.ALL,Language.CHINESE_SIMPLIFIED); + "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","", Query.Type.ALL, Language.CHINESE_SIMPLIFIED); } /** diff --git a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java index 87835c08127..7cc440e815e 100644 --- a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java +++ 
b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java @@ -67,6 +67,7 @@ public abstract class RuleBaseAbstractTestCase extends junit.framework.TestCase } protected Query assertSemantics(String result, Query query) { + System.out.println(query.getModel().getQueryTree()); createExecution(searcher).search(query); assertEquals(result, query.getModel().getQueryTree().getRoot().toString()); return query; diff --git a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java index 6a3180fc488..eaaf87bc035 100644 --- a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java @@ -2,6 +2,15 @@ package com.yahoo.search.test; import com.yahoo.component.chain.Chain; +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import com.yahoo.language.simple.SimpleLinguistics; +import com.yahoo.prelude.Index; +import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.Highlight; import com.yahoo.prelude.query.IndexedItem; @@ -45,7 +54,7 @@ import static org.junit.Assert.assertNotSame; import static org.junit.Assert.fail; /** - * @author <a href="mailto:arnebef@yahoo-inc.com">Arne Bergene Fossaa</a> + * @author Arne Bergene Fossaa */ public class QueryTestCase { @@ -604,7 +613,7 @@ public class QueryTestCase { @Test public void testModelProperties() { { - Query query=new Query(); + Query query = new Query(); query.properties().set("model.searchPath", "foo"); assertEquals("Set dynamic get dynamic works","foo",query.properties().get("model.searchPath")); assertEquals("Set dynamic get static 
works","foo",query.getModel().getSearchPath()); @@ -628,15 +637,64 @@ public class QueryTestCase { @Test public void testPositiveTerms() { - Query q = new Query(QueryTestCase.httpEncode("/?query=-a \"b c\" d e")); + Query q = new Query(httpEncode("/?query=-a \"b c\" d e")); Item i = q.getModel().getQueryTree().getRoot(); List<IndexedItem> l = QueryTree.getPositiveTerms(i); assertEquals(3, l.size()); } + + @Test + public void testHeuristicLanguageDetectionTextExtraction() { + assertDetectionText("b ", "a:b", "text:a", "text:default"); + assertDetectionText("b ", "b", "text:default"); + assertDetectionText("b ", "b","text:b", "text:default"); + assertDetectionText("a b ", "a:b","text:b", "text:default"); + assertDetectionText("foo bar fuz ", "foo a:bar --() fuz","text:a", "text:default"); + assertDetectionText(" 彭 博士 觀 風向 彭 博士 觀 風向 彭 博士 觀 風向 ","headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\" sddocname:contentindexing!0 embargo:<1484665288753!0 expires:>1484665288753!0", + "text:headline", "text:content", "text:description", "text:default", "nontext:tags", "nontext:sddocname", "nontext:embargo", "nontext:expires"); + } + + private void assertDetectionText(String expectedDetectionText, String queryString, String ... 
indexSpecs) { + Query q = new Query(httpEncode("/?query=" + queryString)); + IndexFacts indexFacts = new IndexFacts(); + for (String indexSpec : indexSpecs) { + String[] specParts = indexSpec.split(":"); + Index tokenIndex = new Index(specParts[1]); + if (specParts[0].equals("text")) + tokenIndex.setPlainTokens(true); + indexFacts.addIndex("testSearchDefinition", tokenIndex); + } + MockLinguistics mockLinguistics = new MockLinguistics(); + q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, indexFacts, mockLinguistics))); + q.getModel().getQueryTree(); // cause parsing + assertEquals(expectedDetectionText, mockLinguistics.detector.lastDetectionText); + } + + /** A linguistics instance which records the last language detection text passed to it */ + private static class MockLinguistics extends SimpleLinguistics { + + final MockDetector detector = new MockDetector(); + + @Override + public Detector getDetector() { return detector; } + + } + + private static class MockDetector extends SimpleDetector { + + String lastDetectionText = null; + + @Override + public Detection detect(String input, Hint hint) { + lastDetectionText = input; + return super.detect(input, hint); + } + + } protected boolean contains(String lineSubstring,String[] lines) { for (String line : lines) - if (line.indexOf(lineSubstring)>=0) return true; + if (line.contains(lineSubstring)) return true; return false; } diff --git a/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/MetricsSearcherTestCase.java b/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/MetricsSearcherTestCase.java index 901c9aa79d4..1fe62c2cd35 100644 --- a/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/MetricsSearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/vespa/streamingvisitors/MetricsSearcherTestCase.java @@ -16,7 +16,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; /** - * @author <a 
href="mailto:ulf@yahoo-inc.com">Ulf Carlin</a> + * @author Ulf Carlin */ public class MetricsSearcherTestCase { private MetricsSearcher metricsSearcher = new MetricsSearcher(); |