diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-22 12:08:57 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-22 12:08:57 +0100 |
commit | 76c5c6d2ea96842d81cdaa482335e3ddf17a5659 (patch) | |
tree | 080a39a899b8bd2df4ef9c2038c5cdf5a751e251 /container-search | |
parent | 09caf52b327f6a48af8acf02872a49e08d75c9c9 (diff) |
Use text content only for query lang detection
Diffstat (limited to 'container-search')
11 files changed, 185 insertions, 34 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/Index.java b/container-search/src/main/java/com/yahoo/prelude/Index.java index 4f596cefa95..27bcc77dee8 100644 --- a/container-search/src/main/java/com/yahoo/prelude/Index.java +++ b/container-search/src/main/java/com/yahoo/prelude/Index.java @@ -17,8 +17,8 @@ import java.util.Set; * </ul> * addCommand sets both types. * - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> - * @author bratseth + * @author Steinar Knutsen + * @author bratseth */ public class Index { @@ -282,9 +282,7 @@ public class Index { this.isAttribute = isAttribute; } - public boolean hasPlainTokens() { - return plainTokens; - } + public boolean hasPlainTokens() { return plainTokens; } public void setPlainTokens(boolean plainTokens) { this.plainTokens = plainTokens; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java index 7defe67eede..5ae07dd617e 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java @@ -11,7 +11,7 @@ import java.util.Iterator; * A term which contains a fixed length phrase, a collection of word terms, * resulting from a single segmentation operation. 
* - * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + * @author Steinar Knutsen */ public class PhraseSegmentItem extends IndexedSegmentItem { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java index da73aab3396..0a76e5fb939 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java @@ -115,6 +115,8 @@ public abstract class AbstractParser implements CustomParser { } } + // TODO: Deprecate the unwanted method signatures below + @Override public final QueryTree parse(Parsable query) { Item root = null; @@ -123,7 +125,8 @@ public abstract class AbstractParser implements CustomParser { query.getFilter(), query.getLanguage(), environment.getIndexFacts().newSession(query.getSources(), query.getRestrict()), - query.getDefaultIndexName()); + query.getDefaultIndexName(), + query); } if (root == null) { root = new NullItem(); @@ -134,12 +137,20 @@ public abstract class AbstractParser implements CustomParser { @Override public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage, IndexFacts.Session indexFacts, String defaultIndexName) { + return parse(queryToParse, filterToParse, parsingLanguage, indexFacts, defaultIndexName, null); + } + + private Item parse(String queryToParse, String filterToParse, Language parsingLanguage, + IndexFacts.Session indexFacts, String defaultIndexName, Parsable parsable) { if (queryToParse == null) return null; tokenize(queryToParse, defaultIndexName, indexFacts, parsingLanguage); - if (parsingLanguage == null) { - parsingLanguage = environment.getLinguistics().getDetector().detect(queryToParse, null).getLanguage(); + if (parsingLanguage == null && parsable != null) { + String detectionText = generateLanguageDetectionTextFrom(tokens, indexFacts, 
defaultIndexName); + if (detectionText.isEmpty()) // heuristic detection text extraction is fallible + detectionText = queryToParse; + parsingLanguage = parsable.getOrDetectLanguage(detectionText); } setState(parsingLanguage, indexFacts); @@ -159,6 +170,70 @@ public abstract class AbstractParser implements CustomParser { return root; } + /** + * Do a best-effort attempt at creating a single string for language detection from only the relevant + * subset of tokens. + * The relevant tokens are text tokens which follows names of indexes which are tokenized. + * + * This method does not modify the position of the given token stream. + */ + private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) { + StringBuilder detectionText = new StringBuilder(); + int initialPosition = tokens.getPosition(); + while (tokens.hasNext()) { // look for occurrences of text and text:text + while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) // skip nonwords + tokens.next(); + if (!tokens.hasNext()) break; + + String queryText; + Index index; + + Token word1 = tokens.next(); + if (is(Token.Kind.COLON, tokens.currentNoIgnore())) { + tokens.next(); // colon + Token word2 = tokens.next(); + if ( is(Token.Kind.WORD, word2)) + queryText = word2.image; + else + queryText = ""; + index = indexFacts.getIndex(word1.image); + if (index.isNull()) { // interpret both as words + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image + " " + queryText; + } + } else if (is(Token.Kind.COLON, tokens.currentNoIgnore()) && is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) { + tokens.next(); // colon + tokens.next(); // quote + StringBuilder quotedContent = new StringBuilder(); + while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) { + Token token = tokens.next(); + if (is(Token.Kind.WORD, token)) + quotedContent.append(token.image).append(" "); + } + tokens.next(); + queryText = quotedContent.toString(); + 
index = indexFacts.getIndex(word1.image); + if (index.isNull()) { // interpret both as words + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image + " " + queryText; + } + } else { + index = indexFacts.getIndex(defaultIndex); + queryText = word1.image; + } + + if (queryText != null && index.hasPlainTokens()) + detectionText.append(queryText).append(" "); + } + tokens.setPosition(initialPosition); + return detectionText.toString(); + } + + private boolean is(Token.Kind kind, Token tokenOrNull) { + if (tokenOrNull == null) return false; + return kind.equals(tokenOrNull.kind); + } + protected abstract Item parseItems(); /** @@ -264,11 +339,19 @@ public abstract class AbstractParser implements CustomParser { // - Make the instance know the language, etc and do all dispatching internally // -bratseth // TODO: Use segmenting for forced phrase searches? + // + // Language detection currently depends on tokenization (see generateLanguageDetectionTextFrom), but + // - the APIs were originally not constructed for that, so a careful and somewhat unsatisfactory dance + // must be carried out to make it work + // - it should really depend on parsing + // This can be solved by making the segment method language independent by + // always producing a query item containing the token text and resolve it to a WordItem or + // SegmentItem after parsing and language detection. 
protected Item segment(Token token) { String normalizedToken = normalize(token.toString()); if (token.isSpecial()) { - final WordItem w = new WordItem(token.toString(), true, token.substring); + WordItem w = new WordItem(token.toString(), true, token.substring); w.setWords(false); w.setFromSpecialToken(true); return w; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java index a658d35e6de..376d68add0a 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java @@ -11,7 +11,7 @@ import java.util.Objects; import java.util.Set; /** - * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + * @author Simon Thoresen * @since 5.1.4 */ public interface CustomParser extends Parser { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java index fd1617b5350..99ac14bcc55 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java @@ -698,8 +698,7 @@ abstract class StructuredParser extends AbstractParser { if (tokens.currentIsNoIgnore(LBRACE)) { braceLevelURL++; } - if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) - && braceLevelURL >= 0) { + if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) && braceLevelURL >= 0) { tokens.skip(); skipped = true; } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java index cfc22f038c0..e1aa83c7d37 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java +++ 
b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java @@ -6,10 +6,10 @@ import java.util.List; /** - * An iterator-like view of a list, but typed, random-accessible + * An iterator-like view of a list of tokens, but typed, random-accessible * and with more convenience methods * - * @author bratseth + * @author bratseth */ final class TokenPosition { @@ -214,5 +214,10 @@ final class TokenPosition { skip(); return true; } + + @Override + public String toString() { + return "token " + current(); + } } diff --git a/container-search/src/main/java/com/yahoo/search/query/Model.java b/container-search/src/main/java/com/yahoo/search/query/Model.java index ca6f7efaa5e..5b2694b6dfd 100644 --- a/container-search/src/main/java/com/yahoo/search/query/Model.java +++ b/container-search/src/main/java/com/yahoo/search/query/Model.java @@ -115,6 +115,10 @@ public class Model implements Cloneable { } } + public Language getParsingLanguage() { + return getParsingLanguage(queryString); + } + /** * Gets the language to use for parsing. If this is explicitly set in the model, that language is returned. 
* Otherwise, if a query tree is already produced and any node in it specifies a language the first such @@ -127,7 +131,7 @@ public class Model implements Cloneable { // TODO: We can support multiple languages per query by changing searchers which call this // to look up the query to use at each point from item.getLanguage // with this as fallback for query branches where no parent item specifies language - public Language getParsingLanguage() { + public Language getParsingLanguage(String languageDetectionText) { Language language = getLanguage(); if (language != null) return language; @@ -140,7 +144,7 @@ public class Model implements Cloneable { Linguistics linguistics = execution.context().getLinguistics(); if (linguistics != null) - language = linguistics.getDetector().detect(queryString, null).getLanguage(); + language = linguistics.getDetector().detect(languageDetectionText, null).getLanguage(); // TODO: Set language if detected if (language != Language.UNKNOWN) return language; return Language.ENGLISH; diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java index f0126b3e866..80194bcccf4 100644 --- a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java +++ b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java @@ -25,8 +25,7 @@ import java.util.Set; * <p>In case you are parsing the content of a {@link Model}, you can use the {@link #fromQueryModel(Model)} factory for * convenience.</p> * - * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> - * @since 5.1.4 + * @author Simon Thoresen */ public final class Parsable { @@ -35,9 +34,12 @@ public final class Parsable { private String query; private String filter; private String defaultIndexName; - private Language language; + private Language language; // TODO: Initialize to UNKNOWN private Optional<Language> explicitLanguage = Optional.empty(); + /** 
If this is set it will be used to determine the language, if not set explicitly */ + private Optional<Model> model = Optional.empty(); + public String getQuery() { return query; } @@ -69,14 +71,26 @@ public final class Parsable { * Returns the language to use when parsing, * if not decided by the item under parsing. This is never null or UNKNOWN */ - public Language getLanguage() { return language; } + public Language getLanguage() { + return language; + } + + /** + * Returns the language to use when parsing, with a text to use for detection if necessary. + * if not decided by the item under parsing. This is never null or UNKNOWN + */ + public Language getOrDetectLanguage(String languageDetectionText) { + if (language != null && language != Language.UNKNOWN) return language; + if (model.isPresent()) return model.get().getParsingLanguage(languageDetectionText); + return Language.UNKNOWN; // against the promise in the JavaDoc, but it is not locally ensured + } public Parsable setLanguage(Language language) { Objects.requireNonNull(language, "Language cannot be null"); this.language = language; return this; } - + /** Returns the language explicitly set to be used when parsing, or empty if none is set. 
*/ public Optional<Language> getExplicitLanguage() { return explicitLanguage; } @@ -86,6 +100,12 @@ public final class Parsable { return this; } + public Parsable setModel(Model model) { + Objects.requireNonNull(model, "Model cannot be null"); + this.model = Optional.of(model); + return this; + } + public Set<String> getSources() { return sourceList; } @@ -116,9 +136,9 @@ public final class Parsable { public static Parsable fromQueryModel(Model model) { return new Parsable() + .setModel(model) .setQuery(model.getQueryString()) .setFilter(model.getFilter()) - .setLanguage(model.getParsingLanguage()) .setExplicitLanguage(Optional.ofNullable(model.getLanguage())) .setDefaultIndexName(model.getDefaultIndex()) .addSources(model.getSources()) diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java index 1257f2e2746..819684ad9a5 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java @@ -1916,7 +1916,7 @@ public class ParseTestCase { @Test public void testChineseSpecialTokens() { tester.assertParsed("AND \"cat tcp/ip zu\" \"foo dotnet bar dotnet dotnet c# c++ bar dotnet dotnet wiz\"", - "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","",Query.Type.ALL,Language.CHINESE_SIMPLIFIED); + "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","", Query.Type.ALL, Language.CHINESE_SIMPLIFIED); } /** diff --git a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java index 87835c08127..7cc440e815e 100644 --- a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java +++ 
b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java @@ -67,6 +67,7 @@ public abstract class RuleBaseAbstractTestCase extends junit.framework.TestCase } protected Query assertSemantics(String result, Query query) { + System.out.println(query.getModel().getQueryTree()); createExecution(searcher).search(query); assertEquals(result, query.getModel().getQueryTree().getRoot().toString()); return query; diff --git a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java index ec6d4f11369..eaaf87bc035 100644 --- a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java @@ -3,7 +3,14 @@ package com.yahoo.search.test; import com.yahoo.component.chain.Chain; import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; import com.yahoo.language.simple.SimpleLinguistics; +import com.yahoo.prelude.Index; +import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.Highlight; import com.yahoo.prelude.query.IndexedItem; @@ -637,23 +644,57 @@ public class QueryTestCase { } @Test - public void testMultipleLanguages() { - { - Query q = new Query(httpEncode("/?query=headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\"")); - q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, null, new SimpleLinguistics()))); - assertEquals(Language.CHINESE_TRADITIONAL, q.getModel().getParsingLanguage()); + public void testHeuristicLanguageDetectionTextExtraction() { + assertDetectionText("b ", "a:b", "text:a", "text:default"); + assertDetectionText("b ", "b", "text:default"); + 
assertDetectionText("b ", "b","text:b", "text:default"); + assertDetectionText("a b ", "a:b","text:b", "text:default"); + assertDetectionText("foo bar fuz ", "foo a:bar --() fuz","text:a", "text:default"); + assertDetectionText(" 彭 博士 觀 風向 彭 博士 觀 風向 彭 博士 觀 風向 ","headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\" sddocname:contentindexing!0 embargo:<1484665288753!0 expires:>1484665288753!0", + "text:headline", "text:content", "text:description", "text:default", "nontext:tags", "nontext:sddocname", "nontext:embargo", "nontext:expires"); + } + + private void assertDetectionText(String expectedDetectionText, String queryString, String ... indexSpecs) { + Query q = new Query(httpEncode("/?query=" + queryString)); + IndexFacts indexFacts = new IndexFacts(); + for (String indexSpec : indexSpecs) { + String[] specParts = indexSpec.split(":"); + Index tokenIndex = new Index(specParts[1]); + if (specParts[0].equals("text")) + tokenIndex.setPlainTokens(true); + indexFacts.addIndex("testSearchDefinition", tokenIndex); } + MockLinguistics mockLinguistics = new MockLinguistics(); + q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, indexFacts, mockLinguistics))); + q.getModel().getQueryTree(); // cause parsing + assertEquals(expectedDetectionText, mockLinguistics.detector.lastDetectionText); + } + + /** A linguistics instance which records the last language detection text passed to it */ + private static class MockLinguistics extends SimpleLinguistics { - { - Query q = new Query(httpEncode("/?query=headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\" tags:ymedia:type=story tags:ymedia:type=blogpost tags:ymedia:type=slideshow tags:ymedia:type=cavideo tags:ymedia:type=photo -tags:ymedia:hosted=no sddocname:contentindexing!0 embargo:<1484665288753!0 expires:>1484665288753!0")); - q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, null, new SimpleLinguistics()))); - 
assertEquals(Language.CHINESE_TRADITIONAL, q.getModel().getParsingLanguage()); + final MockDetector detector = new MockDetector(); + + @Override + public Detector getDetector() { return detector; } + + } + + private static class MockDetector extends SimpleDetector { + + String lastDetectionText = null; + + @Override + public Detection detect(String input, Hint hint) { + lastDetectionText = input; + return super.detect(input, hint); } + } protected boolean contains(String lineSubstring,String[] lines) { for (String line : lines) - if (line.indexOf(lineSubstring)>=0) return true; + if (line.contains(lineSubstring)) return true; return false; } |