summaryrefslogtreecommitdiffstats
path: root/container-search
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@yahoo-inc.com> 2017-01-22 12:08:57 +0100
committer Jon Bratseth <bratseth@yahoo-inc.com> 2017-01-22 12:08:57 +0100
commit 76c5c6d2ea96842d81cdaa482335e3ddf17a5659 (patch)
tree 080a39a899b8bd2df4ef9c2038c5cdf5a751e251 /container-search
parent 09caf52b327f6a48af8acf02872a49e08d75c9c9 (diff)
Use text content only for query lang detection
Diffstat (limited to 'container-search')
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/Index.java 8
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java 2
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java 91
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java 2
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java 3
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java 9
-rw-r--r-- container-search/src/main/java/com/yahoo/search/query/Model.java 8
-rw-r--r-- container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java 32
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java 2
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java 1
-rw-r--r-- container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java 61
11 files changed, 185 insertions, 34 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/Index.java b/container-search/src/main/java/com/yahoo/prelude/Index.java
index 4f596cefa95..27bcc77dee8 100644
--- a/container-search/src/main/java/com/yahoo/prelude/Index.java
+++ b/container-search/src/main/java/com/yahoo/prelude/Index.java
@@ -17,8 +17,8 @@ import java.util.Set;
* </ul>
* addCommand sets both types.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
- * @author bratseth
+ * @author Steinar Knutsen
+ * @author bratseth
*/
public class Index {
@@ -282,9 +282,7 @@ public class Index {
this.isAttribute = isAttribute;
}
- public boolean hasPlainTokens() {
- return plainTokens;
- }
+ public boolean hasPlainTokens() { return plainTokens; }
public void setPlainTokens(boolean plainTokens) {
this.plainTokens = plainTokens;
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java
index 7defe67eede..5ae07dd617e 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseSegmentItem.java
@@ -11,7 +11,7 @@ import java.util.Iterator;
* A term which contains a fixed length phrase, a collection of word terms,
* resulting from a single segmentation operation.
*
- * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
+ * @author Steinar Knutsen
*/
public class PhraseSegmentItem extends IndexedSegmentItem {
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java
index da73aab3396..0a76e5fb939 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/AbstractParser.java
@@ -115,6 +115,8 @@ public abstract class AbstractParser implements CustomParser {
}
}
+ // TODO: Deprecate the unwanted method signatures below
+
@Override
public final QueryTree parse(Parsable query) {
Item root = null;
@@ -123,7 +125,8 @@ public abstract class AbstractParser implements CustomParser {
query.getFilter(),
query.getLanguage(),
environment.getIndexFacts().newSession(query.getSources(), query.getRestrict()),
- query.getDefaultIndexName());
+ query.getDefaultIndexName(),
+ query);
}
if (root == null) {
root = new NullItem();
@@ -134,12 +137,20 @@ public abstract class AbstractParser implements CustomParser {
@Override
public final Item parse(String queryToParse, String filterToParse, Language parsingLanguage,
IndexFacts.Session indexFacts, String defaultIndexName) {
+ return parse(queryToParse, filterToParse, parsingLanguage, indexFacts, defaultIndexName, null);
+ }
+
+ private Item parse(String queryToParse, String filterToParse, Language parsingLanguage,
+ IndexFacts.Session indexFacts, String defaultIndexName, Parsable parsable) {
if (queryToParse == null) return null;
tokenize(queryToParse, defaultIndexName, indexFacts, parsingLanguage);
- if (parsingLanguage == null) {
- parsingLanguage = environment.getLinguistics().getDetector().detect(queryToParse, null).getLanguage();
+ if (parsingLanguage == null && parsable != null) {
+ String detectionText = generateLanguageDetectionTextFrom(tokens, indexFacts, defaultIndexName);
+ if (detectionText.isEmpty()) // heuristic detection text extraction is fallible
+ detectionText = queryToParse;
+ parsingLanguage = parsable.getOrDetectLanguage(detectionText);
}
setState(parsingLanguage, indexFacts);
@@ -159,6 +170,70 @@ public abstract class AbstractParser implements CustomParser {
return root;
}
+ /**
+ * Do a best-effort attempt at creating a single string for language detection from only the relevant
+ * subset of tokens.
+ * The relevant tokens are text tokens which follow the names of indexes which are tokenized.
+ *
+ * This method does not modify the position of the given token stream.
+ */
+ private String generateLanguageDetectionTextFrom(TokenPosition tokens, IndexFacts.Session indexFacts, String defaultIndex) {
+ StringBuilder detectionText = new StringBuilder();
+ int initialPosition = tokens.getPosition();
+ while (tokens.hasNext()) { // look for occurrences of text and text:text
+ while (!tokens.currentIs(Token.Kind.WORD) && tokens.hasNext()) // skip nonwords
+ tokens.next();
+ if (!tokens.hasNext()) break;
+
+ String queryText;
+ Index index;
+
+ Token word1 = tokens.next();
+ if (is(Token.Kind.COLON, tokens.currentNoIgnore())) {
+ tokens.next(); // colon
+ Token word2 = tokens.next();
+ if ( is(Token.Kind.WORD, word2))
+ queryText = word2.image;
+ else
+ queryText = "";
+ index = indexFacts.getIndex(word1.image);
+ if (index.isNull()) { // interpret both as words
+ index = indexFacts.getIndex(defaultIndex);
+ queryText = word1.image + " " + queryText;
+ }
+ } else if (is(Token.Kind.COLON, tokens.currentNoIgnore()) && is(Token.Kind.QUOTE, tokens.currentNoIgnore(1))) {
+ tokens.next(); // colon
+ tokens.next(); // quote
+ StringBuilder quotedContent = new StringBuilder();
+ while (!tokens.currentIs(Token.Kind.QUOTE) && tokens.hasNext()) {
+ Token token = tokens.next();
+ if (is(Token.Kind.WORD, token))
+ quotedContent.append(token.image).append(" ");
+ }
+ tokens.next();
+ queryText = quotedContent.toString();
+ index = indexFacts.getIndex(word1.image);
+ if (index.isNull()) { // interpret both as words
+ index = indexFacts.getIndex(defaultIndex);
+ queryText = word1.image + " " + queryText;
+ }
+ } else {
+ index = indexFacts.getIndex(defaultIndex);
+ queryText = word1.image;
+ }
+
+ if (queryText != null && index.hasPlainTokens())
+ detectionText.append(queryText).append(" ");
+ }
+ tokens.setPosition(initialPosition);
+ return detectionText.toString();
+ }
+
+ private boolean is(Token.Kind kind, Token tokenOrNull) {
+ if (tokenOrNull == null) return false;
+ return kind.equals(tokenOrNull.kind);
+ }
+
protected abstract Item parseItems();
/**
@@ -264,11 +339,19 @@ public abstract class AbstractParser implements CustomParser {
// - Make the instance know the language, etc and do all dispatching internally
// -bratseth
// TODO: Use segmenting for forced phrase searches?
+ //
+ // Language detection currently depends on tokenization (see generateLanguageDetectionTextFrom), but
+ // - the APIs were originally not constructed for that, so a careful and somewhat unsatisfactory dance
+ // must be carried out to make it work
+ // - it should really depend on parsing
+ // This can be solved by making the segment method language independent by
+ // always producing a query item containing the token text and resolve it to a WordItem or
+ // SegmentItem after parsing and language detection.
protected Item segment(Token token) {
String normalizedToken = normalize(token.toString());
if (token.isSpecial()) {
- final WordItem w = new WordItem(token.toString(), true, token.substring);
+ WordItem w = new WordItem(token.toString(), true, token.substring);
w.setWords(false);
w.setFromSpecialToken(true);
return w;
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java
index a658d35e6de..376d68add0a 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/CustomParser.java
@@ -11,7 +11,7 @@ import java.util.Objects;
import java.util.Set;
/**
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ * @author Simon Thoresen
* @since 5.1.4
*/
public interface CustomParser extends Parser {
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java
index fd1617b5350..99ac14bcc55 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/StructuredParser.java
@@ -698,8 +698,7 @@ abstract class StructuredParser extends AbstractParser {
if (tokens.currentIsNoIgnore(LBRACE)) {
braceLevelURL++;
}
- if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE)
- && braceLevelURL >= 0) {
+ if (tokens.hasNext() && !tokens.currentIsNoIgnore(SPACE) && braceLevelURL >= 0) {
tokens.skip();
skipped = true;
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java
index cfc22f038c0..e1aa83c7d37 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/TokenPosition.java
@@ -6,10 +6,10 @@ import java.util.List;
/**
- * An iterator-like view of a list, but typed, random-accessible
+ * An iterator-like view of a list of tokens, but typed, random-accessible
* and with more convenience methods
*
- * @author bratseth
+ * @author bratseth
*/
final class TokenPosition {
@@ -214,5 +214,10 @@ final class TokenPosition {
skip();
return true;
}
+
+ @Override
+ public String toString() {
+ return "token " + current();
+ }
}
diff --git a/container-search/src/main/java/com/yahoo/search/query/Model.java b/container-search/src/main/java/com/yahoo/search/query/Model.java
index ca6f7efaa5e..5b2694b6dfd 100644
--- a/container-search/src/main/java/com/yahoo/search/query/Model.java
+++ b/container-search/src/main/java/com/yahoo/search/query/Model.java
@@ -115,6 +115,10 @@ public class Model implements Cloneable {
}
}
+ public Language getParsingLanguage() {
+ return getParsingLanguage(queryString);
+ }
+
/**
* Gets the language to use for parsing. If this is explicitly set in the model, that language is returned.
* Otherwise, if a query tree is already produced and any node in it specifies a language the first such
@@ -127,7 +131,7 @@ public class Model implements Cloneable {
// TODO: We can support multiple languages per query by changing searchers which call this
// to look up the query to use at each point from item.getLanguage
// with this as fallback for query branches where no parent item specifies language
- public Language getParsingLanguage() {
+ public Language getParsingLanguage(String languageDetectionText) {
Language language = getLanguage();
if (language != null) return language;
@@ -140,7 +144,7 @@ public class Model implements Cloneable {
Linguistics linguistics = execution.context().getLinguistics();
if (linguistics != null)
- language = linguistics.getDetector().detect(queryString, null).getLanguage();
+ language = linguistics.getDetector().detect(languageDetectionText, null).getLanguage(); // TODO: Set language if detected
if (language != Language.UNKNOWN) return language;
return Language.ENGLISH;
diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java
index f0126b3e866..80194bcccf4 100644
--- a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java
+++ b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java
@@ -25,8 +25,7 @@ import java.util.Set;
* <p>In case you are parsing the content of a {@link Model}, you can use the {@link #fromQueryModel(Model)} factory for
* convenience.</p>
*
- * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
- * @since 5.1.4
+ * @author Simon Thoresen
*/
public final class Parsable {
@@ -35,9 +34,12 @@ public final class Parsable {
private String query;
private String filter;
private String defaultIndexName;
- private Language language;
+ private Language language; // TODO: Initialize to UNKNOWN
private Optional<Language> explicitLanguage = Optional.empty();
+ /** If this is set it will be used to determine the language, if not set explicitly */
+ private Optional<Model> model = Optional.empty();
+
public String getQuery() {
return query;
}
@@ -69,14 +71,26 @@ public final class Parsable {
* Returns the language to use when parsing,
* if not decided by the item under parsing. This is never null or UNKNOWN
*/
- public Language getLanguage() { return language; }
+ public Language getLanguage() {
+ return language;
+ }
+
+ /**
+ * Returns the language to use when parsing, if not decided by the item under parsing,
+ * detecting it from the given text if necessary. This is never null, but is UNKNOWN
+ * if no language is set and no model is available for detection.
+ */
+ public Language getOrDetectLanguage(String languageDetectionText) {
+ if (language != null && language != Language.UNKNOWN) return language;
+ if (model.isPresent()) return model.get().getParsingLanguage(languageDetectionText);
+ return Language.UNKNOWN; // against the promise in the JavaDoc, but it is not locally ensured
+ }
public Parsable setLanguage(Language language) {
Objects.requireNonNull(language, "Language cannot be null");
this.language = language;
return this;
}
-
+
/** Returns the language explicitly set to be used when parsing, or empty if none is set. */
public Optional<Language> getExplicitLanguage() { return explicitLanguage; }
@@ -86,6 +100,12 @@ public final class Parsable {
return this;
}
+ public Parsable setModel(Model model) {
+ Objects.requireNonNull(model, "Model cannot be null");
+ this.model = Optional.of(model);
+ return this;
+ }
+
public Set<String> getSources() {
return sourceList;
}
@@ -116,9 +136,9 @@ public final class Parsable {
public static Parsable fromQueryModel(Model model) {
return new Parsable()
+ .setModel(model)
.setQuery(model.getQueryString())
.setFilter(model.getFilter())
- .setLanguage(model.getParsingLanguage())
.setExplicitLanguage(Optional.ofNullable(model.getLanguage()))
.setDefaultIndexName(model.getDefaultIndex())
.addSources(model.getSources())
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
index 1257f2e2746..819684ad9a5 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
@@ -1916,7 +1916,7 @@ public class ParseTestCase {
@Test
public void testChineseSpecialTokens() {
tester.assertParsed("AND \"cat tcp/ip zu\" \"foo dotnet bar dotnet dotnet c# c++ bar dotnet dotnet wiz\"",
- "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","",Query.Type.ALL,Language.CHINESE_SIMPLIFIED);
+ "cattcp/ipzu foo.netbar.net.netC#c++bar.net.netwiz","", Query.Type.ALL, Language.CHINESE_SIMPLIFIED);
}
/**
diff --git a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java
index 87835c08127..7cc440e815e 100644
--- a/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/semantics/test/RuleBaseAbstractTestCase.java
@@ -67,6 +67,7 @@ public abstract class RuleBaseAbstractTestCase extends junit.framework.TestCase
}
protected Query assertSemantics(String result, Query query) {
+ System.out.println(query.getModel().getQueryTree());
createExecution(searcher).search(query);
assertEquals(result, query.getModel().getQueryTree().getRoot().toString());
return query;
diff --git a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java
index ec6d4f11369..eaaf87bc035 100644
--- a/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/test/QueryTestCase.java
@@ -3,7 +3,14 @@ package com.yahoo.search.test;
import com.yahoo.component.chain.Chain;
import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.language.simple.SimpleDetector;
import com.yahoo.language.simple.SimpleLinguistics;
+import com.yahoo.prelude.Index;
+import com.yahoo.prelude.IndexFacts;
import com.yahoo.prelude.query.AndItem;
import com.yahoo.prelude.query.Highlight;
import com.yahoo.prelude.query.IndexedItem;
@@ -637,23 +644,57 @@ public class QueryTestCase {
}
@Test
- public void testMultipleLanguages() {
- {
- Query q = new Query(httpEncode("/?query=headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\""));
- q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, null, new SimpleLinguistics())));
- assertEquals(Language.CHINESE_TRADITIONAL, q.getModel().getParsingLanguage());
+ public void testHeuristicLanguageDetectionTextExtraction() {
+ assertDetectionText("b ", "a:b", "text:a", "text:default");
+ assertDetectionText("b ", "b", "text:default");
+ assertDetectionText("b ", "b","text:b", "text:default");
+ assertDetectionText("a b ", "a:b","text:b", "text:default");
+ assertDetectionText("foo bar fuz ", "foo a:bar --() fuz","text:a", "text:default");
+ assertDetectionText(" 彭 博士 觀 風向 彭 博士 觀 風向 彭 博士 觀 風向 ","headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\" sddocname:contentindexing!0 embargo:<1484665288753!0 expires:>1484665288753!0",
+ "text:headline", "text:content", "text:description", "text:default", "nontext:tags", "nontext:sddocname", "nontext:embargo", "nontext:expires");
+ }
+
+ private void assertDetectionText(String expectedDetectionText, String queryString, String ... indexSpecs) {
+ Query q = new Query(httpEncode("/?query=" + queryString));
+ IndexFacts indexFacts = new IndexFacts();
+ for (String indexSpec : indexSpecs) {
+ String[] specParts = indexSpec.split(":");
+ Index tokenIndex = new Index(specParts[1]);
+ if (specParts[0].equals("text"))
+ tokenIndex.setPlainTokens(true);
+ indexFacts.addIndex("testSearchDefinition", tokenIndex);
}
+ MockLinguistics mockLinguistics = new MockLinguistics();
+ q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, indexFacts, mockLinguistics)));
+ q.getModel().getQueryTree(); // cause parsing
+ assertEquals(expectedDetectionText, mockLinguistics.detector.lastDetectionText);
+ }
+
+ /** A linguistics instance which records the last language detection text passed to it */
+ private static class MockLinguistics extends SimpleLinguistics {
- {
- Query q = new Query(httpEncode("/?query=headline:\"彭 博士 觀 風向\" content:\"彭 博士 觀 風向\" description:\"彭 博士 觀 風向\" tags:ymedia:type=story tags:ymedia:type=blogpost tags:ymedia:type=slideshow tags:ymedia:type=cavideo tags:ymedia:type=photo -tags:ymedia:hosted=no sddocname:contentindexing!0 embargo:<1484665288753!0 expires:>1484665288753!0"));
- q.getModel().setExecution(new Execution(Execution.Context.createContextStub(null, null, new SimpleLinguistics())));
- assertEquals(Language.CHINESE_TRADITIONAL, q.getModel().getParsingLanguage());
+ final MockDetector detector = new MockDetector();
+
+ @Override
+ public Detector getDetector() { return detector; }
+
+ }
+
+ private static class MockDetector extends SimpleDetector {
+
+ String lastDetectionText = null;
+
+ @Override
+ public Detection detect(String input, Hint hint) {
+ lastDetectionText = input;
+ return super.detect(input, hint);
}
+
}
protected boolean contains(String lineSubstring,String[] lines) {
for (String line : lines)
- if (line.indexOf(lineSubstring)>=0) return true;
+ if (line.contains(lineSubstring)) return true;
return false;
}