diff options
4 files changed, 129 insertions, 90 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java index 92601a5464d..f0126b3e866 100644 --- a/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java +++ b/container-search/src/main/java/com/yahoo/search/query/parser/Parsable.java @@ -6,6 +6,8 @@ import com.yahoo.search.query.Model; import java.util.Collection; import java.util.HashSet; +import java.util.Objects; +import java.util.Optional; import java.util.Set; /** @@ -34,6 +36,7 @@ public final class Parsable { private String filter; private String defaultIndexName; private Language language; + private Optional<Language> explicitLanguage = Optional.empty(); public String getQuery() { return query; @@ -62,15 +65,27 @@ public final class Parsable { return this; } - public Language getLanguage() { - return language; - } + /** + * Returns the language to use when parsing, + * if not decided by the item under parsing. This is never null or UNKNOWN + */ + public Language getLanguage() { return language; } public Parsable setLanguage(Language language) { + Objects.requireNonNull(language, "Language cannot be null"); this.language = language; return this; } + /** Returns the language explicitly set to be used when parsing, or empty if none is set. */ + public Optional<Language> getExplicitLanguage() { return explicitLanguage; } + + public Parsable setExplicitLanguage(Optional<Language> language) { + Objects.requireNonNull(language, "Explicit language cannot be null"); + this.explicitLanguage = language; + return this; + } + public Set<String> getSources() { return sourceList; } @@ -104,6 +119,7 @@ public final class Parsable { .setQuery(model.getQueryString()) .setFilter(model.getFilter()) .setLanguage(model.getParsingLanguage()) + .setExplicitLanguage(Optional.ofNullable(model.getLanguage())) .setDefaultIndexName(model.getDefaultIndex()) .addSources(model.getSources()) .addRestricts(model.getRestrict()); diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java index bace3b0d9d4..5e8851bc5b3 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java +++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java @@ -10,6 +10,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.StringTokenizer; @@ -656,7 +657,7 @@ public class YqlParser implements Parser { Query.Type.ALL.toString(), "grammar for handling user input"); String defaultIndex = getAnnotation(ast, USER_INPUT_DEFAULT_INDEX, String.class, "default", "default index for user input terms"); - Language language = decideUserInputLanguage(ast, wordData); + Language language = decideParsingLanguage(ast, wordData); Item item; if (USER_INPUT_RAW.equals(grammar)) { item = instantiateWordItem(defaultIndex, wordData, ast, null, SegmentWhen.NEVER, language); @@ -666,17 +667,22 @@ public class YqlParser implements Parser { item = parseUserInput(grammar, defaultIndex, wordData, language, allowEmpty); propagateUserInputAnnotations(ast, item); } - item.setLanguage(language); return item; } - private Language decideUserInputLanguage(OperatorNode<ExpressionOperator> ast, String wordData) { + private Language decideParsingLanguage(OperatorNode<ExpressionOperator> ast, String wordData) { String languageTag = getAnnotation(ast, USER_INPUT_LANGUAGE, String.class, null, - "language setting for segmenting user input parameter"); + "language setting for segmenting query section"); + Language language = Language.fromLanguageTag(languageTag); if (language != Language.UNKNOWN) return language; + + Optional<Language> explicitLanguage = currentlyParsing.getExplicitLanguage(); + if (explicitLanguage.isPresent()) return explicitLanguage.get(); + language = detector.detect(wordData, null).getLanguage(); if (language != Language.UNKNOWN) return language; + return Language.ENGLISH; } @@ -711,6 +717,9 @@ public class YqlParser implements Parser { // the null check should be unnecessary, but is there to avoid having to suppress null warnings if ( !allowNullItem && (item == null || item instanceof NullItem)) throw new IllegalArgumentException("Parsing '" + wordData + "' only resulted in NullItem."); + + if (language != Language.ENGLISH) // mark the language used, unless it's the default + item.setLanguage(language); return item; } @@ -1037,11 +1046,8 @@ public class YqlParser implements Parser { } @NonNull - private CompositeItem convertVarArgs(OperatorNode<ExpressionOperator> ast, - int argIdx, @NonNull - CompositeItem out) { - Iterable<OperatorNode<ExpressionOperator>> args = ast - .getArgument(argIdx); + private CompositeItem convertVarArgs(OperatorNode<ExpressionOperator> ast, int argIdx, @NonNull CompositeItem out) { + Iterable<OperatorNode<ExpressionOperator>> args = ast.getArgument(argIdx); for (OperatorNode<ExpressionOperator> arg : args) { assertHasOperator(arg, ExpressionOperator.class); out.addItem(convertExpression(arg)); @@ -1049,10 +1055,8 @@ public class YqlParser implements Parser { return out; } - private void convertVarArgsAnd(OperatorNode<ExpressionOperator> ast, - int argIdx, AndItem outAnd, NotItem outNot) { - Iterable<OperatorNode<ExpressionOperator>> args = ast - .getArgument(argIdx); + private void convertVarArgsAnd(OperatorNode<ExpressionOperator> ast, int argIdx, AndItem outAnd, NotItem outNot) { + Iterable<OperatorNode<ExpressionOperator>> args = ast.getArgument(argIdx); for (OperatorNode<ExpressionOperator> arg : args) { assertHasOperator(arg, ExpressionOperator.class); if (arg.getOperator() == ExpressionOperator.NOT) { @@ -1087,28 +1091,25 @@ public class YqlParser implements Parser { assertHasOperator(spec, ExpressionOperator.CALL); assertHasFunctionName(spec, RANGE); - IntItem range = instantiateRangeItem( - spec.<List<OperatorNode<ExpressionOperator>>> getArgument(1), - spec); + IntItem range = instantiateRangeItem(spec.<List<OperatorNode<ExpressionOperator>>> getArgument(1), spec); return leafStyleSettings(spec, range); } private static Number negate(Number x) { if (x.getClass() == Integer.class) { int x1 = x.intValue(); - return Integer.valueOf(-x1); + return -x1; } else if (x.getClass() == Long.class) { long x1 = x.longValue(); - return Long.valueOf(-x1); + return -x1; } else if (x.getClass() == Float.class) { float x1 = x.floatValue(); - return Float.valueOf(-x1); + return -x1; } else if (x.getClass() == Double.class) { double x1 = x.doubleValue(); - return Double.valueOf(-x1); + return -x1; } else { - throw newUnexpectedArgumentException(x.getClass(), Integer.class, - Long.class, Float.class, Double.class); + throw newUnexpectedArgumentException(x.getClass(), Integer.class, Long.class, Float.class, Double.class); } } @@ -1199,23 +1200,19 @@ public class YqlParser implements Parser { private Item instantiateWordAlternativesItem(String field, OperatorNode<ExpressionOperator> ast) { List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); Preconditions.checkArgument(args.size() >= 1, "Expected 1 or more arguments, got %s.", args.size()); - Preconditions.checkArgument(args.get(0).getOperator() == ExpressionOperator.MAP, "Expected MAP, got %s.", args.get(0) - .getOperator()); + Preconditions.checkArgument(args.get(0).getOperator() == ExpressionOperator.MAP, "Expected MAP, got %s.", + args.get(0).getOperator()); List<WordAlternativesItem.Alternative> terms = new ArrayList<>(); List<String> keys = args.get(0).getArgument(0); List<OperatorNode<ExpressionOperator>> values = args.get(0).getArgument(1); for (int i = 0; i < keys.size(); ++i) { - String term = keys.get(i); - double exactness; OperatorNode<ExpressionOperator> value = values.get(i); - switch (value.getOperator()) { - case LITERAL: - exactness = value.getArgument(0, Double.class); - break; - default: + if (value.getOperator() != ExpressionOperator.LITERAL) throw newUnexpectedArgumentException(value.getOperator(), ExpressionOperator.LITERAL); - } + + String term = keys.get(i); + double exactness = value.getArgument(0, Double.class); terms.add(new WordAlternativesItem.Alternative(term, exactness)); } Substring origin = getOrigin(ast); @@ -1225,54 +1222,51 @@ public class YqlParser implements Parser { } @NonNull - private Item instantiateEquivItem(String field, - OperatorNode<ExpressionOperator> ast) { + private Item instantiateEquivItem(String field, OperatorNode<ExpressionOperator> ast) { List<OperatorNode<ExpressionOperator>> args = ast.getArgument(1); - Preconditions.checkArgument(args.size() >= 2, - "Expected 2 or more arguments, got %s.", args.size()); + Preconditions.checkArgument(args.size() >= 2, "Expected 2 or more arguments, got %s.", args.size()); EquivItem equiv = new EquivItem(); equiv.setIndexName(field); for (OperatorNode<ExpressionOperator> arg : args) { switch (arg.getOperator()) { - case LITERAL: - equiv.addItem(instantiateWordItem(field, arg, equiv.getClass())); - break; - case CALL: - assertHasFunctionName(arg, PHRASE); - equiv.addItem(instantiatePhraseItem(field, arg)); - break; - default: - throw newUnexpectedArgumentException(arg.getOperator(), - ExpressionOperator.CALL, ExpressionOperator.LITERAL); + case LITERAL: + equiv.addItem(instantiateWordItem(field, arg, equiv.getClass())); + break; + case CALL: + assertHasFunctionName(arg, PHRASE); + equiv.addItem(instantiatePhraseItem(field, arg)); + break; + default: + throw newUnexpectedArgumentException(arg.getOperator(), + ExpressionOperator.CALL, ExpressionOperator.LITERAL); } } return leafStyleSettings(ast, equiv); } @NonNull - private Item instantiateWordItem(String field, - OperatorNode<ExpressionOperator> ast, Class<?> parent) { + private Item instantiateWordItem(String field, OperatorNode<ExpressionOperator> ast, Class<?> parent) { return instantiateWordItem(field, ast, parent, SegmentWhen.POSSIBLY); } @NonNull - private Item instantiateWordItem(String field, - OperatorNode<ExpressionOperator> ast, Class<?> parent, - SegmentWhen segmentPolicy) { + private Item instantiateWordItem(String field, + OperatorNode<ExpressionOperator> ast, Class<?> parent, + SegmentWhen segmentPolicy) { String wordData = getStringContents(ast); - return instantiateWordItem(field, wordData, ast, parent, - segmentPolicy, null); + return instantiateWordItem(field, wordData, ast, parent, segmentPolicy, decideParsingLanguage(ast, wordData)); } @NonNull private Item instantiateWordItem(String field, - String rawWord, - OperatorNode<ExpressionOperator> ast, Class<?> parent, - SegmentWhen segmentPolicy, Language language) { + String rawWord, + OperatorNode<ExpressionOperator> ast, Class<?> parent, + SegmentWhen segmentPolicy, + Language language) { String wordData = rawWord; if (getAnnotation(ast, NFKC, Boolean.class, Boolean.TRUE, - "setting for whether to NFKC normalize input data")) { + "setting for whether to NFKC normalize input data")) { wordData = normalizer.normalize(wordData); } boolean fromQuery = getAnnotation(ast, IMPLICIT_TRANSFORMS, @@ -1320,6 +1314,8 @@ public class YqlParser implements Parser { if (wordItem instanceof WordItem) { prepareWord(field, ast, fromQuery, (WordItem) wordItem); } + if (language != Language.ENGLISH) // mark the language used, unless it's the default + ((Item)wordItem).setLanguage(language); return (Item) leafStyleSettings(ast, wordItem); } diff --git a/container-search/src/test/java/com/yahoo/search/yql/MinimalQueryInserterTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/MinimalQueryInserterTestCase.java index c2ce50b38b4..86ec570d6bb 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/MinimalQueryInserterTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/MinimalQueryInserterTestCase.java @@ -27,6 +27,8 @@ import com.yahoo.search.query.Sorting.UcaSorter; import com.yahoo.search.result.ErrorMessage; import com.yahoo.search.searchchain.Execution; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; @@ -133,32 +135,50 @@ public class MinimalQueryInserterTestCase { } @Test + public void testExplicitLanguageIsHonoredWithVerbatimQuery() { + String japaneseWord = "\u30ab\u30bf\u30ab\u30ca"; + Query query = new Query("search/?language=ja" + "&yql=select%20ignoredField%20from%20ignoredsource%20where%20title%20contains%20%22" + encode(japaneseWord) + "%22%3B"); + execution.search(query); + assertEquals(Language.JAPANESE, query.getModel().getParsingLanguage()); + assertEquals("title:"+ japaneseWord, query.getModel().getQueryTree().toString()); + } + + @Test + public void testUserLanguageIsDetectedWithVerbatimQuery() { + String japaneseWord = "\u30ab\u30bf\u30ab\u30ca"; + Query query = new Query("search/?yql=select%20ignoredField%20from%20ignoredsource%20where%20title%20contains%20%22" + encode(japaneseWord) + "%22%3B"); + execution.search(query); + assertEquals(Language.JAPANESE, query.getModel().getParsingLanguage()); + assertEquals("title:"+ japaneseWord, query.getModel().getQueryTree().toString()); + } + + @Test public void testUserLanguageIsDetectedWithUserInput() { String japaneseWord = "\u30ab\u30bf\u30ab\u30ca"; - Query query = new Query("search/?userString=" + japaneseWord + "&yql=select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20and%20userInput(@userString)%3B"); + Query query = new Query("search/?userString=" + encode(japaneseWord) + "&yql=select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20and%20userInput(@userString)%3B"); execution.search(query); - assertEquals("AND title:madonna default:" + japaneseWord, query.getModel().getQueryTree().toString()); assertEquals(Language.JAPANESE, query.getModel().getParsingLanguage()); + assertEquals("AND title:madonna default:" + japaneseWord, query.getModel().getQueryTree().toString()); } @Test public void testUserLanguageIsDetectedWithUserQuery() { String japaneseWord = "\u30ab\u30bf\u30ab\u30ca"; - Query query = new Query("search/?query=" + japaneseWord + "&yql=select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); + Query query = new Query("search/?query=" + encode(japaneseWord) + "&yql=select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); - assertEquals("AND title:madonna " + japaneseWord, query.getModel().getQueryTree().toString()); assertEquals(Language.JAPANESE, query.getModel().getParsingLanguage()); + assertEquals("AND title:madonna " + japaneseWord, query.getModel().getQueryTree().toString()); } @Test - public final void testUserQueryFailsWithoutArgument() { + public void testUserQueryFailsWithoutArgument() { Query query = new Query("search/?query=easilyRecognizedString&yql=select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); assertEquals("AND title:madonna easilyRecognizedString", query.getModel().getQueryTree().toString()); } @Test - public final void testSearchFromAllSourcesWithUserSource() { + public void testSearchFromAllSourcesWithUserSource() { Query query = new Query("search/?query=easilyRecognizedString&sources=abc&yql=select%20ignoredfield%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); assertEquals("AND title:madonna easilyRecognizedString", query.getModel().getQueryTree().toString()); @@ -166,7 +186,7 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testSearchFromAllSourcesWithoutUserSource() { + public void testSearchFromAllSourcesWithoutUserSource() { Query query = new Query("search/?query=easilyRecognizedString&yql=select%20ignoredfield%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); assertEquals("AND title:madonna easilyRecognizedString", query.getModel().getQueryTree().toString()); @@ -174,7 +194,7 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testSearchFromSomeSourcesWithoutUserSource() { + public void testSearchFromSomeSourcesWithoutUserSource() { Query query = new Query("search/?query=easilyRecognizedString&yql=select%20ignoredfield%20from%20sources%20sourceA,%20sourceB%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); assertEquals("AND title:madonna easilyRecognizedString", query.getModel().getQueryTree().toString()); @@ -184,8 +204,8 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testSearchFromSomeSourcesWithUserSource() { - final Query query = new Query("search/?query=easilyRecognizedString&sources=abc&yql=select%20ignoredfield%20from%20sources%20sourceA,%20sourceB%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); + public void testSearchFromSomeSourcesWithUserSource() { + Query query = new Query("search/?query=easilyRecognizedString&sources=abc&yql=select%20ignoredfield%20from%20sources%20sourceA,%20sourceB%20where%20title%20contains%20%22madonna%22%20and%20userQuery()%3B"); execution.search(query); assertEquals("AND title:madonna easilyRecognizedString", query.getModel().getQueryTree().toString()); assertEquals(3, query.getModel().getSources().size()); @@ -206,8 +226,8 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testLimitAndOffset() { - final Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2031offset%207%3B"); + public void testLimitAndOffset() { + Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2031offset%207%3B"); execution.search(query); assertEquals(7, query.getOffset()); assertEquals(24, query.getHits()); @@ -216,8 +236,8 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testMaxOffset() { - final Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2040031offset%2040000%3B"); + public void testMaxOffset() { + Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2040031offset%2040000%3B"); Result r = execution.search(query); assertEquals(1, r.hits().getErrorHit().errors().size()); ErrorMessage e = r.hits().getErrorHit().errorIterator().next(); @@ -226,8 +246,8 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testMaxLimit() { - final Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2040000offset%207%3B"); + public void testMaxLimit() { + Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20limit%2040000offset%207%3B"); Result r = execution.search(query); assertEquals(1, r.hits().getErrorHit().errors().size()); ErrorMessage e = r.hits().getErrorHit().errorIterator().next(); @@ -236,15 +256,15 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testTimeout() { - final Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20timeout%2051%3B"); + public void testTimeout() { + Query query = new Query("search/?yql=select%20*%20from%20sources%20*%20where%20title%20contains%20%22madonna%22%20timeout%2051%3B"); execution.search(query); assertEquals(51L, query.getTimeout()); assertEquals("select * from sources * where title contains \"madonna\" timeout 51;", query.yqlRepresentation()); } @Test - public final void testOrdering() { + public void testOrdering() { { String yql = "select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20order%20by%20something%2C%20shoesize%20desc%20limit%20300%20timeout%203%3B"; Query query = new Query("search/?yql=" + yql); @@ -276,22 +296,20 @@ public class MinimalQueryInserterTestCase { Query query = new Query("search/?yql=" + yql); execution.search(query); { - final FieldOrder fieldOrder = query.getRanking().getSorting() - .fieldOrders().get(0); + FieldOrder fieldOrder = query.getRanking().getSorting().fieldOrders().get(0); assertEquals("other", fieldOrder.getFieldName()); assertEquals(Order.DESCENDING, fieldOrder.getSortOrder()); - final AttributeSorter sorter = fieldOrder.getSorter(); + AttributeSorter sorter = fieldOrder.getSorter(); assertEquals(UcaSorter.class, sorter.getClass()); - final UcaSorter uca = (UcaSorter) sorter; + UcaSorter uca = (UcaSorter) sorter; assertEquals("en_US", uca.getLocale()); assertEquals(UcaSorter.Strength.IDENTICAL, uca.getStrength()); } { - final FieldOrder fieldOrder = query.getRanking().getSorting() - .fieldOrders().get(1); + FieldOrder fieldOrder = query.getRanking().getSorting().fieldOrders().get(1); assertEquals("something", fieldOrder.getFieldName()); assertEquals(Order.ASCENDING, fieldOrder.getSortOrder()); - final AttributeSorter sorter = fieldOrder.getSorter(); + AttributeSorter sorter = fieldOrder.getSorter(); assertEquals(LowerCaseSorter.class, sorter.getClass()); } assertEquals("select foo from bar where title contains \"madonna\" order by [{\"function\": \"uca\", \"locale\": \"en_US\", \"strength\": \"IDENTICAL\"}]other desc, [{\"function\": \"lowercase\"}]something limit 300 timeout 3;", @@ -300,7 +318,7 @@ public class MinimalQueryInserterTestCase { } @Test - public final void testStringReprBasicSanity() { + public void testStringReprBasicSanity() { String yql = "select%20ignoredfield%20from%20ignoredsource%20where%20title%20contains%20%22madonna%22%20order%20by%20something%2C%20shoesize%20desc%20limit%20300%20timeout%203%3B"; Query query = new Query("search/?yql=" + yql); execution.search(query); @@ -316,4 +334,14 @@ public class MinimalQueryInserterTestCase { } assertEquals(expected, actual.toString()); } + + private String encode(String s) { + try { + return URLEncoder.encode(s, "utf-8"); + } + catch (UnsupportedEncodingException e) { + throw new RuntimeException("Will never happen"); + } + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/Language.java b/linguistics/src/main/java/com/yahoo/language/Language.java index 0bf00f1230a..626fb2eac01 100644 --- a/linguistics/src/main/java/com/yahoo/language/Language.java +++ b/linguistics/src/main/java/com/yahoo/language/Language.java @@ -586,9 +586,8 @@ public enum Language { * @return the language given by the encoding, or {@link #UNKNOWN} if not determined. */ public static Language fromEncoding(String encoding) { - if (encoding == null) { - return UNKNOWN; - } + if (encoding == null) return UNKNOWN; + return fromLowerCasedEncoding(Lowercase.toLowerCase(encoding)); } |