summaryrefslogtreecommitdiffstats
path: root/container-search
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2018-10-19 12:24:00 +0200
committerJon Bratseth <bratseth@oath.com>2018-10-19 12:24:00 +0200
commit9d8c278983f9750af74301d769fd2538a6be93ec (patch)
tree54f723f69ac91d503649e1df192a54eb43192181 /container-search
parentceb6f44366553e2ffa340a55ac1db531743a8d07 (diff)
Segment 'contains' text by default
The current default is to not segment text given to 'contains' in YQL outside of userInput, which means that submitting text that is not pre-tokenized in exactly the same way as by the tokenizer of the field won't work (for example, the queries "contains 'foo bar'" or "contains 'foo.bar'"). Because of this default, some complicated logic was apparently added to track when it needs to be automatically replaced with a more reasonable choice, in the case where one Vespa instance federates to another. I have removed that logic now, because it doesn't seem necessary without the bad default.
Diffstat (limited to 'container-search')
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/YqlParser.java86
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/ResegmentingTestCase.java148
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java45
3 files changed, 29 insertions, 250 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
index ee288de4e88..e4ae759eec7 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
@@ -18,10 +18,8 @@ import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;
import com.yahoo.collections.LazyMap;
import com.yahoo.collections.LazySet;
-import com.yahoo.collections.Tuple2;
import com.yahoo.component.Version;
import com.yahoo.language.Language;
-import com.yahoo.language.Linguistics;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
@@ -117,9 +115,6 @@ public class YqlParser implements Parser {
private static final String NORMALIZE_CASE_DESCRIPTION = "setting for whether to do case normalization if field implies it";
private static final String ORIGIN_DESCRIPTION = "string origin for a term";
private static final String RANKED_DESCRIPTION = "setting for whether to use term for ranking";
- private static final String SEGMENTER_BACKEND = "backend";
- private static final String SEGMENTER = "segmenter";
- private static final String SEGMENTER_VERSION = "version";
private static final String STEM_DESCRIPTION = "setting for whether to use stem if field implies it";
private static final String USE_POSITION_DATA_DESCRIPTION = "setting for whether to use position data for ranking this item";
private static final String USER_INPUT_ALLOW_EMPTY = "allowEmpty";
@@ -190,8 +185,6 @@ public class YqlParser implements Parser {
private final Detector detector;
private final Set<String> yqlSources = LazySet.newHashSet();
private final Set<String> yqlSummaryFields = LazySet.newHashSet();
- private final String localSegmenterBackend;
- private final Version localSegmenterVersion;
private Integer hits;
private Integer offset;
private Integer timeout;
@@ -201,10 +194,7 @@ public class YqlParser implements Parser {
private IndexNameExpander indexNameExpander = new IndexNameExpander();
private Set<String> docTypes;
private Sorting sorting;
- private String segmenterBackend;
- private Version segmenterVersion;
private boolean queryParser = true;
- private boolean resegment = false;
private final Deque<OperatorNode<?>> annotationStack = new ArrayDeque<>();
private final ParserEnvironment environment;
@@ -238,10 +228,6 @@ public class YqlParser implements Parser {
segmenter = environment.getLinguistics().getSegmenter();
detector = environment.getLinguistics().getDetector();
this.environment = environment;
-
- Tuple2<String, Version> version = environment.getLinguistics().getVersion(Linguistics.Component.SEGMENTER);
- localSegmenterBackend = version.first;
- localSegmenterVersion = version.second;
}
@NonNull
@@ -261,10 +247,7 @@ public class YqlParser implements Parser {
currentlyParsing = query;
docTypes = null;
sorting = null;
- segmenterBackend = null;
- segmenterVersion = null;
// queryParser set prior to calling this
- resegment = false;
return buildTree(parseYqlProgram());
}
@@ -287,32 +270,12 @@ public class YqlParser implements Parser {
filterPart.getArguments().length);
populateYqlSources(filterPart.<OperatorNode<?>> getArgument(0));
OperatorNode<ExpressionOperator> filterExpression = filterPart.getArgument(1);
- populateLinguisticsAnnotations(filterExpression);
Item root = convertExpression(filterExpression);
connectItems();
userQuery = null;
return new QueryTree(root);
}
- private void populateLinguisticsAnnotations(OperatorNode<ExpressionOperator> filterExpression) {
- Map<?, ?> segmenter = getAnnotation(filterExpression, SEGMENTER,
- Map.class, null, "segmenter engine and version");
- if (segmenter == null) {
- segmenterVersion = null;
- segmenterBackend = null;
- resegment = false;
- } else {
- segmenterBackend = getMapValue(SEGMENTER, segmenter, SEGMENTER_BACKEND, String.class);
- try {
- segmenterVersion = new Version(getMapValue(SEGMENTER, segmenter, SEGMENTER_VERSION, String.class));
- } catch (RuntimeException e) {
- segmenterVersion = null;
- }
- resegment = ! localSegmenterBackend.equals(segmenterBackend) ||
- ! localSegmenterVersion.equals(segmenterVersion);
- }
- }
-
private void populateYqlSources(OperatorNode<?> filterArgs) {
yqlSources.clear();
if (filterArgs.getOperator() == SequenceOperator.SCAN) {
@@ -614,8 +577,7 @@ public class YqlParser implements Parser {
}
phrase.setIndexName(field);
- if (resegment
- && getAnnotation(ast, IMPLICIT_TRANSFORMS, Boolean.class, Boolean.TRUE, IMPLICIT_TRANSFORMS_DESCRIPTION)) {
+ if (getAnnotation(ast, IMPLICIT_TRANSFORMS, Boolean.class, Boolean.TRUE, IMPLICIT_TRANSFORMS_DESCRIPTION)) {
words = segmenter.segment(origin.getValue(), currentlyParsing.getLanguage());
}
@@ -719,16 +681,16 @@ public class YqlParser implements Parser {
return Language.ENGLISH;
}
- private String getStringContents(OperatorNode<ExpressionOperator> propertySniffer) {
- switch (propertySniffer.getOperator()) {
+ private String getStringContents(OperatorNode<ExpressionOperator> operator) {
+ switch (operator.getOperator()) {
case LITERAL:
- return propertySniffer.getArgument(0, String.class);
+ return operator.getArgument(0, String.class);
case VARREF:
Preconditions.checkState(userQuery != null,
"properties must be available when trying to fetch user input");
- return userQuery.properties().getString(propertySniffer.getArgument(0, String.class));
+ return userQuery.properties().getString(operator.getArgument(0, String.class));
default:
- throw newUnexpectedArgumentException(propertySniffer.getOperator(),
+ throw newUnexpectedArgumentException(operator.getOperator(),
ExpressionOperator.LITERAL, ExpressionOperator.VARREF);
}
}
@@ -1317,6 +1279,8 @@ public class YqlParser implements Parser {
"setting for whether to use suffix match of input data");
boolean substrMatch = getAnnotation(ast, SUBSTRING, Boolean.class, Boolean.FALSE,
"setting for whether to use substring match of input data");
+ String grammar = getAnnotation(ast, USER_INPUT_GRAMMAR, String.class,
+ Query.Type.ALL.toString(), "grammar for handling word input");
Preconditions.checkArgument((prefixMatch ? 1 : 0) +
(substrMatch ? 1 : 0) + (suffixMatch ? 1 : 0) < 2,
"Only one of prefix, substring and suffix can be set.");
@@ -1336,21 +1300,21 @@ public class YqlParser implements Parser {
wordItem = new WordItem(wordData, fromQuery);
break;
case POSSIBLY:
- if (shouldResegmentWord(field, fromQuery)) {
- wordItem = resegment(field, ast, wordData, fromQuery, parent, language);
+ if (shouldSegment(field, fromQuery) && ! grammar.equals(USER_INPUT_RAW)) {
+ wordItem = segment(field, ast, wordData, fromQuery, parent, language);
} else {
wordItem = new WordItem(wordData, fromQuery);
}
break;
case ALWAYS:
- wordItem = resegment(field, ast, wordData, fromQuery, parent, language);
+ wordItem = segment(field, ast, wordData, fromQuery, parent, language);
break;
default:
throw new IllegalArgumentException("Unexpected segmenting rule: " + segmentPolicy);
}
}
if (wordItem instanceof WordItem) {
- prepareWord(field, ast, fromQuery, (WordItem) wordItem);
+ prepareWord(field, ast, (WordItem) wordItem);
}
if (language != Language.ENGLISH) // mark the language used, unless it's the default
((Item)wordItem).setLanguage(language);
@@ -1358,13 +1322,13 @@ public class YqlParser implements Parser {
}
@SuppressWarnings({"deprecation"})
- private boolean shouldResegmentWord(String field, boolean fromQuery) {
- return resegment && fromQuery && ! indexFactsSession.getIndex(field).isAttribute();
+ private boolean shouldSegment(String field, boolean fromQuery) {
+ return fromQuery && ! indexFactsSession.getIndex(field).isAttribute();
}
@NonNull
- private TaggableItem resegment(String field, OperatorNode<ExpressionOperator> ast, String wordData,
- boolean fromQuery, Class<?> parent, Language language) {
+ private TaggableItem segment(String field, OperatorNode<ExpressionOperator> ast, String wordData,
+ boolean fromQuery, Class<?> parent, Language language) {
String toSegment = wordData;
Substring s = getOrigin(ast);
Language usedLanguage = language == null ? currentlyParsing.getLanguage() : language;
@@ -1383,7 +1347,7 @@ public class YqlParser implements Parser {
((PhraseSegmentItem) wordItem).setIndexName(field);
for (String w : words) {
WordItem segment = new WordItem(w, fromQuery);
- prepareWord(field, ast, fromQuery, segment);
+ prepareWord(field, ast, segment);
((PhraseSegmentItem) wordItem).addItem(segment);
}
((PhraseSegmentItem) wordItem).lock();
@@ -1400,15 +1364,9 @@ public class YqlParser implements Parser {
return parent == EquivItem.class;
}
- private void prepareWord(String field, OperatorNode<ExpressionOperator> ast, boolean fromQuery, WordItem wordItem) {
+ private void prepareWord(String field, OperatorNode<ExpressionOperator> ast, WordItem wordItem) {
wordItem.setIndexName(field);
wordStyleSettings(ast, wordItem);
- if (shouldResegmentWord(field, fromQuery)) {
- // force re-stemming, new case normalization, etc
- wordItem.setStemmed(false);
- wordItem.setLowercased(false);
- wordItem.setNormalizable(true);
- }
}
@NonNull
@@ -1710,14 +1668,6 @@ public class YqlParser implements Parser {
return new IllegalArgumentException(out.toString());
}
- String getSegmenterBackend() {
- return segmenterBackend;
- }
-
- Version getSegmenterVersion() {
- return segmenterVersion;
- }
-
private static final class ConnectedItem {
final double weight;
diff --git a/container-search/src/test/java/com/yahoo/search/yql/ResegmentingTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/ResegmentingTestCase.java
deleted file mode 100644
index 340fbf7342b..00000000000
--- a/container-search/src/test/java/com/yahoo/search/yql/ResegmentingTestCase.java
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.search.yql;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import com.yahoo.search.query.parser.Parsable;
-import com.yahoo.search.query.parser.ParserEnvironment;
-
-/**
- * Check rules for resegmenting words in YQL+ when segmenter is deemed
- * incompatible. The class under testing is {@link YqlParser}.
- *
- * @author Steinar Knutsen
- */
-public class ResegmentingTestCase {
-
- private YqlParser parser;
-
- @Before
- public void setUp() throws Exception {
- ParserEnvironment env = new ParserEnvironment();
- parser = new YqlParser(env);
- }
-
- @After
- public void tearDown() throws Exception {
- parser = null;
- }
-
- @Test
- public final void testWord() {
- assertEquals(
- "title:'a b'",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}] (title contains \"a b\");"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegment() {
- assertEquals(
- "title:'c d'",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where"
- + " [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}}]"
- + " phrase(\"a\", \"b\")));"))
- .toString());
- }
-
- @Test
- public final void testPhraseInEquiv() {
- assertEquals(
- "EQUIV title:a title:'c d'",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where"
- + " [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains"
- + " equiv(\"a\","
- + " ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}}]\"b\")"
- + ")"
- + ");"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegmentToAndSegment() {
- assertEquals(
- "SAND title:c title:d",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where"
- + " [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}, \"andSegmenting\": true}]"
- + " phrase(\"a\", \"b\")));"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegmentInPhrase() {
- assertEquals(
- "title:\"a 'c d'\"",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains phrase(\"a\","
- + " ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}}]"
- + " phrase(\"e\", \"f\"))));"))
- .toString());
- }
-
- @Test
- public final void testWordNoImplicitTransforms() {
- assertEquals(
- "title:a b",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}] (title contains ([{\"implicitTransforms\": false}]\"a b\"));"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegmentNoImplicitTransforms() {
- assertEquals(
- "title:'a b'",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where"
- + " [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}, \"implicitTransforms\": false}]"
- + " phrase(\"a\", \"b\")));"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegmentToAndSegmentNoImplicitTransforms() {
- assertEquals(
- "SAND title:a title:b",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where"
- + " [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}, \"andSegmenting\": true, \"implicitTransforms\": false}]"
- + " phrase(\"a\", \"b\")));"))
- .toString());
- }
-
- @Test
- public final void testPhraseSegmentInPhraseNoImplicitTransforms() {
- assertEquals(
- "title:\"a 'e f'\"",
- parser.parse(
- new Parsable()
- .setQuery("select * from sources * where [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": \"nonexistant\"}}]"
- + " (title contains phrase(\"a\","
- + " ([{\"origin\": {\"offset\": 0, \"length\":3, \"original\": \"c d\"}, \"implicitTransforms\": false}]"
- + " phrase(\"e\", \"f\"))));"))
- .toString());
- }
-
-}
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index b8ea1fd4f4b..127820bb7ae 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -11,6 +11,7 @@ import com.yahoo.prelude.query.IndexedItem;
import com.yahoo.prelude.query.ExactStringItem;
import com.yahoo.prelude.query.Item;
import com.yahoo.prelude.query.PhraseItem;
+import com.yahoo.prelude.query.PhraseSegmentItem;
import com.yahoo.prelude.query.PrefixItem;
import com.yahoo.prelude.query.QueryCanonicalizer;
import com.yahoo.prelude.query.RegExpItem;
@@ -320,12 +321,14 @@ public class YqlParserTestCase {
@Test
public void testRaw() {
+ // Default: Not raw, for comparison
Item root = parse("select foo from bar where baz contains (\"yoni jo dima\");").getRoot();
- assertTrue(root instanceof WordItem);
- assertFalse(root instanceof ExactStringItem);
- assertEquals("yoni jo dima", ((WordItem)root).getWord());
+ assertEquals("baz:'yoni jo dima'", root.toString());
+ assertFalse(root instanceof WordItem);
+ assertTrue(root instanceof PhraseSegmentItem);
root = parse("select foo from bar where baz contains ([{\"grammar\":\"raw\"}]\"yoni jo dima\");").getRoot();
+ assertEquals("baz:yoni jo dima", root.toString());
assertTrue(root instanceof WordItem);
assertFalse(root instanceof ExactStringItem);
assertEquals("yoni jo dima", ((WordItem)root).getWord());
@@ -735,37 +738,11 @@ public class YqlParserTestCase {
@Test
public void testSegmenting() {
- assertParse("select * from bar where ([{\"segmenter\": {\"version\": \"58.67.49\", \"backend\": " +
- "\"yell\"}}] title contains \"madonna\");",
- "title:madonna");
- assertEquals("yell", parser.getSegmenterBackend());
- assertEquals(new Version("58.67.49"), parser.getSegmenterVersion());
-
- assertParse("select * from bar where ([{\"segmenter\": {\"version\": \"8.7.3\", \"backend\": " +
- "\"yell\"}}]([{\"targetNumHits\": 9999438}] weakAnd(format contains \"online\", title contains " +
- "\"madonna\")));",
- "WAND(9999438) format:online title:madonna");
- assertEquals("yell", parser.getSegmenterBackend());
- assertEquals(new Version("8.7.3"), parser.getSegmenterVersion());
-
- assertParse("select * from bar where [{\"segmenter\": {\"version\": \"18.47.39\", \"backend\": " +
- "\"yell\"}}] ([{\"targetNumHits\": 99909438}] weakAnd(format contains \"online\", title contains " +
- "\"madonna\"));",
- "WAND(99909438) format:online title:madonna");
- assertEquals("yell", parser.getSegmenterBackend());
- assertEquals(new Version("18.47.39"), parser.getSegmenterVersion());
-
- assertParse("select * from bar where [{\"targetNumHits\": 99909438}] weakAnd(format contains " +
- "\"online\", title contains \"madonna\");",
- "WAND(99909438) format:online title:madonna");
- assertNull(parser.getSegmenterBackend());
- assertNull(parser.getSegmenterVersion());
-
- assertParse("select * from bar where [{\"segmenter\": {\"version\": \"58.67.49\", \"backend\": " +
- "\"yell\"}}](title contains \"madonna\") order by shoesize;",
- "title:madonna");
- assertEquals("yell", parser.getSegmenterBackend());
- assertEquals(new Version("58.67.49"), parser.getSegmenterVersion());
+ assertParse("select * from bar where title contains 'foo.bar';",
+ "title:'foo bar'");
+
+ assertParse("select * from bar where title contains 'foo&123';",
+ "title:'foo 123'");
}
@Test