summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 11:19:18 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 13:45:59 +0000
commit0afbf14df1ee158167f70016545e799af1e433dc (patch)
tree5af98617f61cb76fcfe897585c9c2712955de3b9
parent433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff)
Wire fuzzy prefix matching support through the query stack
Adds `prefix:[true|false]` annotation support to the `fuzzy` query operator in the YQL and JSON query languages. Fuzzy prefix matching semantics are wired through to the matcher implementations for both indexed and streaming search. Example usage: {maxEditDistance:1,prefix:true}fuzzy("foo") Will match `foo`, `foobar`, `foxtrot`, `zookeeper` and so on. It can be combined with the existing prefix locking feature: {maxEditDistance:1,prefixLength:2,prefix:true}fuzzy("foo") Which will match `foo`, `foobar`, `foxtrot` etc, but _not_ `zookeeper` since the locked prefix (`fo`) does not match. Due to the complexities involved with extending the legacy binary query stack representation, signalling prefix matching for the fuzzy term is done by pragmatically adding a new, generic "prefix matching" term-level flag. This is currently ignored for everything except fuzzy query items. Modernizing the query stack format to make it more extensible (i.e. move encoding to Protobuf) is on the backlog...!
-rw-r--r--container-search/abi-spec.json5
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java59
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/Item.java14
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/SelectParser.java3
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java25
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/YqlParser.java9
-rw-r--r--container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java23
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java5
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java22
-rw-r--r--container-search/src/test/java/com/yahoo/select/SelectTestCase.java35
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp54
-rw-r--r--searchlib/src/tests/query/customtypevisitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/query_visitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/querybuilder_test.cpp47
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp12
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/parse.h2
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h21
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.h12
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h5
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/querybuilder.h9
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/queryreplicator.h3
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/simplequery.h6
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h10
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/term.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/term.h8
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/termnodes.h23
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp114
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp39
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h3
38 files changed, 443 insertions, 189 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index 07f0449e61a..5e66e1bb746 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -555,11 +555,15 @@
"public"
],
"methods" : [
+ "public void <init>(java.lang.String, boolean, java.lang.String, int, int, boolean)",
"public void <init>(java.lang.String, boolean, java.lang.String, int, int)",
"public void setMaxEditDistance(int)",
"public void setPrefixLength(int)",
"public int getPrefixLength()",
"public int getMaxEditDistance()",
+ "public boolean isPrefixMatch()",
+ "public void setPrefixMatch(boolean)",
+ "protected boolean hasPrefixMatchSemantics()",
"public void setValue(java.lang.String)",
"public java.lang.String getRawWord()",
"public boolean isWords()",
@@ -820,6 +824,7 @@
"public abstract java.lang.String getName()",
"public void setFilter(boolean)",
"public boolean isFilter()",
+ "protected boolean hasPrefixMatchSemantics()",
"public com.yahoo.prelude.query.Item$ItemCreator getCreator()",
"public void setCreator(com.yahoo.prelude.query.Item$ItemCreator)",
"public void setWeight(int)",
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
index 3cf86a70985..b900dee20ba 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
@@ -16,15 +16,21 @@ public class FuzzyItem extends TermItem {
private int maxEditDistance;
private int prefixLength;
+ private boolean prefixMatch;
public static int DEFAULT_MAX_EDIT_DISTANCE = 2;
public static int DEFAULT_PREFIX_LENGTH = 0;
- public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) {
+ public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength, boolean prefixMatch) {
super(indexName, isFromQuery, null);
setValue(term);
setMaxEditDistance(maxEditDistance);
setPrefixLength(prefixLength);
+ setPrefixMatch(prefixMatch);
+ }
+
+ public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) {
+ this(indexName, isFromQuery, term, maxEditDistance, prefixLength, false);
}
public void setMaxEditDistance(int maxEditDistance) {
@@ -43,6 +49,19 @@ public class FuzzyItem extends TermItem {
return this.maxEditDistance;
}
+ public boolean isPrefixMatch() {
+ return this.prefixMatch;
+ }
+
+ public void setPrefixMatch(boolean prefixMatch) {
+ this.prefixMatch = prefixMatch;
+ }
+
+ @Override
+ protected boolean hasPrefixMatchSemantics() {
+ return this.prefixMatch;
+ }
+
@Override
public void setValue(String value) {
this.term = value;
@@ -89,43 +108,39 @@ public class FuzzyItem extends TermItem {
}
@Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
- if (!super.equals(obj)) {
- return false;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- FuzzyItem other = (FuzzyItem) obj;
- if (!this.term.equals(other.term)) return false;
- if (this.maxEditDistance != other.maxEditDistance) return false;
- if (this.prefixLength != other.prefixLength) return false;
- return true;
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ if (!super.equals(o)) return false;
+ FuzzyItem fuzzyItem = (FuzzyItem) o;
+ return maxEditDistance == fuzzyItem.maxEditDistance &&
+ prefixLength == fuzzyItem.prefixLength &&
+ prefixMatch == fuzzyItem.prefixMatch &&
+ Objects.equals(term, fuzzyItem.term);
}
@Override
public int hashCode() {
- return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength);
+ return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength, prefixMatch);
}
@Override
protected void appendHeadingString(StringBuilder buffer) {
buffer.append(getName());
- buffer.append("(");
+ buffer.append('(');
buffer.append(this.term);
- buffer.append(",");
+ buffer.append(',');
buffer.append(this.maxEditDistance);
- buffer.append(",");
+ buffer.append(',');
buffer.append(this.prefixLength);
- buffer.append(")");
- buffer.append(" ");
+ buffer.append(',');
+ buffer.append(this.prefixMatch);
+ buffer.append(") ");
}
@Override
protected void encodeThis(ByteBuffer buffer) {
+ // Prefix matching is communicated via term header flags
super.encodeThis(buffer);
putString(getIndexedString(), buffer);
IntegerCompressor.putCompressedPositiveNumber(this.maxEditDistance, buffer);
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
index f43b55424e6..099c546e3f0 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
@@ -161,6 +161,16 @@ public abstract class Item implements Cloneable {
}
/**
+ * Indicates that a query item that does not normally match with prefix semantics
+ * should do so for this particular query item instance.
+ *
+ * False by default; should be overridden by subclasses that want to signal this behavior.
+ */
+ protected boolean hasPrefixMatchSemantics() {
+ return false;
+ }
+
+ /**
* Returns the item creator value.
*
* @deprecated use isFilter(boolean)
@@ -286,6 +296,7 @@ public abstract class Item implements Cloneable {
byte FLAGS_SPECIALTOKEN = 0x02;
byte FLAGS_NOPOSITIONDATA = 0x04;
byte FLAGS_ISFILTER = 0x08;
+ byte FLAGS_PREFIX_MATCH = 0x10;
byte ret = 0;
if (!isRanked()) {
@@ -300,6 +311,9 @@ public abstract class Item implements Cloneable {
if (isFilter()) {
ret |= FLAGS_ISFILTER;
}
+ if (hasPrefixMatchSemantics()) {
+ ret |= FLAGS_PREFIX_MATCH;
+ }
return ret;
}
diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
index c897afe144c..c90612425fa 100644
--- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
+++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
@@ -1150,8 +1150,9 @@ public class SelectParser implements Parser {
Integer maxEditDistance = getIntegerAnnotation(MAX_EDIT_DISTANCE, annotations, FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE);
Integer prefixLength = getIntegerAnnotation(PREFIX_LENGTH, annotations, FuzzyItem.DEFAULT_PREFIX_LENGTH);
+ boolean prefixMatch = getBoolAnnotation(PREFIX, annotations, Boolean.FALSE);
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch);
return leafStyleSettings(getAnnotations(value), fuzzy);
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
index 634163bf0c2..a354006aa9b 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
@@ -551,24 +551,31 @@ public class VespaSerializer {
static String fuzzyAnnotations(FuzzyItem fuzzyItem) {
boolean isMaxEditDistanceSet = fuzzyItem.getMaxEditDistance() != FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE;
boolean isPrefixLengthSet = fuzzyItem.getPrefixLength() != FuzzyItem.DEFAULT_PREFIX_LENGTH;
- boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet;
+ boolean isPrefixMatch = fuzzyItem.isPrefixMatch();
+ boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet || isPrefixMatch;
- StringBuilder builder = new StringBuilder();
- if (anyAnnotationSet) {
- builder.append("{");
+ if (!anyAnnotationSet) {
+ return "";
}
+
+ StringBuilder builder = new StringBuilder();
+ builder.append("{");
if (isMaxEditDistanceSet) {
builder.append(MAX_EDIT_DISTANCE + ":").append(fuzzyItem.getMaxEditDistance());
- }
- if (isMaxEditDistanceSet && isPrefixLengthSet) {
- builder.append(",");
+ if (isPrefixLengthSet || isPrefixMatch) {
+ builder.append(",");
+ }
}
if (isPrefixLengthSet) {
builder.append(PREFIX_LENGTH + ":").append(fuzzyItem.getPrefixLength());
+ if (isPrefixMatch) {
+ builder.append(",");
+ }
}
- if (anyAnnotationSet) {
- builder.append("}");
+ if (isPrefixMatch) {
+ builder.append(PREFIX).append(':').append(fuzzyItem.isPrefixMatch());
}
+ builder.append("}");
return builder.toString();
}
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
index e66cac5766c..fb4ec5ba872 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
@@ -1385,7 +1385,14 @@ public class YqlParser implements Parser {
FuzzyItem.DEFAULT_PREFIX_LENGTH,
PREFIX_LENGTH_DESCRIPTION);
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
+ boolean prefixMatch = getAnnotation(
+ ast,
+ PREFIX,
+ Boolean.class,
+ Boolean.FALSE,
+ "setting for whether to use prefix match of input data");
+
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch);
return leafStyleSettings(ast, fuzzy);
}
diff --git a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
index c4b8c9f2044..027152bfd69 100644
--- a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
@@ -55,14 +55,13 @@ public class ValidateFuzzySearcherTestCase {
searcher = new ValidateFuzzySearcher();
}
- private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength) {
- return "select * from sources * where " + attribute +
- " contains ({maxEditDistance:" + maxEditDistance + ", prefixLength:" + prefixLength +"}" +
- "fuzzy(\"" + query + "\"))";
+ private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength, boolean prefixMatch) {
+ return "select * from sources * where %s contains ({maxEditDistance:%d,prefixLength:%d,prefix:%b}fuzzy(\"%s\"))"
+ .formatted(attribute, maxEditDistance, prefixLength, prefixMatch, query);
}
private String makeQuery(String attribute, String query) {
- return makeQuery(attribute, query, 2, 0);
+ return makeQuery(attribute, query, 2, 0, false);
}
@@ -76,7 +75,7 @@ public class ValidateFuzzySearcherTestCase {
if (validAttributes.contains(attribute)) {
assertNull(r.hits().getError());
} else {
- assertErrMsg("FUZZY(fuzzy,2,0) " + attribute + ":fuzzy field is not a string attribute", r);
+ assertErrMsg("FUZZY(fuzzy,2,0,false) " + attribute + ":fuzzy field is not a string attribute", r);
}
}
}
@@ -85,28 +84,28 @@ public class ValidateFuzzySearcherTestCase {
void testInvalidEmptyStringQuery() {
String q = makeQuery("string_single", "");
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(,2,0) string_single: fuzzy query must be non-empty", r);
+ assertErrMsg("FUZZY(,2,0,false) string_single: fuzzy query must be non-empty", r);
}
@Test
void testInvalidQueryWrongMaxEditDistance() {
- String q = makeQuery("string_single", "fuzzy", -1, 0);
+ String q = makeQuery("string_single", "fuzzy", -1, 0, false);
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,-1,0) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r);
+ assertErrMsg("FUZZY(fuzzy,-1,0,false) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r);
}
@Test
void testInvalidQueryWrongPrefixLength() {
- String q = makeQuery("string_single", "fuzzy", 2, -1);
+ String q = makeQuery("string_single", "fuzzy", 2, -1, true);
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,2,-1) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r);
+ assertErrMsg("FUZZY(fuzzy,2,-1,true) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r);
}
@Test
void testInvalidQueryWrongAttributeName() {
String q = makeQuery("wrong_name", "fuzzy");
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,2,0) wrong_name:fuzzy field is not a string attribute", r);
+ assertErrMsg("FUZZY(fuzzy,2,0,false) wrong_name:fuzzy field is not a string attribute", r);
}
private static void assertErrMsg(String message, Result r) {
diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
index 20ca81234a6..b5e2839c4c0 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
@@ -464,7 +464,12 @@ public class VespaSerializerTestCase {
@Test
void testFuzzyAnnotations() {
+ parseAndConfirm("foo contains ({maxEditDistance:3}fuzzy(\"a\"))");
parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({prefixLength:5,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({maxEditDistance:3,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({prefix:true}fuzzy(\"a\"))");
}
@Test
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index 29a651aabf4..91f5984481a 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -437,23 +437,27 @@ public class YqlParserTestCase {
QueryTree x = parse("select foo from bar where baz contains fuzzy(\"a b\")");
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("baz", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
- assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, ((FuzzyItem) root).getMaxEditDistance());
- assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, ((FuzzyItem) root).getPrefixLength());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("baz", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance());
+ assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength());
+ assertFalse(fuzzy.isPrefixMatch());
}
@Test
void testFuzzyAnnotations() {
QueryTree x = parse(
- "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10}fuzzy(\"a b\"))"
+ "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10, prefix: true}fuzzy(\"a b\"))"
);
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("baz", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
- assertEquals(3, ((FuzzyItem) root).getMaxEditDistance());
- assertEquals(10, ((FuzzyItem) root).getPrefixLength());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("baz", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(3, fuzzy.getMaxEditDistance());
+ assertEquals(10, fuzzy.getPrefixLength());
+ assertTrue(fuzzy.isPrefixMatch());
}
@Test
diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
index f4571f04a5d..f863816dab2 100644
--- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
+++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
@@ -671,8 +671,39 @@ public class SelectTestCase {
QueryTree x = parseWhere("{ \"contains\": [\"description\", { \"fuzzy\": [\"a b\"] }] }");
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("description", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("description", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance());
+ assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength());
+ assertFalse(fuzzy.isPrefixMatch());
+ }
+
+ @Test
+ void fuzzy_with_annotations() {
+ var where = """
+ {
+ "contains": ["description", {
+ "fuzzy": {
+ "children": ["a b"],
+ "attributes": {
+ "maxEditDistance": 3,
+ "prefixLength": 10,
+ "prefix": true
+ }
+ }
+ }]
+ }
+ """;
+ QueryTree x = parseWhere(where);
+ Item root = x.getRoot();
+ assertSame(FuzzyItem.class, root.getClass());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("description", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(3, fuzzy.getMaxEditDistance());
+ assertEquals(10, fuzzy.getPrefixLength());
+ assertTrue(fuzzy.isPrefixMatch());
}
//------------------------------------------------------------------- grouping tests
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 8ba8c62c5ff..55810c02550 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -120,11 +120,12 @@ struct MatchStats {
template <bool collect_matches>
void
-brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- FuzzyMatcher matcher(target, 2, prefix_size, cased);
+ FuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -144,11 +145,12 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
std::string target_copy;
Utf8Writer<std::string> writer(target_copy);
@@ -187,11 +189,12 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -247,20 +250,23 @@ struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> {
updater.commit();
store.freeze_dictionary();
}
- void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, bool prefix_match, const StringVector& exp_matches) {
MatchStats stats;
StringVector brute_force_matches;
StringVector dfa_matches;
StringVector dfa_no_skip_matches;
bool cased = GetParam()._cased;
SCOPED_TRACE(target);
- brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches);
- dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches);
- dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches);
+ brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, brute_force_matches);
+ dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_matches);
+ dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_no_skip_matches);
EXPECT_EQ(exp_matches, brute_force_matches);
EXPECT_EQ(exp_matches, dfa_matches);
EXPECT_EQ(exp_matches, dfa_no_skip_matches);
}
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ expect_prefix_matches(target, prefix_size, false, exp_matches);
+ }
void expect_matches(std::string_view target, const StringVector& exp_matches) {
expect_prefix_matches(target, 0, exp_matches);
}
@@ -284,11 +290,12 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
expect_matches("forcecast", {"forecast"});
}
-TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_lock_length)
{
bool cased = GetParam()._cased;
StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
- "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha",
+ char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
populate_dictionary(words);
expect_prefix_matches("a", 1, {});
expect_prefix_matches("b", 1, {"bob"});
@@ -321,6 +328,25 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
}
}
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_matching) {
+ // We include the empty string to make "everything matches"-checks easier since
+ // the dictionary implicitly returns such an empty string sentinel already.
+ StringVector words = {"", "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest"};
+ populate_dictionary(words);
+ expect_prefix_matches("bo", 0, true, words); // matches everything
+ expect_prefix_matches("bo", 2, true, {"board", "boat", "bob"});
+ expect_prefix_matches("boar", 0, true, {"board", "boat", "bob", "door", "for", "forbid",
+ "force", "ford", "forearm", "forecast", "forest"});
+ expect_prefix_matches("board", 0, true, {"board", "boat", "ford"});
+ expect_prefix_matches("footb", 0, true, {"food", "foot", "football", "foothill", "forbid"});
+ expect_prefix_matches("footba", 0, true, {"foot", "football", "foothill"});
+ expect_prefix_matches("footbal", 0, true, {"football", "foothill"});
+ expect_prefix_matches("football", 0, true, {"football", "foothill"});
+ expect_prefix_matches("footballs", 0, true, {"football"});
+ expect_prefix_matches("z", 1, true, {});
+}
+
void
benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm)
{
@@ -329,9 +355,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti
for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
const auto& entry = dict[i];
if (dfa_algorithm) {
- dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
} else {
- brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
}
}
std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp
index 815ff3bf1fb..fb89f2ef061 100644
--- a/searchlib/src/tests/query/customtypevisitor_test.cpp
+++ b/searchlib/src/tests/query/customtypevisitor_test.cpp
@@ -37,7 +37,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {};
struct MyStringTerm : InitTerm<StringTerm> {};
struct MySubstrTerm : InitTerm<SubstringTerm> {};
struct MySuffixTerm : InitTerm<SuffixTerm> {};
-struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} };
+struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0, false) {} };
struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} };
struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} };
struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} };
diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp
index b0c9c4e570e..bfce382b684 100644
--- a/searchlib/src/tests/query/query_visitor_test.cpp
+++ b/searchlib/src/tests/query/query_visitor_test.cpp
@@ -88,7 +88,7 @@ TEST("requireThatAllNodesCanBeVisited") {
checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25));
checkVisit<TrueQueryNode>(new SimpleTrue());
checkVisit<FalseQueryNode>(new SimpleFalse());
- checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0));
+ checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0, false));
checkVisit<InTerm>(new SimpleInTerm(std::make_unique<StringTermVector>(0), MultiTerm::Type::STRING, "field", 0, Weight(0)));
}
diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp
index be795c8c94b..c70fe5b510b 100644
--- a/searchlib/src/tests/query/querybuilder_test.cpp
+++ b/searchlib/src/tests/query/querybuilder_test.cpp
@@ -117,7 +117,7 @@ Node::UP createQueryTree() {
builder.add_true_node();
builder.add_false_node();
}
- builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1);
+ builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1, false);
}
Node::UP node = builder.build();
ASSERT_TRUE(node.get());
@@ -326,8 +326,8 @@ void checkQueryTreeTypes(Node *node) {
auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]);
EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5]));
- EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance());
- EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength());
+ EXPECT_EQUAL(3u, fuzzy_term->max_edit_distance());
+ EXPECT_EQUAL(1u, fuzzy_term->prefix_lock_length());
}
struct AbstractTypes {
@@ -452,8 +452,9 @@ struct MyTrue : TrueQueryNode {};
struct MyFalse : FalseQueryNode {};
struct MyFuzzyTerm : FuzzyTerm {
MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w,
- uint32_t m, uint32_t p)
- : FuzzyTerm(t, f, i, w, m, p) {
+ uint32_t m, uint32_t p, bool prefix_match)
+ : FuzzyTerm(t, f, i, w, m, p, prefix_match)
+ {
}
};
struct MyInTerm : InTerm {
@@ -645,23 +646,27 @@ TEST("require that All Range Syntaxes Work") {
EXPECT_TRUE(range2 == range_term->getTerm());
}
-TEST("require that fuzzy node can be created") {
- QueryBuilder<SimpleQueryNodeTypes> builder;
- builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1);
- Node::UP node = builder.build();
+TEST("fuzzy node can be created") {
+ for (bool prefix_match : {false, true}) {
+ QueryBuilder<SimpleQueryNodeTypes> builder;
+ builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1, prefix_match);
+ Node::UP node = builder.build();
- string stackDump = StackDumpCreator::create(*node);
- {
- SimpleQueryStackDumpIterator iterator(stackDump);
- Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
- FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get());
- EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance());
- EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength());
- }
- {
- search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump);
- EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance());
- EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength());
+ string stackDump = StackDumpCreator::create(*node);
+ {
+ SimpleQueryStackDumpIterator iterator(stackDump);
+ Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
+ auto *fuzzy_node = as_node<FuzzyTerm>(new_node.get());
+ EXPECT_EQUAL(3u, fuzzy_node->max_edit_distance());
+ EXPECT_EQUAL(1u, fuzzy_node->prefix_lock_length());
+ EXPECT_EQUAL(prefix_match, fuzzy_node->prefix_match());
+ }
+ {
+ search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump);
+ EXPECT_EQUAL(3u, queryTermSimple->fuzzy_max_edit_distance());
+ EXPECT_EQUAL(1u, queryTermSimple->fuzzy_prefix_lock_length());
+ EXPECT_EQUAL(prefix_match, queryTermSimple->fuzzy_prefix_match());
+ }
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index 5f3ab9cd3d8..3ae2bdd3598 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -44,8 +44,12 @@ extract_suffix(std::string_view target, uint32_t prefix_size)
}
-DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type)
- : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size,
+ bool cased, bool prefix_match, LevenshteinDfa::DfaType dfa_type)
+ : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits,
+ (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased),
+ dfa_type, // TODO reorder args
+ (prefix_match ? LevenshteinDfa::Matching::Prefix : LevenshteinDfa::Matching::FullString))),
_successor(),
_prefix(extract_prefix(target, prefix_size, cased)),
_prefix_size(prefix_size),
@@ -54,8 +58,8 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin
_successor = _prefix;
}
-DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased)
- : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table)
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match)
+ : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Table)
{
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 653af602c0d..ee57e9e6583 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -26,8 +26,10 @@ private:
const char* skip_prefix(const char* word) const;
public:
- DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
- DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match,
+ vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+ // Defaults to table-based DFA:
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match);
~DfaFuzzyMatcher();
[[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept {
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index f1a643dc376..c6cb6e4a637 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -49,17 +49,19 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali
? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None)
: vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase);
} else if (isFuzzy()) {
- auto max_edit_dist = term.getFuzzyMaxEditDistance();
+ const auto max_edit_dist = term.fuzzy_max_edit_distance();
_fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(),
max_edit_dist,
- term.getFuzzyPrefixLength(),
- isCased());
+ term.fuzzy_prefix_lock_length(),
+ isCased(),
+ term.fuzzy_prefix_match());
if ((fuzzy_matching_algorithm != FMA::BruteForce) &&
(max_edit_dist > 0 && max_edit_dist <= 2)) {
_dfa_fuzzy_matcher = std::make_unique<DfaFuzzyMatcher>(term.getTerm(),
max_edit_dist,
- term.getFuzzyPrefixLength(),
+ term.fuzzy_prefix_lock_length(),
isCased(),
+ term.fuzzy_prefix_match(),
to_dfa_type(fuzzy_matching_algorithm));
}
} else if (isCased()) {
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h
index 5e3b1dffe3a..a05c68bea46 100644
--- a/searchlib/src/vespa/searchlib/parsequery/parse.h
+++ b/searchlib/src/vespa/searchlib/parsequery/parse.h
@@ -87,6 +87,8 @@ public:
IFLAG_NORANK = 0x00000001, // this term should not be ranked (not exposed to rank framework)
IFLAG_SPECIALTOKEN = 0x00000002,
IFLAG_NOPOSITIONDATA = 0x00000004, // we should not use position data when ranking this term
+ IFLAG_FILTER = 0x00000008, // see GetCreator(flags) below
+ IFLAG_PREFIX_MATCH = 0x00000010
};
/** Extra information on each item (creator id) coded in bit 3 of flags */
diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
index d15e1ca0512..bf6e3abf1ea 100644
--- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
+++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
@@ -113,9 +113,18 @@ public:
uint32_t getUniqueId() const noexcept { return _currUniqueId; }
// Get the flags of the current item.
- bool hasNoRankFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NORANK) != 0; }
- bool hasSpecialTokenFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; }
- bool hasNoPositionDataFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; }
+ [[nodiscard]] bool hasNoRankFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_NORANK) != 0;
+ }
+ [[nodiscard]] bool hasSpecialTokenFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0;
+ }
+ [[nodiscard]] bool hasNoPositionDataFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0;
+ }
+ [[nodiscard]] bool has_prefix_match_semantics() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_PREFIX_MATCH) != 0;
+ }
uint32_t getArity() const noexcept { return _currArity; }
@@ -127,9 +136,9 @@ public:
bool getAllowApproximate() const noexcept { return (_extraIntArg2 != 0); }
uint32_t getExploreAdditionalHits() const noexcept { return _extraIntArg3; }
- // fuzzy match arguments
- uint32_t getFuzzyMaxEditDistance() const noexcept { return _extraIntArg1; }
- uint32_t getFuzzyPrefixLength() const noexcept { return _extraIntArg2; }
+ // fuzzy match arguments (see also: has_prefix_match_semantics() for fuzzy prefix matching)
+ [[nodiscard]] uint32_t fuzzy_max_edit_distance() const noexcept { return _extraIntArg1; }
+ [[nodiscard]] uint32_t fuzzy_prefix_lock_length() const noexcept { return _extraIntArg2; }
std::unique_ptr<query::PredicateQueryTerm> getPredicateQueryTerm();
std::unique_ptr<query::TermVector> get_terms();
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
index 060cd5015b3..09fc443cf0e 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
@@ -248,10 +248,11 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type)
_type(type),
_diversityCutoffStrict(false),
_valid(true),
+ _fuzzy_prefix_match(false),
_term(term_),
_diversityAttribute(),
- _fuzzyMaxEditDistance(2),
- _fuzzyPrefixLength(0)
+ _fuzzy_max_edit_distance(2),
+ _fuzzy_prefix_lock_length(0)
{
if (isFullRange(_term)) {
stringref rest(_term.c_str() + 1, _term.size() - 2);
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index a740afb0340..b0ee7aa54bc 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -44,8 +44,9 @@ public:
size_t getDiversityCutoffGroups() const noexcept { return _diversityCutoffGroups; }
bool getDiversityCutoffStrict() const noexcept { return _diversityCutoffStrict; }
vespalib::stringref getDiversityAttribute() const noexcept { return _diversityAttribute; }
- size_t getFuzzyMaxEditDistance() const noexcept { return _fuzzyMaxEditDistance; }
- size_t getFuzzyPrefixLength() const noexcept { return _fuzzyPrefixLength; }
+ [[nodiscard]] size_t fuzzy_max_edit_distance() const noexcept { return _fuzzy_max_edit_distance; }
+ [[nodiscard]] size_t fuzzy_prefix_lock_length() const noexcept { return _fuzzy_prefix_lock_length; }
+ [[nodiscard]] bool fuzzy_prefix_match() const noexcept { return _fuzzy_prefix_match; }
bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept;
bool getAsFloatTerm(double & lower, double & upper) const noexcept;
bool getAsFloatTerm(float & lower, float & upper) const noexcept;
@@ -77,14 +78,17 @@ private:
Type _type;
bool _diversityCutoffStrict;
bool _valid;
+protected:
+ bool _fuzzy_prefix_match; // set in QueryTerm
+private:
string _term;
stringref _diversityAttribute;
template <typename T, typename D>
bool getAsNumericTerm(T & lower, T & upper, D d) const noexcept;
protected:
- uint32_t _fuzzyMaxEditDistance; // set in QueryTerm
- uint32_t _fuzzyPrefixLength;
+ uint32_t _fuzzy_max_edit_distance; // set in QueryTerm
+ uint32_t _fuzzy_prefix_lock_length;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
index f33fe44369a..8a4ec184de8 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
@@ -13,20 +13,21 @@ constexpr bool normalizing_implies_cased(Normalizing norm) noexcept {
FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
const string& index, Type type, Normalizing normalizing,
- uint8_t max_edits, uint32_t prefix_size)
+ uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match)
: QueryTerm(std::move(result_base), term, index, type, normalizing),
_dfa_matcher(),
_fallback_matcher()
{
- setFuzzyMaxEditDistance(max_edits);
- setFuzzyPrefixLength(prefix_size);
+ set_fuzzy_max_edit_distance(max_edits);
+ set_fuzzy_prefix_lock_length(prefix_lock_length);
+ set_fuzzy_prefix_match(prefix_match);
std::string_view term_view(term.data(), term.size());
const bool cased = normalizing_implies_cased(normalizing);
if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) {
- _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match);
} else {
- _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match);
}
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
index c6c88b18969..552a839cb3b 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
@@ -23,7 +23,7 @@ class FuzzyTerm : public QueryTerm {
public:
FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
const string& index, Type type, Normalizing normalizing,
- uint8_t max_edits, uint32_t prefix_size);
+ uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match);
~FuzzyTerm() override;
[[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; }
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 94a479fd2d3..68e276d0fde 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -130,7 +130,8 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
} else if (sTerm == TermType::FUZZYTERM) {
qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode,
- queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength());
+ queryRep.fuzzy_max_edit_distance(), queryRep.fuzzy_prefix_lock_length(),
+ queryRep.has_prefix_match_semantics());
} else [[likely]] {
qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 05b12804d52..0e0ab98295b 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -100,8 +100,9 @@ public:
void visitMembers(vespalib::ObjectVisitor &visitor) const override;
void setIndex(const string & index_) override { _index = index_; }
const string & getIndex() const override { return _index; }
- void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; }
- void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
+ void set_fuzzy_max_edit_distance(uint32_t fuzzy_max_edit_distance) noexcept { _fuzzy_max_edit_distance = fuzzy_max_edit_distance; }
+ void set_fuzzy_prefix_lock_length(uint32_t fuzzy_prefix_length) noexcept { _fuzzy_prefix_lock_length = fuzzy_prefix_length; }
+ void set_fuzzy_prefix_match(bool prefix_match) noexcept { _fuzzy_prefix_match = prefix_match; }
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
virtual const MultiTerm* as_multi_term() const noexcept;
diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
index 41990af6908..a498076bb09 100644
--- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
+++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
@@ -226,8 +226,8 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st
template <class NodeTypes>
typename NodeTypes::FuzzyTerm *
createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength) {
- return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
+ uint32_t max_edit_distance, uint32_t prefix_lock_length, bool prefix_match) {
+ return new typename NodeTypes::FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match);
}
template <class NodeTypes>
@@ -343,9 +343,10 @@ public:
return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight));
}
typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength) {
+ uint32_t max_edit_distance, uint32_t prefix_lock_length,
+ bool prefix_match) {
adjustWeight(weight);
- return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength));
+ return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match));
}
typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name,
int32_t id, Weight weight, uint32_t target_num_hits,
diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
index f578bb57e21..116e677d439 100644
--- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
@@ -201,7 +201,8 @@ private:
replicate(node, _builder.addFuzzyTerm(
node.getTerm(), node.getView(),
node.getId(), node.getWeight(),
- node.getMaxEditDistance(), node.getPrefixLength()));
+ node.max_edit_distance(), node.prefix_lock_length(),
+ node.prefix_match()));
}
std::unique_ptr<TermVector> replicate_subterms(const InTerm& original) {
diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
index 535402bf14d..34bc39ae504 100644
--- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h
+++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
@@ -163,8 +163,10 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm {
struct SimpleFuzzyTerm : FuzzyTerm {
SimpleFuzzyTerm(const Type &term, vespalib::stringref view,
int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength)
- : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) {
+ uint32_t max_edit_distance, uint32_t prefix_lock_length,
+ bool prefix_match)
+ : FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match)
+ {
}
~SimpleFuzzyTerm() override;
};
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
index 6707712ee69..b71a86a214a 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
@@ -225,6 +225,9 @@ class QueryNodeConverter : public QueryVisitor {
if (!node.usePositionData()) {
flags |= ParseItem::IFLAG_NOPOSITIONDATA;
}
+ if (node.prefix_match()) {
+ flags |= ParseItem::IFLAG_PREFIX_MATCH;
+ }
if (flags != 0) {
features |= ParseItem::IF_FLAGS;
}
@@ -289,8 +292,8 @@ class QueryNodeConverter : public QueryVisitor {
void visit(FuzzyTerm &node) override {
createTerm(node, ParseItem::ITEM_FUZZY);
- appendCompressedPositiveNumber(node.getMaxEditDistance());
- appendCompressedPositiveNumber(node.getPrefixLength());
+ appendCompressedPositiveNumber(node.max_edit_distance());
+ appendCompressedPositiveNumber(node.prefix_lock_length());
}
void visit(NearestNeighborTerm &node) override {
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
index 7c743a72567..6f7d7ddd9ad 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
@@ -42,6 +42,9 @@ public:
if (queryStack.hasNoPositionDataFlag()) {
t->setPositionData(false);
}
+ if (queryStack.has_prefix_match_semantics()) [[unlikely]] {
+ t->set_prefix_match(true);
+ }
}
}
if (builder.hasError()) {
@@ -185,9 +188,10 @@ private:
} else if (type == ParseItem::ITEM_REGEXP) {
t = &builder.addRegExpTerm(term, view, id, weight);
} else if (type == ParseItem::ITEM_FUZZY) {
- uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance();
- uint32_t prefixLength = queryStack.getFuzzyPrefixLength();
- t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
+ uint32_t max_edit_distance = queryStack.fuzzy_max_edit_distance();
+ uint32_t prefix_lock_length = queryStack.fuzzy_prefix_lock_length();
+ bool prefix_match = queryStack.has_prefix_match_semantics();
+ t = &builder.addFuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match);
} else if (type == ParseItem::ITEM_STRING_IN) {
t = &builder.add_in_term(queryStack.get_terms(), MultiTerm::Type::STRING, view, id, weight);
} else if (type == ParseItem::ITEM_NUMERIC_IN) {
diff --git a/searchlib/src/vespa/searchlib/query/tree/term.cpp b/searchlib/src/vespa/searchlib/query/tree/term.cpp
index 45ba259a799..35f6ea5c8f5 100644
--- a/searchlib/src/vespa/searchlib/query/tree/term.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/term.cpp
@@ -12,12 +12,14 @@ Term::Term(vespalib::stringref view, int32_t id, Weight weight) :
_id(id),
_weight(weight),
_ranked(true),
- _position_data(true)
+ _position_data(true),
+ _prefix_match(false)
{ }
void Term::setStateFrom(const Term& other) {
setRanked(other.isRanked());
setPositionData(other.usePositionData());
+ set_prefix_match(other.prefix_match());
// too late to copy this state:
assert(_view == other.getView());
assert(_id == other.getId());
diff --git a/searchlib/src/vespa/searchlib/query/tree/term.h b/searchlib/src/vespa/searchlib/query/tree/term.h
index 2f57c3cb06d..678d018d111 100644
--- a/searchlib/src/vespa/searchlib/query/tree/term.h
+++ b/searchlib/src/vespa/searchlib/query/tree/term.h
@@ -18,6 +18,7 @@ class Term
Weight _weight;
bool _ranked;
bool _position_data;
+ bool _prefix_match;
public:
virtual ~Term() = 0;
@@ -25,14 +26,17 @@ public:
void setView(const vespalib::string & view) { _view = view; }
void setRanked(bool ranked) noexcept { _ranked = ranked; }
void setPositionData(bool position_data) noexcept { _position_data = position_data; }
+ // Used for fuzzy prefix matching. Not to be confused with distinct Prefix query term type
+ void set_prefix_match(bool prefix_match) noexcept { _prefix_match = prefix_match; }
void setStateFrom(const Term& other);
const vespalib::string & getView() const noexcept { return _view; }
Weight getWeight() const noexcept { return _weight; }
int32_t getId() const noexcept { return _id; }
- bool isRanked() const noexcept { return _ranked; }
- bool usePositionData() const noexcept { return _position_data; }
+ [[nodiscard]] bool isRanked() const noexcept { return _ranked; }
+ [[nodiscard]] bool usePositionData() const noexcept { return _position_data; }
+ [[nodiscard]] bool prefix_match() const noexcept { return _prefix_match; }
static bool isPossibleRangeTerm(vespalib::stringref term) noexcept {
return (term[0] == '[' || term[0] == '<' || term[0] == '>');
diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
index 97e659dcc77..42dc0717368 100644
--- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h
+++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
@@ -119,19 +119,22 @@ public:
//-----------------------------------------------------------------------------
class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> {
-private:
- uint32_t _maxEditDistance;
- uint32_t _prefixLength;
+ uint32_t _max_edit_distance;
+ uint32_t _prefix_lock_length;
+ // Prefix match mode is stored in parent Term
public:
FuzzyTerm(const Type &term, vespalib::stringref view,
- int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength)
- : QueryNodeMixinType(term, view, id, weight),
- _maxEditDistance(maxEditDistance),
- _prefixLength(prefixLength)
- {}
+ int32_t id, Weight weight, uint32_t max_edit_distance,
+ uint32_t prefix_lock_length, bool prefix_match)
+ : QueryNodeMixinType(term, view, id, weight),
+ _max_edit_distance(max_edit_distance),
+ _prefix_lock_length(prefix_lock_length)
+ {
+ set_prefix_match(prefix_match);
+ }
- uint32_t getMaxEditDistance() const { return _maxEditDistance; }
- uint32_t getPrefixLength() const { return _prefixLength; }
+ [[nodiscard]] uint32_t max_edit_distance() const { return _max_edit_distance; }
+ [[nodiscard]] uint32_t prefix_lock_length() const { return _prefix_lock_length; }
virtual ~FuzzyTerm() = 0;
};
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index ce9636895b4..84c3a542661 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -75,31 +75,47 @@ std::string_view maybe_consume_into(std::string_view str, T& val_out) {
return str.substr(ptr - str.data());
}
-// Parse optional max edits and prefix lock length from term string.
+// Parse optional prefix match mode, max edits and prefix lock length from term string.
// Syntax:
-// "term" -> {2, 0, "term"} (default max edits & prefix length)
-// "{1}term" -> {1, 0, "term"}
-// "{1,3}term" -> {1, 3, "term"}
+// "term" -> {2, 0, false, "term"} (default max edits, prefix length and prefix match mode)
+// "{p}term" -> {2, 0, true, "term"}
+// "{1}term" -> {1, 0, false, "term"}
+// "{p1}term" -> {1, 0, true, "term"}
+// "{1,3}term" -> {1, 3, false, "term"}
+// "{p1,3}term" -> {1, 3, true, "term"}
+// .. and so on
//
// Note: this is not a "proper" parser (it accepts empty numeric values); only for testing!
-std::tuple<uint8_t, uint32_t, std::string_view> parse_fuzzy_params(std::string_view term) {
+std::tuple<uint8_t, uint32_t, bool, std::string_view> parse_fuzzy_params(std::string_view term) {
+ std::string_view orig_term = term;
if (term.empty() || term[0] != '{') {
- return {2, 0, term};
+ return {2, 0, false, term};
}
+ term = term.substr(1); // skip '{'
uint8_t max_edits = 2;
uint32_t prefix_length = 0;
- term = maybe_consume_into(term.substr(1), max_edits);
+ bool prefix_match = false;
+ if (!term.empty() && term[0] == 'p') {
+ prefix_match = true;
+ term = term.substr(1); // skip 'p'
+ }
+ if (!term.empty() && term[0] == '}') {
+ return {2, 0, prefix_match, term.substr(1)};
+ }
+ term = maybe_consume_into(term, max_edits);
if (term.empty() || (term[0] != ',' && term[0] != '}')) {
- throw std::invalid_argument("malformed fuzzy params at (or after) max_edits");
+ throw std::invalid_argument("malformed fuzzy params at (or after) max_edits: " +
+ std::string(term) + " in string " + std::string(orig_term));
}
if (term[0] == '}') {
- return {max_edits, prefix_length, term.substr(1)};
+ return {max_edits, prefix_length, prefix_match, term.substr(1)};
}
term = maybe_consume_into(term.substr(1), prefix_length);
if (term.empty() || term[0] != '}') {
- throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length");
+ throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length: " +
+ std::string(term) + " in string " + std::string(orig_term));
}
- return {max_edits, prefix_length, term.substr(1)};
+ return {max_edits, prefix_length, prefix_match, term.substr(1)};
}
}
@@ -115,9 +131,10 @@ private:
if (pt.second == TermType::REGEXP) {
qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, TermType::REGEXP, normalizing));
} else if (pt.second == TermType::FUZZYTERM) {
- auto [max_edits, prefix_length, actual_term] = parse_fuzzy_params(pt.first);
+ auto [max_edits, prefix_lock_length, prefix_match, actual_term] = parse_fuzzy_params(pt.first);
qtv.push_back(std::make_unique<FuzzyTerm>(eqnr.create(), vespalib::stringref(actual_term.data(), actual_term.size()),
- effective_index, TermType::FUZZYTERM, normalizing, max_edits, prefix_length));
+ effective_index, TermType::FUZZYTERM, normalizing, max_edits,
+ prefix_lock_length, prefix_match));
} else {
qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
}
@@ -531,25 +548,29 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
return true;
}
-TEST("parsing of test-only fuzzy term params can extract numeric values") {
+void check_fuzzy_param_parsing(std::string_view term, std::string_view exp_term,
+ uint8_t exp_max_edits, uint32_t exp_prefix_length, bool exp_prefix)
+{
uint8_t max_edits = 0;
- uint32_t prefix_length = 1234;
+ uint32_t prefix_length = 0;
+ bool prefix = false;
std::string_view out;
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("myterm");
- EXPECT_EQUAL(max_edits, 2u);
- EXPECT_EQUAL(prefix_length, 0u);
- EXPECT_EQUAL(out, "myterm");
+ std::tie(max_edits, prefix_length, prefix, out) = parse_fuzzy_params(term);
+ EXPECT_EQUAL(static_cast<uint32_t>(max_edits), static_cast<uint32_t>(exp_max_edits)); // don't print as char...
+ EXPECT_EQUAL(prefix_length, exp_prefix_length);
+ EXPECT_EQUAL(prefix, exp_prefix);
+ EXPECT_EQUAL(out, exp_term);
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{3}myterm");
- EXPECT_EQUAL(max_edits, 3u);
- EXPECT_EQUAL(prefix_length, 0u);
- EXPECT_EQUAL(out, "myterm");
+}
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{2,70}myterm");
- EXPECT_EQUAL(max_edits, 2u);
- EXPECT_EQUAL(prefix_length, 70u);
- EXPECT_EQUAL(out, "myterm");
+TEST("parsing of test-only fuzzy term params can extract expected values") {
+ check_fuzzy_param_parsing("myterm", "myterm", 2, 0, false);
+ check_fuzzy_param_parsing("{3}myterm", "myterm", 3, 0, false);
+ check_fuzzy_param_parsing("{p}myterm", "myterm", 2, 0, true);
+ check_fuzzy_param_parsing("{p1}myterm", "myterm", 1, 0, true);
+ check_fuzzy_param_parsing("{2,70}myterm", "myterm", 2, 70, false);
+ check_fuzzy_param_parsing("{p2,70}myterm", "myterm", 2, 70, true);
}
TEST("verify correct term parsing") {
@@ -640,6 +661,14 @@ TEST("utf8 substring search with empty term")
assertFieldInfo(fs, "", "abc", QTFieldInfo().setFieldLength(0));
}
+Hits is_hit() {
+ return Hits().add({0, 0});
+}
+
+Hits no_hits() {
+ return {};
+}
+
TEST("utf8 suffix search") {
UTF8SuffixStringFieldSearcher fs(0);
std::string field = "operators and operator overloading";
@@ -837,6 +866,37 @@ TEST("utf8 flexible searcher fuzzy matching treats field as 1 word") {
TEST_DO(assertFieldInfo(fs, "%{1}foo", "foo bar baz", QTFieldInfo(0, 0, 1)));
}
+TEST("utf8 flexible searcher supports fuzzy prefix matching") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "%{p0}z", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}zo", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}zo", "Zoid", is_hit())); // uncased
+ TEST_DO(assertString(fs, "%{p0}Zo", "zoid", is_hit())); // uncased
+ TEST_DO(assertString(fs, "%{p0}zoid", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}x", "zoid", no_hits()));
+ TEST_DO(assertString(fs, "%{p1}zo", "boid", is_hit()));
+ TEST_DO(assertString(fs, "%{p1}zo", "blid", no_hits()));
+ TEST_DO(assertString(fs, "%{p1}yam", "hamburger", is_hit()));
+ TEST_DO(assertString(fs, "%{p1}yam", "humbug", no_hits()));
+ TEST_DO(assertString(fs, "%{p2}yam", "humbug", is_hit()));
+ TEST_DO(assertString(fs, "%{p2}catfo", "dogfood", no_hits()));
+ TEST_DO(assertString(fs, "%{p3}catfo", "dogfood", is_hit()));
+ TEST_DO(assertString(fs, "%{p100}abcd", "anything you want", is_hit())); // trivially matches
+}
+
+TEST("utf8 flexible searcher supports fuzzy prefix matching combined with prefix locking") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "%{p0,4}zoid", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0,4}zoidber", "zoidberg", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburg", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidblurgh", no_hits()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidbe", "zoidblurgh", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidberg", "boidberg", no_hits()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburger", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidbananas", no_hits()));
+ TEST_DO(assertString(fs, "%{p2,4}zoidber", "zoidbananas", is_hit()));
+}
+
TEST("bool search") {
BoolFieldSearcher fs(0);
TEST_DO(assertBool(fs, "true", true, true));
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index d94120e5bcf..9e4550db073 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy("abc", 2, 0, false);
+ FuzzyMatcher fuzzy("abc", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -42,15 +42,15 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
}
TEST(FuzzyMatcherTest, fuzzy_match_cased) {
- FuzzyMatcher fuzzy("abc", 2, 0, true);
+ FuzzyMatcher fuzzy("abc", 2, 0, true, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("abC"));
EXPECT_TRUE(fuzzy.isMatch("aBC"));
EXPECT_FALSE(fuzzy.isMatch("ABC"));
}
-TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy("abcdef", 2, 2, false);
+TEST(FuzzyMatcherTest, fuzzy_match_with_prefix_locking) {
+ FuzzyMatcher fuzzy("abcdef", 2, 2, false, false);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
@@ -59,22 +59,43 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
EXPECT_FALSE(fuzzy.isMatch("12cdef"));
}
-TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy("whatever", 2, 0, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_is_zero) {
+ FuzzyMatcher fuzzy("whatever", 2, 0, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
TEST(FuzzyMatcherTest, term_is_empty) {
- FuzzyMatcher fuzzy("", 2, 0, false);
+ FuzzyMatcher fuzzy("", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch(""));
EXPECT_TRUE(fuzzy.isMatch("a"));
EXPECT_TRUE(fuzzy.isMatch("aa"));
EXPECT_FALSE(fuzzy.isMatch("aaa"));
}
-TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy("abcd", 2, 2, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_non_zero) {
+ FuzzyMatcher fuzzy("abcd", 2, 2, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_without_prefix_lock_length) {
+ FuzzyMatcher fuzzy("abc", 1, 0, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "");
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("abcdefgh"));
+ EXPECT_TRUE(fuzzy.isMatch("ab"));
+ EXPECT_TRUE(fuzzy.isMatch("abd"));
+ EXPECT_TRUE(fuzzy.isMatch("xabc"));
+ EXPECT_FALSE(fuzzy.isMatch("xy"));
+}
+
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_with_prefix_lock_length) {
+ FuzzyMatcher fuzzy("zoid", 1, 2, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "zo");
+ EXPECT_TRUE(fuzzy.isMatch("zoidberg"));
+ EXPECT_TRUE(fuzzy.isMatch("zold"));
+ EXPECT_TRUE(fuzzy.isMatch("zoldberg"));
+ EXPECT_FALSE(fuzzy.isMatch("zoxx"));
+ EXPECT_FALSE(fuzzy.isMatch("loid"));
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
index 918cbc624db..61824fa4abf 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
@@ -74,7 +74,11 @@ TEST(LevenshteinDistance, prefix_match_edge_cases) {
EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2});
EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
- // Max edits > 2 cases; not supported by DFA implementation.
+ // Max edits not in {1, 2} cases; not supported by DFA implementation.
+ EXPECT_EQ(prefix_calculate("", "", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abc", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abcde", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "dbc", 0), std::nullopt);
EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3});
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index 71388f43b89..fbfa50aa57f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -29,10 +29,12 @@ vespalib::FuzzyMatcher::FuzzyMatcher():
_folded_term_codepoints_suffix()
{}
-vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased):
+vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size,
+ bool is_cased, bool is_prefix):
_max_edit_distance(max_edit_distance),
_prefix_size(prefix_size),
_is_cased(is_cased),
+ _is_prefix(is_prefix),
_folded_term_codepoints(_is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term)),
_folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
_folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
@@ -73,7 +75,7 @@ bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
return LevenshteinDistance::calculate(
_folded_term_codepoints_suffix,
get_suffix(targetCodepoints, _prefix_size),
- _max_edit_distance).has_value();
+ _max_edit_distance, _is_prefix).has_value();
}
vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index aae6ca1c6e5..2c17283d20a 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -24,6 +24,7 @@ private:
uint32_t _max_edit_distance; // max edit distance
uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
bool _is_cased;
+ bool _is_prefix;
std::vector<uint32_t> _folded_term_codepoints;
@@ -34,7 +35,7 @@ public:
FuzzyMatcher();
FuzzyMatcher(const FuzzyMatcher &) = delete;
FuzzyMatcher & operator = (const FuzzyMatcher &) = delete;
- FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased);
+ FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased, bool is_prefix);
~FuzzyMatcher();
[[nodiscard]] bool isMatch(std::string_view target) const;