aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-04-22 20:15:59 +0200
committerGitHub <noreply@github.com>2024-04-22 20:15:59 +0200
commit796b99170deed76263e24be1af1d5c2762e9366c (patch)
treef3d99e375b6077616f147046d7948f96193a93f3
parent173eaab851d081d5b557bc3743de0d5669936d0a (diff)
parent0afbf14df1ee158167f70016545e799af1e433dc (diff)
Merge pull request #30976 from vespa-engine/vekterli/wire-fuzzy-prefix-match-through-query-stack
Wire fuzzy prefix matching support through the query stack
-rw-r--r--container-search/abi-spec.json5
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java59
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/Item.java14
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/SelectParser.java3
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java25
-rw-r--r--container-search/src/main/java/com/yahoo/search/yql/YqlParser.java9
-rw-r--r--container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java23
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java5
-rw-r--r--container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java22
-rw-r--r--container-search/src/test/java/com/yahoo/select/SelectTestCase.java35
-rw-r--r--searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp54
-rw-r--r--searchlib/src/tests/query/customtypevisitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/query_visitor_test.cpp2
-rw-r--r--searchlib/src/tests/query/querybuilder_test.cpp47
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp12
-rw-r--r--searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/parse.h2
-rw-r--r--searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h21
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/query/query_term_simple.h12
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h5
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/querybuilder.h9
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/queryreplicator.h3
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/simplequery.h6
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h10
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/term.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/term.h8
-rw-r--r--searchlib/src/vespa/searchlib/query/tree/termnodes.h23
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp114
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp39
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h3
38 files changed, 443 insertions, 189 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index 07f0449e61a..5e66e1bb746 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -555,11 +555,15 @@
"public"
],
"methods" : [
+ "public void <init>(java.lang.String, boolean, java.lang.String, int, int, boolean)",
"public void <init>(java.lang.String, boolean, java.lang.String, int, int)",
"public void setMaxEditDistance(int)",
"public void setPrefixLength(int)",
"public int getPrefixLength()",
"public int getMaxEditDistance()",
+ "public boolean isPrefixMatch()",
+ "public void setPrefixMatch(boolean)",
+ "protected boolean hasPrefixMatchSemantics()",
"public void setValue(java.lang.String)",
"public java.lang.String getRawWord()",
"public boolean isWords()",
@@ -820,6 +824,7 @@
"public abstract java.lang.String getName()",
"public void setFilter(boolean)",
"public boolean isFilter()",
+ "protected boolean hasPrefixMatchSemantics()",
"public com.yahoo.prelude.query.Item$ItemCreator getCreator()",
"public void setCreator(com.yahoo.prelude.query.Item$ItemCreator)",
"public void setWeight(int)",
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
index 3cf86a70985..b900dee20ba 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java
@@ -16,15 +16,21 @@ public class FuzzyItem extends TermItem {
private int maxEditDistance;
private int prefixLength;
+ private boolean prefixMatch;
public static int DEFAULT_MAX_EDIT_DISTANCE = 2;
public static int DEFAULT_PREFIX_LENGTH = 0;
- public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) {
+ public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength, boolean prefixMatch) {
super(indexName, isFromQuery, null);
setValue(term);
setMaxEditDistance(maxEditDistance);
setPrefixLength(prefixLength);
+ setPrefixMatch(prefixMatch);
+ }
+
+ public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) {
+ this(indexName, isFromQuery, term, maxEditDistance, prefixLength, false);
}
public void setMaxEditDistance(int maxEditDistance) {
@@ -43,6 +49,19 @@ public class FuzzyItem extends TermItem {
return this.maxEditDistance;
}
+ public boolean isPrefixMatch() {
+ return this.prefixMatch;
+ }
+
+ public void setPrefixMatch(boolean prefixMatch) {
+ this.prefixMatch = prefixMatch;
+ }
+
+ @Override
+ protected boolean hasPrefixMatchSemantics() {
+ return this.prefixMatch;
+ }
+
@Override
public void setValue(String value) {
this.term = value;
@@ -89,43 +108,39 @@ public class FuzzyItem extends TermItem {
}
@Override
- public boolean equals(Object obj) {
- if (this == obj) {
- return true;
- }
- if (!super.equals(obj)) {
- return false;
- }
- if (getClass() != obj.getClass()) {
- return false;
- }
- FuzzyItem other = (FuzzyItem) obj;
- if (!this.term.equals(other.term)) return false;
- if (this.maxEditDistance != other.maxEditDistance) return false;
- if (this.prefixLength != other.prefixLength) return false;
- return true;
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ if (!super.equals(o)) return false;
+ FuzzyItem fuzzyItem = (FuzzyItem) o;
+ return maxEditDistance == fuzzyItem.maxEditDistance &&
+ prefixLength == fuzzyItem.prefixLength &&
+ prefixMatch == fuzzyItem.prefixMatch &&
+ Objects.equals(term, fuzzyItem.term);
}
@Override
public int hashCode() {
- return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength);
+ return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength, prefixMatch);
}
@Override
protected void appendHeadingString(StringBuilder buffer) {
buffer.append(getName());
- buffer.append("(");
+ buffer.append('(');
buffer.append(this.term);
- buffer.append(",");
+ buffer.append(',');
buffer.append(this.maxEditDistance);
- buffer.append(",");
+ buffer.append(',');
buffer.append(this.prefixLength);
- buffer.append(")");
- buffer.append(" ");
+ buffer.append(',');
+ buffer.append(this.prefixMatch);
+ buffer.append(") ");
}
@Override
protected void encodeThis(ByteBuffer buffer) {
+ // Prefix matching is communicated via term header flags
super.encodeThis(buffer);
putString(getIndexedString(), buffer);
IntegerCompressor.putCompressedPositiveNumber(this.maxEditDistance, buffer);
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
index f43b55424e6..099c546e3f0 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
@@ -161,6 +161,16 @@ public abstract class Item implements Cloneable {
}
/**
+ * Indicates that a query item that does not normally match with prefix semantics
+ * should do so for this particular query item instance.
+ *
+ * False by default; should be overridden by subclasses that want to signal this behavior.
+ */
+ protected boolean hasPrefixMatchSemantics() {
+ return false;
+ }
+
+ /**
* Returns the item creator value.
*
* @deprecated use isFilter(boolean)
@@ -286,6 +296,7 @@ public abstract class Item implements Cloneable {
byte FLAGS_SPECIALTOKEN = 0x02;
byte FLAGS_NOPOSITIONDATA = 0x04;
byte FLAGS_ISFILTER = 0x08;
+ byte FLAGS_PREFIX_MATCH = 0x10;
byte ret = 0;
if (!isRanked()) {
@@ -300,6 +311,9 @@ public abstract class Item implements Cloneable {
if (isFilter()) {
ret |= FLAGS_ISFILTER;
}
+ if (hasPrefixMatchSemantics()) {
+ ret |= FLAGS_PREFIX_MATCH;
+ }
return ret;
}
diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
index c897afe144c..c90612425fa 100644
--- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
+++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java
@@ -1150,8 +1150,9 @@ public class SelectParser implements Parser {
Integer maxEditDistance = getIntegerAnnotation(MAX_EDIT_DISTANCE, annotations, FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE);
Integer prefixLength = getIntegerAnnotation(PREFIX_LENGTH, annotations, FuzzyItem.DEFAULT_PREFIX_LENGTH);
+ boolean prefixMatch = getBoolAnnotation(PREFIX, annotations, Boolean.FALSE);
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch);
return leafStyleSettings(getAnnotations(value), fuzzy);
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
index 634163bf0c2..a354006aa9b 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java
@@ -551,24 +551,31 @@ public class VespaSerializer {
static String fuzzyAnnotations(FuzzyItem fuzzyItem) {
boolean isMaxEditDistanceSet = fuzzyItem.getMaxEditDistance() != FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE;
boolean isPrefixLengthSet = fuzzyItem.getPrefixLength() != FuzzyItem.DEFAULT_PREFIX_LENGTH;
- boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet;
+ boolean isPrefixMatch = fuzzyItem.isPrefixMatch();
+ boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet || isPrefixMatch;
- StringBuilder builder = new StringBuilder();
- if (anyAnnotationSet) {
- builder.append("{");
+ if (!anyAnnotationSet) {
+ return "";
}
+
+ StringBuilder builder = new StringBuilder();
+ builder.append("{");
if (isMaxEditDistanceSet) {
builder.append(MAX_EDIT_DISTANCE + ":").append(fuzzyItem.getMaxEditDistance());
- }
- if (isMaxEditDistanceSet && isPrefixLengthSet) {
- builder.append(",");
+ if (isPrefixLengthSet || isPrefixMatch) {
+ builder.append(",");
+ }
}
if (isPrefixLengthSet) {
builder.append(PREFIX_LENGTH + ":").append(fuzzyItem.getPrefixLength());
+ if (isPrefixMatch) {
+ builder.append(",");
+ }
}
- if (anyAnnotationSet) {
- builder.append("}");
+ if (isPrefixMatch) {
+ builder.append(PREFIX).append(':').append(fuzzyItem.isPrefixMatch());
}
+ builder.append("}");
return builder.toString();
}
}
diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
index e66cac5766c..fb4ec5ba872 100644
--- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
+++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java
@@ -1385,7 +1385,14 @@ public class YqlParser implements Parser {
FuzzyItem.DEFAULT_PREFIX_LENGTH,
PREFIX_LENGTH_DESCRIPTION);
- FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength);
+ boolean prefixMatch = getAnnotation(
+ ast,
+ PREFIX,
+ Boolean.class,
+ Boolean.FALSE,
+ "setting for whether to use prefix match of input data");
+
+ FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch);
return leafStyleSettings(ast, fuzzy);
}
diff --git a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
index c4b8c9f2044..027152bfd69 100644
--- a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java
@@ -55,14 +55,13 @@ public class ValidateFuzzySearcherTestCase {
searcher = new ValidateFuzzySearcher();
}
- private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength) {
- return "select * from sources * where " + attribute +
- " contains ({maxEditDistance:" + maxEditDistance + ", prefixLength:" + prefixLength +"}" +
- "fuzzy(\"" + query + "\"))";
+ private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength, boolean prefixMatch) {
+ return "select * from sources * where %s contains ({maxEditDistance:%d,prefixLength:%d,prefix:%b}fuzzy(\"%s\"))"
+ .formatted(attribute, maxEditDistance, prefixLength, prefixMatch, query);
}
private String makeQuery(String attribute, String query) {
- return makeQuery(attribute, query, 2, 0);
+ return makeQuery(attribute, query, 2, 0, false);
}
@@ -76,7 +75,7 @@ public class ValidateFuzzySearcherTestCase {
if (validAttributes.contains(attribute)) {
assertNull(r.hits().getError());
} else {
- assertErrMsg("FUZZY(fuzzy,2,0) " + attribute + ":fuzzy field is not a string attribute", r);
+ assertErrMsg("FUZZY(fuzzy,2,0,false) " + attribute + ":fuzzy field is not a string attribute", r);
}
}
}
@@ -85,28 +84,28 @@ public class ValidateFuzzySearcherTestCase {
void testInvalidEmptyStringQuery() {
String q = makeQuery("string_single", "");
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(,2,0) string_single: fuzzy query must be non-empty", r);
+ assertErrMsg("FUZZY(,2,0,false) string_single: fuzzy query must be non-empty", r);
}
@Test
void testInvalidQueryWrongMaxEditDistance() {
- String q = makeQuery("string_single", "fuzzy", -1, 0);
+ String q = makeQuery("string_single", "fuzzy", -1, 0, false);
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,-1,0) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r);
+ assertErrMsg("FUZZY(fuzzy,-1,0,false) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r);
}
@Test
void testInvalidQueryWrongPrefixLength() {
- String q = makeQuery("string_single", "fuzzy", 2, -1);
+ String q = makeQuery("string_single", "fuzzy", 2, -1, true);
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,2,-1) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r);
+ assertErrMsg("FUZZY(fuzzy,2,-1,true) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r);
}
@Test
void testInvalidQueryWrongAttributeName() {
String q = makeQuery("wrong_name", "fuzzy");
Result r = doSearch(searcher, q);
- assertErrMsg("FUZZY(fuzzy,2,0) wrong_name:fuzzy field is not a string attribute", r);
+ assertErrMsg("FUZZY(fuzzy,2,0,false) wrong_name:fuzzy field is not a string attribute", r);
}
private static void assertErrMsg(String message, Result r) {
diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
index 20ca81234a6..b5e2839c4c0 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java
@@ -464,7 +464,12 @@ public class VespaSerializerTestCase {
@Test
void testFuzzyAnnotations() {
+ parseAndConfirm("foo contains ({maxEditDistance:3}fuzzy(\"a\"))");
parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({prefixLength:5,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({maxEditDistance:3,prefix:true}fuzzy(\"a\"))");
+ parseAndConfirm("foo contains ({prefix:true}fuzzy(\"a\"))");
}
@Test
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index 29a651aabf4..91f5984481a 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -437,23 +437,27 @@ public class YqlParserTestCase {
QueryTree x = parse("select foo from bar where baz contains fuzzy(\"a b\")");
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("baz", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
- assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, ((FuzzyItem) root).getMaxEditDistance());
- assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, ((FuzzyItem) root).getPrefixLength());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("baz", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance());
+ assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength());
+ assertFalse(fuzzy.isPrefixMatch());
}
@Test
void testFuzzyAnnotations() {
QueryTree x = parse(
- "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10}fuzzy(\"a b\"))"
+ "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10, prefix: true}fuzzy(\"a b\"))"
);
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("baz", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
- assertEquals(3, ((FuzzyItem) root).getMaxEditDistance());
- assertEquals(10, ((FuzzyItem) root).getPrefixLength());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("baz", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(3, fuzzy.getMaxEditDistance());
+ assertEquals(10, fuzzy.getPrefixLength());
+ assertTrue(fuzzy.isPrefixMatch());
}
@Test
diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
index f4571f04a5d..f863816dab2 100644
--- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
+++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java
@@ -671,8 +671,39 @@ public class SelectTestCase {
QueryTree x = parseWhere("{ \"contains\": [\"description\", { \"fuzzy\": [\"a b\"] }] }");
Item root = x.getRoot();
assertSame(FuzzyItem.class, root.getClass());
- assertEquals("description", ((FuzzyItem) root).getIndexName());
- assertEquals("a b", ((FuzzyItem) root).stringValue());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("description", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance());
+ assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength());
+ assertFalse(fuzzy.isPrefixMatch());
+ }
+
+ @Test
+ void fuzzy_with_annotations() {
+ var where = """
+ {
+ "contains": ["description", {
+ "fuzzy": {
+ "children": ["a b"],
+ "attributes": {
+ "maxEditDistance": 3,
+ "prefixLength": 10,
+ "prefix": true
+ }
+ }
+ }]
+ }
+ """;
+ QueryTree x = parseWhere(where);
+ Item root = x.getRoot();
+ assertSame(FuzzyItem.class, root.getClass());
+ var fuzzy = (FuzzyItem) root;
+ assertEquals("description", fuzzy.getIndexName());
+ assertEquals("a b", fuzzy.stringValue());
+ assertEquals(3, fuzzy.getMaxEditDistance());
+ assertEquals(10, fuzzy.getPrefixLength());
+ assertTrue(fuzzy.isPrefixMatch());
}
//------------------------------------------------------------------- grouping tests
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
index 8ba8c62c5ff..55810c02550 100644
--- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
+++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp
@@ -120,11 +120,12 @@ struct MatchStats {
template <bool collect_matches>
void
-brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- FuzzyMatcher matcher(target, 2, prefix_size, cased);
+ FuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -144,11 +145,12 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
Utf8Reader reader(vespalib::stringref(target.data(), target.size()));
std::string target_copy;
Utf8Writer<std::string> writer(target_copy);
@@ -187,11 +189,12 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st
template <bool collect_matches>
void
-dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words)
+dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size,
+ bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words)
{
auto view = store.get_dictionary().get_posting_dictionary().getFrozenView();
vespalib::Timer timer;
- DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit);
+ DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit);
auto itr = view.begin();
size_t matches = 0;
size_t seeks = 0;
@@ -247,20 +250,23 @@ struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> {
updater.commit();
store.freeze_dictionary();
}
- void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, bool prefix_match, const StringVector& exp_matches) {
MatchStats stats;
StringVector brute_force_matches;
StringVector dfa_matches;
StringVector dfa_no_skip_matches;
bool cased = GetParam()._cased;
SCOPED_TRACE(target);
- brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches);
- dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches);
- dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches);
+ brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, brute_force_matches);
+ dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_matches);
+ dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_no_skip_matches);
EXPECT_EQ(exp_matches, brute_force_matches);
EXPECT_EQ(exp_matches, dfa_matches);
EXPECT_EQ(exp_matches, dfa_no_skip_matches);
}
+ void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) {
+ expect_prefix_matches(target, prefix_size, false, exp_matches);
+ }
void expect_matches(std::string_view target, const StringVector& exp_matches) {
expect_prefix_matches(target, 0, exp_matches);
}
@@ -284,11 +290,12 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary)
expect_matches("forcecast", {"forecast"});
}
-TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_lock_length)
{
bool cased = GetParam()._cased;
StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
- "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha",
+ char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")};
populate_dictionary(words);
expect_prefix_matches("a", 1, {});
expect_prefix_matches("b", 1, {"bob"});
@@ -321,6 +328,25 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size)
}
}
+TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_matching) {
+ // We include the empty string to make "everything matches"-checks easier since
+ // the dictionary implicitly returns such an empty string sentinel already.
+ StringVector words = {"", "board", "boat", "bob", "door", "food", "foot", "football", "foothill",
+ "for", "forbid", "force", "ford", "forearm", "forecast", "forest"};
+ populate_dictionary(words);
+ expect_prefix_matches("bo", 0, true, words); // matches everything
+ expect_prefix_matches("bo", 2, true, {"board", "boat", "bob"});
+ expect_prefix_matches("boar", 0, true, {"board", "boat", "bob", "door", "for", "forbid",
+ "force", "ford", "forearm", "forecast", "forest"});
+ expect_prefix_matches("board", 0, true, {"board", "boat", "ford"});
+ expect_prefix_matches("footb", 0, true, {"food", "foot", "football", "foothill", "forbid"});
+ expect_prefix_matches("footba", 0, true, {"foot", "football", "foothill"});
+ expect_prefix_matches("footbal", 0, true, {"football", "foothill"});
+ expect_prefix_matches("football", 0, true, {"football", "foothill"});
+ expect_prefix_matches("footballs", 0, true, {"football"});
+ expect_prefix_matches("z", 1, true, {});
+}
+
void
benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm)
{
@@ -329,9 +355,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti
for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) {
const auto& entry = dict[i];
if (dfa_algorithm) {
- dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
} else {
- brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy);
+ brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy);
}
}
std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl;
diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp
index 815ff3bf1fb..fb89f2ef061 100644
--- a/searchlib/src/tests/query/customtypevisitor_test.cpp
+++ b/searchlib/src/tests/query/customtypevisitor_test.cpp
@@ -37,7 +37,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {};
struct MyStringTerm : InitTerm<StringTerm> {};
struct MySubstrTerm : InitTerm<SubstringTerm> {};
struct MySuffixTerm : InitTerm<SuffixTerm> {};
-struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} };
+struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0, false) {} };
struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} };
struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} };
struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} };
diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp
index b0c9c4e570e..bfce382b684 100644
--- a/searchlib/src/tests/query/query_visitor_test.cpp
+++ b/searchlib/src/tests/query/query_visitor_test.cpp
@@ -88,7 +88,7 @@ TEST("requireThatAllNodesCanBeVisited") {
checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25));
checkVisit<TrueQueryNode>(new SimpleTrue());
checkVisit<FalseQueryNode>(new SimpleFalse());
- checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0));
+ checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0, false));
checkVisit<InTerm>(new SimpleInTerm(std::make_unique<StringTermVector>(0), MultiTerm::Type::STRING, "field", 0, Weight(0)));
}
diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp
index be795c8c94b..c70fe5b510b 100644
--- a/searchlib/src/tests/query/querybuilder_test.cpp
+++ b/searchlib/src/tests/query/querybuilder_test.cpp
@@ -117,7 +117,7 @@ Node::UP createQueryTree() {
builder.add_true_node();
builder.add_false_node();
}
- builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1);
+ builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1, false);
}
Node::UP node = builder.build();
ASSERT_TRUE(node.get());
@@ -326,8 +326,8 @@ void checkQueryTreeTypes(Node *node) {
auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]);
EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5]));
- EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance());
- EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength());
+ EXPECT_EQUAL(3u, fuzzy_term->max_edit_distance());
+ EXPECT_EQUAL(1u, fuzzy_term->prefix_lock_length());
}
struct AbstractTypes {
@@ -452,8 +452,9 @@ struct MyTrue : TrueQueryNode {};
struct MyFalse : FalseQueryNode {};
struct MyFuzzyTerm : FuzzyTerm {
MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w,
- uint32_t m, uint32_t p)
- : FuzzyTerm(t, f, i, w, m, p) {
+ uint32_t m, uint32_t p, bool prefix_match)
+ : FuzzyTerm(t, f, i, w, m, p, prefix_match)
+ {
}
};
struct MyInTerm : InTerm {
@@ -645,23 +646,27 @@ TEST("require that All Range Syntaxes Work") {
EXPECT_TRUE(range2 == range_term->getTerm());
}
-TEST("require that fuzzy node can be created") {
- QueryBuilder<SimpleQueryNodeTypes> builder;
- builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1);
- Node::UP node = builder.build();
+TEST("fuzzy node can be created") {
+ for (bool prefix_match : {false, true}) {
+ QueryBuilder<SimpleQueryNodeTypes> builder;
+ builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1, prefix_match);
+ Node::UP node = builder.build();
- string stackDump = StackDumpCreator::create(*node);
- {
- SimpleQueryStackDumpIterator iterator(stackDump);
- Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
- FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get());
- EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance());
- EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength());
- }
- {
- search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump);
- EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance());
- EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength());
+ string stackDump = StackDumpCreator::create(*node);
+ {
+ SimpleQueryStackDumpIterator iterator(stackDump);
+ Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator);
+ auto *fuzzy_node = as_node<FuzzyTerm>(new_node.get());
+ EXPECT_EQUAL(3u, fuzzy_node->max_edit_distance());
+ EXPECT_EQUAL(1u, fuzzy_node->prefix_lock_length());
+ EXPECT_EQUAL(prefix_match, fuzzy_node->prefix_match());
+ }
+ {
+ search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump);
+ EXPECT_EQUAL(3u, queryTermSimple->fuzzy_max_edit_distance());
+ EXPECT_EQUAL(1u, queryTermSimple->fuzzy_prefix_lock_length());
+ EXPECT_EQUAL(prefix_match, queryTermSimple->fuzzy_prefix_match());
+ }
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
index 5f3ab9cd3d8..3ae2bdd3598 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp
@@ -44,8 +44,12 @@ extract_suffix(std::string_view target, uint32_t prefix_size)
}
-DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type)
- : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)),
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size,
+ bool cased, bool prefix_match, LevenshteinDfa::DfaType dfa_type)
+ : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits,
+ (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased),
+ dfa_type, // TODO reorder args
+ (prefix_match ? LevenshteinDfa::Matching::Prefix : LevenshteinDfa::Matching::FullString))),
_successor(),
_prefix(extract_prefix(target, prefix_size, cased)),
_prefix_size(prefix_size),
@@ -54,8 +58,8 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin
_successor = _prefix;
}
-DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased)
- : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table)
+DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match)
+ : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Table)
{
}
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
index 653af602c0d..ee57e9e6583 100644
--- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
+++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h
@@ -26,8 +26,10 @@ private:
const char* skip_prefix(const char* word) const;
public:
- DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
- DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match,
+ vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type);
+ // Defaults to table-based DFA:
+ DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match);
~DfaFuzzyMatcher();
[[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept {
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index f1a643dc376..c6cb6e4a637 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -49,17 +49,19 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali
? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None)
: vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase);
} else if (isFuzzy()) {
- auto max_edit_dist = term.getFuzzyMaxEditDistance();
+ const auto max_edit_dist = term.fuzzy_max_edit_distance();
_fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(),
max_edit_dist,
- term.getFuzzyPrefixLength(),
- isCased());
+ term.fuzzy_prefix_lock_length(),
+ isCased(),
+ term.fuzzy_prefix_match());
if ((fuzzy_matching_algorithm != FMA::BruteForce) &&
(max_edit_dist > 0 && max_edit_dist <= 2)) {
_dfa_fuzzy_matcher = std::make_unique<DfaFuzzyMatcher>(term.getTerm(),
max_edit_dist,
- term.getFuzzyPrefixLength(),
+ term.fuzzy_prefix_lock_length(),
isCased(),
+ term.fuzzy_prefix_match(),
to_dfa_type(fuzzy_matching_algorithm));
}
} else if (isCased()) {
diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h
index 5e3b1dffe3a..a05c68bea46 100644
--- a/searchlib/src/vespa/searchlib/parsequery/parse.h
+++ b/searchlib/src/vespa/searchlib/parsequery/parse.h
@@ -87,6 +87,8 @@ public:
IFLAG_NORANK = 0x00000001, // this term should not be ranked (not exposed to rank framework)
IFLAG_SPECIALTOKEN = 0x00000002,
IFLAG_NOPOSITIONDATA = 0x00000004, // we should not use position data when ranking this term
+ IFLAG_FILTER = 0x00000008, // see GetCreator(flags) below
+ IFLAG_PREFIX_MATCH = 0x00000010
};
/** Extra information on each item (creator id) coded in bit 3 of flags */
diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
index d15e1ca0512..bf6e3abf1ea 100644
--- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
+++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h
@@ -113,9 +113,18 @@ public:
uint32_t getUniqueId() const noexcept { return _currUniqueId; }
// Get the flags of the current item.
- bool hasNoRankFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NORANK) != 0; }
- bool hasSpecialTokenFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; }
- bool hasNoPositionDataFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; }
+ [[nodiscard]] bool hasNoRankFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_NORANK) != 0;
+ }
+ [[nodiscard]] bool hasSpecialTokenFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0;
+ }
+ [[nodiscard]] bool hasNoPositionDataFlag() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0;
+ }
+ [[nodiscard]] bool has_prefix_match_semantics() const noexcept {
+ return (_currFlags & ParseItem::IFLAG_PREFIX_MATCH) != 0;
+ }
uint32_t getArity() const noexcept { return _currArity; }
@@ -127,9 +136,9 @@ public:
bool getAllowApproximate() const noexcept { return (_extraIntArg2 != 0); }
uint32_t getExploreAdditionalHits() const noexcept { return _extraIntArg3; }
- // fuzzy match arguments
- uint32_t getFuzzyMaxEditDistance() const noexcept { return _extraIntArg1; }
- uint32_t getFuzzyPrefixLength() const noexcept { return _extraIntArg2; }
+ // fuzzy match arguments (see also: has_prefix_match_semantics() for fuzzy prefix matching)
+ [[nodiscard]] uint32_t fuzzy_max_edit_distance() const noexcept { return _extraIntArg1; }
+ [[nodiscard]] uint32_t fuzzy_prefix_lock_length() const noexcept { return _extraIntArg2; }
std::unique_ptr<query::PredicateQueryTerm> getPredicateQueryTerm();
std::unique_ptr<query::TermVector> get_terms();
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
index 060cd5015b3..09fc443cf0e 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp
@@ -248,10 +248,11 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type)
_type(type),
_diversityCutoffStrict(false),
_valid(true),
+ _fuzzy_prefix_match(false),
_term(term_),
_diversityAttribute(),
- _fuzzyMaxEditDistance(2),
- _fuzzyPrefixLength(0)
+ _fuzzy_max_edit_distance(2),
+ _fuzzy_prefix_lock_length(0)
{
if (isFullRange(_term)) {
stringref rest(_term.c_str() + 1, _term.size() - 2);
diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h
index a740afb0340..b0ee7aa54bc 100644
--- a/searchlib/src/vespa/searchlib/query/query_term_simple.h
+++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h
@@ -44,8 +44,9 @@ public:
size_t getDiversityCutoffGroups() const noexcept { return _diversityCutoffGroups; }
bool getDiversityCutoffStrict() const noexcept { return _diversityCutoffStrict; }
vespalib::stringref getDiversityAttribute() const noexcept { return _diversityAttribute; }
- size_t getFuzzyMaxEditDistance() const noexcept { return _fuzzyMaxEditDistance; }
- size_t getFuzzyPrefixLength() const noexcept { return _fuzzyPrefixLength; }
+ [[nodiscard]] size_t fuzzy_max_edit_distance() const noexcept { return _fuzzy_max_edit_distance; }
+ [[nodiscard]] size_t fuzzy_prefix_lock_length() const noexcept { return _fuzzy_prefix_lock_length; }
+ [[nodiscard]] bool fuzzy_prefix_match() const noexcept { return _fuzzy_prefix_match; }
bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept;
bool getAsFloatTerm(double & lower, double & upper) const noexcept;
bool getAsFloatTerm(float & lower, float & upper) const noexcept;
@@ -77,14 +78,17 @@ private:
Type _type;
bool _diversityCutoffStrict;
bool _valid;
+protected:
+ bool _fuzzy_prefix_match; // set in QueryTerm
+private:
string _term;
stringref _diversityAttribute;
template <typename T, typename D>
bool getAsNumericTerm(T & lower, T & upper, D d) const noexcept;
protected:
- uint32_t _fuzzyMaxEditDistance; // set in QueryTerm
- uint32_t _fuzzyPrefixLength;
+ uint32_t _fuzzy_max_edit_distance; // set in QueryTerm
+ uint32_t _fuzzy_prefix_lock_length;
};
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
index f33fe44369a..8a4ec184de8 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
@@ -13,20 +13,21 @@ constexpr bool normalizing_implies_cased(Normalizing norm) noexcept {
FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
const string& index, Type type, Normalizing normalizing,
- uint8_t max_edits, uint32_t prefix_size)
+ uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match)
: QueryTerm(std::move(result_base), term, index, type, normalizing),
_dfa_matcher(),
_fallback_matcher()
{
- setFuzzyMaxEditDistance(max_edits);
- setFuzzyPrefixLength(prefix_size);
+ set_fuzzy_max_edit_distance(max_edits);
+ set_fuzzy_prefix_lock_length(prefix_lock_length);
+ set_fuzzy_prefix_match(prefix_match);
std::string_view term_view(term.data(), term.size());
const bool cased = normalizing_implies_cased(normalizing);
if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) {
- _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match);
} else {
- _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match);
}
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
index c6c88b18969..552a839cb3b 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
@@ -23,7 +23,7 @@ class FuzzyTerm : public QueryTerm {
public:
FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
const string& index, Type type, Normalizing normalizing,
- uint8_t max_edits, uint32_t prefix_size);
+ uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match);
~FuzzyTerm() override;
[[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; }
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 94a479fd2d3..68e276d0fde 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -130,7 +130,8 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
} else if (sTerm == TermType::FUZZYTERM) {
qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode,
- queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength());
+ queryRep.fuzzy_max_edit_distance(), queryRep.fuzzy_prefix_lock_length(),
+ queryRep.has_prefix_match_semantics());
} else [[likely]] {
qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 05b12804d52..0e0ab98295b 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -100,8 +100,9 @@ public:
void visitMembers(vespalib::ObjectVisitor &visitor) const override;
void setIndex(const string & index_) override { _index = index_; }
const string & getIndex() const override { return _index; }
- void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; }
- void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
+ void set_fuzzy_max_edit_distance(uint32_t fuzzy_max_edit_distance) noexcept { _fuzzy_max_edit_distance = fuzzy_max_edit_distance; }
+ void set_fuzzy_prefix_lock_length(uint32_t fuzzy_prefix_length) noexcept { _fuzzy_prefix_lock_length = fuzzy_prefix_length; }
+ void set_fuzzy_prefix_match(bool prefix_match) noexcept { _fuzzy_prefix_match = prefix_match; }
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
virtual const MultiTerm* as_multi_term() const noexcept;
diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
index 41990af6908..a498076bb09 100644
--- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
+++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h
@@ -226,8 +226,8 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st
template <class NodeTypes>
typename NodeTypes::FuzzyTerm *
createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength) {
- return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
+ uint32_t max_edit_distance, uint32_t prefix_lock_length, bool prefix_match) {
+ return new typename NodeTypes::FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match);
}
template <class NodeTypes>
@@ -343,9 +343,10 @@ public:
return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight));
}
typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength) {
+ uint32_t max_edit_distance, uint32_t prefix_lock_length,
+ bool prefix_match) {
adjustWeight(weight);
- return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength));
+ return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match));
}
typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name,
int32_t id, Weight weight, uint32_t target_num_hits,
diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
index f578bb57e21..116e677d439 100644
--- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h
@@ -201,7 +201,8 @@ private:
replicate(node, _builder.addFuzzyTerm(
node.getTerm(), node.getView(),
node.getId(), node.getWeight(),
- node.getMaxEditDistance(), node.getPrefixLength()));
+ node.max_edit_distance(), node.prefix_lock_length(),
+ node.prefix_match()));
}
std::unique_ptr<TermVector> replicate_subterms(const InTerm& original) {
diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
index 535402bf14d..34bc39ae504 100644
--- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h
+++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h
@@ -163,8 +163,10 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm {
struct SimpleFuzzyTerm : FuzzyTerm {
SimpleFuzzyTerm(const Type &term, vespalib::stringref view,
int32_t id, Weight weight,
- uint32_t maxEditDistance, uint32_t prefixLength)
- : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) {
+ uint32_t max_edit_distance, uint32_t prefix_lock_length,
+ bool prefix_match)
+ : FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match)
+ {
}
~SimpleFuzzyTerm() override;
};
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
index 6707712ee69..b71a86a214a 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp
@@ -225,6 +225,9 @@ class QueryNodeConverter : public QueryVisitor {
if (!node.usePositionData()) {
flags |= ParseItem::IFLAG_NOPOSITIONDATA;
}
+ if (node.prefix_match()) {
+ flags |= ParseItem::IFLAG_PREFIX_MATCH;
+ }
if (flags != 0) {
features |= ParseItem::IF_FLAGS;
}
@@ -289,8 +292,8 @@ class QueryNodeConverter : public QueryVisitor {
void visit(FuzzyTerm &node) override {
createTerm(node, ParseItem::ITEM_FUZZY);
- appendCompressedPositiveNumber(node.getMaxEditDistance());
- appendCompressedPositiveNumber(node.getPrefixLength());
+ appendCompressedPositiveNumber(node.max_edit_distance());
+ appendCompressedPositiveNumber(node.prefix_lock_length());
}
void visit(NearestNeighborTerm &node) override {
diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
index 7c743a72567..6f7d7ddd9ad 100644
--- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
+++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h
@@ -42,6 +42,9 @@ public:
if (queryStack.hasNoPositionDataFlag()) {
t->setPositionData(false);
}
+ if (queryStack.has_prefix_match_semantics()) [[unlikely]] {
+ t->set_prefix_match(true);
+ }
}
}
if (builder.hasError()) {
@@ -185,9 +188,10 @@ private:
} else if (type == ParseItem::ITEM_REGEXP) {
t = &builder.addRegExpTerm(term, view, id, weight);
} else if (type == ParseItem::ITEM_FUZZY) {
- uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance();
- uint32_t prefixLength = queryStack.getFuzzyPrefixLength();
- t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength);
+ uint32_t max_edit_distance = queryStack.fuzzy_max_edit_distance();
+ uint32_t prefix_lock_length = queryStack.fuzzy_prefix_lock_length();
+ bool prefix_match = queryStack.has_prefix_match_semantics();
+ t = &builder.addFuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match);
} else if (type == ParseItem::ITEM_STRING_IN) {
t = &builder.add_in_term(queryStack.get_terms(), MultiTerm::Type::STRING, view, id, weight);
} else if (type == ParseItem::ITEM_NUMERIC_IN) {
diff --git a/searchlib/src/vespa/searchlib/query/tree/term.cpp b/searchlib/src/vespa/searchlib/query/tree/term.cpp
index 45ba259a799..35f6ea5c8f5 100644
--- a/searchlib/src/vespa/searchlib/query/tree/term.cpp
+++ b/searchlib/src/vespa/searchlib/query/tree/term.cpp
@@ -12,12 +12,14 @@ Term::Term(vespalib::stringref view, int32_t id, Weight weight) :
_id(id),
_weight(weight),
_ranked(true),
- _position_data(true)
+ _position_data(true),
+ _prefix_match(false)
{ }
void Term::setStateFrom(const Term& other) {
setRanked(other.isRanked());
setPositionData(other.usePositionData());
+ set_prefix_match(other.prefix_match());
// too late to copy this state:
assert(_view == other.getView());
assert(_id == other.getId());
diff --git a/searchlib/src/vespa/searchlib/query/tree/term.h b/searchlib/src/vespa/searchlib/query/tree/term.h
index 2f57c3cb06d..678d018d111 100644
--- a/searchlib/src/vespa/searchlib/query/tree/term.h
+++ b/searchlib/src/vespa/searchlib/query/tree/term.h
@@ -18,6 +18,7 @@ class Term
Weight _weight;
bool _ranked;
bool _position_data;
+ bool _prefix_match;
public:
virtual ~Term() = 0;
@@ -25,14 +26,17 @@ public:
void setView(const vespalib::string & view) { _view = view; }
void setRanked(bool ranked) noexcept { _ranked = ranked; }
void setPositionData(bool position_data) noexcept { _position_data = position_data; }
+ // Used for fuzzy prefix matching. Not to be confused with distinct Prefix query term type
+ void set_prefix_match(bool prefix_match) noexcept { _prefix_match = prefix_match; }
void setStateFrom(const Term& other);
const vespalib::string & getView() const noexcept { return _view; }
Weight getWeight() const noexcept { return _weight; }
int32_t getId() const noexcept { return _id; }
- bool isRanked() const noexcept { return _ranked; }
- bool usePositionData() const noexcept { return _position_data; }
+ [[nodiscard]] bool isRanked() const noexcept { return _ranked; }
+ [[nodiscard]] bool usePositionData() const noexcept { return _position_data; }
+ [[nodiscard]] bool prefix_match() const noexcept { return _prefix_match; }
static bool isPossibleRangeTerm(vespalib::stringref term) noexcept {
return (term[0] == '[' || term[0] == '<' || term[0] == '>');
diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
index 97e659dcc77..42dc0717368 100644
--- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h
+++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h
@@ -119,19 +119,22 @@ public:
//-----------------------------------------------------------------------------
class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> {
-private:
- uint32_t _maxEditDistance;
- uint32_t _prefixLength;
+ uint32_t _max_edit_distance;
+ uint32_t _prefix_lock_length;
+ // Prefix match mode is stored in parent Term
public:
FuzzyTerm(const Type &term, vespalib::stringref view,
- int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength)
- : QueryNodeMixinType(term, view, id, weight),
- _maxEditDistance(maxEditDistance),
- _prefixLength(prefixLength)
- {}
+ int32_t id, Weight weight, uint32_t max_edit_distance,
+ uint32_t prefix_lock_length, bool prefix_match)
+ : QueryNodeMixinType(term, view, id, weight),
+ _max_edit_distance(max_edit_distance),
+ _prefix_lock_length(prefix_lock_length)
+ {
+ set_prefix_match(prefix_match);
+ }
- uint32_t getMaxEditDistance() const { return _maxEditDistance; }
- uint32_t getPrefixLength() const { return _prefixLength; }
+ [[nodiscard]] uint32_t max_edit_distance() const { return _max_edit_distance; }
+ [[nodiscard]] uint32_t prefix_lock_length() const { return _prefix_lock_length; }
virtual ~FuzzyTerm() = 0;
};
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index ce9636895b4..84c3a542661 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -75,31 +75,47 @@ std::string_view maybe_consume_into(std::string_view str, T& val_out) {
return str.substr(ptr - str.data());
}
-// Parse optional max edits and prefix lock length from term string.
+// Parse optional prefix match mode, max edits and prefix lock length from term string.
// Syntax:
-// "term" -> {2, 0, "term"} (default max edits & prefix length)
-// "{1}term" -> {1, 0, "term"}
-// "{1,3}term" -> {1, 3, "term"}
+// "term" -> {2, 0, false, "term"} (default max edits, prefix length and prefix match mode)
+// "{p}term" -> {2, 0, true, "term"}
+// "{1}term" -> {1, 0, false, "term"}
+// "{p1}term" -> {1, 0, true, "term"}
+// "{1,3}term" -> {1, 3, false, "term"}
+// "{p1,3}term" -> {1, 3, true, "term"}
+// .. and so on
//
// Note: this is not a "proper" parser (it accepts empty numeric values); only for testing!
-std::tuple<uint8_t, uint32_t, std::string_view> parse_fuzzy_params(std::string_view term) {
+std::tuple<uint8_t, uint32_t, bool, std::string_view> parse_fuzzy_params(std::string_view term) {
+ std::string_view orig_term = term;
if (term.empty() || term[0] != '{') {
- return {2, 0, term};
+ return {2, 0, false, term};
}
+ term = term.substr(1); // skip '{'
uint8_t max_edits = 2;
uint32_t prefix_length = 0;
- term = maybe_consume_into(term.substr(1), max_edits);
+ bool prefix_match = false;
+ if (!term.empty() && term[0] == 'p') {
+ prefix_match = true;
+ term = term.substr(1); // skip 'p'
+ }
+ if (!term.empty() && term[0] == '}') {
+ return {2, 0, prefix_match, term.substr(1)};
+ }
+ term = maybe_consume_into(term, max_edits);
if (term.empty() || (term[0] != ',' && term[0] != '}')) {
- throw std::invalid_argument("malformed fuzzy params at (or after) max_edits");
+ throw std::invalid_argument("malformed fuzzy params at (or after) max_edits: " +
+ std::string(term) + " in string " + std::string(orig_term));
}
if (term[0] == '}') {
- return {max_edits, prefix_length, term.substr(1)};
+ return {max_edits, prefix_length, prefix_match, term.substr(1)};
}
term = maybe_consume_into(term.substr(1), prefix_length);
if (term.empty() || term[0] != '}') {
- throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length");
+ throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length: " +
+ std::string(term) + " in string " + std::string(orig_term));
}
- return {max_edits, prefix_length, term.substr(1)};
+ return {max_edits, prefix_length, prefix_match, term.substr(1)};
}
}
@@ -115,9 +131,10 @@ private:
if (pt.second == TermType::REGEXP) {
qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, TermType::REGEXP, normalizing));
} else if (pt.second == TermType::FUZZYTERM) {
- auto [max_edits, prefix_length, actual_term] = parse_fuzzy_params(pt.first);
+ auto [max_edits, prefix_lock_length, prefix_match, actual_term] = parse_fuzzy_params(pt.first);
qtv.push_back(std::make_unique<FuzzyTerm>(eqnr.create(), vespalib::stringref(actual_term.data(), actual_term.size()),
- effective_index, TermType::FUZZYTERM, normalizing, max_edits, prefix_length));
+ effective_index, TermType::FUZZYTERM, normalizing, max_edits,
+ prefix_lock_length, prefix_match));
} else {
qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
}
@@ -531,25 +548,29 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
return true;
}
-TEST("parsing of test-only fuzzy term params can extract numeric values") {
+void check_fuzzy_param_parsing(std::string_view term, std::string_view exp_term,
+ uint8_t exp_max_edits, uint32_t exp_prefix_length, bool exp_prefix)
+{
uint8_t max_edits = 0;
- uint32_t prefix_length = 1234;
+ uint32_t prefix_length = 0;
+ bool prefix = false;
std::string_view out;
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("myterm");
- EXPECT_EQUAL(max_edits, 2u);
- EXPECT_EQUAL(prefix_length, 0u);
- EXPECT_EQUAL(out, "myterm");
+ std::tie(max_edits, prefix_length, prefix, out) = parse_fuzzy_params(term);
+ EXPECT_EQUAL(static_cast<uint32_t>(max_edits), static_cast<uint32_t>(exp_max_edits)); // don't print as char...
+ EXPECT_EQUAL(prefix_length, exp_prefix_length);
+ EXPECT_EQUAL(prefix, exp_prefix);
+ EXPECT_EQUAL(out, exp_term);
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{3}myterm");
- EXPECT_EQUAL(max_edits, 3u);
- EXPECT_EQUAL(prefix_length, 0u);
- EXPECT_EQUAL(out, "myterm");
+}
- std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{2,70}myterm");
- EXPECT_EQUAL(max_edits, 2u);
- EXPECT_EQUAL(prefix_length, 70u);
- EXPECT_EQUAL(out, "myterm");
+TEST("parsing of test-only fuzzy term params can extract expected values") {
+ check_fuzzy_param_parsing("myterm", "myterm", 2, 0, false);
+ check_fuzzy_param_parsing("{3}myterm", "myterm", 3, 0, false);
+ check_fuzzy_param_parsing("{p}myterm", "myterm", 2, 0, true);
+ check_fuzzy_param_parsing("{p1}myterm", "myterm", 1, 0, true);
+ check_fuzzy_param_parsing("{2,70}myterm", "myterm", 2, 70, false);
+ check_fuzzy_param_parsing("{p2,70}myterm", "myterm", 2, 70, true);
}
TEST("verify correct term parsing") {
@@ -640,6 +661,14 @@ TEST("utf8 substring search with empty term")
assertFieldInfo(fs, "", "abc", QTFieldInfo().setFieldLength(0));
}
+Hits is_hit() {
+ return Hits().add({0, 0});
+}
+
+Hits no_hits() {
+ return {};
+}
+
TEST("utf8 suffix search") {
UTF8SuffixStringFieldSearcher fs(0);
std::string field = "operators and operator overloading";
@@ -837,6 +866,37 @@ TEST("utf8 flexible searcher fuzzy matching treats field as 1 word") {
TEST_DO(assertFieldInfo(fs, "%{1}foo", "foo bar baz", QTFieldInfo(0, 0, 1)));
}
+TEST("utf8 flexible searcher supports fuzzy prefix matching") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "%{p0}z", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}zo", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}zo", "Zoid", is_hit())); // uncased
+ TEST_DO(assertString(fs, "%{p0}Zo", "zoid", is_hit())); // uncased
+ TEST_DO(assertString(fs, "%{p0}zoid", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0}x", "zoid", no_hits()));
+ TEST_DO(assertString(fs, "%{p1}zo", "boid", is_hit()));
+ TEST_DO(assertString(fs, "%{p1}zo", "blid", no_hits()));
+ TEST_DO(assertString(fs, "%{p1}yam", "hamburger", is_hit()));
+ TEST_DO(assertString(fs, "%{p1}yam", "humbug", no_hits()));
+ TEST_DO(assertString(fs, "%{p2}yam", "humbug", is_hit()));
+ TEST_DO(assertString(fs, "%{p2}catfo", "dogfood", no_hits()));
+ TEST_DO(assertString(fs, "%{p3}catfo", "dogfood", is_hit()));
+ TEST_DO(assertString(fs, "%{p100}abcd", "anything you want", is_hit())); // trivially matches
+}
+
+TEST("utf8 flexible searcher supports fuzzy prefix matching combined with prefix locking") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "%{p0,4}zoid", "zoid", is_hit()));
+ TEST_DO(assertString(fs, "%{p0,4}zoidber", "zoidberg", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburg", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidblurgh", no_hits()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidbe", "zoidblurgh", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidberg", "boidberg", no_hits()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburger", is_hit()));
+ TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidbananas", no_hits()));
+ TEST_DO(assertString(fs, "%{p2,4}zoidber", "zoidbananas", is_hit()));
+}
+
TEST("bool search") {
BoolFieldSearcher fs(0);
TEST_DO(assertBool(fs, "true", true, true));
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index d94120e5bcf..9e4550db073 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy("abc", 2, 0, false);
+ FuzzyMatcher fuzzy("abc", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -42,15 +42,15 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
}
TEST(FuzzyMatcherTest, fuzzy_match_cased) {
- FuzzyMatcher fuzzy("abc", 2, 0, true);
+ FuzzyMatcher fuzzy("abc", 2, 0, true, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("abC"));
EXPECT_TRUE(fuzzy.isMatch("aBC"));
EXPECT_FALSE(fuzzy.isMatch("ABC"));
}
-TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy("abcdef", 2, 2, false);
+TEST(FuzzyMatcherTest, fuzzy_match_with_prefix_locking) {
+ FuzzyMatcher fuzzy("abcdef", 2, 2, false, false);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
@@ -59,22 +59,43 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
EXPECT_FALSE(fuzzy.isMatch("12cdef"));
}
-TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy("whatever", 2, 0, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_is_zero) {
+ FuzzyMatcher fuzzy("whatever", 2, 0, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
TEST(FuzzyMatcherTest, term_is_empty) {
- FuzzyMatcher fuzzy("", 2, 0, false);
+ FuzzyMatcher fuzzy("", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch(""));
EXPECT_TRUE(fuzzy.isMatch("a"));
EXPECT_TRUE(fuzzy.isMatch("aa"));
EXPECT_FALSE(fuzzy.isMatch("aaa"));
}
-TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy("abcd", 2, 2, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_non_zero) {
+ FuzzyMatcher fuzzy("abcd", 2, 2, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_without_prefix_lock_length) {
+ FuzzyMatcher fuzzy("abc", 1, 0, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "");
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("abcdefgh"));
+ EXPECT_TRUE(fuzzy.isMatch("ab"));
+ EXPECT_TRUE(fuzzy.isMatch("abd"));
+ EXPECT_TRUE(fuzzy.isMatch("xabc"));
+ EXPECT_FALSE(fuzzy.isMatch("xy"));
+}
+
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_with_prefix_lock_length) {
+ FuzzyMatcher fuzzy("zoid", 1, 2, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "zo");
+ EXPECT_TRUE(fuzzy.isMatch("zoidberg"));
+ EXPECT_TRUE(fuzzy.isMatch("zold"));
+ EXPECT_TRUE(fuzzy.isMatch("zoldberg"));
+ EXPECT_FALSE(fuzzy.isMatch("zoxx"));
+ EXPECT_FALSE(fuzzy.isMatch("loid"));
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
index 918cbc624db..61824fa4abf 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
@@ -74,7 +74,11 @@ TEST(LevenshteinDistance, prefix_match_edge_cases) {
EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2});
EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
- // Max edits > 2 cases; not supported by DFA implementation.
+ // Max edits not in {1, 2} cases; not supported by DFA implementation.
+ EXPECT_EQ(prefix_calculate("", "", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abc", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abcde", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "dbc", 0), std::nullopt);
EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3});
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index 71388f43b89..fbfa50aa57f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -29,10 +29,12 @@ vespalib::FuzzyMatcher::FuzzyMatcher():
_folded_term_codepoints_suffix()
{}
-vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased):
+vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size,
+ bool is_cased, bool is_prefix):
_max_edit_distance(max_edit_distance),
_prefix_size(prefix_size),
_is_cased(is_cased),
+ _is_prefix(is_prefix),
_folded_term_codepoints(_is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term)),
_folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
_folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
@@ -73,7 +75,7 @@ bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
return LevenshteinDistance::calculate(
_folded_term_codepoints_suffix,
get_suffix(targetCodepoints, _prefix_size),
- _max_edit_distance).has_value();
+ _max_edit_distance, _is_prefix).has_value();
}
vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index aae6ca1c6e5..2c17283d20a 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -24,6 +24,7 @@ private:
uint32_t _max_edit_distance; // max edit distance
uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
bool _is_cased;
+ bool _is_prefix;
std::vector<uint32_t> _folded_term_codepoints;
@@ -34,7 +35,7 @@ public:
FuzzyMatcher();
FuzzyMatcher(const FuzzyMatcher &) = delete;
FuzzyMatcher & operator = (const FuzzyMatcher &) = delete;
- FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased);
+ FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased, bool is_prefix);
~FuzzyMatcher();
[[nodiscard]] bool isMatch(std::string_view target) const;