diff options
38 files changed, 443 insertions, 189 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 07f0449e61a..5e66e1bb746 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -555,11 +555,15 @@ "public" ], "methods" : [ + "public void <init>(java.lang.String, boolean, java.lang.String, int, int, boolean)", "public void <init>(java.lang.String, boolean, java.lang.String, int, int)", "public void setMaxEditDistance(int)", "public void setPrefixLength(int)", "public int getPrefixLength()", "public int getMaxEditDistance()", + "public boolean isPrefixMatch()", + "public void setPrefixMatch(boolean)", + "protected boolean hasPrefixMatchSemantics()", "public void setValue(java.lang.String)", "public java.lang.String getRawWord()", "public boolean isWords()", @@ -820,6 +824,7 @@ "public abstract java.lang.String getName()", "public void setFilter(boolean)", "public boolean isFilter()", + "protected boolean hasPrefixMatchSemantics()", "public com.yahoo.prelude.query.Item$ItemCreator getCreator()", "public void setCreator(com.yahoo.prelude.query.Item$ItemCreator)", "public void setWeight(int)", diff --git a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java index 3cf86a70985..b900dee20ba 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/FuzzyItem.java @@ -16,15 +16,21 @@ public class FuzzyItem extends TermItem { private int maxEditDistance; private int prefixLength; + private boolean prefixMatch; public static int DEFAULT_MAX_EDIT_DISTANCE = 2; public static int DEFAULT_PREFIX_LENGTH = 0; - public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) { + public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength, boolean prefixMatch) { super(indexName, isFromQuery, null); setValue(term); setMaxEditDistance(maxEditDistance); setPrefixLength(prefixLength); + setPrefixMatch(prefixMatch); + } + + public FuzzyItem(String indexName, boolean isFromQuery, String term, int maxEditDistance, int prefixLength) { + this(indexName, isFromQuery, term, maxEditDistance, prefixLength, false); } public void setMaxEditDistance(int maxEditDistance) { @@ -43,6 +49,19 @@ public class FuzzyItem extends TermItem { return this.maxEditDistance; } + public boolean isPrefixMatch() { + return this.prefixMatch; + } + + public void setPrefixMatch(boolean prefixMatch) { + this.prefixMatch = prefixMatch; + } + + @Override + protected boolean hasPrefixMatchSemantics() { + return this.prefixMatch; + } + @Override public void setValue(String value) { this.term = value; @@ -89,43 +108,39 @@ public class FuzzyItem extends TermItem { } @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (!super.equals(obj)) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - FuzzyItem other = (FuzzyItem) obj; - if (!this.term.equals(other.term)) return false; - if (this.maxEditDistance != other.maxEditDistance) return false; - if (this.prefixLength != other.prefixLength) return false; - return true; + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + if (!super.equals(o)) return false; + FuzzyItem fuzzyItem = (FuzzyItem) o; + return maxEditDistance == fuzzyItem.maxEditDistance && + prefixLength == fuzzyItem.prefixLength && + prefixMatch == fuzzyItem.prefixMatch && + Objects.equals(term, fuzzyItem.term); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength); + return Objects.hash(super.hashCode(), term, maxEditDistance, prefixLength, prefixMatch); } @Override protected void appendHeadingString(StringBuilder buffer) { buffer.append(getName()); - buffer.append("("); + buffer.append('('); buffer.append(this.term); - buffer.append(","); + buffer.append(','); buffer.append(this.maxEditDistance); - buffer.append(","); + buffer.append(','); buffer.append(this.prefixLength); - buffer.append(")"); - buffer.append(" "); + buffer.append(','); + buffer.append(this.prefixMatch); + buffer.append(") "); } @Override protected void encodeThis(ByteBuffer buffer) { + // Prefix matching is communicated via term header flags super.encodeThis(buffer); putString(getIndexedString(), buffer); IntegerCompressor.putCompressedPositiveNumber(this.maxEditDistance, buffer); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java index f43b55424e6..099c546e3f0 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java @@ -161,6 +161,16 @@ public abstract class Item implements Cloneable { } /** + * Indicates that a query item that does not normally match with prefix semantics + * should do so for this particular query item instance. + * + * False by default; should be overridden by subclasses that want to signal this behavior. + */ + protected boolean hasPrefixMatchSemantics() { + return false; + } + + /** * Returns the item creator value. * * @deprecated use isFilter(boolean) @@ -286,6 +296,7 @@ public abstract class Item implements Cloneable { byte FLAGS_SPECIALTOKEN = 0x02; byte FLAGS_NOPOSITIONDATA = 0x04; byte FLAGS_ISFILTER = 0x08; + byte FLAGS_PREFIX_MATCH = 0x10; byte ret = 0; if (!isRanked()) { @@ -300,6 +311,9 @@ public abstract class Item implements Cloneable { if (isFilter()) { ret |= FLAGS_ISFILTER; } + if (hasPrefixMatchSemantics()) { + ret |= FLAGS_PREFIX_MATCH; + } return ret; } diff --git a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java index c897afe144c..c90612425fa 100644 --- a/container-search/src/main/java/com/yahoo/search/query/SelectParser.java +++ b/container-search/src/main/java/com/yahoo/search/query/SelectParser.java @@ -1150,8 +1150,9 @@ public class SelectParser implements Parser { Integer maxEditDistance = getIntegerAnnotation(MAX_EDIT_DISTANCE, annotations, FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE); Integer prefixLength = getIntegerAnnotation(PREFIX_LENGTH, annotations, FuzzyItem.DEFAULT_PREFIX_LENGTH); + boolean prefixMatch = getBoolAnnotation(PREFIX, annotations, Boolean.FALSE); - FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength); + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch); return leafStyleSettings(getAnnotations(value), fuzzy); } diff --git a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java index 634163bf0c2..a354006aa9b 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java +++ b/container-search/src/main/java/com/yahoo/search/yql/VespaSerializer.java @@ -551,24 +551,31 @@ public class VespaSerializer { static String fuzzyAnnotations(FuzzyItem fuzzyItem) { boolean isMaxEditDistanceSet = fuzzyItem.getMaxEditDistance() != FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE; boolean isPrefixLengthSet = fuzzyItem.getPrefixLength() != FuzzyItem.DEFAULT_PREFIX_LENGTH; - boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet; + boolean isPrefixMatch = fuzzyItem.isPrefixMatch(); + boolean anyAnnotationSet = isMaxEditDistanceSet || isPrefixLengthSet || isPrefixMatch; - StringBuilder builder = new StringBuilder(); - if (anyAnnotationSet) { - builder.append("{"); + if (!anyAnnotationSet) { + return ""; } + + StringBuilder builder = new StringBuilder(); + builder.append("{"); if (isMaxEditDistanceSet) { builder.append(MAX_EDIT_DISTANCE + ":").append(fuzzyItem.getMaxEditDistance()); - } - if (isMaxEditDistanceSet && isPrefixLengthSet) { - builder.append(","); + if (isPrefixLengthSet || isPrefixMatch) { + builder.append(","); + } } if (isPrefixLengthSet) { builder.append(PREFIX_LENGTH + ":").append(fuzzyItem.getPrefixLength()); + if (isPrefixMatch) { + builder.append(","); + } } - if (anyAnnotationSet) { - builder.append("}"); + if (isPrefixMatch) { + builder.append(PREFIX).append(':').append(fuzzyItem.isPrefixMatch()); } + builder.append("}"); return builder.toString(); } } diff --git a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java index e66cac5766c..fb4ec5ba872 100644 --- a/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java +++ b/container-search/src/main/java/com/yahoo/search/yql/YqlParser.java @@ -1385,7 +1385,14 @@ public class YqlParser implements Parser { FuzzyItem.DEFAULT_PREFIX_LENGTH, PREFIX_LENGTH_DESCRIPTION); - FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength); + boolean prefixMatch = getAnnotation( + ast, + PREFIX, + Boolean.class, + Boolean.FALSE, + "setting for whether to use prefix match of input data"); + + FuzzyItem fuzzy = new FuzzyItem(field, true, wordData, maxEditDistance, prefixLength, prefixMatch); return leafStyleSettings(ast, fuzzy); } diff --git a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java index c4b8c9f2044..027152bfd69 100644 --- a/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/searchers/ValidateFuzzySearcherTestCase.java @@ -55,14 +55,13 @@ public class ValidateFuzzySearcherTestCase { searcher = new ValidateFuzzySearcher(); } - private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength) { - return "select * from sources * where " + attribute + - " contains ({maxEditDistance:" + maxEditDistance + ", prefixLength:" + prefixLength +"}" + - "fuzzy(\"" + query + "\"))"; + private String makeQuery(String attribute, String query, int maxEditDistance, int prefixLength, boolean prefixMatch) { + return "select * from sources * where %s contains ({maxEditDistance:%d,prefixLength:%d,prefix:%b}fuzzy(\"%s\"))" + .formatted(attribute, maxEditDistance, prefixLength, prefixMatch, query); } private String makeQuery(String attribute, String query) { - return makeQuery(attribute, query, 2, 0); + return makeQuery(attribute, query, 2, 0, false); } @@ -76,7 +75,7 @@ public class ValidateFuzzySearcherTestCase { if (validAttributes.contains(attribute)) { assertNull(r.hits().getError()); } else { - assertErrMsg("FUZZY(fuzzy,2,0) " + attribute + ":fuzzy field is not a string attribute", r); + assertErrMsg("FUZZY(fuzzy,2,0,false) " + attribute + ":fuzzy field is not a string attribute", r); } } } @@ -85,28 +84,28 @@ public class ValidateFuzzySearcherTestCase { void testInvalidEmptyStringQuery() { String q = makeQuery("string_single", ""); Result r = doSearch(searcher, q); - assertErrMsg("FUZZY(,2,0) string_single: fuzzy query must be non-empty", r); + assertErrMsg("FUZZY(,2,0,false) string_single: fuzzy query must be non-empty", r); } @Test void testInvalidQueryWrongMaxEditDistance() { - String q = makeQuery("string_single", "fuzzy", -1, 0); + String q = makeQuery("string_single", "fuzzy", -1, 0, false); Result r = doSearch(searcher, q); - assertErrMsg("FUZZY(fuzzy,-1,0) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r); + assertErrMsg("FUZZY(fuzzy,-1,0,false) string_single:fuzzy has invalid maxEditDistance -1: Must be >= 0", r); } @Test void testInvalidQueryWrongPrefixLength() { - String q = makeQuery("string_single", "fuzzy", 2, -1); + String q = makeQuery("string_single", "fuzzy", 2, -1, true); Result r = doSearch(searcher, q); - assertErrMsg("FUZZY(fuzzy,2,-1) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r); + assertErrMsg("FUZZY(fuzzy,2,-1,true) string_single:fuzzy has invalid prefixLength -1: Must be >= 0", r); } @Test void testInvalidQueryWrongAttributeName() { String q = makeQuery("wrong_name", "fuzzy"); Result r = doSearch(searcher, q); - assertErrMsg("FUZZY(fuzzy,2,0) wrong_name:fuzzy field is not a string attribute", r); + assertErrMsg("FUZZY(fuzzy,2,0,false) wrong_name:fuzzy field is not a string attribute", r); } private static void assertErrMsg(String message, Result r) { diff --git a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java index 20ca81234a6..b5e2839c4c0 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/VespaSerializerTestCase.java @@ -464,7 +464,12 @@ public class VespaSerializerTestCase { @Test void testFuzzyAnnotations() { + parseAndConfirm("foo contains ({maxEditDistance:3}fuzzy(\"a\"))"); parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5}fuzzy(\"a\"))"); + parseAndConfirm("foo contains ({maxEditDistance:3,prefixLength:5,prefix:true}fuzzy(\"a\"))"); + parseAndConfirm("foo contains ({prefixLength:5,prefix:true}fuzzy(\"a\"))"); + parseAndConfirm("foo contains ({maxEditDistance:3,prefix:true}fuzzy(\"a\"))"); + parseAndConfirm("foo contains ({prefix:true}fuzzy(\"a\"))"); } @Test diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index 29a651aabf4..91f5984481a 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -437,23 +437,27 @@ public class YqlParserTestCase { QueryTree x = parse("select foo from bar where baz contains fuzzy(\"a b\")"); Item root = x.getRoot(); assertSame(FuzzyItem.class, root.getClass()); - assertEquals("baz", ((FuzzyItem) root).getIndexName()); - assertEquals("a b", ((FuzzyItem) root).stringValue()); - assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, ((FuzzyItem) root).getMaxEditDistance()); - assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, ((FuzzyItem) root).getPrefixLength()); + var fuzzy = (FuzzyItem) root; + assertEquals("baz", fuzzy.getIndexName()); + assertEquals("a b", fuzzy.stringValue()); + assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance()); + assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength()); + assertFalse(fuzzy.isPrefixMatch()); } @Test void testFuzzyAnnotations() { QueryTree x = parse( - "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10}fuzzy(\"a b\"))" + "select foo from bar where baz contains ({maxEditDistance: 3, prefixLength: 10, prefix: true}fuzzy(\"a b\"))" ); Item root = x.getRoot(); assertSame(FuzzyItem.class, root.getClass()); - assertEquals("baz", ((FuzzyItem) root).getIndexName()); - assertEquals("a b", ((FuzzyItem) root).stringValue()); - assertEquals(3, ((FuzzyItem) root).getMaxEditDistance()); - assertEquals(10, ((FuzzyItem) root).getPrefixLength()); + var fuzzy = (FuzzyItem) root; + assertEquals("baz", fuzzy.getIndexName()); + assertEquals("a b", fuzzy.stringValue()); + assertEquals(3, fuzzy.getMaxEditDistance()); + assertEquals(10, fuzzy.getPrefixLength()); + assertTrue(fuzzy.isPrefixMatch()); } @Test diff --git a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java index f4571f04a5d..f863816dab2 100644 --- a/container-search/src/test/java/com/yahoo/select/SelectTestCase.java +++ b/container-search/src/test/java/com/yahoo/select/SelectTestCase.java @@ -671,8 +671,39 @@ public class SelectTestCase { QueryTree x = parseWhere("{ \"contains\": [\"description\", { \"fuzzy\": [\"a b\"] }] }"); Item root = x.getRoot(); assertSame(FuzzyItem.class, root.getClass()); - assertEquals("description", ((FuzzyItem) root).getIndexName()); - assertEquals("a b", ((FuzzyItem) root).stringValue()); + var fuzzy = (FuzzyItem) root; + assertEquals("description", fuzzy.getIndexName()); + assertEquals("a b", fuzzy.stringValue()); + assertEquals(FuzzyItem.DEFAULT_MAX_EDIT_DISTANCE, fuzzy.getMaxEditDistance()); + assertEquals(FuzzyItem.DEFAULT_PREFIX_LENGTH, fuzzy.getPrefixLength()); + assertFalse(fuzzy.isPrefixMatch()); + } + + @Test + void fuzzy_with_annotations() { + var where = """ + { + "contains": ["description", { + "fuzzy": { + "children": ["a b"], + "attributes": { + "maxEditDistance": 3, + "prefixLength": 10, + "prefix": true + } + } + }] + } + """; + QueryTree x = parseWhere(where); + Item root = x.getRoot(); + assertSame(FuzzyItem.class, root.getClass()); + var fuzzy = (FuzzyItem) root; + assertEquals("description", fuzzy.getIndexName()); + assertEquals("a b", fuzzy.stringValue()); + assertEquals(3, fuzzy.getMaxEditDistance()); + assertEquals(10, fuzzy.getPrefixLength()); + assertTrue(fuzzy.isPrefixMatch()); } //------------------------------------------------------------------- grouping tests diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index 8ba8c62c5ff..55810c02550 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -120,11 +120,12 @@ struct MatchStats { template <bool collect_matches> void -brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - FuzzyMatcher matcher(target, 2, prefix_size, cased); + FuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -144,11 +145,12 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit); Utf8Reader reader(vespalib::stringref(target.data(), target.size())); std::string target_copy; Utf8Writer<std::string> writer(target_copy); @@ -187,11 +189,12 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -247,20 +250,23 @@ struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> { updater.commit(); store.freeze_dictionary(); } - void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, bool prefix_match, const StringVector& exp_matches) { MatchStats stats; StringVector brute_force_matches; StringVector dfa_matches; StringVector dfa_no_skip_matches; bool cased = GetParam()._cased; SCOPED_TRACE(target); - brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches); - dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches); - dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches); + brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_matches); + dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_no_skip_matches); EXPECT_EQ(exp_matches, brute_force_matches); EXPECT_EQ(exp_matches, dfa_matches); EXPECT_EQ(exp_matches, dfa_no_skip_matches); } + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { + expect_prefix_matches(target, prefix_size, false, exp_matches); + } void expect_matches(std::string_view target, const StringVector& exp_matches) { expect_prefix_matches(target, 0, exp_matches); } @@ -284,11 +290,12 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) expect_matches("forcecast", {"forecast"}); } -TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_lock_length) { bool cased = GetParam()._cased; StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", - "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")}; + "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", + char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")}; populate_dictionary(words); expect_prefix_matches("a", 1, {}); expect_prefix_matches("b", 1, {"bob"}); @@ -321,6 +328,25 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) } } +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_matching) { + // We include the empty string to make "everything matches"-checks easier since + // the dictionary implicitly returns such an empty string sentinel already. + StringVector words = {"", "board", "boat", "bob", "door", "food", "foot", "football", "foothill", + "for", "forbid", "force", "ford", "forearm", "forecast", "forest"}; + populate_dictionary(words); + expect_prefix_matches("bo", 0, true, words); // matches everything + expect_prefix_matches("bo", 2, true, {"board", "boat", "bob"}); + expect_prefix_matches("boar", 0, true, {"board", "boat", "bob", "door", "for", "forbid", + "force", "ford", "forearm", "forecast", "forest"}); + expect_prefix_matches("board", 0, true, {"board", "boat", "ford"}); + expect_prefix_matches("footb", 0, true, {"food", "foot", "football", "foothill", "forbid"}); + expect_prefix_matches("footba", 0, true, {"foot", "football", "foothill"}); + expect_prefix_matches("footbal", 0, true, {"football", "foothill"}); + expect_prefix_matches("football", 0, true, {"football", "foothill"}); + expect_prefix_matches("footballs", 0, true, {"football"}); + expect_prefix_matches("z", 1, true, {}); +} + void benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm) { @@ -329,9 +355,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { const auto& entry = dict[i]; if (dfa_algorithm) { - dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy); } else { - brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy); } } std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp index 815ff3bf1fb..fb89f2ef061 100644 --- a/searchlib/src/tests/query/customtypevisitor_test.cpp +++ b/searchlib/src/tests/query/customtypevisitor_test.cpp @@ -37,7 +37,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {}; struct MyStringTerm : InitTerm<StringTerm> {}; struct MySubstrTerm : InitTerm<SubstringTerm> {}; struct MySuffixTerm : InitTerm<SuffixTerm> {}; -struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} }; +struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0, false) {} }; struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} }; struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} }; struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} }; diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp index b0c9c4e570e..bfce382b684 100644 --- a/searchlib/src/tests/query/query_visitor_test.cpp +++ b/searchlib/src/tests/query/query_visitor_test.cpp @@ -88,7 +88,7 @@ TEST("requireThatAllNodesCanBeVisited") { checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25)); checkVisit<TrueQueryNode>(new SimpleTrue()); checkVisit<FalseQueryNode>(new SimpleFalse()); - checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0)); + checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0, false)); checkVisit<InTerm>(new SimpleInTerm(std::make_unique<StringTermVector>(0), MultiTerm::Type::STRING, "field", 0, Weight(0))); } diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp index be795c8c94b..c70fe5b510b 100644 --- a/searchlib/src/tests/query/querybuilder_test.cpp +++ b/searchlib/src/tests/query/querybuilder_test.cpp @@ -117,7 +117,7 @@ Node::UP createQueryTree() { builder.add_true_node(); builder.add_false_node(); } - builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1); + builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1, false); } Node::UP node = builder.build(); ASSERT_TRUE(node.get()); @@ -326,8 +326,8 @@ void checkQueryTreeTypes(Node *node) { auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]); EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5])); - EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance()); - EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength()); + EXPECT_EQUAL(3u, fuzzy_term->max_edit_distance()); + EXPECT_EQUAL(1u, fuzzy_term->prefix_lock_length()); } struct AbstractTypes { @@ -452,8 +452,9 @@ struct MyTrue : TrueQueryNode {}; struct MyFalse : FalseQueryNode {}; struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w, - uint32_t m, uint32_t p) - : FuzzyTerm(t, f, i, w, m, p) { + uint32_t m, uint32_t p, bool prefix_match) + : FuzzyTerm(t, f, i, w, m, p, prefix_match) + { } }; struct MyInTerm : InTerm { @@ -645,23 +646,27 @@ TEST("require that All Range Syntaxes Work") { EXPECT_TRUE(range2 == range_term->getTerm()); } -TEST("require that fuzzy node can be created") { - QueryBuilder<SimpleQueryNodeTypes> builder; - builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1); - Node::UP node = builder.build(); +TEST("fuzzy node can be created") { + for (bool prefix_match : {false, true}) { + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1, prefix_match); + Node::UP node = builder.build(); - string stackDump = StackDumpCreator::create(*node); - { - SimpleQueryStackDumpIterator iterator(stackDump); - Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); - FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get()); - EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance()); - EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength()); - } - { - search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump); - EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance()); - EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength()); + string stackDump = StackDumpCreator::create(*node); + { + SimpleQueryStackDumpIterator iterator(stackDump); + Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); + auto *fuzzy_node = as_node<FuzzyTerm>(new_node.get()); + EXPECT_EQUAL(3u, fuzzy_node->max_edit_distance()); + EXPECT_EQUAL(1u, fuzzy_node->prefix_lock_length()); + EXPECT_EQUAL(prefix_match, fuzzy_node->prefix_match()); + } + { + search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump); + EXPECT_EQUAL(3u, queryTermSimple->fuzzy_max_edit_distance()); + EXPECT_EQUAL(1u, queryTermSimple->fuzzy_prefix_lock_length()); + EXPECT_EQUAL(prefix_match, queryTermSimple->fuzzy_prefix_match()); + } } } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp index 5f3ab9cd3d8..3ae2bdd3598 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp @@ -44,8 +44,12 @@ extract_suffix(std::string_view target, uint32_t prefix_size) } -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type) - : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, + bool cased, bool prefix_match, LevenshteinDfa::DfaType dfa_type) + : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, + (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), + dfa_type, // TODO reorder args + (prefix_match ? LevenshteinDfa::Matching::Prefix : LevenshteinDfa::Matching::FullString))), _successor(), _prefix(extract_prefix(target, prefix_size, cased)), _prefix_size(prefix_size), @@ -54,8 +58,8 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin _successor = _prefix; } -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased) - : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table) +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match) + : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Table) { } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index 653af602c0d..ee57e9e6583 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -26,8 +26,10 @@ private: const char* skip_prefix(const char* word) const; public: - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match, + vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); + // Defaults to table-based DFA: + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match); ~DfaFuzzyMatcher(); [[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept { diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index f1a643dc376..c6cb6e4a637 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -49,17 +49,19 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali ? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None) : vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); } else if (isFuzzy()) { - auto max_edit_dist = term.getFuzzyMaxEditDistance(); + const auto max_edit_dist = term.fuzzy_max_edit_distance(); _fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(), max_edit_dist, - term.getFuzzyPrefixLength(), - isCased()); + term.fuzzy_prefix_lock_length(), + isCased(), + term.fuzzy_prefix_match()); if ((fuzzy_matching_algorithm != FMA::BruteForce) && (max_edit_dist > 0 && max_edit_dist <= 2)) { _dfa_fuzzy_matcher = std::make_unique<DfaFuzzyMatcher>(term.getTerm(), max_edit_dist, - term.getFuzzyPrefixLength(), + term.fuzzy_prefix_lock_length(), isCased(), + term.fuzzy_prefix_match(), to_dfa_type(fuzzy_matching_algorithm)); } } else if (isCased()) { diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 5e3b1dffe3a..a05c68bea46 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -87,6 +87,8 @@ public: IFLAG_NORANK = 0x00000001, // this term should not be ranked (not exposed to rank framework) IFLAG_SPECIALTOKEN = 0x00000002, IFLAG_NOPOSITIONDATA = 0x00000004, // we should not use position data when ranking this term + IFLAG_FILTER = 0x00000008, // see GetCreator(flags) below + IFLAG_PREFIX_MATCH = 0x00000010 }; /** Extra information on each item (creator id) coded in bit 3 of flags */ diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h index d15e1ca0512..bf6e3abf1ea 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h @@ -113,9 +113,18 @@ public: uint32_t getUniqueId() const noexcept { return _currUniqueId; } // Get the flags of the current item. - bool hasNoRankFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NORANK) != 0; } - bool hasSpecialTokenFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; } - bool hasNoPositionDataFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; } + [[nodiscard]] bool hasNoRankFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_NORANK) != 0; + } + [[nodiscard]] bool hasSpecialTokenFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; + } + [[nodiscard]] bool hasNoPositionDataFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; + } + [[nodiscard]] bool has_prefix_match_semantics() const noexcept { + return (_currFlags & ParseItem::IFLAG_PREFIX_MATCH) != 0; + } uint32_t getArity() const noexcept { return _currArity; } @@ -127,9 +136,9 @@ public: bool getAllowApproximate() const noexcept { return (_extraIntArg2 != 0); } uint32_t getExploreAdditionalHits() const noexcept { return _extraIntArg3; } - // fuzzy match arguments - uint32_t getFuzzyMaxEditDistance() const noexcept { return _extraIntArg1; } - uint32_t getFuzzyPrefixLength() const noexcept { return _extraIntArg2; } + // fuzzy match arguments (see also: has_prefix_match_semantics() for fuzzy prefix matching) + [[nodiscard]] uint32_t fuzzy_max_edit_distance() const noexcept { return _extraIntArg1; } + [[nodiscard]] uint32_t fuzzy_prefix_lock_length() const noexcept { return _extraIntArg2; } std::unique_ptr<query::PredicateQueryTerm> getPredicateQueryTerm(); std::unique_ptr<query::TermVector> get_terms(); diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index 060cd5015b3..09fc443cf0e 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -248,10 +248,11 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type) _type(type), _diversityCutoffStrict(false), _valid(true), + _fuzzy_prefix_match(false), _term(term_), _diversityAttribute(), - _fuzzyMaxEditDistance(2), - _fuzzyPrefixLength(0) + _fuzzy_max_edit_distance(2), + _fuzzy_prefix_lock_length(0) { if (isFullRange(_term)) { stringref rest(_term.c_str() + 1, _term.size() - 2); diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index a740afb0340..b0ee7aa54bc 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -44,8 +44,9 @@ public: size_t getDiversityCutoffGroups() const noexcept { return _diversityCutoffGroups; } bool getDiversityCutoffStrict() const noexcept { return _diversityCutoffStrict; } vespalib::stringref getDiversityAttribute() const noexcept { return _diversityAttribute; } - size_t getFuzzyMaxEditDistance() const noexcept { return _fuzzyMaxEditDistance; } - size_t getFuzzyPrefixLength() const noexcept { return _fuzzyPrefixLength; } + [[nodiscard]] size_t fuzzy_max_edit_distance() const noexcept { return _fuzzy_max_edit_distance; } + [[nodiscard]] size_t fuzzy_prefix_lock_length() const noexcept { return _fuzzy_prefix_lock_length; } + [[nodiscard]] bool fuzzy_prefix_match() const noexcept { return _fuzzy_prefix_match; } bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept; bool getAsFloatTerm(double & lower, double & upper) const noexcept; bool getAsFloatTerm(float & lower, float & upper) const noexcept; @@ -77,14 +78,17 @@ private: Type _type; bool _diversityCutoffStrict; bool _valid; +protected: + bool _fuzzy_prefix_match; // set in QueryTerm +private: string _term; stringref _diversityAttribute; template <typename T, typename D> bool getAsNumericTerm(T & lower, T & upper, D d) const noexcept; protected: - uint32_t _fuzzyMaxEditDistance; // set in QueryTerm - uint32_t _fuzzyPrefixLength; + uint32_t _fuzzy_max_edit_distance; // set in QueryTerm + uint32_t _fuzzy_prefix_lock_length; }; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp index f33fe44369a..8a4ec184de8 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp @@ -13,20 +13,21 @@ constexpr bool normalizing_implies_cased(Normalizing norm) noexcept { FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, const string& index, Type type, Normalizing normalizing, - uint8_t max_edits, uint32_t prefix_size) + uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match) : QueryTerm(std::move(result_base), term, index, type, normalizing), _dfa_matcher(), _fallback_matcher() { - setFuzzyMaxEditDistance(max_edits); - setFuzzyPrefixLength(prefix_size); + set_fuzzy_max_edit_distance(max_edits); + set_fuzzy_prefix_lock_length(prefix_lock_length); + set_fuzzy_prefix_match(prefix_match); std::string_view term_view(term.data(), term.size()); const bool cased = normalizing_implies_cased(normalizing); if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) { - _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased); + _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match); } else { - _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased); + _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match); } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h index c6c88b18969..552a839cb3b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h @@ -23,7 +23,7 @@ class FuzzyTerm : public QueryTerm { public: FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, const string& index, Type type, Normalizing normalizing, - uint8_t max_edits, uint32_t prefix_size); + uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match); ~FuzzyTerm() override; [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 94a479fd2d3..68e276d0fde 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -130,7 +130,8 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode); } else if (sTerm == TermType::FUZZYTERM) { qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode, - queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength()); + queryRep.fuzzy_max_edit_distance(), queryRep.fuzzy_prefix_lock_length(), + queryRep.has_prefix_match_semantics()); } else [[likely]] { qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 05b12804d52..0e0ab98295b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -100,8 +100,9 @@ public: void visitMembers(vespalib::ObjectVisitor &visitor) const override; void setIndex(const string & index_) override { _index = index_; } const string & getIndex() const override { return _index; } - void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; } - void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; } + void set_fuzzy_max_edit_distance(uint32_t fuzzy_max_edit_distance) noexcept { _fuzzy_max_edit_distance = fuzzy_max_edit_distance; } + void set_fuzzy_prefix_lock_length(uint32_t fuzzy_prefix_length) noexcept { _fuzzy_prefix_lock_length = fuzzy_prefix_length; } + void set_fuzzy_prefix_match(bool prefix_match) noexcept { _fuzzy_prefix_match = prefix_match; } virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept; virtual MultiTerm* as_multi_term() noexcept; virtual const MultiTerm* as_multi_term() const noexcept; diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h index 41990af6908..a498076bb09 100644 --- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h +++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h @@ -226,8 +226,8 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st template <class NodeTypes> typename NodeTypes::FuzzyTerm * createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) { - return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); + uint32_t max_edit_distance, uint32_t prefix_lock_length, bool prefix_match) { + return new typename NodeTypes::FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match); } template <class NodeTypes> @@ -343,9 +343,10 @@ public: return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight)); } typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) { + uint32_t max_edit_distance, uint32_t prefix_lock_length, + bool prefix_match) { adjustWeight(weight); - return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength)); + return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match)); } typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name, int32_t id, Weight weight, uint32_t target_num_hits, diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h index f578bb57e21..116e677d439 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h @@ -201,7 +201,8 @@ private: replicate(node, _builder.addFuzzyTerm( node.getTerm(), node.getView(), node.getId(), node.getWeight(), - node.getMaxEditDistance(), node.getPrefixLength())); + node.max_edit_distance(), node.prefix_lock_length(), + node.prefix_match())); } std::unique_ptr<TermVector> replicate_subterms(const InTerm& original) { diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h index 535402bf14d..34bc39ae504 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h @@ -163,8 +163,10 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm { struct SimpleFuzzyTerm : FuzzyTerm { SimpleFuzzyTerm(const Type &term, vespalib::stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) - : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) { + uint32_t max_edit_distance, uint32_t prefix_lock_length, + bool prefix_match) + : FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match) + { } ~SimpleFuzzyTerm() override; }; diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp index 6707712ee69..b71a86a214a 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp @@ -225,6 +225,9 @@ class QueryNodeConverter : public QueryVisitor { if (!node.usePositionData()) { flags |= ParseItem::IFLAG_NOPOSITIONDATA; } + if (node.prefix_match()) { + flags |= ParseItem::IFLAG_PREFIX_MATCH; + } if (flags != 0) { features |= ParseItem::IF_FLAGS; } @@ -289,8 +292,8 @@ class QueryNodeConverter : public QueryVisitor { void visit(FuzzyTerm &node) override { createTerm(node, ParseItem::ITEM_FUZZY); - appendCompressedPositiveNumber(node.getMaxEditDistance()); - appendCompressedPositiveNumber(node.getPrefixLength()); + appendCompressedPositiveNumber(node.max_edit_distance()); + appendCompressedPositiveNumber(node.prefix_lock_length()); } void visit(NearestNeighborTerm &node) override { diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h index 7c743a72567..6f7d7ddd9ad 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h @@ -42,6 +42,9 @@ public: if (queryStack.hasNoPositionDataFlag()) { t->setPositionData(false); } + if (queryStack.has_prefix_match_semantics()) [[unlikely]] { + t->set_prefix_match(true); + } } } if (builder.hasError()) { @@ -185,9 +188,10 @@ private: } else if (type == ParseItem::ITEM_REGEXP) { t = &builder.addRegExpTerm(term, view, id, weight); } else if (type == ParseItem::ITEM_FUZZY) { - uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance(); - uint32_t prefixLength = queryStack.getFuzzyPrefixLength(); - t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); + uint32_t max_edit_distance = queryStack.fuzzy_max_edit_distance(); + uint32_t prefix_lock_length = queryStack.fuzzy_prefix_lock_length(); + bool prefix_match = queryStack.has_prefix_match_semantics(); + t = &builder.addFuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match); } else if (type == ParseItem::ITEM_STRING_IN) { t = &builder.add_in_term(queryStack.get_terms(), MultiTerm::Type::STRING, view, id, weight); } else if (type == ParseItem::ITEM_NUMERIC_IN) { diff --git a/searchlib/src/vespa/searchlib/query/tree/term.cpp b/searchlib/src/vespa/searchlib/query/tree/term.cpp index 45ba259a799..35f6ea5c8f5 100644 --- a/searchlib/src/vespa/searchlib/query/tree/term.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/term.cpp @@ -12,12 +12,14 @@ Term::Term(vespalib::stringref view, int32_t id, Weight weight) : _id(id), _weight(weight), _ranked(true), - _position_data(true) + _position_data(true), + _prefix_match(false) { } void Term::setStateFrom(const Term& other) { setRanked(other.isRanked()); setPositionData(other.usePositionData()); + set_prefix_match(other.prefix_match()); // too late to copy this state: assert(_view == other.getView()); assert(_id == other.getId()); diff --git a/searchlib/src/vespa/searchlib/query/tree/term.h b/searchlib/src/vespa/searchlib/query/tree/term.h index 2f57c3cb06d..678d018d111 100644 --- a/searchlib/src/vespa/searchlib/query/tree/term.h +++ b/searchlib/src/vespa/searchlib/query/tree/term.h @@ -18,6 +18,7 @@ class Term Weight _weight; bool _ranked; bool _position_data; + bool _prefix_match; public: virtual ~Term() = 0; @@ -25,14 +26,17 @@ public: void setView(const vespalib::string & view) { _view = view; } void setRanked(bool ranked) noexcept { _ranked = ranked; } void setPositionData(bool position_data) noexcept { _position_data = position_data; } + // Used for fuzzy prefix matching. Not to be confused with distinct Prefix query term type + void set_prefix_match(bool prefix_match) noexcept { _prefix_match = prefix_match; } void setStateFrom(const Term& other); const vespalib::string & getView() const noexcept { return _view; } Weight getWeight() const noexcept { return _weight; } int32_t getId() const noexcept { return _id; } - bool isRanked() const noexcept { return _ranked; } - bool usePositionData() const noexcept { return _position_data; } + [[nodiscard]] bool isRanked() const noexcept { return _ranked; } + [[nodiscard]] bool usePositionData() const noexcept { return _position_data; } + [[nodiscard]] bool prefix_match() const noexcept { return _prefix_match; } static bool isPossibleRangeTerm(vespalib::stringref term) noexcept { return (term[0] == '[' || term[0] == '<' || term[0] == '>'); diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h index 97e659dcc77..42dc0717368 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h @@ -119,19 +119,22 @@ public: //----------------------------------------------------------------------------- class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> { -private: - uint32_t _maxEditDistance; - uint32_t _prefixLength; + uint32_t _max_edit_distance; + uint32_t _prefix_lock_length; + // Prefix match mode is stored in parent Term public: FuzzyTerm(const Type &term, vespalib::stringref view, - int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength) - : QueryNodeMixinType(term, view, id, weight), - _maxEditDistance(maxEditDistance), - _prefixLength(prefixLength) - {} + int32_t id, Weight weight, uint32_t max_edit_distance, + uint32_t prefix_lock_length, bool prefix_match) + : QueryNodeMixinType(term, view, id, weight), + _max_edit_distance(max_edit_distance), + _prefix_lock_length(prefix_lock_length) + { + set_prefix_match(prefix_match); + } - uint32_t getMaxEditDistance() const { return _maxEditDistance; } - uint32_t getPrefixLength() const { return _prefixLength; } + [[nodiscard]] uint32_t max_edit_distance() const { return _max_edit_distance; } + [[nodiscard]] uint32_t prefix_lock_length() const { return _prefix_lock_length; } virtual ~FuzzyTerm() = 0; }; diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index ce9636895b4..84c3a542661 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -75,31 +75,47 @@ std::string_view maybe_consume_into(std::string_view str, T& val_out) { return str.substr(ptr - str.data()); } -// Parse optional max edits and prefix lock length from term string. +// Parse optional prefix match mode, max edits and prefix lock length from term string. // Syntax: -// "term" -> {2, 0, "term"} (default max edits & prefix length) -// "{1}term" -> {1, 0, "term"} -// "{1,3}term" -> {1, 3, "term"} +// "term" -> {2, 0, false, "term"} (default max edits, prefix length and prefix match mode) +// "{p}term" -> {2, 0, true, "term"} +// "{1}term" -> {1, 0, false, "term"} +// "{p1}term" -> {1, 0, true, "term"} +// "{1,3}term" -> {1, 3, false, "term"} +// "{p1,3}term" -> {1, 3, true, "term"} +// .. and so on // // Note: this is not a "proper" parser (it accepts empty numeric values); only for testing! -std::tuple<uint8_t, uint32_t, std::string_view> parse_fuzzy_params(std::string_view term) { +std::tuple<uint8_t, uint32_t, bool, std::string_view> parse_fuzzy_params(std::string_view term) { + std::string_view orig_term = term; if (term.empty() || term[0] != '{') { - return {2, 0, term}; + return {2, 0, false, term}; } + term = term.substr(1); // skip '{' uint8_t max_edits = 2; uint32_t prefix_length = 0; - term = maybe_consume_into(term.substr(1), max_edits); + bool prefix_match = false; + if (!term.empty() && term[0] == 'p') { + prefix_match = true; + term = term.substr(1); // skip 'p' + } + if (!term.empty() && term[0] == '}') { + return {2, 0, prefix_match, term.substr(1)}; + } + term = maybe_consume_into(term, max_edits); if (term.empty() || (term[0] != ',' && term[0] != '}')) { - throw std::invalid_argument("malformed fuzzy params at (or after) max_edits"); + throw std::invalid_argument("malformed fuzzy params at (or after) max_edits: " + + std::string(term) + " in string " + std::string(orig_term)); } if (term[0] == '}') { - return {max_edits, prefix_length, term.substr(1)}; + return {max_edits, prefix_length, prefix_match, term.substr(1)}; } term = maybe_consume_into(term.substr(1), prefix_length); if (term.empty() || term[0] != '}') { - throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length"); + throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length: " + + std::string(term) + " in string " + std::string(orig_term)); } - return {max_edits, prefix_length, term.substr(1)}; + return {max_edits, prefix_length, prefix_match, term.substr(1)}; } } @@ -115,9 +131,10 @@ private: if (pt.second == TermType::REGEXP) { qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, TermType::REGEXP, normalizing)); } else if (pt.second == TermType::FUZZYTERM) { - auto [max_edits, prefix_length, actual_term] = parse_fuzzy_params(pt.first); + auto [max_edits, prefix_lock_length, prefix_match, actual_term] = parse_fuzzy_params(pt.first); qtv.push_back(std::make_unique<FuzzyTerm>(eqnr.create(), vespalib::stringref(actual_term.data(), actual_term.size()), - effective_index, TermType::FUZZYTERM, normalizing, max_edits, prefix_length)); + effective_index, TermType::FUZZYTERM, normalizing, max_edits, + prefix_lock_length, prefix_match)); } else { qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); } @@ -531,25 +548,29 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) return true; } -TEST("parsing of test-only fuzzy term params can extract numeric values") { +void check_fuzzy_param_parsing(std::string_view term, std::string_view exp_term, + uint8_t exp_max_edits, uint32_t exp_prefix_length, bool exp_prefix) +{ uint8_t max_edits = 0; - uint32_t prefix_length = 1234; + uint32_t prefix_length = 0; + bool prefix = false; std::string_view out; - std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("myterm"); - EXPECT_EQUAL(max_edits, 2u); - EXPECT_EQUAL(prefix_length, 0u); - EXPECT_EQUAL(out, "myterm"); + std::tie(max_edits, prefix_length, prefix, out) = parse_fuzzy_params(term); + EXPECT_EQUAL(static_cast<uint32_t>(max_edits), static_cast<uint32_t>(exp_max_edits)); // don't print as char... + EXPECT_EQUAL(prefix_length, exp_prefix_length); + EXPECT_EQUAL(prefix, exp_prefix); + EXPECT_EQUAL(out, exp_term); - std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{3}myterm"); - EXPECT_EQUAL(max_edits, 3u); - EXPECT_EQUAL(prefix_length, 0u); - EXPECT_EQUAL(out, "myterm"); +} - std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{2,70}myterm"); - EXPECT_EQUAL(max_edits, 2u); - EXPECT_EQUAL(prefix_length, 70u); - EXPECT_EQUAL(out, "myterm"); +TEST("parsing of test-only fuzzy term params can extract expected values") { + check_fuzzy_param_parsing("myterm", "myterm", 2, 0, false); + check_fuzzy_param_parsing("{3}myterm", "myterm", 3, 0, false); + check_fuzzy_param_parsing("{p}myterm", "myterm", 2, 0, true); + check_fuzzy_param_parsing("{p1}myterm", "myterm", 1, 0, true); + check_fuzzy_param_parsing("{2,70}myterm", "myterm", 2, 70, false); + check_fuzzy_param_parsing("{p2,70}myterm", "myterm", 2, 70, true); } TEST("verify correct term parsing") { @@ -640,6 +661,14 @@ TEST("utf8 substring search with empty term") assertFieldInfo(fs, "", "abc", QTFieldInfo().setFieldLength(0)); } +Hits is_hit() { + return Hits().add({0, 0}); +} + +Hits no_hits() { + return {}; +} + TEST("utf8 suffix search") { UTF8SuffixStringFieldSearcher fs(0); std::string field = "operators and operator overloading"; @@ -837,6 +866,37 @@ TEST("utf8 flexible searcher fuzzy matching treats field as 1 word") { TEST_DO(assertFieldInfo(fs, "%{1}foo", "foo bar baz", QTFieldInfo(0, 0, 1))); } +TEST("utf8 flexible searcher supports fuzzy prefix matching") { + UTF8FlexibleStringFieldSearcher fs(0); + TEST_DO(assertString(fs, "%{p0}z", "zoid", is_hit())); + TEST_DO(assertString(fs, "%{p0}zo", "zoid", is_hit())); + TEST_DO(assertString(fs, "%{p0}zo", "Zoid", is_hit())); // uncased + TEST_DO(assertString(fs, "%{p0}Zo", "zoid", is_hit())); // uncased + TEST_DO(assertString(fs, "%{p0}zoid", "zoid", is_hit())); + TEST_DO(assertString(fs, "%{p0}x", "zoid", no_hits())); + TEST_DO(assertString(fs, "%{p1}zo", "boid", is_hit())); + TEST_DO(assertString(fs, "%{p1}zo", "blid", no_hits())); + TEST_DO(assertString(fs, "%{p1}yam", "hamburger", is_hit())); + TEST_DO(assertString(fs, "%{p1}yam", "humbug", no_hits())); + TEST_DO(assertString(fs, "%{p2}yam", "humbug", is_hit())); + TEST_DO(assertString(fs, "%{p2}catfo", "dogfood", no_hits())); + TEST_DO(assertString(fs, "%{p3}catfo", "dogfood", is_hit())); + TEST_DO(assertString(fs, "%{p100}abcd", "anything you want", is_hit())); // trivially matches +} + +TEST("utf8 flexible searcher supports fuzzy prefix matching combined with prefix locking") { + UTF8FlexibleStringFieldSearcher fs(0); + TEST_DO(assertString(fs, "%{p0,4}zoid", "zoid", is_hit())); + TEST_DO(assertString(fs, "%{p0,4}zoidber", "zoidberg", is_hit())); + TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburg", is_hit())); + TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidblurgh", no_hits())); + TEST_DO(assertString(fs, "%{p1,4}zoidbe", "zoidblurgh", is_hit())); + TEST_DO(assertString(fs, "%{p1,4}zoidberg", "boidberg", no_hits())); + TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidburger", is_hit())); + TEST_DO(assertString(fs, "%{p1,4}zoidber", "zoidbananas", no_hits())); + TEST_DO(assertString(fs, "%{p2,4}zoidber", "zoidbananas", is_hit())); +} + TEST("bool search") { BoolFieldSearcher fs(0); TEST_DO(assertBool(fs, "true", true, true)); diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp index d94120e5bcf..9e4550db073 100644 --- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) { } TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { - FuzzyMatcher fuzzy("abc", 2, 0, false); + FuzzyMatcher fuzzy("abc", 2, 0, false, false); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("ABC")); EXPECT_TRUE(fuzzy.isMatch("ab1")); @@ -42,15 +42,15 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { } TEST(FuzzyMatcherTest, fuzzy_match_cased) { - FuzzyMatcher fuzzy("abc", 2, 0, true); + FuzzyMatcher fuzzy("abc", 2, 0, true, false); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("abC")); EXPECT_TRUE(fuzzy.isMatch("aBC")); EXPECT_FALSE(fuzzy.isMatch("ABC")); } -TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { - FuzzyMatcher fuzzy("abcdef", 2, 2, false); +TEST(FuzzyMatcherTest, fuzzy_match_with_prefix_locking) { + FuzzyMatcher fuzzy("abcdef", 2, 2, false, false); EXPECT_TRUE(fuzzy.isMatch("abcdef")); EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); EXPECT_TRUE(fuzzy.isMatch("abcde1")); @@ -59,22 +59,43 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { EXPECT_FALSE(fuzzy.isMatch("12cdef")); } -TEST(FuzzyMatcherTest, get_prefix_is_empty) { - FuzzyMatcher fuzzy("whatever", 2, 0, false); +TEST(FuzzyMatcherTest, get_prefix_lock_length_is_zero) { + FuzzyMatcher fuzzy("whatever", 2, 0, false, false); EXPECT_EQ(fuzzy.getPrefix(), ""); } TEST(FuzzyMatcherTest, term_is_empty) { - FuzzyMatcher fuzzy("", 2, 0, false); + FuzzyMatcher fuzzy("", 2, 0, false, false); EXPECT_TRUE(fuzzy.isMatch("")); EXPECT_TRUE(fuzzy.isMatch("a")); EXPECT_TRUE(fuzzy.isMatch("aa")); EXPECT_FALSE(fuzzy.isMatch("aaa")); } -TEST(FuzzyMatcherTest, get_prefix_non_empty) { - FuzzyMatcher fuzzy("abcd", 2, 2, false); +TEST(FuzzyMatcherTest, get_prefix_lock_length_non_zero) { + FuzzyMatcher fuzzy("abcd", 2, 2, false, false); EXPECT_EQ(fuzzy.getPrefix(), "ab"); } +TEST(FuzzyMatcherTest, fuzzy_prefix_matching_without_prefix_lock_length) { + FuzzyMatcher fuzzy("abc", 1, 0, false, true); + EXPECT_EQ(fuzzy.getPrefix(), ""); + EXPECT_TRUE(fuzzy.isMatch("abc")); + EXPECT_TRUE(fuzzy.isMatch("abcdefgh")); + EXPECT_TRUE(fuzzy.isMatch("ab")); + EXPECT_TRUE(fuzzy.isMatch("abd")); + EXPECT_TRUE(fuzzy.isMatch("xabc")); + EXPECT_FALSE(fuzzy.isMatch("xy")); +} + +TEST(FuzzyMatcherTest, fuzzy_prefix_matching_with_prefix_lock_length) { + FuzzyMatcher fuzzy("zoid", 1, 2, false, true); + EXPECT_EQ(fuzzy.getPrefix(), "zo"); + EXPECT_TRUE(fuzzy.isMatch("zoidberg")); + EXPECT_TRUE(fuzzy.isMatch("zold")); + EXPECT_TRUE(fuzzy.isMatch("zoldberg")); + EXPECT_FALSE(fuzzy.isMatch("zoxx")); + EXPECT_FALSE(fuzzy.isMatch("loid")); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp index 918cbc624db..61824fa4abf 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp @@ -74,7 +74,11 @@ TEST(LevenshteinDistance, prefix_match_edge_cases) { EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); - // Max edits > 2 cases; not supported by DFA implementation. + // Max edits not in {1, 2} cases; not supported by DFA implementation. + EXPECT_EQ(prefix_calculate("", "", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abc", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcde", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "dbc", 0), std::nullopt); EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3}); EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3}); EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3}); diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp index 71388f43b89..fbfa50aa57f 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -29,10 +29,12 @@ vespalib::FuzzyMatcher::FuzzyMatcher(): _folded_term_codepoints_suffix() {} -vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased): +vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, + bool is_cased, bool is_prefix): _max_edit_distance(max_edit_distance), _prefix_size(prefix_size), _is_cased(is_cased), + _is_prefix(is_prefix), _folded_term_codepoints(_is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term)), _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) @@ -73,7 +75,7 @@ bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { return LevenshteinDistance::calculate( _folded_term_codepoints_suffix, get_suffix(targetCodepoints, _prefix_size), - _max_edit_distance).has_value(); + _max_edit_distance, _is_prefix).has_value(); } vespalib::string vespalib::FuzzyMatcher::getPrefix() const { diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h index aae6ca1c6e5..2c17283d20a 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -24,6 +24,7 @@ private: uint32_t _max_edit_distance; // max edit distance uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy bool _is_cased; + bool _is_prefix; std::vector<uint32_t> _folded_term_codepoints; @@ -34,7 +35,7 @@ public: FuzzyMatcher(); FuzzyMatcher(const FuzzyMatcher &) = delete; FuzzyMatcher & operator = (const FuzzyMatcher &) = delete; - FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased); + FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased, bool is_prefix); ~FuzzyMatcher(); [[nodiscard]] bool isMatch(std::string_view target) const; |