diff options
author | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-04-19 11:19:18 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-04-19 13:45:59 +0000 |
commit | 0afbf14df1ee158167f70016545e799af1e433dc (patch) | |
tree | 5af98617f61cb76fcfe897585c9c2712955de3b9 /vespalib | |
parent | 433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff) |
Wire fuzzy prefix matching support through the query stack
Adds `prefix:[true|false]` annotation support to the `fuzzy`
query operator in the YQL and JSON query languages. Fuzzy
prefix matching semantics are wired through to the matcher
implementations for both indexed and streaming search.
Example usage:
{maxEditDistance:1,prefix:true}fuzzy("foo")
Will match `foo`, `foobar`, `foxtrot`, `zookeeper` and so on.
It can be combined with the existing prefix locking feature:
{maxEditDistance:1,prefixLength:2,prefix:true}fuzzy("foo")
Which will match `foo`, `foobar`, `foxtrot` etc, but _not_
`zookeeper` since the locked prefix (`fo`) does not match.
Due to the complexities involved with extending the legacy binary
query stack representation, signalling prefix matching for the
fuzzy term is done by pragmatically adding a new, generic "prefix
matching" term-level flag. This is currently ignored for
everything except fuzzy query items.
Modernizing the query stack format to make it more extensible
(i.e. move encoding to Protobuf) is on the backlog...!
Diffstat (limited to 'vespalib')
4 files changed, 41 insertions, 13 deletions
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp index d94120e5bcf..9e4550db073 100644 --- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) { } TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { - FuzzyMatcher fuzzy("abc", 2, 0, false); + FuzzyMatcher fuzzy("abc", 2, 0, false, false); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("ABC")); EXPECT_TRUE(fuzzy.isMatch("ab1")); @@ -42,15 +42,15 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { } TEST(FuzzyMatcherTest, fuzzy_match_cased) { - FuzzyMatcher fuzzy("abc", 2, 0, true); + FuzzyMatcher fuzzy("abc", 2, 0, true, false); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("abC")); EXPECT_TRUE(fuzzy.isMatch("aBC")); EXPECT_FALSE(fuzzy.isMatch("ABC")); } -TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { - FuzzyMatcher fuzzy("abcdef", 2, 2, false); +TEST(FuzzyMatcherTest, fuzzy_match_with_prefix_locking) { + FuzzyMatcher fuzzy("abcdef", 2, 2, false, false); EXPECT_TRUE(fuzzy.isMatch("abcdef")); EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); EXPECT_TRUE(fuzzy.isMatch("abcde1")); @@ -59,22 +59,43 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { EXPECT_FALSE(fuzzy.isMatch("12cdef")); } -TEST(FuzzyMatcherTest, get_prefix_is_empty) { - FuzzyMatcher fuzzy("whatever", 2, 0, false); +TEST(FuzzyMatcherTest, get_prefix_lock_length_is_zero) { + FuzzyMatcher fuzzy("whatever", 2, 0, false, false); EXPECT_EQ(fuzzy.getPrefix(), ""); } TEST(FuzzyMatcherTest, term_is_empty) { - FuzzyMatcher fuzzy("", 2, 0, false); + FuzzyMatcher fuzzy("", 2, 0, false, false); EXPECT_TRUE(fuzzy.isMatch("")); EXPECT_TRUE(fuzzy.isMatch("a")); EXPECT_TRUE(fuzzy.isMatch("aa")); EXPECT_FALSE(fuzzy.isMatch("aaa")); } -TEST(FuzzyMatcherTest, get_prefix_non_empty) { - FuzzyMatcher fuzzy("abcd", 2, 2, false); +TEST(FuzzyMatcherTest, get_prefix_lock_length_non_zero) { + FuzzyMatcher fuzzy("abcd", 2, 2, false, false); EXPECT_EQ(fuzzy.getPrefix(), "ab"); } +TEST(FuzzyMatcherTest, fuzzy_prefix_matching_without_prefix_lock_length) { + FuzzyMatcher fuzzy("abc", 1, 0, false, true); + EXPECT_EQ(fuzzy.getPrefix(), ""); + EXPECT_TRUE(fuzzy.isMatch("abc")); + EXPECT_TRUE(fuzzy.isMatch("abcdefgh")); + EXPECT_TRUE(fuzzy.isMatch("ab")); + EXPECT_TRUE(fuzzy.isMatch("abd")); + EXPECT_TRUE(fuzzy.isMatch("xabc")); + EXPECT_FALSE(fuzzy.isMatch("xy")); +} + +TEST(FuzzyMatcherTest, fuzzy_prefix_matching_with_prefix_lock_length) { + FuzzyMatcher fuzzy("zoid", 1, 2, false, true); + EXPECT_EQ(fuzzy.getPrefix(), "zo"); + EXPECT_TRUE(fuzzy.isMatch("zoidberg")); + EXPECT_TRUE(fuzzy.isMatch("zold")); + EXPECT_TRUE(fuzzy.isMatch("zoldberg")); + EXPECT_FALSE(fuzzy.isMatch("zoxx")); + EXPECT_FALSE(fuzzy.isMatch("loid")); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp index 918cbc624db..61824fa4abf 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp @@ -74,7 +74,11 @@ TEST(LevenshteinDistance, prefix_match_edge_cases) { EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2}); EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt); - // Max edits > 2 cases; not supported by DFA implementation. + // Max edits not in {1, 2} cases; not supported by DFA implementation. + EXPECT_EQ(prefix_calculate("", "", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abc", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "abcde", 0), std::optional{0}); + EXPECT_EQ(prefix_calculate("abc", "dbc", 0), std::nullopt); EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3}); EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3}); EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3}); diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp index 71388f43b89..fbfa50aa57f 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -29,10 +29,12 @@ vespalib::FuzzyMatcher::FuzzyMatcher(): _folded_term_codepoints_suffix() {} -vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased): +vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, + bool is_cased, bool is_prefix): _max_edit_distance(max_edit_distance), _prefix_size(prefix_size), _is_cased(is_cased), + _is_prefix(is_prefix), _folded_term_codepoints(_is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term)), _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) @@ -73,7 +75,7 @@ bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { return LevenshteinDistance::calculate( _folded_term_codepoints_suffix, get_suffix(targetCodepoints, _prefix_size), - _max_edit_distance).has_value(); + _max_edit_distance, _is_prefix).has_value(); } vespalib::string vespalib::FuzzyMatcher::getPrefix() const { diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h index aae6ca1c6e5..2c17283d20a 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -24,6 +24,7 @@ private: uint32_t _max_edit_distance; // max edit distance uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy bool _is_cased; + bool _is_prefix; std::vector<uint32_t> _folded_term_codepoints; @@ -34,7 +35,7 @@ public: FuzzyMatcher(); FuzzyMatcher(const FuzzyMatcher &) = delete; FuzzyMatcher & operator = (const FuzzyMatcher &) = delete; - FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased); + FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased, bool is_prefix); ~FuzzyMatcher(); [[nodiscard]] bool isMatch(std::string_view target) const; |