summaryrefslogtreecommitdiffstats
path: root/vespalib/src/tests
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 11:19:18 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-04-19 13:45:59 +0000
commit0afbf14df1ee158167f70016545e799af1e433dc (patch)
tree5af98617f61cb76fcfe897585c9c2712955de3b9 /vespalib/src/tests
parent433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff)
Wire fuzzy prefix matching support through the query stack
Adds `prefix:[true|false]` annotation support to the `fuzzy` query operator in the YQL and JSON query languages. Fuzzy prefix matching semantics are wired through to the matcher implementations for both indexed and streaming search. Example usage: {maxEditDistance:1,prefix:true}fuzzy("foo") Will match `foo`, `foobar`, `foxtrot`, `zookeeper` and so on. It can be combined with the existing prefix locking feature: {maxEditDistance:1,prefixLength:2,prefix:true}fuzzy("foo") Which will match `foo`, `foobar`, `foxtrot` etc, but _not_ `zookeeper` since the locked prefix (`fo`) does not match. Due to the complexities involved with extending the legacy binary query stack representation, signalling prefix matching for the fuzzy term is done by pragmatically adding a new, generic "prefix matching" term-level flag. This is currently ignored for everything except fuzzy query items. Modernizing the query stack format to make it more extensible (i.e. move encoding to Protobuf) is on the backlog...!
Diffstat (limited to 'vespalib/src/tests')
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp39
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp6
2 files changed, 35 insertions, 10 deletions
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index d94120e5bcf..9e4550db073 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy("abc", 2, 0, false);
+ FuzzyMatcher fuzzy("abc", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -42,15 +42,15 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
}
TEST(FuzzyMatcherTest, fuzzy_match_cased) {
- FuzzyMatcher fuzzy("abc", 2, 0, true);
+ FuzzyMatcher fuzzy("abc", 2, 0, true, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("abC"));
EXPECT_TRUE(fuzzy.isMatch("aBC"));
EXPECT_FALSE(fuzzy.isMatch("ABC"));
}
-TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy("abcdef", 2, 2, false);
+TEST(FuzzyMatcherTest, fuzzy_match_with_prefix_locking) {
+ FuzzyMatcher fuzzy("abcdef", 2, 2, false, false);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
@@ -59,22 +59,43 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
EXPECT_FALSE(fuzzy.isMatch("12cdef"));
}
-TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy("whatever", 2, 0, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_is_zero) {
+ FuzzyMatcher fuzzy("whatever", 2, 0, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
TEST(FuzzyMatcherTest, term_is_empty) {
- FuzzyMatcher fuzzy("", 2, 0, false);
+ FuzzyMatcher fuzzy("", 2, 0, false, false);
EXPECT_TRUE(fuzzy.isMatch(""));
EXPECT_TRUE(fuzzy.isMatch("a"));
EXPECT_TRUE(fuzzy.isMatch("aa"));
EXPECT_FALSE(fuzzy.isMatch("aaa"));
}
-TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy("abcd", 2, 2, false);
+TEST(FuzzyMatcherTest, get_prefix_lock_length_non_zero) {
+ FuzzyMatcher fuzzy("abcd", 2, 2, false, false);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_without_prefix_lock_length) {
+ FuzzyMatcher fuzzy("abc", 1, 0, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "");
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("abcdefgh"));
+ EXPECT_TRUE(fuzzy.isMatch("ab"));
+ EXPECT_TRUE(fuzzy.isMatch("abd"));
+ EXPECT_TRUE(fuzzy.isMatch("xabc"));
+ EXPECT_FALSE(fuzzy.isMatch("xy"));
+}
+
+TEST(FuzzyMatcherTest, fuzzy_prefix_matching_with_prefix_lock_length) {
+ FuzzyMatcher fuzzy("zoid", 1, 2, false, true);
+ EXPECT_EQ(fuzzy.getPrefix(), "zo");
+ EXPECT_TRUE(fuzzy.isMatch("zoidberg"));
+ EXPECT_TRUE(fuzzy.isMatch("zold"));
+ EXPECT_TRUE(fuzzy.isMatch("zoldberg"));
+ EXPECT_FALSE(fuzzy.isMatch("zoxx"));
+ EXPECT_FALSE(fuzzy.isMatch("loid"));
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
index 918cbc624db..61824fa4abf 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_distance_test.cpp
@@ -74,7 +74,11 @@ TEST(LevenshteinDistance, prefix_match_edge_cases) {
EXPECT_EQ(prefix_calculate("xy", "", 2), std::optional{2});
EXPECT_EQ(prefix_calculate("xyz", "", 2), std::nullopt);
- // Max edits > 2 cases; not supported by DFA implementation.
+ // Max edits not in {1, 2} cases; not supported by DFA implementation.
+ EXPECT_EQ(prefix_calculate("", "", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abc", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "abcde", 0), std::optional{0});
+ EXPECT_EQ(prefix_calculate("abc", "dbc", 0), std::nullopt);
EXPECT_EQ(prefix_calculate("abc", "", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xy", 3), std::optional{3});
EXPECT_EQ(prefix_calculate("abc", "xyz", 3), std::optional{3});