summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Chernyshev <aleksei@spotify.com>2022-05-02 14:45:30 +0200
committerAlexey Chernyshev <aleksei@spotify.com>2022-05-04 10:40:35 +0200
commit2aa476818678187d1dcb88ff44e137608b439d50 (patch)
tree7bb19cf6ea39161cae90e7133ff7569a3ec300ab
parent7585981b32937d2b13da1a8f94b42c8a0833a4c2 (diff)
Supporting cased match for fuzzy operator
-rw-r--r--searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp17
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp18
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp32
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h29
4 files changed, 54 insertions, 42 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
index cd233b75438..5df0efe256e 100644
--- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp
@@ -23,14 +23,15 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased)
} else {
_regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase);
}
+ } else if (isFuzzy()) {
+ _fuzzyMatcher = vespalib::FuzzyMatcher(
+ term.getTerm(),
+ term.getFuzzyMaxEditDistance(),
+ term.getFuzzyPrefixLength(),
+ isCased());
} else if (isCased()) {
_term._char = term.getTerm();
_termLen = term.getTermLen();
- } else if (isFuzzy()) {
- _fuzzyMatcher = vespalib::FuzzyMatcher::from_term(
- term.getTerm(),
- term.getFuzzyMaxEditDistance(),
- term.getFuzzyPrefixLength());
} else {
term.term(_term._ucs4);
}
@@ -45,13 +46,13 @@ StringSearchHelper::isMatch(const char *src) const {
if (__builtin_expect(isRegex(), false)) {
return getRegex().valid() ? getRegex().partial_match(std::string_view(src)) : false;
}
+ if (__builtin_expect(isFuzzy(), false)) {
+ return getFuzzyMatcher().isMatch(src);
+ }
if (__builtin_expect(isCased(), false)) {
int res = strncmp(_term._char, src, _termLen);
return (res == 0) && (src[_termLen] == 0 || isPrefix());
}
- if (__builtin_expect(isFuzzy(), false)) {
- return getFuzzyMatcher().isMatch(src);
- }
vespalib::Utf8ReaderForZTS u8reader(src);
uint32_t j = 0;
uint32_t val;
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index 1395915fa5f..a0e261fb925 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0);
+ FuzzyMatcher fuzzy("abc", 2, 0, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -41,8 +41,16 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
EXPECT_FALSE(fuzzy.isMatch("123"));
}
+TEST(FuzzyMatcherTest, fuzzy_match_cased) {
+ FuzzyMatcher fuzzy("abc", 2, 0, true);
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("abC"));
+ EXPECT_TRUE(fuzzy.isMatch("aBC"));
+ EXPECT_FALSE(fuzzy.isMatch("ABC"));
+}
+
TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2);
+ FuzzyMatcher fuzzy("abcdef", 2, 2, false);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
@@ -52,12 +60,12 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
}
TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0);
+ FuzzyMatcher fuzzy("whatever", 2, 0, false);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
TEST(FuzzyMatcherTest, term_is_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0);
+ FuzzyMatcher fuzzy("", 2, 0, false);
EXPECT_TRUE(fuzzy.isMatch(""));
EXPECT_TRUE(fuzzy.isMatch("a"));
EXPECT_TRUE(fuzzy.isMatch("aa"));
@@ -65,7 +73,7 @@ TEST(FuzzyMatcherTest, term_is_empty) {
}
TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2);
+ FuzzyMatcher fuzzy("abcd", 2, 2, false);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index 21fdb86a48d..89449944b73 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -6,8 +6,34 @@
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
-vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) {
- return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength);
+namespace {
+
+ std::vector<uint32_t> cased_convert_to_ucs4(std::string_view input) {
+ std::vector<uint32_t> result;
+ result.reserve(input.size());
+ vespalib::Utf8Reader reader(input.data());
+ while (reader.hasMore()) {
+ result.emplace_back(reader.getChar());
+ }
+ return result;
+ }
+
+} // anonymous
+
+vespalib::FuzzyMatcher::FuzzyMatcher():
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize),
+ _is_cased(false)
+{}
+
+vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased):
+ _max_edit_distance(max_edit_distance),
+ _prefix_size(prefix_size),
+ _is_cased(is_cased)
+{
+ _folded_term_codepoints = _is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term);
+ _folded_term_codepoints_prefix = get_prefix(_folded_term_codepoints, _prefix_size);
+ _folded_term_codepoints_suffix = get_suffix(_folded_term_codepoints, _prefix_size);
}
std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
@@ -29,7 +55,7 @@ std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<u
}
bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
- std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+ std::vector<uint32_t> targetCodepoints = _is_cased ? cased_convert_to_ucs4(target) : LowerCase::convert_to_ucs4(target);
if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty
std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size);
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index 12cc039706f..dcebc56babb 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -25,6 +25,7 @@ private:
uint32_t _max_edit_distance; // max edit distance
uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+ bool _is_cased;
std::vector<uint32_t> _folded_term_codepoints;
@@ -32,38 +33,14 @@ private:
std::span<const uint32_t> _folded_term_codepoints_suffix;
public:
- FuzzyMatcher():
- _max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize),
- _folded_term_codepoints(),
- _folded_term_codepoints_prefix(),
- _folded_term_codepoints_suffix()
- {}
+ FuzzyMatcher();
- FuzzyMatcher(const std::vector<uint32_t>&& codepoints):
- _max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize),
- _folded_term_codepoints(codepoints),
- _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
- _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
- {}
-
- FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size):
- _max_edit_distance(max_edit_distance),
- _prefix_size(prefix_size),
- _folded_term_codepoints(codepoints),
- _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
- _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
- {}
+ FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased);
[[nodiscard]] bool isMatch(std::string_view target) const;
[[nodiscard]] vespalib::string getPrefix() const;
- ///
-
- static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength);
-
static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);