diff options
author | Alexey Chernyshev <aleksei@spotify.com> | 2022-05-02 14:45:30 +0200 |
---|---|---|
committer | Alexey Chernyshev <aleksei@spotify.com> | 2022-05-04 10:40:35 +0200 |
commit | 2aa476818678187d1dcb88ff44e137608b439d50 (patch) | |
tree | 7bb19cf6ea39161cae90e7133ff7569a3ec300ab | |
parent | 7585981b32937d2b13da1a8f94b42c8a0833a4c2 (diff) |
Supporting cased match for fuzzy operator
4 files changed, 54 insertions, 42 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index cd233b75438..5df0efe256e 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -23,14 +23,15 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) } else { _regex = vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); } + } else if (isFuzzy()) { + _fuzzyMatcher = vespalib::FuzzyMatcher( + term.getTerm(), + term.getFuzzyMaxEditDistance(), + term.getFuzzyPrefixLength(), + isCased()); } else if (isCased()) { _term._char = term.getTerm(); _termLen = term.getTermLen(); - } else if (isFuzzy()) { - _fuzzyMatcher = vespalib::FuzzyMatcher::from_term( - term.getTerm(), - term.getFuzzyMaxEditDistance(), - term.getFuzzyPrefixLength()); } else { term.term(_term._ucs4); } @@ -45,13 +46,13 @@ StringSearchHelper::isMatch(const char *src) const { if (__builtin_expect(isRegex(), false)) { return getRegex().valid() ? getRegex().partial_match(std::string_view(src)) : false; } + if (__builtin_expect(isFuzzy(), false)) { + return getFuzzyMatcher().isMatch(src); + } if (__builtin_expect(isCased(), false)) { int res = strncmp(_term._char, src, _termLen); return (res == 0) && (src[_termLen] == 0 || isPrefix()); } - if (__builtin_expect(isFuzzy(), false)) { - return getFuzzyMatcher().isMatch(src); - } vespalib::Utf8ReaderForZTS u8reader(src); uint32_t j = 0; uint32_t val; diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp index 1395915fa5f..a0e261fb925 100644 --- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp +++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp @@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) { } TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { - FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0); + FuzzyMatcher fuzzy("abc", 2, 0, false); EXPECT_TRUE(fuzzy.isMatch("abc")); EXPECT_TRUE(fuzzy.isMatch("ABC")); EXPECT_TRUE(fuzzy.isMatch("ab1")); @@ -41,8 +41,16 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) { EXPECT_FALSE(fuzzy.isMatch("123")); } +TEST(FuzzyMatcherTest, fuzzy_match_cased) { + FuzzyMatcher fuzzy("abc", 2, 0, true); + EXPECT_TRUE(fuzzy.isMatch("abc")); + EXPECT_TRUE(fuzzy.isMatch("abC")); + EXPECT_TRUE(fuzzy.isMatch("aBC")); + EXPECT_FALSE(fuzzy.isMatch("ABC")); +} + TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { - FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2); + FuzzyMatcher fuzzy("abcdef", 2, 2, false); EXPECT_TRUE(fuzzy.isMatch("abcdef")); EXPECT_TRUE(fuzzy.isMatch("ABCDEF")); EXPECT_TRUE(fuzzy.isMatch("abcde1")); @@ -52,12 +60,12 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) { } TEST(FuzzyMatcherTest, get_prefix_is_empty) { - FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0); + FuzzyMatcher fuzzy("whatever", 2, 0, false); EXPECT_EQ(fuzzy.getPrefix(), ""); } TEST(FuzzyMatcherTest, term_is_empty) { - FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0); + FuzzyMatcher fuzzy("", 2, 0, false); EXPECT_TRUE(fuzzy.isMatch("")); EXPECT_TRUE(fuzzy.isMatch("a")); EXPECT_TRUE(fuzzy.isMatch("aa")); @@ -65,7 +73,7 @@ TEST(FuzzyMatcherTest, term_is_empty) { } TEST(FuzzyMatcherTest, get_prefix_non_empty) { - FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2); + FuzzyMatcher fuzzy("abcd", 2, 2, false); EXPECT_EQ(fuzzy.getPrefix(), "ab"); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp index 21fdb86a48d..89449944b73 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp @@ -6,8 +6,34 @@ #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> -vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) { - return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength); +namespace { + + std::vector<uint32_t> cased_convert_to_ucs4(std::string_view input) { + std::vector<uint32_t> result; + result.reserve(input.size()); + vespalib::Utf8Reader reader(input.data()); + while (reader.hasMore()) { + result.emplace_back(reader.getChar()); + } + return result; + } + +} // anonymous + +vespalib::FuzzyMatcher::FuzzyMatcher(): + _max_edit_distance(DefaultMaxEditDistance), + _prefix_size(DefaultPrefixSize), + _is_cased(false) +{} + +vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased): + _max_edit_distance(max_edit_distance), + _prefix_size(prefix_size), + _is_cased(is_cased) +{ + _folded_term_codepoints = _is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term); + _folded_term_codepoints_prefix = get_prefix(_folded_term_codepoints, _prefix_size); + _folded_term_codepoints_suffix = get_suffix(_folded_term_codepoints, _prefix_size); } std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) { @@ -29,7 +55,7 @@ std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<u } bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const { - std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target); + std::vector<uint32_t> targetCodepoints = _is_cased ? cased_convert_to_ucs4(target) : LowerCase::convert_to_ucs4(target); if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size); diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h index 12cc039706f..dcebc56babb 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h @@ -25,6 +25,7 @@ private: uint32_t _max_edit_distance; // max edit distance uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy + bool _is_cased; std::vector<uint32_t> _folded_term_codepoints; @@ -32,38 +33,14 @@ private: std::span<const uint32_t> _folded_term_codepoints_suffix; public: - FuzzyMatcher(): - _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize), - _folded_term_codepoints(), - _folded_term_codepoints_prefix(), - _folded_term_codepoints_suffix() - {} + FuzzyMatcher(); - FuzzyMatcher(const std::vector<uint32_t>&& codepoints): - _max_edit_distance(DefaultMaxEditDistance), - _prefix_size(DefaultPrefixSize), - _folded_term_codepoints(codepoints), - _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), - _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) - {} - - FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size): - _max_edit_distance(max_edit_distance), - _prefix_size(prefix_size), - _folded_term_codepoints(codepoints), - _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)), - _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size)) - {} + FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased); [[nodiscard]] bool isMatch(std::string_view target) const; [[nodiscard]] vespalib::string getPrefix() const; - /// - - static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength); - static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength); |