summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorAlexey Chernyshev <aleksei@spotify.com>2022-05-02 14:45:30 +0200
committerAlexey Chernyshev <aleksei@spotify.com>2022-05-04 10:40:35 +0200
commit2aa476818678187d1dcb88ff44e137608b439d50 (patch)
tree7bb19cf6ea39161cae90e7133ff7569a3ec300ab /vespalib
parent7585981b32937d2b13da1a8f94b42c8a0833a4c2 (diff)
Supporting cased match for fuzzy operator
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp18
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp32
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h29
3 files changed, 45 insertions, 34 deletions
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
index 1395915fa5f..a0e261fb925 100644
--- a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -33,7 +33,7 @@ TEST(FuzzyMatcherTest, get_suffix_edge_cases) {
}
TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abc", 2, 0);
+ FuzzyMatcher fuzzy("abc", 2, 0, false);
EXPECT_TRUE(fuzzy.isMatch("abc"));
EXPECT_TRUE(fuzzy.isMatch("ABC"));
EXPECT_TRUE(fuzzy.isMatch("ab1"));
@@ -41,8 +41,16 @@ TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
EXPECT_FALSE(fuzzy.isMatch("123"));
}
+TEST(FuzzyMatcherTest, fuzzy_match_cased) {
+ FuzzyMatcher fuzzy("abc", 2, 0, true);
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("abC"));
+ EXPECT_TRUE(fuzzy.isMatch("aBC"));
+ EXPECT_FALSE(fuzzy.isMatch("ABC"));
+}
+
TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcdef", 2, 2);
+ FuzzyMatcher fuzzy("abcdef", 2, 2, false);
EXPECT_TRUE(fuzzy.isMatch("abcdef"));
EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
EXPECT_TRUE(fuzzy.isMatch("abcde1"));
@@ -52,12 +60,12 @@ TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
}
TEST(FuzzyMatcherTest, get_prefix_is_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("whatever", 2, 0);
+ FuzzyMatcher fuzzy("whatever", 2, 0, false);
EXPECT_EQ(fuzzy.getPrefix(), "");
}
TEST(FuzzyMatcherTest, term_is_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("", 2, 0);
+ FuzzyMatcher fuzzy("", 2, 0, false);
EXPECT_TRUE(fuzzy.isMatch(""));
EXPECT_TRUE(fuzzy.isMatch("a"));
EXPECT_TRUE(fuzzy.isMatch("aa"));
@@ -65,7 +73,7 @@ TEST(FuzzyMatcherTest, term_is_empty) {
}
TEST(FuzzyMatcherTest, get_prefix_non_empty) {
- FuzzyMatcher fuzzy = FuzzyMatcher::from_term("abcd", 2, 2);
+ FuzzyMatcher fuzzy("abcd", 2, 2, false);
EXPECT_EQ(fuzzy.getPrefix(), "ab");
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
index 21fdb86a48d..89449944b73 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -6,8 +6,34 @@
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
-vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength) {
- return FuzzyMatcher(LowerCase::convert_to_ucs4(term), maxEditDistance, prefixLength);
+namespace {
+
+ std::vector<uint32_t> cased_convert_to_ucs4(std::string_view input) {
+ std::vector<uint32_t> result;
+ result.reserve(input.size());
+ vespalib::Utf8Reader reader(input.data());
+ while (reader.hasMore()) {
+ result.emplace_back(reader.getChar());
+ }
+ return result;
+ }
+
+} // anonymous
+
+vespalib::FuzzyMatcher::FuzzyMatcher():
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize),
+ _is_cased(false)
+{}
+
+vespalib::FuzzyMatcher::FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased):
+ _max_edit_distance(max_edit_distance),
+ _prefix_size(prefix_size),
+ _is_cased(is_cased)
+{
+ _folded_term_codepoints = _is_cased ? cased_convert_to_ucs4(term) : LowerCase::convert_to_ucs4(term);
+ _folded_term_codepoints_prefix = get_prefix(_folded_term_codepoints, _prefix_size);
+ _folded_term_codepoints_suffix = get_suffix(_folded_term_codepoints, _prefix_size);
}
std::span<const uint32_t> vespalib::FuzzyMatcher::get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength) {
@@ -29,7 +55,7 @@ std::span<const uint32_t> vespalib::FuzzyMatcher::get_suffix(const std::vector<u
}
bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
- std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+ std::vector<uint32_t> targetCodepoints = _is_cased ? cased_convert_to_ucs4(target) : LowerCase::convert_to_ucs4(target);
if (_prefix_size > 0) { // prefix comparison is meaningless if it's empty
std::span<const uint32_t> targetPrefix = get_prefix(targetCodepoints, _prefix_size);
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
index 12cc039706f..dcebc56babb 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -25,6 +25,7 @@ private:
uint32_t _max_edit_distance; // max edit distance
uint32_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+ bool _is_cased;
std::vector<uint32_t> _folded_term_codepoints;
@@ -32,38 +33,14 @@ private:
std::span<const uint32_t> _folded_term_codepoints_suffix;
public:
- FuzzyMatcher():
- _max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize),
- _folded_term_codepoints(),
- _folded_term_codepoints_prefix(),
- _folded_term_codepoints_suffix()
- {}
+ FuzzyMatcher();
- FuzzyMatcher(const std::vector<uint32_t>&& codepoints):
- _max_edit_distance(DefaultMaxEditDistance),
- _prefix_size(DefaultPrefixSize),
- _folded_term_codepoints(codepoints),
- _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
- _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
- {}
-
- FuzzyMatcher(const std::vector<uint32_t>&& codepoints, uint32_t max_edit_distance, uint32_t prefix_size):
- _max_edit_distance(max_edit_distance),
- _prefix_size(prefix_size),
- _folded_term_codepoints(codepoints),
- _folded_term_codepoints_prefix(get_prefix(_folded_term_codepoints, _prefix_size)),
- _folded_term_codepoints_suffix(get_suffix(_folded_term_codepoints, _prefix_size))
- {}
+ FuzzyMatcher(std::string_view term, uint32_t max_edit_distance, uint32_t prefix_size, bool is_cased);
[[nodiscard]] bool isMatch(std::string_view target) const;
[[nodiscard]] vespalib::string getPrefix() const;
- ///
-
- static FuzzyMatcher from_term(std::string_view term, uint32_t maxEditDistance, uint32_t prefixLength);
-
static std::span<const uint32_t> get_prefix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);
static std::span<const uint32_t> get_suffix(const std::vector<uint32_t>& termCodepoints, uint32_t prefixLength);