summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorAlexey Chernyshev <aleksei@spotify.com>2022-03-22 16:47:14 +0100
committerAlexey Chernyshev <aleksei@spotify.com>2022-03-23 16:21:05 +0100
commit6bcdc1ac1c1c3ce8b30472926098df989b9f7019 (patch)
tree1bec251b1711a093a196c25be896c680282abd24 /vespalib
parent32358e1689cded19a3c5d0213b0ef0c5329c1e33 (diff)
Addressing more comments
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/tests/fuzzy/CMakeLists.txt16
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy.cpp33
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp42
-rw-r--r--vespalib/src/tests/fuzzy/levenstein_distance_test.cpp39
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt3
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy.h60
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp29
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h59
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp (renamed from vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp)48
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h15
10 files changed, 202 insertions, 142 deletions
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt
index f58602296a5..2a415a9ad62 100644
--- a/vespalib/src/tests/fuzzy/CMakeLists.txt
+++ b/vespalib/src/tests/fuzzy/CMakeLists.txt
@@ -1,8 +1,18 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-vespa_add_executable(vespalib_fuzzy_test_app TEST
+vespa_add_executable(vespalib_fuzzy_matcher_test_app TEST
SOURCES
- fuzzy.cpp
+ fuzzy_matcher_test.cpp
DEPENDS
vespalib
+ GTest::GTest
)
-vespa_add_test(NAME vespalib_fuzzy_test_app COMMAND vespalib_fuzzy_test_app)
+vespa_add_test(NAME vespalib_fuzzy_matcher_test_app COMMAND vespalib_fuzzy_matcher_test_app)
+
+vespa_add_executable(vespalib_levenstein_distance_test_app TEST
+ SOURCES
+ levenstein_distance_test.cpp
+ DEPENDS
+ vespalib
+ GTest::GTest
+ )
+vespa_add_test(NAME vespalib_levenstein_distance_test_app COMMAND vespalib_levenstein_distance_test_app) \ No newline at end of file
diff --git a/vespalib/src/tests/fuzzy/fuzzy.cpp b/vespalib/src/tests/fuzzy/fuzzy.cpp
deleted file mode 100644
index 9ffb77b3742..00000000000
--- a/vespalib/src/tests/fuzzy/fuzzy.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/testkit/test_kit.h>
-#include <vespa/vespalib/fuzzy/fuzzy.h>
-
-using namespace vespalib;
-
-
-TEST("require that levenstein distance works") {
- EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "abc", 2).value());
- EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "ABC", 2).value());
- EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("abc", "abd", 2).value());
- EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("ABC", "abd", 2).value());
- EXPECT_EQUAL(2u, Fuzzy::levenstein_distance("ABC", "add", 2).value());
- EXPECT_FALSE(Fuzzy::levenstein_distance("ABC", "ddd", 2).has_value());
-}
-
-TEST("require that extracting of a prefix works") {
- Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 2, 2);
- EXPECT_EQUAL("pr", fuzzy.getPrefix());
-}
-
-TEST("require that empty prefix works") {
- Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 0, 2);
- EXPECT_EQUAL("", fuzzy.getPrefix());
-}
-
-TEST("require that longer prefix size works") {
- Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 100, 2);
- EXPECT_EQUAL("prefix", fuzzy.getPrefix());
-}
-
-
-TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
new file mode 100644
index 00000000000..60a4eab3f57
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/fuzzy_matcher_test.cpp
@@ -0,0 +1,42 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+
+FuzzyMatcher from_term(std::string_view term, uint8_t threshold, uint8_t prefix_size) {
+ return {LowerCase::convert_to_ucs4(term), threshold, prefix_size};
+}
+
+TEST(FuzzyMatcherTest, fuzzy_match_empty_prefix) {
+ FuzzyMatcher fuzzy = from_term("abc", 2, 0);
+ EXPECT_TRUE(fuzzy.isMatch("abc"));
+ EXPECT_TRUE(fuzzy.isMatch("ABC"));
+ EXPECT_TRUE(fuzzy.isMatch("ab1"));
+ EXPECT_TRUE(fuzzy.isMatch("a12"));
+ EXPECT_FALSE(fuzzy.isMatch("123"));
+}
+
+TEST(FuzzyMatcherTest, fuzzy_match_with_prefix) {
+ FuzzyMatcher fuzzy = from_term("abcdef", 2, 2);
+ EXPECT_TRUE(fuzzy.isMatch("abcdef"));
+ EXPECT_TRUE(fuzzy.isMatch("ABCDEF"));
+ EXPECT_TRUE(fuzzy.isMatch("abcde1"));
+ EXPECT_TRUE(fuzzy.isMatch("abcd12"));
+ EXPECT_FALSE(fuzzy.isMatch("abc123"));
+ EXPECT_TRUE(fuzzy.isMatch("12cdef")); // prefix match is not enforced
+}
+
+TEST(FuzzyMatcherTest, get_prefix_is_empty) {
+ FuzzyMatcher fuzzy = from_term("whatever", 2, 0);
+ EXPECT_EQ(fuzzy.getPrefix(), "");
+}
+
+TEST(FuzzyMatcherTest, get_prefix_non_empty) {
+ FuzzyMatcher fuzzy = from_term("abcd", 2, 2);
+ EXPECT_EQ(fuzzy.getPrefix(), "ab");
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp
new file mode 100644
index 00000000000..efdcc82fce1
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/levenstein_distance_test.cpp
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/fuzzy/levenstein_distance.h>
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ std::vector<uint32_t> leftCodepoints = vespalib::LowerCase::convert_to_ucs4(left);
+ std::vector<uint32_t> rightCodepoints = vespalib::LowerCase::convert_to_ucs4(right);
+
+ std::optional<uint32_t> leftRight = vespalib::LevensteinDistance::calculate(leftCodepoints,rightCodepoints, threshold);
+ std::optional<uint32_t> rightLeft = vespalib::LevensteinDistance::calculate(rightCodepoints,leftCodepoints, threshold);
+
+ EXPECT_EQ(leftRight, rightLeft); // should be independent whether left or right strings are swapped
+
+ return leftRight;
+}
+
+TEST(LevensteinDistance, calculate_edgecases) {
+ EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0});
+ EXPECT_EQ(calculate("abc", "ab1", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "1bc", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "a1c", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "ab", 2), std::optional{1});
+ EXPECT_EQ(calculate("abc", "abcd", 2), std::optional{1});
+ EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+ EXPECT_EQ(calculate("a", "", 2), std::optional{1});
+ EXPECT_EQ(calculate("ab", "", 2), std::optional{2});
+ EXPECT_EQ(calculate("abc", "", 2), std::nullopt);
+ EXPECT_EQ(calculate("abc", "123", 2), std::nullopt);
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
+
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
index 0c6c8f5c446..0f7583da1d7 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -1,7 +1,8 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(vespalib_vespalib_fuzzy OBJECT
SOURCES
- fuzzy.cpp
+ fuzzy_matcher.cpp
+ levenstein_distance.cpp
DEPENDS
)
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h
deleted file mode 100644
index 775c25445e3..00000000000
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#pragma once
-
-#include <string_view>
-#include <utility>
-#include <vector>
-#include <optional>
-
-#include <vespa/vespalib/stllike/string.h>
-
-namespace vespalib {
-
-class Fuzzy {
-private:
- static constexpr uint8_t DefaultPrefixSize = 0u;
- static constexpr uint8_t DefaultEditDistance = 2u;
-
- std::vector<uint32_t> _folded_term_codepoints;
-
- uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e non-fuzzy
- uint8_t _edit_distance; // max edit distance
-
-
-public:
- Fuzzy():
- _folded_term_codepoints(),
- _prefix_size(DefaultPrefixSize),
- _edit_distance(DefaultEditDistance)
- {}
-
- Fuzzy(std::vector<uint32_t> codepoints):
- _folded_term_codepoints(std::move(codepoints)),
- _prefix_size(DefaultPrefixSize),
- _edit_distance(DefaultEditDistance)
- {}
-
- Fuzzy(std::vector<uint32_t> codepoints, uint8_t prefix_size, uint8_t edit_distance):
- _folded_term_codepoints(std::move(codepoints)),
- _prefix_size(prefix_size),
- _edit_distance(edit_distance)
- {}
-
- [[nodiscard]] bool isMatch(std::string_view src) const;
-
- [[nodiscard]] vespalib::string getPrefix() const;
-
- ///
-
- static Fuzzy from_term(std::string_view term);
-
- static std::optional<uint32_t> levenstein_distance(const std::vector<uint32_t>& source, const std::vector<uint32_t>& target, uint32_t threshold);
-
- static std::optional<uint32_t> levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold);
-
- static std::vector<uint32_t> folded_codepoints(const char* src, size_t srcSize);
-
- static std::vector<uint32_t> folded_codepoints(std::string_view src);
-
-};
-
-}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
new file mode 100644
index 00000000000..495e08c42e3
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.cpp
@@ -0,0 +1,29 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "fuzzy_matcher.h"
+#include "levenstein_distance.h"
+
+#include <vespa/vespalib/text/lowercase.h>
+#include <vespa/vespalib/text/utf8.h>
+
+vespalib::FuzzyMatcher vespalib::FuzzyMatcher::from_term(std::string_view term) {
+ return FuzzyMatcher(LowerCase::convert_to_ucs4(term));
+}
+
+bool vespalib::FuzzyMatcher::isMatch(std::string_view target) const {
+ std::vector<uint32_t> targetCodepoints = LowerCase::convert_to_ucs4(target);
+
+ return LevensteinDistance::calculate(
+ _folded_term_codepoints,
+ targetCodepoints,
+ _max_edit_distance).has_value();
+}
+
+vespalib::string vespalib::FuzzyMatcher::getPrefix() const {
+ vespalib::string prefix;
+ Utf8Writer writer(prefix);
+ for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
+ writer.putChar(_folded_term_codepoints.at(i));
+ }
+ return prefix;
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
new file mode 100644
index 00000000000..bfcf98d79fd
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matcher.h
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <string_view>
+#include <vector>
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace vespalib {
+
+/**
+ * Fuzzy matching between a lowercased instances of query and document terms based on Levenstein distance.
+ * Class has two main parameters:
+ * - prefix size, i.e the size of the prefix that is considered frozen.
+ * - max edit distance, i.e an upper bound for edit distance for it to be a match between terms
+ * Note that prefix size is not impacting matching logic,
+ * it's expected to be used to generate prefix for the lookup in the BTree for the fast-search logic.
+ * But there is no code to actually enforcing it.
+ */
+class FuzzyMatcher {
+private:
+ static constexpr uint8_t DefaultPrefixSize = 0u;
+ static constexpr uint8_t DefaultMaxEditDistance = 2u;
+
+ std::vector<uint32_t> _folded_term_codepoints;
+
+ uint8_t _max_edit_distance; // max edit distance
+ uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e. non-fuzzy
+
+public:
+ FuzzyMatcher():
+ _folded_term_codepoints(),
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize)
+ {}
+
+ FuzzyMatcher(std::vector<uint32_t> codepoints):
+ _folded_term_codepoints(std::move(codepoints)),
+ _max_edit_distance(DefaultMaxEditDistance),
+ _prefix_size(DefaultPrefixSize)
+ {}
+
+ FuzzyMatcher(std::vector<uint32_t> codepoints, uint8_t max_edit_distance, uint8_t prefix_size):
+ _folded_term_codepoints(std::move(codepoints)),
+ _max_edit_distance(max_edit_distance),
+ _prefix_size(prefix_size)
+ {}
+
+ [[nodiscard]] bool isMatch(std::string_view target) const;
+
+ [[nodiscard]] vespalib::string getPrefix() const;
+
+ ///
+
+ static FuzzyMatcher from_term(std::string_view term);
+
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp
index f4744101a97..5ee0f2d4a2d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.cpp
@@ -1,43 +1,16 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
// Levenstein distance algorithm is based off Java implementation from apache commons-text library licensed under the Apache 2.0 license.
-#include "fuzzy.h"
+#include "levenstein_distance.h"
#include <limits>
-#include <vespa/vespalib/text/utf8.h>
-#include <vespa/vespalib/text/lowercase.h>
-
-vespalib::Fuzzy vespalib::Fuzzy::from_term(std::string_view term) {
- return Fuzzy(folded_codepoints(term));
-}
-
-std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(const char* src, size_t srcSize) {
- std::vector<uint32_t> result;
- result.reserve(srcSize);
- Utf8ReaderForZTS srcReader(src);
- while (srcReader.hasMore()) {
- result.emplace_back(LowerCase::convert(srcReader.getChar()));
- }
- return result;
-}
-
-std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(std::string_view src) {
- return folded_codepoints(src.data(), src.size());
-}
-
-std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold) {
- std::vector<uint32_t> sourceCodepoints = folded_codepoints(source.data(), source.size());
- std::vector<uint32_t> targetCodepoints = folded_codepoints(target.data(), target.size());
- return levenstein_distance(sourceCodepoints, targetCodepoints, threshold);
-}
-
-std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) {
+std::optional<uint32_t> vespalib::LevensteinDistance::calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) {
uint32_t n = left.size();
uint32_t m = right.size();
if (n > m) {
- return levenstein_distance(right, left, threshold);
+ return calculate(right, left, threshold);
}
// if one string is empty, the edit distance is necessarily the length
@@ -83,7 +56,6 @@ std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<u
n : std::min(n, j + threshold);
// ignore entry left of leftmost
- // printf("j = %d, min = %d, max = %d, n = %d\n", j, min, max, n);
if (min > 1) {
d[min - 1] = std::numeric_limits<uint32_t>::max();
}
@@ -112,17 +84,3 @@ std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<u
}
return std::nullopt;
}
-
-bool vespalib::Fuzzy::isMatch(std::string_view src) const {
- std::vector<uint32_t> srcCodepoints = folded_codepoints(src);
- return levenstein_distance(_folded_term_codepoints, srcCodepoints, _edit_distance).has_value();
-}
-
-vespalib::string vespalib::Fuzzy::getPrefix() const {
- vespalib::string prefix;
- Utf8Writer writer(prefix);
- for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
- writer.putChar(_folded_term_codepoints.at(i));
- }
- return prefix;
-}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h
new file mode 100644
index 00000000000..e109db3178c
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenstein_distance.h
@@ -0,0 +1,15 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <optional>
+#include <cstdint>
+#include <vector>
+
+namespace vespalib {
+
+class LevensteinDistance {
+public:
+ static std::optional<uint32_t> calculate(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold);
+};
+
+}