summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorAlexey Chernyshev <aleksei@spotify.com>2022-03-10 16:33:07 +0100
committerAlexey Chernyshev <aleksei@spotify.com>2022-03-23 16:20:59 +0100
commitd9805209e3b0e33be3c0cc454c4604043663c1c4 (patch)
tree7446c79f68acd8775233ace4d5a70058f90c8406 /vespalib
parenta2b1e6654cabc90ddf7422e58adf641876e5201c (diff)
Introducing fuzzy search
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/CMakeLists.txt2
-rw-r--r--vespalib/src/tests/fuzzy/.gitignore4
-rw-r--r--vespalib/src/tests/fuzzy/CMakeLists.txt8
-rw-r--r--vespalib/src/tests/fuzzy/fuzzy.cpp33
-rw-r--r--vespalib/src/vespa/vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt7
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp144
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/fuzzy.h60
8 files changed, 259 insertions, 0 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index 29a0c1e7bb8..c3c7fee2cef 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -62,6 +62,7 @@ vespa_define_module(
src/tests/explore_modern_cpp
src/tests/false
src/tests/fiddle
+ src/tests/fuzzy
src/tests/gencnt
src/tests/guard
src/tests/host_name
@@ -169,6 +170,7 @@ vespa_define_module(
src/vespa/vespalib/data
src/vespa/vespalib/data/slime
src/vespa/vespalib/datastore
+ src/vespa/vespalib/fuzzy
src/vespa/vespalib/geo
src/vespa/vespalib/hwaccelrated
src/vespa/vespalib/io
diff --git a/vespalib/src/tests/fuzzy/.gitignore b/vespalib/src/tests/fuzzy/.gitignore
new file mode 100644
index 00000000000..cd5ca9b3b88
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+fuzzy_test
+vespalib_fuzzy_test_app
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt
new file mode 100644
index 00000000000..f58602296a5
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_fuzzy_test_app TEST
+ SOURCES
+ fuzzy.cpp
+ DEPENDS
+ vespalib
+ )
+vespa_add_test(NAME vespalib_fuzzy_test_app COMMAND vespalib_fuzzy_test_app)
diff --git a/vespalib/src/tests/fuzzy/fuzzy.cpp b/vespalib/src/tests/fuzzy/fuzzy.cpp
new file mode 100644
index 00000000000..9ffb77b3742
--- /dev/null
+++ b/vespalib/src/tests/fuzzy/fuzzy.cpp
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/vespalib/fuzzy/fuzzy.h>
+
+using namespace vespalib;
+
+
+TEST("require that levenstein distance works") {
+ EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "abc", 2).value());
+ EXPECT_EQUAL(0u, Fuzzy::levenstein_distance("abc", "ABC", 2).value());
+ EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("abc", "abd", 2).value());
+ EXPECT_EQUAL(1u, Fuzzy::levenstein_distance("ABC", "abd", 2).value());
+ EXPECT_EQUAL(2u, Fuzzy::levenstein_distance("ABC", "add", 2).value());
+ EXPECT_FALSE(Fuzzy::levenstein_distance("ABC", "ddd", 2).has_value());
+}
+
+TEST("require that extracting of a prefix works") {
+ Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 2, 2);
+ EXPECT_EQUAL("pr", fuzzy.getPrefix());
+}
+
+TEST("require that empty prefix works") {
+ Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 0, 2);
+ EXPECT_EQUAL("", fuzzy.getPrefix());
+}
+
+TEST("require that longer prefix size works") {
+ Fuzzy fuzzy(Fuzzy::folded_codepoints("prefix"), 100, 2);
+ EXPECT_EQUAL("prefix", fuzzy.getPrefix());
+}
+
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt
index dea0f509bad..1fffddf2f2f 100644
--- a/vespalib/src/vespa/vespalib/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(vespalib
$<TARGET_OBJECTS:vespalib_vespalib_data>
$<TARGET_OBJECTS:vespalib_vespalib_data_slime>
$<TARGET_OBJECTS:vespalib_vespalib_datastore>
+ $<TARGET_OBJECTS:vespalib_vespalib_fuzzy>
$<TARGET_OBJECTS:vespalib_vespalib_geo>
$<TARGET_OBJECTS:vespalib_vespalib_hwaccelrated>
$<TARGET_OBJECTS:vespalib_vespalib_io>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
new file mode 100644
index 00000000000..0c6c8f5c446
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vespalib_vespalib_fuzzy OBJECT
+ SOURCES
+ fuzzy.cpp
+ DEPENDS
+ )
+
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp
new file mode 100644
index 00000000000..55d11b74123
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.cpp
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Levenstein distance algorithm is based off Java implementation from apache commons-text library
+ */
+
+#include "fuzzy.h"
+
+#include <limits>
+
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/text/lowercase.h>
+
+vespalib::Fuzzy vespalib::Fuzzy::from_term(std::string_view term) {
+ return Fuzzy(folded_codepoints(term));
+}
+
+std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(const char* src, size_t srcSize) {
+ std::vector<uint32_t> result;
+ result.reserve(srcSize);
+ Utf8ReaderForZTS srcReader(src);
+ while (srcReader.hasMore()) {
+ result.emplace_back(LowerCase::convert(srcReader.getChar()));
+ }
+ return result;
+}
+
+std::vector<uint32_t> vespalib::Fuzzy::folded_codepoints(std::string_view src) {
+ return folded_codepoints(src.data(), src.size());
+}
+
+std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold) {
+ std::vector<uint32_t> sourceCodepoints = folded_codepoints(source.data(), source.size());
+ std::vector<uint32_t> targetCodepoints = folded_codepoints(target.data(), target.size());
+ return levenstein_distance(sourceCodepoints, targetCodepoints, threshold);
+}
+
+std::optional<uint32_t> vespalib::Fuzzy::levenstein_distance(const std::vector<uint32_t>& left, const std::vector<uint32_t>& right, uint32_t threshold) {
+ uint32_t n = left.size();
+ uint32_t m = right.size();
+
+ if (n > m) {
+ return levenstein_distance(right, left, threshold);
+ }
+
+ // if one string is empty, the edit distance is necessarily the length
+ // of the other
+ if (n == 0) {
+ return m <= threshold ? std::optional(m) : std::nullopt;
+ }
+ if (m == 0) {
+ return n <= threshold ? std::optional(n) : std::nullopt;
+ }
+
+
+ // the edit distance cannot be less than the length difference
+ if (m - n > threshold) {
+ return std::nullopt;
+ }
+
+ std::vector<uint32_t> p(n+1); // 'previous' cost array, horizontally
+ std::vector<uint32_t> d(n+1); // cost array, horizontally
+
+ const uint32_t boundary = std::min(n, threshold) + 1;
+
+ for (uint32_t i = 0; i < boundary; ++i) {
+ p[i] = i;
+ }
+ // these fills ensure that the value above the rightmost entry of our
+ // stripe will be ignored in following loop iterations
+ for (uint32_t i = boundary; i < p.size(); ++i) {
+ p[i] = std::numeric_limits<uint32_t>::max();
+ }
+ for (uint32_t i = 0; i < d.size(); ++i) {
+ d[i] = std::numeric_limits<uint32_t>::max();
+ }
+
+ // iterates through t
+ for (uint32_t j = 1; j <= m; ++j) {
+ uint32_t rightJ = right[j - 1]; // jth character of right
+ d[0] = j;
+
+ int32_t min = std::max(1, static_cast<int32_t>(j) - static_cast<int32_t>(threshold));
+
+ uint32_t max = j > std::numeric_limits<uint32_t>::max() - threshold ?
+ n : std::min(n, j + threshold);
+
+ // ignore entry left of leftmost
+ // printf("j = %d, min = %d, max = %d, n = %d\n", j, min, max, n);
+ if (min > 1) {
+ d[min - 1] = std::numeric_limits<uint32_t>::max();
+ }
+
+ uint32_t lowerBound = std::numeric_limits<uint32_t>::max();
+
+ for (uint32_t i = min; i <= max; ++i) {
+ if (left[i - 1] == rightJ) {
+ // diagonally left and up
+ d[i] = p[i - 1];
+ } else {
+ // 1 + minimum of cell to the left, to the top, diagonally
+ // left and up
+
+ d[i] = 1 + std::min(std::min(d[i - 1], p[i]), p[i - 1]);
+ }
+ lowerBound = std::min(lowerBound, d[i]);
+ }
+ if (lowerBound > threshold) {
+ return std::nullopt;
+ }
+ std::swap(p, d);
+ }
+ if (p[n] <= threshold) {
+ return {p[n]};
+ }
+ return std::nullopt;
+}
+
+bool vespalib::Fuzzy::isMatch(std::string_view src) const {
+ std::vector<uint32_t> srcCodepoints = folded_codepoints(src);
+ return levenstein_distance(_folded_term_codepoints, srcCodepoints, _edit_distance).has_value();
+}
+
+vespalib::string vespalib::Fuzzy::getPrefix() const {
+ vespalib::string prefix;
+ Utf8Writer writer(prefix);
+ for (uint8_t i=0; i <_prefix_size && i <_folded_term_codepoints.size(); ++i) {
+ writer.putChar(_folded_term_codepoints.at(i));
+ }
+ return prefix;
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h
new file mode 100644
index 00000000000..775c25445e3
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <string_view>
+#include <utility>
+#include <vector>
+#include <optional>
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace vespalib {
+
+class Fuzzy {
+private:
+ static constexpr uint8_t DefaultPrefixSize = 0u;
+ static constexpr uint8_t DefaultEditDistance = 2u;
+
+ std::vector<uint32_t> _folded_term_codepoints;
+
+ uint8_t _prefix_size; // prefix of a term that is considered frozen, i.e non-fuzzy
+ uint8_t _edit_distance; // max edit distance
+
+
+public:
+ Fuzzy():
+ _folded_term_codepoints(),
+ _prefix_size(DefaultPrefixSize),
+ _edit_distance(DefaultEditDistance)
+ {}
+
+ Fuzzy(std::vector<uint32_t> codepoints):
+ _folded_term_codepoints(std::move(codepoints)),
+ _prefix_size(DefaultPrefixSize),
+ _edit_distance(DefaultEditDistance)
+ {}
+
+ Fuzzy(std::vector<uint32_t> codepoints, uint8_t prefix_size, uint8_t edit_distance):
+ _folded_term_codepoints(std::move(codepoints)),
+ _prefix_size(prefix_size),
+ _edit_distance(edit_distance)
+ {}
+
+ [[nodiscard]] bool isMatch(std::string_view src) const;
+
+ [[nodiscard]] vespalib::string getPrefix() const;
+
+ ///
+
+ static Fuzzy from_term(std::string_view term);
+
+ static std::optional<uint32_t> levenstein_distance(const std::vector<uint32_t>& source, const std::vector<uint32_t>& target, uint32_t threshold);
+
+ static std::optional<uint32_t> levenstein_distance(std::string_view source, std::string_view target, uint32_t threshold);
+
+ static std::vector<uint32_t> folded_codepoints(const char* src, size_t srcSize);
+
+ static std::vector<uint32_t> folded_codepoints(std::string_view src);
+
+};
+
+}