diff options
Diffstat (limited to 'vespalib/src')
18 files changed, 2663 insertions, 3 deletions
diff --git a/vespalib/src/tests/fuzzy/CMakeLists.txt b/vespalib/src/tests/fuzzy/CMakeLists.txt index bc48e775711..00a89d0a604 100644 --- a/vespalib/src/tests/fuzzy/CMakeLists.txt +++ b/vespalib/src/tests/fuzzy/CMakeLists.txt @@ -16,3 +16,12 @@ vespa_add_executable(vespalib_levenshtein_distance_test_app TEST GTest::GTest ) vespa_add_test(NAME vespalib_levenshtein_distance_test_app COMMAND vespalib_levenshtein_distance_test_app) + +vespa_add_executable(vespalib_levenshtein_dfa_test_app TEST + SOURCES + levenshtein_dfa_test.cpp + DEPENDS + vespalib + GTest::GTest +) +vespa_add_test(NAME vespalib_levenshtein_dfa_test_app COMMAND vespalib_levenshtein_dfa_test_app) diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp new file mode 100644 index 00000000000..6966fd0b703 --- /dev/null +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -0,0 +1,507 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h> +#include <vespa/vespalib/fuzzy/dfa_stepping_base.h> +#include <vespa/vespalib/fuzzy/unicode_utils.h> +#include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes +#include <vespa/vespalib/util/benchmark_timer.h> +#include <charconv> +#include <concepts> +#include <filesystem> +#include <fstream> +#include <string> +#include <string_view> +#include <gtest/gtest.h> + +using namespace ::testing; +using namespace vespalib::fuzzy; +namespace fs = std::filesystem; + +static std::string benchmark_dictionary; + +struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> { + + static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); } + + static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) { + auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type()); + auto maybe_match_lhs = dfa_lhs.match(right, nullptr); + + auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type()); + auto maybe_match_rhs = dfa_rhs.match(left, nullptr); + + EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); + if (maybe_match_lhs.matches()) { + EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits()); + return {maybe_match_lhs.edits()}; + } + return std::nullopt; + } + + static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) { + std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size()); + std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size()); + return calculate(lhs_ch, rhs_ch, threshold); + } + +}; + +INSTANTIATE_TEST_SUITE_P(AllDfaTypes, + LevenshteinDfaTest, + Values(LevenshteinDfa::DfaType::Explicit, + LevenshteinDfa::DfaType::Implicit), + PrintToStringParamName()); + +// Same as existing non-DFA Levenshtein tests, but with some added instantiations +// for smaller max distances. 
+TEST_P(LevenshteinDfaTest, edge_cases_have_correct_edit_distance) { + EXPECT_EQ(calculate("abc", "abc", 2), std::optional{0}); + for (auto max : {1, 2}) { + EXPECT_EQ(calculate("abc", "ab1", max), std::optional{1}) << max; + EXPECT_EQ(calculate("abc", "1bc", max), std::optional{1}) << max; + EXPECT_EQ(calculate("abc", "a1c", max), std::optional{1}) << max; + EXPECT_EQ(calculate("abc", "ab", max), std::optional{1}) << max; + EXPECT_EQ(calculate("abc", "abcd", max), std::optional{1}) << max; + EXPECT_EQ(calculate("a", "", max), std::optional{1}) << max; + } + EXPECT_EQ(calculate("bc", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ab", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("cd", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("ad", "abcd", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "a12", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); + EXPECT_EQ(calculate("ab", "", 1), std::nullopt); + EXPECT_EQ(calculate("ab", "", 2), std::optional{2}); + EXPECT_EQ(calculate("abc", "", 2), std::nullopt); + EXPECT_EQ(calculate("abc", "123", 2), std::nullopt); +} + +TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { + // Each hiragana/katakana/kanji corresponds to multiple (3) UTF-8 chars but a single UTF-32 code point. 
+ EXPECT_EQ(calculate(u8"猫", u8"猫", 2), std::optional{0}); + EXPECT_EQ(calculate(u8"猫", u8"犬", 2), std::optional{1}); + EXPECT_EQ(calculate(u8"猫と犬", u8"犬と猫", 2), std::optional{2}); + EXPECT_EQ(calculate(u8"猫は好き", u8"犬が好き", 2), std::optional{2}); + EXPECT_EQ(calculate(u8"カラオケ", u8"カラオケ", 2), std::optional{0}); + EXPECT_EQ(calculate(u8"カラオケ", u8"カラoケ", 2), std::optional{1}); + EXPECT_EQ(calculate(u8"カラオケ", u8"カraオケ", 2), std::optional{2}); + EXPECT_EQ(calculate(u8"kaラオケ", u8"カラオケ", 2), std::optional{2}); + EXPECT_EQ(calculate(u8"カラオケ", u8"カラoke", 2), std::nullopt); +} + +void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) { + std::string successor; + auto m = dfa.match(source, &successor); + if (m.matches()) { + FAIL() << "Expected '" << source << "' to emit a successor, but it " + << "matched with " << static_cast<uint32_t>(m.edits()) + << " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)"; + } + EXPECT_EQ(successor, expected_successor); + EXPECT_TRUE(dfa.match(successor, nullptr).matches()); +} + +TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) { + auto dfa = LevenshteinDfa::build("food", 1, dfa_type()); + + test_dfa_successor(dfa, "", "\x01""food"); + test_dfa_successor(dfa, "faa", "faod"); + test_dfa_successor(dfa, "fooooo", "foop"); + test_dfa_successor(dfa, "ooof", "pfood"); + test_dfa_successor(dfa, "fo", "fo\x01""d"); + test_dfa_successor(dfa, "oo", "ood"); + test_dfa_successor(dfa, "ooo", "oood"); + test_dfa_successor(dfa, "foh", "fohd"); + test_dfa_successor(dfa, "foho", "fohod"); + test_dfa_successor(dfa, "foxx", "foyd"); + test_dfa_successor(dfa, "xfa", "xfood"); + test_dfa_successor(dfa, "gg", "good"); + test_dfa_successor(dfa, "gp", "hfood"); + test_dfa_successor(dfa, "ep", "f\x01""od"); + test_dfa_successor(dfa, "hfoodz", "hood"); + test_dfa_successor(dfa, "aooodz", "bfood"); + + // Also works with Unicode + // 2 chars + 
test_dfa_successor(dfa, "\xc3\x86""x", // "Æx" + "\xc3\x87""food"); // "Çfood" + // 3 chars + test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は" + "\xe7\x8c\xac""food"); // "猬food" + // 4 chars + test_dfa_successor(dfa, "\xf0\x9f\xa4\xa9""abc", // <starry eyed emoji>abc + "\xf0\x9f\xa4\xa9""food"); // <starry eyed emoji>food + + // Note that as a general rule, emojis are fickle beasts to deal with since a single + // emoji often takes up multiple code points, which we consider separate characters + // but a user sees as a single actual rendered glyph. + // Multi-code point character edit distance support is left as an exercise for the reader :D +} + +TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { + auto dfa = LevenshteinDfa::build("food", 1, dfa_type()); + // The successor string must be lexicographically larger than the input string. + // In the presence of a wildcard output edge we handle this by increasing the input + // character by 1 and encoding it back as UTF-8. + // It is possible (though arguably very unlikely) that the input character is + // U+10FFFF, which is the maximum valid Unicode character. We have to ensure that + // we can encode U+10FFFF + 1, even though it's technically outside the valid range. + // Luckily, UTF-8 can technically (there's that word again) encode up to U+1FFFFF, + // so the resulting string is byte-wise greater, and that's what matters since we + // don't guarantee that the successor string is _valid_ UTF-8. + // This problem does not happen with the target string, as it's an invalid character + // and will be replaced with the Unicode replacement char before we ever see it. 
+ test_dfa_successor(dfa, "\xf4\x8f\xbf\xbf""xyz", // U+10FFFF + "\xf4\x90\x80\x80""food");// U+10FFFF+1 +} + +TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) { + auto dfa = LevenshteinDfa::build("", 1, dfa_type()); + test_dfa_successor(dfa, "aa", "b"); + test_dfa_successor(dfa, "b\x01", "c"); + test_dfa_successor(dfa, "vespa", "w"); +} + +// We should normally be able to rely on higher-level components to ensure we +// only receive valid UTF-8, but make sure we don't choke on it if we do get it. +TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) { + // 0xff is not a valid encoding and is implicitly converted to U+FFFD, + // which is the standard Unicode replacement character. + EXPECT_EQ(calculate("\xff", "a", 2), std::optional{1}); + EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2}); + EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1}); + EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt); + EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0}); +} + +TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) { + EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument); + EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument); +} + +// Turn integer v into its bitwise string representation with the MSB as the leftmost character. +template <std::unsigned_integral T> +std::string bits_to_str(T v) { + constexpr const uint8_t n_bits = sizeof(T) * 8; + std::string ret(n_bits, '0'); + for (uint8_t bit = 0; bit < n_bits; ++bit) { + if (v & (1 << bit)) { + ret[n_bits - bit - 1] = '1'; + } + } + return ret; +} + +using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>; + +struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> { + // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number. 
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) { + std::ostringstream ss; + ss << std::get<0>(info.param) << '_' << std::get<1>(info.param); + return ss.str(); + } +}; + +INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, + LevenshteinDfaSuccessorTest, + Combine(Values(LevenshteinDfa::DfaType::Explicit, + LevenshteinDfa::DfaType::Implicit), + Values(1, 2)), + LevenshteinDfaSuccessorTest::stringify_params); + +/** + * Exhaustively test successor generation by matching all target and source strings + * in {0,1}^8 against each other. Since we generate bit strings identical to the + * bit patterns of the underlying counter(s), any string at index `i+1` will compare + * lexicographically greater than the one at `i`. We use this to test that we never + * miss a valid match that comes between a mismatch and its generated successor. + * + * For each mismatch we note the successor it emitted. Verify that each subsequent + * match() invocation for a source string < the successor results in a mismatch. + * + * We test this for both max edit distance 1 and 2. Despite being an exhaustive test, + * this completes in a few dozen milliseconds even with ASan instrumentation. + * + * Inspired by approach used by Lucene DFA exhaustive testing. 
+ */ +TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { + const auto [dfa_type, max_edits] = GetParam(); + for (uint32_t i = 0; i < 256; ++i) { + const auto target = bits_to_str(static_cast<uint8_t>(i)); + auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type); + std::string skip_to, successor; + for (uint32_t j = 0; j < 256; ++j) { + const auto source = bits_to_str(static_cast<uint8_t>(j)); + auto maybe_match = target_dfa.match(source, &successor); + if (maybe_match.matches() && !skip_to.empty()) { + ASSERT_GE(source, skip_to); + } else if (!maybe_match.matches()) { + ASSERT_FALSE(successor.empty()) << source; + ASSERT_GE(successor, skip_to) << source; + ASSERT_GT(successor, source) << source; + skip_to = successor; + } + } + } +} + +namespace { + +template <uint8_t MaxEdits> +void explore(const DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>& stepper, + const typename DfaSteppingBase<FixedMaxEditDistanceTraits<MaxEdits>>::StateType& in_state) +{ + ASSERT_EQ(stepper.can_match(stepper.step(in_state, WILDCARD)), + stepper.can_wildcard_step(in_state)); + if (!stepper.can_match(in_state)) { + return; // reached the end of the line + } + // DFS-explore all matching transitions, as well as one non-matching transition + auto t = stepper.transitions(in_state); + for (uint32_t c: t.u32_chars()) { + ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, c))); + } + ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.step(in_state, WILDCARD))); +} + +} // anon ns + +using StateStepperTypes = Types< + DfaSteppingBase<FixedMaxEditDistanceTraits<1>>, + DfaSteppingBase<FixedMaxEditDistanceTraits<2>> +>; + +template <typename SteppingBase> +struct LevenshteinSparseStateTest : Test {}; + +TYPED_TEST_SUITE(LevenshteinSparseStateTest, StateStepperTypes); + +// "Meta-test" for checking that the `can_wildcard_step` predicate function is +// functionally equivalent to evaluating `can_match(stepper.step(in_state, WILDCARD))` 
+TYPED_TEST(LevenshteinSparseStateTest, wildcard_step_predcate_is_equivalent_to_step_with_can_match) { + for (const char* target : {"", "a", "ab", "abc", "abcdef", "aaaaa"}) { + auto u32_target = utf8_string_to_utf32(target); + TypeParam stepper(u32_target); + ASSERT_NO_FATAL_FAILURE(explore(stepper, stepper.start())); + } +} + +template <typename T> +void do_not_optimize_away(T&& t) noexcept { + asm volatile("" : : "m"(t) : "memory"); // Clobber the value to avoid losing it to compiler optimizations +} + +enum class BenchmarkType { + DfaExplicit, + DfaImplicit, + Legacy +}; + +const char* to_s(BenchmarkType t) noexcept { + // Note: need underscores since this is used as part of GTest-generated test instance names + switch (t) { + case BenchmarkType::DfaExplicit: return "DFA_explicit"; + case BenchmarkType::DfaImplicit: return "DFA_implicit"; + case BenchmarkType::Legacy: return "legacy"; + } + abort(); +} + +[[nodiscard]] bool benchmarking_enabled() noexcept { + return !benchmark_dictionary.empty(); +} + +[[nodiscard]] std::vector<uint32_t> string_lengths() { + return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64}; +} + +struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> { + + static std::string stringify_params(const TestParamInfo<ParamType>& info) { + return to_s(info.param); + } + + void SetUp() override { + if (!benchmarking_enabled()) { + GTEST_SKIP() << "benchmarking not enabled"; + } + } + + static BenchmarkType benchmark_type() noexcept { return GetParam(); } + + static const std::vector<std::string>& load_dictionary_once() { + static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary)); + return sorted_lines; + } + + static std::vector<std::string> read_and_sort_all_lines(const fs::path& file_path) { + std::ifstream ifs(file_path); + if (!ifs.is_open()) { + throw std::invalid_argument("File does not exist"); + } + std::vector<std::string> lines; + std::string line; + while (std::getline(ifs, line)) { + 
lines.emplace_back(line); + } + std::sort(lines.begin(), lines.end()); + return lines; + } +}; + +INSTANTIATE_TEST_SUITE_P(AllDfaTypes, + LevenshteinBenchmarkTest, + Values(BenchmarkType::DfaExplicit, + BenchmarkType::DfaImplicit, + BenchmarkType::Legacy), + LevenshteinBenchmarkTest::stringify_params); + +// ("abc", 1) => "a" +// ("abc", 3) => "abc" +// ("abc", 7) => "abcabca" +// ... and so on. +std::string repeated_string(std::string_view str, uint32_t sz) { + uint32_t chunks = sz / str.size(); + std::string ret; + ret.reserve(sz); + for (uint32_t i = 0; i < chunks; ++i) { + ret += str; + } + uint32_t rem = sz % str.size(); + ret += str.substr(0, rem); + return ret; +} + +TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_time) { + using vespalib::BenchmarkTimer; + const auto type = benchmark_type(); + fprintf(stderr, "------ %s ------\n", to_s(type)); + for (uint8_t k : {1, 2}) { + for (uint32_t sz : string_lengths()) { + // Use same string as both source and target. This is the worst case in that the entire + // string must be matched and any sparse representation is always maximally filled since + // we never expend any edits via mismatches. + // Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.). + std::string str = repeated_string("abcde", sz); + double min_time_s; + if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) { + auto dfa_type = (type == BenchmarkType::DfaExplicit) ? 
LevenshteinDfa::DfaType::Explicit + : LevenshteinDfa::DfaType::Implicit; + auto dfa = LevenshteinDfa::build(str, k, dfa_type); + min_time_s = BenchmarkTimer::benchmark([&] { + auto res = dfa.match(str, nullptr); // not benchmarking successor generation + do_not_optimize_away(res); + }, 1.0); + } else { + min_time_s = BenchmarkTimer::benchmark([&] { + auto str_u32 = utf8_string_to_utf32(str); // Must be done per term, so included in benchmark body + auto res = vespalib::LevenshteinDistance::calculate(str_u32, str_u32, k); + do_not_optimize_away(res); + }, 1.0); + } + fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0); + } + } +} + +TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) { + if (!benchmarking_enabled()) { + GTEST_SKIP() << "benchmarking not enabled"; + } + using vespalib::BenchmarkTimer; + for (uint8_t k : {1, 2}) { + for (uint32_t sz : string_lengths()) { + std::string str = repeated_string("abcde", sz); + double min_time_s = BenchmarkTimer::benchmark([&] { + auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit); + do_not_optimize_away(dfa); + }, 2.0); + auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit); + size_t mem_usage = dfa.memory_usage(); + fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage); + } + } +} + +TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) { + using vespalib::BenchmarkTimer; + const auto type = benchmark_type(); + const auto dict = load_dictionary_once(); + std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64}; + fprintf(stderr, "------ %s ------\n", to_s(type)); + for (uint8_t k : {1, 2}) { + for (uint32_t sz : target_lengths) { + std::string str = repeated_string("abcde", sz); + double min_time_s; + if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) { + auto dfa_type = (type == BenchmarkType::DfaExplicit) ? 
LevenshteinDfa::DfaType::Explicit + : LevenshteinDfa::DfaType::Implicit; + auto dfa = LevenshteinDfa::build(str, k, dfa_type); + min_time_s = BenchmarkTimer::benchmark([&] { + for (const auto& line : dict) { + auto res = dfa.match(line, nullptr); + do_not_optimize_away(res); + } + }, 2.0); + } else { + min_time_s = BenchmarkTimer::benchmark([&] { + auto target_u32 = utf8_string_to_utf32(str); + for (const auto& line : dict) { + auto line_u32 = utf8_string_to_utf32(line); + auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k); + do_not_optimize_away(res); + } + }, 2.0); + } + fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0); + } + } +} + +TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) { + const auto type = benchmark_type(); + if (type == BenchmarkType::Legacy) { + GTEST_SKIP() << "Skipping not supported for legacy implementation"; + } + using vespalib::BenchmarkTimer; + const auto dict = load_dictionary_once(); + std::vector target_lengths = {1, 2, 4, 8, 12, 16, 24, 32, 64}; + fprintf(stderr, "------ %s ------\n", to_s(type)); + for (uint8_t k : {1, 2}) { + for (uint32_t sz : target_lengths) { + std::string str = repeated_string("abcde", sz); + auto dfa_type = (type == BenchmarkType::DfaExplicit) ? 
LevenshteinDfa::DfaType::Explicit + : LevenshteinDfa::DfaType::Implicit; + auto dfa = LevenshteinDfa::build(str, k, dfa_type); + double min_time_s = BenchmarkTimer::benchmark([&] { + auto iter = dict.cbegin(); + auto end = dict.cend(); + std::string successor; + while (iter != end) { + auto maybe_match = dfa.match(*iter, &successor); + if (maybe_match.matches()) { + ++iter; + } else { + iter = std::lower_bound(iter, end, successor); + } + } + }, 2.0); + fprintf(stderr, "k=%u, sz=%u: \t%g us\n", k, sz, min_time_s * 1000000.0); + } + } +} + +// TODO: +// - explicit successor generation benchmark + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + if (argc > 1) { + benchmark_dictionary = argv[1]; + if (!fs::exists(fs::path(benchmark_dictionary))) { + fprintf(stderr, "Benchmark dictionary file '%s' does not exist\n", benchmark_dictionary.c_str()); + return 1; + } + } + return RUN_ALL_TESTS(); +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt index 1d770163e06..bdbb03bcfee 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -1,8 +1,12 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(vespalib_vespalib_fuzzy OBJECT - SOURCES + SOURCES + explicit_levenshtein_dfa.cpp fuzzy_matcher.cpp + implicit_levenshtein_dfa.cpp + levenshtein_dfa.cpp levenshtein_distance.cpp - DEPENDS - ) + unicode_utils.cpp + DEPENDS +) diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h new file mode 100644 index 00000000000..c445c60cc01 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h @@ -0,0 +1,70 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <concepts> +#include <cstdint> + +namespace vespalib::fuzzy { + +// Concept that all DFA matcher implementations must satisfy +template <typename T> +concept DfaMatcher = requires(T a) { + typename T::StateType; + typename T::StateParamType; + typename T::EdgeType; + + // Initial (starting) state of the DFA + { a.start() } -> std::same_as<typename T::StateType>; + + // Whether a given state constitutes a string match within the maximum number of edits + { a.is_match(typename T::StateType{}) } -> std::same_as<bool>; + + // Whether a given state _may_ result in a match, either in the given state or in the + // future if the remaining string input is within the max edit distance + { a.can_match(typename T::StateType{}) } -> std::same_as<bool>; + + // Whether the given state is a valid state. Used for invariant checking. + { a.valid_state(typename T::StateType{}) } -> std::same_as<bool>; + + // Iff the given state represents a terminal matching state, returns the number of + // edits required to reach the state. Otherwise, returns max edits + 1. + { a.match_edit_distance(typename T::StateType{}) } -> std::same_as<uint8_t>; + + // Returns the state that is the result of matching the single logical Levenshtein + // matrix row represented by the given state with the input u32 character value. + { a.match_input(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::StateType>; + + // Returns the state that is the result of matching the single logical Levenshtein + // matrix row represented by the given state with a sentinel character that cannot + // match any character in the target string (i.e. is always a mismatch). + { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>; + + // Whether there exists an out edge from the given state that can accept a + // _higher_ UTF-32 code point value (character) than the input u32 value. Such an + // edge _may_ be a wildcard edge, which accepts any character. 
+ { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>; + + // Whether there exists an out edge from the given state whose u32 character value + // _exactly_ matches the input u32 value. + { a.has_exact_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>; + + // Returns the out edge `e` from the given state that satisfies _both_: + // 1. higher than the given u32 value + // 2. no other out edges are lower than `e` + // Only called in a context where the caller already knows that such an edge must exist. + { a.lowest_higher_explicit_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<typename T::EdgeType>; + + // Returns the out edge from the given state that has the lowest character value + { a.smallest_explicit_out_edge(typename T::StateType{}) } -> std::same_as<typename T::EdgeType>; + + // Whether the given edge is a valid edge. Used for invariant checking. + { a.valid_edge(typename T::EdgeType{}) } -> std::same_as<bool>; + + // For a given edge, returns the UTF-32 code point value the edge represents + { a.edge_to_u32char(typename T::EdgeType{}) } -> std::same_as<uint32_t>; + + // Returns the state that is the result of following the given edge from the given state. + { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>; +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h new file mode 100644 index 00000000000..7e7881c5a14 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_stepping_base.h @@ -0,0 +1,299 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "sparse_state.h" +#include <span> + +namespace vespalib::fuzzy { + +template <typename Traits> +struct DfaSteppingBase { + using StateType = Traits::StateType; + using TransitionsType = Traits::TransitionsType; + + std::span<const uint32_t> _u32_str; // TODO std::u32string_view + + DfaSteppingBase(std::span<const uint32_t> str) noexcept + : _u32_str(str) + { + } + + [[nodiscard]] static constexpr uint8_t max_edits() noexcept { + return Traits::max_edits(); + } + + /** + * Returns the initial state of the DFA. This represents the first row in the + * Levenshtein matrix. + */ + [[nodiscard]] StateType start() const { + StateType ret; + const auto j = std::min(static_cast<uint32_t>(max_edits()), + static_cast<uint32_t>(_u32_str.size())); // e.g. the empty string as target + for (uint32_t i = 0; i <= j; ++i) { + ret.append(i, i); + } + return ret; + } + + /** + * DFA stepping function that takes an input (sparse) state and a 32-bit character value + * (does not have to be valid UTF-32, but usually is) and generates a resulting state + * that represents applying the Levenshtein algorithm on a particular matrix row using + * the provided source string character. + * + * The returned state only includes elements where the edit distance (cost) is within + * the maximum number of edits. All other elements are implicitly beyond the max + * edit distance. It doesn't matter _how_ far beyond they are, since we have a fixed + * maximum to consider. + * + * Stepping a non-matching state S (can_match(S) == false) results in another non- + * matching state. + * + * As an example, this is a visualization of stepping through all source characters of + * the string "fxod" when matching the target string "food" with max edits k=1. + * Note: the actual internal representation is logical <column#, cost> tuples, but + * rendering as a matrix makes things easier to understand. Elements _not_ part of the + * state are rendered as '-'. 
+ * + * f o o d + * start(): [0 1 - - -] + * 'f': [1 0 1 - -] + * 'x': [- 1 1 - -] + * 'o': [- - 1 1 -] + * 'd': [- - - - 1] + * + * In this case, the resulting edit distance is 1, with one substitution 'x' -> 'o'. + * + * If we pull out our trusty pen & paper and do the full matrix calculations, we see + * that the above is equivalent to the full matrix with all costs > k pruned away: + * + * f o o d + * [0 1 2 3 4] + * f [1 0 1 2 3] + * x [2 1 1 2 3] + * o [3 2 1 1 2] + * d [4 3 2 2 1] + * + * Since we're working on sparse states, stepping requires a bit of manual edge case + * handling when compared to a dense representation. + * + * We first have to handle the case where our state includes the 0th matrix column. + * In an explicit Levenshtein matrix of target string length n, source string length m, + * the first column is always the values [0, m], increasing with 1 per row (the first + * _row_ is handled by start()). + * + * To mirror this, if our sparse state includes column 0 we have to increment it by 1, + * unless doing so would bring the cost beyond our max number of edits, in which case + * we don't bother including the column in the new state at all. These correspond to + * the start() -> 'f' -> 'x' transitions in the example above. + * + * What remains is then to do the actual Levenshtein insert/delete/substitute formula + * for matching positions in the matrix. Let d represent the logical (full) Levenshtein + * distance matrix and cell d[i, j] be the minimum number of edits between source string + * character at i+1 and target string character at j+1: + * + * Insertion cost: d[i, j-1] + 1 + * Deletion cost: d[i-1, j] + 1 + * Substitution cost: d[i-1, j-1] + (s[i-1] == t[j-1] ? 
0 : 1) + * + * d[i, j] = min(Insertion cost, Deletion cost, Substitution cost) + * + * We have to turn this slightly on the head, as instead of going through a matrix row + * and "pulling" values from the previous row, we have to go through a state representing + * the previous row and "push" new values instead (iff these values are within max edits). + * This also means we compute costs for indexes offset by 1 from the source state index + * (can be visualized as the element one down diagonally to the right). + * + * Insertion considers the current row only, i.e. the state being generated. We always + * work left to right in column order, so we can check if the last element (if any) + * in our _new_ sparse state is equal to the index of our source state element. If not, + * we know that it was beyond max edits. Max edits + 1 is inherently beyond max edits + * and need not be included. + * + * Deletion considers the cell directly above our own, which is part of the input state + * if it exists. Since we're computing the costs of cells at index + 1, we know that the + * only way for this cell to be present in the state is if the _next_ element of our + * input state exists and has an index equal to index + 1. If so, the deletion cost is + * the cost recorded for this element + 1. + * + * Substitution considers the cell diagonally up to the left. This very conveniently + * happens to be the input state cell we're currently working from, so it's therefore + * always present. + * + * Example stepping with c='x', max edits k=1: + * + * ====== Initially ====== + * + * f o o d + * state_in: [1 0 1 - -] (0:1, 1:0, 2:1) + * out: [] () + * + * We have a 0th column in state_in, but incrementing it results in 2 > k, so not + * appended to out. 
+ * + * ====== State (0:1), computing for index 1 ====== + * + * Insertion: out state is empty (no cell to our left), so implicit insertion cost + * is > k + * Deletion: state_in[1] is (1:0), which means it represents the cell just above + * index 1. Deletion cost is therefore 0+1 = 1 + * Substitution: (t[0] = 'f') != (c = 'x'), so substitution cost is 1+1 = 2 + * + * Min cost is 1, which is <= k. Appending to output. + * + * out: [- 1] (1:1) + * + * ====== State (1:0), computing for index 2 ====== + * + * Insertion: last element in out has index 1 (cell to our immediate left) with cost + * 1, so insertion cost is 1+1 = 2 + * Deletion: state_in[2] is (2:1), which means it represents the cell just above + * index 2. Deletion cost is therefore 1+1 = 2 + * Substitution: (t[1] = 'o') != (c = 'x'), so substitution cost is 0+1 = 1 + * + * Min cost is 1, which is <= k. Appending to output. + * + * out: [- 1 1] (1:1, 2:1) + * + * ====== State (2:1), computing for index 3 ====== + * + * Insertion: last element in out has index 2 (cell to our immediate left) with cost + * 1, so insertion cost is 1+1 = 2 + * Deletion: state_in[3] does not exist, so implicit deletion cost is > k + * Substitution: (t[2] = 'o') != (c = 'x'), so substitution cost is 1+1 = 2 + * + * Min cost is 2, which is > k. Not appending to output. 
 *
 * Resulting output state (right-padded for clarity):
 *
 *   [- 1 1 - -] (1:1, 2:1)
 *
 */
[[nodiscard]] StateType step(const StateType& state_in, uint32_t c) const {
    if (state_in.empty()) {
        return state_in; // A non-matching state can only step to another equally non-matching state
    }
    StateType new_state;
    // Column 0 consumes no target character, so its cost grows by exactly 1 per
    // stepped source character; keep it only while it remains within max edits.
    if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
        new_state.append(0, state_in.cost(0) + 1);
    }
    for (uint32_t i = 0; i < state_in.size(); ++i) {
        const auto idx = state_in.index(i);
        if (idx == _u32_str.size()) [[unlikely]] {
            break; // Can't process beyond matrix width
        }
        const uint8_t sub_cost = (_u32_str[idx] == c) ? 0 : 1;
        // For our Levenshtein insert/delete/sub ops, we know that if a particular index is _not_
        // in the sparse state, its implicit distance is beyond the max edits, and need not be
        // considered.
        auto dist = state_in.cost(i) + sub_cost; // (Substitution)
        if (!new_state.empty() && (new_state.last_index() == idx)) { // (Insertion) anything to our immediate left?
            dist = std::min(dist, new_state.last_cost() + 1);
        }
        if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) { // (Deletion) anything immediately above?
            dist = std::min(dist, state_in.cost(i + 1) + 1);
        }
        if (dist <= max_edits()) {
            // Costs are computed for indexes offset by 1 from the source element (diagonal down-right).
            new_state.append(idx + 1, dist);
        }
    }
    return new_state;
}

/**
 * Simplified version of step() which does not assemble a new state, but only checks
 * whether _any_ mismatching character can be substituted in and still result in a
 * potentially matching state. This is the case if the resulting state would contain
 * _at least one_ entry (recalling that we only retain entries that are within the
 * max number of edits).
 *
 * Consider using this directly instead of `can_match(step(state, WILDCARD))`,
 * which has the exact same semantics, but requires computing the full (sparse)
 * state before checking if it has any element at all.
can_wildcard_step() just
 * jumps straight to the last part.
 */
[[nodiscard]] bool can_wildcard_step(const StateType& state_in) const noexcept {
    if (state_in.empty()) {
        return false; // by definition
    }
    // Column 0: surviving it (cost < max edits) is enough for the stepped state to be non-empty.
    if ((state_in.index(0) == 0) && (state_in.cost(0) < max_edits())) {
        return true;
    }
    for (uint32_t i = 0; i < state_in.size(); ++i) {
        const auto idx = state_in.index(i);
        if (idx == _u32_str.size()) [[unlikely]] {
            break;
        }
        const uint8_t sub_cost = 1; // by definition
        auto dist = state_in.cost(i) + sub_cost; // (Substitution) with a guaranteed mismatching char
        // Insertion only looks at the entries already computed in the current row
        // and always increases the cost by 1. Since we always bail out immediately if
        // there would have been at least one entry within max edits, we transitively
        // know that since we have not bailed out yet there is no way we can get here
        // and have insertion actually yield a match. So skip computing it entirely.
        if ((i < state_in.size() - 1) && (state_in.index(i + 1) == idx + 1)) { // (Deletion) anything immediately above?
            dist = std::min(dist, state_in.cost(i + 1) + 1);
        }
        if (dist <= max_edits()) {
            return true; // At least one entry would survive; stepped state cannot be empty.
        }
    }
    return false;
}

/**
 * Checks if the given state represents a terminal state within the max number of edits
 */
[[nodiscard]] bool is_match(const StateType& state) const noexcept {
    // If the last index is equal to the string's length, it means we were able to match
    // the entire string and still be within the max edit distance.
    return (!state.empty() && state.last_index() == static_cast<uint32_t>(_u32_str.size()));
}

/**
 * Iff the input state represents a terminal matching state, returns the number of
 * edits required to reach the state. Otherwise, returns max edits + 1.
 */
[[nodiscard]] uint8_t match_edit_distance(const StateType& state) const noexcept {
    if (!is_match(state)) {
        return max_edits() + 1; // Sentinel; strictly greater than any possible matching distance
    }
    // The cost of the last (full-length) column is the edit distance of the whole string.
    return state.last_cost();
}

/**
 * Returns whether the given state _may_ end up matching the target string,
 * depending on the remaining source string characters.
 *
 * Note: is_match(s) => can_match(s) is true, but
 *       can_match(s) => is_match(s) is false
 */
[[nodiscard]] bool can_match(const StateType& state) const noexcept {
    // The presence of any entries at all indicates that we may still potentially match
    // the target string if the remaining input is within the maximum number of edits.
    return !state.empty();
}

/**
 * All valid character transitions from this state are those that are reachable
 * within the max edit distance.
 */
TransitionsType transitions(const StateType& state) const {
    TransitionsType t;
    for (size_t i = 0; i < state.size(); ++i) {
        const auto idx = state.index(i);
        if (idx < _u32_str.size()) [[likely]] {
            t.add_char(_u32_str[idx]); // Exact match of this target char is always a valid transition
        }
    }
    // We must ensure transitions are in increasing character order, so that the
    // lowest possible higher char than any candidate char can be found with a
    // simple "first-fit" linear scan.
+ t.sort(); + return t; + } + +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg new file mode 100644 index 00000000000..0974e1d161f --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/examples/food_dfa.svg @@ -0,0 +1,286 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><!-- Generated by graphviz version 2.40.1 (20161225.0304) + --><!-- Title: levenshtein_dfa Pages: 1 --><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="333pt" height="488pt" viewBox="0.00 0.00 333.00 488.00"> +<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 484)"> +<title>levenshtein_dfa</title> +<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-484 329,-484 329,4 -4,4"/> +<!-- 0 --> +<g id="node1" class="node"> +<title>0</title> +<ellipse fill="none" stroke="#000000" cx="211" cy="-462" rx="18" ry="18"/> +<text text-anchor="middle" x="211" y="-457.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">0</text> +</g> +<!-- 1 --> +<g id="node2" class="node"> +<title>1</title> +<ellipse fill="none" stroke="#000000" cx="157" cy="-373.2" rx="18" ry="18"/> +<text text-anchor="middle" x="157" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">1</text> +</g> +<!-- 0->1 --> +<g id="edge1" class="edge"> +<title>0->1</title> +<path fill="none" stroke="#000000" d="M201.5939,-446.5322C193.3592,-432.9906 181.2571,-413.0894 171.7374,-397.4348"/> +<polygon fill="#000000" stroke="#000000" points="174.6658,-395.5142 166.4795,-388.7885 168.6849,-399.1513 174.6658,-395.5142"/> +<text text-anchor="middle" x="189.9453" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text> +</g> +<!-- 2 --> +<g id="node3" class="node"> +<title>2</title> +<ellipse fill="none" 
stroke="#000000" cx="211" cy="-373.2" rx="18" ry="18"/> +<text text-anchor="middle" x="211" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">2</text> +</g> +<!-- 0->2 --> +<g id="edge2" class="edge"> +<title>0->2</title> +<path fill="none" stroke="#000000" d="M211,-443.6006C211,-431.4949 211,-415.4076 211,-401.6674"/> +<polygon fill="#000000" stroke="#000000" points="214.5001,-401.272 211,-391.272 207.5001,-401.2721 214.5001,-401.272"/> +<text text-anchor="middle" x="214.8913" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 3 --> +<g id="node4" class="node"> +<title>3</title> +<ellipse fill="none" stroke="#000000" cx="265" cy="-373.2" rx="18" ry="18"/> +<text text-anchor="middle" x="265" y="-369" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">3</text> +</g> +<!-- 0->3 --> +<g id="edge3" class="edge"> +<title>0->3</title> +<path fill="none" stroke="#000000" d="M220.4061,-446.5322C228.6408,-432.9906 240.7429,-413.0894 250.2626,-397.4348"/> +<polygon fill="#000000" stroke="#000000" points="253.3151,-399.1513 255.5205,-388.7885 247.3342,-395.5142 253.3151,-399.1513"/> +<text text-anchor="middle" x="244.7223" y="-413.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text> +</g> +<!-- 4 --> +<g id="node5" class="node"> +<title>4</title> +<ellipse fill="none" stroke="#000000" cx="157" cy="-284.4" rx="18" ry="18"/> +<text text-anchor="middle" x="157" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">4</text> +</g> +<!-- 1->4 --> +<g id="edge4" class="edge"> +<title>1->4</title> +<path fill="none" stroke="#000000" d="M157,-354.8006C157,-342.6949 157,-326.6076 157,-312.8674"/> +<polygon fill="#000000" stroke="#000000" points="160.5001,-312.472 157,-302.472 153.5001,-312.4721 160.5001,-312.472"/> +<text text-anchor="middle" x="158.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" 
font-size="14.00" fill="#000000">f</text> +</g> +<!-- 1->4 --> +<g id="edge6" class="edge"> +<title>1->4</title> +<path fill="none" stroke="#000000" d="M156.1231,-354.8902C155.8898,-349.221 155.6708,-342.9535 155.5554,-337.2 155.4057,-329.7348 155.4057,-327.8652 155.5554,-320.4 155.6041,-317.9728 155.6712,-315.454 155.7501,-312.9273"/> +<polygon fill="#000000" stroke="#000000" points="159.2558,-312.8309 156.1231,-302.7098 152.2605,-312.5755 159.2558,-312.8309"/> +<text text-anchor="middle" x="158.7223" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text> +</g> +<!-- 5 --> +<g id="node6" class="node"> +<title>5</title> +<ellipse fill="none" stroke="#000000" cx="103" cy="-284.4" rx="18" ry="18"/> +<text text-anchor="middle" x="103" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">5</text> +</g> +<!-- 1->5 --> +<g id="edge5" class="edge"> +<title>1->5</title> +<path fill="none" stroke="#000000" d="M147.5939,-357.7322C139.3592,-344.1906 127.2571,-324.2894 117.7374,-308.6348"/> +<polygon fill="#000000" stroke="#000000" points="120.6658,-306.7142 112.4795,-299.9885 114.6849,-310.3513 120.6658,-306.7142"/> +<text text-anchor="middle" x="138.8913" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 6 --> +<g id="node7" class="node"> +<title>6</title> +<ellipse fill="none" stroke="#000000" cx="307" cy="-284.4" rx="18" ry="18"/> +<text text-anchor="middle" x="307" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">6</text> +</g> +<!-- 2->6 --> +<g id="edge7" class="edge"> +<title>2->6</title> +<path fill="none" stroke="#000000" d="M224.3484,-360.8527C240.3968,-346.0079 267.511,-320.9274 286.2858,-303.5607"/> +<polygon fill="#000000" stroke="#000000" points="288.8006,-306.0022 293.765,-296.6424 284.0473,-300.8635 288.8006,-306.0022"/> +<text text-anchor="middle" x="268.9453" y="-324.6" 
font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text> +</g> +<!-- 7 --> +<g id="node8" class="node"> +<title>7</title> +<ellipse fill="none" stroke="#000000" cx="190" cy="-195.6" rx="18" ry="18"/> +<text text-anchor="middle" x="190" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">7</text> +</g> +<!-- 2->7 --> +<g id="edge8" class="edge"> +<title>2->7</title> +<path fill="none" stroke="#000000" d="M208.8708,-355.1934C205.2075,-324.212 197.6865,-260.606 193.3267,-223.7346"/> +<polygon fill="#000000" stroke="#000000" points="196.7849,-223.1741 192.1348,-213.6543 189.8333,-223.9961 196.7849,-223.1741"/> +<text text-anchor="middle" x="205.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 3->6 --> +<g id="edge9" class="edge"> +<title>3->6</title> +<path fill="none" stroke="#000000" d="M272.7034,-356.9127C278.881,-343.8515 287.6665,-325.2765 294.7989,-310.1966"/> +<polygon fill="#000000" stroke="#000000" points="298.0914,-311.4211 299.2032,-300.8848 291.7635,-308.4281 298.0914,-311.4211"/> +<text text-anchor="middle" x="290.9453" y="-324.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">f</text> +</g> +<!-- 8 --> +<g id="node9" class="node"> +<title>8</title> +<ellipse fill="none" stroke="#000000" cx="263" cy="-195.6" rx="18" ry="18"/> +<text text-anchor="middle" x="263" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">8</text> +</g> +<!-- 3->8 --> +<g id="edge10" class="edge"> +<title>3->8</title> +<path fill="none" stroke="#000000" d="M264.7972,-355.1934C264.4483,-324.212 263.732,-260.606 263.3168,-223.7346"/> +<polygon fill="#000000" stroke="#000000" points="266.8158,-223.6142 263.2033,-213.6543 259.8162,-223.6931 266.8158,-223.6142"/> +<text text-anchor="middle" x="267.8913" y="-280.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 
4->7 --> +<g id="edge11" class="edge"> +<title>4->7</title> +<path fill="none" stroke="#000000" d="M163.3627,-267.2785C168.1106,-254.5023 174.6869,-236.8062 180.1171,-222.194"/> +<polygon fill="#000000" stroke="#000000" points="183.4541,-223.2619 183.6568,-212.669 176.8925,-220.8234 183.4541,-223.2619"/> +<text text-anchor="middle" x="180.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 5->7 --> +<g id="edge14" class="edge"> +<title>5->7</title> +<path fill="none" stroke="#000000" d="M115.8371,-271.2973C130.1833,-256.6543 153.5826,-232.7709 170.2601,-215.7483"/> +<polygon fill="#000000" stroke="#000000" points="172.8948,-218.0603 177.393,-208.4678 167.8946,-213.1615 172.8948,-218.0603"/> +<text text-anchor="middle" x="157.7223" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text> +</g> +<!-- 9 --> +<g id="node10" class="node"> +<title>9</title> +<ellipse fill="#d3d3d3" stroke="#000000" cx="60" cy="-195.6" rx="18" ry="18"/> +<text text-anchor="middle" x="60" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">9(1)</text> +</g> +<!-- 5->9 --> +<g id="edge12" class="edge"> +<title>5->9</title> +<path fill="none" stroke="#000000" d="M95.1131,-268.1127C88.7885,-255.0515 79.7938,-236.4765 72.4916,-221.3966"/> +<polygon fill="#000000" stroke="#000000" points="75.4909,-219.5597 67.9825,-212.0848 69.1907,-222.6105 75.4909,-219.5597"/> +<text text-anchor="middle" x="89.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 10 --> +<g id="node11" class="node"> +<title>10</title> +<ellipse fill="#d3d3d3" stroke="#000000" cx="126" cy="-195.6" rx="18" ry="18"/> +<text text-anchor="middle" x="126" y="-191.4" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">10(1)</text> +</g> +<!-- 5->10 --> +<g id="edge13" class="edge"> +<title>5->10</title> +<path fill="none" 
stroke="#000000" d="M107.5441,-266.856C110.7887,-254.3287 115.2168,-237.2326 118.9222,-222.9264"/> +<polygon fill="#000000" stroke="#000000" points="122.3462,-223.6657 121.4654,-213.1076 115.5698,-221.9105 122.3462,-223.6657"/> +<text text-anchor="middle" x="120.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 6->8 --> +<g id="edge15" class="edge"> +<title>6->8</title> +<path fill="none" stroke="#000000" d="M298.9297,-268.1127C292.4172,-254.9694 283.1382,-236.2426 275.6413,-221.1125"/> +<polygon fill="#000000" stroke="#000000" points="278.5951,-219.1904 271.0191,-211.784 272.3229,-222.2984 278.5951,-219.1904"/> +<text text-anchor="middle" x="291.8913" y="-235.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 11 --> +<g id="node12" class="node"> +<title>11</title> +<ellipse fill="none" stroke="#000000" cx="170" cy="-106.8" rx="18" ry="18"/> +<text text-anchor="middle" x="170" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">11</text> +</g> +<!-- 7->11 --> +<g id="edge16" class="edge"> +<title>7->11</title> +<path fill="none" stroke="#000000" d="M185.9527,-177.63C183.1694,-165.2722 179.4189,-148.6197 176.2537,-134.5662"/> +<polygon fill="#000000" stroke="#000000" points="179.5848,-133.4268 173.973,-124.4402 172.7558,-134.9649 179.5848,-133.4268"/> +<text text-anchor="middle" x="184.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 12 --> +<g id="node13" class="node"> +<title>12</title> +<ellipse fill="#d3d3d3" stroke="#000000" cx="113" cy="-18" rx="18" ry="18"/> +<text text-anchor="middle" x="113" y="-13.8" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">12(1)</text> +</g> +<!-- 7->12 --> +<g id="edge17" class="edge"> +<title>7->12</title> +<path fill="none" stroke="#000000" d="M195.7305,-178.1814C201.8942,-156.377 209.2729,-118.3097 
197,-88.8 185.9488,-62.2279 158.7426,-42.3831 138.2647,-30.5667"/> +<polygon fill="#000000" stroke="#000000" points="139.8619,-27.4509 129.4116,-25.7072 136.4936,-33.5872 139.8619,-27.4509"/> +<text text-anchor="middle" x="206.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 8->11 --> +<g id="edge18" class="edge"> +<title>8->11</title> +<path fill="none" stroke="#000000" d="M249.6754,-182.8771C234.3441,-168.2382 208.9734,-144.0133 190.9836,-126.8359"/> +<polygon fill="#000000" stroke="#000000" points="192.9495,-123.8738 183.2999,-119.4993 188.1154,-128.9366 192.9495,-123.8738"/> +<text text-anchor="middle" x="227.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 13 --> +<g id="node14" class="node"> +<title>13</title> +<ellipse fill="#d3d3d3" stroke="#000000" cx="18" cy="-106.8" rx="18" ry="18"/> +<text text-anchor="middle" x="18" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">13(1)</text> +</g> +<!-- 9->13 --> +<g id="edge19" class="edge"> +<title>9->13</title> +<path fill="none" stroke="#000000" d="M44.2356,-186.804C34.7839,-180.5699 23.5878,-171.2526 18.2174,-159.6 14.7103,-151.9903 13.6799,-143.0834 13.8048,-134.7757"/> +<polygon fill="#000000" stroke="#000000" points="17.3023,-134.9283 14.4812,-124.716 10.3181,-134.4586 17.3023,-134.9283"/> +<text text-anchor="middle" x="22.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 9->13 --> +<g id="edge21" class="edge"> +<title>9->13</title> +<path fill="none" stroke="#000000" d="M52.2966,-179.3127C46.119,-166.2515 37.3335,-147.6765 30.2011,-132.5966"/> +<polygon fill="#000000" stroke="#000000" points="33.2365,-130.8281 25.7968,-123.2848 26.9086,-133.8211 33.2365,-130.8281"/> +<text text-anchor="middle" x="45.7223" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" 
fill="#000000">*</text> +</g> +<!-- 14 --> +<g id="node15" class="node"> +<title>14</title> +<ellipse fill="#d3d3d3" stroke="#000000" cx="72" cy="-106.8" rx="18" ry="18"/> +<text text-anchor="middle" x="72" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">14(0)</text> +</g> +<!-- 9->14 --> +<g id="edge20" class="edge"> +<title>9->14</title> +<path fill="none" stroke="#000000" d="M62.4284,-177.63C64.0874,-165.353 66.3193,-148.8372 68.2105,-134.8421"/> +<polygon fill="#000000" stroke="#000000" points="71.7044,-135.1219 69.5752,-124.7433 64.7675,-134.1845 71.7044,-135.1219"/> +<text text-anchor="middle" x="71.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 10->11 --> +<g id="edge22" class="edge"> +<title>10->11</title> +<path fill="none" stroke="#000000" d="M134.0703,-179.3127C140.5828,-166.1694 149.8618,-147.4426 157.3587,-132.3125"/> +<polygon fill="#000000" stroke="#000000" points="160.6771,-133.4984 161.9809,-122.984 154.4049,-130.3904 160.6771,-133.4984"/> +<text text-anchor="middle" x="155.8913" y="-147" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">o</text> +</g> +<!-- 10->12 --> +<g id="edge23" class="edge"> +<title>10->12</title> +<path fill="none" stroke="#000000" d="M124.6819,-177.5934C122.4142,-146.612 117.7583,-83.006 115.0594,-46.1346"/> +<polygon fill="#000000" stroke="#000000" points="118.5423,-45.772 114.3215,-36.0543 111.561,-46.2831 118.5423,-45.772"/> +<text text-anchor="middle" x="124.8913" y="-102.6" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 11->12 --> +<g id="edge24" class="edge"> +<title>11->12</title> +<path fill="none" stroke="#000000" d="M161.5849,-90.7772C155.8346,-80.1448 147.859,-65.9928 140,-54 137.0775,-49.5402 133.7942,-44.8997 130.5502,-40.4917"/> +<polygon fill="#000000" stroke="#000000" points="133.2767,-38.294 124.4657,-32.4105 127.6846,-42.5045 
133.2767,-38.294"/> +<text text-anchor="middle" x="153.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 13->12 --> +<g id="edge25" class="edge"> +<title>13->12</title> +<path fill="none" stroke="#000000" d="M29.0516,-92.0851C37.7254,-81.0086 50.4346,-65.7741 63.2174,-54 71.136,-46.7062 80.5495,-39.5675 89.0481,-33.5954"/> +<polygon fill="#000000" stroke="#000000" points="91.192,-36.3695 97.4722,-27.8367 87.2416,-30.5907 91.192,-36.3695"/> +<text text-anchor="middle" x="67.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 14->12 --> +<g id="edge26" class="edge"> +<title>14->12</title> +<path fill="none" stroke="#000000" d="M79.7118,-90.0974C85.7248,-77.0741 94.1818,-58.7574 101.0669,-43.8455"/> +<polygon fill="#000000" stroke="#000000" points="104.3085,-45.1739 105.3228,-34.6277 97.9532,-42.2395 104.3085,-45.1739"/> +<text text-anchor="middle" x="100.8913" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">d</text> +</g> +<!-- 14->12 --> +<g id="edge27" class="edge"> +<title>14->12</title> +<path fill="none" stroke="#000000" d="M71.9481,-88.4225C72.5604,-77.8966 74.4493,-64.6966 79.5554,-54 82.4978,-47.8362 86.8709,-42.0067 91.4921,-36.9052"/> +<polygon fill="#000000" stroke="#000000" points="94.2341,-39.1107 98.7881,-29.5445 89.2625,-34.1828 94.2341,-39.1107"/> +<text text-anchor="middle" x="82.7223" y="-58.2" font-family="Helvetica,Arial,sans-serif" font-size="14.00" fill="#000000">*</text> +</g> +</g> +</svg>
\ No newline at end of file diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp new file mode 100644 index 00000000000..f78de5cc082 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.cpp @@ -0,0 +1,11 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "explicit_levenshtein_dfa.hpp"

namespace vespalib::fuzzy {

// Explicit template instantiations for the supported max edit distances (1 and 2),
// so the templated implementations in the .hpp are compiled once into this
// translation unit instead of in every includer.
template class ExplicitLevenshteinDfaImpl<1>;
template class ExplicitLevenshteinDfaImpl<2>;
template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>;
template class ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>;

}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h new file mode 100644 index 00000000000..49baad21530 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -0,0 +1,147 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once + +#include "dfa_stepping_base.h" +#include "levenshtein_dfa.h" +#include "sparse_state.h" +#include "unicode_utils.h" +#include <vector> + +namespace vespalib::fuzzy { + +// A doomed state is one that cannot possibly match the target string +constexpr const uint32_t DOOMED = UINT32_MAX; + +template <uint8_t MaxEdits> +struct DfaNode { + static constexpr uint8_t MaxCharOutEdges = diag(MaxEdits); // Not counting wildcard edge + + struct Edge { + uint32_t u32ch; + uint32_t node; + }; + + std::array<Edge, MaxCharOutEdges> match_out_edges_buf; + uint32_t wildcard_edge_to = DOOMED; + uint8_t num_match_out_edges = 0; + uint8_t edits = UINT8_MAX; + + [[nodiscard]] bool has_wildcard_edge() const noexcept { + return wildcard_edge_to != DOOMED; + } + + [[nodiscard]] uint32_t wildcard_edge_to_or_doomed() const noexcept { + return wildcard_edge_to; + } + + [[nodiscard]] std::span<const Edge> match_out_edges() const noexcept { + return std::span(match_out_edges_buf.begin(), num_match_out_edges); + } + + [[nodiscard]] uint32_t match_or_doomed(uint32_t ch) const noexcept { + // Always prefer the exact matching edges + for (const auto& e : match_out_edges()) { + if (e.u32ch == ch) { + return e.node; + } + } + // Fallback to wildcard edge if possible (could be doomed) + return wildcard_edge_to; + } + + [[nodiscard]] bool has_exact_match(uint32_t ch) const noexcept { + for (const auto& e : match_out_edges()) { + if (e.u32ch == ch) { + return true; + } + } + return false; + } + + [[nodiscard]] size_t has_higher_out_edge(uint32_t ch) const noexcept { + if (has_wildcard_edge()) { + return true; // implicitly possible to substitute a higher out edge char + } + return lowest_higher_explicit_out_edge(ch) != nullptr; + } + + [[nodiscard]] const Edge* lowest_higher_explicit_out_edge(uint32_t ch) const noexcept { + // Important: these _must_ be sorted in increasing code point order + for (const auto& e : match_out_edges()) { + if (e.u32ch > ch) { + return &e; + } + } + return 
nullptr; + } + + void add_match_out_edge(uint32_t out_char, uint32_t out_node) noexcept { + assert(num_match_out_edges < MaxCharOutEdges); + match_out_edges_buf[num_match_out_edges] = Edge(out_char, out_node); + ++num_match_out_edges; + } + + void set_wildcard_out_edge(uint32_t out_node) noexcept { + assert(wildcard_edge_to == DOOMED); + wildcard_edge_to = out_node; + } +}; + +template <uint8_t MaxEdits> +class ExplicitLevenshteinDfaImpl final : public LevenshteinDfa::Impl { +public: + static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2); + + using DfaNodeType = DfaNode<MaxEdits>; + using MatchResult = LevenshteinDfa::MatchResult; +private: + std::vector<DfaNodeType> _nodes; +public: + ExplicitLevenshteinDfaImpl() noexcept = default; + ~ExplicitLevenshteinDfaImpl() override = default; + + static constexpr uint8_t max_edits() noexcept { return MaxEdits; } + + void ensure_node_array_large_enough_for_index(uint32_t node_index) { + if (node_index >= _nodes.size()) { + _nodes.resize(node_index + 1); + } + } + + void set_node_edit_distance(uint32_t node_index, uint8_t edits) { + _nodes[node_index].edits = edits; + } + + void add_outgoing_edge(uint32_t from_node_idx, uint32_t to_node_idx, uint32_t out_char) { + _nodes[from_node_idx].add_match_out_edge(out_char, to_node_idx); + } + + void set_wildcard_edge(uint32_t from_node_idx, uint32_t to_node_idx) { + _nodes[from_node_idx].set_wildcard_out_edge(to_node_idx); + } + + [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override; + + [[nodiscard]] size_t memory_usage() const noexcept override { + return sizeof(DfaNodeType) * _nodes.size(); + } + + void dump_as_graphviz(std::ostream& os) const override; +}; + +template <typename Traits> +class ExplicitLevenshteinDfaBuilder { + std::vector<uint32_t> _u32_str_buf; // TODO std::u32string +public: + explicit ExplicitLevenshteinDfaBuilder(std::string_view str) + : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str)) + {} + + explicit 
ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str) noexcept
        : _u32_str_buf(std::move(str))
    {}

    [[nodiscard]] LevenshteinDfa build_dfa() const;
};

}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp new file mode 100644 index 00000000000..0960219aff3 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -0,0 +1,228 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include "explicit_levenshtein_dfa.h"
#include "match_algorithm.hpp"
#include <vespa/vespalib/stllike/hash_map.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <iostream>
#include <span>
#include <queue>

namespace vespalib::fuzzy {

// DfaMatcher adapter for explicit DFA implementation.
// States are plain pointers into the DFA's node vector; a nullptr state means
// the matcher stepped to a doomed (can-never-match) state.
template <uint8_t MaxEdits>
struct ExplicitDfaMatcher {
    using DfaNodeType = typename ExplicitLevenshteinDfaImpl<MaxEdits>::DfaNodeType;
    using StateType = const DfaNodeType*;
    using EdgeType = const DfaNodeType::Edge*;

    using StateParamType = const DfaNodeType*;

    const std::span<const DfaNodeType> _nodes; // non-owning view of the DFA's nodes

    explicit ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes) noexcept
        : _nodes(nodes)
    {}

    static constexpr uint8_t max_edits() noexcept { return MaxEdits; }

    StateType start() const noexcept {
        return &_nodes[0]; // node 0 is always the start state
    }
    bool has_higher_out_edge(StateType node, uint32_t mch) const noexcept {
        return node->has_higher_out_edge(mch);
    }
    StateType match_input(StateType node, uint32_t mch) const noexcept {
        auto maybe_node_idx = node->match_or_doomed(mch);
        return ((maybe_node_idx != DOOMED) ?
&_nodes[maybe_node_idx] : nullptr);
    }
    bool is_match(StateType node) const noexcept {
        // Non-accepting nodes have edits == UINT8_MAX, which is > max_edits()
        return node->edits <= max_edits();
    }
    bool can_match(StateType node) const noexcept {
        // nullptr means we previously stepped to a doomed state
        return node != nullptr;
    }
    uint8_t match_edit_distance(StateType node) const noexcept {
        return node->edits;
    }
    bool valid_state(StateType node) const noexcept {
        return node != nullptr;
    }
    StateType match_wildcard(StateType node) const noexcept {
        auto edge_to = node->wildcard_edge_to_or_doomed();
        return ((edge_to != DOOMED) ? &_nodes[edge_to] : nullptr);
    }
    bool has_exact_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
        return node->has_exact_match(ch);
    }
    EdgeType lowest_higher_explicit_out_edge(StateType node, uint32_t ch) const noexcept {
        return node->lowest_higher_explicit_out_edge(ch);
    }
    EdgeType smallest_explicit_out_edge(StateType node) const noexcept {
        // Out-edges are pre-ordered in increasing code point order, so the first
        // element is always the smallest possible matching character.
assert(!node->match_out_edges().empty());
        return &node->match_out_edges().front();
    }
    bool valid_edge(EdgeType edge) const noexcept {
        return edge != nullptr;
    }
    uint32_t edge_to_u32char(EdgeType edge) const noexcept {
        return edge->u32ch;
    }
    StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept {
        return &_nodes[edge->node];
    }
};

template <uint8_t MaxEdits>
LevenshteinDfa::MatchResult
ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
    // Delegate to the shared matching algorithm via the adapter above.
    ExplicitDfaMatcher<MaxEdits> matcher(_nodes);
    return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
}

// Emits the DFA in Graphviz DOT format. Accepting nodes are rendered filled,
// labeled "index(edits)"; wildcard edges are labeled "*".
template <uint8_t MaxEdits>
void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const {
    os << std::dec << "digraph levenshtein_dfa {\n";
    os << "    fontname=\"Helvetica,Arial,sans-serif\"\n";
    os << "    node [shape=circle, fontname=\"Helvetica,Arial,sans-serif\", fixedsize=true];\n";
    os << "    edge [fontname=\"Helvetica,Arial,sans-serif\"];\n";
    for (size_t i = 0; i < _nodes.size(); ++i) {
        const auto& node = _nodes[i];
        if (node.edits <= max_edits()) {
            os << "    " << i << " [label=\"" << i << "(" << static_cast<int>(node.edits) << ")\", style=\"filled\"];\n";
        }
        for (const auto& edge : node.match_out_edges()) {
            std::string as_utf8;
            append_utf32_char_as_utf8(as_utf8, edge.u32ch);
            os << "    " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n";
        }
        if (node.wildcard_edge_to != DOOMED) {
            os << "    " << i << " -> " << node.wildcard_edge_to << " [label=\"*\"];\n";
        }
    }
    os << "}\n";
}

namespace {

// Bookkeeping for the BFS state exploration in build_dfa(): maps each sparse
// Levenshtein state to a (node id, explored?) pair.
template <typename StateType>
struct ExploreState {
    using NodeIdAndExplored = std::pair<uint32_t, bool>;
    using SparseExploredStates = vespalib::hash_map<StateType, NodeIdAndExplored, typename StateType::hash>;

    uint32_t state_counter;          // next node id to hand out
    SparseExploredStates explored_states;

    ExploreState();
    ~ExploreState();

    [[nodiscard]]
SparseExploredStates::iterator node_of(const StateType& state) { + auto maybe_explored = explored_states.find(state); + if (maybe_explored != explored_states.end()) { + return maybe_explored; + } + uint32_t this_node = state_counter; + assert(state_counter < UINT32_MAX); + ++state_counter; + return explored_states.insert(std::make_pair(state, std::make_pair(this_node, false))).first; // not yet explored; + } + + [[nodiscard]] bool already_explored(const SparseExploredStates::iterator& node) const noexcept { + return node->second.second; + } + + void tag_as_explored(SparseExploredStates::iterator& node) noexcept { + node->second.second = true; + } +}; + +template <typename StateType> +ExploreState<StateType>::ExploreState() + : state_counter(0), + explored_states() +{} + +template <typename StateType> +ExploreState<StateType>::~ExploreState() = default; + +template <typename Traits> +class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> { + using Base = DfaSteppingBase<Traits>; + + using StateType = typename Base::StateType; + using TransitionsType = typename Base::TransitionsType; + + using Base::_u32_str; + using Base::max_edits; + using Base::start; + using Base::match_edit_distance; + using Base::step; + using Base::is_match; + using Base::can_match; + using Base::transitions; +public: + explicit ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str) noexcept + : DfaSteppingBase<Traits>(str) + { + assert(str.size() < UINT32_MAX / max_out_edges_per_node()); + } + + [[nodiscard]] static constexpr uint8_t max_out_edges_per_node() noexcept { + // Max possible out transition characters (2k+1) + one wildcard edge. 
+ return diag(max_edits()) + 1; + } + + [[nodiscard]] LevenshteinDfa build_dfa() const; +}; + +template <typename Traits> +LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const { + auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(); + ExploreState<StateType> exp; + // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_ + // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth. + // This does _not_ always hold, such as if you have A->B and A->C->B (i.e. both parent and + // grandparent have a transition to the same state), in which case B may be allocated before C. + std::queue<StateType> to_explore; + to_explore.push(start()); + while (!to_explore.empty()) { + auto state = std::move(to_explore.front()); + to_explore.pop(); + auto this_node = exp.node_of(state); // note: invalidated by subsequent calls to node_of + if (exp.already_explored(this_node)) { + continue; + } + exp.tag_as_explored(this_node); + const auto this_node_idx = this_node->second.first; + dfa->ensure_node_array_large_enough_for_index(this_node_idx); + dfa->set_node_edit_distance(this_node_idx, match_edit_distance(state)); + auto t = transitions(state); + for (uint32_t out_c : t.u32_chars()) { + auto new_state = step(state, out_c); + auto out_node = exp.node_of(new_state); + dfa->add_outgoing_edge(this_node_idx, out_node->second.first, out_c); + to_explore.push(std::move(new_state)); + } + auto wildcard_state = step(state, WILDCARD); + if (can_match(wildcard_state)) { + auto out_node = exp.node_of(wildcard_state); + dfa->set_wildcard_edge(this_node_idx, out_node->second.first); + to_explore.push(std::move(wildcard_state)); + } // else: don't bother + } + return LevenshteinDfa(std::move(dfa)); +} + +} // anon ns + +template <typename Traits> +LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const { + ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf); + 
return builder.build_dfa(); +} + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp new file mode 100644 index 00000000000..8b9d2eddcac --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp @@ -0,0 +1,9 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "implicit_levenshtein_dfa.hpp" + +namespace vespalib::fuzzy { + +template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>; +template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h new file mode 100644 index 00000000000..0846b95d135 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once
+
+#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
+#include <vector>
+
+namespace vespalib::fuzzy {
+
+/**
+ * LevenshteinDfa::Impl where the DFA is never materialized as an explicit graph;
+ * states are computed on the fly during matching (see the accompanying .hpp),
+ * driven by a UTF-32 copy of the target string. Memory usage is therefore just
+ * the converted target string itself.
+ */
+template <typename Traits>
+class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
+    std::vector<uint32_t> _u32_str_buf; // Target string as UTF-32 code points. TODO std::u32string
+public:
+    using MatchResult = LevenshteinDfa::MatchResult;
+
+    // Converts the UTF-8 target string to UTF-32 and delegates to the UTF-32 constructor.
+    explicit ImplicitLevenshteinDfa(std::string_view str)
+        : ImplicitLevenshteinDfa(utf8_string_to_utf32(str))
+    {}
+
+    explicit ImplicitLevenshteinDfa(std::vector<uint32_t> str) noexcept
+        : _u32_str_buf(std::move(str))
+    {}
+
+    ~ImplicitLevenshteinDfa() override = default;
+
+    [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+
+    // Reports the logical size of the UTF-32 buffer (size(), not allocated capacity).
+    [[nodiscard]] size_t memory_usage() const noexcept override {
+        return _u32_str_buf.size() * sizeof(uint32_t);
+    }
+
+    // Not supported for the implicit representation (there is no materialized graph
+    // to dump); the implementation throws std::runtime_error.
+    void dump_as_graphviz(std::ostream& os) const override;
+};
+
+}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
new file mode 100644
index 00000000000..4ee468e424b
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -0,0 +1,121 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once + +#include "dfa_stepping_base.h" +#include "implicit_levenshtein_dfa.h" +#include "match_algorithm.hpp" +#include "sparse_state.h" +#include <cassert> +#include <stdexcept> + +namespace vespalib::fuzzy { + +// DfaMatcher adapter for implicit DFA implementation +template <typename Traits> +struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { + using Base = DfaSteppingBase<Traits>; + + using StateType = typename Base::StateType; + using EdgeType = uint32_t; // Just the raw u32 character value + + using StateParamType = const StateType&; + + using Base::_u32_str; + using Base::max_edits; + using Base::start; + using Base::match_edit_distance; + using Base::step; + using Base::can_wildcard_step; + using Base::is_match; + using Base::can_match; + + explicit ImplicitDfaMatcher(std::span<const uint32_t> u32_str) noexcept + : Base(u32_str) + {} + + // start, is_match, can_match, match_edit_distance are all provided by base type + + template <typename F> + bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) { + for (uint32_t i = 0; i < state.size(); ++i) { + const auto idx = state.index(i); + if ((idx < _u32_str.size()) && f(_u32_str[idx])) { + return true; + } + } + return false; + } + + template <typename F> + void for_each_char(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) { + for (uint32_t i = 0; i < state.size(); ++i) { + const auto idx = state.index(i); + if ((idx < _u32_str.size())) [[likely]] { + f(_u32_str[idx]); + } + } + } + + bool has_explicit_higher_out_edge(const StateType& state, uint32_t ch) const noexcept { + return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept { + return state_ch > ch; + }); + } + + bool has_higher_out_edge(const StateType& state, uint32_t mch) const noexcept { + return (has_explicit_higher_out_edge(state, mch) || can_wildcard_step(state)); + } + StateType match_input(const StateType& state, uint32_t mch) const noexcept { + return 
step(state, mch); + } + bool valid_state(const StateType& state) const noexcept { + return !state.empty(); + } + StateType match_wildcard(const StateType& state) const noexcept { + return step(state, WILDCARD); + } + bool has_exact_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept { + return has_any_char_matching(state, [ch](uint32_t state_ch) noexcept { + return state_ch == ch; + }); + } + EdgeType lowest_higher_explicit_out_edge(const StateType& state, uint32_t ch) const noexcept { + uint32_t min_ch = UINT32_MAX; + for_each_char(state, [ch, &min_ch](uint32_t state_ch) noexcept { + if ((state_ch > ch) && (state_ch < min_ch)) { + min_ch = state_ch; + } + }); + return min_ch; + } + EdgeType smallest_explicit_out_edge(const StateType& state) const noexcept { + uint32_t min_ch = UINT32_MAX; + for_each_char(state, [&min_ch](uint32_t state_ch) noexcept { + min_ch = std::min(min_ch, state_ch); + }); + return min_ch; + } + bool valid_edge(EdgeType edge) const noexcept { + return edge != UINT32_MAX; + } + uint32_t edge_to_u32char(EdgeType edge) const noexcept { + return edge; + } + StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept { + return step(state, edge); + } +}; + +template <typename Traits> +LevenshteinDfa::MatchResult +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const { + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf); + return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); +} + +template <typename Traits> +void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const { + throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA"); +} + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp new file mode 100644 index 00000000000..e75ef8365bf --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -0,0 +1,83 @@ +// 
Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "explicit_levenshtein_dfa.h" +#include "implicit_levenshtein_dfa.h" +#include "levenshtein_dfa.h" +#include <vespa/vespalib/util/stringfmt.h> +#include <memory> + +namespace vespalib::fuzzy { + +LevenshteinDfa::LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept + : _impl(std::move(impl)) +{} + +LevenshteinDfa::LevenshteinDfa(LevenshteinDfa&&) noexcept = default; +LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default; + +LevenshteinDfa::~LevenshteinDfa() = default; + +LevenshteinDfa::MatchResult +LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const { + return _impl->match(u8str, successor_out); +} + +size_t LevenshteinDfa::memory_usage() const noexcept { + return _impl->memory_usage(); +} + +void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const { + _impl->dump_as_graphviz(out); +} + +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) { + if (max_edits != 1 && max_edits != 2) { + throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits)); + } + if (dfa_type == DfaType::Implicit) { + if (max_edits == 1) { + return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(target_string)); + } else { // max_edits == 2 + return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(target_string)); + } + } else { // DfaType::Explicit + if (max_edits == 1) { + return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(target_string).build_dfa(); + } else { // max_edits == 2 + return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(target_string).build_dfa(); + } + } + +} + +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) { + // TODO automatically select implementation based on 
target length/max edits? + // Suggestion: + // - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64). + // - Implicit DFA otherwise. + // This keeps memory overhead < 64k and DFA construction time < 300 usec (measured on + // an M1 Pro; your mileage may vary etc). + // Ideally the implicit DFA would always be the fastest (or at least approximately as + // fast as the explicit DFA), but this is not yet the case. + return build(target_string, max_edits, DfaType::Implicit); +} + +std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) { + if (mos.matches()) { + os << "match(" << static_cast<int>(mos.edits()) << " edits)"; + } else { + os << "mismatch"; + } + return os; +} + +std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) { + if (dt == LevenshteinDfa::DfaType::Implicit) { + os << "Implicit"; + } else { + assert(dt == LevenshteinDfa::DfaType::Explicit); + os << "Explicit"; + } + return os; +} + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h new file mode 100644 index 00000000000..a26ccbe87ee --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -0,0 +1,244 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <cstdint> +#include <iosfwd> +#include <memory> +#include <string> +#include <string_view> + +namespace vespalib::fuzzy { + +/** + * Levenshtein Deterministic Finite Automata (DFA) + * + * The Levenshtein distance (or edit distance) is the minimum number of edits (additions, + * deletions or substitutions) needed to transform a particular source string s to a + * particular target string t. + * + * Let m be the length of the source string and n be the length of the target string. + * + * The classic dynamic programming algorithm uses a n x m cost matrix and is therefore + * O(nm) in space and time. 
By observing that only 2 rows of the matrix are actually + * needed, this is commonly reduced to O(n) space complexity (still O(nm) time complexity). + * When the maximum number of allowed edits is constrained to k, some clever observations + * about the nature of the cost matrix allows for reducing the time complexity down to + * O(kn) (more specifically, O((2k+1) * n)). When k is fixed (e.g. k in {1, 2}), the + * time complexity simplifies down to O(n). + * + * This implements code for building and evaluating Levenshtein Deterministic Finite + * Automata, where the resulting DFA efficiently matches all possible source strings that + * can be transformed to the target string within k max edits. This allows for easy linear + * matching of strings. + * + * Inspiration: + * - http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata + * - https://julesjacobs.com/2015/06/17/disqus-levenshtein-simple-and-fast.html + * + * The latter in particular was a close inspiration for the sparse DFA state management. + * + * ====== Dictionary skipping via successor string generation ====== + * + * Scanning for edit distance matches frequently takes place against a sorted dictionary. + * When matching using a DFA, in the case where the source string does _not_ match, we can + * generate the _successor_ string; the next matching string that is lexicographically + * _greater_ than the source string. This string has the invariant that there are no + * possibly matching strings within k edits ordered after the source string but before + * the successor. + * + * This lets us do possibly massive leaps forward in the dictionary, turning a dictionary + * scan into a sublinear operation. + * + * Note that the implemented successor algorithm is slightly different from that described + * in the above blog post. 
The implemented algorithm requires no extra data structures
+ * beyond the DFA itself and the target string, and tries to be extra clever about reducing
+ * the number of code point conversions required.
+ *
+ * ====== Unicode support ======
+ *
+ * Matching and successor generation is fully Unicode-aware. All input strings are expected
+ * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats;
+ * see the documentation for match()).
+ *
+ * Internally, matching is done on UTF-32 code points and the DFA itself is built around
+ * UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA.
+ *
+ * ====== Memory usage ======
+ *
+ * There is always a baseline DFA memory usage O(n) in the target string, as the
+ * underlying DFA needs to convert the input UTF-8 string to explicit UTF-32 chars.
+ *
+ * Aside from the baseline, memory usage depends on whether an explicit or implicit DFA
+ * is used.
+ *
+ * ------ Explicit DFA ------
+ *
+ * The explicit DFA graph takes up quite a bit more memory than the original string
+ * representation (one reason is the use of UTF-32 characters under the hood).
+ *
+ * Expected upper bound memory usage for a string of length n with max edits k is
+ *
+ *   (2k+1) * N(k) * n * W(k)
+ *
+ * where N(1) is expected to be 32 and N(2) is 48, W(1) is 1.34 and W(2) is 3.2 (empirically
+ * derived).
+ *
+ * Memory usage during building is higher due to keeping track of the set of generated
+ * states in a hash table, but still linear in input size. This extra memory is freed
+ * once building is complete.
+ *
+ * ------ Implicit DFA ------
+ *
+ * Implicit DFAs have O(1) memory usage during evaluation, which all lives on the stack
+ * or in registers (this does not include the successor string, which is provided by the
+ * caller).
+ *
+ * Since the sparse state stepping is currently not as fast as explicit DFA node traversal,
+ * string matching is slower than with the explicit DFA.
+ * + * ====== In short ====== + * + * - Immutable; build once, run many times. + * - Explicit DFA build time is amortized linear in target string size. + * - Implicit DFA build time is O(1) (aside from initial UTF-32 conversion) + * - Zero-allocation matching. + * - Matching takes in raw UTF-8 input, no need to pre-convert. + * - Streaming UTF-8 to UTF-32 conversion; fully unicode-aware (DFA uses UTF-32 code + * points internally). + * - If required, it's possible (but not currently implemented) to bake case + * insensitive matching semantics into the generated DFA itself. + * - Allows for dictionary forward-skipping via successor algorithm. + * - Amortized zero allocations for successor string building when reusing string + * between matches. + * - Successor string is generated in-place as UTF-8 and can be directly used as input + * to a byte-wise dictionary seek. + */ +class LevenshteinDfa { +public: + class MatchResult { + uint8_t _max_edits; + uint8_t _edits; + public: + constexpr MatchResult(uint8_t max_edits, uint8_t edits) noexcept + : _max_edits(max_edits), + _edits(edits) + {} + + static constexpr MatchResult make_match(uint8_t max_edits, uint8_t edits) noexcept { + return {max_edits, edits}; + } + + static constexpr MatchResult make_mismatch(uint8_t max_edits) noexcept { + return {max_edits, static_cast<uint8_t>(max_edits + 1)}; + } + + [[nodiscard]] constexpr bool matches() const noexcept { return _edits <= _max_edits; } + [[nodiscard]] constexpr uint8_t edits() const noexcept { return _edits; } + [[nodiscard]] constexpr uint8_t max_edits() const noexcept { return _max_edits; } + }; + + struct Impl { + virtual ~Impl() = default; + [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0; + [[nodiscard]] virtual size_t memory_usage() const noexcept = 0; + virtual void dump_as_graphviz(std::ostream& out) const = 0; + }; + +private: + std::unique_ptr<Impl> _impl; +public: + explicit 
LevenshteinDfa(std::unique_ptr<Impl> impl) noexcept; + LevenshteinDfa(LevenshteinDfa&&) noexcept; + LevenshteinDfa& operator=(LevenshteinDfa&&) noexcept; + LevenshteinDfa(const LevenshteinDfa&) = delete; + LevenshteinDfa& operator=(const LevenshteinDfa&) = delete; + ~LevenshteinDfa(); + + /** + * Attempts to match the source string `source` with the target string this DFA was + * built with, emitting a successor string on mismatch if `successor_out` != nullptr. + * + * `source` must not contain any null UTF-8 chars. + * + * Match case: + * Iff `source` is _within_ the maximum edit distance, returns a MatchResult with + * matches() == true and edits() == the actual edit distance. If `successor_out` + * is not nullptr, the string pointed to is _not_ modified. + * + * Mismatch case: + * Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with + * matches() == false. + * + * Iff `successor_out` is not nullptr, the following holds: + * - `successor_out` is modified to contain the next (in byte-wise ordering) possible + * _matching_ string S so that there exists no other matching string S' that is + * greater than `source` but smaller than S. + * - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally + * encode in bitwise form, but the _code points_ they encode may not be valid. + * In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of + * which are valid UTF-8. + * + * It is expected that the consumer of `successor_out` is only interested in the + * memcmp()-ordering of strings and not whether they are technically valid Unicode. + * This should be the case for low-level dictionary data structures etc. 
+ * + * Memory allocation: + * This function does not directly or indirectly allocate any heap memory if either: + * + * - the input string is within the max edit distance, or + * - `successor_out` is nullptr, or + * - `successor_out` has sufficient capacity to hold the generated successor + * + * By reusing the successor string across many calls, this therefore amortizes memory + * allocations down to near zero per invocation. + */ + [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const; + + /** + * Returns how much memory is used by the underlying DFA representation, in bytes. + */ + [[nodiscard]] size_t memory_usage() const noexcept; + + enum class DfaType { + Implicit, + Explicit + }; + + /** + * Builds and returns a Levenshtein DFA that matches all strings within `max_edits` + * edits of `target_string`. The type of DFA returned is specified by dfa_type. + * + * `max_edits` must be in {1, 2}. Throws std::invalid_argument if outside range. + * + * `target_string` must not contain any null UTF-8 chars. + */ + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, + uint8_t max_edits, + DfaType dfa_type); + + /** + * Same as build() but currently always returns an implicit DFA. + */ + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits); + + /** + * Dumps the DFA as a Graphviz graph in text format to the provided output stream. + * + * Note: Only supported for _explicit_ DFAs. Trying to call this function on an implicit + * DFA will throw a std::runtime_error, as there is no concrete underlying graph + * structure to dump. + * + * Note that only _matching_ state transitions are present in the DFA, and therefore only + * such transitions are present in the generated graph. 
Overall this makes the graph for + * longer strings much more manageable, as the number of out-edges from a particular depth + * in the graph depends on the max number of edits and not on the length of the string + * itself. Otherwise, you'd have a whole bunch of nodes with out-edges to the same terminal + * non-matching state node. + */ + void dump_as_graphviz(std::ostream& out) const; +}; + +std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos); +std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt); + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp new file mode 100644 index 00000000000..206b69f8ebe --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -0,0 +1,291 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "dfa_matcher.h" +#include "levenshtein_dfa.h" +#include "unicode_utils.h" +#include <vespa/vespalib/text/utf8.h> +#include <cassert> +#include <concepts> + +namespace vespalib::fuzzy { + +/** + * Implementation of algorithm for linear-time k-max edits string matching and successor + * string generation over an abstract DFA representation. + * + * The implementation is agnostic to how the underlying DFA is implemented, but requires + * an appropriate adapter that satisfies the DfaMatcher concept contracts. + */ +template <uint8_t MaxEdits> +struct MatchAlgorithm { + using MatchResult = LevenshteinDfa::MatchResult; + + static constexpr uint8_t max_edits() noexcept { return MaxEdits; } + + /** + * Matches UTF-8 source string `source` against the target DFA, optionally generating + * the successor string iff the source string is not within the maximum number of edits + * of the target string. 
+ *
+ * The actual match loop is very simple: we try to match the DFA as far as we can
+ * before either consuming all input (source string) characters or ending up in a non-
+ * matching state before we have consumed all input. In the former case, we may be in
+ * a matching state (consider matching "foo" with the target string "food"; after
+ * consuming all input we'll be in a matching state with 1 edit). In the latter case,
+ * the input string cannot possibly match.
+ *
+ * If we end up in a matching state, all is well. We simply return a MatchResult with
+ * the number of edits the state represents.
+ *
+ * The interesting bit happens when the string does _not_ match and we are asked to provide a
+ * _successor_ string that _does_ match and is strictly greater in lexicographic order.
+ *
+ * We lean on some core invariants:
+ *
+ * - The m x n (|source| x |target|) Levenshtein matrix provides, for any m[i, j] with
+ *   i in [1, m], j in [1, n], the _minimum possible_ number of edits that can transform
+ *   the source string prefix of length `i` to the target string prefix of length `j`.
+ *   This means there is no way of transforming said source prefix using _fewer_ edits.
+ *
+ * - Any given DFA state corresponds to a unique row in the Levenshtein matrix, thus
+ *   transitively inheriting the invariants of the matrix row elements themselves, such
+ *   as representing the minimum number of edits.
+ *
+ * We have two mismatch cases:
+ *
+ *   1. We've matched the entire source string without ending in an accepting state.
+ *
+ * This can only happen if the input is a (possibly edited) prefix of the target string.
+ * Any and all _longer_ strings with this prefix are inherently lexicographically greater,
+ * so we emit the smallest possible suffix that turns prefix || suffix into a matching
+ * string.
+ *
+ * See emit_smallest_matching_suffix() for details.
+ *
+ *   2. We've matched a prefix of the source string without ending in an accepting state.
+ * + * This case is trickier than when the entire source string is a prefix, as we cannot + * just emit a suffix to the source to create a matching, lexicographically greater string. + * + * Consider source "foxx" and target "food". There exists no suffix S in "food" that can + * turn "foxx" || S into a matching string within k=1 edits. + * + * So we have to backtrack to somewhere. + * + * That "somewhere" is the state that maximizes the size of the source prefix while + * allowing us to emit a greater suffix. + * + * For each state we visit, we check if there exists at least one higher out edge than the + * one taken out from that state (this is possibly a wildcard edge). If one exists, we + * copy the state to `last_state_with_higher_out` and remember the state's source string + * prefix as well as the source string character that transitions us away from the state + * (this will be our candidate for building a greater suffix). + * + * When we fail to match the entire source string, we know that last_state_with_higher_out + * represents the last possible branching point (and therefore the longest prefix) where + * we can substitute in or insert a higher character, in turn creating a greater suffix. + * + * Proof by contradiction: let `last_state_with_higher_out` be S and assume there exists + * a state S' that has a greater source string prefix than S while still allowing for + * emitting a lexicographically greater suffix that is within max edits k. We terminate + * the match loop once can_match(X) is false for any state X, where X subsumes S by + * definition. For S' to exist, it must be possible for a transition to exist from X to + * a later state that can have a higher out edge. However, edit distance costs can + * never decrease, only stay constant (with matching substitutions) or increase (with + * insertions, deletions or non-matching substitutions), so it's impossible to follow + * an out-edge from X to any later potentially matching state. 
Thus, S' can not exist + * and we have a contradiction. + * + * Since we want to generate the smallest possible larger string that matches, we ideally + * want to emit a character that is +1 of the source character after the shared prefix. + * This is using the "higher out"-character we remembered earlier. We do this if we have + * a wildcard out edge (or if there exists an explicit out-edge for value char+1). + * Otherwise, we have to follow the highest explicitly present out-edge. + * + * Once we have emitted one single character that gets us lexicographically higher than + * the source string, we then emit the smallest possible suffix to this. This uses the + * same minimal suffix generation logic as mismatch case 1). + * + * See `backtrack_and_emit_greater_suffix()` for details. + * + * Example: + * (This is easiest to follow by looking at examples/food_dfa.svg) + * + * Source "foxx", target "food" and k=1: + * + * After matching "fo" with 0 edits we reach a state with out-edges {d, o, *}. This state + * has an implicitly higher out-edge (*) and we remember it and the char 'x' for later. + * Edge 'x' can only happen via *, so we take that path. + * + * After matching "fox" with 1 edit we reach a state with out-edges {d, o}. There is + * no out-edge for 'x' and the state is not a matching state, so we need to backtrack + * and generate a successor. + * + * We backtrack to the state representing "fo" and emit it as a successor prefix. We + * observe that this state has a wildcard out-edge and emit 'x'+1 == 'y' to the successor + * string and continue with emitting the smallest suffix. We now have a successor + * prefix of "foy", with which we reach the same logical state as we did with "fox" + * previously. The smallest out-edge here is 'd', so we take it. This leaves us in an + * accepting (matching) state, so suffix generation completes. 
     *
     * "foxx" -> "foyd"
     *
     * Note that it's possible for the prefix to be empty, which results in a successor
     * that has nothing in common with the source altogether.
     * Example: "gp" -> "hfood" (+1 char value case)
     *
     * Performance note:
     * Both the input and successor output strings are in UTF-8 format. To avoid doing
     * duplicate work, we keep track of the byte length of the string prefix that will be
     * part of the successor and simply copy it verbatim instead of building the string
     * from converted UTF-32 -> UTF-8 chars as we go.
     *
     * TODO we could probably also optimize the smallest suffix generation with this when
     *  we know we can no longer insert any smaller char substitutions and the only way
     *  to complete the string is to emit it verbatim.
     *   - To do this we'd need both the original UTF-8 target string as well as a
     *     secondary vector that maps u32 character index to the corresponding UTF-8 index.
     *     Both trivial to get as part of DFA initialization.
     */
    template <DfaMatcher Matcher>
    static MatchResult match(const Matcher& matcher,
                             std::string_view source,
                             std::string* successor_out)
    {
        using StateType = typename Matcher::StateType;
        vespalib::Utf8Reader u8_reader(source.data(), source.size());
        // Byte length of the source prefix that can be copied verbatim into the successor,
        // and the (UTF-32) source character immediately following that prefix.
        uint32_t n_prefix_u8_bytes = 0;
        uint32_t char_after_prefix = 0;
        // Deepest state seen so far that still has an out-edge greater than the source
        // character consumed at that state; this is where successor generation backtracks to.
        StateType last_state_with_higher_out = StateType{};

        StateType state = matcher.start();
        while (u8_reader.hasMore()) {
            const auto u8_pos_before_char = u8_reader.getPos();
            const uint32_t mch = u8_reader.getChar();
            // Only track backtracking info when the caller actually wants a successor.
            if (successor_out && matcher.has_higher_out_edge(state, mch)) {
                last_state_with_higher_out = state;
                n_prefix_u8_bytes = u8_pos_before_char;
                char_after_prefix = mch;
            }
            auto maybe_next = matcher.match_input(state, mch);
            if (matcher.can_match(maybe_next)) {
                state = maybe_next;
            } else {
                // Can never match; find the successor if requested
                if (successor_out) {
                    *successor_out = source.substr(0, n_prefix_u8_bytes);
                    assert(matcher.valid_state(last_state_with_higher_out));
                    backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
                                                      char_after_prefix, *successor_out);
                }
                return MatchResult::make_mismatch(max_edits());
            }
        }
        // Consumed the entire source without hitting a dead end; the final state decides.
        const auto edits = matcher.match_edit_distance(state);
        if (edits <= max_edits()) {
            return MatchResult::make_match(max_edits(), edits);
        }
        // Non-matching end state: the smallest successor is the source itself extended
        // with the smallest matching suffix (no backtracking needed).
        if (successor_out) {
            *successor_out = source;
            emit_smallest_matching_suffix(matcher, state, *successor_out);
        }
        return MatchResult::make_mismatch(max_edits());
    }

    /**
     * Instantly backtrack to the last possible branching point in the DFA where we can
     * choose some higher outgoing edge character value and still match the DFA. If the node
     * has a wildcard edge, we can bump the input char by one and generate the smallest
     * possible matching suffix to that. Otherwise, choose the smallest out edge that is
     * greater than the input character at that location and _then_ emit the smallest
     * matching suffix.
     *
     * precondition: `last_state_with_higher_out` has either a wildcard edge or a char match
     * edge that compares greater than `input_at_branch`.
     */
    template <DfaMatcher Matcher>
    static void backtrack_and_emit_greater_suffix(
            const Matcher& matcher,
            typename Matcher::StateParamType last_state_with_higher_out,
            const uint32_t input_at_branch,
            std::string& successor)
    {
        auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out);
        if (matcher.can_match(wildcard_state)) {
            // `input_at_branch` may be U+10FFFF, with +1 being outside legal Unicode _code point_
            // range but _within_ what UTF-8 can technically _encode_.
            // We assume that successor-consumers do not care about anything except byte-wise
            // ordering. This is similar to what RE2's PossibleMatchRange emits to represent a
            // UTF-8 upper bound, so not without precedent.
            // If the resulting character corresponds to an existing out-edge we _must_ take it
            // instead of the wildcard edge, or we'll end up in the wrong state.
            const auto next_char = input_at_branch + 1;
            if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) {
                append_utf32_char_as_utf8(successor, next_char);
                emit_smallest_matching_suffix(matcher, wildcard_state, successor);
                return;
            } // else: handle exact match below (it will be found as the first higher out edge)
        }
        const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch);
        assert(matcher.valid_edge(first_highest_edge));
        append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge));
        emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor);
    }

    /**
     * The smallest possible suffix is generated by following the smallest out-edge per state,
     * until we reach a state that is a match. It is possible that the smallest out edge is a
     * "wildcard" edge (our terminology), which means that we can insert/substitute an arbitrary
     * character and still have `can_match(resulting state)` be true. In this case we emit the
     * smallest possible non-null UTF-8 character (0x01).
     *
     * Examples:
     * (These are easiest to follow by looking at examples/food_dfa.svg)
     *
     * Source "fo", target "food" and k=1:
     *
     * After matching "fo" we have 1 edit to spare. The smallest valid, non-empty UTF-8 suffix
     * to this string must necessarily begin with 0x01, so that's what we emit. The smallest
     * edge we can follow from the resulting state is 'd', and that is an accepting (matching)
     * state.
     *
     * "fo" -> "fo\x01d"
     *
     * Source "fx", target "food" and k=1:
     *
     * After matching "fx" we have no edits to spare.
The smallest character reachable from + * the state is 'o' (in fact, it is the only out edge available since we're down to zero + * available edits). The next state has an out-edge to 'd' and 'o', and we choose 'd' + * since it is smallest. This leaves us in an accepting (matching) state and we terminate + * the loop. + * + * "fx" -> "fxod" + */ + // TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate + // the full string? Won't generate a matching string, but will be lexicographically greater. + template <DfaMatcher Matcher> + static void emit_smallest_matching_suffix( + const Matcher& matcher, + typename Matcher::StateParamType from, + std::string& str) + { + auto state = from; + while (!matcher.is_match(state)) { + // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01). + // Otherwise, find the smallest char that can eventually lead us to a match. + auto wildcard_state = matcher.match_wildcard(state); + if (matcher.can_match(wildcard_state)) { + str += '\x01'; + state = wildcard_state; + } else { + const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state); + assert(matcher.valid_edge(smallest_out_edge)); + append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge)); + state = matcher.edge_to_state(state, smallest_out_edge); + } + } + } +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h new file mode 100644 index 00000000000..40cfa5e6409 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h @@ -0,0 +1,175 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <algorithm> +#include <array> +#include <cassert> +#include <cstdint> +#include <ostream> +#include <span> +#include <xxh3.h> // TODO factor out? 
+ +namespace vespalib::fuzzy { + +// Sentinel U32 char for state stepping that cannot match any target string characters +constexpr const uint32_t WILDCARD = UINT32_MAX; + +/** + * diag(n) is the width of the diagonal of the cost matrix that can possibly be + * within k edits. This means that for a fixed k, it suffices to maintain state + * for up to and including diag(k) consecutive cells for any given matrix row. + */ +constexpr inline uint8_t diag(uint8_t k) noexcept { + return k*2 + 1; +} + +template <uint8_t MaxEdits> +struct FixedSparseState { +private: + static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2); + + std::array<uint32_t, diag(MaxEdits)> indices; + std::array<uint8_t, diag(MaxEdits)> costs; // elems are 1-1 with indices vector + uint8_t sz; +public: + constexpr FixedSparseState() noexcept : indices(), costs(), sz(0) {} + + [[nodiscard]] constexpr bool empty() const noexcept { + return (sz == 0); + } + + [[nodiscard]] constexpr uint32_t size() const noexcept { + return sz; + } + + [[nodiscard]] constexpr uint32_t index(uint32_t entry_idx) const noexcept { + return indices[entry_idx]; + } + + [[nodiscard]] constexpr uint8_t cost(uint32_t entry_idx) const noexcept { + return costs[entry_idx]; + } + + // Precondition: !empty() + [[nodiscard]] constexpr uint32_t last_index() const noexcept { + return indices[sz - 1]; + } + + // Precondition: !empty() + [[nodiscard]] constexpr uint8_t last_cost() const noexcept { + return costs[sz - 1]; + } + + void append(uint32_t index, uint8_t cost) noexcept { + assert(sz < diag(MaxEdits)); + indices[sz] = index; + costs[sz] = cost; + ++sz; + } + + constexpr bool operator==(const FixedSparseState& rhs) const noexcept { + if (sz != rhs.sz) { + return false; + } + return (std::equal(indices.begin(), indices.begin() + sz, rhs.indices.begin()) && + std::equal(costs.begin(), costs.begin() + sz, rhs.costs.begin())); + } + + struct hash { + size_t operator()(const FixedSparseState& s) const noexcept { + 
static_assert(std::is_same_v<uint32_t, std::decay_t<decltype(s.indices[0])>>); + static_assert(std::is_same_v<uint8_t, std::decay_t<decltype(s.costs[0])>>); + // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" + return (XXH3_64bits(s.indices.data(), s.sz * sizeof(uint32_t)) ^ + XXH3_64bits(s.costs.data(), s.sz)); +#pragma GCC diagnostic pop + } + }; +}; + +/** + * Prints sparse states as a single matrix row. Columns prior to any state index + * are printed explicitly as '-' characters to make states line up when printed. + * + * Example output for the state (2:1, 3:1): + * + * [-, -, 1, 1] + * + * Only meant as a debugging aid during development, as states with high indices + * will emit very large strings. + */ +template <uint8_t MaxEdits> [[maybe_unused]] +std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s) { + os << "["; + size_t last_idx = 0; + for (size_t i = 0; i < s.size(); ++i) { + if (i != 0) { + os << ", "; + } + for (size_t j = last_idx; j < s.indices[i]; ++j) { + os << "-, "; + } + last_idx = s.indices[i] + 1; + os << static_cast<uint32_t>(s.costs[i]); + } + os << "]"; + return os; +} + +template <uint8_t MaxEdits> +struct FixedMaxEditsTransitions { + static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2); + + std::array<uint32_t, diag(MaxEdits)> out_u32_chars; + uint8_t size; + + constexpr FixedMaxEditsTransitions() noexcept : out_u32_chars(), size(0) {} + + [[nodiscard]] constexpr bool has_char(uint32_t u32ch) const noexcept { + for (uint8_t i = 0; i < size; ++i) { + if (out_u32_chars[i] == u32ch) { + return true; + } + } + return false; + } + + void add_char(uint32_t u32ch) noexcept { + if (!has_char(u32ch)) { + assert(size < diag(MaxEdits)); + out_u32_chars[size] = u32ch; + ++size; + } + } + + constexpr std::span<const uint32_t> u32_chars() const noexcept { + return {out_u32_chars.begin(), out_u32_chars.begin() + size}; + } + + 
constexpr std::span<uint32_t> u32_chars() noexcept { + return {out_u32_chars.begin(), out_u32_chars.begin() + size}; + } + + void sort() noexcept { + // TODO use custom sorting networks for fixed array sizes <= 5? + // FIXME GCC 12.2 worse-than-useless(tm) warning false positives :I +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" + std::sort(out_u32_chars.begin(), out_u32_chars.begin() + size); +#pragma GCC diagnostic pop + } +}; + +template <uint8_t MaxEdits> +struct FixedMaxEditDistanceTraits { + static_assert(MaxEdits > 0 && MaxEdits <= UINT8_MAX/2); + using StateType = FixedSparseState<MaxEdits>; + using TransitionsType = FixedMaxEditsTransitions<MaxEdits>; + constexpr static uint8_t max_edits() noexcept { + return MaxEdits; + } +}; + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp new file mode 100644 index 00000000000..648be234562 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp @@ -0,0 +1,108 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "unicode_utils.h" +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <stdexcept> + +namespace vespalib::fuzzy { + +std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) { + vespalib::stringref ch_str(str.data(), str.size()); + vespalib::Utf8Reader utf8_reader(ch_str); + std::vector<uint32_t> u32ret; + u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII + while (utf8_reader.hasMore()) { + u32ret.emplace_back(utf8_reader.getChar()); + } + return u32ret; +} + +std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) { + return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size())); +} + +[[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline)); +[[noreturn]] void throw_bad_code_point(uint32_t codepoint) { + throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint)); +} + +namespace { + +/** + * Encodes a single UTF-32 `codepoint` to a 1-4 byte UTF-8 sequence. + * ` + * `u8buf` must point to a buffer with at least 4 writable bytes. + * + * Returns the number of bytes written. + * + * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8 + * encoding function that can be used in all possible scenarios. 
+ */ +[[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) { + constexpr const uint8_t low_6bits_mask = 0x3F; + + // Yanked and modified from utf8.cpp: + if (codepoint < 0x80) { + u8buf[0] = (char) codepoint; + return 1; + } else if (codepoint < 0x800) { + char low6 = (codepoint & low_6bits_mask); + low6 |= 0x80; + codepoint >>= 6; + char first5 = codepoint; + first5 |= 0xC0; + u8buf[0] = first5; + u8buf[1] = low6; + return 2; + } else if (codepoint < 0x10000) { + char low6 = (codepoint & low_6bits_mask); + low6 |= 0x80; + + codepoint >>= 6; + char mid6 = (codepoint & low_6bits_mask); + mid6 |= 0x80; + + codepoint >>= 6; + char first4 = codepoint; + first4 |= 0xE0; + + u8buf[0] = first4; + u8buf[1] = mid6; + u8buf[2] = low6; + return 3; + } else if (codepoint <= 0x110000) { // Explicitly _include_ U+10FFFF + 1! + char low6 = (codepoint & low_6bits_mask); + low6 |= 0x80; + + codepoint >>= 6; + char mid6 = (codepoint & low_6bits_mask); + mid6 |= 0x80; + + codepoint >>= 6; + char hi6 = (codepoint & low_6bits_mask); + hi6 |= 0x80; + + codepoint >>= 6; + char first3 = codepoint; + first3 |= 0xF0; + + u8buf[0] = first3; + u8buf[1] = hi6; + u8buf[2] = mid6; + u8buf[3] = low6; + return 4; + } else { + throw_bad_code_point(codepoint); + } +} + +} // anon ns + +// TODO optimize inlined in header for case where u32_char is < 0x80? +void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) { + unsigned char u8buf[4]; + uint8_t u8bytes = encode_utf8_char(u32_char, u8buf); + out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes); +} + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h new file mode 100644 index 00000000000..8627b01ff6a --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once

#include <cstdint>
#include <string>
#include <string_view>
#include <vector>

namespace vespalib::fuzzy {

/**
 * Decodes a UTF-8 string into a vector with one UTF-32 code point per element.
 */
std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);

/**
 * As above, but for explicitly UTF-8-typed input strings.
 */
std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);

/**
 * Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and
 * appends it to `out_str`.
 *
 * Note that this will happily encode code points that aren't technically part of
 * the valid UTF-8 range, but which will still be correct in memcmp() byte-wise
 * ordering, which is the API contract we expose.
 *
 * In particular, this includes:
 *   - high/low surrogate ranges U+D800 through U+DFFF (surrogate pairs not allowed
 *     in UTF-8)
 *   - U+10FFFF + 1 (outside max code point range by one)
 *
 * ... So don't copy this function for use as a general UTF-8 emitter, as it is not
 * _technically_ conformant!
 */
void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char);

} // namespace vespalib::fuzzy