From 6d696cd634a6bd4ea1b4c046ade9bfc5b5d246a8 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Thu, 14 Sep 2023 14:48:13 +0000 Subject: Add support for case-insensitive matching to Levenshtein DFAs Adds matching modes `Cased` and `Uncased`. `Cased` requires UTF-32 code points to match exactly, and successor strings are guaranteed to be strictly higher than the source (candidate) string in `memcmp` order. This mirrors the behavior of the current DFA implementation. `Uncased` treats all characters as if they were lowercased, both for the target and source strings. The target (query) string is explicitly lowercased at DFA build-time to avoid duplicate work. Source strings are implicitly lowercased character by character on-demand during matching. Important ordering note: Successor strings for `Uncased` are generated _as if_ the source string was originally all in lowercase form. This requires some extra added handling when emitting successor prefixes, as we can't just blindly copy UTF-8 bytes from the source string as we do when matching in `Cased` mode. A new casing-dimension has been added to most parameterized unit tests. --- vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp | 222 +++++++++++++++------ vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h | 5 + .../vespalib/fuzzy/explicit_levenshtein_dfa.h | 17 +- .../vespalib/fuzzy/explicit_levenshtein_dfa.hpp | 21 +- .../vespalib/fuzzy/implicit_levenshtein_dfa.h | 12 +- .../vespalib/fuzzy/implicit_levenshtein_dfa.hpp | 11 +- .../src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp | 30 ++- .../src/vespa/vespalib/fuzzy/levenshtein_dfa.h | 40 +++- .../src/vespa/vespalib/fuzzy/match_algorithm.hpp | 60 +++++- .../src/vespa/vespalib/fuzzy/unicode_utils.cpp | 27 ++- vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h | 3 + 11 files changed, 344 insertions(+), 104 deletions(-) diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index 6966fd0b703..c466d0bfb3e 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -1,10 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include #include -#include +#include #include // For benchmarking purposes +#include +#include #include -#include #include #include #include @@ -18,38 +18,67 @@ namespace fs = std::filesystem; static std::string benchmark_dictionary; -struct LevenshteinDfaTest : TestWithParam { +using CasingAndDfaType = std::tuple; - static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); } +namespace { - static std::optional calculate(std::string_view left, std::string_view right, uint32_t threshold) { - auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type()); - auto maybe_match_lhs = dfa_lhs.match(right, nullptr); +std::optional do_calculate(std::string_view left, std::string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); + auto maybe_match_lhs = dfa_lhs.match(right, nullptr); - auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type()); - auto maybe_match_rhs = dfa_rhs.match(left, nullptr); + auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); + auto maybe_match_rhs = dfa_rhs.match(left, nullptr); - EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); - if (maybe_match_lhs.matches()) { - EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits()); - return {maybe_match_lhs.edits()}; - } - return std::nullopt; + EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); + if (maybe_match_lhs.matches()) { + EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits()); + return {maybe_match_lhs.edits()}; + } + return std::nullopt; +} + +std::optional do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold, + LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) +{ + std::string_view lhs_ch(reinterpret_cast(left.data()), left.size()); + std::string_view rhs_ch(reinterpret_cast(right.data()), right.size()); + return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type); +} + +} + +struct LevenshteinDfaTest : TestWithParam { + + static LevenshteinDfa::Casing casing() noexcept { return std::get<0>(GetParam()); } + static LevenshteinDfa::DfaType dfa_type() noexcept { return std::get<1>(GetParam()); } + + static std::string stringify_params(const TestParamInfo& info) { + std::ostringstream ss; + ss << std::get<0>(info.param) << '_' << std::get<1>(info.param); + return ss.str(); + } + + static std::optional calculate(std::string_view left, std::string_view right, uint32_t threshold) { + return do_calculate(left, right, threshold, casing(), dfa_type()); } static std::optional calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) { - std::string_view lhs_ch(reinterpret_cast(left.data()), left.size()); - std::string_view rhs_ch(reinterpret_cast(right.data()), right.size()); - return calculate(lhs_ch, rhs_ch, threshold); + return do_calculate(left, right, threshold, casing(), dfa_type()); } }; -INSTANTIATE_TEST_SUITE_P(AllDfaTypes, +// All the baseline DFA tests use lowercase only, so they should have the exact same outcome +// for both cased and uncased matching. +INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes, LevenshteinDfaTest, - Values(LevenshteinDfa::DfaType::Explicit, - LevenshteinDfa::DfaType::Implicit), - PrintToStringParamName()); + Combine(Values(LevenshteinDfa::Casing::Uncased, + LevenshteinDfa::Casing::Cased), + Values(LevenshteinDfa::DfaType::Explicit, + LevenshteinDfa::DfaType::Implicit)), + LevenshteinDfaTest::stringify_params); // Same as existing non-DFA Levenshtein tests, but with some added instantiations // for smaller max distances. @@ -101,7 +130,7 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std: } TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) { - auto dfa = LevenshteinDfa::build("food", 1, dfa_type()); + auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); test_dfa_successor(dfa, "", "\x01""food"); test_dfa_successor(dfa, "faa", "faod"); @@ -122,8 +151,8 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings // Also works with Unicode // 2 chars - test_dfa_successor(dfa, "\xc3\x86""x", // "Æx" - "\xc3\x87""food"); // "Çfood" + test_dfa_successor(dfa, "\xc3\xa6""x", // "æx" + "\xc3\xa7""food"); // "çfood" // 3 chars test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は" "\xe7\x8c\xac""food"); // "猬food" @@ -138,7 +167,7 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings } TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) { - auto dfa = LevenshteinDfa::build("food", 1, dfa_type()); + auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type()); // The successor string must be lexicographically larger than the input string. // In the presence of a wildcard output edge we handle this by increase the input // character by 1 and encoding it back as UTF-8. @@ -155,7 +184,7 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_ } TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) { - auto dfa = LevenshteinDfa::build("", 1, dfa_type()); + auto dfa = LevenshteinDfa::build("", 1, casing(), dfa_type()); test_dfa_successor(dfa, "aa", "b"); test_dfa_successor(dfa, "b\x01", "c"); test_dfa_successor(dfa, "vespa", "w"); @@ -170,12 +199,80 @@ TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) { EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2}); EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1}); EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt); - EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0}); + EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0}); } TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) { - EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument); - EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument); + EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, casing(), dfa_type()), std::invalid_argument); + EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, casing(), dfa_type()), std::invalid_argument); +} + +struct LevenshteinDfaCasingTest : TestWithParam { + static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); } + + static std::optional calculate_cased(std::string_view left, std::string_view right, uint32_t threshold) { + return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Cased, dfa_type()); + } + + static std::optional calculate_uncased(std::string_view left, std::string_view right, uint32_t threshold) { + return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Uncased, dfa_type()); + } +}; + +INSTANTIATE_TEST_SUITE_P(AllDfaTypes, + LevenshteinDfaCasingTest, + Values(LevenshteinDfa::DfaType::Explicit, + LevenshteinDfa::DfaType::Implicit), + PrintToStringParamName()); + +TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) { + for (auto max : {1, 2}) { + EXPECT_EQ(calculate_uncased("abc", "ABC", max), std::optional{0}) << max; + EXPECT_EQ(calculate_uncased("Abc", "aB1", max), std::optional{1}) << max; + EXPECT_EQ(calculate_uncased("aBC", "1bc", max), std::optional{1}) << max; + EXPECT_EQ(calculate_uncased("Abc", "a1C", max), std::optional{1}) << max; + EXPECT_EQ(calculate_uncased("aBc", "AB", max), std::optional{1}) << max; + EXPECT_EQ(calculate_uncased("ABC", "abcd", max), std::optional{1}) << max; + } + EXPECT_EQ(calculate_uncased("bc", "aBCd", 2), std::optional{2}); + EXPECT_EQ(calculate_uncased("ab", "AbCd", 2), std::optional{2}); + EXPECT_EQ(calculate_uncased("CD", "AbcD", 2), std::optional{2}); + EXPECT_EQ(calculate_uncased("ad", "AbcD", 2), std::optional{2}); +} + +TEST_P(LevenshteinDfaCasingTest, cased_edge_cases_have_correct_edit_distance) { + for (auto max : {1, 2}) { + EXPECT_EQ(calculate_cased("abc", "abC", max), std::optional{1}) << max; + EXPECT_EQ(calculate_cased("Abc", "aB1", max), std::nullopt) << max; + EXPECT_EQ(calculate_cased("aBC", "1bc", max), std::nullopt) << max; + EXPECT_EQ(calculate_cased("Abc", "a1C", max), std::nullopt) << max; + EXPECT_EQ(calculate_cased("ABC", "abcd", max), std::nullopt) << max; + } + EXPECT_EQ(calculate_cased("abc", "ABC", 2), std::nullopt); + EXPECT_EQ(calculate_cased("abc", "aBC", 2), std::optional{2}); + EXPECT_EQ(calculate_cased("bc", "aBCd", 2), std::nullopt); + EXPECT_EQ(calculate_cased("ab", "AbCd", 2), std::nullopt); + EXPECT_EQ(calculate_cased("CD", "AbcD", 2), std::nullopt); + EXPECT_EQ(calculate_cased("ad", "AbcD", 2), std::nullopt); + EXPECT_EQ(calculate_cased("ad", "aBCd", 2), std::optional{2}); +} + +TEST_P(LevenshteinDfaCasingTest, uncased_successor_is_emitted_as_if_match_term_was_lowercased) { + auto dfa = LevenshteinDfa::build("FOOD", 1, LevenshteinDfa::Casing::Uncased, dfa_type()); + // This is a subset of the other successor test cases + test_dfa_successor(dfa, "", "\x01""food"); + test_dfa_successor(dfa, "FAA", "faod"); + test_dfa_successor(dfa, "fOoOoO", "foop"); + test_dfa_successor(dfa, "OOOf", "pfood"); + test_dfa_successor(dfa, "Fo", "fo\x01""d"); + test_dfa_successor(dfa, "oO", "ood"); + test_dfa_successor(dfa, "OOO", "oood"); + test_dfa_successor(dfa, "FOXX", "foyd"); + test_dfa_successor(dfa, "GG", "good"); + test_dfa_successor(dfa, "Gp", "hfood"); + test_dfa_successor(dfa, "EP", "f\x01""od"); + test_dfa_successor(dfa, "Hfoodz", "hood"); + test_dfa_successor(dfa, "Aooodz", "bfood"); } // Turn integer v into its bitwise string representation with the MSB as the leftmost character. @@ -191,20 +288,22 @@ std::string bits_to_str(T v) { return ret; } -using DfaTypeAndMaxEdits = std::tuple; +using CasingAndDfaTypeAndMaxEdits = std::tuple; -struct LevenshteinDfaSuccessorTest : TestWithParam { - // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number. +struct LevenshteinDfaSuccessorTest : TestWithParam { + // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number. static std::string stringify_params(const TestParamInfo& info) { std::ostringstream ss; - ss << std::get<0>(info.param) << '_' << std::get<1>(info.param); + ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param); return ss.str(); } }; INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, LevenshteinDfaSuccessorTest, - Combine(Values(LevenshteinDfa::DfaType::Explicit, + Combine(Values(LevenshteinDfa::Casing::Uncased, + LevenshteinDfa::Casing::Cased), + Values(LevenshteinDfa::DfaType::Explicit, LevenshteinDfa::DfaType::Implicit), Values(1, 2)), LevenshteinDfaSuccessorTest::stringify_params); @@ -225,10 +324,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits, * Inspired by approach used by Lucene DFA exhaustive testing. */ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { - const auto [dfa_type, max_edits] = GetParam(); + const auto [casing, dfa_type, max_edits] = GetParam(); for (uint32_t i = 0; i < 256; ++i) { const auto target = bits_to_str(static_cast(i)); - auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type); + auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type); std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast(j)); @@ -298,11 +397,10 @@ enum class BenchmarkType { }; const char* to_s(BenchmarkType t) noexcept { - // Note: need underscores since this is used as part of GTest-generated test instance names switch (t) { - case BenchmarkType::DfaExplicit: return "DFA_explicit"; - case BenchmarkType::DfaImplicit: return "DFA_implicit"; - case BenchmarkType::Legacy: return "legacy"; + case BenchmarkType::DfaExplicit: return "DfaExplicit"; + case BenchmarkType::DfaImplicit: return "DfaImplicit"; + case BenchmarkType::Legacy: return "Legacy"; } abort(); } @@ -315,10 +413,14 @@ const char* to_s(BenchmarkType t) noexcept { return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64}; } -struct LevenshteinBenchmarkTest : TestWithParam { +using BenchmarkTypeAndCasing = std::tuple; + +struct LevenshteinBenchmarkTest : TestWithParam { static std::string stringify_params(const TestParamInfo& info) { - return to_s(info.param); + std::ostringstream ss; + ss << to_s(std::get<0>(info.param)) << '_' << std::get<1>(info.param); + return ss.str(); } void SetUp() override { @@ -327,7 +429,8 @@ struct LevenshteinBenchmarkTest : TestWithParam { } } - static BenchmarkType benchmark_type() noexcept { return GetParam(); } + static BenchmarkType benchmark_type() noexcept { return std::get<0>(GetParam()); } + static LevenshteinDfa::Casing casing() noexcept { return std::get<1>(GetParam()); } static const std::vector& load_dictionary_once() { static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary)); @@ -351,9 +454,11 @@ struct LevenshteinBenchmarkTest : TestWithParam { INSTANTIATE_TEST_SUITE_P(AllDfaTypes, LevenshteinBenchmarkTest, - Values(BenchmarkType::DfaExplicit, - BenchmarkType::DfaImplicit, - BenchmarkType::Legacy), + Combine(Values(BenchmarkType::DfaExplicit, + BenchmarkType::DfaImplicit, + BenchmarkType::Legacy), + Values(LevenshteinDfa::Casing::Cased, + LevenshteinDfa::Casing::Uncased)), LevenshteinBenchmarkTest::stringify_params); // ("abc", 1) => "a" @@ -382,12 +487,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t // string must be matched and any sparse representation is always maximally filled since // we never expend any edits via mismatches. // Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.). - std::string str = repeated_string("abcde", sz); + std::string str = repeated_string("aBcDeFgHiJ", sz); double min_time_s; if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) { auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit : LevenshteinDfa::DfaType::Implicit; - auto dfa = LevenshteinDfa::build(str, k, dfa_type); + auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type); min_time_s = BenchmarkTimer::benchmark([&] { auto res = dfa.match(str, nullptr); // not benchmarking successor generation do_not_optimize_away(res); @@ -408,15 +513,16 @@ TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) { if (!benchmarking_enabled()) { GTEST_SKIP() << "benchmarking not enabled"; } + const auto casing = LevenshteinDfa::Casing::Cased; // For building, casing only affects initial string normalization using vespalib::BenchmarkTimer; for (uint8_t k : {1, 2}) { for (uint32_t sz : string_lengths()) { - std::string str = repeated_string("abcde", sz); + std::string str = repeated_string("aBcDeFgHiJ", sz); double min_time_s = BenchmarkTimer::benchmark([&] { - auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit); + auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit); do_not_optimize_away(dfa); }, 2.0); - auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit); + auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit); size_t mem_usage = dfa.memory_usage(); fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage); } @@ -431,12 +537,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) { fprintf(stderr, "------ %s ------\n", to_s(type)); for (uint8_t k : {1, 2}) { for (uint32_t sz : target_lengths) { - std::string str = repeated_string("abcde", sz); + std::string str = repeated_string("aBcDeFgHiJ", sz); double min_time_s; if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) { auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit : LevenshteinDfa::DfaType::Implicit; - auto dfa = LevenshteinDfa::build(str, k, dfa_type); + auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type); min_time_s = BenchmarkTimer::benchmark([&] { for (const auto& line : dict) { auto res = dfa.match(line, nullptr); @@ -447,7 +553,9 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) { min_time_s = BenchmarkTimer::benchmark([&] { auto target_u32 = utf8_string_to_utf32(str); for (const auto& line : dict) { - auto line_u32 = utf8_string_to_utf32(line); + std::vector line_u32 = ((casing() == LevenshteinDfa::Casing::Uncased) + ? vespalib::LowerCase::convert_to_ucs4(std::string_view(str)) + : utf8_string_to_utf32(line)); auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k); do_not_optimize_away(res); } @@ -472,7 +580,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) { std::string str = repeated_string("abcde", sz); auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit : LevenshteinDfa::DfaType::Implicit; - auto dfa = LevenshteinDfa::build(str, k, dfa_type); + auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type); double min_time_s = BenchmarkTimer::benchmark([&] { auto iter = dict.cbegin(); auto end = dict.cend(); diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h index c445c60cc01..405371462ab 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h @@ -13,6 +13,11 @@ concept DfaMatcher = requires(T a) { typename T::StateParamType; typename T::EdgeType; + // Whether the matching is case-sensitive or not. If false, all source string code points will + // be implicitly lower-cased prior to state stepping. For case-insensitive (i.e. uncased) + // matching to have the expected semantics, the actual target string must be pre-lowercased. + { a.is_cased() } -> std::same_as; + // Initial (starting) state of the DFA { a.start() } -> std::same_as; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h index 8a4c9bab2b4..6e8068a7419 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -96,8 +96,11 @@ public: using MatchResult = LevenshteinDfa::MatchResult; private: std::vector _nodes; + const bool _is_cased; public: - ExplicitLevenshteinDfaImpl() noexcept = default; + explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept + : _is_cased(is_cased) + {} ~ExplicitLevenshteinDfaImpl() override = default; static constexpr uint8_t max_edits() noexcept { return MaxEdits; } @@ -131,14 +134,12 @@ public: template class ExplicitLevenshteinDfaBuilder { - std::vector _u32_str_buf; // TODO std::u32string + const std::vector _u32_str_buf; // TODO std::u32string + const bool _is_cased; public: - explicit ExplicitLevenshteinDfaBuilder(std::string_view str) - : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str)) - {} - - explicit ExplicitLevenshteinDfaBuilder(std::vector str) noexcept - : _u32_str_buf(std::move(str)) + ExplicitLevenshteinDfaBuilder(std::vector str, bool is_cased) noexcept + : _u32_str_buf(std::move(str)), + _is_cased(is_cased) {} [[nodiscard]] LevenshteinDfa build_dfa() const; diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp index 9a5d3d1bdc7..16b1021de7b 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -21,13 +21,17 @@ struct ExplicitDfaMatcher { using StateParamType = const DfaNodeType*; const std::span _nodes; + const bool _is_cased; - explicit ExplicitDfaMatcher(const std::span nodes) noexcept - : _nodes(nodes) + ExplicitDfaMatcher(const std::span nodes, bool is_cased) noexcept + : _nodes(nodes), + _is_cased(is_cased) {} static constexpr uint8_t max_edits() noexcept { return MaxEdits; } + bool is_cased() const noexcept { return _is_cased; } + StateType start() const noexcept { return &_nodes[0]; } @@ -80,7 +84,7 @@ struct ExplicitDfaMatcher { template LevenshteinDfa::MatchResult ExplicitLevenshteinDfaImpl::match(std::string_view u8str, std::string* successor_out) const { - ExplicitDfaMatcher matcher(_nodes); + ExplicitDfaMatcher matcher(_nodes, _is_cased); return MatchAlgorithm::match(matcher, u8str, successor_out); } @@ -164,9 +168,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase { using Base::is_match; using Base::can_match; using Base::transitions; + + const bool _is_cased; public: - explicit ExplicitLevenshteinDfaBuilderImpl(std::span str) noexcept - : DfaSteppingBase(str) + ExplicitLevenshteinDfaBuilderImpl(std::span str, bool is_cased) noexcept + : DfaSteppingBase(str), + _is_cased(is_cased) { assert(str.size() < UINT32_MAX / max_out_edges_per_node()); } @@ -181,7 +188,7 @@ public: template LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl::build_dfa() const { - auto dfa = std::make_unique>(); + auto dfa = std::make_unique>(_is_cased); ExploreState exp; // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_ // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth. @@ -221,7 +228,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl::build_dfa() const { template LevenshteinDfa ExplicitLevenshteinDfaBuilder::build_dfa() const { - ExplicitLevenshteinDfaBuilderImpl builder(_u32_str_buf); + ExplicitLevenshteinDfaBuilderImpl builder(_u32_str_buf, _is_cased); return builder.build_dfa(); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h index 0846b95d135..8ef521ba14f 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -9,16 +9,14 @@ namespace vespalib::fuzzy { template class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl { - std::vector _u32_str_buf; // TODO std::u32string + const std::vector _u32_str_buf; // TODO std::u32string + const bool _is_cased; public: using MatchResult = LevenshteinDfa::MatchResult; - explicit ImplicitLevenshteinDfa(std::string_view str) - : ImplicitLevenshteinDfa(utf8_string_to_utf32(str)) - {} - - explicit ImplicitLevenshteinDfa(std::vector str) noexcept - : _u32_str_buf(std::move(str)) + ImplicitLevenshteinDfa(std::vector str, bool is_cased) noexcept + : _u32_str_buf(std::move(str)), + _is_cased(is_cased) {} ~ImplicitLevenshteinDfa() override = default; diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp index 4ee468e424b..e962dba0f18 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp @@ -29,12 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase { using Base::is_match; using Base::can_match; - explicit ImplicitDfaMatcher(std::span u32_str) noexcept - : Base(u32_str) + const bool _is_cased; + + ImplicitDfaMatcher(std::span u32_str, bool is_cased) noexcept + : Base(u32_str), + _is_cased(is_cased) {} // start, is_match, can_match, match_edit_distance are all provided by base type + bool is_cased() const noexcept { return _is_cased; } + template bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) { for (uint32_t i = 0; i < state.size(); ++i) { @@ -109,7 +114,7 @@ struct ImplicitDfaMatcher : public DfaSteppingBase { template LevenshteinDfa::MatchResult ImplicitLevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const { - ImplicitDfaMatcher matcher(_u32_str_buf); + ImplicitDfaMatcher matcher(_u32_str_buf, _is_cased); return MatchAlgorithm::match(matcher, u8str, successor_out); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index e75ef8365bf..77691ffefd8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -2,6 +2,7 @@ #include "explicit_levenshtein_dfa.h" #include "implicit_levenshtein_dfa.h" #include "levenshtein_dfa.h" +#include "unicode_utils.h" #include #include @@ -29,27 +30,30 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const { _impl->dump_as_graphviz(out); } -LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) { +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) { if (max_edits != 1 && max_edits != 2) { throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits)); } + const bool is_cased = (casing == Casing::Cased); + auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string) + : utf8_string_to_utf32_lowercased(target_string); if (dfa_type == DfaType::Implicit) { if (max_edits == 1) { - return LevenshteinDfa(std::make_unique>>(target_string)); + return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased)); } else { // max_edits == 2 - return LevenshteinDfa(std::make_unique>>(target_string)); + return LevenshteinDfa(std::make_unique>>(std::move(target_string_u32), is_cased)); } } else { // DfaType::Explicit if (max_edits == 1) { - return ExplicitLevenshteinDfaBuilder>(target_string).build_dfa(); + return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased).build_dfa(); } else { // max_edits == 2 - return ExplicitLevenshteinDfaBuilder>(target_string).build_dfa(); + return ExplicitLevenshteinDfaBuilder>(std::move(target_string_u32), is_cased).build_dfa(); } } } -LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) { +LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) { // TODO automatically select implementation based on target length/max edits? // Suggestion: // - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64). @@ -58,7 +62,7 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max // an M1 Pro; your mileage may vary etc). // Ideally the implicit DFA would always be the fastest (or at least approximately as // fast as the explicit DFA), but this is not yet the case. - return build(target_string, max_edits, DfaType::Implicit); + return build(target_string, max_edits, casing, DfaType::Implicit); } std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) { @@ -70,7 +74,7 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo return os; } -std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) { +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) { if (dt == LevenshteinDfa::DfaType::Implicit) { os << "Implicit"; } else { @@ -80,4 +84,14 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) { return os; } +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) { + if (c == LevenshteinDfa::Casing::Uncased) { + os << "Uncased"; + } else { + assert(c == LevenshteinDfa::Casing::Cased); + os << "Cased"; + } + return os; +} + } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index a26ccbe87ee..a8b2b77f647 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -182,6 +182,13 @@ public: * memcmp()-ordering of strings and not whether they are technically valid Unicode. * This should be the case for low-level dictionary data structures etc. * + * Ordering notes: when the DFA is created as Uncased, the target and source strings + * are treated as lowercase when matching. The successor string is in this case generated + * _as if_ the input source string was originally lowercase. This is a special case + * intended for case-folded dictionaries that implicitly order strings by their lowercased + * form, but where they are stored in their original cased form. The (raw) cased form + * is what is passed to the DFA match() function. + * * Memory allocation: * This function does not directly or indirectly allocate any heap memory if either: * @@ -204,6 +211,29 @@ public: Explicit }; + /** + * Specifies the character case matching semantics of the DFA. + */ + enum class Casing { + /** + * Characters are case-normalized (i.e. lowercased) prior to matching. + * Example: In uncased mode, 'A' and 'a' are considered exact matches and do not + * consume edits. + * + * See the ordering notes for `match()` on what Uncased implies when generating + * successor strings. + */ + Uncased, + /** + * Characters are not case-normalized prior to matching. + * Example: 'A' and 'a' are considered separate characters and will consume edits. + * + * Cased mode preserves strict memcmp() UTF-8 ordering between source and successor + * strings. + */ + Cased + }; + /** * Builds and returns a Levenshtein DFA that matches all strings within `max_edits` * edits of `target_string`. The type of DFA returned is specified by dfa_type. @@ -212,14 +242,13 @@ public: * * `target_string` must not contain any null UTF-8 chars. */ - [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, - uint8_t max_edits, - DfaType dfa_type); + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, + Casing casing, DfaType dfa_type); /** * Same as build() but currently always returns an implicit DFA. */ - [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits); + [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, Casing casing); /** * Dumps the DFA as a Graphviz graph in text format to the provided output stream. @@ -239,6 +268,7 @@ public: }; std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos); -std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt); +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt); +std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index 206b69f8ebe..d9c4e0ffe36 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -4,6 +4,7 @@ #include "dfa_matcher.h" #include "levenshtein_dfa.h" #include "unicode_utils.h" +#include #include #include #include @@ -139,7 +140,8 @@ struct MatchAlgorithm { * Both the input and successor output strings are in UTF-8 format. To avoid doing * duplicate work, we keep track of the byte length of the string prefix that will be * part of the successor and simply copy it verbatim instead of building the string - * from converted UTF-32 -> UTF-8 chars as we go. + * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used + * when one or more of the prefix characters have been lowercase-transformed. * * TODO we could probably also optimize the smallest suffix generation with this when * we know we can no longer insert any smaller char substitutions and the only way @@ -147,6 +149,10 @@ struct MatchAlgorithm { * - To do this we'd need both the original UTF-8 target string as well as a * secondary vector that maps u32 character index to the corresponding UTF-8 index. * Both trivial to get as part of DFA initialization. + * + * TODO let matcher know if source string is pre-normalized (i.e. lowercased). + * TODO std::u32string output; no need to re-encode to UTF-8. + * TODO consider opportunistically appending prefix as we go instead of only when needed. */ template static MatchResult match(const Matcher& matcher, @@ -154,15 +160,20 @@ struct MatchAlgorithm { std::string* successor_out) { using StateType = typename Matcher::StateType; - vespalib::Utf8Reader u8_reader(source.data(), source.size()); + Utf8Reader u8_reader(source.data(), source.size()); uint32_t n_prefix_u8_bytes = 0; uint32_t char_after_prefix = 0; StateType last_state_with_higher_out = StateType{}; + bool can_use_raw_prefix = true; StateType state = matcher.start(); while (u8_reader.hasMore()) { const auto u8_pos_before_char = u8_reader.getPos(); - const uint32_t mch = u8_reader.getChar(); + const uint32_t raw_mch = u8_reader.getChar(); + const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); + if (raw_mch != mch) { + can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix + } if (successor_out && matcher.has_higher_out_edge(state, mch)) { last_state_with_higher_out = state; n_prefix_u8_bytes = u8_pos_before_char; @@ -174,7 +185,8 @@ struct MatchAlgorithm { } else { // Can never match; find the successor if requested if (successor_out) { - *successor_out = source.substr(0, n_prefix_u8_bytes); + emit_successor_prefix(*successor_out, source, n_prefix_u8_bytes, + matcher.is_cased() || can_use_raw_prefix); assert(matcher.valid_state(last_state_with_higher_out)); backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out, char_after_prefix, *successor_out); @@ -187,7 +199,8 @@ struct MatchAlgorithm { return MatchResult::make_match(max_edits(), edits); } if (successor_out) { - *successor_out = source; + emit_successor_prefix(*successor_out, source, source.size(), + matcher.is_cased() || can_use_raw_prefix); emit_smallest_matching_suffix(matcher, state, *successor_out); } return MatchResult::make_mismatch(max_edits()); @@ -286,6 +299,43 @@ struct MatchAlgorithm { } } } + + /** + * The successor prefix is the prefix of the source string up to (but not including) the + * point where we emit a lexicographically higher character. Ideally we can just copy the + * UTF-8 bytes verbatim from the source into the successor. This is possible when one of + * the following holds: + * + * - DFA uses Cased (i.e. exact) matching, or + * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase + * transform. This means the prefix is already as-if lowercased, and we can copy it + * verbatim. + * + * In the case that we can't copy verbatim, we currently have to explicitly normalize the + * prefix by converting it to its lowercased form. + * + * Example: Uncased matching for target "food" with input "FOXX". This generates the + * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different + * ordering when compared byte-wise against an implicitly lowercased dictionary. + */ + static void emit_successor_prefix(std::string& successor_out, std::string_view source, + uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes) + { + if (emit_raw_prefix_u8_bytes) { + successor_out = source.substr(0, n_prefix_u8_bytes); + } else { + // TODO avoid duplicate work...! :I + successor_out.clear(); + Utf8Reader u8_reader(source.data(), source.size()); + while (u8_reader.getPos() < n_prefix_u8_bytes) { + append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar())); + } + } + } + + static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept { + return (is_cased ? in_ch : LowerCase::convert(in_ch)); + } }; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp index 648be234562..f2c6259fd10 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp @@ -1,22 +1,41 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "unicode_utils.h" +#include #include #include #include namespace vespalib::fuzzy { -std::vector utf8_string_to_utf32(std::string_view str) { - vespalib::stringref ch_str(str.data(), str.size()); - vespalib::Utf8Reader utf8_reader(ch_str); +namespace { + +template +std::vector utf8_string_to_utf32_impl(std::string_view str) { + stringref ch_str(str.data(), str.size()); + Utf8Reader utf8_reader(ch_str); // TODO consider integrating simdutf library std::vector u32ret; u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII while (utf8_reader.hasMore()) { - u32ret.emplace_back(utf8_reader.getChar()); + if constexpr (ToLowercase) { + u32ret.emplace_back(LowerCase::convert(utf8_reader.getChar())); + } else { + u32ret.emplace_back(utf8_reader.getChar()); + } + } return u32ret; } +} + +std::vector utf8_string_to_utf32_lowercased(std::string_view str) { + return utf8_string_to_utf32_impl(str); +} + +std::vector utf8_string_to_utf32(std::string_view str) { + return utf8_string_to_utf32_impl(str); +} + std::vector utf8_string_to_utf32(std::u8string_view u8str) { return utf8_string_to_utf32(std::string_view(reinterpret_cast(u8str.data()), u8str.size())); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h index 8627b01ff6a..711c4f1fcd0 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h @@ -8,6 +8,9 @@ namespace vespalib::fuzzy { +// UTF-8 -> UTF-32 conversion with lowercasing of all characters +std::vector utf8_string_to_utf32_lowercased(std::string_view str); +// UTF-8 -> UTF-32 conversion without case conversion std::vector utf8_string_to_utf32(std::string_view str); std::vector utf8_string_to_utf32(std::u8string_view u8str); -- cgit v1.2.3