diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-09-18 16:19:57 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-18 16:19:57 +0200 |
commit | eede788d1c20d2f246f44287308dad69487369ea (patch) | |
tree | 71406c55b9310a79bb60f93e57314729001c8500 | |
parent | 5132cfd082fbc0ef2f0c7c4a760925b613cc975e (diff) | |
parent | 4a004d2e8d472c660cc9c2b73ee637c5247968da (diff) |
Merge pull request #28558 from vespa-engine/vekterli/dfa-successor-generation-optimization
Levenshtein DFA successor generation optimization and UTF-32 output
13 files changed, 249 insertions, 54 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index 072eb5e1333..6b873020994 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -25,7 +25,7 @@ public: template <typename DictionaryConstIteratorType> bool is_match(const char* word, DictionaryConstIteratorType& itr, const DfaStringComparator::DataStoreType& data_store) { - auto match = _dfa.match(word, &_successor); + auto match = _dfa.match(word, _successor); if (match.matches()) { return true; } else { diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp index c466d0bfb3e..c235cb99509 100644 --- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp +++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp @@ -26,10 +26,10 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type) { auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type); - auto maybe_match_lhs = dfa_lhs.match(right, nullptr); + auto maybe_match_lhs = dfa_lhs.match(right); auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type); - auto maybe_match_rhs = dfa_rhs.match(left, nullptr); + auto maybe_match_rhs = dfa_rhs.match(left); EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches()); if (maybe_match_lhs.matches()) { @@ -47,6 +47,11 @@ std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type); } +void expect_utf32_string_code_point_equal_to_utf8(std::span<const uint32_t> u32str, const std::string& u8str) { + auto as_utf8 = utf32_string_to_utf8(u32str); + EXPECT_EQ(as_utf8, u8str); +} + } struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> { @@ -119,14 +124,20 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) { void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) { std::string successor; - auto m = dfa.match(source, &successor); + auto m = dfa.match(source, successor); if (m.matches()) { FAIL() << "Expected '" << source << "' to emit a successor, but it " << "matched with " << static_cast<uint32_t>(m.edits()) << " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)"; } EXPECT_EQ(successor, expected_successor); - EXPECT_TRUE(dfa.match(successor, nullptr).matches()); + EXPECT_TRUE(dfa.match(successor).matches()); + + // Make sure the UTF-32 successor output is codepoint-wise identical to the UTF-8 successor + std::vector<uint32_t> u32successor; + m = dfa.match(source, u32successor); + EXPECT_FALSE(m.matches()); + expect_utf32_string_code_point_equal_to_utf8(u32successor, successor); } TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) { @@ -331,7 +342,7 @@ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) { std::string skip_to, successor; for (uint32_t j = 0; j < 256; ++j) { const auto source = bits_to_str(static_cast<uint8_t>(j)); - auto maybe_match = target_dfa.match(source, &successor); + auto maybe_match = target_dfa.match(source, successor); if (maybe_match.matches() && !skip_to.empty()) { ASSERT_GE(source, skip_to); } else if (!maybe_match.matches()) { @@ -494,7 +505,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t : LevenshteinDfa::DfaType::Implicit; auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type); min_time_s = BenchmarkTimer::benchmark([&] { - auto res = dfa.match(str, nullptr); // not benchmarking successor generation + auto res = dfa.match(str); // not benchmarking successor generation do_not_optimize_away(res); }, 1.0); } else { @@ -545,7 +556,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) { auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type); min_time_s = BenchmarkTimer::benchmark([&] { for (const auto& line : dict) { - auto res = dfa.match(line, nullptr); + auto res = dfa.match(line); do_not_optimize_away(res); } }, 2.0); @@ -586,7 +597,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) { auto end = dict.cend(); std::string successor; while (iter != end) { - auto maybe_match = dfa.match(*iter, &successor); + auto maybe_match = dfa.match(*iter, successor); if (maybe_match.matches()) { ++iter; } else { diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h index 405371462ab..6c7e8a35b4e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h +++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h @@ -3,12 +3,14 @@ #include <concepts> #include <cstdint> +#include <string> +#include <vector> namespace vespalib::fuzzy { // Concept that all DFA matcher implementations must satisfy template <typename T> -concept DfaMatcher = requires(T a) { +concept DfaMatcher = requires(T a, std::string u8str, std::vector<uint32_t> u32str) { typename T::StateType; typename T::StateParamType; typename T::EdgeType; @@ -44,7 +46,7 @@ concept DfaMatcher = requires(T a) { // match any character in the target string (i.e. is always a mismatch). { a.match_wildcard(typename T::StateType{}) } -> std::same_as<typename T::StateType>; - // Whether there is exists an out edge from the given state that can accept a + // Whether there exists an out edge from the given state that can accept a // _higher_ UTF-32 code point value (character) than the input u32 value. Such an // edge _may_ be a wildcard edge, which accepts any character. { a.has_higher_out_edge(typename T::StateType{}, uint32_t{}) } -> std::same_as<bool>; @@ -70,6 +72,24 @@ concept DfaMatcher = requires(T a) { // Returns the state that is the result of following the given edge from the given state. { a.edge_to_state(typename T::StateType{}, typename T::EdgeType{}) } -> std::same_as<typename T::StateType>; + + // Returns true iff the only way for the remaining input string to match the target string + // is for each subsequent character to match exactly. More precisely, this means that it is + // not possible to perform any more edits on the string. This will be the case if the + // current row of the Levenshtein matrix contains only 1 entry <= max_edits, and its cost + // is equal to max_edits. + // It is OK for an implementation to always return false. In this case, a slower code path + // (per-state stepping and character output) will be used for emitting the suffix. + { a.implies_exact_match_suffix(typename T::StateType{}) } -> std::same_as<bool>; + + // Verbatim emit a suffix of the target string that will turn the prefix represented + // by the input state concatenated with the suffix into a matching string. + // Precondition: implies_match_suffix(state) == true, i.e. the state is guaranteed to + // afford no edits anywhere. + { a.emit_exact_match_suffix(typename T::StateType{}, u8str) } -> std::same_as<void>; + + // Same as above, but for raw UTF-32 code point output + { a.emit_exact_match_suffix(typename T::StateType{}, u32str) } -> std::same_as<void>; }; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h index 6e8068a7419..630e07738fb 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h @@ -125,11 +125,16 @@ public: [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override; + [[nodiscard]] size_t memory_usage() const noexcept override { return sizeof(DfaNodeType) * _nodes.size(); } void dump_as_graphviz(std::ostream& os) const override; +private: + template <typename SuccessorT> + [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const; }; template <typename Traits> diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp index 16b1021de7b..7860d841fbf 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp @@ -79,16 +79,41 @@ struct ExplicitDfaMatcher { StateType edge_to_state([[maybe_unused]] StateType node, EdgeType edge) const noexcept { return &_nodes[edge->node]; } + constexpr bool implies_exact_match_suffix(const StateType&) const noexcept { + // TODO; caller will currently just fall back to explicit state stepping + return false; + } + void emit_exact_match_suffix(const StateType&, std::string&) const { + // TODO (will never be called as long as `implies_exact_match_suffix()` returns false) + abort(); + } + void emit_exact_match_suffix(const StateType&, std::vector<uint32_t>&) const { + // TODO (will never be called as long as `implies_exact_match_suffix()` returns false) + abort(); + } }; template <uint8_t MaxEdits> +template <typename SuccessorT> LevenshteinDfa::MatchResult -ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const { +ExplicitLevenshteinDfaImpl<MaxEdits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const { ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased); return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out); } template <uint8_t MaxEdits> +LevenshteinDfa::MatchResult +ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const { + return match_impl(u8str, successor_out); +} + +template <uint8_t MaxEdits> +LevenshteinDfa::MatchResult +ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const { + return match_impl(u8str, successor_out); +} + +template <uint8_t MaxEdits> void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const { os << std::dec << "digraph levenshtein_dfa {\n"; os << " fontname=\"Helvetica,Arial,sans-serif\"\n"; @@ -101,7 +126,7 @@ void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) co } for (const auto& edge : node.match_out_edges()) { std::string as_utf8; - append_utf32_char_as_utf8(as_utf8, edge.u32ch); + append_utf32_char(as_utf8, edge.u32ch); os << " " << i << " -> " << edge.node << " [label=\"" << as_utf8 << "\"];\n"; } if (node.wildcard_edge_to != DOOMED) { diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp index 8b9d2eddcac..031973fd35d 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp @@ -1,8 +1,34 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "implicit_levenshtein_dfa.hpp" +#include "unicode_utils.h" +#include <cassert> namespace vespalib::fuzzy { +/** + * Builds two separate vectors that exist alongside the (possibly case-normalized) UTF-32 + * target string: + * + * - The UTF-8 representation of the (possibly case-normalized) target string + * - An offset vector that maps each UTF-32 string index to the first byte of the + * equivalent UTF-8 character. + * + * These are used for efficiently dumping an UTF-8 target string suffix from an UTF-32 + * target string index. + */ +template <typename Traits> +void ImplicitLevenshteinDfa<Traits>::precompute_utf8_target_with_offsets() { + _target_utf8_char_offsets.reserve(_u32_str_buf.size()); + _target_as_utf8.reserve(_u32_str_buf.size()); // Under-reserves for all non-ASCII (TODO count via simdutf?) + // Important: the precomputed UTF-8 target is based on the potentially case-normalized + // target string, ensuring that we don't emit raw target chars for uncased successors. + for (uint32_t u32ch : _u32_str_buf) { + _target_utf8_char_offsets.emplace_back(static_cast<uint32_t>(_target_as_utf8.size())); + append_utf32_char(_target_as_utf8, u32ch); + } + assert(_target_as_utf8.size() < UINT32_MAX); +} + template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>; template class ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>; diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h index 8ef521ba14f..bb4a0918593 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h @@ -10,24 +10,37 @@ namespace vespalib::fuzzy { template <typename Traits> class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl { const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string + std::string _target_as_utf8; + std::vector<uint32_t> _target_utf8_char_offsets; const bool _is_cased; public: using MatchResult = LevenshteinDfa::MatchResult; - ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) noexcept + ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) : _u32_str_buf(std::move(str)), + _target_as_utf8(), + _target_utf8_char_offsets(), _is_cased(is_cased) - {} + { + precompute_utf8_target_with_offsets(); + } ~ImplicitLevenshteinDfa() override = default; [[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override; + [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override; + [[nodiscard]] size_t memory_usage() const noexcept override { return _u32_str_buf.size() * sizeof(uint32_t); } void dump_as_graphviz(std::ostream& os) const override; +private: + template <typename SuccessorT> + [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const; + + void precompute_utf8_target_with_offsets(); }; } diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp index e962dba0f18..25fd3fdcc4e 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp @@ -29,10 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { using Base::is_match; using Base::can_match; - const bool _is_cased; - - ImplicitDfaMatcher(std::span<const uint32_t> u32_str, bool is_cased) noexcept + std::span<const char> _target_as_utf8; + std::span<const uint32_t> _target_utf8_char_offsets; + const bool _is_cased; + + ImplicitDfaMatcher(std::span<const uint32_t> u32_str, + std::span<const char> target_as_utf8, + std::span<const uint32_t> target_utf8_char_offsets, + bool is_cased) noexcept : Base(u32_str), + _target_as_utf8(target_as_utf8), + _target_utf8_char_offsets(target_utf8_char_offsets), _is_cased(is_cased) {} @@ -109,16 +116,45 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> { StateType edge_to_state(const StateType& state, EdgeType edge) const noexcept { return step(state, edge); } + bool implies_exact_match_suffix(const StateType& state) const noexcept { + // Only one entry in the sparse matrix row and it implies no edits can be done. + // I.e. only way to match the target string suffix is to match it _exactly_. + return (state.size() == 1 && state.cost(0) == max_edits()); + } + // Precondition: implies_match_suffix(state) + void emit_exact_match_suffix(const StateType& state, std::string& u8_out) const { + const uint32_t target_u8_offset = _target_utf8_char_offsets[state.index(0)]; + u8_out.append(_target_as_utf8.data() + target_u8_offset, _target_as_utf8.size() - target_u8_offset); + } + void emit_exact_match_suffix(const StateType& state, std::vector<uint32_t>& u32_out) const { + // TODO ranged insert + for (uint32_t i = state.index(0); i < _u32_str.size(); ++i) { + u32_out.push_back(_u32_str[i]); + } + } }; template <typename Traits> +template <typename SuccessorT> LevenshteinDfa::MatchResult -ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const { - ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _is_cased); +ImplicitLevenshteinDfa<Traits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const { + ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased); return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out); } template <typename Traits> +LevenshteinDfa::MatchResult +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const { + return match_impl(u8str, successor_out); +} + +template <typename Traits> +LevenshteinDfa::MatchResult +ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const { + return match_impl(u8str, successor_out); +} + +template <typename Traits> void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const { throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA"); } diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp index 77691ffefd8..6e38821851b 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp @@ -18,8 +18,18 @@ LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default; LevenshteinDfa::~LevenshteinDfa() = default; LevenshteinDfa::MatchResult -LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const { - return _impl->match(u8str, successor_out); +LevenshteinDfa::match(std::string_view u8str) const { + return _impl->match(u8str, static_cast<std::vector<uint32_t>*>(nullptr)); // TODO rewire +} + +LevenshteinDfa::MatchResult +LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const { + return _impl->match(u8str, &successor_out); +} + +LevenshteinDfa::MatchResult +LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const { + return _impl->match(u8str, &successor_out); } size_t LevenshteinDfa::memory_usage() const noexcept { diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index a8b2b77f647..85ad98e2a09 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -6,6 +6,7 @@ #include <memory> #include <string> #include <string_view> +#include <vector> namespace vespalib::fuzzy { @@ -140,6 +141,7 @@ public: struct Impl { virtual ~Impl() = default; [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0; + [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const = 0; [[nodiscard]] virtual size_t memory_usage() const noexcept = 0; virtual void dump_as_graphviz(std::ostream& out) const = 0; }; @@ -169,7 +171,17 @@ public: * Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with * matches() == false. * - * Iff `successor_out` is not nullptr, the following holds: + */ + [[nodiscard]] MatchResult match(std::string_view source) const; + + /** + * Attempts to match the source string `source` with the target string this DFA was + * built with, emitting a successor string into `successor_out` on mismatch. + * + * See `match(source)` for semantics of returned MatchResult. + * + * In the case of a _mismatch_, the following holds: + * * - `successor_out` is modified to contain the next (in byte-wise ordering) possible * _matching_ string S so that there exists no other matching string S' that is * greater than `source` but smaller than S. @@ -199,7 +211,17 @@ public: * By reusing the successor string across many calls, this therefore amortizes memory * allocations down to near zero per invocation. */ - [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const; + [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const; + + /** + * Same as `match(source, successor_out)`, but where the successor string is defined in + * terms of UTF-32, not UTF-8. This avoids the need for encoding characters to UTF-8 + * internally, and is therefore expected to be more efficient. + * + * The code point ordering of the UTF-32 successor string is identical to that its UTF-8 + * equivalent. + */ + [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const; /** * Returns how much memory is used by the underlying DFA representation, in bytes. diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index d9c4e0ffe36..2b3c06aa7cf 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -143,21 +143,13 @@ struct MatchAlgorithm { * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used * when one or more of the prefix characters have been lowercase-transformed. * - * TODO we could probably also optimize the smallest suffix generation with this when - * we know we can no longer insert any smaller char substitutions and the only way - * to complete the string is to emit it verbatim. - * - To do this we'd need both the original UTF-8 target string as well as a - * secondary vector that maps u32 character index to the corresponding UTF-8 index. - * Both trivial to get as part of DFA initialization. - * * TODO let matcher know if source string is pre-normalized (i.e. lowercased). - * TODO std::u32string output; no need to re-encode to UTF-8. * TODO consider opportunistically appending prefix as we go instead of only when needed. */ - template <DfaMatcher Matcher> + template <DfaMatcher Matcher, typename SuccessorT> static MatchResult match(const Matcher& matcher, std::string_view source, - std::string* successor_out) + SuccessorT* successor_out) { using StateType = typename Matcher::StateType; Utf8Reader u8_reader(source.data(), source.size()); @@ -217,12 +209,12 @@ struct MatchAlgorithm { * precondition: `last_node_with_higher_out` has either a wildcard edge or a char match * edge that compares greater than `input_at_branch`. */ - template <DfaMatcher Matcher> + template <DfaMatcher Matcher, typename SuccessorT> static void backtrack_and_emit_greater_suffix( const Matcher& matcher, typename Matcher::StateParamType last_state_with_higher_out, const uint32_t input_at_branch, - std::string& successor) + SuccessorT& successor) { auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out); if (matcher.can_match(wildcard_state)) { @@ -235,14 +227,14 @@ struct MatchAlgorithm { // instead of the wildcard edge, or we'll end up in the wrong state. const auto next_char = input_at_branch + 1; if (!matcher.has_exact_explicit_out_edge(last_state_with_higher_out, next_char)) { - append_utf32_char_as_utf8(successor, next_char); + append_utf32_char(successor, next_char); emit_smallest_matching_suffix(matcher, wildcard_state, successor); return; } // else: handle exact match below (it will be found as the first higher out edge) } const auto first_highest_edge = matcher.lowest_higher_explicit_out_edge(last_state_with_higher_out, input_at_branch); assert(matcher.valid_edge(first_highest_edge)); - append_utf32_char_as_utf8(successor, matcher.edge_to_u32char(first_highest_edge)); + append_utf32_char(successor, matcher.edge_to_u32char(first_highest_edge)); emit_smallest_matching_suffix(matcher, matcher.edge_to_state(last_state_with_higher_out, first_highest_edge), successor); } @@ -277,29 +269,41 @@ struct MatchAlgorithm { */ // TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate // the full string? Won't generate a matching string, but will be lexicographically greater. - template <DfaMatcher Matcher> + template <DfaMatcher Matcher, typename SuccessorT> static void emit_smallest_matching_suffix( const Matcher& matcher, typename Matcher::StateParamType from, - std::string& str) + SuccessorT& str) { auto state = from; while (!matcher.is_match(state)) { + // Optimization: if the only way for the remaining suffix to match is for it to be + // exactly equal to the suffix of the target string, emit it directly instead of + // stepping the state per character. This allows for matcher-specific shortcuts. + if (matcher.implies_exact_match_suffix(state)) { + matcher.emit_exact_match_suffix(state, str); + return; + } // If we can take a wildcard path, emit the smallest possible valid UTF-8 character (0x01). // Otherwise, find the smallest char that can eventually lead us to a match. auto wildcard_state = matcher.match_wildcard(state); if (matcher.can_match(wildcard_state)) { - str += '\x01'; + str.push_back(0x01); state = wildcard_state; } else { const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state); assert(matcher.valid_edge(smallest_out_edge)); - append_utf32_char_as_utf8(str, matcher.edge_to_u32char(smallest_out_edge)); + append_utf32_char(str, matcher.edge_to_u32char(smallest_out_edge)); state = matcher.edge_to_state(state, smallest_out_edge); } } } + template <typename T> + static constexpr bool has_8bit_value_type() noexcept { + return sizeof(typename T::value_type) == 1; + } + /** * The successor prefix is the prefix of the source string up to (but not including) the * point where we emit a lexicographically higher character. Ideally we can just copy the @@ -318,19 +322,23 @@ struct MatchAlgorithm { * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different * ordering when compared byte-wise against an implicitly lowercased dictionary. */ - static void emit_successor_prefix(std::string& successor_out, std::string_view source, + template <typename SuccessorT> + static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source, uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes) { - if (emit_raw_prefix_u8_bytes) { - successor_out = source.substr(0, n_prefix_u8_bytes); - } else { - // TODO avoid duplicate work...! :I - successor_out.clear(); - Utf8Reader u8_reader(source.data(), source.size()); - while (u8_reader.getPos() < n_prefix_u8_bytes) { - append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar())); + // TODO redesign prefix output wiring + if constexpr (has_8bit_value_type<SuccessorT>()) { + if (emit_raw_prefix_u8_bytes) { + successor_out = source.substr(0, n_prefix_u8_bytes); + return; } } + // TODO avoid duplicate work...! :I + successor_out.clear(); + Utf8Reader u8_reader(source.data(), source.size()); + while (u8_reader.getPos() < n_prefix_u8_bytes) { + append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar())); + } } static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept { diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp index f2c6259fd10..043618db3b5 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp @@ -40,6 +40,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) { return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size())); } +std::string utf32_string_to_utf8(std::span<const uint32_t> u32str) { + std::string u8out; + for (uint32_t u32ch : u32str) { + append_utf32_char(u8out, u32ch); + } + return u8out; +} + [[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline)); [[noreturn]] void throw_bad_code_point(uint32_t codepoint) { throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint)); @@ -54,7 +62,7 @@ namespace { * * Returns the number of bytes written. * - * See comments on append_utf32_char_as_utf8() as to why this is not a generic UTF-8 + * See comments on append_utf32_char() as to why this is not a generic UTF-8 * encoding function that can be used in all possible scenarios. */ [[nodiscard]] uint8_t encode_utf8_char(uint32_t codepoint, unsigned char* u8buf) { @@ -118,7 +126,7 @@ namespace { } // anon ns // TODO optimize inlined in header for case where u32_char is < 0x80? -void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char) { +void append_utf32_char(std::string& out_str, uint32_t u32_char) { unsigned char u8buf[4]; uint8_t u8bytes = encode_utf8_char(u32_char, u8buf); out_str.append(reinterpret_cast<const char*>(u8buf), u8bytes); diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h index 711c4f1fcd0..7af0eb6e5e3 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h +++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h @@ -2,6 +2,7 @@ #pragma once #include <cstdint> +#include <span> #include <string> #include <string_view> #include <vector> @@ -15,6 +16,8 @@ std::vector<uint32_t> utf8_string_to_utf32(std::string_view str); std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str); +std::string utf32_string_to_utf8(std::span<const uint32_t> u32str); + /** * Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and * appends it to `out_str.` @@ -31,6 +34,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str); * ... So don't copy this function for use as a general UTF-8 emitter, as it is not * _technically_ conformant! */ -void append_utf32_char_as_utf8(std::string& out_str, uint32_t u32_char); +void append_utf32_char(std::string& out_str, uint32_t u32_char); + +inline void append_utf32_char(std::u32string& out_str, uint32_t u32_char) { + out_str.push_back(u32_char); // no conversion needed +} + +inline void append_utf32_char(std::vector<uint32_t>& out_str, uint32_t u32_char) { + out_str.push_back(u32_char); // no conversion needed +} } |