diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-09-25 14:31:54 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-09-25 14:38:15 +0000 |
commit | 751c8d7e54f3f9b9fdcc7b77ee6a021c70655e96 (patch) | |
tree | bcdc1bdf7538554b80224ce48981a9bc7585b737 | |
parent | b4af421142168c36cc1e8c9bae735731a68fcb20 (diff) |
Generate Levenshtein successor prefix "as we go" during match loop
Moving from an on-demand generation means that we only decode
(and possibly normalize) UTF-8 characters _once_ instead of
twice in the case of UTF-32 output or non-normalized input.
These changes also make it much more easy to add future support
for preserving a caller-supplied successor prefix, which would
be used for prefix-locked dictionary matching.
Note: this subtly changes the API to potentially _always_ mutate
the input successor string. The API documentation has been updated
to reflect this. No current users of the API should be affected.
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h | 26 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp | 78 |
2 files changed, 29 insertions, 75 deletions
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h index c6ca06d4de3..6eb666c1af8 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h +++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h @@ -58,8 +58,8 @@ namespace vespalib::fuzzy { * ====== Unicode support ====== * * Matching and successor generation is fully Unicode-aware. All input strings are expected - * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats; - * see the documentation for match()). + * to be in UTF-8, and the generated successor is encoded as UTF-8 (with some caveats; see + * the documentation for match()) or UTF-32, depending on the chosen `match()` overload. * * Internally, matching is done on UTF-32 code points and the DFA itself is built around * UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA. @@ -159,7 +159,7 @@ public: /** * Attempts to match the source string `source` with the target string this DFA was - * built with, emitting a successor string on mismatch if `successor_out` != nullptr. + * built with. * * `source` must not contain any null UTF-8 chars. * @@ -181,11 +181,14 @@ public: * * See `match(source)` for semantics of returned MatchResult. * + * In the case of a _match_, the contents of `successor_out` is unspecified. It may be + * preemptively modified as part of the matching loop itself. + * * In the case of a _mismatch_, the following holds: * - * - `successor_out` is modified to contain the next (in byte-wise ordering) possible - * _matching_ string S so that there exists no other matching string S' that is - * greater than `source` but smaller than S. + * - `successor_out` contains the next (in byte-wise ordering) possible _matching_ + * string S so that there exists no other matching string S' that is greater than + * `source` but smaller than S. * - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally * encode in bitwise form, but the _code points_ they encode may not be valid. * In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of @@ -203,12 +206,8 @@ public: * is what is passed to the DFA match() function. * * Memory allocation: - * This function does not directly or indirectly allocate any heap memory if either: - * - * - the input string is within the max edit distance, or - * - `successor_out` is nullptr, or - * - `successor_out` has sufficient capacity to hold the generated successor - * + * This function does not directly or indirectly allocate any heap memory if the + * `successor_out` string provided is large enough to fit any generated successor. * By reusing the successor string across many calls, this therefore amortizes memory * allocations down to near zero per invocation. */ @@ -220,7 +219,8 @@ public: * internally, and is therefore expected to be more efficient. * * The code point ordering of the UTF-32 successor string is identical to that its UTF-8 - * equivalent. + * equivalent. This includes the special cases where the successor may contain code points + * outside the legal Unicode range. */ [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const; diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp index fb5ec32abc7..654889d87bf 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp +++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp @@ -136,39 +136,37 @@ struct MatchAlgorithm { * that has nothing in common with the source altogether. * Example: "gp" -> "hfood" (+1 char value case) * - * Performance note: - * Both the input and successor output strings are in UTF-8 format. To avoid doing - * duplicate work, we keep track of the byte length of the string prefix that will be - * part of the successor and simply copy it verbatim instead of building the string - * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used - * when one or more of the prefix characters have been lowercase-transformed. + * Note for cased vs. uncased matching: when uncased matching is specified, we always + * match "as if" both the target and source strings are lowercased. This means that + * successor strings are generated based on this form, _not_ on the original form. + * Example: uncased matching for target "food" with input "FOXX". This generates the + * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different + * ordering when compared byte-wise against an implicitly lowercased dictionary. * * TODO let matcher know if source string is pre-normalized (i.e. lowercased). - * TODO consider opportunistically appending prefix as we go instead of only when needed. */ template <DfaMatcher Matcher, typename SuccessorT> static MatchResult match(const Matcher& matcher, std::string_view source, SuccessorT& successor_out) { + successor_out.clear(); // TODO allow for preserving existing prefix + using StateType = typename Matcher::StateType; Utf8Reader u8_reader(source.data(), source.size()); - uint32_t n_prefix_u8_bytes = 0; + uint32_t n_prefix_chars = 0; uint32_t char_after_prefix = 0; StateType last_state_with_higher_out = StateType{}; - bool can_use_raw_prefix = true; StateType state = matcher.start(); while (u8_reader.hasMore()) { - const auto u8_pos_before_char = u8_reader.getPos(); - const uint32_t raw_mch = u8_reader.getChar(); - const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); - if (raw_mch != mch) { - can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix - } + const auto pos_before_char = static_cast<uint32_t>(successor_out.size()); + const uint32_t raw_mch = u8_reader.getChar(); + const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased()); + append_utf32_char(successor_out, mch); if (matcher.has_higher_out_edge(state, mch)) { last_state_with_higher_out = state; - n_prefix_u8_bytes = u8_pos_before_char; + n_prefix_chars = pos_before_char; char_after_prefix = mch; } auto maybe_next = matcher.match_input(state, mch); @@ -176,8 +174,7 @@ struct MatchAlgorithm { state = maybe_next; } else { // Can never match; find the successor - emit_successor_prefix(successor_out, source, n_prefix_u8_bytes, - matcher.is_cased() || can_use_raw_prefix); + successor_out.resize(n_prefix_chars); // Always <= successor_out.size() assert(matcher.valid_state(last_state_with_higher_out)); backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out, char_after_prefix, successor_out); @@ -188,8 +185,7 @@ struct MatchAlgorithm { if (edits <= max_edits()) { return MatchResult::make_match(max_edits(), edits); } - emit_successor_prefix(successor_out, source, source.size(), - matcher.is_cased() || can_use_raw_prefix); + // Successor prefix already filled, just need to emit the suffix emit_smallest_matching_suffix(matcher, state, successor_out); return MatchResult::make_mismatch(max_edits()); } @@ -320,48 +316,6 @@ struct MatchAlgorithm { } } - template <typename T> - static constexpr bool has_8bit_value_type() noexcept { - return sizeof(typename T::value_type) == 1; - } - - /** - * The successor prefix is the prefix of the source string up to (but not including) the - * point where we emit a lexicographically higher character. Ideally we can just copy the - * UTF-8 bytes verbatim from the source into the successor. This is possible when one of - * the following holds: - * - * - DFA uses Cased (i.e. exact) matching, or - * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase - * transform. This means the prefix is already as-if lowercased, and we can copy it - * verbatim. - * - * In the case that we can't copy verbatim, we currently have to explicitly normalize the - * prefix by converting it to its lowercased form. - * - * Example: Uncased matching for target "food" with input "FOXX". This generates the - * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different - * ordering when compared byte-wise against an implicitly lowercased dictionary. - */ - template <typename SuccessorT> - static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source, - uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes) - { - // TODO redesign prefix output wiring - if constexpr (has_8bit_value_type<SuccessorT>()) { - if (emit_raw_prefix_u8_bytes) { - successor_out = source.substr(0, n_prefix_u8_bytes); - return; - } - } - // TODO avoid duplicate work...! :I - successor_out.clear(); - Utf8Reader u8_reader(source.data(), source.size()); - while (u8_reader.getPos() < n_prefix_u8_bytes) { - append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar())); - } - } - static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept { return (is_cased ? in_ch : LowerCase::convert(in_ch)); } |