aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-09-26 10:42:09 +0200
committerGitHub <noreply@github.com>2023-09-26 10:42:09 +0200
commit9f5423b83345c4e67184f407646addd08180ce58 (patch)
tree32324704283e526972e2ec30a25e527f48a1f2a4
parent47442b2cf99e6ccd244d2f9c694186e759a8766f (diff)
parent96dc50a40ca298182e347c600f327aa99dc5659e (diff)
Merge pull request #28651 from vespa-engine/vekterli/generate-successor-prefix-as-we-go
Generate Levenshtein successor prefix "as we go" during match loop
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h26
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp78
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/sparse_state.h6
3 files changed, 32 insertions, 78 deletions
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index c6ca06d4de3..6eb666c1af8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -58,8 +58,8 @@ namespace vespalib::fuzzy {
* ====== Unicode support ======
*
* Matching and successor generation is fully Unicode-aware. All input strings are expected
- * to be in UTF-8, and the generated successor is also encoded as UTF-8 (with some caveats;
- * see the documentation for match()).
+ * to be in UTF-8, and the generated successor is encoded as UTF-8 (with some caveats; see
+ * the documentation for match()) or UTF-32, depending on the chosen `match()` overload.
*
* Internally, matching is done on UTF-32 code points and the DFA itself is built around
* UTF-32. This is unlike Lucene, which converts a UTF-32 DFA to an equivalent UTF-8 DFA.
@@ -159,7 +159,7 @@ public:
/**
* Attempts to match the source string `source` with the target string this DFA was
- * built with, emitting a successor string on mismatch if `successor_out` != nullptr.
+ * built with.
*
* `source` must not contain any null UTF-8 chars.
*
@@ -181,11 +181,14 @@ public:
*
* See `match(source)` for semantics of returned MatchResult.
*
+ * In the case of a _match_, the contents of `successor_out` is unspecified. It may be
+ * preemptively modified as part of the matching loop itself.
+ *
* In the case of a _mismatch_, the following holds:
*
- * - `successor_out` is modified to contain the next (in byte-wise ordering) possible
- * _matching_ string S so that there exists no other matching string S' that is
- * greater than `source` but smaller than S.
+ * - `successor_out` contains the next (in byte-wise ordering) possible _matching_
+ * string S so that there exists no other matching string S' that is greater than
+ * `source` but smaller than S.
* - `successor_out` contains UTF-8 bytes that are within what UTF-8 can legally
* encode in bitwise form, but the _code points_ they encode may not be valid.
* In particular, surrogate pair ranges and U+10FFFF+1 may be encoded, neither of
@@ -203,12 +206,8 @@ public:
* is what is passed to the DFA match() function.
*
* Memory allocation:
- * This function does not directly or indirectly allocate any heap memory if either:
- *
- * - the input string is within the max edit distance, or
- * - `successor_out` is nullptr, or
- * - `successor_out` has sufficient capacity to hold the generated successor
- *
+ * This function does not directly or indirectly allocate any heap memory if the
+ * `successor_out` string provided is large enough to fit any generated successor.
* By reusing the successor string across many calls, this therefore amortizes memory
* allocations down to near zero per invocation.
*/
@@ -220,7 +219,8 @@ public:
* internally, and is therefore expected to be more efficient.
*
* The code point ordering of the UTF-32 successor string is identical to that its UTF-8
- * equivalent.
+ * equivalent. This includes the special cases where the successor may contain code points
+ * outside the legal Unicode range.
*/
[[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index fb5ec32abc7..654889d87bf 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -136,39 +136,37 @@ struct MatchAlgorithm {
* that has nothing in common with the source altogether.
* Example: "gp" -> "hfood" (+1 char value case)
*
- * Performance note:
- * Both the input and successor output strings are in UTF-8 format. To avoid doing
- * duplicate work, we keep track of the byte length of the string prefix that will be
- * part of the successor and simply copy it verbatim instead of building the string
- * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
- * when one or more of the prefix characters have been lowercase-transformed.
+ * Note for cased vs. uncased matching: when uncased matching is specified, we always
+ * match "as if" both the target and source strings are lowercased. This means that
+ * successor strings are generated based on this form, _not_ on the original form.
+ * Example: uncased matching for target "food" with input "FOXX". This generates the
+ * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
+ * ordering when compared byte-wise against an implicitly lowercased dictionary.
*
* TODO let matcher know if source string is pre-normalized (i.e. lowercased).
- * TODO consider opportunistically appending prefix as we go instead of only when needed.
*/
template <DfaMatcher Matcher, typename SuccessorT>
static MatchResult match(const Matcher& matcher,
std::string_view source,
SuccessorT& successor_out)
{
+ successor_out.clear(); // TODO allow for preserving existing prefix
+
using StateType = typename Matcher::StateType;
Utf8Reader u8_reader(source.data(), source.size());
- uint32_t n_prefix_u8_bytes = 0;
+ uint32_t n_prefix_chars = 0;
uint32_t char_after_prefix = 0;
StateType last_state_with_higher_out = StateType{};
- bool can_use_raw_prefix = true;
StateType state = matcher.start();
while (u8_reader.hasMore()) {
- const auto u8_pos_before_char = u8_reader.getPos();
- const uint32_t raw_mch = u8_reader.getChar();
- const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased());
- if (raw_mch != mch) {
- can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix
- }
+ const auto pos_before_char = static_cast<uint32_t>(successor_out.size());
+ const uint32_t raw_mch = u8_reader.getChar();
+ const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased());
+ append_utf32_char(successor_out, mch);
if (matcher.has_higher_out_edge(state, mch)) {
last_state_with_higher_out = state;
- n_prefix_u8_bytes = u8_pos_before_char;
+ n_prefix_chars = pos_before_char;
char_after_prefix = mch;
}
auto maybe_next = matcher.match_input(state, mch);
@@ -176,8 +174,7 @@ struct MatchAlgorithm {
state = maybe_next;
} else {
// Can never match; find the successor
- emit_successor_prefix(successor_out, source, n_prefix_u8_bytes,
- matcher.is_cased() || can_use_raw_prefix);
+ successor_out.resize(n_prefix_chars); // Always <= successor_out.size()
assert(matcher.valid_state(last_state_with_higher_out));
backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
char_after_prefix, successor_out);
@@ -188,8 +185,7 @@ struct MatchAlgorithm {
if (edits <= max_edits()) {
return MatchResult::make_match(max_edits(), edits);
}
- emit_successor_prefix(successor_out, source, source.size(),
- matcher.is_cased() || can_use_raw_prefix);
+ // Successor prefix already filled, just need to emit the suffix
emit_smallest_matching_suffix(matcher, state, successor_out);
return MatchResult::make_mismatch(max_edits());
}
@@ -320,48 +316,6 @@ struct MatchAlgorithm {
}
}
- template <typename T>
- static constexpr bool has_8bit_value_type() noexcept {
- return sizeof(typename T::value_type) == 1;
- }
-
- /**
- * The successor prefix is the prefix of the source string up to (but not including) the
- * point where we emit a lexicographically higher character. Ideally we can just copy the
- * UTF-8 bytes verbatim from the source into the successor. This is possible when one of
- * the following holds:
- *
- * - DFA uses Cased (i.e. exact) matching, or
- * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase
- * transform. This means the prefix is already as-if lowercased, and we can copy it
- * verbatim.
- *
- * In the case that we can't copy verbatim, we currently have to explicitly normalize the
- * prefix by converting it to its lowercased form.
- *
- * Example: Uncased matching for target "food" with input "FOXX". This generates the
- * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
- * ordering when compared byte-wise against an implicitly lowercased dictionary.
- */
- template <typename SuccessorT>
- static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source,
- uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
- {
- // TODO redesign prefix output wiring
- if constexpr (has_8bit_value_type<SuccessorT>()) {
- if (emit_raw_prefix_u8_bytes) {
- successor_out = source.substr(0, n_prefix_u8_bytes);
- return;
- }
- }
- // TODO avoid duplicate work...! :I
- successor_out.clear();
- Utf8Reader u8_reader(source.data(), source.size());
- while (u8_reader.getPos() < n_prefix_u8_bytes) {
- append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
- }
- }
-
static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
return (is_cased ? in_ch : LowerCase::convert(in_ch));
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
index d20cfc07a9a..dfec0bac4a8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/sparse_state.h
@@ -112,11 +112,11 @@ std::ostream& operator<<(std::ostream& os, const FixedSparseState<MaxEdits>& s)
if (i != 0) {
os << ", ";
}
- for (size_t j = last_idx; j < s.indices[i]; ++j) {
+ for (size_t j = last_idx; j < s.index(i); ++j) {
os << "-, ";
}
- last_idx = s.indices[i] + 1;
- os << static_cast<uint32_t>(s.costs[i]);
+ last_idx = s.index(i) + 1;
+ os << static_cast<uint32_t>(s.cost(i));
}
os << "]";
return os;