summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-09-18 12:10:40 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2023-09-18 13:21:04 +0000
commit23a0c0dfe271fe301cde7537a43269ca4cb14432 (patch)
tree75345415c8f2c646c542ba4c4cb186c83bb5af80
parent15e0836eade4614e944a0946b327305e5c8966cf (diff)
Support raw UTF-32 successor string output
Avoids need to encode UTF-32 characters to UTF-8 internally, as this is likely to be subsequently reversed by a caller that itself operates on UTF-32 code points. Change the `match()` API by introducing a separate overload that does not produce a successor, and add two explicit successor string type overloads that take the string by ref, not pointer.
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp27
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h5
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp21
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp12
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h5
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp25
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp14
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h26
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp49
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp8
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h3
11 files changed, 154 insertions, 41 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index c466d0bfb3e..c235cb99509 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -26,10 +26,10 @@ std::optional<uint32_t> do_calculate(std::string_view left, std::string_view rig
LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
{
auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
- auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+ auto maybe_match_lhs = dfa_lhs.match(right);
auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
- auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+ auto maybe_match_rhs = dfa_rhs.match(left);
EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
if (maybe_match_lhs.matches()) {
@@ -47,6 +47,11 @@ std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view
return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type);
}
+void expect_utf32_string_code_point_equal_to_utf8(std::span<const uint32_t> u32str, const std::string& u8str) {
+ auto as_utf8 = utf32_string_to_utf8(u32str);
+ EXPECT_EQ(as_utf8, u8str);
+}
+
}
struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
@@ -119,14 +124,20 @@ TEST_P(LevenshteinDfaTest, distance_is_in_utf32_code_point_space) {
void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std::string_view expected_successor) {
std::string successor;
- auto m = dfa.match(source, &successor);
+ auto m = dfa.match(source, successor);
if (m.matches()) {
FAIL() << "Expected '" << source << "' to emit a successor, but it "
<< "matched with " << static_cast<uint32_t>(m.edits())
<< " edits (of max " << static_cast<uint32_t>(m.max_edits()) << " edits)";
}
EXPECT_EQ(successor, expected_successor);
- EXPECT_TRUE(dfa.match(successor, nullptr).matches());
+ EXPECT_TRUE(dfa.match(successor).matches());
+
+ // Make sure the UTF-32 successor output is codepoint-wise identical to the UTF-8 successor
+ std::vector<uint32_t> u32successor;
+ m = dfa.match(source, u32successor);
+ EXPECT_FALSE(m.matches());
+ expect_utf32_string_code_point_equal_to_utf8(u32successor, successor);
}
TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
@@ -331,7 +342,7 @@ TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
std::string skip_to, successor;
for (uint32_t j = 0; j < 256; ++j) {
const auto source = bits_to_str(static_cast<uint8_t>(j));
- auto maybe_match = target_dfa.match(source, &successor);
+ auto maybe_match = target_dfa.match(source, successor);
if (maybe_match.matches() && !skip_to.empty()) {
ASSERT_GE(source, skip_to);
} else if (!maybe_match.matches()) {
@@ -494,7 +505,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t
: LevenshteinDfa::DfaType::Implicit;
auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
- auto res = dfa.match(str, nullptr); // not benchmarking successor generation
+ auto res = dfa.match(str); // not benchmarking successor generation
do_not_optimize_away(res);
}, 1.0);
} else {
@@ -545,7 +556,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
for (const auto& line : dict) {
- auto res = dfa.match(line, nullptr);
+ auto res = dfa.match(line);
do_not_optimize_away(res);
}
}, 2.0);
@@ -586,7 +597,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
auto end = dict.cend();
std::string successor;
while (iter != end) {
- auto maybe_match = dfa.match(*iter, &successor);
+ auto maybe_match = dfa.match(*iter, successor);
if (maybe_match.matches()) {
++iter;
} else {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 6e8068a7419..630e07738fb 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -125,11 +125,16 @@ public:
[[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+
[[nodiscard]] size_t memory_usage() const noexcept override {
return sizeof(DfaNodeType) * _nodes.size();
}
void dump_as_graphviz(std::ostream& os) const override;
+private:
+ template <typename SuccessorT>
+ [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
};
template <typename Traits>
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 8ec05199bbe..7860d841fbf 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -80,23 +80,40 @@ struct ExplicitDfaMatcher {
return &_nodes[edge->node];
}
constexpr bool implies_exact_match_suffix(const StateType&) const noexcept {
- // TODO
+ // TODO; caller will currently just fall back to explicit state stepping
return false;
}
void emit_exact_match_suffix(const StateType&, std::string&) const {
// TODO (will never be called as long as `implies_exact_match_suffix()` returns false)
abort();
}
+ void emit_exact_match_suffix(const StateType&, std::vector<uint32_t>&) const {
+ // TODO (will never be called as long as `implies_exact_match_suffix()` returns false)
+ abort();
+ }
};
template <uint8_t MaxEdits>
+template <typename SuccessorT>
LevenshteinDfa::MatchResult
-ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+ExplicitLevenshteinDfaImpl<MaxEdits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
}
template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
+ return match_impl(u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
+LevenshteinDfa::MatchResult
+ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
+ return match_impl(u8str, successor_out);
+}
+
+template <uint8_t MaxEdits>
void ExplicitLevenshteinDfaImpl<MaxEdits>::dump_as_graphviz(std::ostream& os) const {
os << std::dec << "digraph levenshtein_dfa {\n";
os << " fontname=\"Helvetica,Arial,sans-serif\"\n";
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
index 7c77f7a0ed2..031973fd35d 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.cpp
@@ -5,9 +5,21 @@
namespace vespalib::fuzzy {
+/**
+ * Builds two separate vectors that exist alongside the (possibly case-normalized) UTF-32
+ * target string:
+ *
+ * - The UTF-8 representation of the (possibly case-normalized) target string
+ * - An offset vector that maps each UTF-32 string index to the first byte of the
+ * equivalent UTF-8 character.
+ *
+ * These are used for efficiently dumping an UTF-8 target string suffix from an UTF-32
+ * target string index.
+ */
template <typename Traits>
void ImplicitLevenshteinDfa<Traits>::precompute_utf8_target_with_offsets() {
_target_utf8_char_offsets.reserve(_u32_str_buf.size());
+ _target_as_utf8.reserve(_u32_str_buf.size()); // Under-reserves for all non-ASCII (TODO count via simdutf?)
// Important: the precomputed UTF-8 target is based on the potentially case-normalized
// target string, ensuring that we don't emit raw target chars for uncased successors.
for (uint32_t u32ch : _u32_str_buf) {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 0b8ddc71ed4..bb4a0918593 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -29,12 +29,17 @@ public:
[[nodiscard]] MatchResult match(std::string_view u8str, std::string* successor_out) const override;
+ [[nodiscard]] MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const override;
+
[[nodiscard]] size_t memory_usage() const noexcept override {
return _u32_str_buf.size() * sizeof(uint32_t);
}
void dump_as_graphviz(std::ostream& os) const override;
private:
+ template <typename SuccessorT>
+ [[nodiscard]] MatchResult match_impl(std::string_view u8str, SuccessorT* successor_out) const;
+
void precompute_utf8_target_with_offsets();
};
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 24c63931661..25fd3fdcc4e 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -117,7 +117,8 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
return step(state, edge);
}
bool implies_exact_match_suffix(const StateType& state) const noexcept {
- // Only one entry in the sparse matrix row and it implies no edits can be done
+ // Only one entry in the sparse matrix row and it implies no edits can be done.
+ // I.e. only way to match the target string suffix is to match it _exactly_.
return (state.size() == 1 && state.cost(0) == max_edits());
}
// Precondition: implies_match_suffix(state)
@@ -125,17 +126,35 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
const uint32_t target_u8_offset = _target_utf8_char_offsets[state.index(0)];
u8_out.append(_target_as_utf8.data() + target_u8_offset, _target_as_utf8.size() - target_u8_offset);
}
- // TODO emit_exact_match_suffix(const StateType& state, std::u32string& u32_out)
+ void emit_exact_match_suffix(const StateType& state, std::vector<uint32_t>& u32_out) const {
+ // TODO ranged insert
+ for (uint32_t i = state.index(0); i < _u32_str.size(); ++i) {
+ u32_out.push_back(_u32_str[i]);
+ }
+ }
};
template <typename Traits>
+template <typename SuccessorT>
LevenshteinDfa::MatchResult
-ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
+ImplicitLevenshteinDfa<Traits>::match_impl(std::string_view u8str, SuccessorT* successor_out) const {
ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _target_as_utf8, _target_utf8_char_offsets, _is_cased);
return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
}
template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
+ return match_impl(u8str, successor_out);
+}
+
+template <typename Traits>
+LevenshteinDfa::MatchResult
+ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::vector<uint32_t>* successor_out) const {
+ return match_impl(u8str, successor_out);
+}
+
+template <typename Traits>
void ImplicitLevenshteinDfa<Traits>::dump_as_graphviz(std::ostream&) const {
throw std::runtime_error("Graphviz output not available for implicit Levenshtein DFA");
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index 77691ffefd8..6e38821851b 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -18,8 +18,18 @@ LevenshteinDfa& LevenshteinDfa::operator=(LevenshteinDfa&&) noexcept = default;
LevenshteinDfa::~LevenshteinDfa() = default;
LevenshteinDfa::MatchResult
-LevenshteinDfa::match(std::string_view u8str, std::string* successor_out) const {
- return _impl->match(u8str, successor_out);
+LevenshteinDfa::match(std::string_view u8str) const {
+ return _impl->match(u8str, static_cast<std::vector<uint32_t>*>(nullptr)); // TODO rewire
+}
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::string& successor_out) const {
+ return _impl->match(u8str, &successor_out);
+}
+
+LevenshteinDfa::MatchResult
+LevenshteinDfa::match(std::string_view u8str, std::vector<uint32_t>& successor_out) const {
+ return _impl->match(u8str, &successor_out);
}
size_t LevenshteinDfa::memory_usage() const noexcept {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index a8b2b77f647..85ad98e2a09 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -6,6 +6,7 @@
#include <memory>
#include <string>
#include <string_view>
+#include <vector>
namespace vespalib::fuzzy {
@@ -140,6 +141,7 @@ public:
struct Impl {
virtual ~Impl() = default;
[[nodiscard]] virtual MatchResult match(std::string_view u8str, std::string* successor_out) const = 0;
+ [[nodiscard]] virtual MatchResult match(std::string_view u8str, std::vector<uint32_t>* successor_out) const = 0;
[[nodiscard]] virtual size_t memory_usage() const noexcept = 0;
virtual void dump_as_graphviz(std::ostream& out) const = 0;
};
@@ -169,7 +171,17 @@ public:
* Iff `source` is _beyond_ the maximum edit distance, returns a MatchResult with
* matches() == false.
*
- * Iff `successor_out` is not nullptr, the following holds:
+ */
+ [[nodiscard]] MatchResult match(std::string_view source) const;
+
+ /**
+ * Attempts to match the source string `source` with the target string this DFA was
+ * built with, emitting a successor string into `successor_out` on mismatch.
+ *
+ * See `match(source)` for semantics of returned MatchResult.
+ *
+ * In the case of a _mismatch_, the following holds:
+ *
* - `successor_out` is modified to contain the next (in byte-wise ordering) possible
* _matching_ string S so that there exists no other matching string S' that is
* greater than `source` but smaller than S.
@@ -199,7 +211,17 @@ public:
* By reusing the successor string across many calls, this therefore amortizes memory
* allocations down to near zero per invocation.
*/
- [[nodiscard]] MatchResult match(std::string_view source, std::string* successor_out) const;
+ [[nodiscard]] MatchResult match(std::string_view source, std::string& successor_out) const;
+
+ /**
+ * Same as `match(source, successor_out)`, but where the successor string is defined in
+ * terms of UTF-32, not UTF-8. This avoids the need for encoding characters to UTF-8
+ * internally, and is therefore expected to be more efficient.
+ *
+ * The code point ordering of the UTF-32 successor string is identical to that its UTF-8
+ * equivalent.
+ */
+ [[nodiscard]] MatchResult match(std::string_view source, std::vector<uint32_t>& successor_out) const;
/**
* Returns how much memory is used by the underlying DFA representation, in bytes.
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index ff1835e7bf1..2b3c06aa7cf 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -143,21 +143,13 @@ struct MatchAlgorithm {
* from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
* when one or more of the prefix characters have been lowercase-transformed.
*
- * TODO we could probably also optimize the smallest suffix generation with this when
- * we know we can no longer insert any smaller char substitutions and the only way
- * to complete the string is to emit it verbatim.
- * - To do this we'd need both the original UTF-8 target string as well as a
- * secondary vector that maps u32 character index to the corresponding UTF-8 index.
- * Both trivial to get as part of DFA initialization.
- *
* TODO let matcher know if source string is pre-normalized (i.e. lowercased).
- * TODO std::u32string output; no need to re-encode to UTF-8.
* TODO consider opportunistically appending prefix as we go instead of only when needed.
*/
- template <DfaMatcher Matcher>
+ template <DfaMatcher Matcher, typename SuccessorT>
static MatchResult match(const Matcher& matcher,
std::string_view source,
- std::string* successor_out)
+ SuccessorT* successor_out)
{
using StateType = typename Matcher::StateType;
Utf8Reader u8_reader(source.data(), source.size());
@@ -217,12 +209,12 @@ struct MatchAlgorithm {
* precondition: `last_node_with_higher_out` has either a wildcard edge or a char match
* edge that compares greater than `input_at_branch`.
*/
- template <DfaMatcher Matcher>
+ template <DfaMatcher Matcher, typename SuccessorT>
static void backtrack_and_emit_greater_suffix(
const Matcher& matcher,
typename Matcher::StateParamType last_state_with_higher_out,
const uint32_t input_at_branch,
- std::string& successor)
+ SuccessorT& successor)
{
auto wildcard_state = matcher.match_wildcard(last_state_with_higher_out);
if (matcher.can_match(wildcard_state)) {
@@ -277,11 +269,11 @@ struct MatchAlgorithm {
*/
// TODO consider variant for only emitting _prefix of suffix_ to avoid having to generate
// the full string? Won't generate a matching string, but will be lexicographically greater.
- template <DfaMatcher Matcher>
+ template <DfaMatcher Matcher, typename SuccessorT>
static void emit_smallest_matching_suffix(
const Matcher& matcher,
typename Matcher::StateParamType from,
- std::string& str)
+ SuccessorT& str)
{
auto state = from;
while (!matcher.is_match(state)) {
@@ -296,7 +288,7 @@ struct MatchAlgorithm {
// Otherwise, find the smallest char that can eventually lead us to a match.
auto wildcard_state = matcher.match_wildcard(state);
if (matcher.can_match(wildcard_state)) {
- str += '\x01';
+ str.push_back(0x01);
state = wildcard_state;
} else {
const auto smallest_out_edge = matcher.smallest_explicit_out_edge(state);
@@ -307,6 +299,11 @@ struct MatchAlgorithm {
}
}
+ template <typename T>
+ static constexpr bool has_8bit_value_type() noexcept {
+ return sizeof(typename T::value_type) == 1;
+ }
+
/**
* The successor prefix is the prefix of the source string up to (but not including) the
* point where we emit a lexicographically higher character. Ideally we can just copy the
@@ -325,19 +322,23 @@ struct MatchAlgorithm {
* successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
* ordering when compared byte-wise against an implicitly lowercased dictionary.
*/
- static void emit_successor_prefix(std::string& successor_out, std::string_view source,
+ template <typename SuccessorT>
+ static void emit_successor_prefix(SuccessorT& successor_out, std::string_view source,
uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
{
- if (emit_raw_prefix_u8_bytes) {
- successor_out = source.substr(0, n_prefix_u8_bytes);
- } else {
- // TODO avoid duplicate work...! :I
- successor_out.clear();
- Utf8Reader u8_reader(source.data(), source.size());
- while (u8_reader.getPos() < n_prefix_u8_bytes) {
- append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
+ // TODO redesign prefix output wiring
+ if constexpr (has_8bit_value_type<SuccessorT>()) {
+ if (emit_raw_prefix_u8_bytes) {
+ successor_out = source.substr(0, n_prefix_u8_bytes);
+ return;
}
}
+ // TODO avoid duplicate work...! :I
+ successor_out.clear();
+ Utf8Reader u8_reader(source.data(), source.size());
+ while (u8_reader.getPos() < n_prefix_u8_bytes) {
+ append_utf32_char(successor_out, LowerCase::convert(u8_reader.getChar()));
+ }
}
static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
index f72518cc68f..043618db3b5 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -40,6 +40,14 @@ std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
}
+std::string utf32_string_to_utf8(std::span<const uint32_t> u32str) {
+ std::string u8out;
+ for (uint32_t u32ch : u32str) {
+ append_utf32_char(u8out, u32ch);
+ }
+ return u8out;
+}
+
[[noreturn]] void throw_bad_code_point(uint32_t codepoint) __attribute__((noinline));
[[noreturn]] void throw_bad_code_point(uint32_t codepoint) {
throw std::invalid_argument(make_string("invalid UTF-32 codepoint: U+%04X (%u)", codepoint, codepoint));
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
index a2e9e232555..7af0eb6e5e3 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -2,6 +2,7 @@
#pragma once
#include <cstdint>
+#include <span>
#include <string>
#include <string_view>
#include <vector>
@@ -15,6 +16,8 @@ std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
+std::string utf32_string_to_utf8(std::span<const uint32_t> u32str);
+
/**
* Encodes a single UTF-32 codepoint `u32_char` to a 1-4 byte UTF-8 sequence and
* appends it to `out_str.`