summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-09-15 15:40:52 +0200
committerGitHub <noreply@github.com>2023-09-15 15:40:52 +0200
commit699ecbd2c056d95adbcd99a70e9169d12edb2ba4 (patch)
tree354819f4565f958cfcad28f984ffd6a960525ffd
parent7e3d557d89bb0f9efc283e98478d568c801b4d96 (diff)
parent6d696cd634a6bd4ea1b4c046ade9bfc5b5d246a8 (diff)
Merge pull request #28540 from vespa-engine/vekterli/case-normalized-levenshtein-dfa-matching
Add support for case-insensitive matching to Levenshtein DFAs
-rw-r--r--vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp222
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h5
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h17
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp21
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h12
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp11
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp30
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h40
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp60
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp27
-rw-r--r--vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h3
11 files changed, 344 insertions, 104 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index 6966fd0b703..c466d0bfb3e 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -1,10 +1,10 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
#include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
-#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
#include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes
+#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/util/benchmark_timer.h>
-#include <charconv>
#include <concepts>
#include <filesystem>
#include <fstream>
@@ -18,38 +18,67 @@ namespace fs = std::filesystem;
static std::string benchmark_dictionary;
-struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> {
+using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType>;
- static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+namespace {
- static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
- auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type());
- auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
+ LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+ auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
+ auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
- auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type());
- auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+ auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
+ auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
- EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
- if (maybe_match_lhs.matches()) {
- EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
- return {maybe_match_lhs.edits()};
- }
- return std::nullopt;
+ EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
+ if (maybe_match_lhs.matches()) {
+ EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
+ return {maybe_match_lhs.edits()};
+ }
+ return std::nullopt;
+}
+
+std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold,
+ LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+ std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
+ std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
+ return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type);
+}
+
+}
+
+struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
+
+ static LevenshteinDfa::Casing casing() noexcept { return std::get<0>(GetParam()); }
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return std::get<1>(GetParam()); }
+
+ static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+ std::ostringstream ss;
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ return ss.str();
+ }
+
+ static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, casing(), dfa_type());
}
static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) {
- std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
- std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
- return calculate(lhs_ch, rhs_ch, threshold);
+ return do_calculate(left, right, threshold, casing(), dfa_type());
}
};
-INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+// All the baseline DFA tests use lowercase only, so they should have the exact same outcome
+// for both cased and uncased matching.
+INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
LevenshteinDfaTest,
- Values(LevenshteinDfa::DfaType::Explicit,
- LevenshteinDfa::DfaType::Implicit),
- PrintToStringParamName());
+ Combine(Values(LevenshteinDfa::Casing::Uncased,
+ LevenshteinDfa::Casing::Cased),
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit)),
+ LevenshteinDfaTest::stringify_params);
// Same as existing non-DFA Levenshtein tests, but with some added instantiations
// for smaller max distances.
@@ -101,7 +130,7 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std:
}
TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
- auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
test_dfa_successor(dfa, "", "\x01""food");
test_dfa_successor(dfa, "faa", "faod");
@@ -122,8 +151,8 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
// Also works with Unicode
// 2 chars
- test_dfa_successor(dfa, "\xc3\x86""x", // "Æx"
- "\xc3\x87""food"); // "Çfood"
+ test_dfa_successor(dfa, "\xc3\xa6""x", // "æx"
+ "\xc3\xa7""food"); // "çfood"
// 3 chars
test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は"
"\xe7\x8c\xac""food"); // "猬food"
@@ -138,7 +167,7 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
}
TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
- auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
// The successor string must be lexicographically larger than the input string.
// In the presence of a wildcard output edge we handle this by increase the input
// character by 1 and encoding it back as UTF-8.
@@ -155,7 +184,7 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_
}
TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
- auto dfa = LevenshteinDfa::build("", 1, dfa_type());
+ auto dfa = LevenshteinDfa::build("", 1, casing(), dfa_type());
test_dfa_successor(dfa, "aa", "b");
test_dfa_successor(dfa, "b\x01", "c");
test_dfa_successor(dfa, "vespa", "w");
@@ -170,12 +199,80 @@ TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
EXPECT_EQ(calculate("\xff\xff", "a", 2), std::optional{2});
EXPECT_EQ(calculate("a", "\xff", 2), std::optional{1});
EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt);
- EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
+ EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
}
TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) {
- EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument);
- EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, casing(), dfa_type()), std::invalid_argument);
+ EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, casing(), dfa_type()), std::invalid_argument);
+}
+
+struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
+ static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+
+ static std::optional<uint32_t> calculate_cased(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Cased, dfa_type());
+ }
+
+ static std::optional<uint32_t> calculate_uncased(std::string_view left, std::string_view right, uint32_t threshold) {
+ return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Uncased, dfa_type());
+ }
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+ LevenshteinDfaCasingTest,
+ Values(LevenshteinDfa::DfaType::Explicit,
+ LevenshteinDfa::DfaType::Implicit),
+ PrintToStringParamName());
+
+TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate_uncased("abc", "ABC", max), std::optional{0}) << max;
+ EXPECT_EQ(calculate_uncased("Abc", "aB1", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("aBC", "1bc", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("Abc", "a1C", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("aBc", "AB", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_uncased("ABC", "abcd", max), std::optional{1}) << max;
+ }
+ EXPECT_EQ(calculate_uncased("bc", "aBCd", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("ab", "AbCd", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("CD", "AbcD", 2), std::optional{2});
+ EXPECT_EQ(calculate_uncased("ad", "AbcD", 2), std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, cased_edge_cases_have_correct_edit_distance) {
+ for (auto max : {1, 2}) {
+ EXPECT_EQ(calculate_cased("abc", "abC", max), std::optional{1}) << max;
+ EXPECT_EQ(calculate_cased("Abc", "aB1", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("aBC", "1bc", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("Abc", "a1C", max), std::nullopt) << max;
+ EXPECT_EQ(calculate_cased("ABC", "abcd", max), std::nullopt) << max;
+ }
+ EXPECT_EQ(calculate_cased("abc", "ABC", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("abc", "aBC", 2), std::optional{2});
+ EXPECT_EQ(calculate_cased("bc", "aBCd", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ab", "AbCd", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("CD", "AbcD", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ad", "AbcD", 2), std::nullopt);
+ EXPECT_EQ(calculate_cased("ad", "aBCd", 2), std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, uncased_successor_is_emitted_as_if_match_term_was_lowercased) {
+ auto dfa = LevenshteinDfa::build("FOOD", 1, LevenshteinDfa::Casing::Uncased, dfa_type());
+ // This is a subset of the other successor test cases
+ test_dfa_successor(dfa, "", "\x01""food");
+ test_dfa_successor(dfa, "FAA", "faod");
+ test_dfa_successor(dfa, "fOoOoO", "foop");
+ test_dfa_successor(dfa, "OOOf", "pfood");
+ test_dfa_successor(dfa, "Fo", "fo\x01""d");
+ test_dfa_successor(dfa, "oO", "ood");
+ test_dfa_successor(dfa, "OOO", "oood");
+ test_dfa_successor(dfa, "FOXX", "foyd");
+ test_dfa_successor(dfa, "GG", "good");
+ test_dfa_successor(dfa, "Gp", "hfood");
+ test_dfa_successor(dfa, "EP", "f\x01""od");
+ test_dfa_successor(dfa, "Hfoodz", "hood");
+ test_dfa_successor(dfa, "Aooodz", "bfood");
}
// Turn integer v into its bitwise string representation with the MSB as the leftmost character.
@@ -191,20 +288,22 @@ std::string bits_to_str(T v) {
return ret;
}
-using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>;
+using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>;
-struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> {
- // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number.
+struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> {
+ // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number.
static std::string stringify_params(const TestParamInfo<ParamType>& info) {
std::ostringstream ss;
- ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+ ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param);
return ss.str();
}
};
INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
LevenshteinDfaSuccessorTest,
- Combine(Values(LevenshteinDfa::DfaType::Explicit,
+ Combine(Values(LevenshteinDfa::Casing::Uncased,
+ LevenshteinDfa::Casing::Cased),
+ Values(LevenshteinDfa::DfaType::Explicit,
LevenshteinDfa::DfaType::Implicit),
Values(1, 2)),
LevenshteinDfaSuccessorTest::stringify_params);
@@ -225,10 +324,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
* Inspired by approach used by Lucene DFA exhaustive testing.
*/
TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
- const auto [dfa_type, max_edits] = GetParam();
+ const auto [casing, dfa_type, max_edits] = GetParam();
for (uint32_t i = 0; i < 256; ++i) {
const auto target = bits_to_str(static_cast<uint8_t>(i));
- auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type);
+ auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type);
std::string skip_to, successor;
for (uint32_t j = 0; j < 256; ++j) {
const auto source = bits_to_str(static_cast<uint8_t>(j));
@@ -298,11 +397,10 @@ enum class BenchmarkType {
};
const char* to_s(BenchmarkType t) noexcept {
- // Note: need underscores since this is used as part of GTest-generated test instance names
switch (t) {
- case BenchmarkType::DfaExplicit: return "DFA_explicit";
- case BenchmarkType::DfaImplicit: return "DFA_implicit";
- case BenchmarkType::Legacy: return "legacy";
+ case BenchmarkType::DfaExplicit: return "DfaExplicit";
+ case BenchmarkType::DfaImplicit: return "DfaImplicit";
+ case BenchmarkType::Legacy: return "Legacy";
}
abort();
}
@@ -315,10 +413,14 @@ const char* to_s(BenchmarkType t) noexcept {
return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64};
}
-struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
+using BenchmarkTypeAndCasing = std::tuple<BenchmarkType, LevenshteinDfa::Casing>;
+
+struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkTypeAndCasing> {
static std::string stringify_params(const TestParamInfo<ParamType>& info) {
- return to_s(info.param);
+ std::ostringstream ss;
+ ss << to_s(std::get<0>(info.param)) << '_' << std::get<1>(info.param);
+ return ss.str();
}
void SetUp() override {
@@ -327,7 +429,8 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
}
}
- static BenchmarkType benchmark_type() noexcept { return GetParam(); }
+ static BenchmarkType benchmark_type() noexcept { return std::get<0>(GetParam()); }
+ static LevenshteinDfa::Casing casing() noexcept { return std::get<1>(GetParam()); }
static const std::vector<std::string>& load_dictionary_once() {
static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary));
@@ -351,9 +454,11 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
LevenshteinBenchmarkTest,
- Values(BenchmarkType::DfaExplicit,
- BenchmarkType::DfaImplicit,
- BenchmarkType::Legacy),
+ Combine(Values(BenchmarkType::DfaExplicit,
+ BenchmarkType::DfaImplicit,
+ BenchmarkType::Legacy),
+ Values(LevenshteinDfa::Casing::Cased,
+ LevenshteinDfa::Casing::Uncased)),
LevenshteinBenchmarkTest::stringify_params);
// ("abc", 1) => "a"
@@ -382,12 +487,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t
// string must be matched and any sparse representation is always maximally filled since
// we never expend any edits via mismatches.
// Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.).
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s;
if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
auto res = dfa.match(str, nullptr); // not benchmarking successor generation
do_not_optimize_away(res);
@@ -408,15 +513,16 @@ TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) {
if (!benchmarking_enabled()) {
GTEST_SKIP() << "benchmarking not enabled";
}
+ const auto casing = LevenshteinDfa::Casing::Cased; // For building, casing only affects initial string normalization
using vespalib::BenchmarkTimer;
for (uint8_t k : {1, 2}) {
for (uint32_t sz : string_lengths()) {
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s = BenchmarkTimer::benchmark([&] {
- auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
do_not_optimize_away(dfa);
}, 2.0);
- auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+ auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
size_t mem_usage = dfa.memory_usage();
fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage);
}
@@ -431,12 +537,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
fprintf(stderr, "------ %s ------\n", to_s(type));
for (uint8_t k : {1, 2}) {
for (uint32_t sz : target_lengths) {
- std::string str = repeated_string("abcde", sz);
+ std::string str = repeated_string("aBcDeFgHiJ", sz);
double min_time_s;
if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
min_time_s = BenchmarkTimer::benchmark([&] {
for (const auto& line : dict) {
auto res = dfa.match(line, nullptr);
@@ -447,7 +553,9 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
min_time_s = BenchmarkTimer::benchmark([&] {
auto target_u32 = utf8_string_to_utf32(str);
for (const auto& line : dict) {
- auto line_u32 = utf8_string_to_utf32(line);
+ std::vector<uint32_t> line_u32 = ((casing() == LevenshteinDfa::Casing::Uncased)
+ ? vespalib::LowerCase::convert_to_ucs4(std::string_view(str))
+ : utf8_string_to_utf32(line));
auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k);
do_not_optimize_away(res);
}
@@ -472,7 +580,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
std::string str = repeated_string("abcde", sz);
auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
: LevenshteinDfa::DfaType::Implicit;
- auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+ auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
double min_time_s = BenchmarkTimer::benchmark([&] {
auto iter = dict.cbegin();
auto end = dict.cend();
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index c445c60cc01..405371462ab 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -13,6 +13,11 @@ concept DfaMatcher = requires(T a) {
typename T::StateParamType;
typename T::EdgeType;
+ // Whether the matching is case-sensitive or not. If false, all source string code points will
+ // be implicitly lower-cased prior to state stepping. For case-insensitive (i.e. uncased)
+ // matching to have the expected semantics, the actual target string must be pre-lowercased.
+ { a.is_cased() } -> std::same_as<bool>;
+
// Initial (starting) state of the DFA
{ a.start() } -> std::same_as<typename T::StateType>;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 8a4c9bab2b4..6e8068a7419 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -96,8 +96,11 @@ public:
using MatchResult = LevenshteinDfa::MatchResult;
private:
std::vector<DfaNodeType> _nodes;
+ const bool _is_cased;
public:
- ExplicitLevenshteinDfaImpl() noexcept = default;
+ explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept
+ : _is_cased(is_cased)
+ {}
~ExplicitLevenshteinDfaImpl() override = default;
static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
@@ -131,14 +134,12 @@ public:
template <typename Traits>
class ExplicitLevenshteinDfaBuilder {
- std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+ const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+ const bool _is_cased;
public:
- explicit ExplicitLevenshteinDfaBuilder(std::string_view str)
- : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str))
- {}
-
- explicit ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str) noexcept
- : _u32_str_buf(std::move(str))
+ ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased) noexcept
+ : _u32_str_buf(std::move(str)),
+ _is_cased(is_cased)
{}
[[nodiscard]] LevenshteinDfa build_dfa() const;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 9a5d3d1bdc7..16b1021de7b 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -21,13 +21,17 @@ struct ExplicitDfaMatcher {
using StateParamType = const DfaNodeType*;
const std::span<const DfaNodeType> _nodes;
+ const bool _is_cased;
- explicit ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes) noexcept
- : _nodes(nodes)
+ ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased) noexcept
+ : _nodes(nodes),
+ _is_cased(is_cased)
{}
static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
+ bool is_cased() const noexcept { return _is_cased; }
+
StateType start() const noexcept {
return &_nodes[0];
}
@@ -80,7 +84,7 @@ struct ExplicitDfaMatcher {
template <uint8_t MaxEdits>
LevenshteinDfa::MatchResult
ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
- ExplicitDfaMatcher<MaxEdits> matcher(_nodes);
+ ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
}
@@ -164,9 +168,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
using Base::is_match;
using Base::can_match;
using Base::transitions;
+
+ const bool _is_cased;
public:
- explicit ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str) noexcept
- : DfaSteppingBase<Traits>(str)
+ ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased) noexcept
+ : DfaSteppingBase<Traits>(str),
+ _is_cased(is_cased)
{
assert(str.size() < UINT32_MAX / max_out_edges_per_node());
}
@@ -181,7 +188,7 @@ public:
template <typename Traits>
LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
- auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>();
+ auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased);
ExploreState<StateType> exp;
// Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
// the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
@@ -221,7 +228,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
template <typename Traits>
LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
- ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf);
+ ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased);
return builder.build_dfa();
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 0846b95d135..8ef521ba14f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -9,16 +9,14 @@ namespace vespalib::fuzzy {
template <typename Traits>
class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
- std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+ const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+ const bool _is_cased;
public:
using MatchResult = LevenshteinDfa::MatchResult;
- explicit ImplicitLevenshteinDfa(std::string_view str)
- : ImplicitLevenshteinDfa(utf8_string_to_utf32(str))
- {}
-
- explicit ImplicitLevenshteinDfa(std::vector<uint32_t> str) noexcept
- : _u32_str_buf(std::move(str))
+ ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) noexcept
+ : _u32_str_buf(std::move(str)),
+ _is_cased(is_cased)
{}
~ImplicitLevenshteinDfa() override = default;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 4ee468e424b..e962dba0f18 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -29,12 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
using Base::is_match;
using Base::can_match;
- explicit ImplicitDfaMatcher(std::span<const uint32_t> u32_str) noexcept
- : Base(u32_str)
+ const bool _is_cased;
+
+ ImplicitDfaMatcher(std::span<const uint32_t> u32_str, bool is_cased) noexcept
+ : Base(u32_str),
+ _is_cased(is_cased)
{}
// start, is_match, can_match, match_edit_distance are all provided by base type
+ bool is_cased() const noexcept { return _is_cased; }
+
template <typename F>
bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
for (uint32_t i = 0; i < state.size(); ++i) {
@@ -109,7 +114,7 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
template <typename Traits>
LevenshteinDfa::MatchResult
ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
- ImplicitDfaMatcher<Traits> matcher(_u32_str_buf);
+ ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _is_cased);
return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index e75ef8365bf..77691ffefd8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -2,6 +2,7 @@
#include "explicit_levenshtein_dfa.h"
#include "implicit_levenshtein_dfa.h"
#include "levenshtein_dfa.h"
+#include "unicode_utils.h"
#include <vespa/vespalib/util/stringfmt.h>
#include <memory>
@@ -29,27 +30,30 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
_impl->dump_as_graphviz(out);
}
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
if (max_edits != 1 && max_edits != 2) {
throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
}
+ const bool is_cased = (casing == Casing::Cased);
+ auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string)
+ : utf8_string_to_utf32_lowercased(target_string);
if (dfa_type == DfaType::Implicit) {
if (max_edits == 1) {
- return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(target_string));
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased));
} else { // max_edits == 2
- return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(target_string));
+ return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
}
} else { // DfaType::Explicit
if (max_edits == 1) {
- return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(target_string).build_dfa();
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
} else { // max_edits == 2
- return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(target_string).build_dfa();
+ return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
}
}
}
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
// TODO automatically select implementation based on target length/max edits?
// Suggestion:
// - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64).
@@ -58,7 +62,7 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max
// an M1 Pro; your mileage may vary etc).
// Ideally the implicit DFA would always be the fastest (or at least approximately as
// fast as the explicit DFA), but this is not yet the case.
- return build(target_string, max_edits, DfaType::Implicit);
+ return build(target_string, max_edits, casing, DfaType::Implicit);
}
std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) {
@@ -70,7 +74,7 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo
return os;
}
-std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
if (dt == LevenshteinDfa::DfaType::Implicit) {
os << "Implicit";
} else {
@@ -80,4 +84,14 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
return os;
}
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) {
+ if (c == LevenshteinDfa::Casing::Uncased) {
+ os << "Uncased";
+ } else {
+ assert(c == LevenshteinDfa::Casing::Cased);
+ os << "Cased";
+ }
+ return os;
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index a26ccbe87ee..a8b2b77f647 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -182,6 +182,13 @@ public:
* memcmp()-ordering of strings and not whether they are technically valid Unicode.
* This should be the case for low-level dictionary data structures etc.
*
+ * Ordering notes: when the DFA is created as Uncased, the target and source strings
+ * are treated as lowercase when matching. The successor string is in this case generated
+ * _as if_ the input source string was originally lowercase. This is a special case
+ * intended for case-folded dictionaries that implicitly order strings by their lowercased
+ * form, but where they are stored in their original cased form. The (raw) cased form
+ * is what is passed to the DFA match() function.
+ *
* Memory allocation:
* This function does not directly or indirectly allocate any heap memory if either:
*
@@ -205,6 +212,29 @@ public:
};
/**
+ * Specifies the character case matching semantics of the DFA.
+ */
+ enum class Casing {
+ /**
+ * Characters are case-normalized (i.e. lowercased) prior to matching.
+ * Example: In uncased mode, 'A' and 'a' are considered exact matches and do not
+ * consume edits.
+ *
+ * See the ordering notes for `match()` on what Uncased implies when generating
+ * successor strings.
+ */
+ Uncased,
+ /**
+ * Characters are not case-normalized prior to matching.
+ * Example: 'A' and 'a' are considered separate characters and will consume edits.
+ *
+ * Cased mode preserves strict memcmp() UTF-8 ordering between source and successor
+ * strings.
+ */
+ Cased
+ };
+
+ /**
* Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
* edits of `target_string`. The type of DFA returned is specified by dfa_type.
*
@@ -212,14 +242,13 @@ public:
*
* `target_string` must not contain any null UTF-8 chars.
*/
- [[nodiscard]] static LevenshteinDfa build(std::string_view target_string,
- uint8_t max_edits,
- DfaType dfa_type);
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
+ Casing casing, DfaType dfa_type);
/**
* Same as build() but currently always returns an implicit DFA.
*/
- [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits);
+ [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, Casing casing);
/**
* Dumps the DFA as a Graphviz graph in text format to the provided output stream.
@@ -239,6 +268,7 @@ public:
};
std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
-std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c);
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index 206b69f8ebe..d9c4e0ffe36 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -4,6 +4,7 @@
#include "dfa_matcher.h"
#include "levenshtein_dfa.h"
#include "unicode_utils.h"
+#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
#include <cassert>
#include <concepts>
@@ -139,7 +140,8 @@ struct MatchAlgorithm {
* Both the input and successor output strings are in UTF-8 format. To avoid doing
* duplicate work, we keep track of the byte length of the string prefix that will be
* part of the successor and simply copy it verbatim instead of building the string
- * from converted UTF-32 -> UTF-8 chars as we go.
+ * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
+ * when one or more of the prefix characters have been lowercase-transformed.
*
* TODO we could probably also optimize the smallest suffix generation with this when
* we know we can no longer insert any smaller char substitutions and the only way
@@ -147,6 +149,10 @@ struct MatchAlgorithm {
* - To do this we'd need both the original UTF-8 target string as well as a
* secondary vector that maps u32 character index to the corresponding UTF-8 index.
* Both trivial to get as part of DFA initialization.
+ *
+ * TODO let matcher know if source string is pre-normalized (i.e. lowercased).
+ * TODO std::u32string output; no need to re-encode to UTF-8.
+ * TODO consider opportunistically appending prefix as we go instead of only when needed.
*/
template <DfaMatcher Matcher>
static MatchResult match(const Matcher& matcher,
@@ -154,15 +160,20 @@ struct MatchAlgorithm {
std::string* successor_out)
{
using StateType = typename Matcher::StateType;
- vespalib::Utf8Reader u8_reader(source.data(), source.size());
+ Utf8Reader u8_reader(source.data(), source.size());
uint32_t n_prefix_u8_bytes = 0;
uint32_t char_after_prefix = 0;
StateType last_state_with_higher_out = StateType{};
+ bool can_use_raw_prefix = true;
StateType state = matcher.start();
while (u8_reader.hasMore()) {
const auto u8_pos_before_char = u8_reader.getPos();
- const uint32_t mch = u8_reader.getChar();
+ const uint32_t raw_mch = u8_reader.getChar();
+ const uint32_t mch = normalized_match_char(raw_mch, matcher.is_cased());
+ if (raw_mch != mch) {
+ can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix
+ }
if (successor_out && matcher.has_higher_out_edge(state, mch)) {
last_state_with_higher_out = state;
n_prefix_u8_bytes = u8_pos_before_char;
@@ -174,7 +185,8 @@ struct MatchAlgorithm {
} else {
// Can never match; find the successor if requested
if (successor_out) {
- *successor_out = source.substr(0, n_prefix_u8_bytes);
+ emit_successor_prefix(*successor_out, source, n_prefix_u8_bytes,
+ matcher.is_cased() || can_use_raw_prefix);
assert(matcher.valid_state(last_state_with_higher_out));
backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
char_after_prefix, *successor_out);
@@ -187,7 +199,8 @@ struct MatchAlgorithm {
return MatchResult::make_match(max_edits(), edits);
}
if (successor_out) {
- *successor_out = source;
+ emit_successor_prefix(*successor_out, source, source.size(),
+ matcher.is_cased() || can_use_raw_prefix);
emit_smallest_matching_suffix(matcher, state, *successor_out);
}
return MatchResult::make_mismatch(max_edits());
@@ -286,6 +299,43 @@ struct MatchAlgorithm {
}
}
}
+
+ /**
+ * The successor prefix is the prefix of the source string up to (but not including) the
+ * point where we emit a lexicographically higher character. Ideally we can just copy the
+ * UTF-8 bytes verbatim from the source into the successor. This is possible when one of
+ * the following holds:
+ *
+ * - DFA uses Cased (i.e. exact) matching, or
+ * - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase
+ * transform. This means the prefix is already as-if lowercased, and we can copy it
+ * verbatim.
+ *
+ * In the case that we can't copy verbatim, we currently have to explicitly normalize the
+ * prefix by converting it to its lowercased form.
+ *
+ * Example: Uncased matching for target "food" with input "FOXX". This generates the
+ * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
+ * ordering when compared byte-wise against an implicitly lowercased dictionary.
+ */
+ static void emit_successor_prefix(std::string& successor_out, std::string_view source,
+ uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
+ {
+ if (emit_raw_prefix_u8_bytes) {
+ successor_out = source.substr(0, n_prefix_u8_bytes);
+ } else {
+ // TODO avoid duplicate work...! :I
+ successor_out.clear();
+ Utf8Reader u8_reader(source.data(), source.size());
+ while (u8_reader.getPos() < n_prefix_u8_bytes) {
+ append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar()));
+ }
+ }
+ }
+
+ static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
+ return (is_cased ? in_ch : LowerCase::convert(in_ch));
+ }
};
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
index 648be234562..f2c6259fd10 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -1,22 +1,41 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "unicode_utils.h"
+#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <stdexcept>
namespace vespalib::fuzzy {
-std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
- vespalib::stringref ch_str(str.data(), str.size());
- vespalib::Utf8Reader utf8_reader(ch_str);
+namespace {
+
+template <bool ToLowercase>
+std::vector<uint32_t> utf8_string_to_utf32_impl(std::string_view str) {
+ stringref ch_str(str.data(), str.size());
+ Utf8Reader utf8_reader(ch_str); // TODO consider integrating simdutf library
std::vector<uint32_t> u32ret;
u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII
while (utf8_reader.hasMore()) {
- u32ret.emplace_back(utf8_reader.getChar());
+ if constexpr (ToLowercase) {
+ u32ret.emplace_back(LowerCase::convert(utf8_reader.getChar()));
+ } else {
+ u32ret.emplace_back(utf8_reader.getChar());
+ }
+
}
return u32ret;
}
+}
+
+std::vector<uint32_t> utf8_string_to_utf32_lowercased(std::string_view str) {
+ return utf8_string_to_utf32_impl<true>(str);
+}
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
+ return utf8_string_to_utf32_impl<false>(str);
+}
+
std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
}
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
index 8627b01ff6a..711c4f1fcd0 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -8,6 +8,9 @@
namespace vespalib::fuzzy {
+// UTF-8 -> UTF-32 conversion with lowercasing of all characters
+std::vector<uint32_t> utf8_string_to_utf32_lowercased(std::string_view str);
+// UTF-8 -> UTF-32 conversion without case conversion
std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);