Merge pull request #28540 from vespa-engine/vekterli/case-normalized-levenshtein-dfa-matching

Add support for case-insensitive matching to Levenshtein DFAs
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2023-09-15 15:40:52 +0200
committer: GitHub <noreply@github.com> 2023-09-15 15:40:52 +0200
commit: 699ecbd2c056d95adbcd99a70e9169d12edb2ba4 (patch)
tree: 354819f4565f958cfcad28f984ffd6a960525ffd
parent: 7e3d557d89bb0f9efc283e98478d568c801b4d96 (diff)
parent: 6d696cd634a6bd4ea1b4c046ade9bfc5b5d246a8 (diff)
11 files changed, 344 insertions, 104 deletions
diff --git a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
index 6966fd0b703..c466d0bfb3e 100644
--- a/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
+++ b/vespalib/src/tests/fuzzy/levenshtein_dfa_test.cpp
@@ -1,10 +1,10 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
 #include <vespa/vespalib/fuzzy/dfa_stepping_base.h>
-#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/fuzzy/levenshtein_dfa.h>
 #include <vespa/vespalib/fuzzy/levenshtein_distance.h> // For benchmarking purposes
+#include <vespa/vespalib/fuzzy/unicode_utils.h>
+#include <vespa/vespalib/text/lowercase.h>
 #include <vespa/vespalib/util/benchmark_timer.h>
-#include <charconv>
 #include <concepts>
 #include <filesystem>
 #include <fstream>
@@ -18,38 +18,67 @@ namespace fs = std::filesystem;
 
 static std::string benchmark_dictionary;
 
-struct LevenshteinDfaTest : TestWithParam<LevenshteinDfa::DfaType> {
+using CasingAndDfaType = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType>;
 
-    static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+namespace {
 
-    static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
-        auto dfa_lhs = LevenshteinDfa::build(left, threshold, dfa_type());
-        auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
+std::optional<uint32_t> do_calculate(std::string_view left, std::string_view right, uint32_t threshold,
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+    auto dfa_lhs = LevenshteinDfa::build(left, threshold, casing, dfa_type);
+    auto maybe_match_lhs = dfa_lhs.match(right, nullptr);
 
-        auto dfa_rhs = LevenshteinDfa::build(right, threshold, dfa_type());
-        auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
+    auto dfa_rhs = LevenshteinDfa::build(right, threshold, casing, dfa_type);
+    auto maybe_match_rhs = dfa_rhs.match(left, nullptr);
 
-        EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
-        if (maybe_match_lhs.matches()) {
-            EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
-            return {maybe_match_lhs.edits()};
-        }
-        return std::nullopt;
+    EXPECT_EQ(maybe_match_lhs.matches(), maybe_match_rhs.matches());
+    if (maybe_match_lhs.matches()) {
+        EXPECT_EQ(maybe_match_lhs.edits(), maybe_match_rhs.edits());
+        return {maybe_match_lhs.edits()};
+    }
+    return std::nullopt;
+}
+
+std::optional<uint32_t> do_calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold,
+                                     LevenshteinDfa::Casing casing, LevenshteinDfa::DfaType dfa_type)
+{
+    std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
+    std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
+    return do_calculate(lhs_ch, rhs_ch, threshold, casing, dfa_type);
+}
+
+}
+
+struct LevenshteinDfaTest : TestWithParam<CasingAndDfaType> {
+
+    static LevenshteinDfa::Casing casing() noexcept { return std::get<0>(GetParam()); }
+    static LevenshteinDfa::DfaType dfa_type() noexcept { return std::get<1>(GetParam()); }
+
+    static std::string stringify_params(const TestParamInfo<ParamType>& info) {
+        std::ostringstream ss;
+        ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+        return ss.str();
+    }
+
+    static std::optional<uint32_t> calculate(std::string_view left, std::string_view right, uint32_t threshold) {
+        return do_calculate(left, right, threshold, casing(), dfa_type());
     }
 
     static std::optional<uint32_t> calculate(std::u8string_view left, std::u8string_view right, uint32_t threshold) {
-        std::string_view lhs_ch(reinterpret_cast<const char*>(left.data()), left.size());
-        std::string_view rhs_ch(reinterpret_cast<const char*>(right.data()), right.size());
-        return calculate(lhs_ch, rhs_ch, threshold);
+        return do_calculate(left, right, threshold, casing(), dfa_type());
     }
 
 };
 
-INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+// All the baseline DFA tests use lowercase only, so they should have the exact same outcome
+// for both cased and uncased matching.
+INSTANTIATE_TEST_SUITE_P(AllCasingAndDfaTypes,
                          LevenshteinDfaTest,
-                         Values(LevenshteinDfa::DfaType::Explicit,
-                                LevenshteinDfa::DfaType::Implicit),
-                         PrintToStringParamName());
+                         Combine(Values(LevenshteinDfa::Casing::Uncased,
+                                        LevenshteinDfa::Casing::Cased),
+                                 Values(LevenshteinDfa::DfaType::Explicit,
+                                        LevenshteinDfa::DfaType::Implicit)),
+                         LevenshteinDfaTest::stringify_params);
 
 // Same as existing non-DFA Levenshtein tests, but with some added instantiations
 // for smaller max distances.
@@ -101,7 +130,7 @@ void test_dfa_successor(const LevenshteinDfa& dfa, std::string_view source, std:
 }
 
 TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings) {
-    auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+    auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
 
     test_dfa_successor(dfa, "",       "\x01""food");
     test_dfa_successor(dfa, "faa",    "faod");
@@ -122,8 +151,8 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
 
     // Also works with Unicode
     // 2 chars
-    test_dfa_successor(dfa, "\xc3\x86""x",                // "Æx"
-                            "\xc3\x87""food");            // "Çfood"
+    test_dfa_successor(dfa, "\xc3\xa6""x",                // "æx"
+                            "\xc3\xa7""food");            // "çfood"
     // 3 chars
     test_dfa_successor(dfa, "\xe7\x8c\xab""\xe3\x81\xaf", // "猫は"
                             "\xe7\x8c\xac""food");        // "猬food"
@@ -138,7 +167,7 @@ TEST_P(LevenshteinDfaTest, can_generate_successors_to_mismatching_source_strings
 }
 
 TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_input) {
-    auto dfa = LevenshteinDfa::build("food", 1, dfa_type());
+    auto dfa = LevenshteinDfa::build("food", 1, casing(), dfa_type());
     // The successor string must be lexicographically larger than the input string.
     // In the presence of a wildcard output edge we handle this by increase the input
     // character by 1 and encoding it back as UTF-8.
@@ -155,7 +184,7 @@ TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_max_unicode_code_point_
 }
 
 TEST_P(LevenshteinDfaTest, successor_is_well_defined_for_empty_target) {
-    auto dfa = LevenshteinDfa::build("", 1, dfa_type());
+    auto dfa = LevenshteinDfa::build("", 1, casing(), dfa_type());
     test_dfa_successor(dfa, "aa",    "b");
     test_dfa_successor(dfa, "b\x01", "c");
     test_dfa_successor(dfa, "vespa", "w");
@@ -170,12 +199,80 @@ TEST_P(LevenshteinDfaTest, malformed_utf8_is_replaced_with_placeholder_char) {
     EXPECT_EQ(calculate("\xff\xff", "a", 2),     std::optional{2});
     EXPECT_EQ(calculate("a", "\xff", 2),         std::optional{1});
     EXPECT_EQ(calculate("a", "\xff\xff\xff", 2), std::nullopt);
-    EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2),  std::optional{0});
+    EXPECT_EQ(calculate("\xff", "\xef\xbf\xbd"/*U+FFFD*/, 2), std::optional{0});
 }
 
 TEST_P(LevenshteinDfaTest, unsupported_max_edits_value_throws) {
-    EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, dfa_type()), std::invalid_argument);
-    EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, dfa_type()), std::invalid_argument);
+    EXPECT_THROW((void)LevenshteinDfa::build("abc", 0, casing(), dfa_type()), std::invalid_argument);
+    EXPECT_THROW((void)LevenshteinDfa::build("abc", 3, casing(), dfa_type()), std::invalid_argument);
+}
+
+struct LevenshteinDfaCasingTest : TestWithParam<LevenshteinDfa::DfaType> {
+    static LevenshteinDfa::DfaType dfa_type() noexcept { return GetParam(); }
+
+    static std::optional<uint32_t> calculate_cased(std::string_view left, std::string_view right, uint32_t threshold) {
+        return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Cased, dfa_type());
+    }
+
+    static std::optional<uint32_t> calculate_uncased(std::string_view left, std::string_view right, uint32_t threshold) {
+        return do_calculate(left, right, threshold, LevenshteinDfa::Casing::Uncased, dfa_type());
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
+                         LevenshteinDfaCasingTest,
+                         Values(LevenshteinDfa::DfaType::Explicit,
+                                LevenshteinDfa::DfaType::Implicit),
+                         PrintToStringParamName());
+
+TEST_P(LevenshteinDfaCasingTest, uncased_edge_cases_have_correct_edit_distance) {
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(calculate_uncased("abc", "ABC", max),  std::optional{0}) << max;
+        EXPECT_EQ(calculate_uncased("Abc", "aB1", max),  std::optional{1}) << max;
+        EXPECT_EQ(calculate_uncased("aBC", "1bc", max),  std::optional{1}) << max;
+        EXPECT_EQ(calculate_uncased("Abc", "a1C", max),  std::optional{1}) << max;
+        EXPECT_EQ(calculate_uncased("aBc", "AB", max),   std::optional{1}) << max;
+        EXPECT_EQ(calculate_uncased("ABC", "abcd", max), std::optional{1}) << max;
+    }
+    EXPECT_EQ(calculate_uncased("bc", "aBCd", 2),  std::optional{2});
+    EXPECT_EQ(calculate_uncased("ab", "AbCd", 2),  std::optional{2});
+    EXPECT_EQ(calculate_uncased("CD", "AbcD", 2),  std::optional{2});
+    EXPECT_EQ(calculate_uncased("ad", "AbcD", 2),  std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, cased_edge_cases_have_correct_edit_distance) {
+    for (auto max : {1, 2}) {
+        EXPECT_EQ(calculate_cased("abc", "abC", max),  std::optional{1}) << max;
+        EXPECT_EQ(calculate_cased("Abc", "aB1", max),  std::nullopt) << max;
+        EXPECT_EQ(calculate_cased("aBC", "1bc", max),  std::nullopt) << max;
+        EXPECT_EQ(calculate_cased("Abc", "a1C", max),  std::nullopt) << max;
+        EXPECT_EQ(calculate_cased("ABC", "abcd", max), std::nullopt) << max;
+    }
+    EXPECT_EQ(calculate_cased("abc", "ABC", 2),  std::nullopt);
+    EXPECT_EQ(calculate_cased("abc", "aBC", 2),  std::optional{2});
+    EXPECT_EQ(calculate_cased("bc", "aBCd", 2),  std::nullopt);
+    EXPECT_EQ(calculate_cased("ab", "AbCd", 2),  std::nullopt);
+    EXPECT_EQ(calculate_cased("CD", "AbcD", 2),  std::nullopt);
+    EXPECT_EQ(calculate_cased("ad", "AbcD", 2),  std::nullopt);
+    EXPECT_EQ(calculate_cased("ad", "aBCd", 2),  std::optional{2});
+}
+
+TEST_P(LevenshteinDfaCasingTest, uncased_successor_is_emitted_as_if_match_term_was_lowercased) {
+    auto dfa = LevenshteinDfa::build("FOOD", 1, LevenshteinDfa::Casing::Uncased, dfa_type());
+    // This is a subset of the other successor test cases
+    test_dfa_successor(dfa, "",       "\x01""food");
+    test_dfa_successor(dfa, "FAA",    "faod");
+    test_dfa_successor(dfa, "fOoOoO", "foop");
+    test_dfa_successor(dfa, "OOOf",   "pfood");
+    test_dfa_successor(dfa, "Fo",     "fo\x01""d");
+    test_dfa_successor(dfa, "oO",     "ood");
+    test_dfa_successor(dfa, "OOO",    "oood");
+    test_dfa_successor(dfa, "FOXX",   "foyd");
+    test_dfa_successor(dfa, "GG",     "good");
+    test_dfa_successor(dfa, "Gp",     "hfood");
+    test_dfa_successor(dfa, "EP",     "f\x01""od");
+    test_dfa_successor(dfa, "Hfoodz", "hood");
+    test_dfa_successor(dfa, "Aooodz", "bfood");
 }
 
 // Turn integer v into its bitwise string representation with the MSB as the leftmost character.
@@ -191,20 +288,22 @@ std::string bits_to_str(T v) {
     return ret;
 }
 
-using DfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::DfaType, uint32_t>;
+using CasingAndDfaTypeAndMaxEdits = std::tuple<LevenshteinDfa::Casing, LevenshteinDfa::DfaType, uint32_t>;
 
-struct LevenshteinDfaSuccessorTest : TestWithParam<DfaTypeAndMaxEdits> {
-    // Print test suffix as e.g. "/Explicit_1" instead of just a GTest-chosen number.
+struct LevenshteinDfaSuccessorTest : TestWithParam<CasingAndDfaTypeAndMaxEdits> {
+    // Print test suffix as e.g. "/Uncased_Explicit_1" instead of just a GTest-chosen number.
     static std::string stringify_params(const TestParamInfo<ParamType>& info) {
         std::ostringstream ss;
-        ss << std::get<0>(info.param) << '_' << std::get<1>(info.param);
+        ss << std::get<0>(info.param) << '_' << std::get<1>(info.param) << '_' << std::get<2>(info.param);
         return ss.str();
     }
 };
 
 INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
                          LevenshteinDfaSuccessorTest,
-                         Combine(Values(LevenshteinDfa::DfaType::Explicit,
+                         Combine(Values(LevenshteinDfa::Casing::Uncased,
+                                        LevenshteinDfa::Casing::Cased),
+                                 Values(LevenshteinDfa::DfaType::Explicit,
                                         LevenshteinDfa::DfaType::Implicit),
                                  Values(1, 2)),
                          LevenshteinDfaSuccessorTest::stringify_params);
@@ -225,10 +324,10 @@ INSTANTIATE_TEST_SUITE_P(SupportedMaxEdits,
  * Inspired by approach used by Lucene DFA exhaustive testing.
  */
 TEST_P(LevenshteinDfaSuccessorTest, exhaustive_successor_test) {
-    const auto [dfa_type, max_edits] = GetParam();
+    const auto [casing, dfa_type, max_edits] = GetParam();
     for (uint32_t i = 0; i < 256; ++i) {
         const auto target = bits_to_str(static_cast<uint8_t>(i));
-        auto target_dfa = LevenshteinDfa::build(target, max_edits, dfa_type);
+        auto target_dfa = LevenshteinDfa::build(target, max_edits, casing, dfa_type);
         std::string skip_to, successor;
         for (uint32_t j = 0; j < 256; ++j) {
             const auto source = bits_to_str(static_cast<uint8_t>(j));
@@ -298,11 +397,10 @@ enum class BenchmarkType {
 };
 
 const char* to_s(BenchmarkType t) noexcept {
-    // Note: need underscores since this is used as part of GTest-generated test instance names
     switch (t) {
-    case BenchmarkType::DfaExplicit: return "DFA_explicit";
-    case BenchmarkType::DfaImplicit: return "DFA_implicit";
-    case BenchmarkType::Legacy:      return "legacy";
+    case BenchmarkType::DfaExplicit: return "DfaExplicit";
+    case BenchmarkType::DfaImplicit: return "DfaImplicit";
+    case BenchmarkType::Legacy:      return "Legacy";
     }
     abort();
 }
@@ -315,10 +413,14 @@ const char* to_s(BenchmarkType t) noexcept {
     return {2, 8, 16, 64, 256, 1024, 1024*16, 1024*64};
 }
 
-struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
+using BenchmarkTypeAndCasing = std::tuple<BenchmarkType, LevenshteinDfa::Casing>;
+
+struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkTypeAndCasing> {
 
     static std::string stringify_params(const TestParamInfo<ParamType>& info) {
-        return to_s(info.param);
+        std::ostringstream ss;
+        ss << to_s(std::get<0>(info.param)) << '_' << std::get<1>(info.param);
+        return ss.str();
     }
 
     void SetUp() override {
@@ -327,7 +429,8 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
         }
     }
 
-    static BenchmarkType benchmark_type() noexcept { return GetParam(); }
+    static BenchmarkType benchmark_type() noexcept  { return std::get<0>(GetParam()); }
+    static LevenshteinDfa::Casing casing() noexcept { return std::get<1>(GetParam()); }
 
     static const std::vector<std::string>& load_dictionary_once() {
         static auto sorted_lines = read_and_sort_all_lines(fs::path(benchmark_dictionary));
@@ -351,9 +454,11 @@ struct LevenshteinBenchmarkTest : TestWithParam<BenchmarkType> {
 
 INSTANTIATE_TEST_SUITE_P(AllDfaTypes,
                          LevenshteinBenchmarkTest,
-                         Values(BenchmarkType::DfaExplicit,
-                                BenchmarkType::DfaImplicit,
-                                BenchmarkType::Legacy),
+                         Combine(Values(BenchmarkType::DfaExplicit,
+                                        BenchmarkType::DfaImplicit,
+                                        BenchmarkType::Legacy),
+                                 Values(LevenshteinDfa::Casing::Cased,
+                                        LevenshteinDfa::Casing::Uncased)),
                          LevenshteinBenchmarkTest::stringify_params);
 
 // ("abc", 1) => "a"
@@ -382,12 +487,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_worst_case_matching_excluding_setup_t
             // string must be matched and any sparse representation is always maximally filled since
             // we never expend any edits via mismatches.
             // Also ensure that we have multiple out-edges per node (i.e. don't just repeat "AAA" etc.).
-            std::string str = repeated_string("abcde", sz);
+            std::string str = repeated_string("aBcDeFgHiJ", sz);
             double min_time_s;
             if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
                 auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
                                                                      : LevenshteinDfa::DfaType::Implicit;
-                auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+                auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
                 min_time_s = BenchmarkTimer::benchmark([&] {
                     auto res = dfa.match(str, nullptr); // not benchmarking successor generation
                     do_not_optimize_away(res);
@@ -408,15 +513,16 @@ TEST(LevenshteinExplicitDfaBenchmarkTest, benchmark_explicit_dfa_construction) {
     if (!benchmarking_enabled()) {
         GTEST_SKIP() << "benchmarking not enabled";
     }
+    const auto casing = LevenshteinDfa::Casing::Cased; // For building, casing only affects initial string normalization
     using vespalib::BenchmarkTimer;
     for (uint8_t k : {1, 2}) {
         for (uint32_t sz : string_lengths()) {
-            std::string str = repeated_string("abcde", sz);
+            std::string str = repeated_string("aBcDeFgHiJ", sz);
             double min_time_s = BenchmarkTimer::benchmark([&] {
-                auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+                auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
                 do_not_optimize_away(dfa);
             }, 2.0);
-            auto dfa = LevenshteinDfa::build(str, k, LevenshteinDfa::DfaType::Explicit);
+            auto dfa = LevenshteinDfa::build(str, k, casing, LevenshteinDfa::DfaType::Explicit);
             size_t mem_usage = dfa.memory_usage();
             fprintf(stderr, "k=%u, sz=%u: \t%g us \t%zu bytes\n", k, sz, min_time_s * 1000000.0, mem_usage);
         }
@@ -431,12 +537,12 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
     fprintf(stderr, "------ %s ------\n", to_s(type));
     for (uint8_t k : {1, 2}) {
         for (uint32_t sz : target_lengths) {
-            std::string str = repeated_string("abcde", sz);
+            std::string str = repeated_string("aBcDeFgHiJ", sz);
             double min_time_s;
             if (type == BenchmarkType::DfaExplicit || type == BenchmarkType::DfaImplicit) {
                 auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
                                                                      : LevenshteinDfa::DfaType::Implicit;
-                auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+                auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
                 min_time_s = BenchmarkTimer::benchmark([&] {
                     for (const auto& line : dict) {
                         auto res = dfa.match(line, nullptr);
@@ -447,7 +553,9 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_brute_force_dictionary_scan) {
                 min_time_s = BenchmarkTimer::benchmark([&] {
                     auto target_u32 = utf8_string_to_utf32(str);
                     for (const auto& line : dict) {
-                        auto line_u32 = utf8_string_to_utf32(line);
+                        std::vector<uint32_t> line_u32 = ((casing() == LevenshteinDfa::Casing::Uncased)
+                                                          ? vespalib::LowerCase::convert_to_ucs4(std::string_view(str))
+                                                          : utf8_string_to_utf32(line));
                         auto res = vespalib::LevenshteinDistance::calculate(line_u32, target_u32, k);
                         do_not_optimize_away(res);
                     }
@@ -472,7 +580,7 @@ TEST_P(LevenshteinBenchmarkTest, benchmark_skipping_dictionary_scan) {
             std::string str = repeated_string("abcde", sz);
             auto dfa_type = (type == BenchmarkType::DfaExplicit) ? LevenshteinDfa::DfaType::Explicit
                                                                  : LevenshteinDfa::DfaType::Implicit;
-            auto dfa = LevenshteinDfa::build(str, k, dfa_type);
+            auto dfa = LevenshteinDfa::build(str, k, casing(), dfa_type);
             double min_time_s = BenchmarkTimer::benchmark([&] {
                 auto iter = dict.cbegin();
                 auto end = dict.cend();
diff --git a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
index c445c60cc01..405371462ab 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/dfa_matcher.h
@@ -13,6 +13,11 @@ concept DfaMatcher = requires(T a) {
     typename T::StateParamType;
     typename T::EdgeType;
 
+    // Whether the matching is case-sensitive or not. If false, all source string code points will
+    // be implicitly lower-cased prior to state stepping. For case-insensitive (i.e. uncased)
+    // matching to have the expected semantics, the actual target string must be pre-lowercased.
+    { a.is_cased() } -> std::same_as<bool>;
+
     // Initial (starting) state of the DFA
     { a.start() } -> std::same_as<typename T::StateType>;
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
index 8a4c9bab2b4..6e8068a7419 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.h
@@ -96,8 +96,11 @@ public:
     using MatchResult = LevenshteinDfa::MatchResult;
 private:
     std::vector<DfaNodeType> _nodes;
+    const bool               _is_cased;
 public:
-    ExplicitLevenshteinDfaImpl() noexcept = default;
+    explicit ExplicitLevenshteinDfaImpl(bool is_cased) noexcept
+        : _is_cased(is_cased)
+    {}
     ~ExplicitLevenshteinDfaImpl() override = default;
 
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
@@ -131,14 +134,12 @@ public:
 
 template <typename Traits>
 class ExplicitLevenshteinDfaBuilder {
-    std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    const bool                  _is_cased;
 public:
-    explicit ExplicitLevenshteinDfaBuilder(std::string_view str)
-        : ExplicitLevenshteinDfaBuilder(utf8_string_to_utf32(str))
-    {}
-
-    explicit ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str) noexcept
-        : _u32_str_buf(std::move(str))
+    ExplicitLevenshteinDfaBuilder(std::vector<uint32_t> str, bool is_cased) noexcept
+        : _u32_str_buf(std::move(str)),
+          _is_cased(is_cased)
     {}
 
     [[nodiscard]] LevenshteinDfa build_dfa() const;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
index 9a5d3d1bdc7..16b1021de7b 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/explicit_levenshtein_dfa.hpp
@@ -21,13 +21,17 @@ struct ExplicitDfaMatcher {
     using StateParamType = const DfaNodeType*;
 
     const std::span<const DfaNodeType> _nodes;
+    const bool                         _is_cased;
 
-    explicit ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes) noexcept
-        : _nodes(nodes)
+    ExplicitDfaMatcher(const std::span<const DfaNodeType> nodes, bool is_cased) noexcept
+        : _nodes(nodes),
+          _is_cased(is_cased)
     {}
 
     static constexpr uint8_t max_edits() noexcept { return MaxEdits; }
 
+    bool is_cased() const noexcept { return _is_cased; }
+
     StateType start() const noexcept {
         return &_nodes[0];
     }
@@ -80,7 +84,7 @@ struct ExplicitDfaMatcher {
 template <uint8_t MaxEdits>
 LevenshteinDfa::MatchResult
 ExplicitLevenshteinDfaImpl<MaxEdits>::match(std::string_view u8str, std::string* successor_out) const {
-    ExplicitDfaMatcher<MaxEdits> matcher(_nodes);
+    ExplicitDfaMatcher<MaxEdits> matcher(_nodes, _is_cased);
     return MatchAlgorithm<MaxEdits>::match(matcher, u8str, successor_out);
 }
 
@@ -164,9 +168,12 @@ class ExplicitLevenshteinDfaBuilderImpl : public DfaSteppingBase<Traits> {
     using Base::is_match;
     using Base::can_match;
     using Base::transitions;
+
+    const bool _is_cased;
 public:
-    explicit ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str) noexcept
-        : DfaSteppingBase<Traits>(str)
+    ExplicitLevenshteinDfaBuilderImpl(std::span<const uint32_t> str, bool is_cased) noexcept
+        : DfaSteppingBase<Traits>(str),
+          _is_cased(is_cased)
     {
         assert(str.size() < UINT32_MAX / max_out_edges_per_node());
     }
@@ -181,7 +188,7 @@ public:
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
-    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>();
+    auto dfa = std::make_unique<ExplicitLevenshteinDfaImpl<max_edits()>>(_is_cased);
     ExploreState<StateType> exp;
     // Use BFS instead of DFS to ensure most node edges point to nodes that are allocated _after_
     // the parent node, which means the CPU can skip ahead instead of ping-ponging back and forth.
@@ -221,7 +228,7 @@ LevenshteinDfa ExplicitLevenshteinDfaBuilderImpl<Traits>::build_dfa() const {
 
 template <typename Traits>
 LevenshteinDfa ExplicitLevenshteinDfaBuilder<Traits>::build_dfa() const {
-    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf);
+    ExplicitLevenshteinDfaBuilderImpl<Traits> builder(_u32_str_buf, _is_cased);
     return builder.build_dfa();
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
index 0846b95d135..8ef521ba14f 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.h
@@ -9,16 +9,14 @@ namespace vespalib::fuzzy {
 
 template <typename Traits>
 class ImplicitLevenshteinDfa final : public LevenshteinDfa::Impl {
-    std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    const std::vector<uint32_t> _u32_str_buf; // TODO std::u32string
+    const bool                  _is_cased;
 public:
     using MatchResult = LevenshteinDfa::MatchResult;
 
-    explicit ImplicitLevenshteinDfa(std::string_view str)
-        : ImplicitLevenshteinDfa(utf8_string_to_utf32(str))
-    {}
-
-    explicit ImplicitLevenshteinDfa(std::vector<uint32_t> str) noexcept
-        : _u32_str_buf(std::move(str))
+    ImplicitLevenshteinDfa(std::vector<uint32_t> str, bool is_cased) noexcept
+        : _u32_str_buf(std::move(str)),
+          _is_cased(is_cased)
     {}
 
     ~ImplicitLevenshteinDfa() override = default;
diff --git a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
index 4ee468e424b..e962dba0f18 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/implicit_levenshtein_dfa.hpp
@@ -29,12 +29,17 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
     using Base::is_match;
     using Base::can_match;
 
-    explicit ImplicitDfaMatcher(std::span<const uint32_t> u32_str) noexcept
-        : Base(u32_str)
+    const bool _is_cased;
+
+    ImplicitDfaMatcher(std::span<const uint32_t> u32_str, bool is_cased) noexcept
+        : Base(u32_str),
+          _is_cased(is_cased)
     {}
 
     // start, is_match, can_match, match_edit_distance are all provided by base type
 
+    bool is_cased() const noexcept { return _is_cased; }
+
     template <typename F>
     bool has_any_char_matching(const StateType& state, F&& f) const noexcept(noexcept(f(uint32_t{}))) {
         for (uint32_t i = 0; i < state.size(); ++i) {
@@ -109,7 +114,7 @@ struct ImplicitDfaMatcher : public DfaSteppingBase<Traits> {
 template <typename Traits>
 LevenshteinDfa::MatchResult
 ImplicitLevenshteinDfa<Traits>::match(std::string_view u8str, std::string* successor_out) const {
-    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf);
+    ImplicitDfaMatcher<Traits> matcher(_u32_str_buf, _is_cased);
     return MatchAlgorithm<Traits::max_edits()>::match(matcher, u8str, successor_out);
 }
 
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
index e75ef8365bf..77691ffefd8 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.cpp
@@ -2,6 +2,7 @@
 #include "explicit_levenshtein_dfa.h"
 #include "implicit_levenshtein_dfa.h"
 #include "levenshtein_dfa.h"
+#include "unicode_utils.h"
 #include <vespa/vespalib/util/stringfmt.h>
 #include <memory>
 
@@ -29,27 +30,30 @@ void LevenshteinDfa::dump_as_graphviz(std::ostream& out) const {
     _impl->dump_as_graphviz(out);
 }
 
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, DfaType dfa_type) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing, DfaType dfa_type) {
     if (max_edits != 1 && max_edits != 2) {
         throw std::invalid_argument(make_string("Levenshtein DFA max_edits must be in {1, 2}, was %u", max_edits));
     }
+    const bool is_cased = (casing == Casing::Cased);
+    auto target_string_u32 = is_cased ? utf8_string_to_utf32(target_string)
+                                      : utf8_string_to_utf32_lowercased(target_string);
     if (dfa_type == DfaType::Implicit) {
         if (max_edits == 1) {
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(target_string));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<1>>>(std::move(target_string_u32), is_cased));
         } else { // max_edits == 2
-            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(target_string));
+            return LevenshteinDfa(std::make_unique<ImplicitLevenshteinDfa<FixedMaxEditDistanceTraits<2>>>(std::move(target_string_u32), is_cased));
         }
     } else { // DfaType::Explicit
         if (max_edits == 1) {
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(target_string).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<1>>(std::move(target_string_u32), is_cased).build_dfa();
         } else { // max_edits == 2
-            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(target_string).build_dfa();
+            return ExplicitLevenshteinDfaBuilder<FixedMaxEditDistanceTraits<2>>(std::move(target_string_u32), is_cased).build_dfa();
         }
     }
 
 }
 
-LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits) {
+LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max_edits, Casing casing) {
     // TODO automatically select implementation based on target length/max edits?
     //  Suggestion:
     //   - Explicit DFA iff (k == 1 && |target| <= 256) || (k == 2 && |target| <= 64).
@@ -58,7 +62,7 @@ LevenshteinDfa LevenshteinDfa::build(std::string_view target_string, uint8_t max
     //  an M1 Pro; your mileage may vary etc).
     //  Ideally the implicit DFA would always be the fastest (or at least approximately as
     //  fast as the explicit DFA), but this is not yet the case.
-    return build(target_string, max_edits, DfaType::Implicit);
+    return build(target_string, max_edits, casing, DfaType::Implicit);
 }
 
 std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos) {
@@ -70,7 +74,7 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mo
     return os;
 }
 
-std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt) {
     if (dt == LevenshteinDfa::DfaType::Implicit) {
         os << "Implicit";
     } else {
@@ -80,4 +84,14 @@ std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt) {
     return os;
 }
 
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c) {
+    if (c == LevenshteinDfa::Casing::Uncased) {
+        os << "Uncased";
+    } else {
+        assert(c == LevenshteinDfa::Casing::Cased);
+        os << "Cased";
+    }
+    return os;
+}
+
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
index a26ccbe87ee..a8b2b77f647 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/levenshtein_dfa.h
@@ -182,6 +182,13 @@ public:
      * memcmp()-ordering of strings and not whether they are technically valid Unicode.
      * This should be the case for low-level dictionary data structures etc.
      *
+     * Ordering notes: when the DFA is created as Uncased, the target and source strings
+     * are treated as lowercase when matching. The successor string is in this case generated
+     * _as if_ the input source string was originally lowercase. This is a special case
+     * intended for case-folded dictionaries that implicitly order strings by their lowercased
+     * form, but where they are stored in their original cased form. The (raw) cased form
+     * is what is passed to the DFA match() function.
+     *
      * Memory allocation:
      * This function does not directly or indirectly allocate any heap memory if either:
      *
@@ -205,6 +212,29 @@ public:
     };
 
     /**
+     * Specifies the character case matching semantics of the DFA.
+     */
+    enum class Casing {
+        /**
+         * Characters are case-normalized (i.e. lowercased) prior to matching.
+         * Example: In uncased mode, 'A' and 'a' are considered exact matches and do not
+         * consume edits.
+         *
+         * See the ordering notes for `match()` on what Uncased implies when generating
+         * successor strings.
+         */
+        Uncased,
+        /**
+         * Characters are not case-normalized prior to matching.
+         * Example: 'A' and 'a' are considered separate characters and will consume edits.
+         *
+         * Cased mode preserves strict memcmp() UTF-8 ordering between source and successor
+         * strings.
+         */
+        Cased
+    };
+
+    /**
      * Builds and returns a Levenshtein DFA that matches all strings within `max_edits`
      * edits of `target_string`. The type of DFA returned is specified by dfa_type.
      *
@@ -212,14 +242,13 @@ public:
      *
      * `target_string` must not contain any null UTF-8 chars.
      */
-    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string,
-                                              uint8_t max_edits,
-                                              DfaType dfa_type);
+    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits,
+                                              Casing casing, DfaType dfa_type);
 
     /**
      * Same as build() but currently always returns an implicit DFA.
      */
-    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits);
+    [[nodiscard]] static LevenshteinDfa build(std::string_view target_string, uint8_t max_edits, Casing casing);
 
     /**
      * Dumps the DFA as a Graphviz graph in text format to the provided output stream.
@@ -239,6 +268,7 @@ public:
 };
 
 std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::MatchResult& mos);
-std::ostream& operator<<(std::ostream& os, const LevenshteinDfa::DfaType& dt);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::DfaType dt);
+std::ostream& operator<<(std::ostream& os, LevenshteinDfa::Casing c);
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
index 206b69f8ebe..d9c4e0ffe36 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/match_algorithm.hpp
@@ -4,6 +4,7 @@
 #include "dfa_matcher.h"
 #include "levenshtein_dfa.h"
 #include "unicode_utils.h"
+#include <vespa/vespalib/text/lowercase.h>
 #include <vespa/vespalib/text/utf8.h>
 #include <cassert>
 #include <concepts>
@@ -139,7 +140,8 @@ struct MatchAlgorithm {
      * Both the input and successor output strings are in UTF-8 format. To avoid doing
      * duplicate work, we keep track of the byte length of the string prefix that will be
      * part of the successor and simply copy it verbatim instead of building the string
-     * from converted UTF-32 -> UTF-8 chars as we go.
+     * from converted UTF-32 -> UTF-8 chars as we go. This optimization cannot be used
+     * when one or more of the prefix characters have been lowercase-transformed.
      *
      * TODO we could probably also optimize the smallest suffix generation with this when
      *   we know we can no longer insert any smaller char substitutions and the only way
@@ -147,6 +149,10 @@ struct MatchAlgorithm {
      *    - To do this we'd need both the original UTF-8 target string as well as a
      *      secondary vector that maps u32 character index to the corresponding UTF-8 index.
      *      Both trivial to get as part of DFA initialization.
+     *
+     * TODO let matcher know if source string is pre-normalized (i.e. lowercased).
+     * TODO std::u32string output; no need to re-encode to UTF-8.
+     * TODO consider opportunistically appending prefix as we go instead of only when needed.
      */
     template <DfaMatcher Matcher>
     static MatchResult match(const Matcher& matcher,
@@ -154,15 +160,20 @@ struct MatchAlgorithm {
                              std::string* successor_out)
     {
         using StateType = typename Matcher::StateType;
-        vespalib::Utf8Reader u8_reader(source.data(), source.size());
+        Utf8Reader u8_reader(source.data(), source.size());
         uint32_t n_prefix_u8_bytes = 0;
         uint32_t char_after_prefix = 0;
         StateType last_state_with_higher_out = StateType{};
+        bool can_use_raw_prefix = true;
 
         StateType state = matcher.start();
         while (u8_reader.hasMore()) {
             const auto u8_pos_before_char = u8_reader.getPos();
-            const uint32_t mch = u8_reader.getChar();
+            const uint32_t raw_mch = u8_reader.getChar();
+            const uint32_t mch     = normalized_match_char(raw_mch, matcher.is_cased());
+            if (raw_mch != mch) {
+                can_use_raw_prefix = false; // FIXME this is pessimistic; considers entire string, not just prefix
+            }
             if (successor_out && matcher.has_higher_out_edge(state, mch)) {
                 last_state_with_higher_out = state;
                 n_prefix_u8_bytes = u8_pos_before_char;
@@ -174,7 +185,8 @@ struct MatchAlgorithm {
             } else {
                 // Can never match; find the successor if requested
                 if (successor_out) {
-                    *successor_out = source.substr(0, n_prefix_u8_bytes);
+                    emit_successor_prefix(*successor_out, source, n_prefix_u8_bytes,
+                                          matcher.is_cased() || can_use_raw_prefix);
                     assert(matcher.valid_state(last_state_with_higher_out));
                     backtrack_and_emit_greater_suffix(matcher, last_state_with_higher_out,
                                                       char_after_prefix, *successor_out);
@@ -187,7 +199,8 @@ struct MatchAlgorithm {
             return MatchResult::make_match(max_edits(), edits);
         }
         if (successor_out) {
-            *successor_out = source;
+            emit_successor_prefix(*successor_out, source, source.size(),
+                                  matcher.is_cased() || can_use_raw_prefix);
             emit_smallest_matching_suffix(matcher, state, *successor_out);
         }
         return MatchResult::make_mismatch(max_edits());
@@ -286,6 +299,43 @@ struct MatchAlgorithm {
             }
         }
     }
+
+    /**
+     * The successor prefix is the prefix of the source string up to (but not including) the
+     * point where we emit a lexicographically higher character. Ideally we can just copy the
+     * UTF-8 bytes verbatim from the source into the successor. This is possible when one of
+     * the following holds:
+     *
+     *   - DFA uses Cased (i.e. exact) matching, or
+     *   - DFA uses Uncased, but none of the characters in the prefix triggered a lowercase
+     *     transform. This means the prefix is already as-if lowercased, and we can copy it
+     *     verbatim.
+     *
+     * In the case that we can't copy verbatim, we currently have to explicitly normalize the
+     * prefix by converting it to its lowercased form.
+     *
+     * Example: Uncased matching for target "food" with input "FOXX". This generates the
+     * successor "foyd" (and _not_ "FOyd"), as the latter would imply a completely different
+     * ordering when compared byte-wise against an implicitly lowercased dictionary.
+     */
+    static void emit_successor_prefix(std::string& successor_out, std::string_view source,
+                                      uint32_t n_prefix_u8_bytes, bool emit_raw_prefix_u8_bytes)
+    {
+        if (emit_raw_prefix_u8_bytes) {
+            successor_out = source.substr(0, n_prefix_u8_bytes);
+        } else {
+            // TODO avoid duplicate work...! :I
+            successor_out.clear();
+            Utf8Reader u8_reader(source.data(), source.size());
+            while (u8_reader.getPos() < n_prefix_u8_bytes) {
+                append_utf32_char_as_utf8(successor_out, LowerCase::convert(u8_reader.getChar()));
+            }
+        }
+    }
+
+    static uint32_t normalized_match_char(uint32_t in_ch, bool is_cased) noexcept {
+        return (is_cased ? in_ch : LowerCase::convert(in_ch));
+    }
 };
 
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
index 648be234562..f2c6259fd10 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.cpp
@@ -1,22 +1,41 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "unicode_utils.h"
+#include <vespa/vespalib/text/lowercase.h>
 #include <vespa/vespalib/text/utf8.h>
 #include <vespa/vespalib/util/stringfmt.h>
 #include <stdexcept>
 
 namespace vespalib::fuzzy {
 
-std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
-    vespalib::stringref ch_str(str.data(), str.size());
-    vespalib::Utf8Reader utf8_reader(ch_str);
+namespace {
+
+template <bool ToLowercase>
+std::vector<uint32_t> utf8_string_to_utf32_impl(std::string_view str) {
+    stringref ch_str(str.data(), str.size());
+    Utf8Reader utf8_reader(ch_str); // TODO consider integrating simdutf library
     std::vector<uint32_t> u32ret;
     u32ret.reserve(str.size()); // Will over-allocate for all non-ASCII
     while (utf8_reader.hasMore()) {
-        u32ret.emplace_back(utf8_reader.getChar());
+        if constexpr (ToLowercase) {
+            u32ret.emplace_back(LowerCase::convert(utf8_reader.getChar()));
+        } else {
+            u32ret.emplace_back(utf8_reader.getChar());
+        }
+
     }
     return u32ret;
 }
 
+}
+
+std::vector<uint32_t> utf8_string_to_utf32_lowercased(std::string_view str) {
+    return utf8_string_to_utf32_impl<true>(str);
+}
+
+std::vector<uint32_t> utf8_string_to_utf32(std::string_view str) {
+    return utf8_string_to_utf32_impl<false>(str);
+}
+
 std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str) {
     return utf8_string_to_utf32(std::string_view(reinterpret_cast<const char*>(u8str.data()), u8str.size()));
 }
diff --git a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
index 8627b01ff6a..711c4f1fcd0 100644
--- a/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
+++ b/vespalib/src/vespa/vespalib/fuzzy/unicode_utils.h
@@ -8,6 +8,9 @@
 
 namespace vespalib::fuzzy {
 
+// UTF-8 -> UTF-32 conversion with lowercasing of all characters
+std::vector<uint32_t> utf8_string_to_utf32_lowercased(std::string_view str);
+// UTF-8 -> UTF-32 conversion without case conversion
 std::vector<uint32_t> utf8_string_to_utf32(std::string_view str);
 
 std::vector<uint32_t> utf8_string_to_utf32(std::u8string_view u8str);
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2023-09-15 15:40:52 +0200
committer	GitHub <noreply@github.com>	2023-09-15 15:40:52 +0200
commit	699ecbd2c056d95adbcd99a70e9169d12edb2ba4 (patch)
tree	354819f4565f958cfcad28f984ffd6a960525ffd
parent	7e3d557d89bb0f9efc283e98478d568c801b4d96 (diff)
parent	6d696cd634a6bd4ea1b4c046ade9bfc5b5d246a8 (diff)