From 33c708225110a2b88de0a167f254138ff47408a2 Mon Sep 17 00:00:00 2001
From: Geir Storli
Date: Fri, 18 Mar 2022 15:50:08 +0000
Subject: Add function to lowercase an utf8 string while converting it to ucs4 codepoints.

---
 vespalib/src/tests/text/lowercase/lowercase_test.cpp | 19 ++++++++++++-------
 vespalib/src/vespa/vespalib/text/lowercase.cpp       | 13 +++++++++++++
 vespalib/src/vespa/vespalib/text/lowercase.h         | 10 ++++++++++
 3 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/vespalib/src/tests/text/lowercase/lowercase_test.cpp b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
index 8117dc20666..ffc6dce427c 100644
--- a/vespalib/src/tests/text/lowercase/lowercase_test.cpp
+++ b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
@@ -9,13 +9,8 @@ LOG_SETUP("lowercase_test");
 
 using namespace vespalib;
 
-TEST_SETUP(Test);
-
-int
-Test::Main()
+TEST("test basic lowercase")
 {
-    TEST_INIT("lowercase_test");
-
     EXPECT_EQUAL('a', LowerCase::convert('A'));
     EXPECT_EQUAL((int8_t)'a', LowerCase::convert((int8_t)'A'));
     EXPECT_EQUAL((uint8_t)'a', LowerCase::convert((uint8_t)'A'));
@@ -34,5 +29,15 @@ Test::Main()
         }
         // printf("lowercase( %d )= %d\n", hi, lo);
     }
-    TEST_DONE();
 }
+
+TEST("lowercase utf8 string to ucs4")
+{
+    auto res = LowerCase::convert_to_ucs4(std::string_view("ABC"));
+    EXPECT_EQUAL(3u, res.size());
+    EXPECT_EQUAL((uint32_t)'a', res[0]);
+    EXPECT_EQUAL((uint32_t)'b', res[1]);
+    EXPECT_EQUAL((uint32_t)'c', res[2]);
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.cpp b/vespalib/src/vespa/vespalib/text/lowercase.cpp
index 604027d2687..de6c5956fcb 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.cpp
+++ b/vespalib/src/vespa/vespalib/text/lowercase.cpp
@@ -20,6 +20,19 @@ LowerCase::convert(vespalib::stringref input)
     return output;
 }
 
+std::vector<uint32_t>
+LowerCase::convert_to_ucs4(vespalib::stringref input)
+{
+    std::vector<uint32_t> result;
+    result.reserve(input.size());
+    // Hand the reader the whole stringref (pointer + length); the input need not be NUL-terminated.
+    Utf8Reader reader(input);
+    while (reader.hasMore()) {
+        result.emplace_back(convert(reader.getChar()));
+    }
+    return result;
+}
+
 
 /*
  NOTE: the tables below are generated from Java code to
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.h b/vespalib/src/vespa/vespalib/text/lowercase.h
index e0ed742c3c5..dc081c6ba2d 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.h
+++ b/vespalib/src/vespa/vespalib/text/lowercase.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <vespa/vespalib/stllike/string.h>
+#include <vector>
 
 namespace vespalib {
 
@@ -103,6 +104,15 @@ public:
      * CHARACTER (U+FFFD).
      **/
     static vespalib::string convert(vespalib::stringref input);
+
+    /**
+     * Lowercase a string in UTF-8 format while converting it to UCS-4 codepoints.
+     */
+    static std::vector<uint32_t> convert_to_ucs4(vespalib::stringref input);
+    static std::vector<uint32_t> convert_to_ucs4(std::string_view input) {
+        return convert_to_ucs4(vespalib::stringref(input.data(), input.size()));
+    }
+
 };
 
 
-- 
cgit v1.2.3