diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-03-18 17:01:41 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-03-18 17:01:41 +0100 |
commit | df9dcfecf12e995052ae6be81f808c3d393bb7b9 (patch) | |
tree | 74d3b05164ca60360fec8d1e5ee56186c899f864 | |
parent | 47942e08dfa992a9c86b4f4954ff25cdb7ce7b12 (diff) | |
parent | 33c708225110a2b88de0a167f254138ff47408a2 (diff) |
Merge pull request #21752 from vespa-engine/geirst/lowercase-convert-to-ucs4
Add function to lowercase an utf8 string while converting it to ucs4 …
-rw-r--r-- | vespalib/src/tests/text/lowercase/lowercase_test.cpp | 19 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/text/lowercase.cpp | 12 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/text/lowercase.h | 10 |
3 files changed, 34 insertions, 7 deletions
diff --git a/vespalib/src/tests/text/lowercase/lowercase_test.cpp b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
index 8117dc20666..ffc6dce427c 100644
--- a/vespalib/src/tests/text/lowercase/lowercase_test.cpp
+++ b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
@@ -9,13 +9,8 @@ LOG_SETUP("lowercase_test");
 
 using namespace vespalib;
 
-TEST_SETUP(Test);
-
-int
-Test::Main()
+TEST("test basic lowercase")
 {
-    TEST_INIT("lowercase_test");
-
     EXPECT_EQUAL('a', LowerCase::convert('A'));
     EXPECT_EQUAL((int8_t)'a', LowerCase::convert((int8_t)'A'));
     EXPECT_EQUAL((uint8_t)'a', LowerCase::convert((uint8_t)'A'));
@@ -34,5 +29,15 @@ Test::Main()
         }
         // printf("lowercase( %d )= %d\n", hi, lo);
     }
-    TEST_DONE();
 }
+
+TEST("lowercase utf8 string to ucs4")
+{
+    auto res = LowerCase::convert_to_ucs4(std::string_view("ABC"));
+    EXPECT_EQUAL(3u, res.size());
+    EXPECT_EQUAL((uint32_t)'a', res[0]);
+    EXPECT_EQUAL((uint32_t)'b', res[1]);
+    EXPECT_EQUAL((uint32_t)'c', res[2]);
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.cpp b/vespalib/src/vespa/vespalib/text/lowercase.cpp
index 604027d2687..de6c5956fcb 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.cpp
+++ b/vespalib/src/vespa/vespalib/text/lowercase.cpp
@@ -20,6 +20,18 @@ LowerCase::convert(vespalib::stringref input)
     return output;
 }
 
+std::vector<uint32_t>
+LowerCase::convert_to_ucs4(vespalib::stringref input)
+{
+    std::vector<uint32_t> result;
+    result.reserve(input.size());
+    Utf8Reader reader(input.data());
+    while (reader.hasMore()) {
+        result.emplace_back(convert(reader.getChar()));
+    }
+    return result;
+}
+
 /*
 
   NOTE: the tables below are generated from Java code to
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.h b/vespalib/src/vespa/vespalib/text/lowercase.h
index e0ed742c3c5..dc081c6ba2d 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.h
+++ b/vespalib/src/vespa/vespalib/text/lowercase.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <vespa/vespalib/stllike/string.h>
+#include <vector>
 
 namespace vespalib {
 
@@ -103,6 +104,15 @@ public:
      * CHARACTER (U+FFFD).
      **/
     static vespalib::string convert(vespalib::stringref input);
+
+    /**
+     * Lowercase a string in UTF-8 format while converting it to UCS-4 codepoints.
+     */
+    static std::vector<uint32_t> convert_to_ucs4(vespalib::stringref input);
+    static std::vector<uint32_t> convert_to_ucs4(std::string_view input) {
+        return convert_to_ucs4(vespalib::stringref(input.data(), input.size()));
+    }
+
 };
 
 