diff options
Diffstat (limited to 'vespalib/src')
-rw-r--r-- | vespalib/src/tests/text/lowercase/lowercase_test.cpp | 19 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/text/lowercase.cpp | 12 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/text/lowercase.h | 10 |
3 files changed, 34 insertions, 7 deletions
diff --git a/vespalib/src/tests/text/lowercase/lowercase_test.cpp b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
index 8117dc20666..ffc6dce427c 100644
--- a/vespalib/src/tests/text/lowercase/lowercase_test.cpp
+++ b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
@@ -9,13 +9,8 @@ LOG_SETUP("lowercase_test");
 
 using namespace vespalib;
 
-TEST_SETUP(Test);
-
-int
-Test::Main()
+TEST("test basic lowercase")
 {
-    TEST_INIT("lowercase_test");
-
     EXPECT_EQUAL('a', LowerCase::convert('A'));
     EXPECT_EQUAL((int8_t)'a', LowerCase::convert((int8_t)'A'));
     EXPECT_EQUAL((uint8_t)'a', LowerCase::convert((uint8_t)'A'));
@@ -34,5 +29,15 @@ Test::Main()
         }
         // printf("lowercase( %d )= %d\n", hi, lo);
     }
-    TEST_DONE();
 }
+
+TEST("lowercase utf8 string to ucs4")
+{
+    auto res = LowerCase::convert_to_ucs4(std::string_view("ABC"));
+    EXPECT_EQUAL(3u, res.size());
+    EXPECT_EQUAL((uint32_t)'a', res[0]);
+    EXPECT_EQUAL((uint32_t)'b', res[1]);
+    EXPECT_EQUAL((uint32_t)'c', res[2]);
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.cpp b/vespalib/src/vespa/vespalib/text/lowercase.cpp
index 604027d2687..de6c5956fcb 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.cpp
+++ b/vespalib/src/vespa/vespalib/text/lowercase.cpp
@@ -20,6 +20,18 @@ LowerCase::convert(vespalib::stringref input)
     return output;
 }
 
+std::vector<uint32_t>
+LowerCase::convert_to_ucs4(vespalib::stringref input)
+{
+    std::vector<uint32_t> result;
+    result.reserve(input.size());
+    Utf8Reader reader(input);
+    while (reader.hasMore()) {
+        result.emplace_back(convert(reader.getChar()));
+    }
+    return result;
+}
+
 /*
 
   NOTE: the tables below are generated from Java code to
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.h b/vespalib/src/vespa/vespalib/text/lowercase.h
index e0ed742c3c5..dc081c6ba2d 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.h
+++ b/vespalib/src/vespa/vespalib/text/lowercase.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <vespa/vespalib/stllike/string.h>
+#include <vector>
 
 namespace vespalib {
 
@@ -103,6 +104,15 @@ public:
      * CHARACTER (U+FFFD).
      **/
     static vespalib::string convert(vespalib::stringref input);
+
+    /**
+     * Lowercase a string in UTF-8 format while converting it to UCS-4 codepoints.
+     */
+    static std::vector<uint32_t> convert_to_ucs4(vespalib::stringref input);
+    static std::vector<uint32_t> convert_to_ucs4(std::string_view input) {
+        return convert_to_ucs4(vespalib::stringref(input.data(), input.size()));
+    }
+
 };
 
 