aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Geir Storli <geirst@yahooinc.com> 2022-03-18 17:01:41 +0100
committer GitHub <noreply@github.com> 2022-03-18 17:01:41 +0100
commit df9dcfecf12e995052ae6be81f808c3d393bb7b9 (patch)
tree 74d3b05164ca60360fec8d1e5ee56186c899f864
parent 47942e08dfa992a9c86b4f4954ff25cdb7ce7b12 (diff)
parent 33c708225110a2b88de0a167f254138ff47408a2 (diff)
Merge pull request #21752 from vespa-engine/geirst/lowercase-convert-to-ucs4
Add function to lowercase an utf8 string while converting it to ucs4 …
-rw-r--r-- vespalib/src/tests/text/lowercase/lowercase_test.cpp 19
-rw-r--r-- vespalib/src/vespa/vespalib/text/lowercase.cpp 12
-rw-r--r-- vespalib/src/vespa/vespalib/text/lowercase.h 10
3 files changed, 34 insertions, 7 deletions
diff --git a/vespalib/src/tests/text/lowercase/lowercase_test.cpp b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
index 8117dc20666..ffc6dce427c 100644
--- a/vespalib/src/tests/text/lowercase/lowercase_test.cpp
+++ b/vespalib/src/tests/text/lowercase/lowercase_test.cpp
@@ -9,13 +9,8 @@ LOG_SETUP("lowercase_test");
using namespace vespalib;
-TEST_SETUP(Test);
-
-int
-Test::Main()
+TEST("test basic lowercase")
{
- TEST_INIT("lowercase_test");
-
EXPECT_EQUAL('a', LowerCase::convert('A'));
EXPECT_EQUAL((int8_t)'a', LowerCase::convert((int8_t)'A'));
EXPECT_EQUAL((uint8_t)'a', LowerCase::convert((uint8_t)'A'));
@@ -34,5 +29,15 @@ Test::Main()
}
// printf("lowercase( %d )= %d\n", hi, lo);
}
- TEST_DONE();
}
+
+TEST("lowercase utf8 string to ucs4")
+{
+    // ASCII-only input: the test expects exactly three code points back,
+    // each being the lowercase form of the corresponding input letter.
+    const std::string_view upper("ABC");
+    const auto lowered = LowerCase::convert_to_ucs4(upper);
+    EXPECT_EQUAL(3u, lowered.size());
+    EXPECT_EQUAL(static_cast<uint32_t>('a'), lowered[0]);
+    EXPECT_EQUAL(static_cast<uint32_t>('b'), lowered[1]);
+    EXPECT_EQUAL(static_cast<uint32_t>('c'), lowered[2]);
+}
+
+// Test program entry point; presumably TEST_RUN_ALL() executes the
+// TEST(...) cases defined above (confirm against the test framework).
+TEST_MAIN()
+{
+    TEST_RUN_ALL();
+}
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.cpp b/vespalib/src/vespa/vespalib/text/lowercase.cpp
index 604027d2687..de6c5956fcb 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.cpp
+++ b/vespalib/src/vespa/vespalib/text/lowercase.cpp
@@ -20,6 +20,18 @@ LowerCase::convert(vespalib::stringref input)
return output;
}
+/**
+ * Decode a UTF-8 string and return its code points, each one lowercased.
+ *
+ * @param input UTF-8 encoded bytes; exactly input.size() bytes are consumed.
+ * @return the lowercased UCS-4 code points, in input order.
+ */
+std::vector<uint32_t>
+LowerCase::convert_to_ucs4(vespalib::stringref input)
+{
+    std::vector<uint32_t> result;
+    // Every decoded code point consumes at least one input byte, so the
+    // byte count is an upper bound on the number of elements.
+    result.reserve(input.size());
+    // Give the reader the whole view (pointer and length). Passing only
+    // input.data() would drop the length, making the decoded extent depend
+    // on a terminating NUL byte rather than on input.size().
+    Utf8Reader reader(input);
+    while (reader.hasMore()) {
+        result.emplace_back(convert(reader.getChar()));
+    }
+    return result;
+}
+
/*
NOTE: the tables below are generated from Java code to
diff --git a/vespalib/src/vespa/vespalib/text/lowercase.h b/vespalib/src/vespa/vespalib/text/lowercase.h
index e0ed742c3c5..dc081c6ba2d 100644
--- a/vespalib/src/vespa/vespalib/text/lowercase.h
+++ b/vespalib/src/vespa/vespalib/text/lowercase.h
@@ -5,6 +5,7 @@
#pragma once
#include <vespa/vespalib/stllike/string.h>
+#include <vector>
namespace vespalib {
@@ -103,6 +104,15 @@ public:
* CHARACTER (U+FFFD).
**/
static vespalib::string convert(vespalib::stringref input);
+
+    /**
+     * Convert a UTF-8 encoded string to a sequence of UCS-4 codepoints,
+     * lowercasing each codepoint along the way.
+     **/
+    static std::vector<uint32_t> convert_to_ucs4(vespalib::stringref input);
+
+    /**
+     * Overload taking a std::string_view; forwards to the stringref
+     * variant using the view's data pointer and size.
+     **/
+    static std::vector<uint32_t> convert_to_ucs4(std::string_view input) {
+        vespalib::stringref ref(input.data(), input.size());
+        return convert_to_ucs4(ref);
+    }
+
};