diff options
author | Arne Juul <arnej@verizonmedia.com> | 2020-06-30 07:39:04 +0000 |
---|---|---|
committer | Arne Juul <arnej@verizonmedia.com> | 2020-06-30 07:39:04 +0000 |
commit | 975afb02388be225e7e7bd827e6aaa8e5ccf7aea (patch) | |
tree | 5aa6d5514093e43f339083bfd409913e06ad5269 | |
parent | 2df4a50f754fc8d1021dc7c201e050279a4a47dd (diff) |
move UTF-8 filtering to vespalib::Utf8
-rw-r--r-- | vespalib/src/vespa/vespalib/text/utf8.cpp | 17 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/text/utf8.h | 9 |
2 files changed, 25 insertions, 1 deletions
```diff
diff --git a/vespalib/src/vespa/vespalib/text/utf8.cpp b/vespalib/src/vespa/vespalib/text/utf8.cpp
index 8f73479f10f..58b587d45b5 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.cpp
+++ b/vespalib/src/vespa/vespalib/text/utf8.cpp
@@ -233,5 +233,20 @@ Utf8Writer<Target>::putChar(uint32_t codepoint)
 template class Utf8Writer<vespalib::string>;
 template class Utf8Writer<std::string>;
 
+template <typename T>
+T Utf8::filter_invalid_sequences(const T& input)
+{
+    T retval;
+    Utf8Reader reader(input.c_str(), input.size());
+    Utf8Writer writer(retval);
+    while (reader.hasMore()) {
+        uint32_t ch = reader.getChar();
+        writer.putChar(ch);
+    }
+    return retval;
+}
+
+template vespalib::string Utf8::filter_invalid_sequences(const vespalib::string&);
+template std::string Utf8::filter_invalid_sequences(const std::string&);
 
-} // namespace vespalib
+} // namespace
diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h
index d2b8204ce08..e65aaee9708 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.h
+++ b/vespalib/src/vespa/vespalib/text/utf8.h
@@ -28,6 +28,15 @@ public:
     };
 
     /**
+     * Filter a string (std::string or vespalib::string)
+     * and replace any invalid UTF8 sequences with the
+     * standard replacement char U+FFFD; note that any
+     * UTF-8 encoded surrogates are also considered invalid.
+     **/
+    template <typename T>
+    static T filter_invalid_sequences(const T& input);
+
+    /**
      * check if a byte is valid as the first byte of an UTF-8 character.
      * @param c the byte to be checked
      * @return true if a valid UTF-8 character can start with this byte
```