move UTF-8 filtering to vespalib::Utf8

author: Arne Juul <arnej@verizonmedia.com> 2020-06-30 07:39:04 +0000
committer: Arne Juul <arnej@verizonmedia.com> 2020-06-30 07:39:04 +0000
commit: 975afb02388be225e7e7bd827e6aaa8e5ccf7aea (patch)
tree: 5aa6d5514093e43f339083bfd409913e06ad5269 /vespalib
parent: 2df4a50f754fc8d1021dc7c201e050279a4a47dd (diff)
2 files changed, 25 insertions, 1 deletions
diff --git a/vespalib/src/vespa/vespalib/text/utf8.cpp b/vespalib/src/vespa/vespalib/text/utf8.cpp
index 8f73479f10f..58b587d45b5 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.cpp
+++ b/vespalib/src/vespa/vespalib/text/utf8.cpp
@@ -233,5 +233,20 @@ Utf8Writer<Target>::putChar(uint32_t codepoint)
 template class Utf8Writer<vespalib::string>;
 template class Utf8Writer<std::string>;
 
+template <typename T>
+T Utf8::filter_invalid_sequences(const T& input)
+{
+    // Read every character through Utf8Reader and re-encode it with Utf8Writer.
+    T filtered;
+    Utf8Writer writer(filtered);
+    for (Utf8Reader reader(input.c_str(), input.size()); reader.hasMore(); ) {
+        writer.putChar(reader.getChar());
+    }
+    return filtered;
+}
+
+// Explicit instantiations for vespalib::string and std::string.
+template vespalib::string Utf8::filter_invalid_sequences(const vespalib::string&);
+template std::string Utf8::filter_invalid_sequences(const std::string&);
 
-} // namespace vespalib
+} // namespace
diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h
index d2b8204ce08..e65aaee9708 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.h
+++ b/vespalib/src/vespa/vespalib/text/utf8.h
@@ -28,6 +28,15 @@ public:
     };
 
     /**
+     * Return a copy of the input (std::string or vespalib::string)
+     * where every invalid UTF-8 sequence is replaced by the
+     * standard replacement char U+FFFD; note that any
+     * UTF-8 encoded surrogates are also considered invalid.
+     **/
+    template <typename T>
+    static T filter_invalid_sequences(const T& input);
+
+    /**
      * check if a byte is valid as the first byte of an UTF-8 character.
      * @param c the byte to be checked
      * @return true if a valid UTF-8 character can start with this byte
author	Arne Juul <arnej@verizonmedia.com>	2020-06-30 07:39:04 +0000
committer	Arne Juul <arnej@verizonmedia.com>	2020-06-30 07:39:04 +0000
commit	975afb02388be225e7e7bd827e6aaa8e5ccf7aea (patch)
tree	5aa6d5514093e43f339083bfd409913e06ad5269 /vespalib
parent	2df4a50f754fc8d1021dc7c201e050279a4a47dd (diff)