summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorArne Juul <arnej@verizonmedia.com>2020-06-30 07:39:04 +0000
committerArne Juul <arnej@verizonmedia.com>2020-06-30 07:39:04 +0000
commit975afb02388be225e7e7bd827e6aaa8e5ccf7aea (patch)
tree5aa6d5514093e43f339083bfd409913e06ad5269 /vespalib
parent2df4a50f754fc8d1021dc7c201e050279a4a47dd (diff)
move UTF-8 filtering to vespalib::Utf8
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/vespa/vespalib/text/utf8.cpp17
-rw-r--r--vespalib/src/vespa/vespalib/text/utf8.h9
2 files changed, 25 insertions, 1 deletions
diff --git a/vespalib/src/vespa/vespalib/text/utf8.cpp b/vespalib/src/vespa/vespalib/text/utf8.cpp
index 8f73479f10f..58b587d45b5 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.cpp
+++ b/vespalib/src/vespa/vespalib/text/utf8.cpp
@@ -233,5 +233,20 @@ Utf8Writer<Target>::putChar(uint32_t codepoint)
template class Utf8Writer<vespalib::string>;
template class Utf8Writer<std::string>;
+template <typename T>
+T Utf8::filter_invalid_sequences(const T& input) // builds and returns a new string; input is taken by const reference and left unmodified
+{
+ T retval;
+ Utf8Reader reader(input.c_str(), input.size()); // reader walks the raw bytes of input
+ Utf8Writer writer(retval); // writer is bound to retval, which is what gets returned
+ while (reader.hasMore()) {
+ uint32_t ch = reader.getChar(); // NOTE(review): presumably getChar() yields U+FFFD for an invalid sequence, as the header doc for this function describes - confirm in Utf8Reader
+ writer.putChar(ch); // re-encode the code point into retval
+ }
+ return retval;
+}
+
+template vespalib::string Utf8::filter_invalid_sequences(const vespalib::string&); // explicit instantiation: the template body is defined in this .cpp file, not in the header
+template std::string Utf8::filter_invalid_sequences(const std::string&); // explicit instantiation for std::string
-} // namespace vespalib
+} // namespace
diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h
index d2b8204ce08..e65aaee9708 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.h
+++ b/vespalib/src/vespa/vespalib/text/utf8.h
@@ -28,6 +28,15 @@ public:
};
/**
+ * Filter a string (std::string or vespalib::string)
+ * and replace any invalid UTF-8 sequences with the
+ * standard replacement character U+FFFD; note that any
+ * UTF-8 encoded surrogates are also considered invalid.
+ **/
+ template <typename T>
+ static T filter_invalid_sequences(const T& input);
+
+ /**
* check if a byte is valid as the first byte of an UTF-8 character.
* @param c the byte to be checked
* @return true if a valid UTF-8 character can start with this byte