diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-06-30 10:46:43 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-30 10:46:43 +0200 |
commit | 5aea1d09520da60523f5f972b50890ce930d6d54 (patch) | |
tree | 10daab3e99414f6ac70ceb89864300291615a98a | |
parent | 848174d1efea5d117db2accd228247e649c21803 (diff) | |
parent | 8c1b6e8cece2f1a317a05052e8bf35934680743e (diff) |
Merge pull request #13729 from vespa-engine/arnej/filter-invalid-utf8
filter invalid UTF-8 (including encoded surrogates) to make protobuf …
-rw-r--r-- | logd/src/logd/proto_converter.cpp | 3 |
-rw-r--r-- | logd/src/tests/proto_converter/proto_converter_test.cpp | 16 |
-rw-r--r-- | vespalib/src/vespa/vespalib/text/utf8.cpp | 25 |
-rw-r--r-- | vespalib/src/vespa/vespalib/text/utf8.h | 16 |
4 files changed, 53 insertions, 7 deletions
diff --git a/logd/src/logd/proto_converter.cpp b/logd/src/logd/proto_converter.cpp index b3facd4ef4a..e4331e00480 100644 --- a/logd/src/logd/proto_converter.cpp +++ b/logd/src/logd/proto_converter.cpp @@ -1,6 +1,7 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "proto_converter.h" +#include <vespa/vespalib/text/utf8.h> using ns_log::LogMessage; using ns_log::Logger; @@ -59,7 +60,7 @@ ProtoConverter::log_message_to_proto(const LogMessage& message, ProtoLogMessage& proto.set_service(message.service()); proto.set_component(message.component()); proto.set_level(convert_level(message.level())); - proto.set_payload(message.payload()); + proto.set_payload(vespalib::Utf8::filter_invalid_sequences(message.payload())); } } diff --git a/logd/src/tests/proto_converter/proto_converter_test.cpp b/logd/src/tests/proto_converter/proto_converter_test.cpp index aa0b00e34d6..702752e8482 100644 --- a/logd/src/tests/proto_converter/proto_converter_test.cpp +++ b/logd/src/tests/proto_converter/proto_converter_test.cpp @@ -84,5 +84,21 @@ TEST_F(LogRequestTest, log_messages_are_converted_to_request) ProtoLogLevel::LogMessage_Level_EVENT, "bar_payload", proto.log_messages(1)); } +// UTF-8 encoding of \U+FFFD +#define FFFD "\xEF\xBF\xBD" + +TEST_F(LogRequestTest, invalid_utf8_is_filtered) +{ + messages.emplace_back(12345, "foo_host", 3, 5, "foo_service", "foo_component", Logger::info, + "valid: \xE2\x82\xAC and \xEF\xBF\xBA; semi-valid: \xED\xA0\xBD\xED\xB8\x80; invalid: \xCC surrogate \xED\xBF\xBF overlong \xC1\x81 end" + ); + convert(); + EXPECT_EQ(1, proto.log_messages_size()); + expect_proto_log_message_equal(12345, "foo_host", 3, 5, "foo_service", "foo_component", + ProtoLogLevel::LogMessage_Level_INFO, + "valid: \xE2\x82\xAC and \xEF\xBF\xBA; semi-valid: " FFFD FFFD "; invalid: " FFFD " surrogate " FFFD " overlong " FFFD FFFD " end", + proto.log_messages(0)); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git 
a/vespalib/src/vespa/vespalib/text/utf8.cpp b/vespalib/src/vespa/vespalib/text/utf8.cpp index 660178aaaa1..58b587d45b5 100644 --- a/vespalib/src/vespa/vespalib/text/utf8.cpp +++ b/vespalib/src/vespa/vespalib/text/utf8.cpp @@ -175,8 +175,9 @@ Utf8ReaderForZTS::getComplexChar(unsigned char firstbyte, uint32_t fallback) } -Utf8Writer& -Utf8Writer::putChar(uint32_t codepoint) +template <typename Target> +Utf8Writer<Target>& +Utf8Writer<Target>::putChar(uint32_t codepoint) { if (codepoint < 0x80) { _target.push_back((char)codepoint); @@ -229,5 +230,23 @@ Utf8Writer::putChar(uint32_t codepoint) return *this; } +template class Utf8Writer<vespalib::string>; +template class Utf8Writer<std::string>; -} // namespace vespalib +template <typename T> +T Utf8::filter_invalid_sequences(const T& input) +{ + T retval; + Utf8Reader reader(input.c_str(), input.size()); + Utf8Writer writer(retval); + while (reader.hasMore()) { + uint32_t ch = reader.getChar(); + writer.putChar(ch); + } + return retval; +} + +template vespalib::string Utf8::filter_invalid_sequences(const vespalib::string&); +template std::string Utf8::filter_invalid_sequences(const std::string&); + +} // namespace diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h index 0c75203fbbe..e65aaee9708 100644 --- a/vespalib/src/vespa/vespalib/text/utf8.h +++ b/vespalib/src/vespa/vespalib/text/utf8.h @@ -28,6 +28,15 @@ public: }; /** + * Filter a string (std::string or vespalib::string) + * and replace any invalid UTF8 sequences with the + * standard replacement char U+FFFD; note that any + * UTF-8 encoded surrogates are also considered invalid. + **/ + template <typename T> + static T filter_invalid_sequences(const T& input); + + /** * check if a byte is valid as the first byte of an UTF-8 character. 
* @param c the byte to be checked * @return true if a valid UTF-8 character can start with this byte @@ -155,7 +164,7 @@ protected: first_high_surrogate = 0xD800, last_high_surrogate = 0xDBFF, first_low_surrogate = 0xDC00, - last_low_surrogate = 0xDCFF + last_low_surrogate = 0xDFFF }; }; @@ -321,9 +330,10 @@ public: /** * @brief Writer class that appends UTF-8 characters to a string **/ +template <typename Target> class Utf8Writer : public Utf8 { - string &_target; + Target &_target; public: /** * construct a writer appending to the given string @@ -331,7 +341,7 @@ public: * that the writer will append to. Must be writable * and must be kept alive while the writer is active. **/ - Utf8Writer(string &target) : _target(target) {} + Utf8Writer(Target &target) : _target(target) {} /** * append the given character to the target string. |