summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Geir Storli <geirst@verizonmedia.com> 2020-06-30 10:46:43 +0200
committer: GitHub <noreply@github.com> 2020-06-30 10:46:43 +0200
commit: 5aea1d09520da60523f5f972b50890ce930d6d54 (patch)
tree: 10daab3e99414f6ac70ceb89864300291615a98a
parent: 848174d1efea5d117db2accd228247e649c21803 (diff)
parent: 8c1b6e8cece2f1a317a05052e8bf35934680743e (diff)
Merge pull request #13729 from vespa-engine/arnej/filter-invalid-utf8
filter invalid UTF-8 (including encoded surrogates) to make protobuf …
-rw-r--r-- logd/src/logd/proto_converter.cpp 3
-rw-r--r-- logd/src/tests/proto_converter/proto_converter_test.cpp 16
-rw-r--r-- vespalib/src/vespa/vespalib/text/utf8.cpp 25
-rw-r--r-- vespalib/src/vespa/vespalib/text/utf8.h 16
4 files changed, 53 insertions, 7 deletions
diff --git a/logd/src/logd/proto_converter.cpp b/logd/src/logd/proto_converter.cpp
index b3facd4ef4a..e4331e00480 100644
--- a/logd/src/logd/proto_converter.cpp
+++ b/logd/src/logd/proto_converter.cpp
@@ -1,6 +1,7 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "proto_converter.h"
+#include <vespa/vespalib/text/utf8.h>
using ns_log::LogMessage;
using ns_log::Logger;
@@ -59,7 +60,7 @@ ProtoConverter::log_message_to_proto(const LogMessage& message, ProtoLogMessage&
proto.set_service(message.service());
proto.set_component(message.component());
proto.set_level(convert_level(message.level()));
- proto.set_payload(message.payload());
+ proto.set_payload(vespalib::Utf8::filter_invalid_sequences(message.payload()));
}
}
diff --git a/logd/src/tests/proto_converter/proto_converter_test.cpp b/logd/src/tests/proto_converter/proto_converter_test.cpp
index aa0b00e34d6..702752e8482 100644
--- a/logd/src/tests/proto_converter/proto_converter_test.cpp
+++ b/logd/src/tests/proto_converter/proto_converter_test.cpp
@@ -84,5 +84,21 @@ TEST_F(LogRequestTest, log_messages_are_converted_to_request)
ProtoLogLevel::LogMessage_Level_EVENT, "bar_payload", proto.log_messages(1));
}
+// UTF-8 encoding of \U+FFFD
+#define FFFD "\xEF\xBF\xBD"
+
+TEST_F(LogRequestTest, invalid_utf8_is_filtered)
+{
+ messages.emplace_back(12345, "foo_host", 3, 5, "foo_service", "foo_component", Logger::info,
+ "valid: \xE2\x82\xAC and \xEF\xBF\xBA; semi-valid: \xED\xA0\xBD\xED\xB8\x80; invalid: \xCC surrogate \xED\xBF\xBF overlong \xC1\x81 end"
+ );
+ convert();
+ EXPECT_EQ(1, proto.log_messages_size());
+ expect_proto_log_message_equal(12345, "foo_host", 3, 5, "foo_service", "foo_component",
+ ProtoLogLevel::LogMessage_Level_INFO,
+ "valid: \xE2\x82\xAC and \xEF\xBF\xBA; semi-valid: " FFFD FFFD "; invalid: " FFFD " surrogate " FFFD " overlong " FFFD FFFD " end",
+ proto.log_messages(0));
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/vespalib/src/vespa/vespalib/text/utf8.cpp b/vespalib/src/vespa/vespalib/text/utf8.cpp
index 660178aaaa1..58b587d45b5 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.cpp
+++ b/vespalib/src/vespa/vespalib/text/utf8.cpp
@@ -175,8 +175,9 @@ Utf8ReaderForZTS::getComplexChar(unsigned char firstbyte, uint32_t fallback)
}
-Utf8Writer&
-Utf8Writer::putChar(uint32_t codepoint)
+template <typename Target>
+Utf8Writer<Target>&
+Utf8Writer<Target>::putChar(uint32_t codepoint)
{
if (codepoint < 0x80) {
_target.push_back((char)codepoint);
@@ -229,5 +230,23 @@ Utf8Writer::putChar(uint32_t codepoint)
return *this;
}
+template class Utf8Writer<vespalib::string>;
+template class Utf8Writer<std::string>;
-} // namespace vespalib
+template <typename T>
+T Utf8::filter_invalid_sequences(const T& input)
+{
+ T retval;
+ Utf8Reader reader(input.c_str(), input.size());
+ Utf8Writer writer(retval);
+ while (reader.hasMore()) {
+ uint32_t ch = reader.getChar();
+ writer.putChar(ch);
+ }
+ return retval;
+}
+
+template vespalib::string Utf8::filter_invalid_sequences(const vespalib::string&);
+template std::string Utf8::filter_invalid_sequences(const std::string&);
+
+} // namespace
diff --git a/vespalib/src/vespa/vespalib/text/utf8.h b/vespalib/src/vespa/vespalib/text/utf8.h
index 0c75203fbbe..e65aaee9708 100644
--- a/vespalib/src/vespa/vespalib/text/utf8.h
+++ b/vespalib/src/vespa/vespalib/text/utf8.h
@@ -28,6 +28,15 @@ public:
};
/**
+ * Filter a string (std::string or vespalib::string)
+ * and replace any invalid UTF8 sequences with the
+ * standard replacement char U+FFFD; note that any
+ * UTF-8 encoded surrogates are also considered invalid.
+ **/
+ template <typename T>
+ static T filter_invalid_sequences(const T& input);
+
+ /**
* check if a byte is valid as the first byte of an UTF-8 character.
* @param c the byte to be checked
* @return true if a valid UTF-8 character can start with this byte
@@ -155,7 +164,7 @@ protected:
first_high_surrogate = 0xD800,
last_high_surrogate = 0xDBFF,
first_low_surrogate = 0xDC00,
- last_low_surrogate = 0xDCFF
+ last_low_surrogate = 0xDFFF
};
};
@@ -321,9 +330,10 @@ public:
/**
* @brief Writer class that appends UTF-8 characters to a string
**/
+template <typename Target>
class Utf8Writer : public Utf8
{
- string &_target;
+ Target &_target;
public:
/**
* construct a writer appending to the given string
@@ -331,7 +341,7 @@ public:
* that the writer will append to. Must be writable
* and must be kept alive while the writer is active.
**/
- Utf8Writer(string &target) : _target(target) {}
+ Utf8Writer(Target &target) : _target(target) {}
/**
* append the given character to the target string.