diff options
author | Geir Storli <geirstorli@yahoo.no> | 2017-02-09 14:07:35 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-02-09 14:07:35 +0100 |
commit | f7d13af1f8b6b880473f3ae832400e5bddd96257 (patch) | |
tree | a8f41d32d204c5483128dd92fafb406a523003e2 | |
parent | ee78e9d3af060ee832c9527d09e8f858ae2f7bbf (diff) | |
parent | e8767798f65a8587f1b7ed253deb3dbfc463ffba (diff) |
Merge pull request #1729 from yahoo/havardpe/support-compressed-constants
Havardpe/support compressed constants
18 files changed, 360 insertions, 37 deletions
diff --git a/eval/src/tests/eval/value_cache/bad_lz4.json.lz4 b/eval/src/tests/eval/value_cache/bad_lz4.json.lz4 Binary files differnew file mode 100644 index 00000000000..e17d91251a3 --- /dev/null +++ b/eval/src/tests/eval/value_cache/bad_lz4.json.lz4 diff --git a/eval/src/tests/eval/value_cache/dense.json.lz4 b/eval/src/tests/eval/value_cache/dense.json.lz4 Binary files differnew file mode 100644 index 00000000000..166d74ed6f6 --- /dev/null +++ b/eval/src/tests/eval/value_cache/dense.json.lz4 diff --git a/eval/src/tests/eval/value_cache/sparse.json.lz4 b/eval/src/tests/eval/value_cache/sparse.json.lz4 Binary files differnew file mode 100644 index 00000000000..0de6fae56e1 --- /dev/null +++ b/eval/src/tests/eval/value_cache/sparse.json.lz4 diff --git a/eval/src/tests/eval/value_cache/tensor_loader_test.cpp b/eval/src/tests/eval/value_cache/tensor_loader_test.cpp index d3ec7aae546..df0728a9d78 100644 --- a/eval/src/tests/eval/value_cache/tensor_loader_test.cpp +++ b/eval/src/tests/eval/value_cache/tensor_loader_test.cpp @@ -62,12 +62,20 @@ TEST_F("require that dense tensors can be loaded", ConstantTensorLoader(SimpleTe TEST_DO(verify_tensor(make_dense_tensor(), f1.create(TEST_PATH("dense.json"), "tensor(x[2],y[2])"))); } -TEST_F("require that sparse tensors can be loaded", ConstantTensorLoader(SimpleTensorEngine::ref())) { - TEST_DO(verify_tensor(make_sparse_tensor(), f1.create(TEST_PATH("sparse.json"), "tensor(x{},y{})"))); -} - TEST_F("require that mixed tensors can be loaded", ConstantTensorLoader(SimpleTensorEngine::ref())) { TEST_DO(verify_tensor(make_mixed_tensor(), f1.create(TEST_PATH("mixed.json"), "tensor(x{},y[2])"))); } +TEST_F("require that lz4 compressed dense tensor can be loaded", ConstantTensorLoader(SimpleTensorEngine::ref())) { + TEST_DO(verify_tensor(make_dense_tensor(), f1.create(TEST_PATH("dense.json.lz4"), "tensor(x[2],y[2])"))); +} + +TEST_F("require that lz4 compressed sparse tensor can be loaded", ConstantTensorLoader(SimpleTensorEngine::ref())) { + TEST_DO(verify_tensor(make_sparse_tensor(), f1.create(TEST_PATH("sparse.json.lz4"), "tensor(x{},y{})"))); +} + +TEST_F("require that bad lz4 file fails to load creating empty result", ConstantTensorLoader(SimpleTensorEngine::ref())) { + TEST_DO(verify_tensor(dense_tensor_nocells(), f1.create(TEST_PATH("bad_lz4.json.lz4"), "tensor(x[2],y[2])"))); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/eval/src/vespa/eval/eval/value_cache/constant_tensor_loader.cpp b/eval/src/vespa/eval/eval/value_cache/constant_tensor_loader.cpp index fc6181acc02..65c36ca0ef2 100644 --- a/eval/src/vespa/eval/eval/value_cache/constant_tensor_loader.cpp +++ b/eval/src/vespa/eval/eval/value_cache/constant_tensor_loader.cpp @@ -10,6 +10,7 @@ #include <vespa/eval/eval/tensor_engine.h> #include <vespa/eval/eval/tensor_spec.h> #include <vespa/vespalib/io/mapped_file_input.h> +#include <vespa/vespalib/data/lz4_input_decoder.h> LOG_SETUP(".vespalib.eval.value_cache.constant_tensor_loader"); @@ -42,6 +43,31 @@ struct AddressExtractor : ObjectTraverser { } }; +void decode_json(const vespalib::string &path, Input &input, Slime &slime) { + if (slime::JsonFormat::decode(input, slime) == 0) { + LOG(warning, "file contains invalid json: %s", path.c_str()); + } +} + +void decode_json(const vespalib::string &path, Slime &slime) { + MappedFileInput file(path); + if (!file.valid()) { + LOG(warning, "could not read file: %s", path.c_str()); + } else { + if (ends_with(path, ".lz4")) { + size_t buffer_size = 64 * 1024; + Lz4InputDecoder lz4_decoder(file, buffer_size); + decode_json(path, lz4_decoder, slime); + if (lz4_decoder.failed()) { + LOG(warning, "file contains lz4 errors (%s): %s", + lz4_decoder.reason().c_str(), path.c_str()); + } + } else { + decode_json(path, file, slime); + } + } +} + } // namespace vespalib::eval::<unnamed> using ErrorConstant = SimpleConstantValue<ErrorValue>; @@ -57,12 +83,7 @@ ConstantTensorLoader::create(const vespalib::string &path, const vespalib::strin return std::make_unique<TensorConstant>(_engine.type_of(*tensor), std::move(tensor)); } Slime slime; - MappedFileInput file(path); - if (!file.valid()) { - LOG(warning, "could not read file: %s", path.c_str()); - } else if (slime::JsonFormat::decode(file.get(), slime) == 0) { - LOG(warning, "file contains invalid json: %s", path.c_str()); - } + decode_json(path, slime); std::set<vespalib::string> indexed; for (const auto &dimension: value_type.dimensions()) { if (dimension.is_indexed()) { diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index b030c4056f8..b775cce68f2 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -4,6 +4,9 @@ vespa_define_module( fastos vespalog + EXTERNAL_DEPENDS + lz4 + APPS src/apps/make_fixture_macros @@ -21,6 +24,7 @@ vespa_define_module( src/tests/component src/tests/compress src/tests/data/input_reader + src/tests/data/lz4_encode_decode src/tests/data/memory_input src/tests/data/output_writer src/tests/data/simple_buffer diff --git a/vespalib/src/tests/data/input_reader/input_reader_test.cpp b/vespalib/src/tests/data/input_reader/input_reader_test.cpp index 54c0613b6da..9ef127be364 100644 --- a/vespalib/src/tests/data/input_reader/input_reader_test.cpp +++ b/vespalib/src/tests/data/input_reader/input_reader_test.cpp @@ -2,32 +2,18 @@ #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/vespalib/data/memory_input.h> #include <vespa/vespalib/data/input_reader.h> +#include <vespa/vespalib/test/chunked_input.h> #include <algorithm> using namespace vespalib; - -// make sure input is split into chunks -struct ChunkedInput : Input { - Input &input; - ChunkedInput(Input &input_in) : input(input_in) {} - Memory obtain() override { - Memory memory = input.obtain(); - memory.size = std::min(memory.size, size_t(3)); - return memory; - } - Input &evict(size_t bytes) override { - EXPECT_LESS_EQUAL(bytes, 3u); - input.evict(bytes); - return *this; - } -}; +using vespalib::test::ChunkedInput; TEST("input reader smoke test") { const char *data = "abc\n" "foo bar\n" "2 + 2 = 4\n"; MemoryInput memory_input(data); - ChunkedInput input(memory_input); + ChunkedInput input(memory_input, 3); { InputReader src(input); EXPECT_EQUAL(src.get_offset(), 0u); @@ -91,7 +77,7 @@ TEST("require that input can be explicitly failed with custom message") { TEST("require that reading a byte sequence crossing the end of input fails") { const char *data = "1234567890"; MemoryInput memory_input(data); - ChunkedInput input(memory_input); + ChunkedInput input(memory_input, 3); { InputReader src(input); EXPECT_EQUAL(src.read(15), Memory()); diff --git a/vespalib/src/tests/data/lz4_encode_decode/CMakeLists.txt b/vespalib/src/tests/data/lz4_encode_decode/CMakeLists.txt new file mode 100644 index 00000000000..98e50a9a1c4 --- /dev/null +++ b/vespalib/src/tests/data/lz4_encode_decode/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_lz4_encode_decode_test_app TEST + SOURCES + lz4_encode_decode_test.cpp + DEPENDS + vespalib +) +vespa_add_test(NAME vespalib_lz4_encode_decode_test_app COMMAND vespalib_lz4_encode_decode_test_app) diff --git a/vespalib/src/tests/data/lz4_encode_decode/lz4_encode_decode_test.cpp b/vespalib/src/tests/data/lz4_encode_decode/lz4_encode_decode_test.cpp new file mode 100644 index 00000000000..8593a44d3f6 --- /dev/null +++ b/vespalib/src/tests/data/lz4_encode_decode/lz4_encode_decode_test.cpp @@ -0,0 +1,47 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/data/lz4_output_encoder.h> +#include <vespa/vespalib/data/lz4_input_decoder.h> +#include <vespa/vespalib/data/simple_buffer.h> +#include <vespa/vespalib/data/memory_input.h> +#include <vespa/vespalib/test/chunked_input.h> + +using namespace vespalib; +using vespalib::test::ChunkedInput; + +void transfer(Input &input, Output &output) { + for (Memory src = input.obtain(); src.size > 0; src = input.obtain()) { + auto dst = output.reserve(src.size); + ASSERT_GREATER_EQUAL(dst.size, src.size); + memcpy(dst.data, src.data, src.size); + output.commit(src.size); + input.evict(src.size); + } +} + +TEST("require that lz4 encode-decode works") { + SimpleBuffer data; + for (size_t i = 0; i < 100; ++i) { + data.add((i % 7) + (i * 5) + (i >> 3)); + } + SimpleBuffer encoded; + { + MemoryInput memory_input(data.get()); + ChunkedInput chunked_input(memory_input, 3); + Lz4OutputEncoder lz4_encoder(encoded, 10); + transfer(chunked_input, lz4_encoder); + } + SimpleBuffer decoded; + { + MemoryInput memory_input(encoded.get()); + ChunkedInput chunked_input(memory_input, 3); + Lz4InputDecoder input_decoder(chunked_input, 10); + transfer(input_decoder, decoded); + EXPECT_TRUE(!input_decoder.failed()); + EXPECT_EQUAL(input_decoder.reason(), vespalib::string()); + } + EXPECT_NOT_EQUAL(data.get(), encoded.get()); + EXPECT_EQUAL(data.get(), decoded.get()); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/data/CMakeLists.txt b/vespalib/src/vespa/vespalib/data/CMakeLists.txt index 9d574953016..fd7afd71101 100644 --- a/vespalib/src/vespa/vespalib/data/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/data/CMakeLists.txt @@ -3,6 +3,8 @@ vespa_add_library(vespalib_vespalib_data OBJECT SOURCES input.cpp input_reader.cpp + lz4_input_decoder.cpp + lz4_output_encoder.cpp memory.cpp memory_input.cpp memorydatastore.cpp diff --git a/vespalib/src/vespa/vespalib/data/lz4_input_decoder.cpp b/vespalib/src/vespa/vespalib/data/lz4_input_decoder.cpp new file mode 100644 index 00000000000..c7f0fce664f --- /dev/null +++ b/vespalib/src/vespa/vespalib/data/lz4_input_decoder.cpp @@ -0,0 +1,87 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include "lz4_input_decoder.h" +#include <lz4frame.h> + +namespace vespalib { + +void +Lz4InputDecoder::fail(const char *reason) +{ + _failed = true; + _reason = reason; + _eof = true; +} + +void +Lz4InputDecoder::decode_more() +{ + assert((_pos == _used) && !_eof); + Memory memory = _input.obtain(); + if (memory.size == 0) { + auto res = LZ4F_freeDecompressionContext(_ctx); + _ctx = nullptr; + _eof = true; + if (LZ4F_isError(res)) { + fail(LZ4F_getErrorName(res)); + } + } else { + size_t input_size = memory.size; + size_t output_size = _buffer.size(); + auto res = LZ4F_decompress(_ctx, + &_buffer[0], &output_size, + memory.data, &input_size, + nullptr); + if (LZ4F_isError(res)) { + fail(LZ4F_getErrorName(res)); + } else if (input_size == 0) { + fail("lz4 refusing to eat input"); + } else { + assert(input_size <= memory.size); + assert(output_size <= _buffer.size()); + _input.evict(input_size); + _used = output_size; + _pos = 0; + } + } +} + +Lz4InputDecoder::Lz4InputDecoder(Input &input, size_t buffer_size) + : _input(input), + _buffer(buffer_size, 0), + _used(0), + _pos(0), + _eof(false), + _failed(false), + _reason(), + _ctx(nullptr) +{ + auto res = LZ4F_createDecompressionContext(&_ctx, LZ4F_VERSION); + if (LZ4F_isError(res)) { + fail(LZ4F_getErrorName(res)); + } +} + +Lz4InputDecoder::~Lz4InputDecoder() +{ + LZ4F_freeDecompressionContext(_ctx); +} + +Memory +Lz4InputDecoder::obtain() +{ + while ((_pos == _used) && !_eof) { + decode_more(); + } + return Memory(&_buffer[_pos], (_used - _pos)); +} + +Input & +Lz4InputDecoder::evict(size_t bytes) +{ + _pos += bytes; + return *this; +} + +} // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/lz4_input_decoder.h b/vespalib/src/vespa/vespalib/data/lz4_input_decoder.h new file mode 100644 index 00000000000..f711aef30b5 --- /dev/null +++ b/vespalib/src/vespa/vespalib/data/lz4_input_decoder.h @@ -0,0 +1,37 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "input.h" + +struct LZ4F_dctx_s; + +namespace vespalib { + +/** + * Input filter decompressing data stored in framed lz4 format. + **/ +class Lz4InputDecoder : public Input +{ +private: + Input &_input; + std::vector<char> _buffer; + size_t _used; + size_t _pos; + bool _eof; + bool _failed; + vespalib::string _reason; + LZ4F_dctx_s *_ctx; + + void fail(const char *reason); + void decode_more(); +public: + Lz4InputDecoder(Input &input, size_t buffer_size); + ~Lz4InputDecoder(); + Memory obtain() override; + Input &evict(size_t bytes) override; + bool failed() const { return _failed; } + const vespalib::string &reason() const { return _reason; } +}; + +} // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/lz4_output_encoder.cpp b/vespalib/src/vespa/vespalib/data/lz4_output_encoder.cpp new file mode 100644 index 00000000000..60f8546bde1 --- /dev/null +++ b/vespalib/src/vespa/vespalib/data/lz4_output_encoder.cpp @@ -0,0 +1,54 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include "lz4_output_encoder.h" +#include <lz4frame.h> + +namespace vespalib { + +void +Lz4OutputEncoder::encode_frame() +{ + auto dst = _output.reserve(LZ4F_compressFrameBound(_used, nullptr)); + size_t written = LZ4F_compressFrame(dst.data, dst.size, &_buffer[0], _used, nullptr); + assert(!LZ4F_isError(written)); + assert(written <= dst.size); + _output.commit(written); + _used = 0; +} + +Lz4OutputEncoder::Lz4OutputEncoder(Output &output, size_t buffer_size) + : _output(output), + _buffer(buffer_size, 0), + _used(0), + _limit(buffer_size) +{ +} + +Lz4OutputEncoder::~Lz4OutputEncoder() +{ + if (_used > 0) { + encode_frame(); + } +} + +WritableMemory +Lz4OutputEncoder::reserve(size_t bytes) +{ + if ((_used + bytes) > _buffer.size()) { + _buffer.resize(_used + bytes, 0); + } + return WritableMemory(&_buffer[_used], (_buffer.size() - _used)); +} + +Output & +Lz4OutputEncoder::commit(size_t bytes) +{ + _used += bytes; + if (_used >= _limit) { + encode_frame(); + } + return *this; +} + +} // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/lz4_output_encoder.h b/vespalib/src/vespa/vespalib/data/lz4_output_encoder.h new file mode 100644 index 00000000000..6936a9a0169 --- /dev/null +++ b/vespalib/src/vespa/vespalib/data/lz4_output_encoder.h @@ -0,0 +1,30 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "output.h" + +namespace vespalib { + +/** + * Output filter compressing data into framed lz4 format. This class + * will use the simple LZ4 compression API to encode complete frames + * at a time, trading performance for simplicity. + **/ +class Lz4OutputEncoder : public Output +{ +private: + Output &_output; + std::vector<char> _buffer; + size_t _used; + size_t _limit; + + void encode_frame(); +public: + Lz4OutputEncoder(Output &output, size_t buffer_size); + ~Lz4OutputEncoder(); + WritableMemory reserve(size_t bytes) override; + Output &commit(size_t bytes) override; +}; + +} // namespace vespalib diff --git a/vespalib/src/vespa/vespalib/data/simple_buffer.cpp b/vespalib/src/vespa/vespalib/data/simple_buffer.cpp index 4fe71df1435..a9bf80ac845 100644 --- a/vespalib/src/vespa/vespalib/data/simple_buffer.cpp +++ b/vespalib/src/vespa/vespalib/data/simple_buffer.cpp @@ -29,6 +29,7 @@ SimpleBuffer::evict(size_t bytes) WritableMemory SimpleBuffer::reserve(size_t bytes) { + assert((_used + bytes) >= _used); _data.resize(_used + bytes, char(0x55)); return WritableMemory(&_data[_used], bytes); } diff --git a/vespalib/src/vespa/vespalib/data/slime/json_format.cpp b/vespalib/src/vespa/vespalib/data/slime/json_format.cpp index e8ebcfb9e10..30ac5d98042 100644 --- a/vespalib/src/vespa/vespalib/data/slime/json_format.cpp +++ b/vespalib/src/vespa/vespalib/data/slime/json_format.cpp @@ -178,7 +178,7 @@ struct JsonDecoder { vespalib::string key; vespalib::string value; - JsonDecoder(InputReader &input) : in(input), c(in.read()), key(), value() {} + JsonDecoder(InputReader &reader) : in(reader), c(in.read()), key(), value() {} void next() { if (in.obtain() > 0) { @@ -484,18 +484,24 @@ JsonFormat::encode(const Slime &slime, Output &output, bool compact) } size_t -JsonFormat::decode(const Memory &memory, Slime &slime) +JsonFormat::decode(Input &input, Slime &slime) { - MemoryInput memory_input(memory); - InputReader input(memory_input); - JsonDecoder decoder(input); + InputReader reader(input); + JsonDecoder decoder(reader); decoder.decodeValue(slime); - if (input.failed()) { + if (reader.failed()) { slime.wrap("partial_result"); - slime.get().setLong("offending_offset", input.get_offset()); - slime.get().setString("error_message", input.get_error_message()); + slime.get().setLong("offending_offset", reader.get_offset()); + slime.get().setString("error_message", reader.get_error_message()); } - return input.failed() ? 0 : input.get_offset(); + return reader.failed() ? 0 : reader.get_offset(); +} + +size_t +JsonFormat::decode(const Memory &memory, Slime &slime) +{ + MemoryInput input(memory); + return decode(input, slime); } } // namespace vespalib::slime diff --git a/vespalib/src/vespa/vespalib/data/slime/json_format.h b/vespalib/src/vespa/vespalib/data/slime/json_format.h index e334cd55546..3d998e5bcd4 100644 --- a/vespalib/src/vespa/vespalib/data/slime/json_format.h +++ b/vespalib/src/vespa/vespalib/data/slime/json_format.h @@ -18,6 +18,7 @@ class Inspector; struct JsonFormat { static void encode(const Inspector &inspector, Output &output, bool compact); static void encode(const Slime &slime, Output &output, bool compact); + static size_t decode(Input &input, Slime &slime); static size_t decode(const Memory &memory, Slime &slime); }; diff --git a/vespalib/src/vespa/vespalib/test/chunked_input.h b/vespalib/src/vespa/vespalib/test/chunked_input.h new file mode 100644 index 00000000000..d73adb8782f --- /dev/null +++ b/vespalib/src/vespa/vespalib/test/chunked_input.h @@ -0,0 +1,31 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vespalib/data/input.h> + +namespace vespalib { +namespace test { + +/** + * Input filter making sure the input is split into chunks no larger + * than the maximum chunk size given to the constuctor. + **/ +struct ChunkedInput : Input { + Input &input; + size_t max_chunk_size; + ChunkedInput(Input &input_in, size_t max_chunk_size_in) + : input(input_in), max_chunk_size(max_chunk_size_in) {} + Memory obtain() override { + Memory memory = input.obtain(); + memory.size = std::min(memory.size, max_chunk_size); + return memory; + } + Input &evict(size_t bytes) override { + EXPECT_LESS_EQUAL(bytes, max_chunk_size); + input.evict(bytes); + return *this; + } +}; + +} // namespace test +} // namespace vespalib |