From 0b8175ae9aa1a525c867787bad989a2d0355d2fe Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 14 Feb 2024 10:54:47 +0100 Subject: Add search::DataBufferWriter. Add search::predicate::DocumentFeatureStoreSaver. Use search::BufferWriter API when saving document feature store. --- .../predicate/document_features_store_test.cpp | 20 ++- .../src/vespa/searchlib/predicate/CMakeLists.txt | 1 + .../predicate/document_features_store.cpp | 109 +--------------- .../searchlib/predicate/document_features_store.h | 5 +- .../predicate/document_features_store_saver.cpp | 138 +++++++++++++++++++++ .../predicate/document_features_store_saver.h | 29 +++++ .../vespa/searchlib/predicate/predicate_index.cpp | 9 +- searchlib/src/vespa/searchlib/util/CMakeLists.txt | 1 + .../vespa/searchlib/util/data_buffer_writer.cpp | 33 +++++ .../src/vespa/searchlib/util/data_buffer_writer.h | 24 ++++ 10 files changed, 257 insertions(+), 112 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp create mode 100644 searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h create mode 100644 searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp create mode 100644 searchlib/src/vespa/searchlib/util/data_buffer_writer.h diff --git a/searchlib/src/tests/predicate/document_features_store_test.cpp b/searchlib/src/tests/predicate/document_features_store_test.cpp index fd30041deec..11ca20349c3 100644 --- a/searchlib/src/tests/predicate/document_features_store_test.cpp +++ b/searchlib/src/tests/predicate/document_features_store_test.cpp @@ -1,16 +1,18 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. // Unit tests for document_features_store. 
-#include -LOG_SETUP("document_features_store_test"); - #include +#include +#include #include #include #include #include #include +#include +LOG_SETUP("document_features_store_test"); + using namespace search; using namespace search::predicate; using std::string; @@ -21,6 +23,14 @@ const uint64_t hash1 = 0x12345678; const uint64_t hash2 = 0x123456789a; const uint32_t doc_id = 42; +void +save_document_features_store(DocumentFeaturesStore& store, vespalib::DataBuffer& buffer) +{ + DataBufferWriter writer(buffer); + store.make_saver()->save(writer); + writer.flush(); +} + TEST("require that DocumentFeaturesStore can store features.") { DocumentFeaturesStore features_store(10); PredicateTreeAnnotations annotations; @@ -191,7 +201,7 @@ TEST("require that DocumentFeaturesStore can be serialized") { expectHash("foo=100-199", features); vespalib::DataBuffer buffer; - features_store.serialize(buffer); + save_document_features_store(features_store, buffer); DocumentFeaturesStore features_store2(buffer); features = features_store2.get(doc_id); @@ -213,7 +223,7 @@ TEST("require that serialization cleans up wordstore") { EXPECT_EQUAL(562524u, features_store.getMemoryUsage().usedBytes()); vespalib::DataBuffer buffer; - features_store.serialize(buffer); + save_document_features_store(features_store, buffer); DocumentFeaturesStore features_store2(buffer); EXPECT_EQUAL(562464u, features_store2.getMemoryUsage().usedBytes()); } diff --git a/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt b/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt index 9c28a3cdddb..acb0391143d 100644 --- a/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_predicate OBJECT SOURCES document_features_store.cpp + document_features_store_saver.cpp predicate_index.cpp predicate_interval.cpp predicate_interval_store.cpp diff --git 
a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp index 6653dc5733b..a4e415501b7 100644 --- a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "document_features_store.h" +#include "document_features_store_saver.h" #include "predicate_range_expander.h" #include #include @@ -273,112 +274,10 @@ DocumentFeaturesStore::getMemoryUsage() const { return usage; } -namespace { - -template -void -findUsedWords(const RefsVector& refs, const RangesStore& ranges, - unordered_map &word_map, - vector &word_list) -{ - for (auto& cur_refs : refs) { - auto ranges_ref = cur_refs._ranges; - if (ranges_ref.valid()) { - auto range_vector = ranges.get(ranges_ref); - for (const auto& range : range_vector) { - if (!word_map.count(range.label_ref.ref())) { - word_map[range.label_ref.ref()] = word_list.size(); - word_list.push_back(range.label_ref); - } - } - } - } -} - -void -serializeWords(DataBuffer &buffer, const vector &word_list, - const memoryindex::WordStore &word_store) -{ - buffer.writeInt32(word_list.size()); - for (const auto &word_ref : word_list) { - const char *word = word_store.getWord(word_ref); - uint32_t len = strlen(word); - buffer.writeInt32(len); - buffer.writeBytes(word, len); - } -} - -template -void -serialize_ranges(DataBuffer &buffer, const RefsVector& refs, const RangesStore &ranges, - unordered_map &word_map) -{ - uint32_t ranges_size = 0; - if (!refs.empty()) { - assert(!refs.front()._ranges.valid()); - for (auto& cur_refs : refs) { - if (cur_refs._ranges.valid()) { - ++ranges_size; - } - } - } - buffer.writeInt32(ranges_size); - for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) { - auto ranges_ref = refs[doc_id]._ranges; - if 
(ranges_ref.valid()) { - buffer.writeInt32(doc_id); - auto range_vector = ranges.get(ranges_ref); - buffer.writeInt32(range_vector.size()); - for (const auto &range : range_vector) { - buffer.writeInt32(word_map[range.label_ref.ref()]); - buffer.writeInt64(range.from); - buffer.writeInt64(range.to); - } - } - } -} - -template -void -serialize_features(DataBuffer &buffer, const RefsVector& refs, const FeaturesStore& features) +std::unique_ptr<DocumentFeaturesStoreSaver> +DocumentFeaturesStore::make_saver() const { - uint32_t features_size = 0; - if (!refs.empty()) { - assert(!refs.front()._features.valid()); - for (auto& cur_refs : refs) { - if (cur_refs._features.valid()) { - ++features_size; - } - } - } - buffer.writeInt32(features_size); - for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) { - auto features_ref = refs[doc_id]._features; - if (features_ref.valid()) { - buffer.writeInt32(doc_id); - auto feature_vector = features.get(features_ref); - buffer.writeInt32(feature_vector.size()); - for (const auto &feature : feature_vector) { - buffer.writeInt64(feature); - } - } - } -} - -} // namespace - -void -DocumentFeaturesStore::serialize(DataBuffer &buffer) const -{ - vector word_list; - unordered_map word_map; - - findUsedWords(_refs, _ranges, word_map, word_list); - - buffer.writeInt16(_arity); - serializeWords(buffer, word_list, _word_store); - serialize_ranges(buffer, _refs, _ranges, word_map); - serialize_features(buffer, _refs, _features); + return std::make_unique<DocumentFeaturesStoreSaver>(*this); } } diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.h b/searchlib/src/vespa/searchlib/predicate/document_features_store.h index bc4d58b1c04..d12e703957a 100644 --- a/searchlib/src/vespa/searchlib/predicate/document_features_store.h +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.h @@ -14,6 +14,8 @@ namespace search::predicate { +class DocumentFeaturesStoreSaver; + /** * Class used to track the {featureId, docId} pairs that are inserted * into the btree memory
index dictionary. These pairs are later used @@ -21,6 +23,7 @@ namespace search::predicate { * lists of the dictionary. */ class DocumentFeaturesStore { + friend class DocumentFeaturesStoreSaver; using WordStore = memoryindex::WordStore; struct Range { vespalib::datastore::EntryRef label_ref; @@ -92,7 +95,7 @@ public: void assign_generation(generation_t current_gen); vespalib::MemoryUsage getMemoryUsage() const; - void serialize(vespalib::DataBuffer &buffer) const; + std::unique_ptr<DocumentFeaturesStoreSaver> make_saver() const; }; } diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp new file mode 100644 index 00000000000..08a3592da14 --- /dev/null +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp @@ -0,0 +1,138 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "document_features_store_saver.h" +#include +#include + +using vespalib::datastore::EntryRef; +using search::BufferWriter; + +namespace search::predicate { + +DocumentFeaturesStoreSaver::DocumentFeaturesStoreSaver(const DocumentFeaturesStore& store) + : _refs(store._refs), + _features(store._features), + _ranges(store._ranges), + _word_store(store._word_store), + _arity(store._arity) +{ +} + +DocumentFeaturesStoreSaver::~DocumentFeaturesStoreSaver() = default; + +namespace { + +template <typename T> +void nbo_write(BufferWriter& writer, T value) +{ + auto value_nbo = vespalib::nbo::n2h(value); + writer.write(&value_nbo, sizeof(value_nbo)); +} + +template <typename RefsVector, typename RangesStore> +void +find_used_words(const RefsVector& refs, const RangesStore& ranges, + std::unordered_map<uint32_t, uint32_t>& word_map, + std::vector<EntryRef>& word_list) +{ + for (auto& cur_refs : refs) { + auto ranges_ref = cur_refs._ranges; + if (ranges_ref.valid()) { + auto range_vector = ranges.get(ranges_ref); + for (const auto& range : range_vector) { + if (!word_map.count(range.label_ref.ref())) { +
word_map[range.label_ref.ref()] = word_list.size(); + word_list.push_back(range.label_ref); + } + } + } + } +} + +void +serialize_words(BufferWriter& writer, const std::vector<EntryRef>& word_list, + const memoryindex::WordStore& word_store) +{ + nbo_write<uint32_t>(writer, word_list.size());; + for (const auto &word_ref : word_list) { + const char *word = word_store.getWord(word_ref); + uint32_t len = strlen(word); + nbo_write<uint32_t>(writer, len); + writer.write(word, len); + } +} + +template <typename RefsVector, typename RangesStore> +void +serialize_ranges(BufferWriter& writer, const RefsVector& refs, const RangesStore& ranges, + std::unordered_map<uint32_t, uint32_t>& word_map) +{ + uint32_t ranges_size = 0; + if (!refs.empty()) { + assert(!refs.front()._ranges.valid()); + for (auto& cur_refs : refs) { + if (cur_refs._ranges.valid()) { + ++ranges_size; + } + } + } + nbo_write<uint32_t>(writer, ranges_size); + for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) { + auto ranges_ref = refs[doc_id]._ranges; + if (ranges_ref.valid()) { + nbo_write<uint32_t>(writer, doc_id); + auto range_vector = ranges.get(ranges_ref); + nbo_write<uint32_t>(writer, range_vector.size()); + for (const auto &range : range_vector) { + nbo_write<uint32_t>(writer, word_map[range.label_ref.ref()]); + nbo_write<uint64_t>(writer, range.from); + nbo_write<uint64_t>(writer, range.to); + } + } + } +} + +template <typename RefsVector, typename FeaturesStore> +void +serialize_features(BufferWriter& writer, const RefsVector& refs, const FeaturesStore& features) +{ + uint32_t features_size = 0; + if (!refs.empty()) { + assert(!refs.front()._features.valid()); + for (auto& cur_refs : refs) { + if (cur_refs._features.valid()) { + ++features_size; + } + } + } + nbo_write<uint32_t>(writer, features_size); + for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) { + auto features_ref = refs[doc_id]._features; + if (features_ref.valid()) { + nbo_write<uint32_t>(writer, doc_id); + auto feature_vector = features.get(features_ref); + nbo_write<uint32_t>(writer, feature_vector.size()); + for (const auto &feature : feature_vector) { + nbo_write<uint64_t>(writer, feature); + } + } + } +} + +} // namespace + +void 
+DocumentFeaturesStoreSaver::save(BufferWriter& writer) const +{ + std::vector<EntryRef> word_list; + std::unordered_map<uint32_t, uint32_t> word_map; + + find_used_words(_refs, _ranges, word_map, word_list); + + nbo_write<uint16_t>(writer, _arity); + serialize_words(writer, word_list, _word_store); + serialize_ranges(writer, _refs, _ranges, word_map); + serialize_features(writer, _refs, _features); +} + +} diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h new file mode 100644 index 00000000000..373349dae0c --- /dev/null +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h @@ -0,0 +1,29 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "document_features_store.h" + +namespace search { class BufferWriter; } + +namespace search::predicate { + +class DocumentFeaturesStoreSaver { + using RefsVector = DocumentFeaturesStore::RefsVector; + using FeaturesStore = DocumentFeaturesStore::FeaturesStore; + using RangesStore = DocumentFeaturesStore::RangesStore; + using WordStore = DocumentFeaturesStore::WordStore; + + const RefsVector& _refs; // TODO: Use copy when saving in flush thread + const FeaturesStore& _features; + const RangesStore& _ranges; + const WordStore& _word_store; + const uint32_t _arity; + +public: + DocumentFeaturesStoreSaver(const DocumentFeaturesStore& store); + ~DocumentFeaturesStoreSaver(); + void save(BufferWriter& writer) const; +}; + +} diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp index cf7d7dafa05..324e3fd99e8 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp @@ -1,7 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "predicate_index.h" +#include "document_features_store_saver.h" #include "predicate_hash.h" +#include #include #include #include @@ -127,7 +129,12 @@ PredicateIndex::~PredicateIndex() = default; void PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const { - _features_store.serialize(buffer); + { + auto saver = _features_store.make_saver(); + DataBufferWriter writer(buffer); + saver->save(writer); + writer.flush(); + } stats._features_len = buffer.getDataLen(); auto old_len = buffer.getDataLen(); buffer.writeInt16(_arity); diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt index e9661b5e919..a90ca1191dd 100644 --- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt @@ -4,6 +4,7 @@ vespa_add_library(searchlib_util OBJECT bufferwriter.cpp comprbuffer.cpp comprfile.cpp + data_buffer_writer.cpp dirtraverse.cpp drainingbufferwriter.cpp file_with_header.cpp diff --git a/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp b/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp new file mode 100644 index 00000000000..fb5061f394e --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp @@ -0,0 +1,33 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "data_buffer_writer.h" +#include + +namespace search { + +namespace { + +constexpr size_t buffer_size = 4_Ki; + +} + +DataBufferWriter::DataBufferWriter(vespalib::DataBuffer& data_buffer) + : _data_buffer(data_buffer) +{ + _data_buffer.ensureFree(buffer_size); + setup(_data_buffer.getFree(), _data_buffer.getFreeLen()); +} + +DataBufferWriter::~DataBufferWriter() = default; + +void +DataBufferWriter::flush() +{ + if (usedLen() > 0) { + _data_buffer.moveFreeToData(usedLen()); + _data_buffer.ensureFree(buffer_size); + setup(_data_buffer.getFree(), _data_buffer.getFreeLen()); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/util/data_buffer_writer.h b/searchlib/src/vespa/searchlib/util/data_buffer_writer.h new file mode 100644 index 00000000000..0d16bdfc4f4 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/data_buffer_writer.h @@ -0,0 +1,24 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "bufferwriter.h" + +namespace vespalib { class DataBuffer; } + +namespace search { + +/** + * Class to write to a data buffer, used during migration of + * attribute vector saver implementation. + */ +class DataBufferWriter : public BufferWriter +{ + vespalib::DataBuffer& _data_buffer; +public: + DataBufferWriter(vespalib::DataBuffer& data_buffer); + ~DataBufferWriter(); + void flush() override; +}; + +} -- cgit v1.2.3 From 67dc593eab77cc8d42d48ce5c27b360ae777eda7 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 14 Feb 2024 14:32:57 +0100 Subject: Add class comment for search::predicate::DocumentFeaturesStoreSaver. 
--- .../src/vespa/searchlib/predicate/document_features_store_saver.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h index 373349dae0c..630ac3900f5 100644 --- a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h @@ -8,6 +8,10 @@ namespace search { class BufferWriter; } namespace search::predicate { +/* + * Class used to save a DocumentFeaturesStore instance, streaming the + * serialized data via a BufferWriter. + */ class DocumentFeaturesStoreSaver { using RefsVector = DocumentFeaturesStore::RefsVector; using FeaturesStore = DocumentFeaturesStore::FeaturesStore; -- cgit v1.2.3