aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Geir Storli <geirst@yahooinc.com> 2024-02-14 17:04:36 +0100
committer GitHub <noreply@github.com> 2024-02-14 17:04:36 +0100
commit 5c5a0a43b2eee22b621ea17dfff39691abc28be5 (patch)
tree 7955fb4711b5e6886c5a47fcae98217299c3bf56
parent 88720570660b294c5b0e433f1a55cf2fcf6e4a5f (diff)
parent 67dc593eab77cc8d42d48ce5c27b360ae777eda7 (diff)
Merge pull request #30266 from vespa-engine/toregge/add-data-buffer-buffer-writer v8.305.17
Add search::DataBufferWriter.
-rw-r--r-- searchlib/src/tests/predicate/document_features_store_test.cpp 20
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/document_features_store.cpp 109
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/document_features_store.h 5
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp 138
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h 33
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/predicate_index.cpp 9
-rw-r--r-- searchlib/src/vespa/searchlib/util/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp 33
-rw-r--r-- searchlib/src/vespa/searchlib/util/data_buffer_writer.h 24
10 files changed, 261 insertions, 112 deletions
diff --git a/searchlib/src/tests/predicate/document_features_store_test.cpp b/searchlib/src/tests/predicate/document_features_store_test.cpp
index fd30041deec..11ca20349c3 100644
--- a/searchlib/src/tests/predicate/document_features_store_test.cpp
+++ b/searchlib/src/tests/predicate/document_features_store_test.cpp
@@ -1,16 +1,18 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
// Unit tests for document_features_store.
-#include <vespa/log/log.h>
-LOG_SETUP("document_features_store_test");
-
#include <vespa/searchlib/predicate/document_features_store.h>
+#include <vespa/searchlib/util/data_buffer_writer.h>
+#include <vespa/searchlib/predicate/document_features_store_saver.h>
#include <vespa/searchlib/predicate/predicate_index.h>
#include <vespa/searchlib/predicate/predicate_tree_annotator.h>
#include <vespa/searchlib/predicate/predicate_hash.h>
#include <vespa/vespalib/testkit/testapp.h>
#include <string>
+#include <vespa/log/log.h>
+LOG_SETUP("document_features_store_test");
+
using namespace search;
using namespace search::predicate;
using std::string;
@@ -21,6 +23,14 @@ const uint64_t hash1 = 0x12345678;
const uint64_t hash2 = 0x123456789a;
const uint32_t doc_id = 42;
+// Test helper: serializes `store` into `buffer` by running the store's saver
+// against a DataBufferWriter, then flushing the writer so that the written
+// bytes are handed over to `buffer` before the caller reads them back.
+void
+save_document_features_store(DocumentFeaturesStore& store, vespalib::DataBuffer& buffer)
+{
+    DataBufferWriter buffer_writer(buffer);
+    auto saver = store.make_saver();
+    saver->save(buffer_writer);
+    buffer_writer.flush();
+}
+
TEST("require that DocumentFeaturesStore can store features.") {
DocumentFeaturesStore features_store(10);
PredicateTreeAnnotations annotations;
@@ -191,7 +201,7 @@ TEST("require that DocumentFeaturesStore can be serialized") {
expectHash("foo=100-199", features);
vespalib::DataBuffer buffer;
- features_store.serialize(buffer);
+ save_document_features_store(features_store, buffer);
DocumentFeaturesStore features_store2(buffer);
features = features_store2.get(doc_id);
@@ -213,7 +223,7 @@ TEST("require that serialization cleans up wordstore") {
EXPECT_EQUAL(562524u, features_store.getMemoryUsage().usedBytes());
vespalib::DataBuffer buffer;
- features_store.serialize(buffer);
+ save_document_features_store(features_store, buffer);
DocumentFeaturesStore features_store2(buffer);
EXPECT_EQUAL(562464u, features_store2.getMemoryUsage().usedBytes());
}
diff --git a/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt b/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt
index 9c28a3cdddb..acb0391143d 100644
--- a/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/predicate/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(searchlib_predicate OBJECT
SOURCES
document_features_store.cpp
+ document_features_store_saver.cpp
predicate_index.cpp
predicate_interval.cpp
predicate_interval_store.cpp
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
index 6653dc5733b..a4e415501b7 100644
--- a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
@@ -1,6 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "document_features_store.h"
+#include "document_features_store_saver.h"
#include "predicate_range_expander.h"
#include <vespa/vespalib/btree/btree.hpp>
#include <vespa/vespalib/btree/btreeroot.hpp>
@@ -273,112 +274,10 @@ DocumentFeaturesStore::getMemoryUsage() const {
return usage;
}
-namespace {
-
-template <typename RefsVector, typename RangesStore>
-void
-findUsedWords(const RefsVector& refs, const RangesStore& ranges,
- unordered_map<uint32_t, uint32_t> &word_map,
- vector<EntryRef> &word_list)
-{
- for (auto& cur_refs : refs) {
- auto ranges_ref = cur_refs._ranges;
- if (ranges_ref.valid()) {
- auto range_vector = ranges.get(ranges_ref);
- for (const auto& range : range_vector) {
- if (!word_map.count(range.label_ref.ref())) {
- word_map[range.label_ref.ref()] = word_list.size();
- word_list.push_back(range.label_ref);
- }
- }
- }
- }
-}
-
-void
-serializeWords(DataBuffer &buffer, const vector<EntryRef> &word_list,
- const memoryindex::WordStore &word_store)
-{
- buffer.writeInt32(word_list.size());
- for (const auto &word_ref : word_list) {
- const char *word = word_store.getWord(word_ref);
- uint32_t len = strlen(word);
- buffer.writeInt32(len);
- buffer.writeBytes(word, len);
- }
-}
-
-template <typename RefsVector, typename RangesStore>
-void
-serialize_ranges(DataBuffer &buffer, const RefsVector& refs, const RangesStore &ranges,
- unordered_map<uint32_t, uint32_t> &word_map)
-{
- uint32_t ranges_size = 0;
- if (!refs.empty()) {
- assert(!refs.front()._ranges.valid());
- for (auto& cur_refs : refs) {
- if (cur_refs._ranges.valid()) {
- ++ranges_size;
- }
- }
- }
- buffer.writeInt32(ranges_size);
- for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) {
- auto ranges_ref = refs[doc_id]._ranges;
- if (ranges_ref.valid()) {
- buffer.writeInt32(doc_id);
- auto range_vector = ranges.get(ranges_ref);
- buffer.writeInt32(range_vector.size());
- for (const auto &range : range_vector) {
- buffer.writeInt32(word_map[range.label_ref.ref()]);
- buffer.writeInt64(range.from);
- buffer.writeInt64(range.to);
- }
- }
- }
-}
-
-template <typename RefsVector, typename FeaturesStore>
-void
-serialize_features(DataBuffer &buffer, const RefsVector& refs, const FeaturesStore& features)
+std::unique_ptr<DocumentFeaturesStoreSaver>
+DocumentFeaturesStore::make_saver() const
{
- uint32_t features_size = 0;
- if (!refs.empty()) {
- assert(!refs.front()._features.valid());
- for (auto& cur_refs : refs) {
- if (cur_refs._features.valid()) {
- ++features_size;
- }
- }
- }
- buffer.writeInt32(features_size);
- for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) {
- auto features_ref = refs[doc_id]._features;
- if (features_ref.valid()) {
- buffer.writeInt32(doc_id);
- auto feature_vector = features.get(features_ref);
- buffer.writeInt32(feature_vector.size());
- for (const auto &feature : feature_vector) {
- buffer.writeInt64(feature);
- }
- }
- }
-}
-
-} // namespace
-
-void
-DocumentFeaturesStore::serialize(DataBuffer &buffer) const
-{
- vector<EntryRef> word_list;
- unordered_map<uint32_t, uint32_t> word_map;
-
- findUsedWords(_refs, _ranges, word_map, word_list);
-
- buffer.writeInt16(_arity);
- serializeWords(buffer, word_list, _word_store);
- serialize_ranges(buffer, _refs, _ranges, word_map);
- serialize_features(buffer, _refs, _features);
+ return std::make_unique<DocumentFeaturesStoreSaver>(*this);
}
}
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.h b/searchlib/src/vespa/searchlib/predicate/document_features_store.h
index bc4d58b1c04..d12e703957a 100644
--- a/searchlib/src/vespa/searchlib/predicate/document_features_store.h
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.h
@@ -14,6 +14,8 @@
namespace search::predicate {
+class DocumentFeaturesStoreSaver;
+
/**
* Class used to track the {featureId, docId} pairs that are inserted
* into the btree memory index dictionary. These pairs are later used
@@ -21,6 +23,7 @@ namespace search::predicate {
* lists of the dictionary.
*/
class DocumentFeaturesStore {
+ friend class DocumentFeaturesStoreSaver;
using WordStore = memoryindex::WordStore;
struct Range {
vespalib::datastore::EntryRef label_ref;
@@ -92,7 +95,7 @@ public:
void assign_generation(generation_t current_gen);
vespalib::MemoryUsage getMemoryUsage() const;
- void serialize(vespalib::DataBuffer &buffer) const;
+ std::unique_ptr<DocumentFeaturesStoreSaver> make_saver() const;
};
}
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp
new file mode 100644
index 00000000000..08a3592da14
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.cpp
@@ -0,0 +1,138 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "document_features_store_saver.h"
+#include <vespa/searchlib/util/bufferwriter.h>
+#include <vespa/vespalib/objects/nbo.h>
+
+using vespalib::datastore::EntryRef;
+using search::BufferWriter;
+
+namespace search::predicate {
+
+// Binds the saver to the internals of `store`. The refs vector, features
+// store, ranges store and word store are held as references (see the member
+// declarations in document_features_store_saver.h), so `store` must stay
+// alive for as long as this saver is used. Only the arity is copied.
+DocumentFeaturesStoreSaver::DocumentFeaturesStoreSaver(const DocumentFeaturesStore& store)
+ : _refs(store._refs),
+ _features(store._features),
+ _ranges(store._ranges),
+ _word_store(store._word_store),
+ _arity(store._arity)
+{
+}
+
+// Nothing to release: the saver only holds references and a plain integer.
+DocumentFeaturesStoreSaver::~DocumentFeaturesStoreSaver() = default;
+
+namespace {
+
+// Writes `value` to `writer` after passing it through vespalib::nbo::n2h().
+// NOTE(review): n2h is applied in the host-to-network direction here;
+// presumably the conversion is a symmetric byte swap - confirm against nbo.h.
+// Pass T explicitly (e.g. nbo_write<uint32_t>) when the argument's own type
+// is wider than the on-disk field.
+template <typename T>
+void nbo_write(BufferWriter& writer, T value)
+{
+    const auto converted = vespalib::nbo::n2h(value);
+    writer.write(&converted, sizeof(converted));
+}
+
+// Collects every distinct word label referenced by a valid range vector.
+//
+// refs      - per-document entries; only those with a valid `_ranges` ref are inspected.
+// ranges    - store used to resolve a `_ranges` ref into its vector of ranges.
+// word_map  - out: maps a label's raw ref value to its position in `word_list`.
+// word_list - out: the distinct label refs, in order of first appearance.
+template <typename RefsVector, typename RangesStore>
+void
+find_used_words(const RefsVector& refs, const RangesStore& ranges,
+                std::unordered_map<uint32_t, uint32_t>& word_map,
+                std::vector<EntryRef>& word_list)
+{
+    for (auto& doc_refs : refs) {
+        auto ranges_ref = doc_refs._ranges;
+        if (!ranges_ref.valid()) {
+            continue;
+        }
+        auto doc_ranges = ranges.get(ranges_ref);
+        for (const auto& range : doc_ranges) {
+            // emplace() only inserts when the label has not been seen before,
+            // so the index assigned is the label's first-seen position.
+            auto inserted = word_map.emplace(range.label_ref.ref(),
+                                             static_cast<uint32_t>(word_list.size()));
+            if (inserted.second) {
+                word_list.push_back(range.label_ref);
+            }
+        }
+    }
+}
+
+// Writes the word table: a 32-bit word count followed, for each word in
+// `word_list` order, by a 32-bit length and the word's bytes (no terminator).
+// All integers go through nbo_write().
+//
+// writer     - destination for the serialized bytes.
+// word_list  - refs of the words to write; positions here are the indexes
+//              that the serialized ranges refer to.
+// word_store - resolves each ref to a NUL-terminated string via getWord().
+void
+serialize_words(BufferWriter& writer, const std::vector<EntryRef>& word_list,
+                const memoryindex::WordStore& word_store)
+{
+    nbo_write<uint32_t>(writer, word_list.size());
+    for (const auto& word_ref : word_list) {
+        const char* word = word_store.getWord(word_ref);
+        // The length field is 32 bits wide; make the narrowing from size_t explicit.
+        const auto len = static_cast<uint32_t>(strlen(word));
+        nbo_write(writer, len);
+        writer.write(word, len);
+    }
+}
+
+// Writes the range section: a 32-bit count of documents that have a valid
+// `_ranges` ref, then for each such document (in increasing doc id order) the
+// doc id, the number of ranges, and per range the word index looked up in
+// `word_map` followed by `from` and `to`. All values go through nbo_write().
+//
+// The first entry of `refs` is asserted to have no ranges.
+// NOTE(review): presumably because doc id 0 is reserved - confirm.
+template <typename RefsVector, typename RangesStore>
+void
+serialize_ranges(BufferWriter& writer, const RefsVector& refs, const RangesStore& ranges,
+                 std::unordered_map<uint32_t, uint32_t>& word_map)
+{
+    uint32_t docs_with_ranges = 0;
+    if (!refs.empty()) {
+        assert(!refs.front()._ranges.valid());
+        for (auto& doc_refs : refs) {
+            if (doc_refs._ranges.valid()) {
+                ++docs_with_ranges;
+            }
+        }
+    }
+    nbo_write(writer, docs_with_ranges);
+    for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) {
+        auto ranges_ref = refs[doc_id]._ranges;
+        if (!ranges_ref.valid()) {
+            continue;
+        }
+        nbo_write(writer, doc_id);
+        auto doc_ranges = ranges.get(ranges_ref);
+        nbo_write<uint32_t>(writer, doc_ranges.size());
+        for (const auto& range : doc_ranges) {
+            nbo_write(writer, word_map[range.label_ref.ref()]);
+            nbo_write(writer, range.from);
+            nbo_write(writer, range.to);
+        }
+    }
+}
+
+// Writes the feature section: a 32-bit count of documents that have a valid
+// `_features` ref, then for each such document (in increasing doc id order)
+// the doc id, the number of features, and each feature value. All values go
+// through nbo_write().
+//
+// The first entry of `refs` is asserted to have no features.
+// NOTE(review): presumably because doc id 0 is reserved - confirm.
+template <typename RefsVector, typename FeaturesStore>
+void
+serialize_features(BufferWriter& writer, const RefsVector& refs, const FeaturesStore& features)
+{
+    uint32_t docs_with_features = 0;
+    if (!refs.empty()) {
+        assert(!refs.front()._features.valid());
+        for (auto& doc_refs : refs) {
+            if (doc_refs._features.valid()) {
+                ++docs_with_features;
+            }
+        }
+    }
+    nbo_write(writer, docs_with_features);
+    for (uint32_t doc_id = 0; doc_id < refs.size(); ++doc_id) {
+        auto features_ref = refs[doc_id]._features;
+        if (!features_ref.valid()) {
+            continue;
+        }
+        nbo_write(writer, doc_id);
+        auto doc_features = features.get(features_ref);
+        nbo_write<uint32_t>(writer, doc_features.size());
+        for (const auto& feature : doc_features) {
+            nbo_write(writer, feature);
+        }
+    }
+}
+
+} // namespace
+
+// Streams the store to `writer` in this order: arity (16 bits), the table of
+// words referenced by ranges, the per-document ranges, and the per-document
+// features. The word table is built first because the ranges are written
+// with indexes into it.
+void
+DocumentFeaturesStoreSaver::save(BufferWriter& writer) const
+{
+    std::unordered_map<uint32_t, uint32_t> word_index;
+    std::vector<EntryRef> used_words;
+    find_used_words(_refs, _ranges, word_index, used_words);
+
+    nbo_write<uint16_t>(writer, _arity);
+    serialize_words(writer, used_words, _word_store);
+    serialize_ranges(writer, _refs, _ranges, word_index);
+    serialize_features(writer, _refs, _features);
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h
new file mode 100644
index 00000000000..630ac3900f5
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store_saver.h
@@ -0,0 +1,33 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "document_features_store.h"
+
+namespace search { class BufferWriter; }
+
+namespace search::predicate {
+
+/*
+ * Class used to save a DocumentFeaturesStore instance, streaming the
+ * serialized data via a BufferWriter.
+ */
+// Holds references to the internals of the DocumentFeaturesStore it was
+// created from (only the arity is copied), so that store must outlive the
+// saver and must not be modified in a way that invalidates those references
+// while save() runs.
+class DocumentFeaturesStoreSaver {
+    using RefsVector = DocumentFeaturesStore::RefsVector;
+    using FeaturesStore = DocumentFeaturesStore::FeaturesStore;
+    using RangesStore = DocumentFeaturesStore::RangesStore;
+    using WordStore = DocumentFeaturesStore::WordStore;
+
+    const RefsVector&    _refs; // TODO: Use copy when saving in flush thread
+    const FeaturesStore& _features;
+    const RangesStore&   _ranges;
+    const WordStore&     _word_store;
+    const uint32_t       _arity; // serialized as a 16-bit value by save()
+
+public:
+    // explicit: a store should never convert to a saver implicitly.
+    explicit DocumentFeaturesStoreSaver(const DocumentFeaturesStore& store);
+    ~DocumentFeaturesStoreSaver();
+    // Writes the serialized store to `writer`. Does not flush the writer.
+    void save(BufferWriter& writer) const;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
index cf7d7dafa05..324e3fd99e8 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
@@ -1,7 +1,9 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "predicate_index.h"
+#include "document_features_store_saver.h"
#include "predicate_hash.h"
+#include <vespa/searchlib/util/data_buffer_writer.h>
#include <vespa/vespalib/datastore/buffer_type.hpp>
#include <vespa/vespalib/btree/btree.hpp>
#include <vespa/vespalib/btree/btreeroot.hpp>
@@ -127,7 +129,12 @@ PredicateIndex::~PredicateIndex() = default;
void
PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const {
- _features_store.serialize(buffer);
+ {
+ auto saver = _features_store.make_saver();
+ DataBufferWriter writer(buffer);
+ saver->save(writer);
+ writer.flush();
+ }
stats._features_len = buffer.getDataLen();
auto old_len = buffer.getDataLen();
buffer.writeInt16(_arity);
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
index e9661b5e919..a90ca1191dd 100644
--- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
@@ -4,6 +4,7 @@ vespa_add_library(searchlib_util OBJECT
bufferwriter.cpp
comprbuffer.cpp
comprfile.cpp
+ data_buffer_writer.cpp
dirtraverse.cpp
drainingbufferwriter.cpp
file_with_header.cpp
diff --git a/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp b/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp
new file mode 100644
index 00000000000..fb5061f394e
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/data_buffer_writer.cpp
@@ -0,0 +1,33 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "data_buffer_writer.h"
+#include <vespa/vespalib/data/databuffer.h>
+
+namespace search {
+
+namespace {
+
+// Amount of free space requested from the DataBuffer (via ensureFree) each
+// time the writer is attached to it: on construction and after every
+// non-empty flush.
+constexpr size_t buffer_size = 4_Ki;
+
+}
+
+// Attaches the writer to `data_buffer`: asks the buffer for at least
+// buffer_size bytes of free space and hands the buffer's free region to the
+// BufferWriter base via setup(). `data_buffer` is kept by reference.
+DataBufferWriter::DataBufferWriter(vespalib::DataBuffer& data_buffer)
+    : _data_buffer(data_buffer)
+{
+    _data_buffer.ensureFree(buffer_size);
+    auto* free_area = _data_buffer.getFree();
+    setup(free_area, _data_buffer.getFreeLen());
+}
+
+DataBufferWriter::~DataBufferWriter() = default;
+
+// Hands the bytes written so far over to the DataBuffer and re-attaches the
+// writer to the buffer's free region. A flush with nothing written is a no-op.
+void
+DataBufferWriter::flush()
+{
+    const bool has_pending = usedLen() > 0;
+    if (!has_pending) {
+        return;
+    }
+    // Commit the written bytes as data before asking for more room.
+    _data_buffer.moveFreeToData(usedLen());
+    _data_buffer.ensureFree(buffer_size);
+    auto* free_area = _data_buffer.getFree();
+    setup(free_area, _data_buffer.getFreeLen());
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/data_buffer_writer.h b/searchlib/src/vespa/searchlib/util/data_buffer_writer.h
new file mode 100644
index 00000000000..0d16bdfc4f4
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/data_buffer_writer.h
@@ -0,0 +1,24 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "bufferwriter.h"
+
+namespace vespalib { class DataBuffer; }
+
+namespace search {
+
+/**
+ * Class to write to a data buffer, used during migration of
+ * attribute vector saver implementation.
+ */
+// The destructor is defaulted in data_buffer_writer.cpp and does not flush:
+// call flush() after the last write and before reading the DataBuffer.
+// The DataBuffer is held by reference and must outlive the writer.
+class DataBufferWriter : public BufferWriter
+{
+    vespalib::DataBuffer& _data_buffer;
+public:
+    // explicit: a DataBuffer should never convert to a writer implicitly.
+    explicit DataBufferWriter(vespalib::DataBuffer& data_buffer);
+    ~DataBufferWriter();
+    // Commits written bytes to the DataBuffer and prepares for further writes.
+    void flush() override;
+};
+
+}