diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-02-12 17:23:20 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2024-02-12 17:23:20 +0100 |
commit | 158e69f96f5aced5655048ef97182ce535ad4190 (patch) | |
tree | 2a3cf2ed0b90e845f331f9277437cc7fee38eafc /searchlib/src | |
parent | 9bf43297fe60aaffc0e2334b39ffb2f57294adb6 (diff) |
Add extra logging when saving predicate attribute.
Diffstat (limited to 'searchlib/src')
7 files changed, 65 insertions, 10 deletions
diff --git a/searchlib/src/tests/predicate/predicate_index_test.cpp b/searchlib/src/tests/predicate/predicate_index_test.cpp index 40b650e489a..8e72fc1dbd9 100644 --- a/searchlib/src/tests/predicate/predicate_index_test.cpp +++ b/searchlib/src/tests/predicate/predicate_index_test.cpp @@ -292,7 +292,8 @@ TEST("require that PredicateIndex can be (de)serialized") { index.commit(); vespalib::DataBuffer buffer; - index.serialize(buffer); + PredicateIndex::SerializeStats dummy_stats; + index.serialize(buffer, dummy_stats); uint32_t doc_id_limit; DocIdLimitFinder finder(doc_id_limit); PredicateIndex index2(generation_holder, dummy_provider, simple_index_config, @@ -336,7 +337,8 @@ TEST("require that DocumentFeaturesStore is restored on deserialization") { EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); indexFeature(index, doc_id, min_feature, {{hash, interval}}, {{hash2, bounds}}); vespalib::DataBuffer buffer; - index.serialize(buffer); + PredicateIndex::SerializeStats dummy_stats; + index.serialize(buffer, dummy_stats); uint32_t doc_id_limit; DocIdLimitFinder finder(doc_id_limit); PredicateIndex index2(generation_holder, dummy_provider, simple_index_config, diff --git a/searchlib/src/tests/predicate/simple_index_test.cpp b/searchlib/src/tests/predicate/simple_index_test.cpp index 8cd36a26f6e..c37d377e668 100644 --- a/searchlib/src/tests/predicate/simple_index_test.cpp +++ b/searchlib/src/tests/predicate/simple_index_test.cpp @@ -176,7 +176,8 @@ TEST_FF("require that SimpleIndex can be serialized and deserialized.", Fixture, } f1.commit(); vespalib::DataBuffer buffer; - f1.index().serialize(buffer, MyDataSerializer()); + SimpleIndex<MyData>::SerializeStats dummy_stats; + f1.index().serialize(buffer, MyDataSerializer(), dummy_stats); MyObserver observer; MyDataDeserializer deserializer; f2.index().deserialize(buffer, deserializer, observer, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION); diff --git a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp index ddf71063306..1f596699bca 100644 --- a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp +++ b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp @@ -141,8 +141,11 @@ PredicateAttribute::before_inc_generation(generation_t current_gen) void PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) { LOG(info, "Saving predicate attribute version %d", getVersion()); + vespalib::string name(getBaseFileName()); + PredicateIndex::SerializeStats stats; IAttributeSaveTarget::Buffer buffer(saveTarget.datWriter().allocBuf(4_Ki)); - _index->serialize(*buffer); + _index->serialize(*buffer, stats); + size_t predicate_index_len = buffer->getDataLen(); uint32_t highest_doc_id = static_cast<uint32_t>(_min_feature.size() - 1); buffer->writeInt32(highest_doc_id); for (size_t i = 1; i <= highest_doc_id; ++i) { @@ -152,6 +155,21 @@ PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) { buffer->writeInt16(_interval_range_vector[i]); } buffer->writeInt16(_max_interval_range); + auto min_feature_and_interval_range_vector_len = buffer->getDataLen() - predicate_index_len; + auto total_len = buffer->getDataLen(); + LOG(info, "Serialized predicate attribute %s: " + "{features=%zu, zeros=%zu, " + "interval={dictionary=%zu, btrees=%zu, bytes=%zu}, " + "interval_with_bounds={dictionary %zu, btrees=%zu, bytes=%zu}, " + "predicate-index_len=%zu, " + "min_feature and interval_range_vector=%zu, total=%zu}", + name.c_str(), + stats._features_len, stats._zeroes_len, + stats._interval._dictionary_size, stats._interval._btree_count, stats._interval._bytes, + stats._interval_with_bounds._dictionary_size, stats._interval_with_bounds._btree_count, stats._interval_with_bounds._bytes, + predicate_index_len, + min_feature_and_interval_range_vector_len, + total_len); saveTarget.datWriter().writeBuf(std::move(buffer)); } diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp index c24c2f53f1d..7e3d62640cf 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp @@ -126,17 +126,20 @@ PredicateIndex::PredicateIndex(GenerationHolder &genHolder, PredicateIndex::~PredicateIndex() = default; void -PredicateIndex::serialize(DataBuffer &buffer) const { +PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const { _features_store.serialize(buffer); + stats._features_len = buffer.getDataLen(); + auto old_len = buffer.getDataLen(); buffer.writeInt16(_arity); buffer.writeInt32(_zero_constraint_docs.size()); for (auto it = _zero_constraint_docs.begin(); it.valid(); ++it) { buffer.writeInt32(it.getKey()); } + stats._zeroes_len = buffer.getDataLen() - old_len; IntervalSerializer<Interval> interval_serializer(_interval_store); - _interval_index.serialize(buffer, interval_serializer); + _interval_index.serialize(buffer, interval_serializer, stats._interval); IntervalSerializer<IntervalWithBounds> bounds_serializer(_interval_store); - _bounds_index.serialize(buffer, bounds_serializer); + _bounds_index.serialize(buffer, bounds_serializer, stats._interval_with_bounds); } void diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h index 439187bccd7..351fa3a1a9f 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.h +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.h @@ -37,6 +37,19 @@ public: using GenerationHolder = vespalib::GenerationHolder; using BTreeIterator = SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator; using VectorIterator = SimpleIndex<vespalib::datastore::EntryRef>::VectorIterator; + struct SerializeStats { + size_t _features_len; + size_t _zeroes_len; + IntervalIndex::SerializeStats _interval; + BoundsIndex::SerializeStats _interval_with_bounds; + SerializeStats() + : _features_len(0), + _zeroes_len(0), + _interval(), + _interval_with_bounds() + { + } + }; private: uint32_t _arity; const DocIdLimitProvider &_limit_provider; @@ -66,7 +79,7 @@ public: SimpleIndexDeserializeObserver<> & observer, uint32_t version); ~PredicateIndex() override; - void serialize(vespalib::DataBuffer &buffer) const; + void serialize(vespalib::DataBuffer &buffer, SerializeStats& stats) const; void onDeserializationCompleted(); void indexEmptyDocument(uint32_t doc_id); diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.h b/searchlib/src/vespa/searchlib/predicate/simple_index.h index 3290aaf929e..0fb6ce6e9db 100644 --- a/searchlib/src/vespa/searchlib/predicate/simple_index.h +++ b/searchlib/src/vespa/searchlib/predicate/simple_index.h @@ -136,6 +136,17 @@ public: using PostingVector = vespalib::RcuVectorBase<Posting>; using VectorStore = vespalib::btree::BTree<Key, std::shared_ptr<PostingVector>, vespalib::btree::NoAggregated>; using VectorIterator = PostingVectorIterator<Posting, Key, DocId>; + struct SerializeStats { + size_t _dictionary_size; + size_t _btree_count; + size_t _bytes; + SerializeStats() + : _dictionary_size(0), + _btree_count(0), + _bytes(0) + { + } + }; private: using GenerationHolder = vespalib::GenerationHolder; @@ -176,7 +187,7 @@ public: ~SimpleIndex(); void serialize(vespalib::DataBuffer &buffer, - const PostingSerializer<Posting> &serializer) const; + const PostingSerializer<Posting> &serializer, SerializeStats& stats) const; void deserialize(vespalib::DataBuffer &buffer, PostingDeserializer<Posting> &deserializer, SimpleIndexDeserializeObserver<Key, DocId> &observer, uint32_t version); diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp index 46441a33692..0b5c8cbdb62 100644 --- a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp +++ b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp @@ -69,9 +69,12 @@ SimpleIndex<Posting, Key, DocId>::~SimpleIndex() { template <typename Posting, typename Key, typename DocId> void -SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer) const { +SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer, SerializeStats& stats) const { assert(sizeof(Key) <= sizeof(uint64_t)); assert(sizeof(DocId) <= sizeof(uint32_t)); + stats = SerializeStats(); + stats._dictionary_size = _dictionary.size(); + auto old_size = buffer.getDataLen(); buffer.writeInt32(_dictionary.size()); for (auto it = _dictionary.begin(); it.valid(); ++it) { vespalib::datastore::EntryRef ref = it.getData(); @@ -79,12 +82,16 @@ SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const auto posting_it = _btree_posting_lists.begin(ref); if (!posting_it.valid()) continue; + if (posting_it.size() > 8u) { + ++stats._btree_count; + } buffer.writeInt64(it.getKey()); // Key for (; posting_it.valid(); ++posting_it) { buffer.writeInt32(posting_it.getKey()); // DocId serializer.serialize(posting_it.getData(), buffer); } } + stats._bytes = buffer.getDataLen() - old_size; } template <typename Posting, typename Key, typename DocId> |