summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Geir Storli <geirst@yahooinc.com>, 2024-02-12 19:52:16 +0100
committer: GitHub <noreply@github.com>, 2024-02-12 19:52:16 +0100
commit: 98fba492d4a51e2eaba59790256fc526f32203a9 (patch)
tree: c66b5dc706eb32f0bedc57342c08aad5b25a44eb
parent: 370941e361e15568df5e34be4cecc9a9db86548b (diff)
parent: 158e69f96f5aced5655048ef97182ce535ad4190 (diff)
Merge pull request #30247 from vespa-engine/toregge/add-extra-logging-when-saving-predicate-attribute
Add extra logging when saving predicate attribute.
-rw-r--r-- searchlib/src/tests/predicate/predicate_index_test.cpp | 6
-rw-r--r-- searchlib/src/tests/predicate/simple_index_test.cpp | 3
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp | 20
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/predicate_index.cpp | 9
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/predicate_index.h | 15
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/simple_index.h | 13
-rw-r--r-- searchlib/src/vespa/searchlib/predicate/simple_index.hpp | 9
7 files changed, 65 insertions, 10 deletions
diff --git a/searchlib/src/tests/predicate/predicate_index_test.cpp b/searchlib/src/tests/predicate/predicate_index_test.cpp
index 40b650e489a..8e72fc1dbd9 100644
--- a/searchlib/src/tests/predicate/predicate_index_test.cpp
+++ b/searchlib/src/tests/predicate/predicate_index_test.cpp
@@ -292,7 +292,8 @@ TEST("require that PredicateIndex can be (de)serialized") {
index.commit();
vespalib::DataBuffer buffer;
- index.serialize(buffer);
+ PredicateIndex::SerializeStats dummy_stats;
+ index.serialize(buffer, dummy_stats);
uint32_t doc_id_limit;
DocIdLimitFinder finder(doc_id_limit);
PredicateIndex index2(generation_holder, dummy_provider, simple_index_config,
@@ -336,7 +337,8 @@ TEST("require that DocumentFeaturesStore is restored on deserialization") {
EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid());
indexFeature(index, doc_id, min_feature, {{hash, interval}}, {{hash2, bounds}});
vespalib::DataBuffer buffer;
- index.serialize(buffer);
+ PredicateIndex::SerializeStats dummy_stats;
+ index.serialize(buffer, dummy_stats);
uint32_t doc_id_limit;
DocIdLimitFinder finder(doc_id_limit);
PredicateIndex index2(generation_holder, dummy_provider, simple_index_config,
diff --git a/searchlib/src/tests/predicate/simple_index_test.cpp b/searchlib/src/tests/predicate/simple_index_test.cpp
index 8cd36a26f6e..c37d377e668 100644
--- a/searchlib/src/tests/predicate/simple_index_test.cpp
+++ b/searchlib/src/tests/predicate/simple_index_test.cpp
@@ -176,7 +176,8 @@ TEST_FF("require that SimpleIndex can be serialized and deserialized.", Fixture,
}
f1.commit();
vespalib::DataBuffer buffer;
- f1.index().serialize(buffer, MyDataSerializer());
+ SimpleIndex<MyData>::SerializeStats dummy_stats;
+ f1.index().serialize(buffer, MyDataSerializer(), dummy_stats);
MyObserver observer;
MyDataDeserializer deserializer;
f2.index().deserialize(buffer, deserializer, observer, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION);
diff --git a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
index ddf71063306..1f596699bca 100644
--- a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
@@ -141,8 +141,11 @@ PredicateAttribute::before_inc_generation(generation_t current_gen)
void
PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) {
LOG(info, "Saving predicate attribute version %d", getVersion());
+ vespalib::string name(getBaseFileName());
+ PredicateIndex::SerializeStats stats;
IAttributeSaveTarget::Buffer buffer(saveTarget.datWriter().allocBuf(4_Ki));
- _index->serialize(*buffer);
+ _index->serialize(*buffer, stats);
+ size_t predicate_index_len = buffer->getDataLen();
uint32_t highest_doc_id = static_cast<uint32_t>(_min_feature.size() - 1);
buffer->writeInt32(highest_doc_id);
for (size_t i = 1; i <= highest_doc_id; ++i) {
@@ -152,6 +155,21 @@ PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) {
buffer->writeInt16(_interval_range_vector[i]);
}
buffer->writeInt16(_max_interval_range);
+ auto min_feature_and_interval_range_vector_len = buffer->getDataLen() - predicate_index_len;
+ auto total_len = buffer->getDataLen();
+ LOG(info, "Serialized predicate attribute %s: "
+ "{features=%zu, zeros=%zu, "
+ "interval={dictionary=%zu, btrees=%zu, bytes=%zu}, "
+ "interval_with_bounds={dictionary=%zu, btrees=%zu, bytes=%zu}, "
+ "predicate-index_len=%zu, "
+ "min_feature and interval_range_vector=%zu, total=%zu}",
+ name.c_str(),
+ stats._features_len, stats._zeroes_len,
+ stats._interval._dictionary_size, stats._interval._btree_count, stats._interval._bytes,
+ stats._interval_with_bounds._dictionary_size, stats._interval_with_bounds._btree_count, stats._interval_with_bounds._bytes,
+ predicate_index_len,
+ min_feature_and_interval_range_vector_len,
+ total_len);
saveTarget.datWriter().writeBuf(std::move(buffer));
}
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
index c24c2f53f1d..7e3d62640cf 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
@@ -126,17 +126,20 @@ PredicateIndex::PredicateIndex(GenerationHolder &genHolder,
PredicateIndex::~PredicateIndex() = default;
void
-PredicateIndex::serialize(DataBuffer &buffer) const {
+PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const {
_features_store.serialize(buffer);
+ stats._features_len = buffer.getDataLen();
+ auto old_len = buffer.getDataLen();
buffer.writeInt16(_arity);
buffer.writeInt32(_zero_constraint_docs.size());
for (auto it = _zero_constraint_docs.begin(); it.valid(); ++it) {
buffer.writeInt32(it.getKey());
}
+ stats._zeroes_len = buffer.getDataLen() - old_len;
IntervalSerializer<Interval> interval_serializer(_interval_store);
- _interval_index.serialize(buffer, interval_serializer);
+ _interval_index.serialize(buffer, interval_serializer, stats._interval);
IntervalSerializer<IntervalWithBounds> bounds_serializer(_interval_store);
- _bounds_index.serialize(buffer, bounds_serializer);
+ _bounds_index.serialize(buffer, bounds_serializer, stats._interval_with_bounds);
}
void
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h
index 439187bccd7..351fa3a1a9f 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_index.h
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.h
@@ -37,6 +37,19 @@ public:
using GenerationHolder = vespalib::GenerationHolder;
using BTreeIterator = SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator;
using VectorIterator = SimpleIndex<vespalib::datastore::EntryRef>::VectorIterator;
+ struct SerializeStats {
+ size_t _features_len;
+ size_t _zeroes_len;
+ IntervalIndex::SerializeStats _interval;
+ BoundsIndex::SerializeStats _interval_with_bounds;
+ SerializeStats()
+ : _features_len(0),
+ _zeroes_len(0),
+ _interval(),
+ _interval_with_bounds()
+ {
+ }
+ };
private:
uint32_t _arity;
const DocIdLimitProvider &_limit_provider;
@@ -66,7 +79,7 @@ public:
SimpleIndexDeserializeObserver<> & observer, uint32_t version);
~PredicateIndex() override;
- void serialize(vespalib::DataBuffer &buffer) const;
+ void serialize(vespalib::DataBuffer &buffer, SerializeStats& stats) const;
void onDeserializationCompleted();
void indexEmptyDocument(uint32_t doc_id);
diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.h b/searchlib/src/vespa/searchlib/predicate/simple_index.h
index 3290aaf929e..0fb6ce6e9db 100644
--- a/searchlib/src/vespa/searchlib/predicate/simple_index.h
+++ b/searchlib/src/vespa/searchlib/predicate/simple_index.h
@@ -136,6 +136,17 @@ public:
using PostingVector = vespalib::RcuVectorBase<Posting>;
using VectorStore = vespalib::btree::BTree<Key, std::shared_ptr<PostingVector>, vespalib::btree::NoAggregated>;
using VectorIterator = PostingVectorIterator<Posting, Key, DocId>;
+ struct SerializeStats {
+ size_t _dictionary_size;
+ size_t _btree_count;
+ size_t _bytes;
+ SerializeStats()
+ : _dictionary_size(0),
+ _btree_count(0),
+ _bytes(0)
+ {
+ }
+ };
private:
using GenerationHolder = vespalib::GenerationHolder;
@@ -176,7 +187,7 @@ public:
~SimpleIndex();
void serialize(vespalib::DataBuffer &buffer,
- const PostingSerializer<Posting> &serializer) const;
+ const PostingSerializer<Posting> &serializer, SerializeStats& stats) const;
void deserialize(vespalib::DataBuffer &buffer,
PostingDeserializer<Posting> &deserializer,
SimpleIndexDeserializeObserver<Key, DocId> &observer, uint32_t version);
diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
index 46441a33692..0b5c8cbdb62 100644
--- a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
+++ b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
@@ -69,9 +69,12 @@ SimpleIndex<Posting, Key, DocId>::~SimpleIndex() {
template <typename Posting, typename Key, typename DocId>
void
-SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer) const {
+SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer, SerializeStats& stats) const {
assert(sizeof(Key) <= sizeof(uint64_t));
assert(sizeof(DocId) <= sizeof(uint32_t));
+ stats = SerializeStats();
+ stats._dictionary_size = _dictionary.size();
+ auto old_size = buffer.getDataLen();
buffer.writeInt32(_dictionary.size());
for (auto it = _dictionary.begin(); it.valid(); ++it) {
vespalib::datastore::EntryRef ref = it.getData();
@@ -79,12 +82,16 @@ SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const
auto posting_it = _btree_posting_lists.begin(ref);
if (!posting_it.valid())
continue;
+ if (posting_it.size() > 8u) {
+ ++stats._btree_count;
+ }
buffer.writeInt64(it.getKey()); // Key
for (; posting_it.valid(); ++posting_it) {
buffer.writeInt32(posting_it.getKey()); // DocId
serializer.serialize(posting_it.getData(), buffer);
}
}
+ stats._bytes = buffer.getDataLen() - old_size;
}
template <typename Posting, typename Key, typename DocId>