diff options
Diffstat (limited to 'searchlib')
13 files changed, 86 insertions, 61 deletions
diff --git a/searchlib/src/tests/predicate/predicate_index_test.cpp b/searchlib/src/tests/predicate/predicate_index_test.cpp index 40b650e489a..8e72fc1dbd9 100644 --- a/searchlib/src/tests/predicate/predicate_index_test.cpp +++ b/searchlib/src/tests/predicate/predicate_index_test.cpp @@ -292,7 +292,8 @@ TEST("require that PredicateIndex can be (de)serialized") { index.commit(); vespalib::DataBuffer buffer; - index.serialize(buffer); + PredicateIndex::SerializeStats dummy_stats; + index.serialize(buffer, dummy_stats); uint32_t doc_id_limit; DocIdLimitFinder finder(doc_id_limit); PredicateIndex index2(generation_holder, dummy_provider, simple_index_config, @@ -336,7 +337,8 @@ TEST("require that DocumentFeaturesStore is restored on deserialization") { EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid()); indexFeature(index, doc_id, min_feature, {{hash, interval}}, {{hash2, bounds}}); vespalib::DataBuffer buffer; - index.serialize(buffer); + PredicateIndex::SerializeStats dummy_stats; + index.serialize(buffer, dummy_stats); uint32_t doc_id_limit; DocIdLimitFinder finder(doc_id_limit); PredicateIndex index2(generation_holder, dummy_provider, simple_index_config, diff --git a/searchlib/src/tests/predicate/simple_index_test.cpp b/searchlib/src/tests/predicate/simple_index_test.cpp index 8cd36a26f6e..c37d377e668 100644 --- a/searchlib/src/tests/predicate/simple_index_test.cpp +++ b/searchlib/src/tests/predicate/simple_index_test.cpp @@ -176,7 +176,8 @@ TEST_FF("require that SimpleIndex can be serialized and deserialized.", Fixture, } f1.commit(); vespalib::DataBuffer buffer; - f1.index().serialize(buffer, MyDataSerializer()); + SimpleIndex<MyData>::SerializeStats dummy_stats; + f1.index().serialize(buffer, MyDataSerializer(), dummy_stats); MyObserver observer; MyDataDeserializer deserializer; f2.index().deserialize(buffer, deserializer, observer, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION); diff --git a/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h b/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h index 0bf33ea33e8..f90ee3c2312 100644 --- a/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h +++ b/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h @@ -16,8 +16,8 @@ public: class SummaryGenerator { public: - virtual ~SummaryGenerator() { } - virtual vespalib::ConstBufferRef fillSummary(DocId lid, const SummaryClassType & summaryClass) = 0; + virtual ~SummaryGenerator() = default; + virtual vespalib::ConstBufferRef fillSummary(DocId lid, vespalib::stringref summaryClass) = 0; }; private: @@ -26,7 +26,7 @@ private: void onAggregate(const ResultNode &result, const document::Document & doc, HitRank rank) override; const ResultNode & onGetRank() const override; - SummaryClassType _summaryClass; + vespalib::string _summaryClass; uint32_t _maxHits; HitList _hits; bool _isOrdered; @@ -50,8 +50,8 @@ public: ~HitsAggregationResult() override; void postMerge() override { _hits.postMerge(_maxHits); } void setSummaryGenerator(SummaryGenerator & summaryGenerator) { _summaryGenerator = &summaryGenerator; } - const SummaryClassType & getSummaryClass() const { return _summaryClass; } - HitsAggregationResult setSummaryClass(const SummaryClassType & summaryClass) { _summaryClass = summaryClass; return *this; } + const vespalib::string & getSummaryClass() const { return _summaryClass; } + HitsAggregationResult setSummaryClass(vespalib::stringref summaryClass) { _summaryClass = summaryClass; return *this; } HitsAggregationResult &setMaxHits(uint32_t maxHits) { _maxHits = (maxHits == 0) ? std::numeric_limits<uint32_t>::max() : maxHits; return *this; diff --git a/searchlib/src/vespa/searchlib/aggregation/vdshit.h b/searchlib/src/vespa/searchlib/aggregation/vdshit.h index 11cfe9b3b18..32a35c22977 100644 --- a/searchlib/src/vespa/searchlib/aggregation/vdshit.h +++ b/searchlib/src/vespa/searchlib/aggregation/vdshit.h @@ -2,7 +2,6 @@ #pragma once #include "hit.h" -#include "aggregationresult.h" namespace search::aggregation { @@ -10,19 +9,18 @@ class VdsHit : public Hit { public: using Summary = std::vector<uint8_t>; - using DocId = vespalib::string; DECLARE_IDENTIFIABLE_NS2(search, aggregation, VdsHit); DECLARE_NBO_SERIALIZE; VdsHit() noexcept : Hit(), _docId(), _summary() {} - VdsHit(DocId docId, HitRank rank) noexcept : Hit(rank), _docId(docId), _summary() {} - ~VdsHit(); + VdsHit(vespalib::stringref docId, HitRank rank) noexcept : Hit(rank), _docId(docId), _summary() {} + ~VdsHit() override; VdsHit *clone() const override { return new VdsHit(*this); } void visitMembers(vespalib::ObjectVisitor &visitor) const override; - const DocId & getDocId() const noexcept { return _docId; } + const vespalib::string & getDocId() const noexcept { return _docId; } const Summary & getSummary() const noexcept { return _summary; } - VdsHit & setDocId(DocId & docId) noexcept { _docId = docId; return *this; } + VdsHit & setDocId(vespalib::stringref docId) noexcept { _docId = docId; return *this; } VdsHit & setSummary(const void * buf, size_t sz) noexcept { - const uint8_t * v(static_cast<const uint8_t *>(buf)); + const auto * v(static_cast<const uint8_t *>(buf)); Summary n(v, v+sz); _summary.swap(n); return *this; @@ -30,8 +28,8 @@ public: bool operator < (const VdsHit &b) const noexcept { return cmp(b) < 0; } private: - DocId _docId; - Summary _summary; + vespalib::string _docId; + Summary _summary; }; } diff --git a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp index ddf71063306..1f596699bca 100644 --- a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp +++ b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp @@ -141,8 +141,11 @@ PredicateAttribute::before_inc_generation(generation_t current_gen) void PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) { LOG(info, "Saving predicate attribute version %d", getVersion()); + vespalib::string name(getBaseFileName()); + PredicateIndex::SerializeStats stats; IAttributeSaveTarget::Buffer buffer(saveTarget.datWriter().allocBuf(4_Ki)); - _index->serialize(*buffer); + _index->serialize(*buffer, stats); + size_t predicate_index_len = buffer->getDataLen(); uint32_t highest_doc_id = static_cast<uint32_t>(_min_feature.size() - 1); buffer->writeInt32(highest_doc_id); for (size_t i = 1; i <= highest_doc_id; ++i) { @@ -152,6 +155,21 @@ PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) { buffer->writeInt16(_interval_range_vector[i]); } buffer->writeInt16(_max_interval_range); + auto min_feature_and_interval_range_vector_len = buffer->getDataLen() - predicate_index_len; + auto total_len = buffer->getDataLen(); + LOG(info, "Serialized predicate attribute %s: " + "{features=%zu, zeros=%zu, " + "interval={dictionary=%zu, btrees=%zu, bytes=%zu}, " + "interval_with_bounds={dictionary %zu, btrees=%zu, bytes=%zu}, " + "predicate-index_len=%zu, " + "min_feature and interval_range_vector=%zu, total=%zu}", + name.c_str(), + stats._features_len, stats._zeroes_len, + stats._interval._dictionary_size, stats._interval._btree_count, stats._interval._bytes, + stats._interval_with_bounds._dictionary_size, stats._interval_with_bounds._btree_count, stats._interval_with_bounds._bytes, + predicate_index_len, + min_feature_and_interval_range_vector_len, + total_len); saveTarget.datWriter().writeBuf(std::move(buffer)); } diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.h b/searchlib/src/vespa/searchlib/index/indexbuilder.h index 9615bfd9428..71b826698ef 100644 --- a/searchlib/src/vespa/searchlib/index/indexbuilder.h +++ b/searchlib/src/vespa/searchlib/index/indexbuilder.h @@ -10,6 +10,13 @@ class DocIdAndFeatures; class Schema; class WordDocElementWordPosFeatures; +/** + * Interface for building an index for a single field + * The index should be built as follows: + * Add the set of unique words in sorted order. + * For each word add the set of document ids in sorted order. + * For each document id add the position information for that document. + */ class FieldIndexBuilder { public: virtual ~FieldIndexBuilder() = default; @@ -20,16 +27,11 @@ public: /** * Interface used to build an index for the set of index fields specified in a schema. - * - * The index should be built as follows: - * For each field add the set of unique words in sorted order. - * For each word add the set of document ids in sorted order. - * For each document id add the position information for that document. + * Create and complete one field builder at the time. */ class IndexBuilder { protected: const Schema &_schema; - public: explicit IndexBuilder(const Schema &schema); virtual ~IndexBuilder(); diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp index a3f10f14d54..1f580d707e8 100644 --- a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp @@ -16,19 +16,11 @@ using std::vector; namespace search::predicate { -void -DocumentFeaturesStore::setCurrent(uint32_t docId, FeatureVector *features) { - _currDocId = docId; - _currFeatures = features; -} - DocumentFeaturesStore::DocumentFeaturesStore(uint32_t arity) : _docs(), _ranges(), _word_store(), _word_index(), - _currDocId(0), - _currFeatures(), _numFeatures(0), _numRanges(0), _arity(arity) { @@ -108,20 +100,6 @@ DocumentFeaturesStore::~DocumentFeaturesStore() { } void -DocumentFeaturesStore::insert(uint64_t featureId, uint32_t docId) { - assert(docId != 0); - if (_currDocId != docId) { - auto docsItr = _docs.find(docId); - if (docsItr == _docs.end()) { - docsItr = _docs.insert(std::make_pair(docId, FeatureVector())).first; - } - setCurrent(docId, &docsItr->second); - } - _currFeatures->push_back(featureId); - ++_numFeatures; -} - -void DocumentFeaturesStore::insert(const PredicateTreeAnnotations &annotations, uint32_t doc_id) { assert(doc_id != 0); if (!annotations.features.empty()) { @@ -189,9 +167,6 @@ DocumentFeaturesStore::remove(uint32_t doc_id) { (_numRanges - range_itr->second.size()) : 0; _ranges.erase(range_itr); } - if (_currDocId == doc_id) { - setCurrent(0, nullptr); - } } vespalib::MemoryUsage diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.h b/searchlib/src/vespa/searchlib/predicate/document_features_store.h index 9225076000f..3b8aed53ca1 100644 --- a/searchlib/src/vespa/searchlib/predicate/document_features_store.h +++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.h @@ -57,14 +57,10 @@ class DocumentFeaturesStore { RangeFeaturesMap _ranges; WordStore _word_store; WordIndex _word_index; - uint32_t _currDocId; - FeatureVector *_currFeatures; size_t _numFeatures; size_t _numRanges; uint32_t _arity; - void setCurrent(uint32_t docId, FeatureVector *features); - public: using FeatureSet = std::unordered_set<uint64_t>; @@ -72,7 +68,6 @@ public: DocumentFeaturesStore(vespalib::DataBuffer &buffer); ~DocumentFeaturesStore(); - void insert(uint64_t featureId, uint32_t docId); void insert(const PredicateTreeAnnotations &annotations, uint32_t docId); FeatureSet get(uint32_t docId) const; void remove(uint32_t docId); diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp index c24c2f53f1d..7e3d62640cf 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp @@ -126,17 +126,20 @@ PredicateIndex::PredicateIndex(GenerationHolder &genHolder, PredicateIndex::~PredicateIndex() = default; void -PredicateIndex::serialize(DataBuffer &buffer) const { +PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const { _features_store.serialize(buffer); + stats._features_len = buffer.getDataLen(); + auto old_len = buffer.getDataLen(); buffer.writeInt16(_arity); buffer.writeInt32(_zero_constraint_docs.size()); for (auto it = _zero_constraint_docs.begin(); it.valid(); ++it) { buffer.writeInt32(it.getKey()); } + stats._zeroes_len = buffer.getDataLen() - old_len; IntervalSerializer<Interval> interval_serializer(_interval_store); - _interval_index.serialize(buffer, interval_serializer); + _interval_index.serialize(buffer, interval_serializer, stats._interval); IntervalSerializer<IntervalWithBounds> bounds_serializer(_interval_store); - _bounds_index.serialize(buffer, bounds_serializer); + _bounds_index.serialize(buffer, bounds_serializer, stats._interval_with_bounds); } void diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h index 439187bccd7..351fa3a1a9f 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_index.h +++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.h @@ -37,6 +37,19 @@ public: using GenerationHolder = vespalib::GenerationHolder; using BTreeIterator = SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator; using VectorIterator = SimpleIndex<vespalib::datastore::EntryRef>::VectorIterator; + struct SerializeStats { + size_t _features_len; + size_t _zeroes_len; + IntervalIndex::SerializeStats _interval; + BoundsIndex::SerializeStats _interval_with_bounds; + SerializeStats() + : _features_len(0), + _zeroes_len(0), + _interval(), + _interval_with_bounds() + { + } + }; private: uint32_t _arity; const DocIdLimitProvider &_limit_provider; @@ -66,7 +79,7 @@ public: SimpleIndexDeserializeObserver<> & observer, uint32_t version); ~PredicateIndex() override; - void serialize(vespalib::DataBuffer &buffer) const; + void serialize(vespalib::DataBuffer &buffer, SerializeStats& stats) const; void onDeserializationCompleted(); void indexEmptyDocument(uint32_t doc_id); diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h b/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h index 3e8f9c98f22..389c346a61b 100644 --- a/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h +++ b/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h @@ -22,7 +22,7 @@ constexpr uint32_t MIN_INTERVAL = 0x0001; constexpr uint32_t MAX_INTERVAL = 0xffff; struct PredicateTreeAnnotations { - PredicateTreeAnnotations(uint32_t mf=0, uint16_t ir=MAX_INTERVAL); + explicit PredicateTreeAnnotations(uint32_t mf=0, uint16_t ir=MAX_INTERVAL); ~PredicateTreeAnnotations(); uint32_t min_feature; uint16_t interval_range; diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.h b/searchlib/src/vespa/searchlib/predicate/simple_index.h index 3290aaf929e..0fb6ce6e9db 100644 --- a/searchlib/src/vespa/searchlib/predicate/simple_index.h +++ b/searchlib/src/vespa/searchlib/predicate/simple_index.h @@ -136,6 +136,17 @@ public: using PostingVector = vespalib::RcuVectorBase<Posting>; using VectorStore = vespalib::btree::BTree<Key, std::shared_ptr<PostingVector>, vespalib::btree::NoAggregated>; using VectorIterator = PostingVectorIterator<Posting, Key, DocId>; + struct SerializeStats { + size_t _dictionary_size; + size_t _btree_count; + size_t _bytes; + SerializeStats() + : _dictionary_size(0), + _btree_count(0), + _bytes(0) + { + } + }; private: using GenerationHolder = vespalib::GenerationHolder; @@ -176,7 +187,7 @@ public: ~SimpleIndex(); void serialize(vespalib::DataBuffer &buffer, - const PostingSerializer<Posting> &serializer) const; + const PostingSerializer<Posting> &serializer, SerializeStats& stats) const; void deserialize(vespalib::DataBuffer &buffer, PostingDeserializer<Posting> &deserializer, SimpleIndexDeserializeObserver<Key, DocId> &observer, uint32_t version); diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp index 46441a33692..0b5c8cbdb62 100644 --- a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp +++ b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp @@ -69,9 +69,12 @@ SimpleIndex<Posting, Key, DocId>::~SimpleIndex() { template <typename Posting, typename Key, typename DocId> void -SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer) const { +SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer, SerializeStats& stats) const { assert(sizeof(Key) <= sizeof(uint64_t)); assert(sizeof(DocId) <= sizeof(uint32_t)); + stats = SerializeStats(); + stats._dictionary_size = _dictionary.size(); + auto old_size = buffer.getDataLen(); buffer.writeInt32(_dictionary.size()); for (auto it = _dictionary.begin(); it.valid(); ++it) { vespalib::datastore::EntryRef ref = it.getData(); @@ -79,12 +82,16 @@ SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const auto posting_it = _btree_posting_lists.begin(ref); if (!posting_it.valid()) continue; + if (posting_it.size() > 8u) { + ++stats._btree_count; + } buffer.writeInt64(it.getKey()); // Key for (; posting_it.valid(); ++posting_it) { buffer.writeInt32(posting_it.getKey()); // DocId serializer.serialize(posting_it.getData(), buffer); } } + stats._bytes = buffer.getDataLen() - old_size; } template <typename Posting, typename Key, typename DocId> |