aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/predicate/predicate_index_test.cpp6
-rw-r--r--searchlib/src/tests/predicate/simple_index_test.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h10
-rw-r--r--searchlib/src/vespa/searchlib/aggregation/vdshit.h16
-rw-r--r--searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp20
-rw-r--r--searchlib/src/vespa/searchlib/index/indexbuilder.h14
-rw-r--r--searchlib/src/vespa/searchlib/predicate/document_features_store.cpp25
-rw-r--r--searchlib/src/vespa/searchlib/predicate/document_features_store.h5
-rw-r--r--searchlib/src/vespa/searchlib/predicate/predicate_index.cpp9
-rw-r--r--searchlib/src/vespa/searchlib/predicate/predicate_index.h15
-rw-r--r--searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h2
-rw-r--r--searchlib/src/vespa/searchlib/predicate/simple_index.h13
-rw-r--r--searchlib/src/vespa/searchlib/predicate/simple_index.hpp9
13 files changed, 86 insertions, 61 deletions
diff --git a/searchlib/src/tests/predicate/predicate_index_test.cpp b/searchlib/src/tests/predicate/predicate_index_test.cpp
index 40b650e489a..8e72fc1dbd9 100644
--- a/searchlib/src/tests/predicate/predicate_index_test.cpp
+++ b/searchlib/src/tests/predicate/predicate_index_test.cpp
@@ -292,7 +292,8 @@ TEST("require that PredicateIndex can be (de)serialized") {
index.commit();
vespalib::DataBuffer buffer;
- index.serialize(buffer);
+ PredicateIndex::SerializeStats dummy_stats;
+ index.serialize(buffer, dummy_stats);
uint32_t doc_id_limit;
DocIdLimitFinder finder(doc_id_limit);
PredicateIndex index2(generation_holder, dummy_provider, simple_index_config,
@@ -336,7 +337,8 @@ TEST("require that DocumentFeaturesStore is restored on deserialization") {
EXPECT_FALSE(index.getIntervalIndex().lookup(hash).valid());
indexFeature(index, doc_id, min_feature, {{hash, interval}}, {{hash2, bounds}});
vespalib::DataBuffer buffer;
- index.serialize(buffer);
+ PredicateIndex::SerializeStats dummy_stats;
+ index.serialize(buffer, dummy_stats);
uint32_t doc_id_limit;
DocIdLimitFinder finder(doc_id_limit);
PredicateIndex index2(generation_holder, dummy_provider, simple_index_config,
diff --git a/searchlib/src/tests/predicate/simple_index_test.cpp b/searchlib/src/tests/predicate/simple_index_test.cpp
index 8cd36a26f6e..c37d377e668 100644
--- a/searchlib/src/tests/predicate/simple_index_test.cpp
+++ b/searchlib/src/tests/predicate/simple_index_test.cpp
@@ -176,7 +176,8 @@ TEST_FF("require that SimpleIndex can be serialized and deserialized.", Fixture,
}
f1.commit();
vespalib::DataBuffer buffer;
- f1.index().serialize(buffer, MyDataSerializer());
+ SimpleIndex<MyData>::SerializeStats dummy_stats;
+ f1.index().serialize(buffer, MyDataSerializer(), dummy_stats);
MyObserver observer;
MyDataDeserializer deserializer;
f2.index().deserialize(buffer, deserializer, observer, PredicateAttribute::PREDICATE_ATTRIBUTE_VERSION);
diff --git a/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h b/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h
index 0bf33ea33e8..f90ee3c2312 100644
--- a/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h
+++ b/searchlib/src/vespa/searchlib/aggregation/hitsaggregationresult.h
@@ -16,8 +16,8 @@ public:
class SummaryGenerator
{
public:
- virtual ~SummaryGenerator() { }
- virtual vespalib::ConstBufferRef fillSummary(DocId lid, const SummaryClassType & summaryClass) = 0;
+ virtual ~SummaryGenerator() = default;
+ virtual vespalib::ConstBufferRef fillSummary(DocId lid, vespalib::stringref summaryClass) = 0;
};
private:
@@ -26,7 +26,7 @@ private:
void onAggregate(const ResultNode &result, const document::Document & doc, HitRank rank) override;
const ResultNode & onGetRank() const override;
- SummaryClassType _summaryClass;
+ vespalib::string _summaryClass;
uint32_t _maxHits;
HitList _hits;
bool _isOrdered;
@@ -50,8 +50,8 @@ public:
~HitsAggregationResult() override;
void postMerge() override { _hits.postMerge(_maxHits); }
void setSummaryGenerator(SummaryGenerator & summaryGenerator) { _summaryGenerator = &summaryGenerator; }
- const SummaryClassType & getSummaryClass() const { return _summaryClass; }
- HitsAggregationResult setSummaryClass(const SummaryClassType & summaryClass) { _summaryClass = summaryClass; return *this; }
+ const vespalib::string & getSummaryClass() const { return _summaryClass; }
+ HitsAggregationResult setSummaryClass(vespalib::stringref summaryClass) { _summaryClass = summaryClass; return *this; }
HitsAggregationResult &setMaxHits(uint32_t maxHits) {
_maxHits = (maxHits == 0) ? std::numeric_limits<uint32_t>::max() : maxHits;
return *this;
diff --git a/searchlib/src/vespa/searchlib/aggregation/vdshit.h b/searchlib/src/vespa/searchlib/aggregation/vdshit.h
index 11cfe9b3b18..32a35c22977 100644
--- a/searchlib/src/vespa/searchlib/aggregation/vdshit.h
+++ b/searchlib/src/vespa/searchlib/aggregation/vdshit.h
@@ -2,7 +2,6 @@
#pragma once
#include "hit.h"
-#include "aggregationresult.h"
namespace search::aggregation {
@@ -10,19 +9,18 @@ class VdsHit : public Hit
{
public:
using Summary = std::vector<uint8_t>;
- using DocId = vespalib::string;
DECLARE_IDENTIFIABLE_NS2(search, aggregation, VdsHit);
DECLARE_NBO_SERIALIZE;
VdsHit() noexcept : Hit(), _docId(), _summary() {}
- VdsHit(DocId docId, HitRank rank) noexcept : Hit(rank), _docId(docId), _summary() {}
- ~VdsHit();
+ VdsHit(vespalib::stringref docId, HitRank rank) noexcept : Hit(rank), _docId(docId), _summary() {}
+ ~VdsHit() override;
VdsHit *clone() const override { return new VdsHit(*this); }
void visitMembers(vespalib::ObjectVisitor &visitor) const override;
- const DocId & getDocId() const noexcept { return _docId; }
+ const vespalib::string & getDocId() const noexcept { return _docId; }
const Summary & getSummary() const noexcept { return _summary; }
- VdsHit & setDocId(DocId & docId) noexcept { _docId = docId; return *this; }
+ VdsHit & setDocId(vespalib::stringref docId) noexcept { _docId = docId; return *this; }
VdsHit & setSummary(const void * buf, size_t sz) noexcept {
- const uint8_t * v(static_cast<const uint8_t *>(buf));
+ const auto * v(static_cast<const uint8_t *>(buf));
Summary n(v, v+sz);
_summary.swap(n);
return *this;
@@ -30,8 +28,8 @@ public:
bool operator < (const VdsHit &b) const noexcept { return cmp(b) < 0; }
private:
- DocId _docId;
- Summary _summary;
+ vespalib::string _docId;
+ Summary _summary;
};
}
diff --git a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
index ddf71063306..1f596699bca 100644
--- a/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/predicate_attribute.cpp
@@ -141,8 +141,11 @@ PredicateAttribute::before_inc_generation(generation_t current_gen)
void
PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) {
LOG(info, "Saving predicate attribute version %d", getVersion());
+ vespalib::string name(getBaseFileName());
+ PredicateIndex::SerializeStats stats;
IAttributeSaveTarget::Buffer buffer(saveTarget.datWriter().allocBuf(4_Ki));
- _index->serialize(*buffer);
+ _index->serialize(*buffer, stats);
+ size_t predicate_index_len = buffer->getDataLen();
uint32_t highest_doc_id = static_cast<uint32_t>(_min_feature.size() - 1);
buffer->writeInt32(highest_doc_id);
for (size_t i = 1; i <= highest_doc_id; ++i) {
@@ -152,6 +155,21 @@ PredicateAttribute::onSave(IAttributeSaveTarget &saveTarget) {
buffer->writeInt16(_interval_range_vector[i]);
}
buffer->writeInt16(_max_interval_range);
+ auto min_feature_and_interval_range_vector_len = buffer->getDataLen() - predicate_index_len;
+ auto total_len = buffer->getDataLen();
+ LOG(info, "Serialized predicate attribute %s: "
+ "{features=%zu, zeros=%zu, "
+ "interval={dictionary=%zu, btrees=%zu, bytes=%zu}, "
+ "interval_with_bounds={dictionary %zu, btrees=%zu, bytes=%zu}, "
+ "predicate-index_len=%zu, "
+ "min_feature and interval_range_vector=%zu, total=%zu}",
+ name.c_str(),
+ stats._features_len, stats._zeroes_len,
+ stats._interval._dictionary_size, stats._interval._btree_count, stats._interval._bytes,
+ stats._interval_with_bounds._dictionary_size, stats._interval_with_bounds._btree_count, stats._interval_with_bounds._bytes,
+ predicate_index_len,
+ min_feature_and_interval_range_vector_len,
+ total_len);
saveTarget.datWriter().writeBuf(std::move(buffer));
}
diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.h b/searchlib/src/vespa/searchlib/index/indexbuilder.h
index 9615bfd9428..71b826698ef 100644
--- a/searchlib/src/vespa/searchlib/index/indexbuilder.h
+++ b/searchlib/src/vespa/searchlib/index/indexbuilder.h
@@ -10,6 +10,13 @@ class DocIdAndFeatures;
class Schema;
class WordDocElementWordPosFeatures;
+/**
+ * Interface for building an index for a single field.
+ * The index should be built as follows:
+ * Add the set of unique words in sorted order.
+ * For each word add the set of document ids in sorted order.
+ * For each document id add the position information for that document.
+ */
class FieldIndexBuilder {
public:
virtual ~FieldIndexBuilder() = default;
@@ -20,16 +27,11 @@ public:
/**
* Interface used to build an index for the set of index fields specified in a schema.
- *
- * The index should be built as follows:
- * For each field add the set of unique words in sorted order.
- * For each word add the set of document ids in sorted order.
- * For each document id add the position information for that document.
+ * Create and complete one field builder at a time.
*/
class IndexBuilder {
protected:
const Schema &_schema;
-
public:
explicit IndexBuilder(const Schema &schema);
virtual ~IndexBuilder();
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
index a3f10f14d54..1f580d707e8 100644
--- a/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.cpp
@@ -16,19 +16,11 @@ using std::vector;
namespace search::predicate {
-void
-DocumentFeaturesStore::setCurrent(uint32_t docId, FeatureVector *features) {
- _currDocId = docId;
- _currFeatures = features;
-}
-
DocumentFeaturesStore::DocumentFeaturesStore(uint32_t arity)
: _docs(),
_ranges(),
_word_store(),
_word_index(),
- _currDocId(0),
- _currFeatures(),
_numFeatures(0),
_numRanges(0),
_arity(arity) {
@@ -108,20 +100,6 @@ DocumentFeaturesStore::~DocumentFeaturesStore() {
}
void
-DocumentFeaturesStore::insert(uint64_t featureId, uint32_t docId) {
- assert(docId != 0);
- if (_currDocId != docId) {
- auto docsItr = _docs.find(docId);
- if (docsItr == _docs.end()) {
- docsItr = _docs.insert(std::make_pair(docId, FeatureVector())).first;
- }
- setCurrent(docId, &docsItr->second);
- }
- _currFeatures->push_back(featureId);
- ++_numFeatures;
-}
-
-void
DocumentFeaturesStore::insert(const PredicateTreeAnnotations &annotations, uint32_t doc_id) {
assert(doc_id != 0);
if (!annotations.features.empty()) {
@@ -189,9 +167,6 @@ DocumentFeaturesStore::remove(uint32_t doc_id) {
(_numRanges - range_itr->second.size()) : 0;
_ranges.erase(range_itr);
}
- if (_currDocId == doc_id) {
- setCurrent(0, nullptr);
- }
}
vespalib::MemoryUsage
diff --git a/searchlib/src/vespa/searchlib/predicate/document_features_store.h b/searchlib/src/vespa/searchlib/predicate/document_features_store.h
index 9225076000f..3b8aed53ca1 100644
--- a/searchlib/src/vespa/searchlib/predicate/document_features_store.h
+++ b/searchlib/src/vespa/searchlib/predicate/document_features_store.h
@@ -57,14 +57,10 @@ class DocumentFeaturesStore {
RangeFeaturesMap _ranges;
WordStore _word_store;
WordIndex _word_index;
- uint32_t _currDocId;
- FeatureVector *_currFeatures;
size_t _numFeatures;
size_t _numRanges;
uint32_t _arity;
- void setCurrent(uint32_t docId, FeatureVector *features);
-
public:
using FeatureSet = std::unordered_set<uint64_t>;
@@ -72,7 +68,6 @@ public:
DocumentFeaturesStore(vespalib::DataBuffer &buffer);
~DocumentFeaturesStore();
- void insert(uint64_t featureId, uint32_t docId);
void insert(const PredicateTreeAnnotations &annotations, uint32_t docId);
FeatureSet get(uint32_t docId) const;
void remove(uint32_t docId);
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
index c24c2f53f1d..7e3d62640cf 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
@@ -126,17 +126,20 @@ PredicateIndex::PredicateIndex(GenerationHolder &genHolder,
PredicateIndex::~PredicateIndex() = default;
void
-PredicateIndex::serialize(DataBuffer &buffer) const {
+PredicateIndex::serialize(DataBuffer &buffer, SerializeStats& stats) const {
_features_store.serialize(buffer);
+ stats._features_len = buffer.getDataLen();
+ auto old_len = buffer.getDataLen();
buffer.writeInt16(_arity);
buffer.writeInt32(_zero_constraint_docs.size());
for (auto it = _zero_constraint_docs.begin(); it.valid(); ++it) {
buffer.writeInt32(it.getKey());
}
+ stats._zeroes_len = buffer.getDataLen() - old_len;
IntervalSerializer<Interval> interval_serializer(_interval_store);
- _interval_index.serialize(buffer, interval_serializer);
+ _interval_index.serialize(buffer, interval_serializer, stats._interval);
IntervalSerializer<IntervalWithBounds> bounds_serializer(_interval_store);
- _bounds_index.serialize(buffer, bounds_serializer);
+ _bounds_index.serialize(buffer, bounds_serializer, stats._interval_with_bounds);
}
void
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h
index 439187bccd7..351fa3a1a9f 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_index.h
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_index.h
@@ -37,6 +37,19 @@ public:
using GenerationHolder = vespalib::GenerationHolder;
using BTreeIterator = SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator;
using VectorIterator = SimpleIndex<vespalib::datastore::EntryRef>::VectorIterator;
+ struct SerializeStats {
+ size_t _features_len;
+ size_t _zeroes_len;
+ IntervalIndex::SerializeStats _interval;
+ BoundsIndex::SerializeStats _interval_with_bounds;
+ SerializeStats()
+ : _features_len(0),
+ _zeroes_len(0),
+ _interval(),
+ _interval_with_bounds()
+ {
+ }
+ };
private:
uint32_t _arity;
const DocIdLimitProvider &_limit_provider;
@@ -66,7 +79,7 @@ public:
SimpleIndexDeserializeObserver<> & observer, uint32_t version);
~PredicateIndex() override;
- void serialize(vespalib::DataBuffer &buffer) const;
+ void serialize(vespalib::DataBuffer &buffer, SerializeStats& stats) const;
void onDeserializationCompleted();
void indexEmptyDocument(uint32_t doc_id);
diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h b/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h
index 3e8f9c98f22..389c346a61b 100644
--- a/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h
+++ b/searchlib/src/vespa/searchlib/predicate/predicate_tree_annotator.h
@@ -22,7 +22,7 @@ constexpr uint32_t MIN_INTERVAL = 0x0001;
constexpr uint32_t MAX_INTERVAL = 0xffff;
struct PredicateTreeAnnotations {
- PredicateTreeAnnotations(uint32_t mf=0, uint16_t ir=MAX_INTERVAL);
+ explicit PredicateTreeAnnotations(uint32_t mf=0, uint16_t ir=MAX_INTERVAL);
~PredicateTreeAnnotations();
uint32_t min_feature;
uint16_t interval_range;
diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.h b/searchlib/src/vespa/searchlib/predicate/simple_index.h
index 3290aaf929e..0fb6ce6e9db 100644
--- a/searchlib/src/vespa/searchlib/predicate/simple_index.h
+++ b/searchlib/src/vespa/searchlib/predicate/simple_index.h
@@ -136,6 +136,17 @@ public:
using PostingVector = vespalib::RcuVectorBase<Posting>;
using VectorStore = vespalib::btree::BTree<Key, std::shared_ptr<PostingVector>, vespalib::btree::NoAggregated>;
using VectorIterator = PostingVectorIterator<Posting, Key, DocId>;
+ struct SerializeStats {
+ size_t _dictionary_size;
+ size_t _btree_count;
+ size_t _bytes;
+ SerializeStats()
+ : _dictionary_size(0),
+ _btree_count(0),
+ _bytes(0)
+ {
+ }
+ };
private:
using GenerationHolder = vespalib::GenerationHolder;
@@ -176,7 +187,7 @@ public:
~SimpleIndex();
void serialize(vespalib::DataBuffer &buffer,
- const PostingSerializer<Posting> &serializer) const;
+ const PostingSerializer<Posting> &serializer, SerializeStats& stats) const;
void deserialize(vespalib::DataBuffer &buffer,
PostingDeserializer<Posting> &deserializer,
SimpleIndexDeserializeObserver<Key, DocId> &observer, uint32_t version);
diff --git a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
index 46441a33692..0b5c8cbdb62 100644
--- a/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
+++ b/searchlib/src/vespa/searchlib/predicate/simple_index.hpp
@@ -69,9 +69,12 @@ SimpleIndex<Posting, Key, DocId>::~SimpleIndex() {
template <typename Posting, typename Key, typename DocId>
void
-SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer) const {
+SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const PostingSerializer<Posting> &serializer, SerializeStats& stats) const {
assert(sizeof(Key) <= sizeof(uint64_t));
assert(sizeof(DocId) <= sizeof(uint32_t));
+ stats = SerializeStats();
+ stats._dictionary_size = _dictionary.size();
+ auto old_size = buffer.getDataLen();
buffer.writeInt32(_dictionary.size());
for (auto it = _dictionary.begin(); it.valid(); ++it) {
vespalib::datastore::EntryRef ref = it.getData();
@@ -79,12 +82,16 @@ SimpleIndex<Posting, Key, DocId>::serialize(vespalib::DataBuffer &buffer, const
auto posting_it = _btree_posting_lists.begin(ref);
if (!posting_it.valid())
continue;
+ if (posting_it.size() > 8u) {
+ ++stats._btree_count;
+ }
buffer.writeInt64(it.getKey()); // Key
for (; posting_it.valid(); ++posting_it) {
buffer.writeInt32(posting_it.getKey()); // DocId
serializer.serialize(posting_it.getData(), buffer);
}
}
+ stats._bytes = buffer.getDataLen() - old_size;
}
template <typename Posting, typename Key, typename DocId>