aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2021-12-06 17:21:34 +0100
committerGitHub <noreply@github.com>2021-12-06 17:21:34 +0100
commit091a90a1b5a4db8ade3369c7c416a4e02491bbb9 (patch)
tree3b584c71a9816fcb341d98a3318e6b80c6085994 /searchlib
parentffdbd053a2b57383b2d463e8050394776b14abdf (diff)
parent1330d2c3d3b8647b6053ac37e95503cd0278e2e3 (diff)
Merge pull request #20356 from vespa-engine/toregge/filter-early-on-buffer-id-for-normalize-values-and-foreach-values
Filter early on buffer id and pass vector of entries in normalize_values
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/enumstore/enumstore_test.cpp204
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp159
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h3
-rw-r--r--searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h19
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postingstore.cpp195
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postingstore.h9
6 files changed, 477 insertions, 112 deletions
diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
index 9c25429932b..40ff25ff976 100644
--- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
+++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
@@ -8,6 +8,21 @@ LOG_SETUP("enumstore_test");
using Type = search::DictionaryConfig::Type;
using vespalib::datastore::EntryRef;
+using vespalib::datastore::EntryRefFilter;
+using RefT = vespalib::datastore::EntryRefT<22>;
+
+namespace vespalib::datastore {
+
+/*
+ * Print EntryRef as RefT which is used by test_normalize_posting_lists and
+ * test_foreach_posting_list to differentiate between buffers
+ */
+void PrintTo(const EntryRef &ref, std::ostream* os) {
+ RefT iref(ref);
+ *os << "RefT(" << iref.offset() << "," << iref.bufferId() << ")";
+}
+
+}
namespace search {
@@ -597,6 +612,11 @@ public:
void update_posting_idx(EnumIndex enum_idx, EntryRef old_posting_idx, EntryRef new_posting_idx);
EnumIndex insert_value(size_t value_idx);
+ void populate_sample_data(uint32_t cnt);
+ std::vector<EntryRef> get_sample_values(uint32_t cnt);
+ void clear_sample_values(uint32_t cnt);
+ void test_normalize_posting_lists(bool use_filter, bool one_filter);
+ void test_foreach_posting_list(bool one_filter);
static EntryRef fake_pidx() { return EntryRef(42); }
};
@@ -620,6 +640,149 @@ EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::insert_value(size_t val
return enum_idx;
}
+namespace {
+/*
+ * large_population should trigger multiple callbacks from normalize_values
+ * and foreach_value
+ */
+constexpr uint32_t large_population = 1200;
+
+uint32_t select_buffer(uint32_t i) {
+ if ((i % 2) == 0) {
+ return 0;
+ }
+ if ((i % 3) == 0) {
+ return 1;
+ }
+ if ((i % 5) == 0) {
+ return 2;
+ }
+ return 3;
+}
+
+EntryRef make_fake_pidx(uint32_t i) { return RefT(i + 200, select_buffer(i)); }
+EntryRef make_fake_adjusted_pidx(uint32_t i) { return RefT(i + 500, select_buffer(i)); }
+EntryRef adjust_fake_pidx(EntryRef ref) { RefT iref(ref); return RefT(iref.offset() + 300, iref.bufferId()); }
+
+}
+
+
+template <typename EnumStoreTypeAndDictionaryType>
+void
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::populate_sample_data(uint32_t cnt)
+{
+ auto& dict = store.get_dictionary();
+ for (uint32_t i = 0; i < cnt; ++i) {
+ auto enum_idx = store.insert(i);
+ EXPECT_TRUE(enum_idx.valid());
+ EntryRef posting_idx(make_fake_pidx(i));
+ dict.update_posting_list(enum_idx, store.get_comparator(), [posting_idx](EntryRef) noexcept -> EntryRef { return posting_idx; });
+ }
+}
+
+template <typename EnumStoreTypeAndDictionaryType>
+std::vector<EntryRef>
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::get_sample_values(uint32_t cnt)
+{
+ std::vector<EntryRef> result;
+ result.reserve(cnt);
+ store.freeze_dictionary();
+ auto& dict = store.get_dictionary();
+ for (uint32_t i = 0; i < cnt; ++i) {
+ auto compare = store.make_comparator(i);
+ auto enum_idx = dict.find(compare);
+ EXPECT_TRUE(enum_idx.valid());
+ EntryRef posting_idx;
+ dict.update_posting_list(enum_idx, compare, [&posting_idx](EntryRef ref) noexcept { posting_idx = ref; return ref; });;
+ auto find_result = dict.find_posting_list(compare, dict.get_frozen_root());
+ EXPECT_EQ(enum_idx, find_result.first);
+ EXPECT_EQ(posting_idx, find_result.second);
+ result.emplace_back(find_result.second);
+ }
+ return result;
+}
+
+template <typename EnumStoreTypeAndDictionaryType>
+void
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::clear_sample_values(uint32_t cnt)
+{
+ auto& dict = store.get_dictionary();
+ for (uint32_t i = 0; i < cnt; ++i) {
+ auto comparator = store.make_comparator(i);
+ auto enum_idx = dict.find(comparator);
+ EXPECT_TRUE(enum_idx.valid());
+ dict.update_posting_list(enum_idx, comparator, [](EntryRef) noexcept -> EntryRef { return EntryRef(); });
+ }
+}
+
+namespace {
+
+EntryRefFilter make_entry_ref_filter(bool one_filter)
+{
+ if (one_filter) {
+ EntryRefFilter filter(RefT::numBuffers(), RefT::offset_bits);
+ filter.add_buffer(3);
+ return filter;
+ }
+ return EntryRefFilter::create_all_filter(RefT::numBuffers(), RefT::offset_bits);
+}
+
+}
+
+template <typename EnumStoreTypeAndDictionaryType>
+void
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_normalize_posting_lists(bool use_filter, bool one_filter)
+{
+ populate_sample_data(large_population);
+ auto& dict = store.get_dictionary();
+ std::vector<EntryRef> exp_refs;
+ std::vector<EntryRef> exp_adjusted_refs;
+ exp_refs.reserve(large_population);
+ exp_adjusted_refs.reserve(large_population);
+ for (uint32_t i = 0; i < large_population; ++i) {
+ exp_refs.emplace_back(make_fake_pidx(i));
+ if (!use_filter || !one_filter || select_buffer(i) == 3) {
+ exp_adjusted_refs.emplace_back(make_fake_adjusted_pidx(i));
+ } else {
+ exp_adjusted_refs.emplace_back(make_fake_pidx(i));
+ }
+ }
+ EXPECT_EQ(exp_refs, get_sample_values(large_population));
+ if (use_filter) {
+ auto filter = make_entry_ref_filter(one_filter);
+ auto dummy = [](std::vector<EntryRef>&) noexcept { };
+ auto adjust_refs = [](std::vector<EntryRef> &refs) noexcept { for (auto &ref : refs) { ref = adjust_fake_pidx(ref); } };
+ EXPECT_FALSE(dict.normalize_posting_lists(dummy, filter));
+ EXPECT_EQ(exp_refs, get_sample_values(large_population));
+ EXPECT_TRUE(dict.normalize_posting_lists(adjust_refs, filter));
+ } else {
+ auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; };
+ auto adjust_refs = [](EntryRef ref) noexcept { return adjust_fake_pidx(ref); };
+ EXPECT_FALSE(dict.normalize_posting_lists(dummy));
+ EXPECT_EQ(exp_refs, get_sample_values(large_population));
+ EXPECT_TRUE(dict.normalize_posting_lists(adjust_refs));
+ }
+ EXPECT_EQ(exp_adjusted_refs, get_sample_values(large_population));
+ clear_sample_values(large_population);
+}
+
+template <typename EnumStoreTypeAndDictionaryType>
+void
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_foreach_posting_list(bool one_filter)
+{
+ auto filter = make_entry_ref_filter(one_filter);
+ populate_sample_data(large_population);
+ auto& dict = store.get_dictionary();
+ std::vector<EntryRef> exp_refs;
+ auto save_exp_refs = [&exp_refs](std::vector<EntryRef>& refs) { exp_refs.insert(exp_refs.end(), refs.begin(), refs.end()); };
+ EXPECT_FALSE(dict.normalize_posting_lists(save_exp_refs, filter));
+ std::vector<EntryRef> act_refs;
+ auto save_act_refs = [&act_refs](const std::vector<EntryRef>& refs) { act_refs.insert(act_refs.end(), refs.begin(), refs.end()); };
+ dict.foreach_posting_list(save_act_refs, filter);
+ EXPECT_EQ(exp_refs, act_refs);
+ clear_sample_values(large_population);
+}
+
// Disable warnings emitted by gtest generated files when using typed tests
#pragma GCC diagnostic push
#ifndef __clang__
@@ -678,26 +841,27 @@ TYPED_TEST(EnumStoreDictionaryTest, find_posting_list_works)
TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_works)
{
- auto value_0_idx = this->insert_value(0);
- this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx());
- this->store.freeze_dictionary();
- auto& dict = this->store.get_dictionary();
- auto root = dict.get_frozen_root();
- auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
- EXPECT_EQ(value_0_idx, find_result.first);
- EXPECT_EQ(this->fake_pidx(), find_result.second);
- auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; };
- std::vector<EntryRef> saved_refs;
- auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); };
- EXPECT_FALSE(dict.normalize_posting_lists(dummy));
- EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear));
- EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear));
- EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs);
- this->store.freeze_dictionary();
- root = dict.get_frozen_root();
- find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
- EXPECT_EQ(value_0_idx, find_result.first);
- EXPECT_EQ(EntryRef(), find_result.second);
+ this->test_normalize_posting_lists(false, false);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_all_filter_works)
+{
+ this->test_normalize_posting_lists(true, false);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_one_filter_works)
+{
+ this->test_normalize_posting_lists(true, true);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_all_filter_works)
+{
+ this->test_foreach_posting_list(false);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_one_filter_works)
+{
+ this->test_foreach_posting_list(true);
}
namespace {
diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
index 6c929ad5981..8bc28abc238 100644
--- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
@@ -311,6 +311,165 @@ EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(
}
template <>
+bool
+EnumStoreDictionary<EnumTree>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)>, const EntryRefFilter&)
+{
+ LOG_ABORT("should not be reached");
+}
+
+namespace {
+
+template <typename HashDictionaryT>
+class ChangeWriterBase
+{
+protected:
+ HashDictionaryT* _hash_dict;
+ static constexpr bool has_hash_dictionary = true;
+ ChangeWriterBase()
+ : _hash_dict(nullptr)
+ {
+ }
+public:
+ void set_hash_dict(HashDictionaryT &hash_dict) { _hash_dict = &hash_dict; }
+};
+
+template <>
+class ChangeWriterBase<vespalib::datastore::NoHashDictionary>
+{
+protected:
+ static constexpr bool has_hash_dictionary = false;
+ ChangeWriterBase() = default;
+};
+
+template <typename HashDictionaryT>
+class ChangeWriter : public ChangeWriterBase<HashDictionaryT> {
+ using Parent = ChangeWriterBase<HashDictionaryT>;
+ using Parent::has_hash_dictionary;
+ std::vector<std::pair<EntryRef,uint32_t*>> _tree_refs;
+public:
+ ChangeWriter(uint32_t capacity);
+ ~ChangeWriter();
+ bool write(const std::vector<EntryRef>& refs);
+ void emplace_back(EntryRef key, uint32_t& tree_ref) { _tree_refs.emplace_back(std::make_pair(key, &tree_ref)); }
+};
+
+template <typename HashDictionaryT>
+ChangeWriter<HashDictionaryT>::ChangeWriter(uint32_t capacity)
+ : ChangeWriterBase<HashDictionaryT>(),
+ _tree_refs()
+{
+ _tree_refs.reserve(capacity);
+}
+
+template <typename HashDictionaryT>
+ChangeWriter<HashDictionaryT>::~ChangeWriter() = default;
+
+template <typename HashDictionaryT>
+bool
+ChangeWriter<HashDictionaryT>::write(const std::vector<EntryRef> &refs)
+{
+ bool changed = false;
+ assert(refs.size() == _tree_refs.size());
+ auto tree_ref = _tree_refs.begin();
+ for (auto ref : refs) {
+ EntryRef old_ref(*tree_ref->second);
+ if (ref != old_ref) {
+ if (!changed) {
+ // Note: Needs review when porting to other platforms
+ // Assumes that other CPUs observes stores from this CPU in order
+ std::atomic_thread_fence(std::memory_order_release);
+ changed = true;
+ }
+ *tree_ref->second = ref.ref();
+ if constexpr (has_hash_dictionary) {
+ auto find_result = this->_hash_dict->find(this->_hash_dict->get_default_comparator(), tree_ref->first);
+ assert(find_result != nullptr && find_result->first.load_relaxed() == tree_ref->first);
+ assert(find_result->second.load_relaxed() == old_ref);
+ find_result->second.store_release(ref);
+ }
+ }
+ ++tree_ref;
+ }
+ assert(tree_ref == _tree_refs.end());
+ _tree_refs.clear();
+ return changed;
+}
+
+}
+
+template <typename BTreeDictionaryT, typename HashDictionaryT>
+bool
+EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter)
+{
+ if constexpr (has_btree_dictionary) {
+ std::vector<EntryRef> refs;
+ refs.reserve(1024);
+ bool changed = false;
+ ChangeWriter<HashDictionaryT> change_writer(refs.capacity());
+ if constexpr (has_hash_dictionary) {
+ change_writer.set_hash_dict(this->_hash_dict);
+ }
+ auto& dict = this->_btree_dict;
+ for (auto itr = dict.begin(); itr.valid(); ++itr) {
+ EntryRef ref(itr.getData());
+ if (ref.valid()) {
+ if (filter.has(ref)) {
+ refs.emplace_back(ref);
+ change_writer.emplace_back(itr.getKey(), itr.getWData());
+ if (refs.size() >= refs.capacity()) {
+ normalize(refs);
+ changed |= change_writer.write(refs);
+ refs.clear();
+ }
+ }
+ }
+ }
+ if (!refs.empty()) {
+ normalize(refs);
+ changed |= change_writer.write(refs);
+ }
+ return changed;
+ } else {
+ return this->_hash_dict.normalize_values(normalize, filter);
+ }
+}
+
+template <>
+void
+EnumStoreDictionary<EnumTree>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)>, const EntryRefFilter&)
+{
+ LOG_ABORT("should not be reached");
+}
+
+template <typename BTreeDictionaryT, typename HashDictionaryT>
+void
+EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter)
+{
+ if constexpr (has_btree_dictionary) {
+ std::vector<EntryRef> refs;
+ refs.reserve(1024);
+ auto& dict = this->_btree_dict;
+ for (auto itr = dict.begin(); itr.valid(); ++itr) {
+ EntryRef ref(itr.getData());
+ if (ref.valid()) {
+ if (filter.has(ref)) {
+ refs.emplace_back(ref);
+ if (refs.size() >= refs.capacity()) {
+ callback(refs);
+ refs.clear();
+ }
+ }
+ }
+ }
+ if (!refs.empty()) {
+ callback(refs);
+ }
+ } else {
+ this->_hash_dict.foreach_value(callback, filter);
+ }
+}
+
+template <>
const EnumPostingTree &
EnumStoreDictionary<EnumTree>::get_posting_dictionary() const
{
diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
index 4d0509c0eb1..db1176c5484 100644
--- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
+++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
@@ -16,6 +16,7 @@ template <typename BTreeDictionaryT, typename HashDictionaryT = vespalib::datast
class EnumStoreDictionary : public vespalib::datastore::UniqueStoreDictionary<BTreeDictionaryT, IEnumStoreDictionary, HashDictionaryT> {
protected:
using EntryRef = IEnumStoreDictionary::EntryRef;
+ using EntryRefFilter = IEnumStoreDictionary::EntryRefFilter;
using Index = IEnumStoreDictionary::Index;
using BTreeDictionaryType = BTreeDictionaryT;
using EntryComparator = IEnumStoreDictionary::EntryComparator;
@@ -54,6 +55,8 @@ public:
void clear_all_posting_lists(std::function<void(EntryRef)> clearer) override;
void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) override;
bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) override;
+ bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter) override;
+ void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter) override;
const EnumPostingTree& get_posting_dictionary() const override;
};
diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
index a8cf6881b86..a9716ec5d05 100644
--- a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
+++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
@@ -30,6 +30,7 @@ class IEnumStoreDictionary : public vespalib::datastore::IUniqueStoreDictionary
public:
using EntryRef = vespalib::datastore::EntryRef;
using EntryComparator = vespalib::datastore::EntryComparator;
+ using EntryRefFilter = vespalib::datastore::EntryRefFilter;
using EnumVector = IEnumStore::EnumVector;
using Index = IEnumStore::Index;
using IndexList = IEnumStore::IndexList;
@@ -52,7 +53,25 @@ public:
virtual Index remap_index(Index idx) = 0;
virtual void clear_all_posting_lists(std::function<void(EntryRef)> clearer) = 0;
virtual void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) = 0;
+ /*
+ * Scan dictionary and call normalize function for each value. If
+ * returned value is different then write back the modified value to
+ * the dictionary. Only used by unit tests.
+ */
virtual bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) = 0;
+ /*
+ * Scan dictionary and call normalize function for batches of values
+ * that pass the filter. Write back modified values to the dictionary.
+ * Used by compaction of posting lists when moving short arrays,
+ * bitvectors or btree roots.
+ */
+ virtual bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter) = 0;
+ /*
+ * Scan dictionary and call callback function for batches of values
+ * that pass the filter. Used by compaction of posting lists when
+ * moving btree nodes.
+ */
+ virtual void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter) = 0;
virtual const EnumPostingTree& get_posting_dictionary() const = 0;
};
diff --git a/searchlib/src/vespa/searchlib/attribute/postingstore.cpp b/searchlib/src/vespa/searchlib/attribute/postingstore.cpp
index 3451c2b0456..8ed8a0cfbee 100644
--- a/searchlib/src/vespa/searchlib/attribute/postingstore.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/postingstore.cpp
@@ -7,11 +7,13 @@
#include <vespa/vespalib/btree/btreeiterator.hpp>
#include <vespa/vespalib/btree/btreerootbase.cpp>
#include <vespa/vespalib/datastore/datastore.hpp>
+#include <vespa/vespalib/datastore/entry_ref_filter.h>
#include <vespa/vespalib/datastore/buffer_type.hpp>
namespace search::attribute {
using vespalib::btree::BTreeNoLeafData;
+using vespalib::datastore::EntryRefFilter;
// #define FORCE_BITVECTORS
@@ -127,45 +129,47 @@ PostingStore<DataT>::removeSparseBitVectors()
}
}
if (needscan) {
- res = _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef
- { return consider_remove_sparse_bitvector(posting_idx); });
+ EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits);
+ filter.add_buffers(_bvType.get_active_buffers());
+ res = _dictionary.normalize_posting_lists([this](std::vector<EntryRef>& refs)
+ { consider_remove_sparse_bitvector(refs); },
+ filter);
}
return res;
}
template <typename DataT>
-typename PostingStore<DataT>::EntryRef
-PostingStore<DataT>::consider_remove_sparse_bitvector(EntryRef ref)
+void
+PostingStore<DataT>::consider_remove_sparse_bitvector(std::vector<EntryRef>& refs)
{
- if (!ref.valid() || !isBitVector(getTypeId(EntryRef(ref)))) {
- return ref;
- }
- RefType iRef(ref);
- uint32_t typeId = getTypeId(iRef);
- assert(isBitVector(typeId));
- assert(_bvs.find(ref.ref() )!= _bvs.end());
- BitVectorEntry *bve = getWBitVectorEntry(iRef);
- BitVector &bv = *bve->_bv.get();
- uint32_t docFreq = bv.countTrueBits();
- if (bve->_tree.valid()) {
- RefType iRef2(bve->_tree);
- assert(isBTree(iRef2));
- const BTreeType *tree = getTreeEntry(iRef2);
- assert(tree->size(_allocator) == docFreq);
- (void) tree;
- }
- if (docFreq < _minBvDocFreq) {
- dropBitVector(ref);
- if (ref.valid()) {
+ for (auto& ref : refs) {
+ RefType iRef(ref);
+ assert(iRef.valid());
+ uint32_t typeId = getTypeId(iRef);
+ assert(isBitVector(typeId));
+ assert(_bvs.find(iRef.ref()) != _bvs.end());
+ BitVectorEntry *bve = getWBitVectorEntry(iRef);
+ BitVector &bv = *bve->_bv.get();
+ uint32_t docFreq = bv.countTrueBits();
+ if (bve->_tree.valid()) {
+ RefType iRef2(bve->_tree);
+ assert(isBTree(iRef2));
+ const BTreeType *tree = getTreeEntry(iRef2);
+ assert(tree->size(_allocator) == docFreq);
+ (void) tree;
+ }
+ if (docFreq < _minBvDocFreq) {
+ dropBitVector(ref);
iRef = ref;
- typeId = getTypeId(iRef);
- if (isBTree(typeId)) {
- BTreeType *tree = getWTreeEntry(iRef);
- normalizeTree(ref, tree, false);
+ if (iRef.valid()) {
+ typeId = getTypeId(iRef);
+ if (isBTree(typeId)) {
+ BTreeType *tree = getWTreeEntry(iRef);
+ normalizeTree(ref, tree, false);
+ }
}
}
}
- return ref;
}
template <typename DataT>
@@ -647,74 +651,75 @@ PostingStore<DataT>::update_stat()
template <typename DataT>
void
-PostingStore<DataT>::move_btree_nodes(EntryRef ref)
+PostingStore<DataT>::move_btree_nodes(const std::vector<EntryRef>& refs)
{
- if (ref.valid()) {
+ for (auto ref : refs) {
RefType iRef(ref);
+ assert(iRef.valid());
uint32_t typeId = getTypeId(iRef);
uint32_t clusterSize = getClusterSize(typeId);
- if (clusterSize == 0) {
- if (isBitVector(typeId)) {
- BitVectorEntry *bve = getWBitVectorEntry(iRef);
- RefType iRef2(bve->_tree);
- if (iRef2.valid()) {
- assert(isBTree(iRef2));
- BTreeType *tree = getWTreeEntry(iRef2);
- tree->move_nodes(_allocator);
- }
- } else {
- BTreeType *tree = getWTreeEntry(iRef);
+ assert(clusterSize == 0);
+ if (isBitVector(typeId)) {
+ BitVectorEntry *bve = getWBitVectorEntry(iRef);
+ RefType iRef2(bve->_tree);
+ if (iRef2.valid()) {
+ assert(isBTree(iRef2));
+ BTreeType *tree = getWTreeEntry(iRef2);
tree->move_nodes(_allocator);
}
+ } else {
+ assert(isBTree(typeId));
+ BTreeType *tree = getWTreeEntry(iRef);
+ tree->move_nodes(_allocator);
}
}
}
template <typename DataT>
-typename PostingStore<DataT>::EntryRef
-PostingStore<DataT>::move(EntryRef ref)
+void
+PostingStore<DataT>::move(std::vector<EntryRef>& refs)
{
- if (!ref.valid()) {
- return EntryRef();
- }
- RefType iRef(ref);
- uint32_t typeId = getTypeId(iRef);
- uint32_t clusterSize = getClusterSize(typeId);
- if (clusterSize == 0) {
- if (isBitVector(typeId)) {
- BitVectorEntry *bve = getWBitVectorEntry(iRef);
- RefType iRef2(bve->_tree);
- if (iRef2.valid()) {
- assert(isBTree(iRef2));
- if (_store.getCompacting(iRef2)) {
- BTreeType *tree = getWTreeEntry(iRef2);
- auto ref_and_ptr = allocBTreeCopy(*tree);
- tree->prepare_hold();
- bve->_tree = ref_and_ptr.ref;
+ for (auto& ref : refs) {
+ RefType iRef(ref);
+ assert(iRef.valid());
+ uint32_t typeId = getTypeId(iRef);
+ uint32_t clusterSize = getClusterSize(typeId);
+ if (clusterSize == 0) {
+ if (isBitVector(typeId)) {
+ BitVectorEntry *bve = getWBitVectorEntry(iRef);
+ RefType iRef2(bve->_tree);
+ if (iRef2.valid()) {
+ assert(isBTree(iRef2));
+ if (_store.getCompacting(iRef2)) {
+ BTreeType *tree = getWTreeEntry(iRef2);
+ auto ref_and_ptr = allocBTreeCopy(*tree);
+ tree->prepare_hold();
+ // Note: Needs review when porting to other platforms
+ // Assumes that other CPUs observes stores from this CPU in order
+ std::atomic_thread_fence(std::memory_order_release);
+ bve->_tree = ref_and_ptr.ref;
+ }
}
+ if (_store.getCompacting(iRef)) {
+ auto new_ref = allocBitVectorCopy(*bve).ref;
+ _bvs.erase(iRef.ref());
+ _bvs.insert(new_ref.ref());
+ ref = new_ref;
+ }
+ } else {
+ assert(isBTree(typeId));
+ assert(_store.getCompacting(iRef));
+ BTreeType *tree = getWTreeEntry(iRef);
+ auto ref_and_ptr = allocBTreeCopy(*tree);
+ tree->prepare_hold();
+ ref = ref_and_ptr.ref;
}
- if (!_store.getCompacting(ref)) {
- return ref;
- }
- auto new_ref = allocBitVectorCopy(*bve).ref;
- _bvs.erase(ref.ref());
- _bvs.insert(new_ref.ref());
- return new_ref;
} else {
- if (!_store.getCompacting(ref)) {
- return ref;
- }
- BTreeType *tree = getWTreeEntry(iRef);
- auto ref_and_ptr = allocBTreeCopy(*tree);
- tree->prepare_hold();
- return ref_and_ptr.ref;
+ assert(_store.getCompacting(iRef));
+ const KeyDataType *shortArray = getKeyDataEntry(iRef, clusterSize);
+ ref = allocKeyDataCopy(shortArray, clusterSize).ref;
}
}
- if (!_store.getCompacting(ref)) {
- return ref;
- }
- const KeyDataType *shortArray = getKeyDataEntry(iRef, clusterSize);
- return allocKeyDataCopy(shortArray, clusterSize).ref;
}
template <typename DataT>
@@ -722,11 +727,12 @@ void
PostingStore<DataT>::compact_worst_btree_nodes()
{
auto to_hold = this->start_compact_worst_btree_nodes();
- _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef
- {
- move_btree_nodes(posting_idx);
- return posting_idx;
- });
+ EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits);
+ // Only look at buffers containing bitvectors and btree roots
+ filter.add_buffers(this->_treeType.get_active_buffers());
+ filter.add_buffers(_bvType.get_active_buffers());
+ _dictionary.foreach_posting_list([this](const std::vector<EntryRef>& refs)
+ { move_btree_nodes(refs); }, filter);
this->finish_compact_worst_btree_nodes(to_hold);
}
@@ -735,8 +741,23 @@ void
PostingStore<DataT>::compact_worst_buffers()
{
auto to_hold = this->start_compact_worst_buffers();
- _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef
- { return move(posting_idx); });
+ bool compact_btree_roots = false;
+ EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits);
+ filter.add_buffers(to_hold);
+ // Start with looking at buffers being compacted
+ for (uint32_t buffer_id : to_hold) {
+ if (isBTree(_store.getBufferState(buffer_id).getTypeId())) {
+ compact_btree_roots = true;
+ }
+ }
+ if (compact_btree_roots) {
+ // If we are compacting btree roots then we also have to look at bitvector
+ // buffers
+ filter.add_buffers(_bvType.get_active_buffers());
+ }
+ _dictionary.normalize_posting_lists([this](std::vector<EntryRef>& refs)
+ { return move(refs); },
+ filter);
this->finishCompact(to_hold);
}
diff --git a/searchlib/src/vespa/searchlib/attribute/postingstore.h b/searchlib/src/vespa/searchlib/attribute/postingstore.h
index a0f0be1c430..2b119a55158 100644
--- a/searchlib/src/vespa/searchlib/attribute/postingstore.h
+++ b/searchlib/src/vespa/searchlib/attribute/postingstore.h
@@ -89,6 +89,7 @@ public:
using Parent::getWTreeEntry;
using Parent::getTreeEntry;
using Parent::getKeyDataEntry;
+ using Parent::isBTree;
using Parent::clusterLimit;
using Parent::allocBTree;
using Parent::allocBTreeCopy;
@@ -105,10 +106,8 @@ public:
~PostingStore();
bool removeSparseBitVectors() override;
- EntryRef consider_remove_sparse_bitvector(EntryRef ref);
+ void consider_remove_sparse_bitvector(std::vector<EntryRef> &refs);
static bool isBitVector(uint32_t typeId) { return typeId == BUFFERTYPE_BITVECTOR; }
- static bool isBTree(uint32_t typeId) { return typeId == BUFFERTYPE_BTREE; }
- bool isBTree(RefType ref) const { return isBTree(getTypeId(ref)); }
void applyNew(EntryRef &ref, AddIter a, AddIter ae);
@@ -188,8 +187,8 @@ public:
vespalib::MemoryUsage getMemoryUsage() const;
vespalib::MemoryUsage update_stat();
- void move_btree_nodes(EntryRef ref);
- EntryRef move(EntryRef ref);
+ void move_btree_nodes(const std::vector<EntryRef> &refs);
+ void move(std::vector<EntryRef>& refs);
void compact_worst_btree_nodes();
void compact_worst_buffers();