diff options
author | Geir Storli <geirst@yahooinc.com> | 2021-12-06 17:21:34 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-06 17:21:34 +0100 |
commit | 091a90a1b5a4db8ade3369c7c416a4e02491bbb9 (patch) | |
tree | 3b584c71a9816fcb341d98a3318e6b80c6085994 /searchlib | |
parent | ffdbd053a2b57383b2d463e8050394776b14abdf (diff) | |
parent | 1330d2c3d3b8647b6053ac37e95503cd0278e2e3 (diff) |
Merge pull request #20356 from vespa-engine/toregge/filter-early-on-buffer-id-for-normalize-values-and-foreach-values
Filter early on buffer id and pass vector of entries in normalize_values
Diffstat (limited to 'searchlib')
6 files changed, 477 insertions, 112 deletions
diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp index 9c25429932b..40ff25ff976 100644 --- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp +++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp @@ -8,6 +8,21 @@ LOG_SETUP("enumstore_test"); using Type = search::DictionaryConfig::Type; using vespalib::datastore::EntryRef; +using vespalib::datastore::EntryRefFilter; +using RefT = vespalib::datastore::EntryRefT<22>; + +namespace vespalib::datastore { + +/* + * Print EntryRef as RefT which is used by test_normalize_posting_lists and + * test_foreach_posting_list to differentiate between buffers + */ +void PrintTo(const EntryRef &ref, std::ostream* os) { + RefT iref(ref); + *os << "RefT(" << iref.offset() << "," << iref.bufferId() << ")"; +} + +} namespace search { @@ -597,6 +612,11 @@ public: void update_posting_idx(EnumIndex enum_idx, EntryRef old_posting_idx, EntryRef new_posting_idx); EnumIndex insert_value(size_t value_idx); + void populate_sample_data(uint32_t cnt); + std::vector<EntryRef> get_sample_values(uint32_t cnt); + void clear_sample_values(uint32_t cnt); + void test_normalize_posting_lists(bool use_filter, bool one_filter); + void test_foreach_posting_list(bool one_filter); static EntryRef fake_pidx() { return EntryRef(42); } }; @@ -620,6 +640,149 @@ EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::insert_value(size_t val return enum_idx; } +namespace { +/* + * large_population should trigger multiple callbacks from normalize_values + * and foreach_value + */ +constexpr uint32_t large_population = 1200; + +uint32_t select_buffer(uint32_t i) { + if ((i % 2) == 0) { + return 0; + } + if ((i % 3) == 0) { + return 1; + } + if ((i % 5) == 0) { + return 2; + } + return 3; +} + +EntryRef make_fake_pidx(uint32_t i) { return RefT(i + 200, select_buffer(i)); } +EntryRef make_fake_adjusted_pidx(uint32_t i) { return RefT(i + 500, select_buffer(i)); } +EntryRef adjust_fake_pidx(EntryRef ref) { RefT iref(ref); return RefT(iref.offset() + 300, iref.bufferId()); } + +} + + +template <typename EnumStoreTypeAndDictionaryType> +void +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::populate_sample_data(uint32_t cnt) +{ + auto& dict = store.get_dictionary(); + for (uint32_t i = 0; i < cnt; ++i) { + auto enum_idx = store.insert(i); + EXPECT_TRUE(enum_idx.valid()); + EntryRef posting_idx(make_fake_pidx(i)); + dict.update_posting_list(enum_idx, store.get_comparator(), [posting_idx](EntryRef) noexcept -> EntryRef { return posting_idx; }); + } +} + +template <typename EnumStoreTypeAndDictionaryType> +std::vector<EntryRef> +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::get_sample_values(uint32_t cnt) +{ + std::vector<EntryRef> result; + result.reserve(cnt); + store.freeze_dictionary(); + auto& dict = store.get_dictionary(); + for (uint32_t i = 0; i < cnt; ++i) { + auto compare = store.make_comparator(i); + auto enum_idx = dict.find(compare); + EXPECT_TRUE(enum_idx.valid()); + EntryRef posting_idx; + dict.update_posting_list(enum_idx, compare, [&posting_idx](EntryRef ref) noexcept { posting_idx = ref; return ref; });; + auto find_result = dict.find_posting_list(compare, dict.get_frozen_root()); + EXPECT_EQ(enum_idx, find_result.first); + EXPECT_EQ(posting_idx, find_result.second); + result.emplace_back(find_result.second); + } + return result; +} + +template <typename EnumStoreTypeAndDictionaryType> +void +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::clear_sample_values(uint32_t cnt) +{ + auto& dict = store.get_dictionary(); + for (uint32_t i = 0; i < cnt; ++i) { + auto comparator = store.make_comparator(i); + auto enum_idx = dict.find(comparator); + EXPECT_TRUE(enum_idx.valid()); + dict.update_posting_list(enum_idx, comparator, [](EntryRef) noexcept -> EntryRef { return EntryRef(); }); + } +} + +namespace { + +EntryRefFilter make_entry_ref_filter(bool one_filter) +{ + if (one_filter) { + EntryRefFilter filter(RefT::numBuffers(), RefT::offset_bits); + filter.add_buffer(3); + return filter; + } + return EntryRefFilter::create_all_filter(RefT::numBuffers(), RefT::offset_bits); +} + +} + +template <typename EnumStoreTypeAndDictionaryType> +void +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_normalize_posting_lists(bool use_filter, bool one_filter) +{ + populate_sample_data(large_population); + auto& dict = store.get_dictionary(); + std::vector<EntryRef> exp_refs; + std::vector<EntryRef> exp_adjusted_refs; + exp_refs.reserve(large_population); + exp_adjusted_refs.reserve(large_population); + for (uint32_t i = 0; i < large_population; ++i) { + exp_refs.emplace_back(make_fake_pidx(i)); + if (!use_filter || !one_filter || select_buffer(i) == 3) { + exp_adjusted_refs.emplace_back(make_fake_adjusted_pidx(i)); + } else { + exp_adjusted_refs.emplace_back(make_fake_pidx(i)); + } + } + EXPECT_EQ(exp_refs, get_sample_values(large_population)); + if (use_filter) { + auto filter = make_entry_ref_filter(one_filter); + auto dummy = [](std::vector<EntryRef>&) noexcept { }; + auto adjust_refs = [](std::vector<EntryRef> &refs) noexcept { for (auto &ref : refs) { ref = adjust_fake_pidx(ref); } }; + EXPECT_FALSE(dict.normalize_posting_lists(dummy, filter)); + EXPECT_EQ(exp_refs, get_sample_values(large_population)); + EXPECT_TRUE(dict.normalize_posting_lists(adjust_refs, filter)); + } else { + auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; }; + auto adjust_refs = [](EntryRef ref) noexcept { return adjust_fake_pidx(ref); }; + EXPECT_FALSE(dict.normalize_posting_lists(dummy)); + EXPECT_EQ(exp_refs, get_sample_values(large_population)); + EXPECT_TRUE(dict.normalize_posting_lists(adjust_refs)); + } + EXPECT_EQ(exp_adjusted_refs, get_sample_values(large_population)); + clear_sample_values(large_population); +} + +template <typename EnumStoreTypeAndDictionaryType> +void +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_foreach_posting_list(bool one_filter) +{ + auto filter = make_entry_ref_filter(one_filter); + populate_sample_data(large_population); + auto& dict = store.get_dictionary(); + std::vector<EntryRef> exp_refs; + auto save_exp_refs = [&exp_refs](std::vector<EntryRef>& refs) { exp_refs.insert(exp_refs.end(), refs.begin(), refs.end()); }; + EXPECT_FALSE(dict.normalize_posting_lists(save_exp_refs, filter)); + std::vector<EntryRef> act_refs; + auto save_act_refs = [&act_refs](const std::vector<EntryRef>& refs) { act_refs.insert(act_refs.end(), refs.begin(), refs.end()); }; + dict.foreach_posting_list(save_act_refs, filter); + EXPECT_EQ(exp_refs, act_refs); + clear_sample_values(large_population); +} + // Disable warnings emitted by gtest generated files when using typed tests #pragma GCC diagnostic push #ifndef __clang__ @@ -678,26 +841,27 @@ TYPED_TEST(EnumStoreDictionaryTest, find_posting_list_works) TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_works) { - auto value_0_idx = this->insert_value(0); - this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx()); - this->store.freeze_dictionary(); - auto& dict = this->store.get_dictionary(); - auto root = dict.get_frozen_root(); - auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root); - EXPECT_EQ(value_0_idx, find_result.first); - EXPECT_EQ(this->fake_pidx(), find_result.second); - auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; }; - std::vector<EntryRef> saved_refs; - auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); }; - EXPECT_FALSE(dict.normalize_posting_lists(dummy)); - EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear)); - EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear)); - EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs); - this->store.freeze_dictionary(); - root = dict.get_frozen_root(); - find_result = dict.find_posting_list(this->make_bound_comparator(0), root); - EXPECT_EQ(value_0_idx, find_result.first); - EXPECT_EQ(EntryRef(), find_result.second); + this->test_normalize_posting_lists(false, false); +} + +TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_all_filter_works) +{ + this->test_normalize_posting_lists(true, false); +} + +TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_one_filter_works) +{ + this->test_normalize_posting_lists(true, true); +} + +TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_all_filter_works) +{ + this->test_foreach_posting_list(false); +} + +TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_one_filter_works) +{ + this->test_foreach_posting_list(true); } namespace { diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp index 6c929ad5981..8bc28abc238 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp @@ -311,6 +311,165 @@ EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists( } template <> +bool +EnumStoreDictionary<EnumTree>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)>, const EntryRefFilter&) +{ + LOG_ABORT("should not be reached"); +} + +namespace { + +template <typename HashDictionaryT> +class ChangeWriterBase +{ +protected: + HashDictionaryT* _hash_dict; + static constexpr bool has_hash_dictionary = true; + ChangeWriterBase() + : _hash_dict(nullptr) + { + } +public: + void set_hash_dict(HashDictionaryT &hash_dict) { _hash_dict = &hash_dict; } +}; + +template <> +class ChangeWriterBase<vespalib::datastore::NoHashDictionary> +{ +protected: + static constexpr bool has_hash_dictionary = false; + ChangeWriterBase() = default; +}; + +template <typename HashDictionaryT> +class ChangeWriter : public ChangeWriterBase<HashDictionaryT> { + using Parent = ChangeWriterBase<HashDictionaryT>; + using Parent::has_hash_dictionary; + std::vector<std::pair<EntryRef,uint32_t*>> _tree_refs; +public: + ChangeWriter(uint32_t capacity); + ~ChangeWriter(); + bool write(const std::vector<EntryRef>& refs); + void emplace_back(EntryRef key, uint32_t& tree_ref) { _tree_refs.emplace_back(std::make_pair(key, &tree_ref)); } +}; + +template <typename HashDictionaryT> +ChangeWriter<HashDictionaryT>::ChangeWriter(uint32_t capacity) + : ChangeWriterBase<HashDictionaryT>(), + _tree_refs() +{ + _tree_refs.reserve(capacity); +} + +template <typename HashDictionaryT> +ChangeWriter<HashDictionaryT>::~ChangeWriter() = default; + +template <typename HashDictionaryT> +bool +ChangeWriter<HashDictionaryT>::write(const std::vector<EntryRef> &refs) +{ + bool changed = false; + assert(refs.size() == _tree_refs.size()); + auto tree_ref = _tree_refs.begin(); + for (auto ref : refs) { + EntryRef old_ref(*tree_ref->second); + if (ref != old_ref) { + if (!changed) { + // Note: Needs review when porting to other platforms + // Assumes that other CPUs observes stores from this CPU in order + std::atomic_thread_fence(std::memory_order_release); + changed = true; + } + *tree_ref->second = ref.ref(); + if constexpr (has_hash_dictionary) { + auto find_result = this->_hash_dict->find(this->_hash_dict->get_default_comparator(), tree_ref->first); + assert(find_result != nullptr && find_result->first.load_relaxed() == tree_ref->first); + assert(find_result->second.load_relaxed() == old_ref); + find_result->second.store_release(ref); + } + } + ++tree_ref; + } + assert(tree_ref == _tree_refs.end()); + _tree_refs.clear(); + return changed; +} + +} + +template <typename BTreeDictionaryT, typename HashDictionaryT> +bool +EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter) +{ + if constexpr (has_btree_dictionary) { + std::vector<EntryRef> refs; + refs.reserve(1024); + bool changed = false; + ChangeWriter<HashDictionaryT> change_writer(refs.capacity()); + if constexpr (has_hash_dictionary) { + change_writer.set_hash_dict(this->_hash_dict); + } + auto& dict = this->_btree_dict; + for (auto itr = dict.begin(); itr.valid(); ++itr) { + EntryRef ref(itr.getData()); + if (ref.valid()) { + if (filter.has(ref)) { + refs.emplace_back(ref); + change_writer.emplace_back(itr.getKey(), itr.getWData()); + if (refs.size() >= refs.capacity()) { + normalize(refs); + changed |= change_writer.write(refs); + refs.clear(); + } + } + } + } + if (!refs.empty()) { + normalize(refs); + changed |= change_writer.write(refs); + } + return changed; + } else { + return this->_hash_dict.normalize_values(normalize, filter); + } +} + +template <> +void +EnumStoreDictionary<EnumTree>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)>, const EntryRefFilter&) +{ + LOG_ABORT("should not be reached"); +} + +template <typename BTreeDictionaryT, typename HashDictionaryT> +void +EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter) +{ + if constexpr (has_btree_dictionary) { + std::vector<EntryRef> refs; + refs.reserve(1024); + auto& dict = this->_btree_dict; + for (auto itr = dict.begin(); itr.valid(); ++itr) { + EntryRef ref(itr.getData()); + if (ref.valid()) { + if (filter.has(ref)) { + refs.emplace_back(ref); + if (refs.size() >= refs.capacity()) { + callback(refs); + refs.clear(); + } + } + } + } + if (!refs.empty()) { + callback(refs); + } + } else { + this->_hash_dict.foreach_value(callback, filter); + } +} + +template <> const EnumPostingTree & EnumStoreDictionary<EnumTree>::get_posting_dictionary() const { diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h index 4d0509c0eb1..db1176c5484 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h @@ -16,6 +16,7 @@ template <typename BTreeDictionaryT, typename HashDictionaryT = vespalib::datast class EnumStoreDictionary : public vespalib::datastore::UniqueStoreDictionary<BTreeDictionaryT, IEnumStoreDictionary, HashDictionaryT> { protected: using EntryRef = IEnumStoreDictionary::EntryRef; + using EntryRefFilter = IEnumStoreDictionary::EntryRefFilter; using Index = IEnumStoreDictionary::Index; using BTreeDictionaryType = BTreeDictionaryT; using EntryComparator = IEnumStoreDictionary::EntryComparator; @@ -54,6 +55,8 @@ public: void clear_all_posting_lists(std::function<void(EntryRef)> clearer) override; void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) override; bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) override; + bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter) override; + void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter) override; const EnumPostingTree& get_posting_dictionary() const override; }; diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h index a8cf6881b86..a9716ec5d05 100644 --- a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h @@ -30,6 +30,7 @@ class IEnumStoreDictionary : public vespalib::datastore::IUniqueStoreDictionary public: using EntryRef = vespalib::datastore::EntryRef; using EntryComparator = vespalib::datastore::EntryComparator; + using EntryRefFilter = vespalib::datastore::EntryRefFilter; using EnumVector = IEnumStore::EnumVector; using Index = IEnumStore::Index; using IndexList = IEnumStore::IndexList; @@ -52,7 +53,25 @@ public: virtual Index remap_index(Index idx) = 0; virtual void clear_all_posting_lists(std::function<void(EntryRef)> clearer) = 0; virtual void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) = 0; + /* + * Scan dictionary and call normalize function for each value. If + * returned value is different then write back the modified value to + * the dictionary. Only used by unit tests. + */ virtual bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) = 0; + /* + * Scan dictionary and call normalize function for batches of values + * that pass the filter. Write back modified values to the dictionary. + * Used by compaction of posting lists when moving short arrays, + * bitvectors or btree roots. + */ + virtual bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const EntryRefFilter& filter) = 0; + /* + * Scan dictionary and call callback function for batches of values + * that pass the filter. Used by compaction of posting lists when + * moving btree nodes. + */ + virtual void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const EntryRefFilter& filter) = 0; virtual const EnumPostingTree& get_posting_dictionary() const = 0; }; diff --git a/searchlib/src/vespa/searchlib/attribute/postingstore.cpp b/searchlib/src/vespa/searchlib/attribute/postingstore.cpp index 3451c2b0456..8ed8a0cfbee 100644 --- a/searchlib/src/vespa/searchlib/attribute/postingstore.cpp +++ b/searchlib/src/vespa/searchlib/attribute/postingstore.cpp @@ -7,11 +7,13 @@ #include <vespa/vespalib/btree/btreeiterator.hpp> #include <vespa/vespalib/btree/btreerootbase.cpp> #include <vespa/vespalib/datastore/datastore.hpp> +#include <vespa/vespalib/datastore/entry_ref_filter.h> #include <vespa/vespalib/datastore/buffer_type.hpp> namespace search::attribute { using vespalib::btree::BTreeNoLeafData; +using vespalib::datastore::EntryRefFilter; // #define FORCE_BITVECTORS @@ -127,45 +129,47 @@ PostingStore<DataT>::removeSparseBitVectors() } } if (needscan) { - res = _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef - { return consider_remove_sparse_bitvector(posting_idx); }); + EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits); + filter.add_buffers(_bvType.get_active_buffers()); + res = _dictionary.normalize_posting_lists([this](std::vector<EntryRef>& refs) + { consider_remove_sparse_bitvector(refs); }, + filter); } return res; } template <typename DataT> -typename PostingStore<DataT>::EntryRef -PostingStore<DataT>::consider_remove_sparse_bitvector(EntryRef ref) +void +PostingStore<DataT>::consider_remove_sparse_bitvector(std::vector<EntryRef>& refs) { - if (!ref.valid() || !isBitVector(getTypeId(EntryRef(ref)))) { - return ref; - } - RefType iRef(ref); - uint32_t typeId = getTypeId(iRef); - assert(isBitVector(typeId)); - assert(_bvs.find(ref.ref() )!= _bvs.end()); - BitVectorEntry *bve = getWBitVectorEntry(iRef); - BitVector &bv = *bve->_bv.get(); - uint32_t docFreq = bv.countTrueBits(); - if (bve->_tree.valid()) { - RefType iRef2(bve->_tree); - assert(isBTree(iRef2)); - const BTreeType *tree = getTreeEntry(iRef2); - assert(tree->size(_allocator) == docFreq); - (void) tree; - } - if (docFreq < _minBvDocFreq) { - dropBitVector(ref); - if (ref.valid()) { + for (auto& ref : refs) { + RefType iRef(ref); + assert(iRef.valid()); + uint32_t typeId = getTypeId(iRef); + assert(isBitVector(typeId)); + assert(_bvs.find(iRef.ref()) != _bvs.end()); + BitVectorEntry *bve = getWBitVectorEntry(iRef); + BitVector &bv = *bve->_bv.get(); + uint32_t docFreq = bv.countTrueBits(); + if (bve->_tree.valid()) { + RefType iRef2(bve->_tree); + assert(isBTree(iRef2)); + const BTreeType *tree = getTreeEntry(iRef2); + assert(tree->size(_allocator) == docFreq); + (void) tree; + } + if (docFreq < _minBvDocFreq) { + dropBitVector(ref); iRef = ref; - typeId = getTypeId(iRef); - if (isBTree(typeId)) { - BTreeType *tree = getWTreeEntry(iRef); - normalizeTree(ref, tree, false); + if (iRef.valid()) { + typeId = getTypeId(iRef); + if (isBTree(typeId)) { + BTreeType *tree = getWTreeEntry(iRef); + normalizeTree(ref, tree, false); + } } } } - return ref; } template <typename DataT> @@ -647,74 +651,75 @@ PostingStore<DataT>::update_stat() template <typename DataT> void -PostingStore<DataT>::move_btree_nodes(EntryRef ref) +PostingStore<DataT>::move_btree_nodes(const std::vector<EntryRef>& refs) { - if (ref.valid()) { + for (auto ref : refs) { RefType iRef(ref); + assert(iRef.valid()); uint32_t typeId = getTypeId(iRef); uint32_t clusterSize = getClusterSize(typeId); - if (clusterSize == 0) { - if (isBitVector(typeId)) { - BitVectorEntry *bve = getWBitVectorEntry(iRef); - RefType iRef2(bve->_tree); - if (iRef2.valid()) { - assert(isBTree(iRef2)); - BTreeType *tree = getWTreeEntry(iRef2); - tree->move_nodes(_allocator); - } - } else { - BTreeType *tree = getWTreeEntry(iRef); + assert(clusterSize == 0); + if (isBitVector(typeId)) { + BitVectorEntry *bve = getWBitVectorEntry(iRef); + RefType iRef2(bve->_tree); + if (iRef2.valid()) { + assert(isBTree(iRef2)); + BTreeType *tree = getWTreeEntry(iRef2); tree->move_nodes(_allocator); } + } else { + assert(isBTree(typeId)); + BTreeType *tree = getWTreeEntry(iRef); + tree->move_nodes(_allocator); } } } template <typename DataT> -typename PostingStore<DataT>::EntryRef -PostingStore<DataT>::move(EntryRef ref) +void +PostingStore<DataT>::move(std::vector<EntryRef>& refs) { - if (!ref.valid()) { - return EntryRef(); - } - RefType iRef(ref); - uint32_t typeId = getTypeId(iRef); - uint32_t clusterSize = getClusterSize(typeId); - if (clusterSize == 0) { - if (isBitVector(typeId)) { - BitVectorEntry *bve = getWBitVectorEntry(iRef); - RefType iRef2(bve->_tree); - if (iRef2.valid()) { - assert(isBTree(iRef2)); - if (_store.getCompacting(iRef2)) { - BTreeType *tree = getWTreeEntry(iRef2); - auto ref_and_ptr = allocBTreeCopy(*tree); - tree->prepare_hold(); - bve->_tree = ref_and_ptr.ref; + for (auto& ref : refs) { + RefType iRef(ref); + assert(iRef.valid()); + uint32_t typeId = getTypeId(iRef); + uint32_t clusterSize = getClusterSize(typeId); + if (clusterSize == 0) { + if (isBitVector(typeId)) { + BitVectorEntry *bve = getWBitVectorEntry(iRef); + RefType iRef2(bve->_tree); + if (iRef2.valid()) { + assert(isBTree(iRef2)); + if (_store.getCompacting(iRef2)) { + BTreeType *tree = getWTreeEntry(iRef2); + auto ref_and_ptr = allocBTreeCopy(*tree); + tree->prepare_hold(); + // Note: Needs review when porting to other platforms + // Assumes that other CPUs observes stores from this CPU in order + std::atomic_thread_fence(std::memory_order_release); + bve->_tree = ref_and_ptr.ref; + } } + if (_store.getCompacting(iRef)) { + auto new_ref = allocBitVectorCopy(*bve).ref; + _bvs.erase(iRef.ref()); + _bvs.insert(new_ref.ref()); + ref = new_ref; + } + } else { + assert(isBTree(typeId)); + assert(_store.getCompacting(iRef)); + BTreeType *tree = getWTreeEntry(iRef); + auto ref_and_ptr = allocBTreeCopy(*tree); + tree->prepare_hold(); + ref = ref_and_ptr.ref; } - if (!_store.getCompacting(ref)) { - return ref; - } - auto new_ref = allocBitVectorCopy(*bve).ref; - _bvs.erase(ref.ref()); - _bvs.insert(new_ref.ref()); - return new_ref; } else { - if (!_store.getCompacting(ref)) { - return ref; - } - BTreeType *tree = getWTreeEntry(iRef); - auto ref_and_ptr = allocBTreeCopy(*tree); - tree->prepare_hold(); - return ref_and_ptr.ref; + assert(_store.getCompacting(iRef)); + const KeyDataType *shortArray = getKeyDataEntry(iRef, clusterSize); + ref = allocKeyDataCopy(shortArray, clusterSize).ref; } } - if (!_store.getCompacting(ref)) { - return ref; - } - const KeyDataType *shortArray = getKeyDataEntry(iRef, clusterSize); - return allocKeyDataCopy(shortArray, clusterSize).ref; } template <typename DataT> @@ -722,11 +727,12 @@ void PostingStore<DataT>::compact_worst_btree_nodes() { auto to_hold = this->start_compact_worst_btree_nodes(); - _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef - { - move_btree_nodes(posting_idx); - return posting_idx; - }); + EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits); + // Only look at buffers containing bitvectors and btree roots + filter.add_buffers(this->_treeType.get_active_buffers()); + filter.add_buffers(_bvType.get_active_buffers()); + _dictionary.foreach_posting_list([this](const std::vector<EntryRef>& refs) + { move_btree_nodes(refs); }, filter); this->finish_compact_worst_btree_nodes(to_hold); } @@ -735,8 +741,23 @@ void PostingStore<DataT>::compact_worst_buffers() { auto to_hold = this->start_compact_worst_buffers(); - _dictionary.normalize_posting_lists([this](EntryRef posting_idx) -> EntryRef - { return move(posting_idx); }); + bool compact_btree_roots = false; + EntryRefFilter filter(RefType::numBuffers(), RefType::offset_bits); + filter.add_buffers(to_hold); + // Start with looking at buffers being compacted + for (uint32_t buffer_id : to_hold) { + if (isBTree(_store.getBufferState(buffer_id).getTypeId())) { + compact_btree_roots = true; + } + } + if (compact_btree_roots) { + // If we are compacting btree roots then we also have to look at bitvector + // buffers + filter.add_buffers(_bvType.get_active_buffers()); + } + _dictionary.normalize_posting_lists([this](std::vector<EntryRef>& refs) + { return move(refs); }, + filter); this->finishCompact(to_hold); } diff --git a/searchlib/src/vespa/searchlib/attribute/postingstore.h b/searchlib/src/vespa/searchlib/attribute/postingstore.h index a0f0be1c430..2b119a55158 100644 --- a/searchlib/src/vespa/searchlib/attribute/postingstore.h +++ b/searchlib/src/vespa/searchlib/attribute/postingstore.h @@ -89,6 +89,7 @@ public: using Parent::getWTreeEntry; using Parent::getTreeEntry; using Parent::getKeyDataEntry; + using Parent::isBTree; using Parent::clusterLimit; using Parent::allocBTree; using Parent::allocBTreeCopy; @@ -105,10 +106,8 @@ public: ~PostingStore(); bool removeSparseBitVectors() override; - EntryRef consider_remove_sparse_bitvector(EntryRef ref); + void consider_remove_sparse_bitvector(std::vector<EntryRef> &refs); static bool isBitVector(uint32_t typeId) { return typeId == BUFFERTYPE_BITVECTOR; } - static bool isBTree(uint32_t typeId) { return typeId == BUFFERTYPE_BTREE; } - bool isBTree(RefType ref) const { return isBTree(getTypeId(ref)); } void applyNew(EntryRef &ref, AddIter a, AddIter ae); @@ -188,8 +187,8 @@ public: vespalib::MemoryUsage getMemoryUsage() const; vespalib::MemoryUsage update_stat(); - void move_btree_nodes(EntryRef ref); - EntryRef move(EntryRef ref); + void move_btree_nodes(const std::vector<EntryRef> &refs); + void move(std::vector<EntryRef>& refs); void compact_worst_btree_nodes(); void compact_worst_buffers(); |