diff options
author | Tor Egge <Tor.Egge@online.no> | 2021-12-03 18:55:45 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2021-12-03 18:55:45 +0100 |
commit | 472cb247d6d4d5e8cd58d537f58ce9515ef79869 (patch) | |
tree | abeba8051fbcca0f4a4a700d8f51b03e18d31b38 | |
parent | 42e3653b19f95847d14b9b90478a967907611d98 (diff) |
Filter early on buffer id and pass vector of entries in normalize_posting_lists
and foreach_posting_list EnumStoreDictionary member functions to limit number
of callbacks.
6 files changed, 231 insertions, 16 deletions
diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp index 9c25429932b..38cf8413e0f 100644 --- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp +++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp @@ -597,6 +597,7 @@ public: void update_posting_idx(EnumIndex enum_idx, EntryRef old_posting_idx, EntryRef new_posting_idx); EnumIndex insert_value(size_t value_idx); + void test_normalize_posting_lists(bool use_filter); static EntryRef fake_pidx() { return EntryRef(42); } }; @@ -620,6 +621,44 @@ EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::insert_value(size_t val return enum_idx; } +template <typename EnumStoreTypeAndDictionaryType> +void +EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_normalize_posting_lists(bool use_filter) +{ + auto value_0_idx = this->insert_value(0); + this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx()); + this->store.freeze_dictionary(); + auto& dict = this->store.get_dictionary(); + auto root = dict.get_frozen_root(); + auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root); + EXPECT_EQ(value_0_idx, find_result.first); + EXPECT_EQ(this->fake_pidx(), find_result.second); + std::vector<EntryRef> saved_refs; + if (use_filter) { + auto dummy = [](std::vector<EntryRef>&) noexcept { }; + auto save_refs_and_clear = [&saved_refs](std::vector<EntryRef>& refs) { saved_refs.insert(saved_refs.end(), refs.begin(), refs.end()); for (auto& ref : refs) { ref = EntryRef(); } }; + std::vector<bool> filter(2, true); + uint32_t entry_ref_offset_bits = 31; + EXPECT_FALSE(dict.normalize_posting_lists(dummy, filter, entry_ref_offset_bits)); + EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear, filter, entry_ref_offset_bits)); + EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear, filter, entry_ref_offset_bits)); + EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx() }), saved_refs); + } else { + auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; }; + auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); }; + EXPECT_FALSE(dict.normalize_posting_lists(dummy)); + EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear)); + EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear)); + EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs); + } + this->store.freeze_dictionary(); + root = dict.get_frozen_root(); + find_result = dict.find_posting_list(this->make_bound_comparator(0), root); + EXPECT_EQ(value_0_idx, find_result.first); + EXPECT_EQ(EntryRef(), find_result.second); +} + + // Disable warnings emitted by gtest generated files when using typed tests #pragma GCC diagnostic push #ifndef __clang__ @@ -678,26 +717,26 @@ TYPED_TEST(EnumStoreDictionaryTest, find_posting_list_works) TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_works) { + this->test_normalize_posting_lists(false); +} + +TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_filter_works) +{ + this->test_normalize_posting_lists(true); +} + +TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_filter_works) +{ + std::vector<bool> filter(2, true); + uint32_t entry_ref_offset_bits = 31; + std::vector<EntryRef> saved_refs; + auto save_refs = [&saved_refs](const std::vector<EntryRef>& refs) { saved_refs.insert(saved_refs.end(), refs.begin(), refs.end()); }; auto value_0_idx = this->insert_value(0); this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx()); this->store.freeze_dictionary(); auto& dict = this->store.get_dictionary(); - auto root = dict.get_frozen_root(); - auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root); - EXPECT_EQ(value_0_idx, find_result.first); - EXPECT_EQ(this->fake_pidx(), find_result.second); - auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; }; - std::vector<EntryRef> saved_refs; - auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); }; - EXPECT_FALSE(dict.normalize_posting_lists(dummy)); - EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear)); - EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear)); - EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs); - this->store.freeze_dictionary(); - root = dict.get_frozen_root(); - find_result = dict.find_posting_list(this->make_bound_comparator(0), root); - EXPECT_EQ(value_0_idx, find_result.first); - EXPECT_EQ(EntryRef(), find_result.second); + dict.foreach_posting_list(save_refs, filter, entry_ref_offset_bits); + EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx() }), saved_refs); } namespace { diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp index 6c929ad5981..f77b9426b32 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp @@ -311,6 +311,167 @@ EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists( } template <> +bool +EnumStoreDictionary<EnumTree>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)>, const std::vector<bool> &, uint32_t) +{ + LOG_ABORT("should not be reached"); +} + +namespace { + +template <typename HashDictionaryT> +class ChangeWriterBase +{ +protected: + HashDictionaryT* _hash_dict; + static constexpr bool has_hash_dictionary = true; + ChangeWriterBase() + : _hash_dict(nullptr) + { + } +public: + void set_hash_dict(HashDictionaryT &hash_dict) { _hash_dict = &hash_dict; } +}; + +template <> +class ChangeWriterBase<vespalib::datastore::NoHashDictionary> +{ +protected: + static constexpr bool has_hash_dictionary = false; + ChangeWriterBase() = default; +}; + +template <typename HashDictionaryT> +class ChangeWriter : public ChangeWriterBase<HashDictionaryT> { + using Parent = ChangeWriterBase<HashDictionaryT>; + using Parent::has_hash_dictionary; + std::vector<std::pair<EntryRef,uint32_t*>> _tree_refs; +public: + ChangeWriter(uint32_t capacity); + ~ChangeWriter(); + bool write(const std::vector<EntryRef>& refs); + void emplace_back(EntryRef key, uint32_t& tree_ref) { _tree_refs.emplace_back(std::make_pair(key, &tree_ref)); } +}; + +template <typename HashDictionaryT> +ChangeWriter<HashDictionaryT>::ChangeWriter(uint32_t capacity) + : ChangeWriterBase<HashDictionaryT>(), + _tree_refs() +{ + _tree_refs.reserve(capacity); +} + +template <typename HashDictionaryT> +ChangeWriter<HashDictionaryT>::~ChangeWriter() = default; + +template <typename HashDictionaryT> +bool +ChangeWriter<HashDictionaryT>::write(const std::vector<EntryRef> &refs) +{ + bool changed = false; + assert(refs.size() == _tree_refs.size()); + auto tree_ref = _tree_refs.begin(); + for (auto ref : refs) { + EntryRef old_ref(*tree_ref->second); + if (ref != old_ref) { + if (!changed) { + // Note: Needs review when porting to other platforms + // Assumes that other CPUs observes stores from this CPU in order + std::atomic_thread_fence(std::memory_order_release); + changed = true; + } + *tree_ref->second = ref.ref(); + if constexpr (has_hash_dictionary) { + auto find_result = this->_hash_dict->find(this->_hash_dict->get_default_comparator(), tree_ref->first); + assert(find_result != nullptr && find_result->first.load_relaxed() == tree_ref->first); + assert(find_result->second.load_relaxed() == old_ref); + find_result->second.store_release(ref); + } + } + ++tree_ref; + } + assert(tree_ref == _tree_refs.end()); + _tree_refs.clear(); + return changed; +} + +} + +template <typename BTreeDictionaryT, typename HashDictionaryT> +bool +EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) +{ + if constexpr (has_btree_dictionary) { + std::vector<EntryRef> refs; + refs.reserve(1024); + bool changed = false; + ChangeWriter<HashDictionaryT> change_writer(refs.capacity()); + if constexpr (has_hash_dictionary) { + change_writer.set_hash_dict(this->_hash_dict); + } + auto& dict = this->_btree_dict; + for (auto itr = dict.begin(); itr.valid(); ++itr) { + EntryRef ref(itr.getData()); + if (ref.valid()) { + uint32_t buffer_id = ref.buffer_id(entry_ref_offset_bits); + if (filter[buffer_id]) { + refs.emplace_back(ref); + change_writer.emplace_back(itr.getKey(), itr.getWData()); + if (refs.size() >= refs.capacity()) { + normalize(refs); + changed |= change_writer.write(refs); + refs.clear(); + } + } + } + } + if (!refs.empty()) { + normalize(refs); + changed |= change_writer.write(refs); + } + return changed; + } else { + return this->_hash_dict.normalize_values(normalize, filter, entry_ref_offset_bits); + } +} + +template <> +void +EnumStoreDictionary<EnumTree>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)>, const std::vector<bool>&, uint32_t) +{ + LOG_ABORT("should not be reached"); +} + +template <typename BTreeDictionaryT, typename HashDictionaryT> +void +EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) +{ + if constexpr (has_btree_dictionary) { + std::vector<EntryRef> refs; + refs.reserve(1024); + auto& dict = this->_btree_dict; + for (auto itr = dict.begin(); itr.valid(); ++itr) { + EntryRef ref(itr.getData()); + if (ref.valid()) { + uint32_t buffer_id = ref.buffer_id(entry_ref_offset_bits); + if (filter[buffer_id]) { + refs.emplace_back(ref); + if (refs.size() >= refs.capacity()) { + callback(refs); + refs.clear(); + } + } + } + } + if (!refs.empty()) { + callback(refs); + } + } else { + this->_hash_dict.foreach_value(callback, filter, entry_ref_offset_bits); + } +} + +template <> const EnumPostingTree & EnumStoreDictionary<EnumTree>::get_posting_dictionary() const { diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h index 4d0509c0eb1..a17ffd14864 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h @@ -54,6 +54,8 @@ public: void clear_all_posting_lists(std::function<void(EntryRef)> clearer) override; void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) override; bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) override; + bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) override; + void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) override; const EnumPostingTree& get_posting_dictionary() const override; }; diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h index a8cf6881b86..4a00c72d6ba 100644 --- a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h @@ -53,6 +53,8 @@ public: virtual void clear_all_posting_lists(std::function<void(EntryRef)> clearer) = 0; virtual void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) = 0; virtual bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) = 0; + virtual bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) = 0; + virtual void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) = 0; virtual const EnumPostingTree& get_posting_dictionary() const = 0; }; diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.h b/vespalib/src/vespa/vespalib/btree/btreeiterator.h index 325ce0e0e47..30123b1946e 100644 --- a/vespalib/src/vespa/vespalib/btree/btreeiterator.h +++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.h @@ -113,6 +113,9 @@ public: return _node->getData(_idx); } + // Only use during compaction when changing reference to moved value + DataType &getWData() { return getWNode()->getWData(_idx); } + bool valid() const { @@ -881,6 +884,9 @@ public: _leaf.getWNode()->writeData(_leaf.getIdx(), data); } + // Only use during compaction when changing reference to moved value + DataType &getWData() { return _leaf.getWData(); } + /** * Set a new key for the current iterator position. * The new key must have the same semantic meaning as the old key. diff --git a/vespalib/src/vespa/vespalib/btree/btreenode.h b/vespalib/src/vespa/vespalib/btree/btreenode.h index d8752d77f0b..468f17fcd1a 100644 --- a/vespalib/src/vespa/vespalib/btree/btreenode.h +++ b/vespalib/src/vespa/vespalib/btree/btreenode.h @@ -99,6 +99,8 @@ public: } const DataT &getData(uint32_t idx) const { return _data[idx]; } + // Only use during compaction when changing reference to moved value + DataT &getWData(uint32_t idx) { return _data[idx]; } void setData(uint32_t idx, const DataT &data) { _data[idx] = data; } static bool hasData() { return true; } }; @@ -120,6 +122,9 @@ public: return BTreeNoLeafData::_instance; } + // Only use during compaction when changing reference to moved value + BTreeNoLeafData &getWData(uint32_t) const { return BTreeNoLeafData::_instance; } + void setData(uint32_t idx, const BTreeNoLeafData &data) { (void) idx; (void) data; |