aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2021-12-03 18:55:45 +0100
committerTor Egge <Tor.Egge@online.no>2021-12-03 18:55:45 +0100
commit472cb247d6d4d5e8cd58d537f58ce9515ef79869 (patch)
treeabeba8051fbcca0f4a4a700d8f51b03e18d31b38
parent42e3653b19f95847d14b9b90478a967907611d98 (diff)
Filter early on buffer id and pass vector of entries in normalize_posting_lists
and foreach_posting_list EnumStoreDictionary member functions to limit number of callbacks.
-rw-r--r--searchlib/src/tests/attribute/enumstore/enumstore_test.cpp71
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp161
-rw-r--r--searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h2
-rw-r--r--vespalib/src/vespa/vespalib/btree/btreeiterator.h6
-rw-r--r--vespalib/src/vespa/vespalib/btree/btreenode.h5
6 files changed, 231 insertions, 16 deletions
diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
index 9c25429932b..38cf8413e0f 100644
--- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
+++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp
@@ -597,6 +597,7 @@ public:
void update_posting_idx(EnumIndex enum_idx, EntryRef old_posting_idx, EntryRef new_posting_idx);
EnumIndex insert_value(size_t value_idx);
+ void test_normalize_posting_lists(bool use_filter);
static EntryRef fake_pidx() { return EntryRef(42); }
};
@@ -620,6 +621,44 @@ EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::insert_value(size_t val
return enum_idx;
}
+template <typename EnumStoreTypeAndDictionaryType>
+void
+EnumStoreDictionaryTest<EnumStoreTypeAndDictionaryType>::test_normalize_posting_lists(bool use_filter)
+{
+ auto value_0_idx = this->insert_value(0);
+ this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx());
+ this->store.freeze_dictionary();
+ auto& dict = this->store.get_dictionary();
+ auto root = dict.get_frozen_root();
+ auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
+ EXPECT_EQ(value_0_idx, find_result.first);
+ EXPECT_EQ(this->fake_pidx(), find_result.second);
+ std::vector<EntryRef> saved_refs;
+ if (use_filter) {
+ auto dummy = [](std::vector<EntryRef>&) noexcept { };
+ auto save_refs_and_clear = [&saved_refs](std::vector<EntryRef>& refs) { saved_refs.insert(saved_refs.end(), refs.begin(), refs.end()); for (auto& ref : refs) { ref = EntryRef(); } };
+ std::vector<bool> filter(2, true);
+ uint32_t entry_ref_offset_bits = 31;
+ EXPECT_FALSE(dict.normalize_posting_lists(dummy, filter, entry_ref_offset_bits));
+ EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear, filter, entry_ref_offset_bits));
+ EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear, filter, entry_ref_offset_bits));
+ EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx() }), saved_refs);
+ } else {
+ auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; };
+ auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); };
+ EXPECT_FALSE(dict.normalize_posting_lists(dummy));
+ EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear));
+ EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear));
+ EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs);
+ }
+ this->store.freeze_dictionary();
+ root = dict.get_frozen_root();
+ find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
+ EXPECT_EQ(value_0_idx, find_result.first);
+ EXPECT_EQ(EntryRef(), find_result.second);
+}
+
+
// Disable warnings emitted by gtest generated files when using typed tests
#pragma GCC diagnostic push
#ifndef __clang__
@@ -678,26 +717,26 @@ TYPED_TEST(EnumStoreDictionaryTest, find_posting_list_works)
TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_works)
{
+ this->test_normalize_posting_lists(false);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, normalize_posting_lists_with_filter_works)
+{
+ this->test_normalize_posting_lists(true);
+}
+
+TYPED_TEST(EnumStoreDictionaryTest, foreach_posting_list_with_filter_works)
+{
+ std::vector<bool> filter(2, true);
+ uint32_t entry_ref_offset_bits = 31;
+ std::vector<EntryRef> saved_refs;
+ auto save_refs = [&saved_refs](const std::vector<EntryRef>& refs) { saved_refs.insert(saved_refs.end(), refs.begin(), refs.end()); };
auto value_0_idx = this->insert_value(0);
this->update_posting_idx(value_0_idx, EntryRef(), this->fake_pidx());
this->store.freeze_dictionary();
auto& dict = this->store.get_dictionary();
- auto root = dict.get_frozen_root();
- auto find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
- EXPECT_EQ(value_0_idx, find_result.first);
- EXPECT_EQ(this->fake_pidx(), find_result.second);
- auto dummy = [](EntryRef posting_idx) noexcept { return posting_idx; };
- std::vector<EntryRef> saved_refs;
- auto save_refs_and_clear = [&saved_refs](EntryRef posting_idx) { saved_refs.push_back(posting_idx); return EntryRef(); };
- EXPECT_FALSE(dict.normalize_posting_lists(dummy));
- EXPECT_TRUE(dict.normalize_posting_lists(save_refs_and_clear));
- EXPECT_FALSE(dict.normalize_posting_lists(save_refs_and_clear));
- EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx(), EntryRef() }), saved_refs);
- this->store.freeze_dictionary();
- root = dict.get_frozen_root();
- find_result = dict.find_posting_list(this->make_bound_comparator(0), root);
- EXPECT_EQ(value_0_idx, find_result.first);
- EXPECT_EQ(EntryRef(), find_result.second);
+ dict.foreach_posting_list(save_refs, filter, entry_ref_offset_bits);
+ EXPECT_EQ((std::vector<EntryRef>{ this->fake_pidx() }), saved_refs);
}
namespace {
diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
index 6c929ad5981..f77b9426b32 100644
--- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp
@@ -311,6 +311,167 @@ EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(
}
template <>
+bool
+EnumStoreDictionary<EnumTree>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)>, const std::vector<bool> &, uint32_t)
+{
+ LOG_ABORT("should not be reached");
+}
+
+namespace {
+
+template <typename HashDictionaryT>
+class ChangeWriterBase
+{
+protected:
+ HashDictionaryT* _hash_dict;
+ static constexpr bool has_hash_dictionary = true;
+ ChangeWriterBase()
+ : _hash_dict(nullptr)
+ {
+ }
+public:
+ void set_hash_dict(HashDictionaryT &hash_dict) { _hash_dict = &hash_dict; }
+};
+
+template <>
+class ChangeWriterBase<vespalib::datastore::NoHashDictionary>
+{
+protected:
+ static constexpr bool has_hash_dictionary = false;
+ ChangeWriterBase() = default;
+};
+
+template <typename HashDictionaryT>
+class ChangeWriter : public ChangeWriterBase<HashDictionaryT> {
+ using Parent = ChangeWriterBase<HashDictionaryT>;
+ using Parent::has_hash_dictionary;
+ std::vector<std::pair<EntryRef,uint32_t*>> _tree_refs;
+public:
+ ChangeWriter(uint32_t capacity);
+ ~ChangeWriter();
+ bool write(const std::vector<EntryRef>& refs);
+ void emplace_back(EntryRef key, uint32_t& tree_ref) { _tree_refs.emplace_back(std::make_pair(key, &tree_ref)); }
+};
+
+template <typename HashDictionaryT>
+ChangeWriter<HashDictionaryT>::ChangeWriter(uint32_t capacity)
+ : ChangeWriterBase<HashDictionaryT>(),
+ _tree_refs()
+{
+ _tree_refs.reserve(capacity);
+}
+
+template <typename HashDictionaryT>
+ChangeWriter<HashDictionaryT>::~ChangeWriter() = default;
+
+template <typename HashDictionaryT>
+bool
+ChangeWriter<HashDictionaryT>::write(const std::vector<EntryRef> &refs)
+{
+ bool changed = false;
+ assert(refs.size() == _tree_refs.size());
+ auto tree_ref = _tree_refs.begin();
+ for (auto ref : refs) {
+ EntryRef old_ref(*tree_ref->second);
+ if (ref != old_ref) {
+ if (!changed) {
+ // Note: Needs review when porting to other platforms
+ // Assumes that other CPUs observes stores from this CPU in order
+ std::atomic_thread_fence(std::memory_order_release);
+ changed = true;
+ }
+ *tree_ref->second = ref.ref();
+ if constexpr (has_hash_dictionary) {
+ auto find_result = this->_hash_dict->find(this->_hash_dict->get_default_comparator(), tree_ref->first);
+ assert(find_result != nullptr && find_result->first.load_relaxed() == tree_ref->first);
+ assert(find_result->second.load_relaxed() == old_ref);
+ find_result->second.store_release(ref);
+ }
+ }
+ ++tree_ref;
+ }
+ assert(tree_ref == _tree_refs.end());
+ _tree_refs.clear();
+ return changed;
+}
+
+}
+
+template <typename BTreeDictionaryT, typename HashDictionaryT>
+bool
+EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits)
+{
+ if constexpr (has_btree_dictionary) {
+ std::vector<EntryRef> refs;
+ refs.reserve(1024);
+ bool changed = false;
+ ChangeWriter<HashDictionaryT> change_writer(refs.capacity());
+ if constexpr (has_hash_dictionary) {
+ change_writer.set_hash_dict(this->_hash_dict);
+ }
+ auto& dict = this->_btree_dict;
+ for (auto itr = dict.begin(); itr.valid(); ++itr) {
+ EntryRef ref(itr.getData());
+ if (ref.valid()) {
+ uint32_t buffer_id = ref.buffer_id(entry_ref_offset_bits);
+ if (filter[buffer_id]) {
+ refs.emplace_back(ref);
+ change_writer.emplace_back(itr.getKey(), itr.getWData());
+ if (refs.size() >= refs.capacity()) {
+ normalize(refs);
+ changed |= change_writer.write(refs);
+ refs.clear();
+ }
+ }
+ }
+ }
+ if (!refs.empty()) {
+ normalize(refs);
+ changed |= change_writer.write(refs);
+ }
+ return changed;
+ } else {
+ return this->_hash_dict.normalize_values(normalize, filter, entry_ref_offset_bits);
+ }
+}
+
+template <>
+void
+EnumStoreDictionary<EnumTree>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)>, const std::vector<bool>&, uint32_t)
+{
+ LOG_ABORT("should not be reached");
+}
+
+template <typename BTreeDictionaryT, typename HashDictionaryT>
+void
+EnumStoreDictionary<BTreeDictionaryT, HashDictionaryT>::foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits)
+{
+ if constexpr (has_btree_dictionary) {
+ std::vector<EntryRef> refs;
+ refs.reserve(1024);
+ auto& dict = this->_btree_dict;
+ for (auto itr = dict.begin(); itr.valid(); ++itr) {
+ EntryRef ref(itr.getData());
+ if (ref.valid()) {
+ uint32_t buffer_id = ref.buffer_id(entry_ref_offset_bits);
+ if (filter[buffer_id]) {
+ refs.emplace_back(ref);
+ if (refs.size() >= refs.capacity()) {
+ callback(refs);
+ refs.clear();
+ }
+ }
+ }
+ }
+ if (!refs.empty()) {
+ callback(refs);
+ }
+ } else {
+ this->_hash_dict.foreach_value(callback, filter, entry_ref_offset_bits);
+ }
+}
+
+template <>
const EnumPostingTree &
EnumStoreDictionary<EnumTree>::get_posting_dictionary() const
{
diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
index 4d0509c0eb1..a17ffd14864 100644
--- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
+++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h
@@ -54,6 +54,8 @@ public:
void clear_all_posting_lists(std::function<void(EntryRef)> clearer) override;
void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) override;
bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) override;
+ bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) override;
+ void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) override;
const EnumPostingTree& get_posting_dictionary() const override;
};
diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
index a8cf6881b86..4a00c72d6ba 100644
--- a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
+++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h
@@ -53,6 +53,8 @@ public:
virtual void clear_all_posting_lists(std::function<void(EntryRef)> clearer) = 0;
virtual void update_posting_list(Index idx, const EntryComparator& cmp, std::function<EntryRef(EntryRef)> updater) = 0;
virtual bool normalize_posting_lists(std::function<EntryRef(EntryRef)> normalize) = 0;
+ virtual bool normalize_posting_lists(std::function<void(std::vector<EntryRef>&)> normalize, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) = 0;
+ virtual void foreach_posting_list(std::function<void(const std::vector<EntryRef>&)> callback, const std::vector<bool>& filter, uint32_t entry_ref_offset_bits) = 0;
virtual const EnumPostingTree& get_posting_dictionary() const = 0;
};
diff --git a/vespalib/src/vespa/vespalib/btree/btreeiterator.h b/vespalib/src/vespa/vespalib/btree/btreeiterator.h
index 325ce0e0e47..30123b1946e 100644
--- a/vespalib/src/vespa/vespalib/btree/btreeiterator.h
+++ b/vespalib/src/vespa/vespalib/btree/btreeiterator.h
@@ -113,6 +113,9 @@ public:
return _node->getData(_idx);
}
+ // Only use during compaction when changing reference to moved value
+ DataType &getWData() { return getWNode()->getWData(_idx); }
+
bool
valid() const
{
@@ -881,6 +884,9 @@ public:
_leaf.getWNode()->writeData(_leaf.getIdx(), data);
}
+ // Only use during compaction when changing reference to moved value
+ DataType &getWData() { return _leaf.getWData(); }
+
/**
* Set a new key for the current iterator position.
* The new key must have the same semantic meaning as the old key.
diff --git a/vespalib/src/vespa/vespalib/btree/btreenode.h b/vespalib/src/vespa/vespalib/btree/btreenode.h
index d8752d77f0b..468f17fcd1a 100644
--- a/vespalib/src/vespa/vespalib/btree/btreenode.h
+++ b/vespalib/src/vespa/vespalib/btree/btreenode.h
@@ -99,6 +99,8 @@ public:
}
const DataT &getData(uint32_t idx) const { return _data[idx]; }
+ // Only use during compaction when changing reference to moved value
+ DataT &getWData(uint32_t idx) { return _data[idx]; }
void setData(uint32_t idx, const DataT &data) { _data[idx] = data; }
static bool hasData() { return true; }
};
@@ -120,6 +122,9 @@ public:
return BTreeNoLeafData::_instance;
}
+ // Only use during compaction when changing reference to moved value
+ BTreeNoLeafData &getWData(uint32_t) const { return BTreeNoLeafData::_instance; }
+
void setData(uint32_t idx, const BTreeNoLeafData &data) {
(void) idx;
(void) data;