From 6bbbbd15f93adb6d8f640c8f9d1adaf1cff4fa77 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 3 Sep 2019 11:26:16 +0200 Subject: Add EnumStoreFoldedDictionary class, used when multiple entries in the dictionary might share a posting list. Ensure that valid posting list reference is kept at the first of these entries. --- .../tests/attribute/enumstore/enumstore_test.cpp | 4 +- .../searchlib/attribute/enum_store_dictionary.cpp | 91 ++++++++++++++-------- .../searchlib/attribute/enum_store_dictionary.h | 30 +++++-- .../src/vespa/searchlib/attribute/enumstore.cpp | 8 +- .../src/vespa/searchlib/attribute/enumstore.h | 9 +-- .../src/vespa/searchlib/attribute/enumstore.hpp | 78 ++----------------- .../src/vespa/searchlib/attribute/i_enum_store.h | 4 +- .../searchlib/attribute/i_enum_store_dictionary.h | 6 +- .../searchlib/attribute/postinglistattribute.cpp | 2 +- 9 files changed, 103 insertions(+), 129 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp index 60cad3e9c48..4d1d0b6fff5 100644 --- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp +++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp @@ -280,7 +280,7 @@ EnumStoreTest::testHoldListAndGeneration() ses.decRefCount(idx); EXPECT_EQUAL(0u, ses.getRefCount(idx)); } - ses.freeUnusedEnums(true); + ses.freeUnusedEnums(); // check readers again checkReaders(ses, sesGen, readers); @@ -304,7 +304,7 @@ void decRefCount(NumericEnumStore& store, NumericEnumStore::Index idx) { store.decRefCount(idx); - store.freeUnusedEnums(false); + store.freeUnusedEnums(); generation_t gen = 5; store.transferHoldLists(gen); store.trimHoldLists(gen + 1); diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp index 24c0eeabd2c..a080c91cb34 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.cpp @@ -14,6 +14,10 @@ #include LOG_SETUP(".searchlib.attribute.enum_store_dictionary"); +using search::datastore::EntryComparator; +using search::datastore::EntryRef; +using search::datastore::UniqueStoreAddResult; + namespace search { using btree::BTreeNode; @@ -76,44 +80,19 @@ EnumStoreDictionary::fixupRefCounts(const EnumVector& hist) template void EnumStoreDictionary::removeUnusedEnums(const IndexSet& unused, - const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) + const datastore::EntryComparator& cmp) { - using Iterator = typename DictionaryT::Iterator; if (unused.empty()) { return; } - Iterator it(BTreeNode::Ref(), this->_dict.getAllocator()); - for (const auto& idx : unused) { - it.lower_bound(this->_dict.getRoot(), idx, cmp); - assert(it.valid() && !cmp(idx, it.getKey())); - if (Iterator::hasData() && fcmp != nullptr) { - typename DictionaryT::DataType pidx(it.getData()); - this->_dict.remove(it); - if (!it.valid() || (*fcmp)(idx, it.getKey())) { - continue; // Next entry does not use same posting list - } - --it; - if (it.valid() && !(*fcmp)(it.getKey(), idx)) { - continue; // Previous entry uses same posting list - } - if (it.valid()) { - ++it; - } else { - it.begin(); - } - this->_dict.thaw(it); - it.writeData(pidx); - } else { - this->_dict.remove(it); - } - } + for (const auto& ref : unused) { + this->remove(cmp, ref); + } } template void -EnumStoreDictionary::freeUnusedEnums(const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) +EnumStoreDictionary::freeUnusedEnums(const datastore::EntryComparator& cmp) { IndexSet unused; @@ -121,20 +100,19 @@ EnumStoreDictionary::freeUnusedEnums(const datastore::EntryComparat for (auto iter = this->_dict.begin(); iter.valid(); ++iter) { _enumStore.freeUnusedEnum(iter.getKey(), unused); } - removeUnusedEnums(unused, cmp, fcmp); + removeUnusedEnums(unused, cmp); } template void EnumStoreDictionary::freeUnusedEnums(const IndexSet& toRemove, - const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) + const datastore::EntryComparator& cmp) { IndexSet unused; for (const auto& index : toRemove) { _enumStore.freeUnusedEnum(index, unused); } - removeUnusedEnums(unused, cmp, fcmp); + removeUnusedEnums(unused, cmp); } template @@ -218,6 +196,51 @@ EnumStoreDictionary::hasData() const return DictionaryT::LeafNodeType::hasData(); } +EnumStoreFoldedDictionary::EnumStoreFoldedDictionary(IEnumStore& enumStore, std::unique_ptr folded_compare) + : EnumStoreDictionary(enumStore), + _folded_compare(std::move(folded_compare)) +{ +} + +EnumStoreFoldedDictionary::~EnumStoreFoldedDictionary() = default; + +UniqueStoreAddResult +EnumStoreFoldedDictionary::add(const EntryComparator &comp, std::function insertEntry) +{ + auto it = _dict.lowerBound(EntryRef(), comp); + if (it.valid() && !comp(EntryRef(), it.getKey())) { + // Entry already exists + return UniqueStoreAddResult(it.getKey(), false); + } + EntryRef newRef = insertEntry(); + _dict.insert(it, newRef, EntryRef()); + // Maybe move posting list reference from next entry + ++it; + if (it.valid() && it.getData().valid() && !(*_folded_compare)(newRef, it.getKey())) { + EntryRef posting_list_ref(it.getData()); + _dict.thaw(it); + it.writeData(EntryRef()); + --it; + assert(it.valid() && it.getKey() == newRef); + it.writeData(posting_list_ref); + } + return UniqueStoreAddResult(newRef, true); +} + +void +EnumStoreFoldedDictionary::remove(const EntryComparator &comp, EntryRef ref) +{ + assert(ref.valid()); + auto it = _dict.lowerBound(ref, comp); + assert(it.valid() && it.getKey() == ref); + EntryRef posting_list_ref(it.getData()); + _dict.remove(it); + // Maybe copy posting list reference to next entry + if (posting_list_ref.valid() && it.valid() && !it.getData().valid() && !(*_folded_compare)(ref, it.getKey())) { + this->_dict.thaw(it); + it.writeData(posting_list_ref); + } +} template class EnumStoreDictionary; diff --git a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h index 31f52e4e965..38a7d6b1985 100644 --- a/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/enum_store_dictionary.h @@ -38,15 +38,12 @@ public: void fixupRefCounts(const EnumVector& hist) override; void removeUnusedEnums(const IndexSet& unused, - const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp); + const datastore::EntryComparator& cmp); - void freeUnusedEnums(const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) override; + void freeUnusedEnums(const datastore::EntryComparator& cmp) override; void freeUnusedEnums(const IndexSet& toRemove, - const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) override; + const datastore::EntryComparator& cmp) override; bool findIndex(const datastore::EntryComparator& cmp, Index& idx) const override; bool findFrozenIndex(const datastore::EntryComparator& cmp, Index& idx) const override; @@ -62,6 +59,27 @@ public: bool hasData() const override; }; +/** + * Concrete dictionary for an enum store that extends the + * functionality of a unique store dictionary. + * + * Special handling of value (posting list reference) is added to + * ensure that entries with same folded key share a posting list + * (e.g. case insensitive search) and posting list reference is found + * for the first of these entries. + */ +class EnumStoreFoldedDictionary : public EnumStoreDictionary +{ +private: + std::unique_ptr _folded_compare; + +public: + EnumStoreFoldedDictionary(IEnumStore& enumStore, std::unique_ptr folded_compare); + ~EnumStoreFoldedDictionary() override; + datastore::UniqueStoreAddResult add(const datastore::EntryComparator& comp, std::function insertEntry) override; + void remove(const datastore::EntryComparator& comp, datastore::EntryRef ref) override; +}; + extern template class btree::BTreeNodeT; diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.cpp b/searchlib/src/vespa/searchlib/attribute/enumstore.cpp index 7ce65193c40..1d5b4348af8 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.cpp @@ -47,10 +47,14 @@ EnumStoreT::deserialize(const void* src, } std::unique_ptr -make_enum_store_dictionary(IEnumStore &store, bool has_postings) +make_enum_store_dictionary(IEnumStore &store, bool has_postings, std::unique_ptr folded_compare) { if (has_postings) { - return std::make_unique>(store); + if (folded_compare) { + return std::make_unique(store, std::move(folded_compare)); + } else { + return std::make_unique>(store); + } } else { return std::make_unique>(store); } diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.h b/searchlib/src/vespa/searchlib/attribute/enumstore.h index 94252239975..c422cf14cda 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.h +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.h @@ -243,20 +243,15 @@ public: std::vector findFoldedEnums(DataType value) const; void addEnum(DataType value, Index &newIdx); bool findIndex(DataType value, Index &idx) const; - void freeUnusedEnums(bool movePostingidx) override; + void freeUnusedEnums() override; void freeUnusedEnums(const IndexSet& toRemove); vespalib::MemoryUsage update_stat() override; std::unique_ptr consider_compact(const CompactionStrategy& compaction_strategy) override; std::unique_ptr compact_worst(bool compact_memory, bool compact_address_space) override; - -private: - template - void addEnum(DataType value, Index& newIdx, Dictionary& dict); - }; std::unique_ptr -make_enum_store_dictionary(IEnumStore &store, bool has_postings); +make_enum_store_dictionary(IEnumStore &store, bool has_postings, std::unique_ptr folded_compare); vespalib::asciistream & operator << (vespalib::asciistream & os, const IEnumStore::Index & idx); diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp index 574712798c2..e107b6da787 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp @@ -34,7 +34,7 @@ void EnumStoreT::freeUnusedEnum(Index idx, IndexSet& unused) template EnumStoreT::EnumStoreT(bool has_postings) - : _store(make_enum_store_dictionary(*this, has_postings)), + : _store(make_enum_store_dictionary(*this, has_postings, EntryType::hasFold() ? std::make_unique(*this) : std::unique_ptr())), _dict(static_cast(_store.get_dictionary())), _cached_values_memory_usage(), _cached_values_address_space_usage(0, 0, (1ull << 32)) @@ -169,15 +169,10 @@ EnumStoreT::findIndex(DataType value, Index &idx) const template void -EnumStoreT::freeUnusedEnums(bool movePostingIdx) +EnumStoreT::freeUnusedEnums() { ComparatorType cmp(*this); - if (EntryType::hasFold() && movePostingIdx) { - FoldedComparatorType fcmp(*this); - _dict.freeUnusedEnums(cmp, &fcmp); - } else { - _dict.freeUnusedEnums(cmp, nullptr); - } + _dict.freeUnusedEnums(cmp); } template @@ -185,75 +180,16 @@ void EnumStoreT::freeUnusedEnums(const IndexSet& toRemove) { ComparatorType cmp(*this); - if (EntryType::hasFold()) { - FoldedComparatorType fcmp(*this); - _dict.freeUnusedEnums(toRemove, cmp, &fcmp); - } else { - _dict.freeUnusedEnums(toRemove, cmp, nullptr); - } -} - -template -template -void -EnumStoreT::addEnum(DataType value, Index& newIdx, Dictionary& dict) -{ - typedef typename Dictionary::Iterator DictionaryIterator; - - // check if already present - ComparatorType cmp(*this, value); - DictionaryIterator it(btree::BTreeNode::Ref(), dict.getAllocator()); - it.lower_bound(dict.getRoot(), Index(), cmp); - if (it.valid() && !cmp(Index(), it.getKey())) { - newIdx = it.getKey(); - return; - } - - newIdx = _store.get_allocator().allocate(value); - - // TODO: Move this logic to "add/insert" on the dictionary - // update tree with new index - dict.insert(it, newIdx, typename Dictionary::DataType()); - - // Copy posting list idx from next entry if same folded value. - // Only for string posting list attributes, i.e. dictionary has - // data and entry type has folded compare. - if (DictionaryIterator::hasData() && EntryType::hasFold()) { - FoldedComparatorType foldCmp(*this); - ++it; - if (!it.valid() || foldCmp(newIdx, it.getKey())) { - return; // Next entry does not use same posting list - } - --it; - --it; - if (it.valid() && !foldCmp(it.getKey(), newIdx)) { - return; // Previous entry uses same posting list - } - if (it.valid()) { - ++it; - } else { - it.begin(); - } - assert(it.valid() && it.getKey() == newIdx); - ++it; - typename Dictionary::DataType pidx(it.getData()); - dict.thaw(it); - it.writeData(typename Dictionary::DataType()); - --it; - assert(it.valid() && it.getKey() == newIdx); - it.writeData(pidx); - } + _dict.freeUnusedEnums(toRemove, cmp); } template void EnumStoreT::addEnum(DataType value, Index& newIdx) { - if (_dict.hasData()) { - addEnum(value, newIdx, static_cast &>(_dict).getDictionary()); - } else { - addEnum(value, newIdx, static_cast &>(_dict).getDictionary()); - } + ComparatorType cmp(*this, value); + auto add_result = _dict.add(cmp, [this, &value]() -> EntryRef { return _store.get_allocator().allocate(value); }); + newIdx = add_result.ref(); } template diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store.h index 2a9842075e6..8a9058d8766 100644 --- a/searchlib/src/vespa/searchlib/attribute/i_enum_store.h +++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store.h @@ -51,7 +51,7 @@ public: virtual ssize_t deserialize0(const void* src, size_t available, IndexVector& idx) = 0; virtual void fixupRefCount(Index idx, uint32_t refCount) = 0; virtual void freeUnusedEnum(Index idx, IndexSet& unused) = 0; - virtual void freeUnusedEnums(bool movePostingIdx) = 0; + virtual void freeUnusedEnums() = 0; virtual bool foldedChange(const Index& idx1, const Index& idx2) = 0; virtual IEnumStoreDictionary& getEnumStoreDict() = 0; virtual const IEnumStoreDictionary& getEnumStoreDict() const = 0; @@ -94,7 +94,7 @@ public: fixupRefCount(ti.getKey(), *hi); } assert(!ti.valid()); - freeUnusedEnums(false); + freeUnusedEnums(); } }; diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h index a4c14256835..5c2510346c8 100644 --- a/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h +++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store_dictionary.h @@ -42,11 +42,9 @@ public: virtual ssize_t deserialize(const void* src, size_t available, IndexVector& idx) = 0; virtual void fixupRefCounts(const EnumVector& hist) = 0; - virtual void freeUnusedEnums(const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) = 0; + virtual void freeUnusedEnums(const datastore::EntryComparator& cmp) = 0; virtual void freeUnusedEnums(const IndexSet& toRemove, - const datastore::EntryComparator& cmp, - const datastore::EntryComparator* fcmp) = 0; + const datastore::EntryComparator& cmp) = 0; virtual bool findIndex(const datastore::EntryComparator& cmp, Index& idx) const = 0; virtual bool findFrozenIndex(const datastore::EntryComparator& cmp, Index& idx) const = 0; virtual std::vector diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistattribute.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistattribute.cpp index c8ee6ab50b2..e6e390ef3a9 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistattribute.cpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistattribute.cpp @@ -108,7 +108,7 @@ PostingListAttributeBase

::fillPostingsFixupEnumBase(const LoadedEnumAttribute &postings._removals[0], &postings._removals[0] + postings._removals.size()); posting_itr.writeData(newIndex); - enumStore.freeUnusedEnums(false); + enumStore.freeUnusedEnums(); } template -- cgit v1.2.3