diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-10-24 09:05:08 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-10-24 13:47:11 +0000 |
commit | 89c90492cacf3ae7b359fe9127fc5c197545a762 (patch) | |
tree | f486ce241f0d4d8a1a7433fb6e202d591b3051dc | |
parent | 24a59a6e18a0090211e294d918226421e18a4177 (diff) |
Make multi value attribute inserts O(n) instead of O(n^2)
Previous code had O(n^2) complexity for
* Weighted set appends, removes and weight updates
* Array element removes (in-place erasing)
New code moves all weighted set operations to a temporary
hash map from key to weight, building a value vector at
the very end. For arrays it defers all removals to the
very end, amortizing cost down to linear time by tracking
element removal times and avoiding push_backs instead of
performing explicit element erasures.
This fixes #11069
3 files changed, 129 insertions, 62 deletions
diff --git a/searchcore/src/tests/proton/docsummary/docsummary.cpp b/searchcore/src/tests/proton/docsummary/docsummary.cpp index 0e521e473ae..26abd6be914 100644 --- a/searchcore/src/tests/proton/docsummary/docsummary.cpp +++ b/searchcore/src/tests/proton/docsummary/docsummary.cpp @@ -765,12 +765,14 @@ Test::requireThatAttributesAreUsed() EXPECT_EQUAL(2u, rep->docsums.size()); + // FIXME the expected output ordering of weighted set fields is currently inherently linked + // to the internal ordering of such attributes. Should be decoupled, as this is very fragile. EXPECT_TRUE(assertSlime("{ba:10,bb:10.1," "bc:'foo'," "bd:[20,30]," "be:[20.2,30.3]," "bf:['bar','baz']," - "bg:[{item:40,weight:2},{item:50,weight:3}]," + "bg:[{item:50,weight:3},{item:40,weight:2}]," "bh:[{item:40.4,weight:4},{item:50.5,weight:5}]," "bi:[{item:'quux',weight:7},{item:'qux',weight:6}]," "bj:'0x01020178017901016601674008000000000000'}", *rep, 0, true)); diff --git a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h index ebcbd6853cb..66ca6bd2eac 100644 --- a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h +++ b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h @@ -66,6 +66,9 @@ private: uint64_t getTotalValueCount() const override; + void apply_attribute_changes_to_array(DocumentValues& docValues); + void apply_attribute_changes_to_wset(DocumentValues& docValues); + public: void clearDocs(DocId lidLow, DocId lidLimit) override; void onShrinkLidSpace() override ; diff --git a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp index 2e16bf13b95..f07efcafa9a 100644 --- a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp @@ -3,6 +3,8 @@ #pragma once #include <vespa/searchlib/attribute/multivalueattribute.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vespa/vespalib/stllike/hash_map.hpp> namespace search { @@ -27,9 +29,7 @@ MultiValueAttribute(const vespalib::string &baseFileName, } template <typename B, typename M> -MultiValueAttribute<B, M>::~MultiValueAttribute() -{ -} +MultiValueAttribute<B, M>::~MultiValueAttribute() = default; template <typename B, typename M> int32_t MultiValueAttribute<B, M>::getWeight(DocId doc, uint32_t idx) const @@ -38,100 +38,162 @@ int32_t MultiValueAttribute<B, M>::getWeight(DocId doc, uint32_t idx) const return ((idx < values.size()) ? values[idx].weight() : 1); } +namespace { + +template <typename T> +struct HashFn { + using type = vespalib::hash<T>; +}; + +template <> +struct HashFn<search::datastore::EntryRef> { + struct EntryRefHasher { + size_t operator() (const search::datastore::EntryRef& v) const noexcept { + return v.ref(); + } + }; + using type = EntryRefHasher; +}; + +} template <typename B, typename M> void MultiValueAttribute<B, M>::applyAttributeChanges(DocumentValues & docValues) { + if (this->hasArrayType()) { + apply_attribute_changes_to_array(docValues); + return; + } else if (this->hasWeightedSetType()) { + apply_attribute_changes_to_wset(docValues); + return; + } +} + +template <typename B, typename M> +void +MultiValueAttribute<B, M>::apply_attribute_changes_to_array(DocumentValues& docValues) +{ // compute new values for each document with changes for (ChangeVectorIterator current(this->_changes.begin()), end(this->_changes.end()); (current != end); ) { DocId doc = current->_doc; - - MultiValueArrayRef oldValues(_mvMapping.get(doc)); - ValueVector newValues(oldValues.cbegin(), oldValues.cend()); - // find last clear doc - ChangeVectorIterator lastClearDoc = end; + ChangeVectorIterator last_clear_doc = end; for (ChangeVectorIterator iter = current; (iter != end) && (iter->_doc == doc); ++iter) { if (iter->_type == ChangeBase::CLEARDOC) { - lastClearDoc = iter; + last_clear_doc = iter; } } - // use last clear doc if found - if (lastClearDoc != end) { - current = lastClearDoc; + if (last_clear_doc != end) { + current = last_clear_doc; } + MultiValueArrayRef old_values(_mvMapping.get(doc)); + ValueVector new_values(old_values.cbegin(), old_values.cend()); + vespalib::hash_map<ValueType, size_t, typename HashFn<ValueType>::type> tombstones; // iterate through all changes for this document for (; (current != end) && (current->_doc == doc); ++current) { - if (current->_type == ChangeBase::CLEARDOC) { - newValues.clear(); + new_values.clear(); + tombstones.clear(); continue; } - ValueType data; bool hasData = extractChangeData(*current, data); - + if (!hasData) { + continue; + } if (current->_type == ChangeBase::APPEND) { - if (hasData) { - if (this->hasArrayType()) { - newValues.push_back(MultiValueType(data, current->_weight)); - } else if (this->hasWeightedSetType()) { - ValueVectorIterator witer; - for (witer = newValues.begin(); witer != newValues.end(); ++witer) { - if (witer->value() == data) { - break; - } - } - if (witer != newValues.end()) { - witer->setWeight(current->_weight); - } else { - newValues.push_back(MultiValueType(data, current->_weight)); - } - } - } + new_values.emplace_back(data, current->_weight); } else if (current->_type == ChangeBase::REMOVE) { - if (hasData) { - for (ValueVectorIterator witer = newValues.begin(); witer != newValues.end(); ) { - if (witer->value() == data) { - witer = newValues.erase(witer); - } else { - ++witer; - } - } + // Defer all removals to the very end by tracking when, during value vector build time, + // a removal was encountered for a particular value. All values < this index will be ignored. + tombstones[data] = new_values.size(); + } + } + // Optimize for the case where nothing was explicitly removed. + if (!tombstones.empty()) { + ValueVector culled; + culled.reserve(new_values.size()); + for (size_t i = 0; i < new_values.size(); ++i) { + auto iter = tombstones.find(new_values[i].value()); + if (iter == tombstones.end() || (iter->second <= i)) { + culled.emplace_back(new_values[i]); } + } + culled.swap(new_values); + } + this->checkSetMaxValueCount(new_values.size()); + docValues.emplace_back(doc, std::move(new_values)); + } +} + +template <typename B, typename M> +void +MultiValueAttribute<B, M>::apply_attribute_changes_to_wset(DocumentValues& docValues) +{ + // compute new values for each document with changes + for (ChangeVectorIterator current(this->_changes.begin()), end(this->_changes.end()); (current != end); ) { + const DocId doc = current->_doc; + // find last clear doc + ChangeVectorIterator last_clear_doc = end; + size_t max_elems_inserted = 0; + for (ChangeVectorIterator iter = current; (iter != end) && (iter->_doc == doc); ++iter) { + if (iter->_type == ChangeBase::CLEARDOC) { + last_clear_doc = iter; + } + ++max_elems_inserted; + } + // use last clear doc if found + if (last_clear_doc != end) { + current = last_clear_doc; + } + MultiValueArrayRef old_values(_mvMapping.get(doc)); + vespalib::hash_map<ValueType, int32_t, typename HashFn<ValueType>::type> wset_inserted; + wset_inserted.resize((old_values.size() + max_elems_inserted) * 2); + for (const auto& e : old_values) { + wset_inserted[e.value()] = e.weight(); + } + // iterate through all changes for this document + for (; (current != end) && (current->_doc == doc); ++current) { + if (current->_type == ChangeBase::CLEARDOC) { + wset_inserted.clear(); + continue; + } + ValueType data; + bool hasData = extractChangeData(*current, data); + if (!hasData) { + continue; + } + if (current->_type == ChangeBase::APPEND) { + wset_inserted[data] = current->_weight; + } else if (current->_type == ChangeBase::REMOVE) { + wset_inserted.erase(data); } else if ((current->_type >= ChangeBase::INCREASEWEIGHT) && (current->_type <= ChangeBase::DIVWEIGHT)) { - if (this->hasWeightedSetType() && hasData) { - ValueVectorIterator witer; - for (witer = newValues.begin(); witer != newValues.end(); ++witer) { - if (witer->value() == data) { - break; - } + auto existing = wset_inserted.find(data); + if (existing != wset_inserted.end()) { + existing->second = this->applyWeightChange(existing->second, *current); + if ((existing->second == 0) && this->getInternalCollectionType().removeIfZero()) { + wset_inserted.erase(existing); } - if (witer != newValues.end()) { - witer->setWeight(this->applyWeightChange(witer->weight(), *current)); - if (witer->weight() == 0 && this->getInternalCollectionType().removeIfZero()) { - newValues.erase(witer); - } - } else if (this->getInternalCollectionType().createIfNonExistant()) { - int32_t weight = this->applyWeightChange(0, *current); - if (weight != 0 || !this->getInternalCollectionType().removeIfZero()) { - newValues.push_back(MultiValueType(data, weight)); - } + } else if (this->getInternalCollectionType().createIfNonExistant()) { + int32_t weight = this->applyWeightChange(0, *current); + if (weight != 0 || !this->getInternalCollectionType().removeIfZero()) { + wset_inserted.insert(std::make_pair(data, weight)); } } } } - this->checkSetMaxValueCount(newValues.size()); + std::vector<MultiValueType> new_values; + new_values.reserve(wset_inserted.size()); + wset_inserted.for_each([&new_values](const auto& e){ new_values.emplace_back(e.first, e.second); }); - docValues.push_back(std::make_pair(doc, ValueVector())); - docValues.back().second.swap(newValues); + this->checkSetMaxValueCount(new_values.size()); + docValues.emplace_back(doc, std::move(new_values)); } } - template <typename B, typename M> vespalib::AddressSpace MultiValueAttribute<B, M>::getMultiValueAddressSpaceUsage() const |