summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--searchcore/src/tests/proton/docsummary/docsummary.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multivalueattribute.h3
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp184
3 files changed, 129 insertions, 62 deletions
diff --git a/searchcore/src/tests/proton/docsummary/docsummary.cpp b/searchcore/src/tests/proton/docsummary/docsummary.cpp
index 0e521e473ae..26abd6be914 100644
--- a/searchcore/src/tests/proton/docsummary/docsummary.cpp
+++ b/searchcore/src/tests/proton/docsummary/docsummary.cpp
@@ -765,12 +765,14 @@ Test::requireThatAttributesAreUsed()
EXPECT_EQUAL(2u, rep->docsums.size());
+ // FIXME the expected output ordering of weighted set fields is currently inherently linked
+ // to the internal ordering of such attributes. Should be decoupled, as this is very fragile.
EXPECT_TRUE(assertSlime("{ba:10,bb:10.1,"
"bc:'foo',"
"bd:[20,30],"
"be:[20.2,30.3],"
"bf:['bar','baz'],"
- "bg:[{item:40,weight:2},{item:50,weight:3}],"
+ "bg:[{item:50,weight:3},{item:40,weight:2}],"
"bh:[{item:40.4,weight:4},{item:50.5,weight:5}],"
"bi:[{item:'quux',weight:7},{item:'qux',weight:6}],"
"bj:'0x01020178017901016601674008000000000000'}", *rep, 0, true));
diff --git a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h
index ebcbd6853cb..66ca6bd2eac 100644
--- a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h
+++ b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.h
@@ -66,6 +66,9 @@ private:
uint64_t getTotalValueCount() const override;
+ void apply_attribute_changes_to_array(DocumentValues& docValues);
+ void apply_attribute_changes_to_wset(DocumentValues& docValues);
+
public:
void clearDocs(DocId lidLow, DocId lidLimit) override;
void onShrinkLidSpace() override ;
diff --git a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp
index 2e16bf13b95..f07efcafa9a 100644
--- a/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/multivalueattribute.hpp
@@ -3,6 +3,8 @@
#pragma once
#include <vespa/searchlib/attribute/multivalueattribute.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
namespace search {
@@ -27,9 +29,7 @@ MultiValueAttribute(const vespalib::string &baseFileName,
}
template <typename B, typename M>
-MultiValueAttribute<B, M>::~MultiValueAttribute()
-{
-}
+MultiValueAttribute<B, M>::~MultiValueAttribute() = default;
template <typename B, typename M>
int32_t MultiValueAttribute<B, M>::getWeight(DocId doc, uint32_t idx) const
@@ -38,100 +38,162 @@ int32_t MultiValueAttribute<B, M>::getWeight(DocId doc, uint32_t idx) const
return ((idx < values.size()) ? values[idx].weight() : 1);
}
+namespace {
+
+template <typename T>
+struct HashFn {
+ using type = vespalib::hash<T>;
+};
+
+template <>
+struct HashFn<search::datastore::EntryRef> {
+ struct EntryRefHasher {
+ size_t operator() (const search::datastore::EntryRef& v) const noexcept {
+ return v.ref();
+ }
+ };
+ using type = EntryRefHasher;
+};
+
+}
template <typename B, typename M>
void
MultiValueAttribute<B, M>::applyAttributeChanges(DocumentValues & docValues)
{
+ if (this->hasArrayType()) {
+ apply_attribute_changes_to_array(docValues);
+ return;
+ } else if (this->hasWeightedSetType()) {
+ apply_attribute_changes_to_wset(docValues);
+ return;
+ }
+}
+
+template <typename B, typename M>
+void
+MultiValueAttribute<B, M>::apply_attribute_changes_to_array(DocumentValues& docValues)
+{
// compute new values for each document with changes
for (ChangeVectorIterator current(this->_changes.begin()), end(this->_changes.end()); (current != end); ) {
DocId doc = current->_doc;
-
- MultiValueArrayRef oldValues(_mvMapping.get(doc));
- ValueVector newValues(oldValues.cbegin(), oldValues.cend());
-
// find last clear doc
- ChangeVectorIterator lastClearDoc = end;
+ ChangeVectorIterator last_clear_doc = end;
for (ChangeVectorIterator iter = current; (iter != end) && (iter->_doc == doc); ++iter) {
if (iter->_type == ChangeBase::CLEARDOC) {
- lastClearDoc = iter;
+ last_clear_doc = iter;
}
}
-
// use last clear doc if found
- if (lastClearDoc != end) {
- current = lastClearDoc;
+ if (last_clear_doc != end) {
+ current = last_clear_doc;
}
+ MultiValueArrayRef old_values(_mvMapping.get(doc));
+ ValueVector new_values(old_values.cbegin(), old_values.cend());
+ vespalib::hash_map<ValueType, size_t, typename HashFn<ValueType>::type> tombstones;
// iterate through all changes for this document
for (; (current != end) && (current->_doc == doc); ++current) {
-
if (current->_type == ChangeBase::CLEARDOC) {
- newValues.clear();
+ new_values.clear();
+ tombstones.clear();
continue;
}
-
ValueType data;
bool hasData = extractChangeData(*current, data);
-
+ if (!hasData) {
+ continue;
+ }
if (current->_type == ChangeBase::APPEND) {
- if (hasData) {
- if (this->hasArrayType()) {
- newValues.push_back(MultiValueType(data, current->_weight));
- } else if (this->hasWeightedSetType()) {
- ValueVectorIterator witer;
- for (witer = newValues.begin(); witer != newValues.end(); ++witer) {
- if (witer->value() == data) {
- break;
- }
- }
- if (witer != newValues.end()) {
- witer->setWeight(current->_weight);
- } else {
- newValues.push_back(MultiValueType(data, current->_weight));
- }
- }
- }
+ new_values.emplace_back(data, current->_weight);
} else if (current->_type == ChangeBase::REMOVE) {
- if (hasData) {
- for (ValueVectorIterator witer = newValues.begin(); witer != newValues.end(); ) {
- if (witer->value() == data) {
- witer = newValues.erase(witer);
- } else {
- ++witer;
- }
- }
+ // Defer all removals to the very end by tracking when, during value vector build time,
+ // a removal was encountered for a particular value. All values < this index will be ignored.
+ tombstones[data] = new_values.size();
+ }
+ }
+ // Optimize for the case where nothing was explicitly removed.
+ if (!tombstones.empty()) {
+ ValueVector culled;
+ culled.reserve(new_values.size());
+ for (size_t i = 0; i < new_values.size(); ++i) {
+ auto iter = tombstones.find(new_values[i].value());
+ if (iter == tombstones.end() || (iter->second <= i)) {
+ culled.emplace_back(new_values[i]);
}
+ }
+ culled.swap(new_values);
+ }
+ this->checkSetMaxValueCount(new_values.size());
+ docValues.emplace_back(doc, std::move(new_values));
+ }
+}
+
+template <typename B, typename M>
+void
+MultiValueAttribute<B, M>::apply_attribute_changes_to_wset(DocumentValues& docValues)
+{
+ // compute new values for each document with changes
+ for (ChangeVectorIterator current(this->_changes.begin()), end(this->_changes.end()); (current != end); ) {
+ const DocId doc = current->_doc;
+ // find last clear doc
+ ChangeVectorIterator last_clear_doc = end;
+ size_t max_elems_inserted = 0;
+ for (ChangeVectorIterator iter = current; (iter != end) && (iter->_doc == doc); ++iter) {
+ if (iter->_type == ChangeBase::CLEARDOC) {
+ last_clear_doc = iter;
+ }
+ ++max_elems_inserted;
+ }
+ // use last clear doc if found
+ if (last_clear_doc != end) {
+ current = last_clear_doc;
+ }
+ MultiValueArrayRef old_values(_mvMapping.get(doc));
+ vespalib::hash_map<ValueType, int32_t, typename HashFn<ValueType>::type> wset_inserted;
+ wset_inserted.resize((old_values.size() + max_elems_inserted) * 2);
+ for (const auto& e : old_values) {
+ wset_inserted[e.value()] = e.weight();
+ }
+ // iterate through all changes for this document
+ for (; (current != end) && (current->_doc == doc); ++current) {
+ if (current->_type == ChangeBase::CLEARDOC) {
+ wset_inserted.clear();
+ continue;
+ }
+ ValueType data;
+ bool hasData = extractChangeData(*current, data);
+ if (!hasData) {
+ continue;
+ }
+ if (current->_type == ChangeBase::APPEND) {
+ wset_inserted[data] = current->_weight;
+ } else if (current->_type == ChangeBase::REMOVE) {
+ wset_inserted.erase(data);
} else if ((current->_type >= ChangeBase::INCREASEWEIGHT) && (current->_type <= ChangeBase::DIVWEIGHT)) {
- if (this->hasWeightedSetType() && hasData) {
- ValueVectorIterator witer;
- for (witer = newValues.begin(); witer != newValues.end(); ++witer) {
- if (witer->value() == data) {
- break;
- }
+ auto existing = wset_inserted.find(data);
+ if (existing != wset_inserted.end()) {
+ existing->second = this->applyWeightChange(existing->second, *current);
+ if ((existing->second == 0) && this->getInternalCollectionType().removeIfZero()) {
+ wset_inserted.erase(existing);
}
- if (witer != newValues.end()) {
- witer->setWeight(this->applyWeightChange(witer->weight(), *current));
- if (witer->weight() == 0 && this->getInternalCollectionType().removeIfZero()) {
- newValues.erase(witer);
- }
- } else if (this->getInternalCollectionType().createIfNonExistant()) {
- int32_t weight = this->applyWeightChange(0, *current);
- if (weight != 0 || !this->getInternalCollectionType().removeIfZero()) {
- newValues.push_back(MultiValueType(data, weight));
- }
+ } else if (this->getInternalCollectionType().createIfNonExistant()) {
+ int32_t weight = this->applyWeightChange(0, *current);
+ if (weight != 0 || !this->getInternalCollectionType().removeIfZero()) {
+ wset_inserted.insert(std::make_pair(data, weight));
}
}
}
}
- this->checkSetMaxValueCount(newValues.size());
+ std::vector<MultiValueType> new_values;
+ new_values.reserve(wset_inserted.size());
+ wset_inserted.for_each([&new_values](const auto& e){ new_values.emplace_back(e.first, e.second); });
- docValues.push_back(std::make_pair(doc, ValueVector()));
- docValues.back().second.swap(newValues);
+ this->checkSetMaxValueCount(new_values.size());
+ docValues.emplace_back(doc, std::move(new_values));
}
}
-
template <typename B, typename M>
vespalib::AddressSpace
MultiValueAttribute<B, M>::getMultiValueAddressSpaceUsage() const