// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "postingchange.h" #include "multi_value_mapping.h" #include "postinglistattribute.h" #include #include #include #include using vespalib::datastore::AtomicEntryRef; namespace search { namespace { template struct CompareValue { bool operator()(const WeightedIndex& lhs, const WeightedIndex& rhs) const { return multivalue::get_value_ref(lhs).load_relaxed() < multivalue::get_value_ref(rhs).load_relaxed(); }; }; void removeDupAdditions(PostingChange::A &additions) { if (additions.empty()) return; if (additions.size() == 1) return; std::sort(additions.begin(), additions.end()); auto i = additions.begin(); auto ie = additions.end(); auto d = i; for (++i; i != ie; ++i, ++d) { if (d->_key == i->_key) break; } if (i == ie) return; // no dups found for (++i; i != ie; ++i) { if (d->_key != i->_key) { ++d; *d = *i; } } additions.resize(d - additions.begin() + 1); } void removeDupAdditions(PostingChange::A &additions) { if (additions.empty()) return; if (additions.size() == 1u) return; std::sort(additions.begin(), additions.end()); auto i = additions.begin(); auto ie = additions.end(); auto d = i; for (++i; i != ie; ++i, ++d) { if (d->_key == i->_key) break; } if (i == ie) return; // no dups found // sum weights together d->setData(d->getData() + i->getData()); for (++i; i != ie; ++i) { if (d->_key != i->_key) { ++d; *d = *i; } else { // sum weights together d->setData(d->getData() + i->getData()); } } additions.resize(d - additions.begin() + 1); } void removeDupRemovals(std::vector &removals) { if (removals.empty()) return; if (removals.size() == 1u) return; std::sort(removals.begin(), removals.end()); auto i = removals.begin(); auto ie = removals.end(); auto d = i; for (++i; i != ie; ++i, ++d) { if (*d == *i) break; } if (i == ie) return; // no dups found for (++i; i != ie; ++i) { if (*d != *i) { ++d; *d = *i; } } removals.resize(d - removals.begin() + 1); } } IEnumStore::Index EnumIndexMapper::map(IEnumStore::Index original) const { return original; } template <> void PostingChange::removeDups() { removeDupAdditions(_additions); removeDupRemovals(_removals); } template <> void PostingChange::removeDups() { removeDupAdditions(_additions); removeDupRemovals(_removals); } template PostingChange

::PostingChange() = default; template PostingChange

::~PostingChange() = default; template class ActualChangeComputer { public: using EnumIndex = IEnumStore::Index; using AlwaysWeightedIndexVector = std::vector>; using WeightedIndexVector = std::vector; void compute(const WeightedIndex * entriesNew, size_t szNew, const WeightedIndex * entriesOld, size_t szOld, AlwaysWeightedIndexVector & added, AlwaysWeightedIndexVector & changed, AlwaysWeightedIndexVector & removed); ActualChangeComputer(const vespalib::datastore::EntryComparator& compare, const EnumIndexMapper& mapper); ~ActualChangeComputer(); private: WeightedIndexVector _oldEntries; WeightedIndexVector _newEntries; vespalib::hash_map _cachedMapping; const vespalib::datastore::EntryComparator &_compare; const EnumIndexMapper &_mapper; const bool _hasFold; static void copyFast(WeightedIndexVector &dst, const WeightedIndex *src, size_t sz) { dst.insert(dst.begin(), src, src + sz); } EnumIndex mapEnumIndex(EnumIndex unmapped) { auto itr = _cachedMapping.insert(std::make_pair(unmapped.ref(), 0)); if (itr.second) { itr.first->second = _mapper.map(unmapped).ref(); } return EnumIndex(vespalib::datastore::EntryRef(itr.first->second)); } void copyMapped(WeightedIndexVector &dst, const WeightedIndex *src, size_t sz) { const WeightedIndex *srce = src + sz; for (const WeightedIndex *i = src; i < srce; ++i) { dst.emplace_back(multivalue::ValueBuilder::build(AtomicEntryRef(mapEnumIndex(multivalue::get_value_ref(*i).load_relaxed())), multivalue::get_weight(*i))); } } void copyEntries(WeightedIndexVector &dst, const WeightedIndex *src, size_t sz) { dst.reserve(sz); dst.clear(); if (_hasFold) { copyMapped(dst, src, sz); } else { copyFast(dst, src, sz); } std::sort(dst.begin(), dst.end(), CompareValue()); } }; template class MergeDupIterator { using InnerIter = typename std::vector::const_iterator; using EnumIndex = IEnumStore::Index; using Entry = multivalue::WeightedValue; InnerIter _cur; InnerIter _end; Entry _entry; bool _valid; void merge() { EnumIndex idx = multivalue::get_value_ref(*_cur).load_relaxed(); int32_t weight = multivalue::get_weight(*_cur); ++_cur; while (_cur != _end && multivalue::get_value_ref(*_cur).load_relaxed() == idx) { // sum weights together. Overflow is not handled. weight += multivalue::get_weight(*_cur); ++_cur; } _entry = Entry(AtomicEntryRef(idx), weight); } public: MergeDupIterator(const std::vector &vec) : _cur(vec.begin()), _end(vec.end()), _entry(), _valid(_cur != _end) { if (_valid) { merge(); } } bool valid() const { return _valid; } const Entry &entry() const { return _entry; } EnumIndex value() const { return _entry.value_ref().load_relaxed(); } int32_t weight() const { return _entry.weight(); } void next() { if (_cur != _end) { merge(); } else { _valid = false; } } }; template ActualChangeComputer::ActualChangeComputer(const vespalib::datastore::EntryComparator &compare, const EnumIndexMapper &mapper) : _oldEntries(), _newEntries(), _cachedMapping(), _compare(compare), _mapper(mapper), _hasFold(mapper.hasFold()) { } template ActualChangeComputer::~ActualChangeComputer() = default; template void ActualChangeComputer::compute(const WeightedIndex * entriesNew, size_t szNew, const WeightedIndex * entriesOld, size_t szOld, AlwaysWeightedIndexVector & added, AlwaysWeightedIndexVector & changed, AlwaysWeightedIndexVector & removed) { copyEntries(_newEntries, entriesNew, szNew); copyEntries(_oldEntries, entriesOld, szOld); MergeDupIterator oldIt(_oldEntries); MergeDupIterator newIt(_newEntries); while (newIt.valid() && oldIt.valid()) { if (newIt.value() == oldIt.value()) { if (newIt.weight() != oldIt.weight()) { changed.push_back(newIt.entry()); } newIt.next(); oldIt.next(); } else if (newIt.value() < oldIt.value()) { added.push_back(newIt.entry()); newIt.next(); } else { removed.push_back(oldIt.entry()); oldIt.next(); } } while (newIt.valid()) { added.push_back(newIt.entry()); newIt.next(); } while (oldIt.valid()) { removed.push_back(oldIt.entry()); oldIt.next(); } } template template PostingMap PostingChangeComputerT:: compute(const MultivalueMapping & mvm, const DocIndices & docIndices, const vespalib::datastore::EntryComparator & compare, const EnumIndexMapper & mapper) { using AC = ActualChangeComputer; AC actualChange(compare, mapper); typename AC::AlwaysWeightedIndexVector added, changed, removed; PostingMap changePost; // generate add postings and remove postings for (const auto & docIndex : docIndices) { vespalib::ConstArrayRef oldIndices(mvm.get(docIndex.first)); added.clear(), changed.clear(), removed.clear(); actualChange.compute(docIndex.second.data(), docIndex.second.size(), oldIndices.data(), oldIndices.size(), added, changed, removed); for (const auto & wi : added) { changePost[EnumPostingPair(wi.value_ref().load_relaxed(), &compare)].add(docIndex.first, wi.weight()); } for (const auto & wi : removed) { changePost[EnumPostingPair(wi.value_ref().load_relaxed(), &compare)].remove(docIndex.first); } for (const auto & wi : changed) { changePost[EnumPostingPair(wi.value_ref().load_relaxed(), &compare)].remove(docIndex.first).add(docIndex.first, wi.weight()); } } return changePost; } template class PostingChange; template class PostingChange; using WeightedPostingChange = PostingChange >; using WeightedPostingChangeMap = std::map; using WeightedIndex = multivalue::WeightedValue; using ValueIndex = AtomicEntryRef; using WeightedMultiValueMapping = attribute::MultiValueMapping; using ValueMultiValueMapping = attribute::MultiValueMapping; using DocIndicesWeighted = std::vector>>; using DocIndicesValue = std::vector>>; template WeightedPostingChangeMap PostingChangeComputerT ::compute(const WeightedMultiValueMapping &, const DocIndicesWeighted &, const vespalib::datastore::EntryComparator &, const EnumIndexMapper &); template WeightedPostingChangeMap PostingChangeComputerT ::compute(const ValueMultiValueMapping &, const DocIndicesValue &, const vespalib::datastore::EntryComparator &, const EnumIndexMapper &); } namespace vespalib { template class Array; template class Array; }