diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-05-03 11:38:23 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-05-03 11:38:23 +0200 |
commit | 5cdb4ab60fb8b2aa58fe4d0abb1a611bf26b13e0 (patch) | |
tree | 25ed368826c719880191457fac2256e636ad3b9e /searchlib | |
parent | b942ed39f4d06cf714d62235978399910753a5b9 (diff) |
Use vespalib::datastore::AtomicEntryRef in search::memoryindex::PostingListEntry.
Add guard bytes to feature store for each memory index commit to avoid
conflict between decoder overrun and addition of new features.
Diffstat (limited to 'searchlib')
9 files changed, 106 insertions, 68 deletions
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index a9b7bbd4603..23d96f7b81c 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -22,6 +22,7 @@ #include <vespa/vespalib/util/gate.h> #include <vespa/vespalib/util/destructor_callbacks.h> #include <vespa/vespalib/util/sequencedtaskexecutor.h> +#include <unordered_set> #include <vespa/vespalib/gtest/gtest.h> @@ -290,33 +291,21 @@ public: MockFieldIndex::~MockFieldIndex() = default; /** - * MockWordStoreScan is a helper class to ensure that previous word is + * MockWordStoreScan is a helper class to ensure that previous words are * still stored safely in memory, to satisfy OrderedFieldIndexInserter * needs. */ class MockWordStoreScan { - vespalib::string _word0; - vespalib::string _word1; - vespalib::string *_prevWord; - vespalib::string *_word; + std::unordered_set<vespalib::string, vespalib::hash<vespalib::string>> _words; public: MockWordStoreScan() - : _word0(), - _word1(), - _prevWord(&_word0), - _word(&_word1) + : _words() { } ~MockWordStoreScan(); - const vespalib::string &getWord() const { - return *_word; - } - const vespalib::string &setWord(const vespalib::string &word) { - std::swap(_prevWord, _word); - *_word = word; - return *_word; + return *_words.insert(word).first; } }; @@ -759,11 +748,10 @@ TEST_F(FieldIndexCollectionTest, require_that_multiple_insert_and_remove_works) for (uint32_t di = 0; di < (uint32_t) w; ++di) { // insert inserter.add(di * 3); } - EXPECT_EQ((w - 'a' + 1u) + ('z' - 'a' +1u) * fi, - inserter.getNumUniqueWords()); } } EXPECT_TRUE(inserter.assertPostings()); + EXPECT_EQ(('z' - 'a' +1u) * numFields, inserter.getNumUniqueWords()); inserter.rewind(); for (uint32_t fi = 0; fi < numFields; ++fi) { MyDrainRemoves drainRemoves(inserter.getFieldIndexes(), fi); diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp index 42b260db9bd..b37300375a8 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp @@ -106,6 +106,16 @@ FeatureStore::addFeatures(uint32_t packedIndex, const DocIdAndFeatures &features } void +FeatureStore::add_features_guard_bytes() +{ + uint32_t len = DECODE_SAFETY; + uint32_t pad = RefType::pad(len); + auto result = _store.rawAllocator<int8_t>(_typeId).alloc(len + pad); + memset(result.data, 0, len + pad); + _store.incDead(result.ref, len + pad); +} + +void FeatureStore::getFeatures(uint32_t packedIndex, vespalib::datastore::EntryRef ref, DocIdAndFeatures &features) { setupForField(packedIndex, _d); diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.h b/searchlib/src/vespa/searchlib/memoryindex/feature_store.h index 9f17d369208..3c1b64a1488 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.h +++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.h @@ -25,7 +25,7 @@ private: using DocIdAndFeatures = index::DocIdAndFeatures; using PosOccFieldsParams = bitcompression::PosOccFieldsParams; - static const uint32_t DECODE_SAFETY = 16; + static constexpr uint32_t DECODE_SAFETY = 16; DataStoreType _store; @@ -105,6 +105,20 @@ public: */ std::pair<vespalib::datastore::EntryRef, uint64_t> addFeatures(uint32_t packedIndex, const DocIdAndFeatures &features); + /* + * Decoding of bitwise compressed data can read up to DECODE_SAFETY + * bytes beyond end of compressed data. This can cause issues with future + * features being written after new features are made visible for readers. + * Adding guard bytes when flushing OrderedFieldIndexInserter before + * updating the posting lists and dictionary ensures that the decoder + * overrun beyond the compressed data either goes into other features + * already written or into the guard area. + * + * If buffer type is changed to have a nonzero numArraysForNewBuffer then + * extra logic to add guard bytes is needed when switching primary buffer + * to avoid issues if the buffer is resumed as primary buffer later on. + */ + void add_features_guard_bytes(); /** * Get features from feature store. diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 60e22ee4a1a..59289bfbf8f 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -124,11 +124,7 @@ FieldIndex<interleaved_features>::compactFeatures() // Filter on which buffers to move features from when // performing incremental compaction. - EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features()); - - // Features must be written before reference is updated. - std::atomic_thread_fence(std::memory_order_release); - + EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed()); // Reference the moved data posting_entry.update_features(newFeatures); } @@ -141,11 +137,7 @@ FieldIndex<interleaved_features>::compactFeatures() // Filter on which buffers to move features from when // performing incremental compaction. - EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features()); - - // Features must be written before reference is updated. - std::atomic_thread_fence(std::memory_order_release); - + EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed()); // Reference the moved data posting_entry.update_features(newFeatures); } @@ -184,7 +176,7 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde const PostingListEntryType &entry(pitr.getData()); features.set_num_occs(entry.get_num_occs()); features.set_field_length(entry.get_field_length()); - _featureStore.setupForReadFeatures(entry.get_features(), decoder); + _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder); decoder.readFeatures(features); indexBuilder.add_document(features); } @@ -197,7 +189,7 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde const PostingListEntryType &entry(kd->getData()); features.set_num_occs(entry.get_num_occs()); features.set_field_length(entry.get_field_length()); - _featureStore.setupForReadFeatures(entry.get_features(), decoder); + _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder); decoder.readFeatures(features); indexBuilder.add_document(features); } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h b/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h index ea373662290..93fe77ba50b 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h @@ -97,6 +97,8 @@ public: return _featureStore.addFeatures(_fieldId, features).first; } + void add_features_guard_bytes() { _featureStore.add_features_guard_bytes(); } + FieldIndexBase(const index::Schema& schema, uint32_t fieldId); FieldIndexBase(const index::Schema& schema, uint32_t fieldId, const index::FieldLengthInfo& info); ~FieldIndexBase(); diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp index be6de7d3360..a12544a9da5 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp @@ -15,6 +15,7 @@ #include <vespa/vespalib/btree/btreeiterator.hpp> #include <vespa/vespalib/btree/btreeroot.hpp> #include <vespa/vespalib/btree/btree.hpp> +#include <cstdio> #include <vespa/log/log.h> LOG_SETUP(".searchlib.memoryindex.ordered_document_inserter"); @@ -38,7 +39,10 @@ OrderedFieldIndexInserter<interleaved_features>::OrderedFieldIndexInserter(Field _dItr(_fieldIndex.getDictionaryTree().begin()), _listener(_fieldIndex.getDocumentRemover()), _removes(), - _adds() + _adds(), + _word_entries(), + _removes_offset(0), + _adds_offset(0) { } @@ -49,22 +53,12 @@ template <bool interleaved_features> void OrderedFieldIndexInserter<interleaved_features>::flushWord() { - if (_removes.empty() && _adds.empty()) { + if (_removes.size() == _removes_offset && _adds.size() == _adds_offset) { return; } - //XXX: Feature store leak, removed features not marked dead - PostingListStore &postingListStore(_fieldIndex.getPostingListStore()); - vespalib::datastore::EntryRef pidx(_dItr.getData().load_relaxed()); - postingListStore.apply(pidx, - &_adds[0], - &_adds[0] + _adds.size(), - &_removes[0], - &_removes[0] + _removes.size()); - if (pidx != _dItr.getData().load_relaxed()) { - _dItr.getWData().store_release(pidx); - } - _removes.clear(); - _adds.clear(); + _word_entries.emplace_back(_word, _adds.size() - _adds_offset, _removes.size() - _removes_offset); + _adds_offset = _adds.size(); + _removes_offset = _removes.size(); } template <bool interleaved_features> @@ -72,6 +66,55 @@ void OrderedFieldIndexInserter<interleaved_features>::flush() { flushWord(); + assert(_adds_offset == _adds.size()); + assert(_removes_offset == _removes.size()); + if (!_adds.empty()) { + _fieldIndex.add_features_guard_bytes(); + } + const WordStore &wordStore(_fieldIndex.getWordStore()); + PostingListStore &postingListStore(_fieldIndex.getPostingListStore()); + size_t adds_offset = 0; + size_t removes_offset = 0; + for (const auto& word_entry : _word_entries) { + auto word = std::get<0>(word_entry); + vespalib::ConstArrayRef<PostingListKeyDataType> adds(_adds.data() + adds_offset, std::get<1>(word_entry)); + vespalib::ConstArrayRef<uint32_t> removes(_removes.data() + removes_offset, std::get<2>(word_entry)); + KeyComp cmp(wordStore, word); + WordKey key; + if (_dItr.valid() && cmp(_dItr.getKey(), key)) { + _dItr.binarySeek(key, cmp); + } + if (!_dItr.valid() || cmp(key, _dItr.getKey())) { + vespalib::datastore::EntryRef wordRef = _fieldIndex.addWord(word); + WordKey insertKey(wordRef); + DictionaryTree &dTree(_fieldIndex.getDictionaryTree()); + dTree.insert(_dItr, insertKey, vespalib::datastore::AtomicEntryRef()); + } + assert(_dItr.valid()); + assert(word == wordStore.getWord(_dItr.getKey()._wordRef)); + for (auto& add_entry : adds) { + _listener.insert(_dItr.getKey()._wordRef, add_entry._key); + } + //XXX: Feature store leak, removed features not marked dead + vespalib::datastore::EntryRef pidx(_dItr.getData().load_relaxed()); + postingListStore.apply(pidx, + adds.begin(), + adds.end(), + removes.begin(), + removes.end()); + if (pidx != _dItr.getData().load_relaxed()) { + _dItr.getWData().store_release(pidx); + } + adds_offset += adds.size(); + removes_offset += removes.size(); + } + assert(adds_offset == _adds.size()); + assert(removes_offset == _removes.size()); + _removes_offset = 0; + _adds_offset = 0; + _adds.clear(); + _removes.clear(); + _word_entries.clear(); _listener.flush(); } @@ -86,27 +129,12 @@ template <bool interleaved_features> void OrderedFieldIndexInserter<interleaved_features>::setNextWord(const vespalib::stringref word) { + flushWord(); // TODO: Adjust here if zero length words should be legal. assert(_word < word); _word = word; _prevDocId = noDocId; _prevAdd = false; - flushWord(); - const WordStore &wordStore(_fieldIndex.getWordStore()); - KeyComp cmp(wordStore, _word); - WordKey key; - if (_dItr.valid() && cmp(_dItr.getKey(), key)) { - _dItr.binarySeek(key, cmp); - } - if (!_dItr.valid() || cmp(key, _dItr.getKey())) { - vespalib::datastore::EntryRef wordRef = _fieldIndex.addWord(_word); - - WordKey insertKey(wordRef); - DictionaryTree &dTree(_fieldIndex.getDictionaryTree()); - dTree.insert(_dItr, insertKey, vespalib::datastore::AtomicEntryRef()); - } - assert(_dItr.valid()); - assert(_word == wordStore.getWord(_dItr.getKey()._wordRef)); } template <bool interleaved_features> @@ -121,7 +149,6 @@ OrderedFieldIndexInserter<interleaved_features>::add(uint32_t docId, _adds.push_back(PostingListKeyDataType(docId, PostingListEntryType(featureRef, cap_u16(features.num_occs()), cap_u16(features.field_length())))); - _listener.insert(_dItr.getKey()._wordRef, docId); _prevDocId = docId; _prevAdd = true; } diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h index 6fd9578d0bf..ed4c6d68b5f 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h @@ -41,6 +41,10 @@ private: // Pending changes to posting list for (_word) std::vector<uint32_t> _removes; std::vector<PostingListKeyDataType> _adds; + using WordEntry = std::tuple<vespalib::stringref, size_t, size_t>; + std::vector<WordEntry> _word_entries; + size_t _removes_offset; + size_t _adds_offset; static constexpr uint32_t noFieldId = std::numeric_limits<uint32_t>::max(); static constexpr uint32_t noDocId = std::numeric_limits<uint32_t>::max(); diff --git a/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h b/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h index e906ba7366e..ef0866e957b 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h +++ b/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h @@ -2,7 +2,7 @@ # pragma once -#include <vespa/vespalib/datastore/entryref.h> +#include <vespa/vespalib/datastore/atomic_entry_ref.h> namespace search::memoryindex { @@ -49,7 +49,7 @@ public: template <bool interleaved_features> class PostingListEntry : public std::conditional_t<interleaved_features, InterleavedFeatures, NoInterleavedFeatures> { using ParentType = std::conditional_t<interleaved_features, InterleavedFeatures, NoInterleavedFeatures>; - mutable vespalib::datastore::EntryRef _features; // reference to compressed features + mutable vespalib::datastore::AtomicEntryRef _features; // reference to compressed features public: explicit PostingListEntry(vespalib::datastore::EntryRef features, uint16_t num_occs, uint16_t field_length) @@ -64,14 +64,15 @@ public: { } - vespalib::datastore::EntryRef get_features() const { return _features; } + vespalib::datastore::EntryRef get_features() const noexcept { return _features.load_acquire(); } + vespalib::datastore::EntryRef get_features_relaxed() const noexcept { return _features.load_relaxed(); } /* * Reference moved features (used when compacting FeatureStore). * The moved features must have the same content as the original * features. */ - void update_features(vespalib::datastore::EntryRef features) const { _features = features; } + void update_features(vespalib::datastore::EntryRef features) const { _features.store_release(features); } }; template class PostingListEntry<false>; diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp index 666eed8f1e8..433f543ab92 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp @@ -263,7 +263,7 @@ FakeMemTreeOccMgr::flush() lastWord = wordIdx; if (i->getRemove()) { if (itr.valid() && itr.getKey() == docId) { - uint64_t bits = _featureStore.bitSize(fw->getPackedIndex(), EntryRef(itr.getData().get_features())); + uint64_t bits = _featureStore.bitSize(fw->getPackedIndex(), EntryRef(itr.getData().get_features_relaxed())); _featureSizes[wordIdx] -= RefType::align((bits + 7) / 8) * 8; tree.remove(itr); } |