summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2022-05-03 11:38:23 +0200
committerTor Egge <Tor.Egge@online.no>2022-05-03 11:38:23 +0200
commit5cdb4ab60fb8b2aa58fe4d0abb1a611bf26b13e0 (patch)
tree25ed368826c719880191457fac2256e636ad3b9e /searchlib
parentb942ed39f4d06cf714d62235978399910753a5b9 (diff)
Use vespalib::datastore::AtomicEntryRef in search::memoryindex::PostingListEntry.
Add guard bytes to feature store for each memory index commit to avoid conflict between decoder overrun and addition of new features.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/memoryindex/field_index/field_index_test.cpp24
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/feature_store.h16
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/field_index.cpp16
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/field_index_base.h2
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp91
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h4
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h9
-rw-r--r--searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp2
9 files changed, 106 insertions, 68 deletions
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
index a9b7bbd4603..23d96f7b81c 100644
--- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
+++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
@@ -22,6 +22,7 @@
#include <vespa/vespalib/util/gate.h>
#include <vespa/vespalib/util/destructor_callbacks.h>
#include <vespa/vespalib/util/sequencedtaskexecutor.h>
+#include <unordered_set>
#include <vespa/vespalib/gtest/gtest.h>
@@ -290,33 +291,21 @@ public:
MockFieldIndex::~MockFieldIndex() = default;
/**
- * MockWordStoreScan is a helper class to ensure that previous word is
+ * MockWordStoreScan is a helper class to ensure that previous words are
* still stored safely in memory, to satisfy OrderedFieldIndexInserter
* needs.
*/
class MockWordStoreScan {
- vespalib::string _word0;
- vespalib::string _word1;
- vespalib::string *_prevWord;
- vespalib::string *_word;
+ std::unordered_set<vespalib::string, vespalib::hash<vespalib::string>> _words;
public:
MockWordStoreScan()
- : _word0(),
- _word1(),
- _prevWord(&_word0),
- _word(&_word1)
+ : _words()
{ }
~MockWordStoreScan();
- const vespalib::string &getWord() const {
- return *_word;
- }
-
const vespalib::string &setWord(const vespalib::string &word) {
- std::swap(_prevWord, _word);
- *_word = word;
- return *_word;
+ return *_words.insert(word).first;
}
};
@@ -759,11 +748,10 @@ TEST_F(FieldIndexCollectionTest, require_that_multiple_insert_and_remove_works)
for (uint32_t di = 0; di < (uint32_t) w; ++di) { // insert
inserter.add(di * 3);
}
- EXPECT_EQ((w - 'a' + 1u) + ('z' - 'a' +1u) * fi,
- inserter.getNumUniqueWords());
}
}
EXPECT_TRUE(inserter.assertPostings());
+ EXPECT_EQ(('z' - 'a' +1u) * numFields, inserter.getNumUniqueWords());
inserter.rewind();
for (uint32_t fi = 0; fi < numFields; ++fi) {
MyDrainRemoves drainRemoves(inserter.getFieldIndexes(), fi);
diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
index 42b260db9bd..b37300375a8 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp
@@ -106,6 +106,16 @@ FeatureStore::addFeatures(uint32_t packedIndex, const DocIdAndFeatures &features
}
void
+FeatureStore::add_features_guard_bytes()
+{
+ uint32_t len = DECODE_SAFETY;
+ uint32_t pad = RefType::pad(len);
+ auto result = _store.rawAllocator<int8_t>(_typeId).alloc(len + pad);
+ memset(result.data, 0, len + pad);
+ _store.incDead(result.ref, len + pad);
+}
+
+void
FeatureStore::getFeatures(uint32_t packedIndex, vespalib::datastore::EntryRef ref, DocIdAndFeatures &features)
{
setupForField(packedIndex, _d);
diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.h b/searchlib/src/vespa/searchlib/memoryindex/feature_store.h
index 9f17d369208..3c1b64a1488 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.h
@@ -25,7 +25,7 @@ private:
using DocIdAndFeatures = index::DocIdAndFeatures;
using PosOccFieldsParams = bitcompression::PosOccFieldsParams;
- static const uint32_t DECODE_SAFETY = 16;
+ static constexpr uint32_t DECODE_SAFETY = 16;
DataStoreType _store;
@@ -105,6 +105,20 @@ public:
*/
std::pair<vespalib::datastore::EntryRef, uint64_t> addFeatures(uint32_t packedIndex, const DocIdAndFeatures &features);
+ /*
+ * Decoding of bitwise compressed data can read up to DECODE_SAFETY
+ * bytes beyond end of compressed data. This can cause issues with future
+ * features being written after new features are made visible for readers.
+ * Adding guard bytes when flushing OrderedFieldIndexInserter before
+ * updating the posting lists and dictionary ensures that the decoder
+ * overrun beyond the compressed data either goes into other features
+ * already written or into the guard area.
+ *
+ * If buffer type is changed to have a nonzero numArraysForNewBuffer then
+ * extra logic to add guard bytes is needed when switching primary buffer
+ * to avoid issues if the buffer is resumed as primary buffer later on.
+ */
+ void add_features_guard_bytes();
/**
* Get features from feature store.
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
index 60e22ee4a1a..59289bfbf8f 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
@@ -124,11 +124,7 @@ FieldIndex<interleaved_features>::compactFeatures()
// Filter on which buffers to move features from when
// performing incremental compaction.
- EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features());
-
- // Features must be written before reference is updated.
- std::atomic_thread_fence(std::memory_order_release);
-
+ EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed());
// Reference the moved data
posting_entry.update_features(newFeatures);
}
@@ -141,11 +137,7 @@ FieldIndex<interleaved_features>::compactFeatures()
// Filter on which buffers to move features from when
// performing incremental compaction.
- EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features());
-
- // Features must be written before reference is updated.
- std::atomic_thread_fence(std::memory_order_release);
-
+ EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed());
// Reference the moved data
posting_entry.update_features(newFeatures);
}
@@ -184,7 +176,7 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde
const PostingListEntryType &entry(pitr.getData());
features.set_num_occs(entry.get_num_occs());
features.set_field_length(entry.get_field_length());
- _featureStore.setupForReadFeatures(entry.get_features(), decoder);
+ _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder);
decoder.readFeatures(features);
indexBuilder.add_document(features);
}
@@ -197,7 +189,7 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde
const PostingListEntryType &entry(kd->getData());
features.set_num_occs(entry.get_num_occs());
features.set_field_length(entry.get_field_length());
- _featureStore.setupForReadFeatures(entry.get_features(), decoder);
+ _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder);
decoder.readFeatures(features);
indexBuilder.add_document(features);
}
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h b/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h
index ea373662290..93fe77ba50b 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_index_base.h
@@ -97,6 +97,8 @@ public:
return _featureStore.addFeatures(_fieldId, features).first;
}
+ void add_features_guard_bytes() { _featureStore.add_features_guard_bytes(); }
+
FieldIndexBase(const index::Schema& schema, uint32_t fieldId);
FieldIndexBase(const index::Schema& schema, uint32_t fieldId, const index::FieldLengthInfo& info);
~FieldIndexBase();
diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
index be6de7d3360..a12544a9da5 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
@@ -15,6 +15,7 @@
#include <vespa/vespalib/btree/btreeiterator.hpp>
#include <vespa/vespalib/btree/btreeroot.hpp>
#include <vespa/vespalib/btree/btree.hpp>
+#include <cstdio>
#include <vespa/log/log.h>
LOG_SETUP(".searchlib.memoryindex.ordered_document_inserter");
@@ -38,7 +39,10 @@ OrderedFieldIndexInserter<interleaved_features>::OrderedFieldIndexInserter(Field
_dItr(_fieldIndex.getDictionaryTree().begin()),
_listener(_fieldIndex.getDocumentRemover()),
_removes(),
- _adds()
+ _adds(),
+ _word_entries(),
+ _removes_offset(0),
+ _adds_offset(0)
{
}
@@ -49,22 +53,12 @@ template <bool interleaved_features>
void
OrderedFieldIndexInserter<interleaved_features>::flushWord()
{
- if (_removes.empty() && _adds.empty()) {
+ if (_removes.size() == _removes_offset && _adds.size() == _adds_offset) {
return;
}
- //XXX: Feature store leak, removed features not marked dead
- PostingListStore &postingListStore(_fieldIndex.getPostingListStore());
- vespalib::datastore::EntryRef pidx(_dItr.getData().load_relaxed());
- postingListStore.apply(pidx,
- &_adds[0],
- &_adds[0] + _adds.size(),
- &_removes[0],
- &_removes[0] + _removes.size());
- if (pidx != _dItr.getData().load_relaxed()) {
- _dItr.getWData().store_release(pidx);
- }
- _removes.clear();
- _adds.clear();
+ _word_entries.emplace_back(_word, _adds.size() - _adds_offset, _removes.size() - _removes_offset);
+ _adds_offset = _adds.size();
+ _removes_offset = _removes.size();
}
template <bool interleaved_features>
@@ -72,6 +66,55 @@ void
OrderedFieldIndexInserter<interleaved_features>::flush()
{
flushWord();
+ assert(_adds_offset == _adds.size());
+ assert(_removes_offset == _removes.size());
+ if (!_adds.empty()) {
+ _fieldIndex.add_features_guard_bytes();
+ }
+ const WordStore &wordStore(_fieldIndex.getWordStore());
+ PostingListStore &postingListStore(_fieldIndex.getPostingListStore());
+ size_t adds_offset = 0;
+ size_t removes_offset = 0;
+ for (const auto& word_entry : _word_entries) {
+ auto word = std::get<0>(word_entry);
+ vespalib::ConstArrayRef<PostingListKeyDataType> adds(_adds.data() + adds_offset, std::get<1>(word_entry));
+ vespalib::ConstArrayRef<uint32_t> removes(_removes.data() + removes_offset, std::get<2>(word_entry));
+ KeyComp cmp(wordStore, word);
+ WordKey key;
+ if (_dItr.valid() && cmp(_dItr.getKey(), key)) {
+ _dItr.binarySeek(key, cmp);
+ }
+ if (!_dItr.valid() || cmp(key, _dItr.getKey())) {
+ vespalib::datastore::EntryRef wordRef = _fieldIndex.addWord(word);
+ WordKey insertKey(wordRef);
+ DictionaryTree &dTree(_fieldIndex.getDictionaryTree());
+ dTree.insert(_dItr, insertKey, vespalib::datastore::AtomicEntryRef());
+ }
+ assert(_dItr.valid());
+ assert(word == wordStore.getWord(_dItr.getKey()._wordRef));
+ for (auto& add_entry : adds) {
+ _listener.insert(_dItr.getKey()._wordRef, add_entry._key);
+ }
+ //XXX: Feature store leak, removed features not marked dead
+ vespalib::datastore::EntryRef pidx(_dItr.getData().load_relaxed());
+ postingListStore.apply(pidx,
+ adds.begin(),
+ adds.end(),
+ removes.begin(),
+ removes.end());
+ if (pidx != _dItr.getData().load_relaxed()) {
+ _dItr.getWData().store_release(pidx);
+ }
+ adds_offset += adds.size();
+ removes_offset += removes.size();
+ }
+ assert(adds_offset == _adds.size());
+ assert(removes_offset == _removes.size());
+ _removes_offset = 0;
+ _adds_offset = 0;
+ _adds.clear();
+ _removes.clear();
+ _word_entries.clear();
_listener.flush();
}
@@ -86,27 +129,12 @@ template <bool interleaved_features>
void
OrderedFieldIndexInserter<interleaved_features>::setNextWord(const vespalib::stringref word)
{
+ flushWord();
// TODO: Adjust here if zero length words should be legal.
assert(_word < word);
_word = word;
_prevDocId = noDocId;
_prevAdd = false;
- flushWord();
- const WordStore &wordStore(_fieldIndex.getWordStore());
- KeyComp cmp(wordStore, _word);
- WordKey key;
- if (_dItr.valid() && cmp(_dItr.getKey(), key)) {
- _dItr.binarySeek(key, cmp);
- }
- if (!_dItr.valid() || cmp(key, _dItr.getKey())) {
- vespalib::datastore::EntryRef wordRef = _fieldIndex.addWord(_word);
-
- WordKey insertKey(wordRef);
- DictionaryTree &dTree(_fieldIndex.getDictionaryTree());
- dTree.insert(_dItr, insertKey, vespalib::datastore::AtomicEntryRef());
- }
- assert(_dItr.valid());
- assert(_word == wordStore.getWord(_dItr.getKey()._wordRef));
}
template <bool interleaved_features>
@@ -121,7 +149,6 @@ OrderedFieldIndexInserter<interleaved_features>::add(uint32_t docId,
_adds.push_back(PostingListKeyDataType(docId, PostingListEntryType(featureRef,
cap_u16(features.num_occs()),
cap_u16(features.field_length()))));
- _listener.insert(_dItr.getKey()._wordRef, docId);
_prevDocId = docId;
_prevAdd = true;
}
diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h
index 6fd9578d0bf..ed4c6d68b5f 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.h
@@ -41,6 +41,10 @@ private:
// Pending changes to posting list for (_word)
std::vector<uint32_t> _removes;
std::vector<PostingListKeyDataType> _adds;
+ using WordEntry = std::tuple<vespalib::stringref, size_t, size_t>;
+ std::vector<WordEntry> _word_entries;
+ size_t _removes_offset;
+ size_t _adds_offset;
static constexpr uint32_t noFieldId = std::numeric_limits<uint32_t>::max();
static constexpr uint32_t noDocId = std::numeric_limits<uint32_t>::max();
diff --git a/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h b/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h
index e906ba7366e..ef0866e957b 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/posting_list_entry.h
@@ -2,7 +2,7 @@
# pragma once
-#include <vespa/vespalib/datastore/entryref.h>
+#include <vespa/vespalib/datastore/atomic_entry_ref.h>
namespace search::memoryindex {
@@ -49,7 +49,7 @@ public:
template <bool interleaved_features>
class PostingListEntry : public std::conditional_t<interleaved_features, InterleavedFeatures, NoInterleavedFeatures> {
using ParentType = std::conditional_t<interleaved_features, InterleavedFeatures, NoInterleavedFeatures>;
- mutable vespalib::datastore::EntryRef _features; // reference to compressed features
+ mutable vespalib::datastore::AtomicEntryRef _features; // reference to compressed features
public:
explicit PostingListEntry(vespalib::datastore::EntryRef features, uint16_t num_occs, uint16_t field_length)
@@ -64,14 +64,15 @@ public:
{
}
- vespalib::datastore::EntryRef get_features() const { return _features; }
+ vespalib::datastore::EntryRef get_features() const noexcept { return _features.load_acquire(); }
+ vespalib::datastore::EntryRef get_features_relaxed() const noexcept { return _features.load_relaxed(); }
/*
* Reference moved features (used when compacting FeatureStore).
* The moved features must have the same content as the original
* features.
*/
- void update_features(vespalib::datastore::EntryRef features) const { _features = features; }
+ void update_features(vespalib::datastore::EntryRef features) const { _features.store_release(features); }
};
template class PostingListEntry<false>;
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
index 666eed8f1e8..433f543ab92 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp
@@ -263,7 +263,7 @@ FakeMemTreeOccMgr::flush()
lastWord = wordIdx;
if (i->getRemove()) {
if (itr.valid() && itr.getKey() == docId) {
- uint64_t bits = _featureStore.bitSize(fw->getPackedIndex(), EntryRef(itr.getData().get_features()));
+ uint64_t bits = _featureStore.bitSize(fw->getPackedIndex(), EntryRef(itr.getData().get_features_relaxed()));
_featureSizes[wordIdx] -= RefType::align((bits + 7) / 8) * 8;
tree.remove(itr);
}