diff options
Diffstat (limited to 'searchlib')
55 files changed, 1895 insertions, 2504 deletions
diff --git a/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp b/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp index 4e4d90e6871..90953f78c40 100644 --- a/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp +++ b/searchlib/src/apps/vespa-index-inspect/vespa-index-inspect.cpp @@ -94,15 +94,13 @@ unpackFeatures(std::vector<PosEntry> &entries, uint64_t wordNum, const DocIdAndFeatures &features) { - std::vector<search::index::WordDocElementFeatures>::const_iterator - element = features._elements.begin(); - std::vector<search::index::WordDocElementWordPosFeatures>:: - const_iterator position = features._wordPositions.begin(); - uint32_t numElements = features._elements.size(); + auto element = features.elements().begin(); + auto position = features.word_positions().begin(); + uint32_t numElements = features.elements().size(); while (numElements--) { uint32_t numOccs = element->getNumOccs(); while (numOccs--) { - entries.push_back(PosEntry(features._docId, + entries.push_back(PosEntry(features.doc_id(), fieldId, element->getElementId(), position->getWordPos(), @@ -447,7 +445,7 @@ ShowPostingListSubApp::readPostings(const SchemaUtil::IndexIterator &index, if (r.isValid()) r.read(); while (r.isValid()) { - uint32_t docId = r._docIdAndFeatures._docId; + uint32_t docId = r._docIdAndFeatures.doc_id(); if (docId >= _minDocId && docId < _docIdLimit) { unpackFeatures(entries, index.getIndex(), r._wordNum, r._docIdAndFeatures); diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index bd814b0ad32..cbbaa518b16 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -40,9 +40,6 @@ static bool operator==(const Tensor &lhs, const Tensor &rhs) vespalib::string sparseSpec("tensor(x{},y{})"); vespalib::string 
denseSpec("tensor(x[2],y[3])"); -vespalib::string denseAbstractSpec_xy("tensor(x[],y[])"); -vespalib::string denseAbstractSpec_x("tensor(x[2],y[])"); -vespalib::string denseAbstractSpec_y("tensor(x[],y[3])"); struct Fixture { @@ -307,7 +304,7 @@ Fixture::testSaveLoad() void Fixture::testCompaction() { - if (_useDenseTensorAttribute && _denseTensors && !_cfg.tensorType().is_abstract()) { + if (_useDenseTensorAttribute && _denseTensors) { LOG(info, "Skipping compaction test for tensor '%s' which is using free-lists", _cfg.tensorType().to_spec().c_str()); return; } @@ -411,34 +408,4 @@ TEST("Test dense tensors with dense tensor attribute") testAll([]() { return std::make_shared<Fixture>(denseSpec, true); }); } -TEST("Test dense tensors with generic tensor attribute with unbound x and y dims") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_xy); }); -} - -TEST("Test dense tensors with dense tensor attribute with unbound x and y dims") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_xy, true); }); -} - -TEST("Test dense tensors with generic tensor attribute with unbound x dim") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_x); }); -} - -TEST("Test dense tensors with dense tensor attribute with unbound x dim") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_x, true); }); -} - -TEST("Test dense tensors with generic tensor attribute with unbound y dim") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_y); }); -} - -TEST("Test dense tensors with dense tensor attribute with unbound y dim") -{ - testAll([]() { return std::make_shared<Fixture>(denseAbstractSpec_y, true); }); -} - TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); } diff --git a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp index e33158e559f..fab2ed734cd 100644 --- 
a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp +++ b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp @@ -62,10 +62,10 @@ FieldWriterWrapper & FieldWriterWrapper::add(uint32_t docId) { DocIdAndFeatures daf; - daf._docId = docId; - daf._elements.push_back(WordDocElementFeatures(0)); - daf._elements.back().setNumOccs(1); - daf._wordPositions.push_back(WordDocElementWordPosFeatures(0)); + daf.set_doc_id(docId); + daf.elements().emplace_back(0); + daf.elements().back().setNumOccs(1); + daf.word_positions().emplace_back(0); //LOG(info, "add(%" PRIu64 ", %u)", wordNum, docId); _writer.add(daf); return *this; diff --git a/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp b/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp index 2419f450950..c7c3447a4cc 100644 --- a/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp +++ b/searchlib/src/tests/features/ranking_expression/ranking_expression_test.cpp @@ -54,7 +54,7 @@ ExpressionReplacer::SP make_replacer() { auto replacer = std::make_shared<ListExpressionReplacer>(); replacer->add(std::make_unique<NullExpressionReplacer>()); replacer->add(std::make_unique<DummyReplacer>("foo", FeatureType::number())); - replacer->add(std::make_unique<DummyReplacer>("bar", FeatureType::object(ValueType::from_spec("tensor(x[])")))); + replacer->add(std::make_unique<DummyReplacer>("bar", FeatureType::object(ValueType::from_spec("tensor(x[5])")))); return replacer; } @@ -124,15 +124,6 @@ TEST("require that ranking expression can resolve to concrete complex type") { FeatureType::object(ValueType::from_spec("tensor(x{},y{},z{})")))); } -TEST("require that ranking expression can resolve to abstract complex type") { - TEST_DO(verify_output_type({{"a", "tensor"}}, "a*b", FeatureType::object(ValueType::from_spec("tensor")))); -} - -TEST("require that ranking expression can resolve to 'any' type") { - TEST_DO(verify_output_type({{"a", "tensor(x{},y{})"}, 
{"b", "tensor"}}, "a*b", - FeatureType::object(ValueType::from_spec("any")))); -} - TEST("require that setup fails for incompatible types") { TEST_DO(verify_setup_fail({{"a", "tensor(x{},y{})"}, {"b", "tensor(y[10],z{})"}}, "a*b")); } @@ -150,7 +141,7 @@ TEST("require that replaced expressions override result type") { TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "foo*b*c", FeatureType::number())); TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "a*b*bar", - FeatureType::object(ValueType::from_spec("tensor(x[])")))); + FeatureType::object(ValueType::from_spec("tensor(x[5])")))); TEST_DO(verify_output_type({{"b", "tensor(z{})"}}, "foo*b*bar", FeatureType::number())); } diff --git a/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp b/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp index 49e9d613861..aca83d67a8a 100644 --- a/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp +++ b/searchlib/src/tests/memoryindex/datastore/feature_store_test.cpp @@ -41,27 +41,27 @@ Test::assertFeatures(const DocIdAndFeatures &exp, const DocIdAndFeatures &act) { // docid is not encoded as part of features - if (!EXPECT_EQUAL(exp._elements.size(), - act._elements.size())) + if (!EXPECT_EQUAL(exp.elements().size(), + act.elements().size())) return false; - for (size_t i = 0; i < exp._elements.size(); ++i) { - if (!EXPECT_EQUAL(exp._elements[i]._elementId, - act._elements[i]._elementId)) + for (size_t i = 0; i < exp.elements().size(); ++i) { + if (!EXPECT_EQUAL(exp.elements()[i].getElementId(), + act.elements()[i].getElementId())) return false; - if (!EXPECT_EQUAL(exp._elements[i]._numOccs, - act._elements[i]._numOccs)) + if (!EXPECT_EQUAL(exp.elements()[i].getNumOccs(), + act.elements()[i].getNumOccs())) return false; - if (!EXPECT_EQUAL(exp._elements[i]._weight, act._elements[i]._weight)) + if (!EXPECT_EQUAL(exp.elements()[i].getWeight(), act.elements()[i].getWeight())) return false; - if (!EXPECT_EQUAL(exp._elements[i]._elementLen, - 
act._elements[i]._elementLen)) + if (!EXPECT_EQUAL(exp.elements()[i].getElementLen(), + act.elements()[i].getElementLen())) return false; } - if (!EXPECT_EQUAL(exp._wordPositions.size(), act._wordPositions.size())) + if (!EXPECT_EQUAL(exp.word_positions().size(), act.word_positions().size())) return false; - for (size_t i = 0; i < exp._wordPositions.size(); ++i) { - if (!EXPECT_EQUAL(exp._wordPositions[i]._wordPos, - act._wordPositions[i]._wordPos)) return false; + for (size_t i = 0; i < exp.word_positions().size(); ++i) { + if (!EXPECT_EQUAL(exp.word_positions()[i].getWordPos(), + act.word_positions()[i].getWordPos())) return false; } return true; } @@ -73,13 +73,13 @@ getFeatures(uint32_t numOccs, uint32_t elemLen) { DocIdAndFeatures f; - f._docId = 0; - f._elements.push_back(WordDocElementFeatures(0)); - f._elements.back().setNumOccs(numOccs); - f._elements.back().setWeight(weight); - f._elements.back().setElementLen(elemLen); + f.set_doc_id(0); + f.elements().push_back(WordDocElementFeatures(0)); + f.elements().back().setNumOccs(numOccs); + f.elements().back().setWeight(weight); + f.elements().back().setElementLen(elemLen); for (uint32_t i = 0; i < numOccs; ++i) { - f._wordPositions.push_back(WordDocElementWordPosFeatures(i)); + f.word_positions().push_back(WordDocElementWordPosFeatures(i)); } return f; } diff --git a/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt b/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt index 767097b99db..a09d6baf1a5 100644 --- a/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt +++ b/searchlib/src/tests/memoryindex/field_index/CMakeLists.txt @@ -5,5 +5,15 @@ vespa_add_executable(searchlib_field_index_test_app TEST DEPENDS searchlib searchlib_test + gtest ) vespa_add_test(NAME searchlib_field_index_test_app COMMAND searchlib_field_index_test_app) + +vespa_add_executable(searchlib_field_index_iterator_test_app TEST + SOURCES + field_index_iterator_test.cpp + DEPENDS + searchlib + searchlib_test +) 
+vespa_add_test(NAME searchlib_field_index_iterator_test_app COMMAND searchlib_field_index_iterator_test_app) diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp new file mode 100644 index 00000000000..df7f80e8601 --- /dev/null +++ b/searchlib/src/tests/memoryindex/field_index/field_index_iterator_test.cpp @@ -0,0 +1,73 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchlib/memoryindex/field_index.h> +#include <vespa/searchlib/memoryindex/posting_iterator.h> +#include <vespa/searchlib/test/memoryindex/wrap_inserter.h> +#include <vespa/searchlib/test/searchiteratorverifier.h> +#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/log/log.h> +LOG_SETUP("field_index_iterator_test"); + +using namespace search::fef; +using namespace search::index; +using namespace search::memoryindex::test; +using namespace search::memoryindex; + +using search::index::schema::DataType; +using search::test::SearchIteratorVerifier; + +class Verifier : public SearchIteratorVerifier { +private: + mutable TermFieldMatchData _tfmd; + FieldIndex _field_index; + +public: + Verifier(const Schema& schema) + : _tfmd(), + _field_index(schema, 0) + { + WrapInserter inserter(_field_index); + inserter.word("a"); + for (uint32_t docId : getExpectedDocIds()) { + inserter.add(docId); + } + inserter.flush(); + } + ~Verifier() {} + + SearchIterator::UP create(bool strict) const override { + (void) strict; + TermFieldMatchDataArray match_data; + match_data.add(&_tfmd); + return std::make_unique<PostingIterator>(_field_index.find("a"), + _field_index.getFeatureStore(), 0, match_data); + } +}; + +Schema +get_schema() +{ + Schema result; + result.addIndexField(Schema::IndexField("f0", DataType::STRING)); + return result; +} + +struct Fixture { + Schema 
schema; + Verifier verifier; + Fixture() + : schema(get_schema()), + verifier(schema) + { + } +}; + +TEST_F("require that posting iterator conforms", Fixture) +{ + f.verifier.verify(); +} + +TEST_MAIN() { TEST_RUN_ALL(); } + diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index 3a635756ec7..2b9b77d32a3 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -9,17 +9,18 @@ #include <vespa/searchlib/fef/fieldpositionsiterator.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/docidandfeatures.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> #include <vespa/searchlib/memoryindex/document_inverter.h> #include <vespa/searchlib/memoryindex/field_index_collection.h> #include <vespa/searchlib/memoryindex/field_inverter.h> #include <vespa/searchlib/memoryindex/ordered_field_index_inserter.h> #include <vespa/searchlib/memoryindex/posting_iterator.h> -#include <vespa/searchlib/test/searchiteratorverifier.h> -#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/test/memoryindex/wrap_inserter.h> +#include <vespa/vespalib/gtest/gtest.h> #include <vespa/log/log.h> -LOG_SETUP("dictionary_test"); +LOG_SETUP("field_index_test"); namespace search { @@ -32,45 +33,35 @@ using document::Document; using queryeval::SearchIterator; using search::index::schema::CollectionType; using search::index::schema::DataType; -using test::SearchIteratorVerifier; using vespalib::GenerationHandler; namespace memoryindex { -typedef FieldIndex::PostingList PostingList; -typedef PostingList::ConstIterator PostingConstItr; +using test::WrapInserter; +using PostingList = FieldIndex::PostingList; +using PostingConstItr = PostingList::ConstIterator; class MyBuilder : public IndexBuilder { private: 
std::stringstream _ss; bool _insideWord; bool _insideField; - bool _insideDoc; - bool _insideElem; bool _firstWord; bool _firstField; bool _firstDoc; - bool _firstElem; - bool _firstPos; -public: +public: MyBuilder(const Schema &schema) : IndexBuilder(schema), _ss(), _insideWord(false), _insideField(false), - _insideDoc(false), - _insideElem(false), _firstWord(true), _firstField(true), - _firstDoc(true), - _firstElem(true), - _firstPos(true) + _firstDoc(true) {} - virtual void - startWord(vespalib::stringref word) override - { + virtual void startWord(vespalib::stringref word) override { assert(_insideField); assert(!_insideWord); if (!_firstWord) @@ -80,19 +71,14 @@ public: _insideWord = true; } - virtual void - endWord() override - { + virtual void endWord() override { assert(_insideWord); - assert(!_insideDoc); _ss << "]"; _firstWord = false; _insideWord = false; } - virtual void - startField(uint32_t fieldId) override - { + virtual void startField(uint32_t fieldId) override { assert(!_insideField); if (!_firstField) _ss << ","; _ss << "f=" << fieldId << "["; @@ -100,9 +86,7 @@ public: _insideField = true; } - virtual void - endField() override - { + virtual void endField() override { assert(_insideField); assert(!_insideWord); _ss << "]"; @@ -110,63 +94,36 @@ public: _insideField = false; } - virtual void - startDocument(uint32_t docId) override - { + virtual void add_document(const DocIdAndFeatures &features) override { assert(_insideWord); - assert(!_insideDoc); - if (!_firstDoc) _ss << ","; - _ss << "d=" << docId << "["; - _firstElem = true; - _insideDoc = true; - } - - virtual void - endDocument() override - { - assert(_insideDoc); - assert(!_insideElem); - _ss << "]"; - _firstDoc = false; - _insideDoc = false; - } - - virtual void - startElement(uint32_t elementId, - int32_t weight, - uint32_t elementLen) override - { - assert(_insideDoc); - assert(!_insideElem); - if (!_firstElem) + if (!_firstDoc) { _ss << ","; - _ss << "e=" << elementId << - ",w=" << 
weight << ",l=" << elementLen << "["; - _firstPos = true; - _insideElem = true; - } - - virtual void - endElement() override - { - assert(_insideElem); + } + _ss << "d=" << features.doc_id() << "["; + bool first_elem = true; + size_t word_pos_offset = 0; + for (const auto& elem : features.elements()) { + if (!first_elem) { + _ss << ","; + } + _ss << "e=" << elem.getElementId() << ",w=" << elem.getWeight() << ",l=" << elem.getElementLen() << "["; + bool first_pos = true; + for (size_t i = 0; i < elem.getNumOccs(); ++i) { + if (!first_pos) { + _ss << ","; + } + _ss << features.word_positions()[i + word_pos_offset].getWordPos(); + first_pos = false; + } + word_pos_offset += elem.getNumOccs(); + _ss << "]"; + first_elem = false; + } _ss << "]"; - _firstElem = false; - _insideElem = false; - } - - virtual void - addOcc(const WordDocElementWordPosFeatures &features) override - { - assert(_insideElem); - if (!_firstPos) _ss << ","; - _ss << features.getWordPos(); - _firstPos = false; + _firstDoc = false; } - std::string - toStr() const - { + std::string toStr() const { return _ss.str(); } }; @@ -186,8 +143,9 @@ toString(FieldPositionsIterator posItr, first = false; if (hasElements) { ss << "[e=" << posItr.getElementId(); - if (hasWeights) + if (hasWeights) { ss << ",w=" << posItr.getElementWeight(); + } ss << ",l=" << posItr.getElementLen() << "]"; } } @@ -198,10 +156,10 @@ toString(FieldPositionsIterator posItr, bool assertPostingList(const std::string &exp, PostingConstItr itr, - const FeatureStore *store = NULL) + const FeatureStore *store = nullptr) { std::stringstream ss; - FeatureStore::DecodeContextCooked decoder(NULL); + FeatureStore::DecodeContextCooked decoder(nullptr); TermFieldMatchData tfmd; TermFieldMatchDataArray matchData; matchData.add(&tfmd); @@ -210,7 +168,7 @@ assertPostingList(const std::string &exp, if (i > 0) ss << ","; uint32_t docId = itr.getKey(); ss << docId; - if (store != NULL) { // consider features as well + if (store != nullptr) { // 
consider features as well EntryRef ref(itr.getData()); store->setupForField(0, decoder); store->setupForUnpackFeatures(ref, decoder); @@ -219,7 +177,9 @@ assertPostingList(const std::string &exp, } } ss << "]"; - return EXPECT_EQUAL(exp, ss.str()); + bool result = (exp == ss.str()); + EXPECT_EQ(exp, ss.str()); + return result; } bool @@ -236,15 +196,13 @@ assertPostingList(std::vector<uint32_t> &exp, PostingConstItr itr) } -namespace -{ +namespace { /** * A simple mockup of a memory field index, used to verify * that we get correct posting lists from real memory field index. */ -class MockFieldIndex -{ +class MockFieldIndex { std::map<std::pair<vespalib::string, uint32_t>, std::set<uint32_t>> _dict; vespalib::string _word; uint32_t _fieldId; @@ -252,32 +210,23 @@ class MockFieldIndex public: ~MockFieldIndex(); void - setNextWord(const vespalib::string &word) - { + setNextWord(const vespalib::string &word) { _word = word; } - void - setNextField(uint32_t fieldId) - { + void setNextField(uint32_t fieldId) { _fieldId = fieldId; } - void - add(uint32_t docId) - { + void add(uint32_t docId) { _dict[std::make_pair(_word, _fieldId)].insert(docId); } - void - remove(uint32_t docId) - { + void remove(uint32_t docId) { _dict[std::make_pair(_word, _fieldId)].erase(docId); } - std::vector<uint32_t> - find(const vespalib::string &word, uint32_t fieldId) - { + std::vector<uint32_t> find(const vespalib::string &word, uint32_t fieldId) { std::vector<uint32_t> res; for (auto docId : _dict[std::make_pair(word, fieldId)] ) { res.push_back(docId); @@ -285,13 +234,11 @@ public: return res; } - auto begin() - { + auto begin() { return _dict.begin(); } - auto end() - { + auto end() { return _dict.end(); } }; @@ -303,8 +250,7 @@ MockFieldIndex::~MockFieldIndex() = default; * still stored safely in memory, to satisfy OrderedFieldIndexInserter * needs. 
*/ -class MockWordStoreScan -{ +class MockWordStoreScan { vespalib::string _word0; vespalib::string _word1; vespalib::string *_prevWord; @@ -319,15 +265,11 @@ public: { } ~MockWordStoreScan(); - const vespalib::string & - getWord() const - { + const vespalib::string &getWord() const { return *_word; } - const vespalib::string & - setWord(const vespalib::string &word) - { + const vespalib::string &setWord(const vespalib::string &word) { std::swap(_prevWord, _word); *_word = word; return *_word; @@ -341,8 +283,7 @@ MockWordStoreScan::~MockWordStoreScan() = default; * and a real memory index. Mockup version is used to calculate expected * answers. */ -class MyInserter -{ +class MyInserter { MockWordStoreScan _wordStoreScan; MockFieldIndex _mock; FieldIndexCollection _fieldIndexes; @@ -361,17 +302,13 @@ public: } ~MyInserter(); - void - setNextWord(const vespalib::string &word) - { + void setNextWord(const vespalib::string &word) { const vespalib::string &w = _wordStoreScan.setWord(word); _inserter->setNextWord(w); _mock.setNextWord(w); } - void - setNextField(uint32_t fieldId) - { + void setNextField(uint32_t fieldId) { if (_inserter != nullptr) { _inserter->flush(); } @@ -380,32 +317,26 @@ public: _mock.setNextField(fieldId); } - void - add(uint32_t docId) - { + void add(uint32_t docId) { _inserter->add(docId, _features); _mock.add(docId); } - void - remove(uint32_t docId) - { + void remove(uint32_t docId) { _inserter->remove(docId); _mock.remove(docId); } - bool - assertPosting(const vespalib::string &word, - uint32_t fieldId) - { + bool assertPosting(const vespalib::string &word, + uint32_t fieldId) { std::vector<uint32_t> exp = _mock.find(word, fieldId); PostingConstItr itr = _fieldIndexes.find(word, fieldId); - return EXPECT_TRUE(assertPostingList(exp, itr)); + bool result = assertPostingList(exp, itr); + EXPECT_TRUE(result); + return result; } - bool - assertPostings() - { + bool assertPostings() { if (_inserter != nullptr) { _inserter->flush(); } @@ -413,25 
+344,23 @@ public: auto &wf = wfp.first; auto &word = wf.first; auto fieldId = wf.second; - if (!EXPECT_TRUE(assertPosting(word, fieldId))) { + bool result = assertPosting(word, fieldId); + EXPECT_TRUE(result); + if (!result) { return false; } } return true; } - void - rewind() - { + void rewind() { if (_inserter != nullptr) { _inserter->flush(); _inserter = nullptr; } } - uint32_t - getNumUniqueWords() - { + uint32_t getNumUniqueWords() { return _fieldIndexes.getNumUniqueWords(); } @@ -439,6 +368,7 @@ public: }; MyInserter::~MyInserter() = default; + void myremove(uint32_t docId, DocumentInverter &inv, FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &invertThreads) @@ -448,63 +378,7 @@ myremove(uint32_t docId, DocumentInverter &inv, FieldIndexCollection &fieldIndex inv.pushDocuments(fieldIndexes, std::shared_ptr<IDestructorCallback>()); } - -class WrapInserter -{ - OrderedFieldIndexInserter &_inserter; -public: - WrapInserter(FieldIndexCollection &fieldIndexes, uint32_t fieldId) - : _inserter(fieldIndexes.getFieldIndex(fieldId)->getInserter()) - { - } - - WrapInserter &word(vespalib::stringref word_) - { - _inserter.setNextWord(word_); - return *this; - } - - WrapInserter &add(uint32_t docId, const index::DocIdAndFeatures &features) - { - _inserter.add(docId, features); - return *this; - } - - WrapInserter &add(uint32_t docId) - { - DocIdAndPosOccFeatures features; - features.addNextOcc(0, 0, 1, 1); - return add(docId, features); - } - - WrapInserter &remove(uint32_t docId) - { - _inserter.remove(docId); - return *this; - } - - WrapInserter &flush() - { - _inserter.flush(); - return *this; - } - - WrapInserter &rewind() - { - _inserter.rewind(); - return *this; - } - - datastore::EntryRef - getWordRef() - { - return _inserter.getWordRef(); - } -}; - - -class MyDrainRemoves : IFieldIndexRemoveListener -{ +class MyDrainRemoves : IFieldIndexRemoveListener { FieldIndexRemover &_remover; public: virtual void remove(const vespalib::stringref, uint32_t) 
override { } @@ -514,8 +388,12 @@ public: { } - void drain(uint32_t docId) + MyDrainRemoves(FieldIndex& field_index) + : _remover(field_index.getDocumentRemover()) { + } + + void drain(uint32_t docId) { _remover.remove(docId, *this); } }; @@ -526,7 +404,6 @@ myPushDocument(DocumentInverter &inv, FieldIndexCollection &fieldIndexes) inv.pushDocuments(fieldIndexes, std::shared_ptr<IDestructorCallback>()); } - const FeatureStore * featureStorePtr(const FieldIndexCollection &fieldIndexes, uint32_t fieldId) { @@ -539,7 +416,6 @@ featureStoreRef(const FieldIndexCollection &fieldIndexes, uint32_t fieldId) return fieldIndexes.getFieldIndex(fieldId)->getFeatureStore(); } - DataStoreBase::MemStats getFeatureStoreMemStats(const FieldIndexCollection &fieldIndexes) { @@ -553,8 +429,8 @@ getFeatureStoreMemStats(const FieldIndexCollection &fieldIndexes) return res; } - -void myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads) +void +myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads) { uint32_t fieldId = 0; for (auto &fieldIndex : fieldIndexes.getFieldIndexes()) { @@ -566,7 +442,6 @@ void myCommit(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushTh pushThreads.sync(); } - void myCompactFeatures(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pushThreads) { @@ -581,57 +456,77 @@ myCompactFeatures(FieldIndexCollection &fieldIndexes, ISequencedTaskExecutor &pu } - -struct Fixture +Schema +make_single_field_schema() { - Schema _schema; - Fixture() : _schema() { - _schema.addIndexField(Schema::IndexField("f0", DataType::STRING)); - _schema.addIndexField(Schema::IndexField("f1", DataType::STRING)); - _schema.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY)); - _schema.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET)); + Schema result; + result.addIndexField(Schema::IndexField("f0", DataType::STRING)); + return result; +} + +struct 
FieldIndexTest : public ::testing::Test { + Schema schema; + FieldIndex idx; + FieldIndexTest() + : schema(make_single_field_schema()), + idx(schema, 0) + { } - const Schema & getSchema() const { return _schema; } }; -// TODO: Rewrite most tests to use FieldIndex directly instead of going via FieldIndexCollection. +Schema +make_multi_field_schema() +{ + Schema result; + result.addIndexField(Schema::IndexField("f0", DataType::STRING)); + result.addIndexField(Schema::IndexField("f1", DataType::STRING)); + result.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY)); + result.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET)); + return result; +} + +struct FieldIndexCollectionTest : public ::testing::Test { + Schema schema; + FieldIndexCollection fic; + FieldIndexCollectionTest() + : schema(make_multi_field_schema()), + fic(schema) + { + } + ~FieldIndexCollectionTest() {} +}; -TEST_F("requireThatFreshInsertWorks", Fixture) +TEST_F(FieldIndexTest, require_that_fresh_insert_works) { - FieldIndexCollection fic(f.getSchema()); - SequencedTaskExecutor pushThreads(2); - EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0))); - EXPECT_EQUAL(0u, fic.getNumUniqueWords()); - WrapInserter(fic, 0).word("a").add(10).flush(); - EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0))); - myCommit(fic, pushThreads); - EXPECT_TRUE(assertPostingList("[10]", fic.findFrozen("a", 0))); - EXPECT_EQUAL(1u, fic.getNumUniqueWords()); + EXPECT_TRUE(assertPostingList("[]", idx.find("a"))); + EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a"))); + EXPECT_EQ(0u, idx.getNumUniqueWords()); + WrapInserter(idx).word("a").add(10).flush(); + EXPECT_TRUE(assertPostingList("[10]", idx.find("a"))); + EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a"))); + idx.commit(); + EXPECT_TRUE(assertPostingList("[10]", 
idx.findFrozen("a"))); + EXPECT_EQ(1u, idx.getNumUniqueWords()); } -TEST_F("requireThatAppendInsertWorks", Fixture) +TEST_F(FieldIndexTest, require_that_append_insert_works) { - FieldIndexCollection fic(f.getSchema()); - SequencedTaskExecutor pushThreads(2); - WrapInserter(fic, 0).word("a").add(10).flush().rewind(). - word("a").add(5).flush(); - EXPECT_TRUE(assertPostingList("[5,10]", fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0))); - WrapInserter(fic, 0).rewind().word("a").add(20).flush(); - EXPECT_TRUE(assertPostingList("[5,10,20]", fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[]", fic.findFrozen("a", 0))); - myCommit(fic, pushThreads); - EXPECT_TRUE(assertPostingList("[5,10,20]", fic.findFrozen("a", 0))); + WrapInserter(idx).word("a").add(10).flush().rewind(). + word("a").add(5).flush(); + EXPECT_TRUE(assertPostingList("[5,10]", idx.find("a"))); + EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a"))); + WrapInserter(idx).rewind().word("a").add(20).flush(); + EXPECT_TRUE(assertPostingList("[5,10,20]", idx.find("a"))); + EXPECT_TRUE(assertPostingList("[]", idx.findFrozen("a"))); + idx.commit(); + EXPECT_TRUE(assertPostingList("[5,10,20]", idx.findFrozen("a"))); } -TEST_F("requireThatMultiplePostingListsCanExist", Fixture) +TEST_F(FieldIndexCollectionTest, require_that_multiple_posting_lists_across_multiple_fields_can_exist) { - FieldIndexCollection fic(f.getSchema()); WrapInserter(fic, 0).word("a").add(10).word("b").add(11).add(15).flush(); WrapInserter(fic, 1).word("a").add(5).word("b").add(12).flush(); - EXPECT_EQUAL(4u, fic.getNumUniqueWords()); + EXPECT_EQ(4u, fic.getNumUniqueWords()); EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0))); EXPECT_TRUE(assertPostingList("[5]", fic.find("a", 1))); EXPECT_TRUE(assertPostingList("[11,15]", fic.find("b", 0))); @@ -640,28 +535,27 @@ TEST_F("requireThatMultiplePostingListsCanExist", Fixture) EXPECT_TRUE(assertPostingList("[]", fic.find("c", 0))); } 
-TEST_F("requireThatRemoveWorks", Fixture) +TEST_F(FieldIndexTest, require_that_remove_works) { - FieldIndexCollection fic(f.getSchema()); - WrapInserter(fic, 0).word("a").remove(10).flush(); - EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0))); - WrapInserter(fic, 0).add(10).add(20).add(30).flush(); - EXPECT_TRUE(assertPostingList("[10,20,30]", fic.find("a", 0))); - WrapInserter(fic, 0).rewind().word("a").remove(10).flush(); - EXPECT_TRUE(assertPostingList("[20,30]", fic.find("a", 0))); - WrapInserter(fic, 0).remove(20).flush(); - EXPECT_TRUE(assertPostingList("[30]", fic.find("a", 0))); - WrapInserter(fic, 0).remove(30).flush(); - EXPECT_TRUE(assertPostingList("[]", fic.find("a", 0))); - EXPECT_EQUAL(1u, fic.getNumUniqueWords()); - MyDrainRemoves(fic, 0).drain(10); - WrapInserter(fic, 0).rewind().word("a").add(10).flush(); - EXPECT_TRUE(assertPostingList("[10]", fic.find("a", 0))); + WrapInserter(idx).word("a").remove(10).flush(); + EXPECT_TRUE(assertPostingList("[]", idx.find("a"))); + WrapInserter(idx).add(10).add(20).add(30).flush(); + EXPECT_TRUE(assertPostingList("[10,20,30]", idx.find("a"))); + WrapInserter(idx).rewind().word("a").remove(10).flush(); + EXPECT_TRUE(assertPostingList("[20,30]", idx.find("a"))); + WrapInserter(idx).remove(20).flush(); + EXPECT_TRUE(assertPostingList("[30]", idx.find("a"))); + WrapInserter(idx).remove(30).flush(); + EXPECT_TRUE(assertPostingList("[]", idx.find("a"))); + EXPECT_EQ(1u, idx.getNumUniqueWords()); + MyDrainRemoves(idx).drain(10); + WrapInserter(idx).rewind().word("a").add(10).flush(); + EXPECT_TRUE(assertPostingList("[10]", idx.find("a"))); } -TEST_F("requireThatMultipleInsertAndRemoveWorks", Fixture) +TEST_F(FieldIndexCollectionTest, require_that_multiple_insert_and_remove_works) { - MyInserter inserter(f.getSchema()); + MyInserter inserter(schema); uint32_t numFields = 4; for (uint32_t fi = 0; fi < numFields; ++fi) { inserter.setNextField(fi); @@ -671,8 +565,8 @@ 
TEST_F("requireThatMultipleInsertAndRemoveWorks", Fixture) for (uint32_t di = 0; di < (uint32_t) w; ++di) { // insert inserter.add(di * 3); } - EXPECT_EQUAL((w - 'a' + 1u) + ('z' - 'a' +1u) * fi, - inserter.getNumUniqueWords()); + EXPECT_EQ((w - 'a' + 1u) + ('z' - 'a' +1u) * fi, + inserter.getNumUniqueWords()); } } EXPECT_TRUE(inserter.assertPostings()); @@ -707,12 +601,10 @@ addElement(DocIdAndFeatures &f, uint32_t numOccs, int32_t weight = 1) { - f._elements.push_back(WordDocElementFeatures(f._elements.size())); - f._elements.back().setElementLen(elemLen); - f._elements.back().setWeight(weight); - f._elements.back().setNumOccs(numOccs); + f.elements().emplace_back(f.elements().size(), weight, elemLen); + f.elements().back().setNumOccs(numOccs); for (uint32_t i = 0; i < numOccs; ++i) { - f._wordPositions.push_back(WordDocElementWordPosFeatures(i)); + f.word_positions().emplace_back(i); } } @@ -724,9 +616,8 @@ getFeatures(uint32_t elemLen, uint32_t numOccs, int32_t weight = 1) return f; } -TEST_F("requireThatFeaturesAreInPostingLists", Fixture) +TEST_F(FieldIndexCollectionTest, require_that_features_are_in_posting_lists) { - FieldIndexCollection fic(f.getSchema()); WrapInserter(fic, 0).word("a").add(1, getFeatures(4, 2)).flush(); EXPECT_TRUE(assertPostingList("[1{4:0,1}]", fic.find("a", 0), @@ -742,47 +633,9 @@ TEST_F("requireThatFeaturesAreInPostingLists", Fixture) featureStorePtr(fic, 1))); } -class Verifier : public SearchIteratorVerifier { -public: - Verifier(const Schema & schema); - ~Verifier(); - - SearchIterator::UP create(bool strict) const override { - (void) strict; - TermFieldMatchDataArray matchData; - matchData.add(&_tfmd); - return std::make_unique<PostingIterator>(_fieldIndexes.find("a", 0), featureStoreRef(_fieldIndexes, 0), 0, matchData); - } - -private: - mutable TermFieldMatchData _tfmd; - FieldIndexCollection _fieldIndexes; -}; - - -Verifier::Verifier(const Schema & schema) - : _tfmd(), - _fieldIndexes(schema) -{ - WrapInserter 
inserter(_fieldIndexes, 0); - inserter.word("a"); - for (uint32_t docId : getExpectedDocIds()) { - inserter.add(docId); - } - inserter.flush(); -} -Verifier::~Verifier() {} - -TEST_F("require that postingiterator conforms", Fixture) { - Verifier verifier(f.getSchema()); - verifier.verify(); - -} - -TEST_F("requireThatPostingIteratorIsWorking", Fixture) +TEST_F(FieldIndexTest, require_that_posting_iterator_is_working) { - FieldIndexCollection fic(f.getSchema()); - WrapInserter(fic, 0).word("a").add(10, getFeatures(4, 1)). + WrapInserter(idx).word("a").add(10, getFeatures(4, 1)). add(20, getFeatures(5, 2)). add(30, getFeatures(6, 1)). add(40, getFeatures(7, 2)).flush(); @@ -790,166 +643,166 @@ TEST_F("requireThatPostingIteratorIsWorking", Fixture) TermFieldMatchDataArray matchData; matchData.add(&tfmd); { - PostingIterator itr(fic.find("not", 0), - featureStoreRef(fic, 0), + PostingIterator itr(idx.find("not"), + idx.getFeatureStore(), 0, matchData); itr.initFullRange(); EXPECT_TRUE(itr.isAtEnd()); } { - PostingIterator itr(fic.find("a", 0), - featureStoreRef(fic, 0), + PostingIterator itr(idx.find("a"), + idx.getFeatureStore(), 0, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{4:0}", toString(tfmd.getIterator())); + EXPECT_EQ("{4:0}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(25)); - EXPECT_EQUAL(30u, itr.getDocId()); + EXPECT_EQ(30u, itr.getDocId()); itr.unpack(30); - EXPECT_EQUAL("{6:0}", toString(tfmd.getIterator())); + EXPECT_EQ("{6:0}", toString(tfmd.getIterator())); EXPECT_TRUE(itr.seek(40)); - EXPECT_EQUAL(40u, itr.getDocId()); + EXPECT_EQ(40u, itr.getDocId()); itr.unpack(40); - EXPECT_EQUAL("{7:0,1}", toString(tfmd.getIterator())); + EXPECT_EQ("{7:0,1}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(41)); EXPECT_TRUE(itr.isAtEnd()); } } -TEST_F("requireThatDumpingToIndexBuilderIsWorking", Fixture) +TEST_F(FieldIndexCollectionTest, 
require_that_basic_dumping_to_index_builder_is_working) { - { - MyBuilder b(f.getSchema()); - WordDocElementWordPosFeatures wpf; - b.startField(4); - b.startWord("a"); - b.startDocument(2); - b.startElement(0, 10, 20); - wpf.setWordPos(1); - b.addOcc(wpf); - wpf.setWordPos(3); - b.addOcc(wpf); - b.endElement(); - b.endDocument(); - b.endWord(); - b.endField(); - EXPECT_EQUAL("f=4[w=a[d=2[e=0,w=10,l=20[1,3]]]]", b.toStr()); - } - { - FieldIndexCollection fic(f.getSchema()); - MyBuilder b(f.getSchema()); - DocIdAndFeatures df; - WrapInserter(fic, 1).word("a").add(5, getFeatures(2, 1)). + MyBuilder b(schema); + WordDocElementWordPosFeatures wpf; + b.startField(4); + b.startWord("a"); + DocIdAndFeatures features; + features.set_doc_id(2); + features.elements().emplace_back(0, 10, 20); + features.elements().back().setNumOccs(2); + features.word_positions().emplace_back(1); + features.word_positions().emplace_back(3); + b.add_document(features); + b.endWord(); + b.endField(); + EXPECT_EQ("f=4[w=a[d=2[e=0,w=10,l=20[1,3]]]]", b.toStr()); +} + +TEST_F(FieldIndexCollectionTest, require_that_dumping_of_multiple_fields_to_index_builder_is_working) +{ + MyBuilder b(schema); + DocIdAndFeatures df; + WrapInserter(fic, 1).word("a").add(5, getFeatures(2, 1)). add(7, getFeatures(3, 2)). 
word("b").add(5, getFeatures(12, 2)).flush(); - df = getFeatures(4, 1); - addElement(df, 5, 2); - WrapInserter(fic, 2).word("a").add(5, df); - df = getFeatures(6, 1); - addElement(df, 7, 2); - WrapInserter(fic, 2).add(7, df).flush(); - - df = getFeatures(8, 1, 12); - addElement(df, 9, 2, 13); - WrapInserter(fic, 3).word("a").add(5, df); - df = getFeatures(10, 1, 14); - addElement(df, 11, 2, 15); - WrapInserter(fic, 3).add(7, df).flush(); - - fic.dump(b); + df = getFeatures(4, 1); + addElement(df, 5, 2); + WrapInserter(fic, 2).word("a").add(5, df); + df = getFeatures(6, 1); + addElement(df, 7, 2); + WrapInserter(fic, 2).add(7, df).flush(); + + df = getFeatures(8, 1, 12); + addElement(df, 9, 2, 13); + WrapInserter(fic, 3).word("a").add(5, df); + df = getFeatures(10, 1, 14); + addElement(df, 11, 2, 15); + WrapInserter(fic, 3).add(7, df).flush(); + + fic.dump(b); + + EXPECT_EQ("f=0[]," + "f=1[w=a[d=5[e=0,w=1,l=2[0]],d=7[e=0,w=1,l=3[0,1]]]," + "w=b[d=5[e=0,w=1,l=12[0,1]]]]," + "f=2[w=a[d=5[e=0,w=1,l=4[0],e=1,w=1,l=5[0,1]]," + "d=7[e=0,w=1,l=6[0],e=1,w=1,l=7[0,1]]]]," + "f=3[w=a[d=5[e=0,w=12,l=8[0],e=1,w=13,l=9[0,1]]," + "d=7[e=0,w=14,l=10[0],e=1,w=15,l=11[0,1]]]]", + b.toStr()); +} - EXPECT_EQUAL("f=0[]," - "f=1[w=a[d=5[e=0,w=1,l=2[0]],d=7[e=0,w=1,l=3[0,1]]]," - "w=b[d=5[e=0,w=1,l=12[0,1]]]]," - "f=2[w=a[d=5[e=0,w=1,l=4[0],e=1,w=1,l=5[0,1]]," - "d=7[e=0,w=1,l=6[0],e=1,w=1,l=7[0,1]]]]," - "f=3[w=a[d=5[e=0,w=12,l=8[0],e=1,w=13,l=9[0,1]]," - "d=7[e=0,w=14,l=10[0],e=1,w=15,l=11[0,1]]]]", - b.toStr()); - } - { // test word with no docs - FieldIndexCollection fic(f.getSchema()); - WrapInserter(fic, 0).word("a").add(2, getFeatures(2, 1)). +TEST_F(FieldIndexCollectionTest, require_that_dumping_words_with_no_docs_to_index_builder_is_working) +{ + WrapInserter(fic, 0).word("a").add(2, getFeatures(2, 1)). word("b").add(4, getFeatures(4, 1)).flush().rewind(). 
word("a").remove(2).flush(); - { - MyBuilder b(f.getSchema()); - fic.dump(b); - EXPECT_EQUAL("f=0[w=b[d=4[e=0,w=1,l=4[0]]]],f=1[],f=2[],f=3[]", - b.toStr()); - } - { - search::diskindex::IndexBuilder b(f.getSchema()); - b.setPrefix("dump"); - TuneFileIndexing tuneFileIndexing; - DummyFileHeaderContext fileHeaderContext; - b.open(5, 2, tuneFileIndexing, fileHeaderContext); - fic.dump(b); - b.close(); - } + { + MyBuilder b(schema); + fic.dump(b); + EXPECT_EQ("f=0[w=b[d=4[e=0,w=1,l=4[0]]]],f=1[],f=2[],f=3[]", + b.toStr()); + } + { + search::diskindex::IndexBuilder b(schema); + b.setPrefix("dump"); + TuneFileIndexing tuneFileIndexing; + DummyFileHeaderContext fileHeaderContext; + b.open(5, 2, tuneFileIndexing, fileHeaderContext); + fic.dump(b); + b.close(); } } - -template <typename FixtureBase> -class FieldIndexFixture : public FixtureBase -{ +class InverterTest : public ::testing::Test { public: - using FixtureBase::getSchema; + Schema _schema; FieldIndexCollection _fic; DocBuilder _b; SequencedTaskExecutor _invertThreads; SequencedTaskExecutor _pushThreads; DocumentInverter _inv; - FieldIndexFixture() - : FixtureBase(), - _fic(getSchema()), - _b(getSchema()), + InverterTest(const Schema& schema) + : _schema(schema), + _fic(_schema), + _b(_schema), _invertThreads(2), _pushThreads(2), - _inv(getSchema(), _invertThreads, _pushThreads) + _inv(_schema, _invertThreads, _pushThreads) { } }; +class BasicInverterTest : public InverterTest { +public: + BasicInverterTest() : InverterTest(make_multi_field_schema()) {} +}; -TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) +TEST_F(BasicInverterTest, require_that_inversion_is_working) { Document::UP doc; - f._b.startDocument("doc::10"); - f._b.startIndexField("f0"). + _b.startDocument("doc::10"); + _b.startIndexField("f0"). addStr("a").addStr("b").addStr("c").addStr("d"). 
endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(10, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - f._b.startDocument("doc::20"); - f._b.startIndexField("f0"). + doc = _b.endDocument(); + _inv.invertDocument(10, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + _b.startDocument("doc::20"); + _b.startIndexField("f0"). addStr("a").addStr("a").addStr("b").addStr("c").addStr("d"). endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(20, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - f._b.startDocument("doc::30"); - f._b.startIndexField("f0"). + doc = _b.endDocument(); + _inv.invertDocument(20, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + _b.startDocument("doc::30"); + _b.startIndexField("f0"). addStr("a").addStr("b").addStr("c").addStr("d"). addStr("e").addStr("f"). endField(); - f._b.startIndexField("f1"). + _b.startIndexField("f1"). addStr("\nw2").addStr("w").addStr("x"). addStr("\nw3").addStr("y").addStr("z"). endField(); - f._b.startIndexField("f2"). + _b.startIndexField("f2"). startElement(4). addStr("w").addStr("x"). endElement(). @@ -957,7 +810,7 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) addStr("y").addStr("z"). endElement(). endField(); - f._b.startIndexField("f3"). + _b.startIndexField("f3"). startElement(6). addStr("w").addStr("x"). endElement(). @@ -965,56 +818,56 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) addStr("y").addStr("z"). endElement(). endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(30, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - f._b.startDocument("doc::40"); - f._b.startIndexField("f0"). 
+ doc = _b.endDocument(); + _inv.invertDocument(30, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + _b.startDocument("doc::40"); + _b.startIndexField("f0"). addStr("a").addStr("a").addStr("b").addStr("c").addStr("a"). addStr("e").addStr("f"). endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(40, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - f._b.startDocument("doc::999"); - f._b.startIndexField("f0"). + doc = _b.endDocument(); + _inv.invertDocument(40, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + _b.startDocument("doc::999"); + _b.startIndexField("f0"). addStr("this").addStr("is").addStr("_a_").addStr("test"). addStr("for").addStr("insertion").addStr("speed").addStr("with"). addStr("more").addStr("than").addStr("just").addStr("__a__"). addStr("few").addStr("words").addStr("present").addStr("in"). addStr("some").addStr("of").addStr("the").addStr("fields"). endField(); - f._b.startIndexField("f1"). + _b.startIndexField("f1"). addStr("the").addStr("other").addStr("field").addStr("also"). addStr("has").addStr("some").addStr("content"). endField(); - f._b.startIndexField("f2"). + _b.startIndexField("f2"). startElement(1). addStr("strange").addStr("things").addStr("here"). addStr("has").addStr("some").addStr("content"). endElement(). endField(); - f._b.startIndexField("f3"). + _b.startIndexField("f3"). startElement(3). addStr("not").addStr("a").addStr("weighty").addStr("argument"). endElement(). 
endField(); - doc = f._b.endDocument(); + doc = _b.endDocument(); for (uint32_t docId = 10000; docId < 20000; ++docId) { - f._inv.invertDocument(docId, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); + _inv.invertDocument(docId, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); } - f._pushThreads.sync(); - DataStoreBase::MemStats beforeStats = getFeatureStoreMemStats(f._fic); + _pushThreads.sync(); + DataStoreBase::MemStats beforeStats = getFeatureStoreMemStats(_fic); LOG(info, "Before feature compaction: allocElems=%zu, usedElems=%zu" ", deadElems=%zu, holdElems=%zu" @@ -1027,14 +880,14 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) beforeStats._freeBuffers, beforeStats._activeBuffers, beforeStats._holdBuffers); - myCompactFeatures(f._fic, f._pushThreads); + myCompactFeatures(_fic, _pushThreads); std::vector<std::unique_ptr<GenerationHandler::Guard>> guards; - for (auto &fieldIndex : f._fic.getFieldIndexes()) { + for (auto &fieldIndex : _fic.getFieldIndexes()) { guards.push_back(std::make_unique<GenerationHandler::Guard> (fieldIndex->takeGenerationGuard())); } - myCommit(f._fic, f._pushThreads); - DataStoreBase::MemStats duringStats = getFeatureStoreMemStats(f._fic); + myCommit(_fic, _pushThreads); + DataStoreBase::MemStats duringStats = getFeatureStoreMemStats(_fic); LOG(info, "During feature compaction: allocElems=%zu, usedElems=%zu" ", deadElems=%zu, holdElems=%zu" @@ -1048,8 +901,8 @@ TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) duringStats._activeBuffers, duringStats._holdBuffers); guards.clear(); - myCommit(f._fic, f._pushThreads); - DataStoreBase::MemStats afterStats = getFeatureStoreMemStats(f._fic); + myCommit(_fic, _pushThreads); + DataStoreBase::MemStats afterStats = getFeatureStoreMemStats(_fic); LOG(info, "After feature compaction: allocElems=%zu, usedElems=%zu" ", deadElems=%zu, holdElems=%zu" @@ -1067,116 +920,115 @@ 
TEST_F("requireThatInversionIsWorking", FieldIndexFixture<Fixture>) TermFieldMatchDataArray matchData; matchData.add(&tfmd); { - PostingIterator itr(f._fic.findFrozen("not", 0), featureStoreRef(f._fic, 0), 0, matchData); + PostingIterator itr(_fic.findFrozen("not", 0), featureStoreRef(_fic, 0), 0, matchData); itr.initFullRange(); EXPECT_TRUE(itr.isAtEnd()); } { - PostingIterator itr(f._fic.findFrozen("a", 0), featureStoreRef(f._fic, 0), 0, matchData); + PostingIterator itr(_fic.findFrozen("a", 0), featureStoreRef(_fic, 0), 0, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{4:0}", toString(tfmd.getIterator())); + EXPECT_EQ("{4:0}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(25)); - EXPECT_EQUAL(30u, itr.getDocId()); + EXPECT_EQ(30u, itr.getDocId()); itr.unpack(30); - EXPECT_EQUAL("{6:0}", toString(tfmd.getIterator())); + EXPECT_EQ("{6:0}", toString(tfmd.getIterator())); EXPECT_TRUE(itr.seek(40)); - EXPECT_EQUAL(40u, itr.getDocId()); + EXPECT_EQ(40u, itr.getDocId()); itr.unpack(40); - EXPECT_EQUAL("{7:0,1,4}", toString(tfmd.getIterator())); + EXPECT_EQ("{7:0,1,4}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(41)); EXPECT_TRUE(itr.isAtEnd()); } { - PostingIterator itr(f._fic.findFrozen("x", 0), featureStoreRef(f._fic, 0), 0, matchData); + PostingIterator itr(_fic.findFrozen("x", 0), featureStoreRef(_fic, 0), 0, matchData); itr.initFullRange(); EXPECT_TRUE(itr.isAtEnd()); } { - PostingIterator itr(f._fic.findFrozen("x", 1), featureStoreRef(f._fic, 1), 1, matchData); + PostingIterator itr(_fic.findFrozen("x", 1), featureStoreRef(_fic, 1), 1, matchData); itr.initFullRange(); - EXPECT_EQUAL(30u, itr.getDocId()); + EXPECT_EQ(30u, itr.getDocId()); itr.unpack(30); - EXPECT_EQUAL("{6:2[e=0,w=1,l=6]}", toString(tfmd.getIterator(), true, true)); + EXPECT_EQ("{6:2[e=0,w=1,l=6]}", toString(tfmd.getIterator(), true, true)); } { - PostingIterator itr(f._fic.findFrozen("x", 
2), featureStoreRef(f._fic, 2), 2, matchData); + PostingIterator itr(_fic.findFrozen("x", 2), featureStoreRef(_fic, 2), 2, matchData); itr.initFullRange(); - EXPECT_EQUAL(30u, itr.getDocId()); + EXPECT_EQ(30u, itr.getDocId()); itr.unpack(30); // weight is hardcoded to 1 for new style il doc array field - EXPECT_EQUAL("{2:1[e=0,w=1,l=2]}", toString(tfmd.getIterator(), true, true)); + EXPECT_EQ("{2:1[e=0,w=1,l=2]}", toString(tfmd.getIterator(), true, true)); } { - PostingIterator itr(f._fic.findFrozen("x", 3), featureStoreRef(f._fic, 3), 3, matchData); + PostingIterator itr(_fic.findFrozen("x", 3), featureStoreRef(_fic, 3), 3, matchData); itr.initFullRange(); - EXPECT_EQUAL(30u, itr.getDocId()); + EXPECT_EQ(30u, itr.getDocId()); itr.unpack(30); - EXPECT_EQUAL("{2:1[e=0,w=6,l=2]}", - toString(tfmd.getIterator(), true, true)); + EXPECT_EQ("{2:1[e=0,w=6,l=2]}", + toString(tfmd.getIterator(), true, true)); } } -TEST_F("requireThatInverterHandlesRemoveViaDocumentRemover", - FieldIndexFixture<Fixture>) +TEST_F(BasicInverterTest, require_that_inverter_handles_remove_via_document_remover) { Document::UP doc; - f._b.startDocument("doc::1"); - f._b.startIndexField("f0").addStr("a").addStr("b").endField(); - f._b.startIndexField("f1").addStr("a").addStr("c").endField(); - Document::UP doc1 = f._b.endDocument(); - f._inv.invertDocument(1, *doc1.get()); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - f._b.startDocument("doc::2"); - f._b.startIndexField("f0").addStr("b").addStr("c").endField(); - Document::UP doc2 = f._b.endDocument(); - f._inv.invertDocument(2, *doc2.get()); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); - f._pushThreads.sync(); - - EXPECT_TRUE(assertPostingList("[1]", f._fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[1,2]", f._fic.find("b", 0))); - EXPECT_TRUE(assertPostingList("[2]", f._fic.find("c", 0))); - EXPECT_TRUE(assertPostingList("[1]", f._fic.find("a", 1))); - 
EXPECT_TRUE(assertPostingList("[1]", f._fic.find("c", 1))); - - myremove(1, f._inv, f._fic, f._invertThreads); - f._pushThreads.sync(); - - EXPECT_TRUE(assertPostingList("[]", f._fic.find("a", 0))); - EXPECT_TRUE(assertPostingList("[2]", f._fic.find("b", 0))); - EXPECT_TRUE(assertPostingList("[2]", f._fic.find("c", 0))); - EXPECT_TRUE(assertPostingList("[]", f._fic.find("a", 1))); - EXPECT_TRUE(assertPostingList("[]", f._fic.find("c", 1))); + _b.startDocument("doc::1"); + _b.startIndexField("f0").addStr("a").addStr("b").endField(); + _b.startIndexField("f1").addStr("a").addStr("c").endField(); + Document::UP doc1 = _b.endDocument(); + _inv.invertDocument(1, *doc1.get()); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + _b.startDocument("doc::2"); + _b.startIndexField("f0").addStr("b").addStr("c").endField(); + Document::UP doc2 = _b.endDocument(); + _inv.invertDocument(2, *doc2.get()); + _invertThreads.sync(); + myPushDocument(_inv, _fic); + _pushThreads.sync(); + + EXPECT_TRUE(assertPostingList("[1]", _fic.find("a", 0))); + EXPECT_TRUE(assertPostingList("[1,2]", _fic.find("b", 0))); + EXPECT_TRUE(assertPostingList("[2]", _fic.find("c", 0))); + EXPECT_TRUE(assertPostingList("[1]", _fic.find("a", 1))); + EXPECT_TRUE(assertPostingList("[1]", _fic.find("c", 1))); + + myremove(1, _inv, _fic, _invertThreads); + _pushThreads.sync(); + + EXPECT_TRUE(assertPostingList("[]", _fic.find("a", 0))); + EXPECT_TRUE(assertPostingList("[2]", _fic.find("b", 0))); + EXPECT_TRUE(assertPostingList("[2]", _fic.find("c", 0))); + EXPECT_TRUE(assertPostingList("[]", _fic.find("a", 1))); + EXPECT_TRUE(assertPostingList("[]", _fic.find("c", 1))); } -class UriFixture +Schema +make_uri_schema() { + Schema result; + result.addUriIndexFields(Schema::IndexField("iu", DataType::STRING)); + result.addUriIndexFields(Schema::IndexField("iau", DataType::STRING, CollectionType::ARRAY)); + result.addUriIndexFields(Schema::IndexField("iwu", DataType::STRING, 
CollectionType::WEIGHTEDSET)); + return result; +} + +class UriInverterTest : public InverterTest { public: - Schema _schema; - UriFixture() - : _schema() - { - _schema.addUriIndexFields(Schema::IndexField("iu", DataType::STRING)); - _schema.addUriIndexFields(Schema::IndexField("iau", DataType::STRING, CollectionType::ARRAY)); - _schema.addUriIndexFields(Schema::IndexField("iwu", DataType::STRING, CollectionType::WEIGHTEDSET)); - } - const Schema & getSchema() const { return _schema; } + UriInverterTest() : InverterTest(make_uri_schema()) {} }; - -TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>) +TEST_F(UriInverterTest, require_that_uri_indexing_is_working) { Document::UP doc; - f._b.startDocument("doc::10"); - f._b.startIndexField("iu"). + _b.startDocument("doc::10"); + _b.startIndexField("iu"). startSubField("all"). addUrlTokenizedString("http://www.example.com:81/fluke?ab=2#4"). endSubField(). @@ -1199,7 +1051,7 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>) addUrlTokenizedString("4"). endSubField(). endField(); - f._b.startIndexField("iau"). + _b.startIndexField("iau"). startElement(1). startSubField("all"). addUrlTokenizedString("http://www.example.com:82/fluke?ab=2#8"). @@ -1247,7 +1099,7 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>) endSubField(). endElement(). endField(); - f._b.startIndexField("iwu"). + _b.startIndexField("iwu"). startElement(4). startSubField("all"). addUrlTokenizedString("http://www.example.com:83/fluke?ab=2#12"). @@ -1295,141 +1147,131 @@ TEST_F("requireThatUriIndexingIsWorking", FieldIndexFixture<UriFixture>) endSubField(). endElement(). 
endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(10, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); + doc = _b.endDocument(); + _inv.invertDocument(10, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); - f._pushThreads.sync(); + _pushThreads.sync(); TermFieldMatchData tfmd; TermFieldMatchDataArray matchData; matchData.add(&tfmd); { - uint32_t fieldId = f.getSchema().getIndexFieldId("iu"); - PostingIterator itr(f._fic.findFrozen("not", fieldId), - featureStoreRef(f._fic, fieldId), + uint32_t fieldId = _schema.getIndexFieldId("iu"); + PostingIterator itr(_fic.findFrozen("not", fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); EXPECT_TRUE(itr.isAtEnd()); } { - uint32_t fieldId = f.getSchema().getIndexFieldId("iu"); - PostingIterator itr(f._fic.findFrozen("example", fieldId), - featureStoreRef(f._fic, fieldId), + uint32_t fieldId = _schema.getIndexFieldId("iu"); + PostingIterator itr(_fic.findFrozen("example", fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{9:2}", toString(tfmd.getIterator())); + EXPECT_EQ("{9:2}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(25)); EXPECT_TRUE(itr.isAtEnd()); } { - uint32_t fieldId = f.getSchema().getIndexFieldId("iau"); - PostingIterator itr(f._fic.findFrozen("example", fieldId), - featureStoreRef(f._fic, fieldId), + uint32_t fieldId = _schema.getIndexFieldId("iau"); + PostingIterator itr(_fic.findFrozen("example", fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{9:2[e=0,l=9]}", - toString(tfmd.getIterator(), true, false)); + EXPECT_EQ("{9:2[e=0,l=9]}", + toString(tfmd.getIterator(), true, false)); EXPECT_TRUE(!itr.seek(25)); EXPECT_TRUE(itr.isAtEnd()); } { - 
uint32_t fieldId = f.getSchema().getIndexFieldId("iwu"); - PostingIterator itr(f._fic.findFrozen("example", fieldId), - featureStoreRef(f._fic, fieldId), + uint32_t fieldId = _schema.getIndexFieldId("iwu"); + PostingIterator itr(_fic.findFrozen("example", fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{9:2[e=0,w=4,l=9]}", - toString(tfmd.getIterator(), true, true)); + EXPECT_EQ("{9:2[e=0,w=4,l=9]}", + toString(tfmd.getIterator(), true, true)); EXPECT_TRUE(!itr.seek(25)); EXPECT_TRUE(itr.isAtEnd()); } { - search::diskindex::IndexBuilder dib(f.getSchema()); + search::diskindex::IndexBuilder dib(_schema); dib.setPrefix("urldump"); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - dib.open(11, f._fic.getNumUniqueWords(), tuneFileIndexing, + dib.open(11, _fic.getNumUniqueWords(), tuneFileIndexing, fileHeaderContext); - f._fic.dump(dib); + _fic.dump(dib); dib.close(); } } - -class SingleFieldFixture -{ +class CjkInverterTest : public InverterTest { public: - Schema _schema; - SingleFieldFixture() - : _schema() - { - _schema.addIndexField(Schema::IndexField("i", DataType::STRING)); - } - const Schema & getSchema() const { return _schema; } + CjkInverterTest() : InverterTest(make_single_field_schema()) {} }; -TEST_F("requireThatCjkIndexingIsWorking", FieldIndexFixture<SingleFieldFixture>) +TEST_F(CjkInverterTest, require_that_cjk_indexing_is_working) { Document::UP doc; - f._b.startDocument("doc::10"); - f._b.startIndexField("i"). + _b.startDocument("doc::10"); + _b.startIndexField("f0"). addStr("我就是那个"). setAutoSpace(false). addStr("大灰狼"). setAutoSpace(true). 
endField(); - doc = f._b.endDocument(); - f._inv.invertDocument(10, *doc); - f._invertThreads.sync(); - myPushDocument(f._inv, f._fic); + doc = _b.endDocument(); + _inv.invertDocument(10, *doc); + _invertThreads.sync(); + myPushDocument(_inv, _fic); - f._pushThreads.sync(); + _pushThreads.sync(); TermFieldMatchData tfmd; TermFieldMatchDataArray matchData; matchData.add(&tfmd); + uint32_t fieldId = _schema.getIndexFieldId("f0"); { - uint32_t fieldId = f.getSchema().getIndexFieldId("i"); - PostingIterator itr(f._fic.findFrozen("not", fieldId), - featureStoreRef(f._fic, fieldId), + PostingIterator itr(_fic.findFrozen("not", fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); EXPECT_TRUE(itr.isAtEnd()); } { - uint32_t fieldId = f.getSchema().getIndexFieldId("i"); - PostingIterator itr(f._fic.findFrozen("我就" + PostingIterator itr(_fic.findFrozen("我就" "是那个", fieldId), - featureStoreRef(f._fic, fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{2:0}", toString(tfmd.getIterator())); + EXPECT_EQ("{2:0}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(25)); EXPECT_TRUE(itr.isAtEnd()); } { - uint32_t fieldId = f.getSchema().getIndexFieldId("i"); - PostingIterator itr(f._fic.findFrozen("大灰" + PostingIterator itr(_fic.findFrozen("大灰" "狼", fieldId), - featureStoreRef(f._fic, fieldId), + featureStoreRef(_fic, fieldId), fieldId, matchData); itr.initFullRange(); - EXPECT_EQUAL(10u, itr.getDocId()); + EXPECT_EQ(10u, itr.getDocId()); itr.unpack(10); - EXPECT_EQUAL("{2:1}", toString(tfmd.getIterator())); + EXPECT_EQ("{2:1}", toString(tfmd.getIterator())); EXPECT_TRUE(!itr.seek(25)); EXPECT_TRUE(itr.isAtEnd()); } @@ -1441,80 +1283,74 @@ insertAndAssertTuple(const vespalib::string &word, uint32_t fieldId, uint32_t do { EntryRef wordRef = WrapInserter(dict, fieldId).rewind().word(word). 
add(docId).flush().getWordRef(); - EXPECT_EQUAL(word, - dict.getFieldIndex(fieldId)->getWordStore().getWord(wordRef)); + EXPECT_EQ(word, dict.getFieldIndex(fieldId)->getWordStore().getWord(wordRef)); MyDrainRemoves(dict, fieldId).drain(docId); } -TEST_F("require that insert tells which word ref that was inserted", Fixture) +TEST_F(FieldIndexCollectionTest, require_that_insert_tells_which_word_ref_that_was_inserted) { - FieldIndexCollection d(f.getSchema()); - insertAndAssertTuple("a", 1, 11, d); - insertAndAssertTuple("b", 1, 11, d); - insertAndAssertTuple("a", 2, 11, d); - - insertAndAssertTuple("a", 1, 22, d); - insertAndAssertTuple("b", 2, 22, d); - insertAndAssertTuple("c", 2, 22, d); + insertAndAssertTuple("a", 1, 11, fic); + insertAndAssertTuple("b", 1, 11, fic); + insertAndAssertTuple("a", 2, 11, fic); + + insertAndAssertTuple("a", 1, 22, fic); + insertAndAssertTuple("b", 2, 22, fic); + insertAndAssertTuple("c", 2, 22, fic); } -struct RemoverFixture : public Fixture -{ - FieldIndexCollection _fic; +struct RemoverTest : public FieldIndexCollectionTest { SequencedTaskExecutor _invertThreads; SequencedTaskExecutor _pushThreads; - RemoverFixture() - : - Fixture(), - _fic(getSchema()), - _invertThreads(2), - _pushThreads(2) + RemoverTest() + : FieldIndexCollectionTest(), + _invertThreads(2), + _pushThreads(2) { } void assertPostingLists(const vespalib::string &e1, const vespalib::string &e2, const vespalib::string &e3) { - EXPECT_TRUE(assertPostingList(e1, _fic.find("a", 1))); - EXPECT_TRUE(assertPostingList(e2, _fic.find("a", 2))); - EXPECT_TRUE(assertPostingList(e3, _fic.find("b", 1))); + EXPECT_TRUE(assertPostingList(e1, fic.find("a", 1))); + EXPECT_TRUE(assertPostingList(e2, fic.find("a", 2))); + EXPECT_TRUE(assertPostingList(e3, fic.find("b", 1))); } void remove(uint32_t docId) { - DocumentInverter inv(getSchema(), _invertThreads, _pushThreads); - myremove(docId, inv, _fic, _invertThreads); + DocumentInverter inv(schema, _invertThreads, _pushThreads); + 
myremove(docId, inv, fic, _invertThreads); _pushThreads.sync(); - EXPECT_FALSE(_fic.getFieldIndex(0u)->getDocumentRemover(). + EXPECT_FALSE(fic.getFieldIndex(0u)->getDocumentRemover(). getStore().get(docId).valid()); } }; -TEST_F("require that document remover can remove several documents", RemoverFixture) +TEST_F(RemoverTest, require_that_document_remover_can_remove_several_documents) { - WrapInserter(f._fic, 1).word("a").add(11).add(13).add(15). - word("b").add(11).add(15).flush(); - WrapInserter(f._fic, 2).word("a").add(11).add(13).flush(); - f.assertPostingLists("[11,13,15]", "[11,13]", "[11,15]"); + WrapInserter(fic, 1).word("a").add(11).add(13).add(15). + word("b").add(11).add(15).flush(); + WrapInserter(fic, 2).word("a").add(11).add(13).flush(); + assertPostingLists("[11,13,15]", "[11,13]", "[11,15]"); - f.remove(13); - f.assertPostingLists("[11,15]", "[11]", "[11,15]"); + remove(13); + assertPostingLists("[11,15]", "[11]", "[11,15]"); - f.remove(11); - f.assertPostingLists("[15]", "[]", "[15]"); + remove(11); + assertPostingLists("[15]", "[]", "[15]"); - f.remove(15); - f.assertPostingLists("[]", "[]", "[]"); + remove(15); + assertPostingLists("[]", "[]", "[]"); } -TEST_F("require that removal of non-existing document does not do anything", RemoverFixture) +TEST_F(RemoverTest, require_that_removal_of_non_existing_document_does_not_do_anything) { - WrapInserter(f._fic, 1).word("a").add(11).word("b").add(11).flush(); - WrapInserter(f._fic, 2).word("a").add(11).flush(); - f.assertPostingLists("[11]", "[11]", "[11]"); - f.remove(13); - f.assertPostingLists("[11]", "[11]", "[11]"); + WrapInserter(fic, 1).word("a").add(11).word("b").add(11).flush(); + WrapInserter(fic, 2).word("a").add(11).flush(); + assertPostingLists("[11]", "[11]", "[11]"); + remove(13); + assertPostingLists("[11]", "[11]", "[11]"); } -} // namespace memoryindex -} // namespace search +} +} -TEST_MAIN() { TEST_RUN_ALL(); } +GTEST_MAIN_RUN_ALL_TESTS() diff --git 
a/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp b/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp index 9f47dede46a..1d80fec720a 100644 --- a/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp +++ b/searchlib/src/tests/tensor/dense_tensor_store/dense_tensor_store_test.cpp @@ -60,34 +60,6 @@ TEST_F("require that we can store 1d bound tensor", Fixture("tensor(x[3])")) add({{"x", 2}}, 5)); } -TEST_F("require that we can store 1d un-bound tensor", Fixture("tensor(x[])")) -{ - f.assertSetAndGetTensor(TensorSpec("tensor(x[3])"). - add({{"x", 0}}, 2). - add({{"x", 1}}, 3). - add({{"x", 2}}, 5)); -} - -TEST_F("require that un-bound dimension is concrete in returned 2d tensor", Fixture("tensor(x[3],y[])")) -{ - f.assertSetAndGetTensor(TensorSpec("tensor(x[3],y[2])"). - add({{"x", 0}, {"y", 0}}, 2). - add({{"x", 0}, {"y", 1}}, 3). - add({{"x", 1}, {"y", 0}}, 5). - add({{"x", 1}, {"y", 1}}, 7). - add({{"x", 2}, {"y", 0}}, 11). - add({{"x", 2}, {"y", 1}}, 13)); -} - -TEST_F("require that un-bound dimensions are concrete in returned 3d tensor", Fixture("tensor(x[],y[2],z[])")) -{ - f.assertSetAndGetTensor(TensorSpec("tensor(x[1],y[2],z[2])"). - add({{"x", 0}, {"y", 0}, {"z", 0}}, 2). - add({{"x", 0}, {"y", 0}, {"z", 1}}, 3). - add({{"x", 0}, {"y", 1}, {"z", 0}}, 5). - add({{"x", 0}, {"y", 1}, {"z", 1}}, 7)); -} - TEST_F("require that correct empty tensor is returned for 1d bound tensor", Fixture("tensor(x[3])")) { f.assertEmptyTensor(TensorSpec("tensor(x[3])"). @@ -96,21 +68,6 @@ TEST_F("require that correct empty tensor is returned for 1d bound tensor", Fixt add({{"x", 2}}, 0)); } -TEST_F("require that empty 2d tensor has size 1 in un-bound dimension", Fixture("tensor(x[3],y[])")) -{ - f.assertEmptyTensor(TensorSpec("tensor(x[3],y[1])"). - add({{"x", 0}, {"y", 0}}, 0). - add({{"x", 1}, {"y", 0}}, 0). 
- add({{"x", 2}, {"y", 0}}, 0)); -} - -TEST_F("require that empty 3d tensor has size 1 in un-bound dimensions", Fixture("tensor(x[],y[2],z[])")) -{ - f.assertEmptyTensor(TensorSpec("tensor(x[1],y[2],z[1])"). - add({{"x", 0}, {"y", 0}, {"z", 0}}, 0). - add({{"x", 0}, {"y", 1}, {"z", 0}}, 0)); -} - void assertArraySize(const vespalib::string &tensorType, uint32_t expArraySize) { Fixture f(tensorType); @@ -122,13 +79,7 @@ TEST("require that array size is calculated correctly") TEST_DO(assertArraySize("tensor(x[1])", 32)); TEST_DO(assertArraySize("tensor(x[10])", 96)); TEST_DO(assertArraySize("tensor(x[3])", 32)); - TEST_DO(assertArraySize("tensor(x[3],y[])", 32)); - TEST_DO(assertArraySize("tensor(x[3],y[],z[])", 32)); - TEST_DO(assertArraySize("tensor(x[3],y[],z[],z2[])", 64)); TEST_DO(assertArraySize("tensor(x[10],y[10])", 800)); - TEST_DO(assertArraySize("tensor(x[])", 32)); - TEST_DO(assertArraySize("tensor(x[],x2[],x3[],x4[],x5[],x6[])", 32)); - TEST_DO(assertArraySize("tensor(x[],x2[],x3[],x4[],x5[],x6[],x7[])", 64)); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp index dfcdd991b22..83776d22fee 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.cpp @@ -133,10 +133,8 @@ vespalib::string noFeatures = "NoFeatures"; } -template <bool bigEndian> void -FeatureDecodeContext<bigEndian>:: -readBytes(uint8_t *buf, size_t len) +DecodeContext64Base::readBytes(uint8_t *buf, size_t len) { while (len > 0) { // Ensure that buffer to read from isn't empty @@ -167,9 +165,8 @@ readBytes(uint8_t *buf, size_t len) } -template <bool bigEndian> uint32_t -FeatureDecodeContext<bigEndian>:: +DecodeContext64Base:: readHeader(vespalib::GenericHeader &header, int64_t fileSize) { size_t hhSize = vespalib::GenericHeader::getMinSize(); diff --git 
a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index 67e23aabc1e..b9166f675aa 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -1136,16 +1136,18 @@ public: // File position for end of buffer minus byte address of end of buffer // minus sizeof uint64_t. Then shifted left by 3 to represent bits. uint64_t _fileReadBias; + search::ComprFileReadContext *_readContext; DecodeContext64Base() : search::ComprFileDecodeContext(), _valI(nullptr), - _valE(nullptr), + _valE(static_cast<const uint64_t *>(nullptr) - 1), _realValE(nullptr), _val(0), _cacheInt(0), _preRead(0), - _fileReadBias(0) + _fileReadBias(0), + _readContext(nullptr) { } @@ -1163,7 +1165,8 @@ public: _val(val), _cacheInt(cacheInt), _preRead(preRead), - _fileReadBias(0) + _fileReadBias(0), + _readContext(nullptr) { } @@ -1183,6 +1186,7 @@ public: _cacheInt = rhs._cacheInt; _preRead = rhs._preRead; _fileReadBias = rhs._fileReadBias; + _readContext = rhs._readContext; return *this; } @@ -1278,6 +1282,26 @@ public: return (val >> 1); } } + + void setReadContext(search::ComprFileReadContext *readContext) { + _readContext = readContext; + } + search::ComprFileReadContext *getReadContext() const { + return _readContext; + } + void readComprBuffer() { + _readContext->readComprBuffer(); + } + void readComprBufferIfNeeded() { + if (__builtin_expect(_valI >= _valE, false)) { + readComprBuffer(); + } + } + virtual uint64_t readBits(uint32_t length) = 0; + virtual void align(uint32_t alignment) = 0; + virtual uint64_t decode_exp_golomb(int k) = 0; + void readBytes(uint8_t *buf, size_t len); + uint32_t readHeader(vespalib::GenericHeader &header, int64_t fileSize); }; @@ -1299,7 +1323,7 @@ public: DecodeContext64(const uint64_t *compr, int bitOffset) : DecodeContext64Base(compr + 1, - nullptr, + static_cast<const uint64_t *>(nullptr) - 1, nullptr, 0, 
EC::bswap(*compr), @@ -1385,10 +1409,12 @@ public: }; void skipBits(int bits) override { + readComprBufferIfNeeded(); while (bits >= 64) { _val = 0; ReadBits(64, _val, _cacheInt, _preRead, _valI); bits -= 64; + readComprBufferIfNeeded(); } if (bits > 0) { if (bigEndian) { @@ -1397,6 +1423,7 @@ public: _val >>= bits; } ReadBits(bits, _val, _cacheInt, _preRead, _valI); + readComprBufferIfNeeded(); } } @@ -1436,7 +1463,7 @@ public: } uint64_t - readBits(uint32_t length) + readBits(uint32_t length) override { uint64_t res; if (length < 64) { @@ -1452,20 +1479,32 @@ public: _val = 0; } UC64_READBITS(_val, _valI, _preRead, _cacheInt, EC); + readComprBufferIfNeeded(); return res; } + uint64_t decode_exp_golomb(int k) override { + uint32_t length; + uint64_t val64; + UC64_DECODEEXPGOLOMB(_val, _valI, _preRead, _cacheInt, k, EC); + readComprBufferIfNeeded(); + return val64; + } + void - align(uint32_t alignment) + align(uint32_t alignment) override { + readComprBufferIfNeeded(); uint64_t pad = (- getReadOffset()) & (alignment - 1); while (pad > 64) { (void) readBits(64); pad -= 64; + readComprBufferIfNeeded(); } if (pad > 0) { (void) readBits(pad); } + readComprBufferIfNeeded(); } /* @@ -1489,7 +1528,6 @@ template <bool bigEndian> class FeatureDecodeContext : public DecodeContext64<bigEndian> { public: - search::ComprFileReadContext *_readContext; typedef DecodeContext64<bigEndian> ParentClass; typedef index::DocIdAndFeatures DocIdAndFeatures; typedef index::PostingListParams PostingListParams; @@ -1504,68 +1542,29 @@ public: using ParentClass::getBitOffset; using ParentClass::readBits; using ParentClass::ReadBits; + using ParentClass::readComprBuffer; + using ParentClass::readComprBufferIfNeeded; + using ParentClass::readHeader; + using ParentClass::readBytes; FeatureDecodeContext() - : ParentClass(), - _readContext(nullptr) + : ParentClass() { } FeatureDecodeContext(const uint64_t *compr, int bitOffset) - : ParentClass(compr, bitOffset), - _readContext(nullptr) + : 
ParentClass(compr, bitOffset) { } FeatureDecodeContext(const uint64_t *compr, int bitOffset, uint64_t bitLength) - : ParentClass(compr, bitOffset, bitLength), - _readContext(nullptr) - { - } - - FeatureDecodeContext & - operator=(const FeatureDecodeContext &rhs) - { - ParentClass::operator=(rhs); - _readContext = rhs._readContext; - return *this; - } - - void - setReadContext(search::ComprFileReadContext *readContext) - { - _readContext = readContext; - } - - search::ComprFileReadContext * - getReadContext() const - { - return _readContext; - } - - void - readComprBuffer() - { - _readContext->readComprBuffer(); - } - - void - readComprBufferIfNeeded() + : ParentClass(compr, bitOffset, bitLength) { - if (__builtin_expect(_valI >= _valE, false)) { - readComprBuffer(); - } } - void - readBytes(uint8_t *buf, size_t len); - - virtual uint32_t - readHeader(vespalib::GenericHeader &header, int64_t fileSize); - virtual void readHeader(const vespalib::GenericHeader &header, const vespalib::string &prefix); @@ -1594,41 +1593,6 @@ public: */ virtual void getParams(PostingListParams ¶ms) const; - - void skipBits(int bits) override { - readComprBufferIfNeeded(); - while (bits >= 64) { - _val = 0; - ReadBits(64, _val, _cacheInt, _preRead, _valI); - bits -= 64; - readComprBufferIfNeeded(); - } - if (bits > 0) { - if (bigEndian) { - _val <<= bits; - } else { - _val >>= bits; - } - ReadBits(bits, _val, _cacheInt, _preRead, _valI); - readComprBufferIfNeeded(); - } - } - - void - align(uint32_t alignment) - { - readComprBufferIfNeeded(); - uint64_t pad = (- getReadOffset()) & (alignment - 1); - while (pad > 64) { - (void) readBits(64); - pad -= 64; - readComprBufferIfNeeded(); - } - if (pad > 0) { - (void) readBits(pad); - } - readComprBufferIfNeeded(); - } }; typedef FeatureDecodeContext<true> FeatureDecodeContextBE; diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp index 
d4f663f32cc..9f5d3cf751f 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.cpp @@ -12,8 +12,6 @@ LOG_SETUP(".posocccompression"); using search::index::DocIdAndFeatures; -using search::index::WordDocElementFeatures; -using search::index::WordDocElementWordPosFeatures; using search::index::PostingListParams; using search::index::SchemaUtil; using search::index::Schema; @@ -343,8 +341,8 @@ readFeatures(search::index::DocIdAndFeatures &features) uint64_t val64; const uint64_t *valE = _valE; - features.clearFeatures((oPreRead == 0) ? 0 : 64 - oPreRead); - features.setRaw(true); + features.clear_features((oPreRead == 0) ? 0 : 64 - oPreRead); + features.set_has_raw_data(true); const uint64_t *rawFeatures = (oPreRead == 0) ? (oCompr - 1) : (oCompr - 2); uint64_t rawFeaturesStartBitPos = @@ -373,7 +371,7 @@ readFeatures(search::index::DocIdAndFeatures &features) } if (__builtin_expect(oCompr >= valE, false)) { while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } UC64_DECODECONTEXT_STORE(o, _); @@ -394,7 +392,7 @@ readFeatures(search::index::DocIdAndFeatures &features) do { if (__builtin_expect(oCompr >= valE, false)) { while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } UC64_DECODECONTEXT_STORE(o, _); @@ -410,7 +408,7 @@ readFeatures(search::index::DocIdAndFeatures &features) for (uint32_t pos = 1; pos < numPositions; ++pos) { if (__builtin_expect(oCompr >= valE, false)) { while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } UC64_DECODECONTEXT_STORE(o, _); @@ -429,9 +427,9 @@ readFeatures(search::index::DocIdAndFeatures &features) _fileReadBias + (reinterpret_cast<unsigned long>(oCompr) << 3) - oPreRead; - features._bitLength = 
rawFeaturesEndBitPos - rawFeaturesStartBitPos; + features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos); while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } if (__builtin_expect(oCompr >= valE, false)) { @@ -450,8 +448,8 @@ readFeatures(search::index::DocIdAndFeatures &features) uint64_t val64; const uint64_t *valE = _valE; - features.clearFeatures(); - features.setRaw(false); + features.clear_features(); + features.set_has_raw_data(false); const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0]; uint32_t numElements = 1; @@ -470,14 +468,13 @@ readFeatures(search::index::DocIdAndFeatures &features) EC); elementId += static_cast<uint32_t>(val64); } - features._elements. - push_back(WordDocElementFeatures(elementId)); + features.elements().emplace_back(elementId); if (fieldParams._hasElementWeights) { UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_ELEMENTWEIGHT, EC); int32_t elementWeight = this->convertToSigned(val64); - features._elements.back().setWeight(elementWeight); + features.elements().back().setWeight(elementWeight); } if (__builtin_expect(oCompr >= valE, false)) { UC64_DECODECONTEXT_STORE(o, _); @@ -489,7 +486,7 @@ readFeatures(search::index::DocIdAndFeatures &features) K_VALUE_POSOCC_ELEMENTLEN, EC); uint32_t elementLen = static_cast<uint32_t>(val64) + 1; - features._elements.back().setElementLen(elementLen); + features.elements().back().setElementLen(elementLen); UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_NUMPOSITIONS, EC); @@ -507,9 +504,8 @@ readFeatures(search::index::DocIdAndFeatures &features) K_VALUE_POSOCC_FIRST_WORDPOS, EC); wordPos = static_cast<uint32_t>(val64); - features._elements.back().incNumOccs(); - features._wordPositions.push_back( - WordDocElementWordPosFeatures(wordPos)); + features.elements().back().incNumOccs(); + features.word_positions().emplace_back(wordPos); } while (0); for (uint32_t pos = 1; pos < 
numPositions; ++pos) { if (__builtin_expect(oCompr >= valE, false)) { @@ -522,9 +518,8 @@ readFeatures(search::index::DocIdAndFeatures &features) K_VALUE_POSOCC_DELTA_WORDPOS, EC); wordPos += 1 + static_cast<uint32_t>(val64); - features._elements.back().incNumOccs(); - features._wordPositions.push_back( - WordDocElementWordPosFeatures(wordPos)); + features.elements().back().incNumOccs(); + features.word_positions().emplace_back(wordPos); } } UC64_DECODECONTEXT_STORE(o, _); @@ -732,23 +727,19 @@ void EG2PosOccEncodeContext<bigEndian>:: writeFeatures(const search::index::DocIdAndFeatures &features) { - if (features.getRaw()) { - writeBits(&features._blob[0], - features._bitOffset, features._bitLength); + if (features.has_raw_data()) { + writeBits(features.blob().data(), + features.bit_offset(), features.bit_length()); return; } - typedef WordDocElementFeatures Elements; - typedef WordDocElementWordPosFeatures Positions; - std::vector<Elements>::const_iterator element = features._elements.begin(); - - std::vector<Positions>::const_iterator position = - features._wordPositions.begin(); + auto element = features.elements().begin(); + auto position = features.word_positions().begin(); const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0]; - uint32_t numElements = features._elements.size(); + uint32_t numElements = features.elements().size(); if (fieldParams._hasElements) { assert(numElements > 0u); encodeExpGolomb(numElements - 1, @@ -854,8 +845,8 @@ readFeatures(search::index::DocIdAndFeatures &features) uint64_t val64; const uint64_t *valE = _valE; - features.clearFeatures((oPreRead == 0) ? 0 : 64 - oPreRead); - features.setRaw(true); + features.clear_features((oPreRead == 0) ? 0 : 64 - oPreRead); + features.set_has_raw_data(true); const uint64_t *rawFeatures = (oPreRead == 0) ? 
(oCompr - 1) : (oCompr - 2); uint64_t rawFeaturesStartBitPos = @@ -885,7 +876,7 @@ readFeatures(search::index::DocIdAndFeatures &features) } if (__builtin_expect(oCompr >= valE, false)) { while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } UC64_DECODECONTEXT_STORE(o, _); @@ -910,7 +901,7 @@ readFeatures(search::index::DocIdAndFeatures &features) for (uint32_t pos = 0; pos < numPositions; ++pos) { if (__builtin_expect(oCompr >= valE, false)) { while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } UC64_DECODECONTEXT_STORE(o, _); @@ -929,9 +920,9 @@ readFeatures(search::index::DocIdAndFeatures &features) _fileReadBias + (reinterpret_cast<unsigned long>(oCompr) << 3) - oPreRead; - features._bitLength = rawFeaturesEndBitPos - rawFeaturesStartBitPos; + features.set_bit_length(rawFeaturesEndBitPos - rawFeaturesStartBitPos); while (rawFeatures < oCompr) { - features._blob.push_back(*rawFeatures); + features.blob().push_back(*rawFeatures); ++rawFeatures; } if (__builtin_expect(oCompr >= valE, false)) { @@ -950,8 +941,8 @@ readFeatures(search::index::DocIdAndFeatures &features) uint64_t val64; const uint64_t *valE = _valE; - features.clearFeatures(); - features.setRaw(false); + features.clear_features(); + features.set_has_raw_data(false); const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0]; @@ -972,14 +963,13 @@ readFeatures(search::index::DocIdAndFeatures &features) EC); elementId += static_cast<uint32_t>(val64); } - features._elements. 
- push_back(WordDocElementFeatures(elementId)); + features.elements().emplace_back(elementId); if (fieldParams._hasElementWeights) { UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_ELEMENTWEIGHT, EC); int32_t elementWeight = this->convertToSigned(val64); - features._elements.back().setWeight(elementWeight); + features.elements().back().setWeight(elementWeight); } if (__builtin_expect(oCompr >= valE, false)) { UC64_DECODECONTEXT_STORE(o, _); @@ -991,13 +981,13 @@ readFeatures(search::index::DocIdAndFeatures &features) elementLenK, EC); uint32_t elementLen = static_cast<uint32_t>(val64) + 1; - features._elements.back().setElementLen(elementLen); + features.elements().back().setElementLen(elementLen); UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_POSOCC_NUMPOSITIONS, EC); uint32_t numPositions = static_cast<uint32_t>(val64) + 1; - features._bitLength = numPositions * 64; + features.set_bit_length(numPositions * 64); uint32_t wordPosK = EGPosOccEncodeContext<bigEndian>:: calcWordPosK(numPositions, elementLen); @@ -1014,9 +1004,8 @@ readFeatures(search::index::DocIdAndFeatures &features) wordPosK, EC); wordPos += 1 + static_cast<uint32_t>(val64); - features._elements.back().incNumOccs(); - features._wordPositions.push_back( - WordDocElementWordPosFeatures(wordPos)); + features.elements().back().incNumOccs(); + features.word_positions().emplace_back(wordPos); } } UC64_DECODECONTEXT_STORE(o, _); @@ -1227,23 +1216,19 @@ void EGPosOccEncodeContext<bigEndian>:: writeFeatures(const search::index::DocIdAndFeatures &features) { - if (features.getRaw()) { - writeBits(&features._blob[0], - features._bitOffset, features._bitLength); + if (features.has_raw_data()) { + writeBits(features.blob().data(), + features.bit_offset(), features.bit_length()); return; } - typedef WordDocElementFeatures Elements; - typedef WordDocElementWordPosFeatures Positions; - - std::vector<Elements>::const_iterator element = features._elements.begin(); - std::vector<Positions>::const_iterator position = - 
features._wordPositions.begin(); + auto element = features.elements().begin(); + auto position = features.word_positions().begin(); const PosOccFieldParams &fieldParams = _fieldsParams->getFieldParams()[0]; uint32_t elementLenK = calcElementLenK(fieldParams._avgElemLen); - uint32_t numElements = features._elements.size(); + uint32_t numElements = features.elements().size(); if (fieldParams._hasElements) { assert(numElements > 0u); encodeExpGolomb(numElements - 1, diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h index a5d46045ec5..d500dacd7d4 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/posocccompression.h @@ -48,9 +48,9 @@ public: assert(elementLen == _elements.back().getElementLen()); } assert(_elements.back().getNumOccs() == 0 || - wordPos > _wordPositions.back().getWordPos()); + wordPos > _word_positions.back().getWordPos()); _elements.back().incNumOccs(); - _wordPositions.emplace_back(wordPos); + _word_positions.emplace_back(wordPos); } }; diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index 104994ad038..2fea4f2bab7 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -19,6 +19,7 @@ vespa_add_library(searchlib_diskindex OBJECT pagedict4randread.cpp wordnummapper.cpp zc4_posting_header.cpp + zc4_posting_reader.cpp zc4_posting_writer.cpp zc4_posting_writer_base.cpp zcbuf.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index d71ddc2c2d6..64a54187254 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -39,7 +39,9 @@ DiskIndex::Key::Key() = default; DiskIndex::Key::Key(const 
IndexList & indexes, vespalib::stringref word) : _word(word), _indexes(indexes) -{ } +{ +} + DiskIndex::Key::~Key() = default; DiskIndex::DiskIndex(const vespalib::string &indexDir, size_t cacheSize) @@ -73,7 +75,6 @@ DiskIndex::loadSchema() return true; } - bool DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch) { @@ -91,7 +92,6 @@ DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch) return true; } - bool DiskIndex::openField(const vespalib::string &fieldDir, const TuneFileSearch &tuneFileSearch) @@ -147,7 +147,6 @@ DiskIndex::openField(const vespalib::string &fieldDir, return true; } - bool DiskIndex::setup(const TuneFileSearch &tuneFileSearch) { @@ -165,7 +164,6 @@ DiskIndex::setup(const TuneFileSearch &tuneFileSearch) return true; } - bool DiskIndex::setup(const TuneFileSearch &tuneFileSearch, const DiskIndex &old) @@ -315,7 +313,6 @@ DiskIndex::readPostingList(const LookupResult &lookupRes) const return handle; } - BitVector::UP DiskIndex::readBitVector(const LookupResult &lookupRes) const { @@ -327,7 +324,6 @@ DiskIndex::readBitVector(const LookupResult &lookupRes) const return dict->lookup(lookupRes.wordNum); } - void DiskIndex::calculateSize() { @@ -335,19 +331,18 @@ DiskIndex::calculateSize() _size = dirt.GetTreeSize(); } - namespace { DiskIndex::LookupResult _G_nothing; -class LookupCache -{ +class LookupCache { public: LookupCache(DiskIndex & diskIndex, const std::vector<uint32_t> & fieldIds) : _diskIndex(diskIndex), _fieldIds(fieldIds), _cache() - { } + { + } const DiskIndex::LookupResult & lookup(const vespalib::string & word, uint32_t fieldId) { Cache::const_iterator it = _cache.find(word); @@ -363,14 +358,14 @@ public: return _G_nothing; } private: + typedef vespalib::hash_map<vespalib::string, DiskIndex::LookupResultVector> Cache; DiskIndex & _diskIndex; const std::vector<uint32_t> & _fieldIds; Cache _cache; }; -class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper -{ +class CreateBlueprintVisitor : public 
CreateBlueprintVisitorHelper { private: LookupCache &_cache; DiskIndex &_diskIndex; @@ -391,8 +386,7 @@ public: } template <class TermNode> - void visitTerm(TermNode &n) - { + void visitTerm(TermNode &n) { const vespalib::string termStr = termAsString(n); const DiskIndex::LookupResult & lookupRes = _cache.lookup(termStr, _fieldId); if (lookupRes.valid()) { @@ -418,7 +412,6 @@ public: void visit(PredicateQuery &) override { } }; - Blueprint::UP createBlueprintHelper(LookupCache & cache, DiskIndex & diskIndex, const IRequestContext & requestContext, const FieldSpec &field, uint32_t fieldId, const Node &term) @@ -442,7 +435,6 @@ DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSp return createBlueprintHelper(cache, *this, requestContext, field, fieldIds[0], term); } - Blueprint::UP DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSpecList &fields, const Node &term) { diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.h b/searchlib/src/vespa/searchlib/diskindex/diskindex.h index 4bef53a3030..d83b2f56d7c 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.h +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.h @@ -12,14 +12,13 @@ namespace search::diskindex { /** - * This class represents a disk index with a common dictionary, and - * posting list files and bit vector files for each field. - * Parts of the disk dictionary and all bit vector - * dictionaries are loaded into memory during setup. All other files - * are just opened, ready for later access. - **/ -class DiskIndex : public queryeval::Searchable -{ + * This class represents a disk index that contains a set of field indexes that are independent of each other. + * + * Each field index has a dictionary, posting list files and bit vector files. + * Parts of the disk dictionary and all bit vector dictionaries are loaded into memory during setup. + * All other files are just opened, ready for later access. 
+ */ +class DiskIndex : public queryeval::Searchable { public: /** * The result after performing a disk dictionary lookup. @@ -60,11 +59,12 @@ public: vespalib::string _word; IndexList _indexes; }; + private: - typedef index::PostingListFileRandRead DiskPostingFile; - typedef Zc4PosOccRandRead DiskPostingFileReal; - typedef ZcPosOccRandRead DiskPostingFileDynamicKReal; - typedef vespalib::cache<vespalib::CacheParam<vespalib::LruParam<Key, LookupResultVector>, DiskIndex>> Cache; + using DiskPostingFile = index::PostingListFileRandRead; + using DiskPostingFileReal = Zc4PosOccRandRead; + using DiskPostingFileDynamicKReal = ZcPosOccRandRead; + using Cache = vespalib::cache<vespalib::CacheParam<vespalib::LruParam<Key, LookupResultVector>, DiskIndex>>; vespalib::string _indexDir; size_t _cacheSize; @@ -83,11 +83,11 @@ private: public: /** - * Create a view of the disk index located in the given directory - * described by the given schema. + * Create a view of the disk index located in the given directory. * * @param indexDir the directory where the disk index is located. - **/ + * @param cacheSize optional size (in bytes) of the disk dictionary lookup cache. + */ DiskIndex(const vespalib::string &indexDir, size_t cacheSize=0); ~DiskIndex(); @@ -95,29 +95,27 @@ public: * Setup this instance by opening and loading relevant index files. * * @return true if this instance was successfully setup. - **/ + */ bool setup(const TuneFileSearch &tuneFileSearch); bool setup(const TuneFileSearch &tuneFileSearch, const DiskIndex &old); /** - * Perform a dictionary lookup for the given word in the given - * field. + * Perform a dictionary lookup for the given word in the given field. * - * @param indexId the id of the field to - * perform lookup for. + * @param indexId the id of the field to perform lookup for. * @param word the word to lookup. * @return the lookup result or nullptr if the word is not found. 
- **/ + */ LookupResult::UP lookup(uint32_t indexId, vespalib::stringref word); - LookupResultVector lookup(const std::vector<uint32_t> & indexes, vespalib::stringref word); + LookupResultVector lookup(const std::vector<uint32_t> & indexes, vespalib::stringref word); /** * Read the posting list corresponding to the given lookup result. * * @param lookupRes the result of the previous dictionary lookup. * @return a handle for the posting list in memory. - **/ + */ index::PostingListHandle::UP readPostingList(const LookupResult &lookupRes) const; /** @@ -126,22 +124,19 @@ public: * @param lookupRes the result of the previous dictionary lookup. * @return the bit vector or nullptr if no bit vector exists for the * word in the lookup result. - **/ + */ BitVector::UP readBitVector(const LookupResult &lookupRes) const; - queryeval::Blueprint::UP - createBlueprint(const queryeval::IRequestContext & requestContext, - const queryeval::FieldSpec &field, - const query::Node &term) override; + queryeval::Blueprint::UP createBlueprint(const queryeval::IRequestContext & requestContext, + const queryeval::FieldSpec &field, + const query::Node &term) override; - queryeval::Blueprint::UP - createBlueprint(const queryeval::IRequestContext & requestContext, - const queryeval::FieldSpecList &fields, - const query::Node &term) override; + queryeval::Blueprint::UP createBlueprint(const queryeval::IRequestContext & requestContext, + const queryeval::FieldSpecList &fields, + const query::Node &term) override; /** * Get the size on disk of this index. - * @return the size of the index. 
*/ uint64_t getSize() const { return _size; } diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp index f6e4da945e0..34e64a9b558 100644 --- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp @@ -69,7 +69,7 @@ makePosOccWrite(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - ZcPosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(true) && fileHeader.getFormats()[1] == ZcPosOccSeqRead::getSubIdentifier()) { dynamicK = true; @@ -77,7 +77,7 @@ makePosOccWrite(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - Zc4PosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(false) && fileHeader.getFormats()[1] == Zc4PosOccSeqRead::getSubIdentifier()) { dynamicK = false; @@ -115,7 +115,7 @@ makePosOccRead(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - ZcPosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(true) && fileHeader.getFormats()[1] == ZcPosOccSeqRead::getSubIdentifier()) { dynamicK = true; @@ -123,7 +123,7 @@ makePosOccRead(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - Zc4PosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(false) && fileHeader.getFormats()[1] == Zc4PosOccSeqRead::getSubIdentifier()) { dynamicK = false; diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index 96b106a15da..a41f0412294 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -63,7 +63,7 @@ void 
FieldReader::readDocIdAndFeatures() { _oldposoccfile->readDocIdAndFeatures(_docIdAndFeatures); - _docIdAndFeatures._docId = _docIdMapper.mapDocId(_docIdAndFeatures._docId); + _docIdAndFeatures.set_doc_id(_docIdMapper.mapDocId(_docIdAndFeatures.doc_id())); } @@ -75,13 +75,13 @@ FieldReader::read() readCounts(); if (_wordNum == noWordNumHigh()) { assert(_residue == 0); - _docIdAndFeatures._docId = NO_DOC; + _docIdAndFeatures.set_doc_id(NO_DOC); return; } } --_residue; readDocIdAndFeatures(); - if (_docIdAndFeatures._docId != NO_DOC) { + if (_docIdAndFeatures.doc_id() != NO_DOC) { return; } } @@ -267,26 +267,26 @@ FieldReaderStripInfo::read() if (_wordNum == noWordNumHigh()) { return; } - assert(!features.getRaw()); - uint32_t numElements = features._elements.size(); + assert(!features.has_raw_data()); + uint32_t numElements = features.elements().size(); assert(numElements > 0); std::vector<Element>::iterator element = - features._elements.begin(); + features.elements().begin(); if (_hasElements) { if (!_hasElementWeights) { for (uint32_t elementDone = 0; elementDone < numElements; ++elementDone, ++element) { element->setWeight(1); } - assert(element == features._elements.end()); + assert(element == features.elements().end()); } } else { if (element->getElementId() != 0) { continue; // Drop this entry, try to read new entry } element->setWeight(1); - features._wordPositions.resize(element->getNumOccs()); + features.word_positions().resize(element->getNumOccs()); if (numElements > 1) { - features._elements.resize(1); + features.elements().resize(1); } } break; diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h index a73ffa149a9..50748d037c0 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h @@ -85,7 +85,7 @@ public: bool operator<(const FieldReader &rhs) const { return _wordNum < rhs._wordNum || (_wordNum == rhs._wordNum && 
- _docIdAndFeatures._docId < rhs._docIdAndFeatures._docId); + _docIdAndFeatures.doc_id() < rhs._docIdAndFeatures.doc_id()); } virtual void setup(const WordNumMapping &wordNumMapping, const DocIdMapping &docIdMapping); diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp index 6454c0851a7..8c2b33a933e 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp @@ -98,7 +98,6 @@ FieldWriter::open(const vespalib::string &prefix, return true; } - void FieldWriter::flush() { @@ -120,7 +119,6 @@ FieldWriter::flush() } } - void FieldWriter::newWord(uint64_t wordNum, vespalib::stringref word) { @@ -134,14 +132,12 @@ FieldWriter::newWord(uint64_t wordNum, vespalib::stringref word) _prevDocId = 0; } - void FieldWriter::newWord(vespalib::stringref word) { newWord(_wordNum + 1, word); } - bool FieldWriter::close() { @@ -183,7 +179,6 @@ FieldWriter::getFeatureParams(PostingListParams ¶ms) _posoccfile->getFeatureParams(params); } - static const char *termOccNames[] = { "boolocc.bdat", @@ -199,7 +194,6 @@ static const char *termOccNames[] = nullptr, }; - void FieldWriter::remove(const vespalib::string &prefix) { diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h index 9a6edf90243..e5aa9788071 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h @@ -10,15 +10,13 @@ namespace search::diskindex { -/* - * FieldWriter is used to write a dictionary and posting list file - * together. +/** + * FieldWriter is used to write a dictionary and posting list file together. * * It is used by the fusion code to write the merged output for a field, * and by the memory index dump code to write a field to disk. 
*/ -class FieldWriter -{ +class FieldWriter { private: uint64_t _wordNum; uint32_t _prevDocId; @@ -28,14 +26,15 @@ public: using DictionaryFileSeqWrite = index::DictionaryFileSeqWrite; - typedef index::PostingListFileSeqWrite PostingListFileSeqWrite; - typedef index::DocIdAndFeatures DocIdAndFeatures; - typedef index::Schema Schema; - typedef index::PostingListCounts PostingListCounts; - typedef index::PostingListParams PostingListParams; + using PostingListFileSeqWrite = index::PostingListFileSeqWrite; + using DocIdAndFeatures = index::DocIdAndFeatures; + using Schema = index::Schema; + using PostingListCounts = index::PostingListCounts; + using PostingListParams = index::PostingListParams; std::unique_ptr<DictionaryFileSeqWrite> _dictFile; std::unique_ptr<PostingListFileSeqWrite> _posoccfile; + private: BitVectorCandidate _bvc; BitVectorFileWrite _bmapfile; @@ -59,11 +58,11 @@ public: void newWord(vespalib::stringref word); void add(const DocIdAndFeatures &features) { - assert(features._docId < _docIdLimit); - assert(features._docId > _prevDocId); + assert(features.doc_id() < _docIdLimit); + assert(features.doc_id() > _prevDocId); _posoccfile->writeDocIdAndFeatures(features); - _bvc.add(features._docId); - _prevDocId = features._docId; + _bvc.add(features.doc_id()); + _prevDocId = features.doc_id(); } uint64_t getSparseWordNum() const { return _wordNum; } diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp index a3c37cb91f6..42f6971e53f 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp @@ -26,252 +26,65 @@ using index::WordDocElementFeatures; using index::schema::DataType; using vespalib::getLastErrorString; -uint32_t -noWordPos() -{ - return std::numeric_limits<uint32_t>::max(); -} - +class FileHandle { +private: + std::shared_ptr<FieldWriter> _fieldWriter; -class FileHandle -{ public: - FieldWriter 
*_fieldWriter; - DocIdAndFeatures _docIdAndFeatures; - FileHandle(); - ~FileHandle(); - void - open(vespalib::stringref dir, - const SchemaUtil::IndexIterator &index, - uint32_t docIdLimit, uint64_t numWordIds, - const TuneFileSeqWrite &tuneFileWrite, - const FileHeaderContext &fileHeaderContext); + void open(vespalib::stringref dir, + const SchemaUtil::IndexIterator &index, + uint32_t docIdLimit, uint64_t numWordIds, + const TuneFileSeqWrite &tuneFileWrite, + const FileHeaderContext &fileHeaderContext); - void - close(); -}; + void close(); + FieldWriter* writer() { return _fieldWriter.get(); } +}; } -class IndexBuilder::FieldHandle -{ +class IndexBuilder::FieldHandle { +private: + bool _valid; + const Schema *_schema; // Ptr to allow being std::vector member + uint32_t _fieldId; + IndexBuilder *_builder; // Ptr to allow being std::vector member + FileHandle _file; + public: FieldHandle(const Schema &schema, uint32_t fieldId, - IndexBuilder *ib); + IndexBuilder *builder); ~FieldHandle(); - static uint32_t - noDocRef() - { - return std::numeric_limits<uint32_t>::max(); - } - - static uint32_t - noElRef() - { - return std::numeric_limits<uint32_t>::max(); - } - - class FHWordDocFieldFeatures - { - public: - uint32_t _docId; - uint32_t _numElements; - - FHWordDocFieldFeatures(uint32_t docId) - : _docId(docId), - _numElements(0u) - { - } - - uint32_t - getDocId() const - { - return _docId; - } - - uint32_t - getNumElements() const - { - return _numElements; - } + void new_word(vespalib::stringref word); + void add_document(const index::DocIdAndFeatures &features); - void - incNumElements() - { - ++_numElements; - } - }; - - class FHWordDocElementFeatures - : public WordDocElementFeatures - { - public: - uint32_t _docRef; - - FHWordDocElementFeatures(uint32_t elementId, - int32_t weight, - uint32_t elementLen, - uint32_t docRef) - : WordDocElementFeatures(elementId), - _docRef(docRef) - { - setWeight(weight); - setElementLen(elementLen); - } - }; - - class 
FHWordDocElementWordPosFeatures - : public WordDocElementWordPosFeatures - { - public: - uint32_t _elementRef; - - FHWordDocElementWordPosFeatures( - const WordDocElementWordPosFeatures &features, - uint32_t elementRef) - : WordDocElementWordPosFeatures(features), - _elementRef(elementRef) - { - } - }; - - typedef vespalib::Array<FHWordDocFieldFeatures> FHWordDocFieldFeaturesVector; - typedef vespalib::Array<FHWordDocElementFeatures> FHWordDocElementFeaturesVector; - typedef vespalib::Array<FHWordDocElementWordPosFeatures> FHWordDocElementWordPosFeaturesVector; - - FHWordDocFieldFeaturesVector _wdff; - FHWordDocElementFeaturesVector _wdfef; - FHWordDocElementWordPosFeaturesVector _wdfepf; - - uint32_t _docRef; - uint32_t _elRef; - bool _valid; - const Schema *_schema; // Ptr to allow being std::vector member - uint32_t _fieldId; - IndexBuilder *_ib; // Ptr to allow being std::vector member - - uint32_t _lowestOKElementId; - uint32_t _lowestOKWordPos; - - FileHandle _files; - - void - startWord(vespalib::stringref word); - - void - endWord(); - - void - startDocument(uint32_t docId); - - void - endDocument(); - - void - startElement(uint32_t elementId, - int32_t weight, - uint32_t elementLen); - - void - endElement(); - - void - addOcc(const WordDocElementWordPosFeatures &features); - - void - setValid() - { - _valid = true; - } - - bool - getValid() const - { - return _valid; - } + const Schema::IndexField &getSchemaField(); + const vespalib::string &getName(); + vespalib::string getDir(); + void open(uint32_t docIdLimit, uint64_t numWordIds, + const TuneFileSeqWrite &tuneFileWrite, + const FileHeaderContext &fileHeaderContext); + void close(); - const Schema::IndexField & - getSchemaField(); - - const vespalib::string & - getName(); - - vespalib::string - getDir(); - - void - open(uint32_t docIdLimit, uint64_t numWordIds, - const TuneFileSeqWrite &tuneFileWrite, - const FileHeaderContext &fileHeaderContext); - - void - close(); - - uint32_t - getIndexId() const - 
{ - return _fieldId; - } + void setValid() { _valid = true; } + bool getValid() const { return _valid; } + uint32_t getIndexId() const { return _fieldId; } }; -namespace { - -class SingleIterator -{ -public: - typedef IndexBuilder::FieldHandle FH; - FH::FHWordDocFieldFeaturesVector::const_iterator _dFeatures; - FH::FHWordDocFieldFeaturesVector::const_iterator _dFeaturesE; - FH::FHWordDocElementFeaturesVector::const_iterator _elFeatures; - FH::FHWordDocElementWordPosFeaturesVector::const_iterator _pFeatures; - uint32_t _docId; - uint32_t _localFieldId; - - SingleIterator(FH &fieldHandle, uint32_t localFieldId); - - void - appendFeatures(DocIdAndFeatures &features); - - bool - isValid() const - { - return _dFeatures != _dFeaturesE; - } - - bool - operator<(const SingleIterator &rhs) const - { - if (_docId != rhs._docId) { - return _docId < rhs._docId; - } - return _localFieldId < rhs._localFieldId; - } -}; - - -} - - FileHandle::FileHandle() - : _fieldWriter(nullptr), - _docIdAndFeatures() -{ -} - - -FileHandle::~FileHandle() + : _fieldWriter() { - delete _fieldWriter; } +FileHandle::~FileHandle() = default; void FileHandle::open(vespalib::stringref dir, @@ -280,9 +93,9 @@ FileHandle::open(vespalib::stringref dir, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) { - assert(_fieldWriter == nullptr); + assert(_fieldWriter.get() == nullptr); - _fieldWriter = new FieldWriter(docIdLimit, numWordIds); + _fieldWriter = std::make_shared<FieldWriter>(docIdLimit, numWordIds); if (!_fieldWriter->open(dir + "/", 64, 262144u, false, index.getSchema(), index.getIndex(), @@ -293,18 +106,16 @@ FileHandle::open(vespalib::stringref dir, } } - void FileHandle::close() { bool ret = true; if (_fieldWriter != nullptr) { bool closeRes = _fieldWriter->close(); - delete _fieldWriter; - _fieldWriter = nullptr; + _fieldWriter.reset(); if (!closeRes) { LOG(error, - "Could not close term writer"); + "Could not close field writer"); ret = false; } } @@ -312,206 
+123,66 @@ FileHandle::close() (void) ret; } - IndexBuilder::FieldHandle::FieldHandle(const Schema &schema, uint32_t fieldId, - IndexBuilder *ib) - : _wdff(), - _wdfef(), - _wdfepf(), - _docRef(noDocRef()), - _elRef(noElRef()), - _valid(false), + IndexBuilder *builder) + : _valid(false), _schema(&schema), _fieldId(fieldId), - _ib(ib), - _lowestOKElementId(0u), - _lowestOKWordPos(0u), - _files() + _builder(builder), + _file() { } - IndexBuilder::FieldHandle::~FieldHandle() = default; - void -IndexBuilder::FieldHandle::startWord(vespalib::stringref word) +IndexBuilder::FieldHandle::new_word(vespalib::stringref word) { assert(_valid); - _files._fieldWriter->newWord(word); + _file.writer()->newWord(word); } - void -IndexBuilder::FieldHandle::endWord() +IndexBuilder::FieldHandle::add_document(const index::DocIdAndFeatures &features) { - DocIdAndFeatures &features = _files._docIdAndFeatures; - SingleIterator si(*this, 0u); - for (; si.isValid();) { - features.clear(si._docId); - si.appendFeatures(features); - _files._fieldWriter->add(features); - } - assert(si._elFeatures == _wdfef.end()); - assert(si._pFeatures == _wdfepf.end()); - _wdff.clear(); - _wdfef.clear(); - _wdfepf.clear(); - _docRef = noDocRef(); - _elRef = noElRef(); + _file.writer()->add(features); } - -void -IndexBuilder::FieldHandle::startDocument(uint32_t docId) -{ - assert(_docRef == noDocRef()); - assert(_wdff.empty() || _wdff.back().getDocId() < docId); - _wdff.push_back(FHWordDocFieldFeatures(docId)); - _docRef = _wdff.size() - 1; - _lowestOKElementId = 0u; -} - - -void -IndexBuilder::FieldHandle::endDocument() -{ - assert(_docRef != noDocRef()); - assert(_elRef == noElRef()); - FHWordDocFieldFeatures &ff = _wdff[_docRef]; - assert(ff.getNumElements() > 0); - (void) ff; - _docRef = noDocRef(); -} - - -void -IndexBuilder::FieldHandle:: -startElement(uint32_t elementId, - int32_t weight, - uint32_t elementLen) -{ - assert(_docRef != noDocRef()); - assert(_elRef == noElRef()); - assert(elementId >= 
_lowestOKElementId); - - FHWordDocFieldFeatures &ff = _wdff[_docRef]; - _wdfef.push_back( - FHWordDocElementFeatures(elementId, - weight, - elementLen, - _docRef)); - ff.incNumElements(); - _elRef = _wdfef.size() - 1; - _lowestOKWordPos = 0u; -} - - -void -IndexBuilder::FieldHandle::endElement() -{ - assert(_elRef != noElRef()); - FHWordDocElementFeatures &ef = _wdfef[_elRef]; - assert(ef.getNumOccs() > 0); - _elRef = noElRef(); - _lowestOKElementId = ef.getElementId() + 1; -} - - -void -IndexBuilder::FieldHandle:: -addOcc(const WordDocElementWordPosFeatures &features) -{ - assert(_elRef != noElRef()); - FHWordDocElementFeatures &ef = _wdfef[_elRef]; - uint32_t wordPos = features.getWordPos(); - assert(wordPos < ef.getElementLen()); - assert(wordPos >= _lowestOKWordPos); - _lowestOKWordPos = wordPos; - _wdfepf.push_back( - FHWordDocElementWordPosFeatures(features, - _elRef)); - ef.incNumOccs(); -} - - const Schema::IndexField & IndexBuilder::FieldHandle::getSchemaField() { return _schema->getIndexField(_fieldId); } - const vespalib::string & IndexBuilder::FieldHandle::getName() { return getSchemaField().getName(); - } - vespalib::string IndexBuilder::FieldHandle::getDir() { - return _ib->appendToPrefix(getName()); + return _builder->appendToPrefix(getName()); } - void IndexBuilder::FieldHandle::open(uint32_t docIdLimit, uint64_t numWordIds, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) { - _files.open(getDir(), - SchemaUtil::IndexIterator(*_schema, getIndexId()), - docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext); + _file.open(getDir(), + SchemaUtil::IndexIterator(*_schema, getIndexId()), + docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext); } - void IndexBuilder::FieldHandle::close() { - _files.close(); -} - - -SingleIterator::SingleIterator(FH &fieldHandle, uint32_t localFieldId) - : _dFeatures(fieldHandle._wdff.begin()), - _dFeaturesE(fieldHandle._wdff.end()), - _elFeatures(fieldHandle._wdfef.begin()), - 
_pFeatures(fieldHandle._wdfepf.begin()), - _docId(_dFeatures->getDocId()), - _localFieldId(localFieldId) -{ + _file.close(); } - -void -SingleIterator::appendFeatures(DocIdAndFeatures &features) -{ - uint32_t elCount = _dFeatures->getNumElements(); - for (uint32_t elId = 0; elId < elCount; ++elId, ++_elFeatures) { - features._elements.push_back(*_elFeatures); - features._elements.back().setNumOccs(0); - uint32_t posCount = _elFeatures->getNumOccs(); - uint32_t lastWordPos = noWordPos(); - for (uint32_t posId = 0; posId < posCount; ++posId, ++_pFeatures) { - uint32_t wordPos = _pFeatures->getWordPos(); - if (wordPos != lastWordPos) { - lastWordPos = wordPos; - features._elements.back().incNumOccs(); - features._wordPositions.push_back(*_pFeatures); - } - } - } - ++_dFeatures; - if (_dFeatures != _dFeaturesE) { - _docId = _dFeatures->getDocId(); - } -} - - IndexBuilder::IndexBuilder(const Schema &schema) : index::IndexBuilder(schema), _currentField(nullptr), @@ -541,53 +212,6 @@ IndexBuilder::IndexBuilder(const Schema &schema) IndexBuilder::~IndexBuilder() = default; void -IndexBuilder::startWord(vespalib::stringref word) -{ - assert(_currentField != nullptr); - assert(!_inWord); - // TODO: Check sort order - _curWord = word; - _inWord = true; - _currentField->startWord(word); -} - - -void -IndexBuilder::endWord() -{ - assert(_inWord); - assert(_currentField != nullptr); - _currentField->endWord(); - _inWord = false; - _lowestOKDocId = 1u; -} - - -void -IndexBuilder::startDocument(uint32_t docId) -{ - assert(_curDocId == noDocId()); - assert(docId >= _lowestOKDocId); - assert(docId < _docIdLimit); - assert(_currentField != nullptr); - _curDocId = docId; - assert(_curDocId != noDocId()); - _currentField->startDocument(docId); -} - - -void -IndexBuilder::endDocument() -{ - assert(_curDocId != noDocId()); - assert(_currentField != nullptr); - _currentField->endDocument(); - _lowestOKDocId = _curDocId + 1; - _curDocId = noDocId(); -} - - -void 
IndexBuilder::startField(uint32_t fieldId) { assert(_curDocId == noDocId()); @@ -598,51 +222,50 @@ IndexBuilder::startField(uint32_t fieldId) assert(_currentField != nullptr); } - void IndexBuilder::endField() { assert(_curDocId == noDocId()); assert(!_inWord); assert(_currentField != nullptr); - _lowestOKFieldId = _currentField->_fieldId + 1; + _lowestOKFieldId = _currentField->getIndexId() + 1; _currentField = nullptr; } - void -IndexBuilder::startElement(uint32_t elementId, - int32_t weight, - uint32_t elementLen) +IndexBuilder::startWord(vespalib::stringref word) { assert(_currentField != nullptr); - _currentField->startElement(elementId, weight, elementLen); + assert(!_inWord); + // TODO: Check sort order + _curWord = word; + _inWord = true; + _currentField->new_word(word); } - void -IndexBuilder::endElement() +IndexBuilder::endWord() { + assert(_inWord); assert(_currentField != nullptr); - _currentField->endElement(); + _inWord = false; + _lowestOKDocId = 1u; } - void -IndexBuilder::addOcc(const WordDocElementWordPosFeatures &features) +IndexBuilder::add_document(const index::DocIdAndFeatures &features) { + assert(_inWord); assert(_currentField != nullptr); - _currentField->addOcc(features); + _currentField->add_document(features); } - void IndexBuilder::setPrefix(vespalib::stringref prefix) { _prefix = prefix; } - vespalib::string IndexBuilder::appendToPrefix(vespalib::stringref name) { @@ -652,7 +275,6 @@ IndexBuilder::appendToPrefix(vespalib::stringref name) return _prefix + "/" + name; } - void IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds, const TuneFileIndexing &tuneFileIndexing, @@ -682,7 +304,6 @@ IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds, } } - void IndexBuilder::close() { diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h index fa818bf08e6..a1a77d608cd 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h +++ 
b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h @@ -13,12 +13,16 @@ namespace search::diskindex { class BitVectorCandidate; -class IndexBuilder : public index::IndexBuilder -{ +/** + * Class used to build a disk index for the set of index fields specified in a schema. + * + * The resulting disk index consists of field indexes that are independent of each other. + */ +class IndexBuilder : public index::IndexBuilder { public: class FieldHandle; - typedef index::Schema Schema; + using Schema = index::Schema; private: // Text fields FieldHandle *_currentField; @@ -32,7 +36,7 @@ private: uint32_t _docIdLimit; uint64_t _numWordIds; - const Schema &_schema; // Ptr to allow being std::vector member + const Schema &_schema; static uint32_t noDocId() { return std::numeric_limits<uint32_t>::max(); @@ -45,23 +49,16 @@ private: public: typedef index::WordDocElementWordPosFeatures WordDocElementWordPosFeatures; - // schema argument must live until indexbuilder has been deleted. + // Schema argument must live until IndexBuilder has been deleted. IndexBuilder(const Schema &schema); ~IndexBuilder() override; - void startWord(vespalib::stringref word) override; - void endWord() override; - void startDocument(uint32_t docId) override; - void endDocument() override; void startField(uint32_t fieldId) override; void endField() override; - void startElement(uint32_t elementId, int32_t weight, uint32_t elementLen) override; - void endElement() override; - void addOcc(const WordDocElementWordPosFeatures &features) override; - - // TODO: methods for attribute vectors. + void startWord(vespalib::stringref word) override; + void endWord() override; + void add_document(const index::DocIdAndFeatures &features) override; - // TODO: methods for document summary. 
void setPrefix(vespalib::stringref prefix); vespalib::string appendToPrefix(vespalib::stringref name); diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp index 5288d054ef0..2149a44f5ce 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp @@ -20,35 +20,13 @@ Zc4PostingHeader::Zc4PostingHeader() { } -template <bool bigEndian> void Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params) { - using EC = bitcompression::FeatureEncodeContext<bigEndian>; - UC64_DECODECONTEXT_CONSTRUCTOR(o, decode_context._); - uint32_t length; - uint64_t val64; - - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - _num_docs = static_cast<uint32_t>(val64) + 1; - bool has_more = false; - if (__builtin_expect(_num_docs >= params._min_chunk_docs, false)) { - if (bigEndian) { - has_more = static_cast<int64_t>(oVal) < 0; - oVal <<= 1; - length = 1; - } else { - has_more = (oVal & 1) != 0; - oVal >>= 1; - length = 1; - } - UC64_READBITS_NS(o, EC); - } - if (params._dynamic_k) { - _doc_id_k = EC::calcDocIdK((_has_more || has_more) ? 1 : _num_docs, params._doc_id_limit); - } else { - _doc_id_k = K_VALUE_ZCPOSTING_LASTDOCID; - } + using EC = bitcompression::FeatureEncodeContext<true>; + _num_docs = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_NUMDOCS) + 1; + bool has_more = (_num_docs >= params._min_chunk_docs) ? (decode_context.readBits(1) != 0) : false; + _doc_id_k = params._dynamic_k ? EC::calcDocIdK((_has_more || has_more) ? 
1 : _num_docs, params._doc_id_limit) : K_VALUE_ZCPOSTING_LASTDOCID; if (_num_docs < params._min_skip_docs && !_has_more) { _doc_ids_size = 0; _l1_skip_size = 0; @@ -58,47 +36,16 @@ Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, cons _features_size = 0; _last_doc_id = 0; } else { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); - _doc_ids_size = val64 + 1; - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); - _l1_skip_size = val64; - if (_l1_skip_size != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); - _l2_skip_size = val64; - } - if (_l2_skip_size != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); - _l3_skip_size = val64; - } - if (_l3_skip_size != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); - _l4_skip_size = val64; - } - if (params._encode_features) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); - _features_size = val64; - } else { - _features_size = 0; - } - UC64_DECODEEXPGOLOMB_NS(o, _doc_id_k, EC); - _last_doc_id = params._doc_id_limit - 1 - val64; - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - UC64_READBITS_NS(o, EC); - } + _doc_ids_size = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_DOCIDSSIZE) + 1; + _l1_skip_size = decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L1SKIPSIZE); + _l2_skip_size = (_l1_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L2SKIPSIZE) : 0; + _l3_skip_size = (_l2_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L3SKIPSIZE) : 0; + _l4_skip_size = (_l3_skip_size != 0) ? decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_L4SKIPSIZE) : 0; + _features_size = params._encode_features ? 
decode_context.decode_exp_golomb(K_VALUE_ZCPOSTING_FEATURESSIZE) : 0; + _last_doc_id = params._doc_id_limit - 1 - decode_context.decode_exp_golomb(_doc_id_k); + decode_context.align(8); } - UC64_DECODECONTEXT_STORE(o, decode_context._); _has_more = has_more; } -template -void -Zc4PostingHeader::read<false>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); - -template -void -Zc4PostingHeader::read<true>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); - - } diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h index 7382f59d176..d4032864e16 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h @@ -27,7 +27,6 @@ struct Zc4PostingHeader { Zc4PostingHeader(); - template <bool bigEndian> void read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); }; diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp new file mode 100644 index 00000000000..c0e1115521c --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp @@ -0,0 +1,438 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "zc4_posting_reader.h" +#include <vespa/searchlib/index/docidandfeatures.h> + +namespace search::diskindex { + +using index::PostingListCounts; +using index::DocIdAndFeatures; +using bitcompression::FeatureEncodeContext; + + +template <bool bigEndian> +Zc4PostingReader<bigEndian>::Zc4PostingReader(bool dynamic_k) + : _decodeContext(nullptr), + _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID), + _prevDocId(0), + _numDocs(0), + _readContext(sizeof(uint64_t)), + _has_more(false), + _posting_params(64, 1 << 30, 10000000, dynamic_k, true), + _lastDocId(0), + _zcDocIds(), + _l1Skip(), + _l2Skip(), + _l3Skip(), + _l4Skip(), + _chunkNo(0), + _l1SkipDocId(0), + _l1SkipDocIdPos(0), + _l1SkipFeaturesPos(0), + _l2SkipDocId(0), + _l2SkipDocIdPos(0), + _l2SkipL1SkipPos(0), + _l2SkipFeaturesPos(0), + _l3SkipDocId(0), + _l3SkipDocIdPos(0), + _l3SkipL1SkipPos(0), + _l3SkipL2SkipPos(0), + _l3SkipFeaturesPos(0), + _l4SkipDocId(0), + _l4SkipDocIdPos(0), + _l4SkipL1SkipPos(0), + _l4SkipL2SkipPos(0), + _l4SkipL3SkipPos(0), + _l4SkipFeaturesPos(0), + _featuresSize(0), + _counts(), + _residue(0) +{ +} + +template <bool bigEndian> +Zc4PostingReader<bigEndian>::~Zc4PostingReader() +{ +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatures &features) +{ + if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) { + read_word_start(); // Read start of next chunk + } + // Split docid & features. 
+ assert(_zcDocIds._valI < _zcDocIds._valE); + uint32_t docIdPos = _zcDocIds.pos(); + uint32_t docId = _prevDocId + 1 + _zcDocIds.decode(); + features.set_doc_id(docId); + _prevDocId = docId; + assert(docId <= _lastDocId); + if (docId > _l1SkipDocId) { + _l1SkipDocIdPos += _l1Skip.decode() + 1; + assert(docIdPos == _l1SkipDocIdPos); + uint64_t featuresPos = _decodeContext->getReadOffset(); + if (_posting_params._encode_features) { + _l1SkipFeaturesPos += _l1Skip.decode() + 1; + assert(featuresPos == _l1SkipFeaturesPos); + } + (void) featuresPos; + if (docId > _l2SkipDocId) { + _l2SkipDocIdPos += _l2Skip.decode() + 1; + assert(docIdPos == _l2SkipDocIdPos); + if (_posting_params._encode_features) { + _l2SkipFeaturesPos += _l2Skip.decode() + 1; + assert(featuresPos == _l2SkipFeaturesPos); + } + _l2SkipL1SkipPos += _l2Skip.decode() + 1; + assert(_l1Skip.pos() == _l2SkipL1SkipPos); + if (docId > _l3SkipDocId) { + _l3SkipDocIdPos += _l3Skip.decode() + 1; + assert(docIdPos == _l3SkipDocIdPos); + if (_posting_params._encode_features) { + _l3SkipFeaturesPos += _l3Skip.decode() + 1; + assert(featuresPos == _l3SkipFeaturesPos); + } + _l3SkipL1SkipPos += _l3Skip.decode() + 1; + assert(_l1Skip.pos() == _l3SkipL1SkipPos); + _l3SkipL2SkipPos += _l3Skip.decode() + 1; + assert(_l2Skip.pos() == _l3SkipL2SkipPos); + if (docId > _l4SkipDocId) { + _l4SkipDocIdPos += _l4Skip.decode() + 1; + assert(docIdPos == _l4SkipDocIdPos); + (void) docIdPos; + if (_posting_params._encode_features) { + _l4SkipFeaturesPos += _l4Skip.decode() + 1; + assert(featuresPos == _l4SkipFeaturesPos); + } + _l4SkipL1SkipPos += _l4Skip.decode() + 1; + assert(_l1Skip.pos() == _l4SkipL1SkipPos); + _l4SkipL2SkipPos += _l4Skip.decode() + 1; + assert(_l2Skip.pos() == _l4SkipL2SkipPos); + _l4SkipL3SkipPos += _l4Skip.decode() + 1; + assert(_l3Skip.pos() == _l4SkipL3SkipPos); + _l4SkipDocId += _l4Skip.decode() + 1; + assert(_l4SkipDocId <= _lastDocId); + assert(_l4SkipDocId >= docId); + } + _l3SkipDocId += 
_l3Skip.decode() + 1; + assert(_l3SkipDocId <= _lastDocId); + assert(_l3SkipDocId <= _l4SkipDocId); + assert(_l3SkipDocId >= docId); + } + _l2SkipDocId += _l2Skip.decode() + 1; + assert(_l2SkipDocId <= _lastDocId); + assert(_l2SkipDocId <= _l4SkipDocId); + assert(_l2SkipDocId <= _l3SkipDocId); + assert(_l2SkipDocId >= docId); + } + _l1SkipDocId += _l1Skip.decode() + 1; + assert(_l1SkipDocId <= _lastDocId); + assert(_l1SkipDocId <= _l4SkipDocId); + assert(_l1SkipDocId <= _l3SkipDocId); + assert(_l1SkipDocId <= _l2SkipDocId); + assert(_l1SkipDocId >= docId); + } + if (docId < _lastDocId) { + // Assert more space available when not yet at last docid + assert(_zcDocIds._valI < _zcDocIds._valE); + } else { + // Assert that space has been used when at last docid + assert(_zcDocIds._valI == _zcDocIds._valE); + // Assert that we've read to end of skip info + assert(_l1SkipDocId == _lastDocId); + assert(_l2SkipDocId == _lastDocId); + assert(_l3SkipDocId == _lastDocId); + assert(_l4SkipDocId == _lastDocId); + if (!_has_more) { + _chunkNo = 0; + } + } + if (_posting_params._encode_features) { + _decodeContext->readFeatures(features); + } + --_residue; +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features) +{ + if (_residue == 0 && !_has_more) { + if (_residue == 0) { + // Don't read past end of posting list. 
+ features.clear(static_cast<uint32_t>(-1)); + return; + } + } + if (_lastDocId > 0) { + read_common_word_doc_id_and_features(features); + return; + } + // Interleaves docid & features + using EC = FeatureEncodeContext<bigEndian>; + DecodeContext &d = *_decodeContext; + uint32_t length; + uint64_t val64; + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + + UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC); + uint32_t docId = _prevDocId + 1 + val64; + features.set_doc_id(docId); + _prevDocId = docId; + UC64_DECODECONTEXT_STORE(o, d._); + if (__builtin_expect(oCompr >= d._valE, false)) { + _readContext.readComprBuffer(); + } + if (_posting_params._encode_features) { + _decodeContext->readFeatures(features); + } + --_residue; +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::read_word_start_with_skip() +{ + using EC = FeatureEncodeContext<bigEndian>; + DecodeContext &d = *_decodeContext; + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + uint32_t length; + uint64_t val64; + const uint64_t *valE = d._valE; + + if (_has_more) { + ++_chunkNo; + } else { + _chunkNo = 0; + } + assert(_numDocs >= _posting_params._min_skip_docs || _has_more); + bool has_more = false; + if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) { + if (bigEndian) { + has_more = static_cast<int64_t>(oVal) < 0; + oVal <<= 1; + } else { + has_more = (oVal & 1) != 0; + oVal >>= 1; + } + length = 1; + UC64_READBITS_NS(o, EC); + } + if (_posting_params._dynamic_k) { + _docIdK = EC::calcDocIdK((_has_more || has_more) ? 
1 : _numDocs, + _posting_params._doc_id_limit); + } + if (_has_more || has_more) { + assert(has_more == (_chunkNo + 1 < _counts._segments.size())); + assert(_numDocs == _counts._segments[_chunkNo]._numDocs); + if (has_more) { + assert(_numDocs >= _posting_params._min_skip_docs); + assert(_numDocs >= _posting_params._min_chunk_docs); + } + } else { + assert(_numDocs >= _posting_params._min_skip_docs); + assert(_numDocs == _counts._numDocs); + } + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); + uint32_t docIdsSize = val64 + 1; + UC64_DECODEEXPGOLOMB_NS(o, + K_VALUE_ZCPOSTING_L1SKIPSIZE, + EC); + uint32_t l1SkipSize = val64; + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint32_t l2SkipSize = 0; + if (l1SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); + l2SkipSize = val64; + } + uint32_t l3SkipSize = 0; + if (l2SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); + l3SkipSize = val64; + } + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint32_t l4SkipSize = 0; + if (l3SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); + l4SkipSize = val64; + } + if (_posting_params._encode_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); + _featuresSize = val64; + } + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + if (_posting_params._dynamic_k) { + UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC); + } 
else { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_LASTDOCID, EC); + } + _lastDocId = _posting_params._doc_id_limit - 1 - val64; + if (_has_more || has_more) { + assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc); + } + + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint64_t bytePad = oPreRead & 7; + if (bytePad > 0) { + length = bytePad; + if (bigEndian) { + oVal <<= length; + } else { + oVal >>= length; + } + UC64_READBITS_NS(o, EC); + } + UC64_DECODECONTEXT_STORE(o, d._); + if (__builtin_expect(oCompr >= valE, false)) { + _readContext.readComprBuffer(); + } + _zcDocIds.clearReserve(docIdsSize); + _l1Skip.clearReserve(l1SkipSize); + _l2Skip.clearReserve(l2SkipSize); + _l3Skip.clearReserve(l3SkipSize); + _l4Skip.clearReserve(l4SkipSize); + _decodeContext->readBytes(_zcDocIds._valI, docIdsSize); + _zcDocIds._valE = _zcDocIds._valI + docIdsSize; + if (l1SkipSize > 0) { + _decodeContext->readBytes(_l1Skip._valI, l1SkipSize); + } + _l1Skip._valE = _l1Skip._valI + l1SkipSize; + if (l2SkipSize > 0) { + _decodeContext->readBytes(_l2Skip._valI, l2SkipSize); + } + _l2Skip._valE = _l2Skip._valI + l2SkipSize; + if (l3SkipSize > 0) { + _decodeContext->readBytes(_l3Skip._valI, l3SkipSize); + } + _l3Skip._valE = _l3Skip._valI + l3SkipSize; + if (l4SkipSize > 0) { + _decodeContext->readBytes(_l4Skip._valI, l4SkipSize); + } + _l4Skip._valE = _l4Skip._valI + l4SkipSize; + + if (l1SkipSize > 0) { + _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId; + } else { + _l1SkipDocId = _lastDocId; + } + if (l2SkipSize > 0) { + _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId; + } else { + _l2SkipDocId = _lastDocId; + } + if (l3SkipSize > 0) { + _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId; + } else { + _l3SkipDocId = _lastDocId; + } + if (l4SkipSize > 0) { + _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId; + } else { + _l4SkipDocId = _lastDocId; + 
} + _l1SkipDocIdPos = 0; + _l1SkipFeaturesPos = _decodeContext->getReadOffset(); + _l2SkipDocIdPos = 0; + _l2SkipL1SkipPos = 0; + _l2SkipFeaturesPos = _decodeContext->getReadOffset(); + _l3SkipDocIdPos = 0; + _l3SkipL1SkipPos = 0; + _l3SkipL2SkipPos = 0; + _l3SkipFeaturesPos = _decodeContext->getReadOffset(); + _l4SkipDocIdPos = 0; + _l4SkipL1SkipPos = 0; + _l4SkipL2SkipPos = 0; + _l4SkipL3SkipPos = 0; + _l4SkipFeaturesPos = _decodeContext->getReadOffset(); + _has_more = has_more; + // Decode context is now positioned at start of features +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::read_word_start() +{ + using EC = FeatureEncodeContext<bigEndian>; + UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_); + uint32_t length; + uint64_t val64; + const uint64_t *valE = _decodeContext->_valE; + + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); + UC64_DECODECONTEXT_STORE(o, _decodeContext->_); + if (oCompr >= valE) { + _readContext.readComprBuffer(); + } + _numDocs = static_cast<uint32_t>(val64) + 1; + _residue = _numDocs; + _prevDocId = _has_more ? _lastDocId : 0u; + assert(_numDocs <= _counts._numDocs); + assert(_numDocs == _counts._numDocs || + _numDocs >= _posting_params._min_chunk_docs || + _has_more); + + if (_numDocs >= _posting_params._min_skip_docs || _has_more) { + read_word_start_with_skip(); + // Decode context is not positioned at start of features + } else { + if (_posting_params._dynamic_k) { + _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit); + } + _lastDocId = 0u; + // Decode context is not positioned at start of docids & features + } +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::set_counts(const PostingListCounts &counts) +{ + assert(!_has_more && _residue == 0); // Previous words must have been read. 
+ _counts = counts; + assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); + if (_counts._numDocs > 0) { + read_word_start(); + } +} + +template <bool bigEndian> +void +Zc4PostingReader<bigEndian>::set_decode_features(DecodeContext *decode_features) +{ + _decodeContext = decode_features; + _decodeContext->setReadContext(&_readContext); + _readContext.setDecodeContext(_decodeContext); +} + +template class Zc4PostingReader<false>; +template class Zc4PostingReader<true>; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h new file mode 100644 index 00000000000..d8161da15d5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h @@ -0,0 +1,96 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "zc4_posting_writer.h" +#include <vespa/searchlib/index/postinglistfile.h> +#include <vespa/fastos/file.h> +#include "zc4_posting_params.h" + +namespace search::index { + class PostingListCountFileSeqRead; +} + +namespace search::diskindex { + +/* + * Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k). + * + * Common words have docid deltas and skip info separate from + * features. + * + * Rare words do not have skip info, and docid deltas and features are + * interleaved. 
+ */ +template <bool bigEndian> +class Zc4PostingReader +{ + +protected: + using DecodeContext = bitcompression::FeatureDecodeContext<bigEndian>; + + DecodeContext *_decodeContext; + uint32_t _docIdK; + uint32_t _prevDocId; // Previous document id + uint32_t _numDocs; // Documents in chunk or word + search::ComprFileReadContext _readContext; + bool _has_more; + Zc4PostingParams _posting_params; + uint32_t _lastDocId; // last document in chunk or word + + ZcBuf _zcDocIds; // Document id deltas + ZcBuf _l1Skip; // L1 skip info + ZcBuf _l2Skip; // L2 skip info + ZcBuf _l3Skip; // L3 skip info + ZcBuf _l4Skip; // L4 skip info + + uint64_t _numWords; // Number of words in file + uint32_t _chunkNo; // Chunk number + + // Variables for validating skip information while reading + uint32_t _l1SkipDocId; + uint32_t _l1SkipDocIdPos; + uint64_t _l1SkipFeaturesPos; + uint32_t _l2SkipDocId; + uint32_t _l2SkipDocIdPos; + uint32_t _l2SkipL1SkipPos; + uint64_t _l2SkipFeaturesPos; + uint32_t _l3SkipDocId; + uint32_t _l3SkipDocIdPos; + uint32_t _l3SkipL1SkipPos; + uint32_t _l3SkipL2SkipPos; + uint64_t _l3SkipFeaturesPos; + uint32_t _l4SkipDocId; + uint32_t _l4SkipDocIdPos; + uint32_t _l4SkipL1SkipPos; + uint32_t _l4SkipL2SkipPos; + uint32_t _l4SkipL3SkipPos; + uint64_t _l4SkipFeaturesPos; + + // Variable for validating chunk information while reading + uint64_t _featuresSize; + index::PostingListCounts _counts; + + uint32_t _residue; // Number of unread documents after word header + void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features); + void read_word_start_with_skip(); + void read_word_start(); +public: + Zc4PostingReader(bool dynamic_k); + Zc4PostingReader(const Zc4PostingReader &) = delete; + Zc4PostingReader(Zc4PostingReader &&) = delete; + Zc4PostingReader &operator=(const Zc4PostingReader &) = delete; + Zc4PostingReader &operator=(Zc4PostingReader &&) = delete; + ~Zc4PostingReader(); + void read_doc_id_and_features(index::DocIdAndFeatures &features); + 
void set_counts(const index::PostingListCounts &counts); + void set_decode_features(DecodeContext *decode_features); + DecodeContext &get_decode_features() const { return *_decodeContext; } + ComprFileReadContext &get_read_context() { return _readContext; } + Zc4PostingParams &get_posting_params() { return _posting_params; } +}; + +extern template class Zc4PostingReader<false>; +extern template class Zc4PostingReader<true>; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp index 477db7095ed..78d18cb5550 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp @@ -153,11 +153,11 @@ Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &fe uint64_t writeOffset = _encode_features->getWriteOffset(); uint64_t featureSize = writeOffset - _featureOffset; assert(static_cast<uint32_t>(featureSize) == featureSize); - _docIds.push_back(std::make_pair(features._docId, + _docIds.push_back(std::make_pair(features.doc_id(), static_cast<uint32_t>(featureSize))); _featureOffset = writeOffset; } else { - _docIds.push_back(std::make_pair(features._docId, uint32_t(0))); + _docIds.push_back(std::make_pair(features.doc_id(), uint32_t(0))); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp index 51f7a2ea151..5ab37cecc3d 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp @@ -8,6 +8,189 @@ using search::index::PostingListParams; namespace search::diskindex { +namespace { + +class DocIdEncoder { +protected: + uint32_t _doc_id; + uint32_t _doc_id_pos; + uint32_t _feature_pos; + using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>; + +public: + DocIdEncoder() + : _doc_id(0u), + 
_doc_id_pos(0u), + _feature_pos(0u) + { + } + + void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size); + void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; } + uint32_t get_doc_id() const { return _doc_id; } + uint32_t get_doc_id_pos() const { return _doc_id_pos; } + uint32_t get_feature_pos() const { return _feature_pos; } +}; + +class L1SkipEncoder : public DocIdEncoder { +protected: + uint32_t _stride_check; + uint32_t _l1_skip_pos; + const bool _encode_features; + +public: + L1SkipEncoder(bool encode_features) + : DocIdEncoder(), + _stride_check(0u), + _l1_skip_pos(0u), + _encode_features(encode_features) + { + } + + void encode_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder); + void write_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder); + bool should_write_skip(uint32_t stride) { return ++_stride_check >= stride; } + void dec_stride_check() { --_stride_check; } + void write_partial_skip(ZcBuf &zc_buf, uint32_t doc_id); + uint32_t get_l1_skip_pos() const { return _l1_skip_pos; } +}; + +struct L2SkipEncoder : public L1SkipEncoder { +protected: + uint32_t _l2_skip_pos; + +public: + L2SkipEncoder(bool encode_features) + : L1SkipEncoder(encode_features), + _l2_skip_pos(0u) + { + } + + void encode_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip); + void write_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip); + uint32_t get_l2_skip_pos() const { return _l2_skip_pos; } +}; + +class L3SkipEncoder : public L2SkipEncoder { +protected: + uint32_t _l3_skip_pos; + +public: + L3SkipEncoder(bool encode_features) + : L2SkipEncoder(encode_features), + _l3_skip_pos(0u) + { + } + + void encode_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip); + void write_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip); + uint32_t get_l3_skip_pos() const { return _l3_skip_pos; } +}; + +class L4SkipEncoder : public L3SkipEncoder { + +public: + L4SkipEncoder(bool encode_features) + : L3SkipEncoder(encode_features) + { + } + + void encode_skip(ZcBuf 
&zc_buf, const L3SkipEncoder &l3_skip); + void write_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip); +}; + +void +DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size) +{ + _feature_pos += doc_id_and_feature_size.second; + zc_buf.encode(doc_id_and_feature_size.first - _doc_id - 1); + _doc_id = doc_id_and_feature_size.first; + _doc_id_pos = zc_buf.size(); +} + +void +L1SkipEncoder::encode_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder) +{ + _stride_check = 0; + // doc id + uint32_t doc_id_delta = doc_id_encoder.get_doc_id() - _doc_id; + assert(static_cast<int32_t>(doc_id_delta) > 0); + zc_buf.encode(doc_id_delta - 1); + _doc_id = doc_id_encoder.get_doc_id(); + // doc id pos + zc_buf.encode(doc_id_encoder.get_doc_id_pos() - _doc_id_pos - 1); + _doc_id_pos = doc_id_encoder.get_doc_id_pos(); + if (_encode_features) { + // features pos + zc_buf.encode(doc_id_encoder.get_feature_pos() - _feature_pos - 1); + _feature_pos = doc_id_encoder.get_feature_pos(); + } +} + +void +L1SkipEncoder::write_skip(ZcBuf &zc_buf, const DocIdEncoder &doc_id_encoder) +{ + encode_skip(zc_buf, doc_id_encoder); + _l1_skip_pos = zc_buf.size(); +} + +void +L1SkipEncoder::write_partial_skip(ZcBuf &zc_buf, uint32_t doc_id) +{ + if (zc_buf.size() > 0) { + zc_buf.encode(doc_id - _doc_id - 1); + } +} + +void +L2SkipEncoder::encode_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip) +{ + L1SkipEncoder::encode_skip(zc_buf, l1_skip); + // L1 skip pos + zc_buf.encode(l1_skip.get_l1_skip_pos() - _l1_skip_pos - 1); + _l1_skip_pos = l1_skip.get_l1_skip_pos(); +} + +void +L2SkipEncoder::write_skip(ZcBuf &zc_buf, const L1SkipEncoder &l1_skip) +{ + encode_skip(zc_buf, l1_skip); + _l2_skip_pos = zc_buf.size(); +} + +void +L3SkipEncoder::encode_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip) +{ + L2SkipEncoder::encode_skip(zc_buf, l2_skip); + // L2 skip pos + zc_buf.encode(l2_skip.get_l2_skip_pos() - _l2_skip_pos - 1); + _l2_skip_pos = l2_skip.get_l2_skip_pos(); +} + 
+void +L3SkipEncoder::write_skip(ZcBuf &zc_buf, const L2SkipEncoder &l2_skip) +{ + encode_skip(zc_buf, l2_skip); + _l3_skip_pos = zc_buf.size(); +} + +void +L4SkipEncoder::encode_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip) +{ + L3SkipEncoder::encode_skip(zc_buf, l3_skip); + // L3 skip pos + zc_buf.encode(l3_skip.get_l3_skip_pos() - _l3_skip_pos - 1); + _l3_skip_pos = l3_skip.get_l3_skip_pos(); +} + +void +L4SkipEncoder::write_skip(ZcBuf &zc_buf, const L3SkipEncoder &l3_skip) +{ + encode_skip(zc_buf, l3_skip); +} + +} + Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts) : _minChunkDocs(1 << 30), _minSkipDocs(64), @@ -45,159 +228,42 @@ Zc4PostingWriterBase::~Zc4PostingWriterBase() #define L4SKIPSTRIDE 8 void -Zc4PostingWriterBase::calc_skip_info(bool encodeFeatures) +Zc4PostingWriterBase::calc_skip_info(bool encode_features) { - uint32_t lastDocId = 0u; - uint32_t lastL1SkipDocId = 0u; - uint32_t lastL1SkipDocIdPos = 0; - uint32_t lastL1SkipFeaturePos = 0; - uint32_t lastL2SkipDocId = 0u; - uint32_t lastL2SkipDocIdPos = 0; - uint32_t lastL2SkipFeaturePos = 0; - uint32_t lastL2SkipL1SkipPos = 0; - uint32_t lastL3SkipDocId = 0u; - uint32_t lastL3SkipDocIdPos = 0; - uint32_t lastL3SkipFeaturePos = 0; - uint32_t lastL3SkipL1SkipPos = 0; - uint32_t lastL3SkipL2SkipPos = 0; - uint32_t lastL4SkipDocId = 0u; - uint32_t lastL4SkipDocIdPos = 0; - uint32_t lastL4SkipFeaturePos = 0; - uint32_t lastL4SkipL1SkipPos = 0; - uint32_t lastL4SkipL2SkipPos = 0; - uint32_t lastL4SkipL3SkipPos = 0; - unsigned int l1SkipCnt = 0; - unsigned int l2SkipCnt = 0; - unsigned int l3SkipCnt = 0; - unsigned int l4SkipCnt = 0; - uint64_t featurePos = 0; - - std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); - std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); - + DocIdEncoder doc_id_encoder; + L1SkipEncoder l1_skip_encoder(encode_features); + L2SkipEncoder l2_skip_encoder(encode_features); + L3SkipEncoder 
l3_skip_encoder(encode_features); + L4SkipEncoder l4_skip_encoder(encode_features); + l1_skip_encoder.dec_stride_check(); if (!_counts._segments.empty()) { - lastDocId = _counts._segments.back()._lastDoc; - lastL1SkipDocId = lastDocId; - lastL2SkipDocId = lastDocId; - lastL3SkipDocId = lastDocId; - lastL4SkipDocId = lastDocId; + uint32_t doc_id = _counts._segments.back()._lastDoc; + doc_id_encoder.set_doc_id(doc_id); + l1_skip_encoder.set_doc_id(doc_id); + l2_skip_encoder.set_doc_id(doc_id); + l3_skip_encoder.set_doc_id(doc_id); + l4_skip_encoder.set_doc_id(doc_id); } - - for (; dit != dite; ++dit) { - if (l1SkipCnt >= L1SKIPSTRIDE) { - // L1 docid delta - uint32_t docIdDelta = lastDocId - lastL1SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l1Skip.encode(docIdDelta - 1); - lastL1SkipDocId = lastDocId; - // L1 docid pos - uint64_t docIdPos = _zcDocIds.size(); - _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1); - lastL1SkipDocIdPos = docIdPos; - if (encodeFeatures) { - // L1 features pos - _l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1); - lastL1SkipFeaturePos = featurePos; - } - l1SkipCnt = 0; - ++l2SkipCnt; - if (l2SkipCnt >= L2SKIPSTRIDE) { - // L2 docid delta - docIdDelta = lastDocId - lastL2SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l2Skip.encode(docIdDelta - 1); - lastL2SkipDocId = lastDocId; - // L2 docid pos - docIdPos = _zcDocIds.size(); - _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1); - lastL2SkipDocIdPos = docIdPos; - if (encodeFeatures) { - // L2 features pos - _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1); - lastL2SkipFeaturePos = featurePos; - } - // L2 L1Skip pos - uint64_t l1SkipPos = _l1Skip.size(); - _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1); - lastL2SkipL1SkipPos = l1SkipPos; - l2SkipCnt = 0; - ++l3SkipCnt; - if (l3SkipCnt >= L3SKIPSTRIDE) { - // L3 docid delta - docIdDelta = lastDocId - lastL3SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l3Skip.encode(docIdDelta - 1); 
- lastL3SkipDocId = lastDocId; - // L3 docid pos - docIdPos = _zcDocIds.size(); - _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1); - lastL3SkipDocIdPos = docIdPos; - if (encodeFeatures) { - // L3 features pos - _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1); - lastL3SkipFeaturePos = featurePos; - } - // L3 L1Skip pos - l1SkipPos = _l1Skip.size(); - _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1); - lastL3SkipL1SkipPos = l1SkipPos; - // L3 L2Skip pos - uint64_t l2SkipPos = _l2Skip.size(); - _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1); - lastL3SkipL2SkipPos = l2SkipPos; - l3SkipCnt = 0; - ++l4SkipCnt; - if (l4SkipCnt >= L4SKIPSTRIDE) { - // L4 docid delta - docIdDelta = lastDocId - lastL4SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l4Skip.encode(docIdDelta - 1); - lastL4SkipDocId = lastDocId; - // L4 docid pos - docIdPos = _zcDocIds.size(); - _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1); - lastL4SkipDocIdPos = docIdPos; - if (encodeFeatures) { - // L4 features pos - _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1); - lastL4SkipFeaturePos = featurePos; - } - // L4 L1Skip pos - l1SkipPos = _l1Skip.size(); - _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1); - lastL4SkipL1SkipPos = l1SkipPos; - // L4 L2Skip pos - l2SkipPos = _l2Skip.size(); - _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1); - lastL4SkipL2SkipPos = l2SkipPos; - // L4 L3Skip pos - uint64_t l3SkipPos = _l3Skip.size(); - _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1); - lastL4SkipL3SkipPos = l3SkipPos; - l4SkipCnt = 0; + for (const auto &doc_id_and_feature_size : _docIds) { + if (l1_skip_encoder.should_write_skip(L1SKIPSTRIDE)) { + l1_skip_encoder.write_skip(_l1Skip, doc_id_encoder); + if (l2_skip_encoder.should_write_skip(L2SKIPSTRIDE)) { + l2_skip_encoder.write_skip(_l2Skip, l1_skip_encoder); + if (l3_skip_encoder.should_write_skip(L3SKIPSTRIDE)) { + l3_skip_encoder.write_skip(_l3Skip, l2_skip_encoder); + if 
(l4_skip_encoder.should_write_skip(L4SKIPSTRIDE)) { + l4_skip_encoder.write_skip(_l4Skip, l3_skip_encoder); } } } } - uint32_t docId = dit->first; - featurePos += dit->second; - _zcDocIds.encode(docId - lastDocId - 1); - lastDocId = docId; - ++l1SkipCnt; + doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size); } // Extra partial entries for skip tables to simplify iterator during search - if (_l1Skip.size() > 0) { - _l1Skip.encode(lastDocId - lastL1SkipDocId - 1); - } - if (_l2Skip.size() > 0) { - _l2Skip.encode(lastDocId - lastL2SkipDocId - 1); - } - if (_l3Skip.size() > 0) { - _l3Skip.encode(lastDocId - lastL3SkipDocId - 1); - } - if (_l4Skip.size() > 0) { - _l4Skip.encode(lastDocId - lastL4SkipDocId - 1); - } + l1_skip_encoder.write_partial_skip(_l1Skip, doc_id_encoder.get_doc_id()); + l2_skip_encoder.write_partial_skip(_l2Skip, doc_id_encoder.get_doc_id()); + l3_skip_encoder.write_partial_skip(_l3Skip, doc_id_encoder.get_doc_id()); + l4_skip_encoder.write_partial_skip(_l4Skip, doc_id_encoder.get_doc_id()); } void diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h index e803fc692c3..6da59028803 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h @@ -47,7 +47,7 @@ protected: Zc4PostingWriterBase &operator=(Zc4PostingWriterBase &&) = delete; Zc4PostingWriterBase(index::PostingListCounts &counts); ~Zc4PostingWriterBase(); - void calc_skip_info(bool encodeFeatures); + void calc_skip_info(bool encode_features); void clear_skip_info(); public: diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp index 10c08af92cb..3ae2a631cb1 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp @@ -16,14 +16,12 @@ using 
search::index::PostingListCountFileSeqRead; using search::index::PostingListCountFileSeqWrite; Zc4PosOccSeqRead::Zc4PosOccSeqRead(PostingListCountFileSeqRead *countFile) - : Zc4PostingSeqRead(countFile), + : Zc4PostingSeqRead(countFile, false), _fieldsParams(), _cookedDecodeContext(&_fieldsParams), _rawDecodeContext(&_fieldsParams) { - _decodeContext = &_cookedDecodeContext; - _decodeContext->setReadContext(&_readContext); - _readContext.setDecodeContext(_decodeContext); + _reader.set_decode_features(&_cookedDecodeContext); } @@ -31,18 +29,17 @@ void Zc4PosOccSeqRead:: setFeatureParams(const PostingListParams ¶ms) { - bool oldCooked = _decodeContext == &_cookedDecodeContext; + bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext; bool newCooked = oldCooked; params.get("cooked", newCooked); if (oldCooked != newCooked) { if (newCooked) { _cookedDecodeContext = _rawDecodeContext; - _decodeContext = &_cookedDecodeContext; + _reader.set_decode_features(&_cookedDecodeContext); } else { _rawDecodeContext = _cookedDecodeContext; - _decodeContext = &_rawDecodeContext; + _reader.set_decode_features(&_rawDecodeContext); } - _readContext.setDecodeContext(_decodeContext); } } @@ -69,14 +66,12 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema, ZcPosOccSeqRead::ZcPosOccSeqRead(PostingListCountFileSeqRead *countFile) - : ZcPostingSeqRead(countFile), + : Zc4PostingSeqRead(countFile, true), _fieldsParams(), _cookedDecodeContext(&_fieldsParams), _rawDecodeContext(&_fieldsParams) { - _decodeContext = &_cookedDecodeContext; - _decodeContext->setReadContext(&_readContext); - _readContext.setDecodeContext(_decodeContext); + _reader.set_decode_features(&_cookedDecodeContext); } @@ -84,18 +79,17 @@ void ZcPosOccSeqRead:: setFeatureParams(const PostingListParams ¶ms) { - bool oldCooked = _decodeContext == &_cookedDecodeContext; + bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext; bool newCooked = oldCooked; params.get("cooked", 
newCooked); if (oldCooked != newCooked) { if (newCooked) { _cookedDecodeContext = _rawDecodeContext; - _decodeContext = &_cookedDecodeContext; + _reader.set_decode_features(&_cookedDecodeContext); } else { _rawDecodeContext = _cookedDecodeContext; - _decodeContext = &_rawDecodeContext; + _reader.set_decode_features(&_rawDecodeContext); } - _readContext.setDecodeContext(_decodeContext); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h index cd21fb02f33..1e0555116ce 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h @@ -34,7 +34,7 @@ public: }; -class ZcPosOccSeqRead : public ZcPostingSeqRead +class ZcPosOccSeqRead : public Zc4PostingSeqRead { private: bitcompression::PosOccFieldsParams _fieldsParams; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp index e40842737c9..a0203b64197 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp @@ -29,60 +29,19 @@ using bitcompression::FeatureEncodeContextBE; using vespalib::getLastErrorString; -Zc4PostingSeqRead:: -Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile) +Zc4PostingSeqRead::Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile, bool dynamic_k) : PostingListFileSeqRead(), - _decodeContext(), - _docIdK(0), - _prevDocId(0), - _numDocs(0), - _readContext(sizeof(uint64_t)), + _reader(dynamic_k), _file(), - _hasMore(false), - _dynamicK(false), - _lastDocId(0), - _minChunkDocs(1 << 30), - _minSkipDocs(64), - _docIdLimit(10000000), - _zcDocIds(), - _l1Skip(), - _l2Skip(), - _l3Skip(), - _l4Skip(), _numWords(0), _fileBitSize(0), - _chunkNo(0), - _l1SkipDocId(0), - _l1SkipDocIdPos(0), - _l1SkipFeaturesPos(0), - _l2SkipDocId(0), - _l2SkipDocIdPos(0), - _l2SkipL1SkipPos(0), - _l2SkipFeaturesPos(0), - _l3SkipDocId(0), - 
_l3SkipDocIdPos(0), - _l3SkipL1SkipPos(0), - _l3SkipL2SkipPos(0), - _l3SkipFeaturesPos(0), - _l4SkipDocId(0), - _l4SkipDocIdPos(0), - _l4SkipL1SkipPos(0), - _l4SkipL2SkipPos(0), - _l4SkipL3SkipPos(0), - _l4SkipFeaturesPos(0), - _featuresSize(0), - _countFile(countFile), - _headerBitLen(0), - _rangeEndOffset(0), - _readAheadEndOffset(0), - _wordStart(0), - _residue(0) + _countFile(countFile) { if (_countFile != nullptr) { PostingListParams params; _countFile->getParams(params); - params.get("docIdLimit", _docIdLimit); - params.get("minChunkDocs", _minChunkDocs); + params.get("docIdLimit", _reader.get_posting_params()._doc_id_limit); + params.get("minChunkDocs", _reader.get_posting_params()._min_chunk_docs); } } @@ -91,387 +50,16 @@ Zc4PostingSeqRead::~Zc4PostingSeqRead() { } - -void -Zc4PostingSeqRead:: -readCommonWordDocIdAndFeatures(DocIdAndFeatures &features) -{ - if ((_zcDocIds._valI >= _zcDocIds._valE) && _hasMore) { - readWordStart(); // Read start of next chunk - } - // Split docid & features. 
- assert(_zcDocIds._valI < _zcDocIds._valE); - uint32_t docIdPos = _zcDocIds.pos(); - uint32_t docId = _prevDocId + 1 + _zcDocIds.decode(); - features._docId = docId; - _prevDocId = docId; - assert(docId <= _lastDocId); - if (docId > _l1SkipDocId) { - _l1SkipDocIdPos += _l1Skip.decode() + 1; - assert(docIdPos == _l1SkipDocIdPos); - _l1SkipFeaturesPos += _l1Skip.decode() + 1; - uint64_t featuresPos = _decodeContext->getReadOffset(); - assert(featuresPos == _l1SkipFeaturesPos); - (void) featuresPos; - if (docId > _l2SkipDocId) { - _l2SkipDocIdPos += _l2Skip.decode() + 1; - assert(docIdPos == _l2SkipDocIdPos); - _l2SkipFeaturesPos += _l2Skip.decode() + 1; - assert(featuresPos == _l2SkipFeaturesPos); - _l2SkipL1SkipPos += _l2Skip.decode() + 1; - assert(_l1Skip.pos() == _l2SkipL1SkipPos); - if (docId > _l3SkipDocId) { - _l3SkipDocIdPos += _l3Skip.decode() + 1; - assert(docIdPos == _l3SkipDocIdPos); - _l3SkipFeaturesPos += _l3Skip.decode() + 1; - assert(featuresPos == _l3SkipFeaturesPos); - _l3SkipL1SkipPos += _l3Skip.decode() + 1; - assert(_l1Skip.pos() == _l3SkipL1SkipPos); - _l3SkipL2SkipPos += _l3Skip.decode() + 1; - assert(_l2Skip.pos() == _l3SkipL2SkipPos); - if (docId > _l4SkipDocId) { - _l4SkipDocIdPos += _l4Skip.decode() + 1; - assert(docIdPos == _l4SkipDocIdPos); - (void) docIdPos; - _l4SkipFeaturesPos += _l4Skip.decode() + 1; - assert(featuresPos == _l4SkipFeaturesPos); - _l4SkipL1SkipPos += _l4Skip.decode() + 1; - assert(_l1Skip.pos() == _l4SkipL1SkipPos); - _l4SkipL2SkipPos += _l4Skip.decode() + 1; - assert(_l2Skip.pos() == _l4SkipL2SkipPos); - _l4SkipL3SkipPos += _l4Skip.decode() + 1; - assert(_l3Skip.pos() == _l4SkipL3SkipPos); - _l4SkipDocId += _l4Skip.decode() + 1; - assert(_l4SkipDocId <= _lastDocId); - assert(_l4SkipDocId >= docId); - } - _l3SkipDocId += _l3Skip.decode() + 1; - assert(_l3SkipDocId <= _lastDocId); - assert(_l3SkipDocId <= _l4SkipDocId); - assert(_l3SkipDocId >= docId); - } - _l2SkipDocId += _l2Skip.decode() + 1; - assert(_l2SkipDocId <= 
_lastDocId); - assert(_l2SkipDocId <= _l4SkipDocId); - assert(_l2SkipDocId <= _l3SkipDocId); - assert(_l2SkipDocId >= docId); - } - _l1SkipDocId += _l1Skip.decode() + 1; - assert(_l1SkipDocId <= _lastDocId); - assert(_l1SkipDocId <= _l4SkipDocId); - assert(_l1SkipDocId <= _l3SkipDocId); - assert(_l1SkipDocId <= _l2SkipDocId); - assert(_l1SkipDocId >= docId); - } - if (docId < _lastDocId) { - // Assert more space available when not yet at last docid - assert(_zcDocIds._valI < _zcDocIds._valE); - } else { - // Assert that space has been used when at last docid - assert(_zcDocIds._valI == _zcDocIds._valE); - // Assert that we've read to end of skip info - assert(_l1SkipDocId == _lastDocId); - assert(_l2SkipDocId == _lastDocId); - assert(_l3SkipDocId == _lastDocId); - assert(_l4SkipDocId == _lastDocId); - if (!_hasMore) { - _chunkNo = 0; - } - } - _decodeContext->readFeatures(features); - --_residue; -} - - -void -Zc4PostingSeqRead:: -readDocIdAndFeatures(DocIdAndFeatures &features) -{ - if (_residue == 0 && !_hasMore) { - if (_rangeEndOffset != 0) { - DecodeContext &d = *_decodeContext; - uint64_t curOffset = d.getReadOffset(); - assert(curOffset <= _rangeEndOffset); - if (curOffset < _rangeEndOffset) { - readWordStart(); - } - } - if (_residue == 0) { - // Don't read past end of posting list. 
- features.clear(static_cast<uint32_t>(-1)); - return; - } - } - if (_lastDocId > 0) { - return readCommonWordDocIdAndFeatures(features); - } - // Interleaves docid & features - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - uint32_t length; - uint64_t val64; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - - UC64BE_DECODEEXPGOLOMB_SMALL_NS(o, - K_VALUE_ZCPOSTING_DELTA_DOCID, - EC); - uint32_t docId = _prevDocId + 1 + val64; - features._docId = docId; - _prevDocId = docId; - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= d._valE, false)) { - _readContext.readComprBuffer(); - } - _decodeContext->readFeatures(features); - --_residue; -} - - -void -Zc4PostingSeqRead::readWordStartWithSkip() -{ - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - const uint64_t *valE = d._valE; - - if (_hasMore) { - ++_chunkNo; - } else { - _chunkNo = 0; - } - assert(_numDocs >= _minSkipDocs || _hasMore); - bool hasMore = false; - if (__builtin_expect(_numDocs >= _minChunkDocs, false)) { - hasMore = static_cast<int64_t>(oVal) < 0; - oVal <<= 1; - length = 1; - UC64BE_READBITS_NS(o, EC); - } - if (_dynamicK) { - _docIdK = EC::calcDocIdK((_hasMore || hasMore) ? 
1 : _numDocs, - _docIdLimit); - } - if (_hasMore || hasMore) { - if (_rangeEndOffset == 0) { - assert(hasMore == (_chunkNo + 1 < _counts._segments.size())); - assert(_numDocs == _counts._segments[_chunkNo]._numDocs); - } - if (hasMore) { - assert(_numDocs >= _minSkipDocs); - assert(_numDocs >= _minChunkDocs); - } - } else { - assert(_numDocs >= _minSkipDocs); - if (_rangeEndOffset == 0) { - assert(_numDocs == _counts._numDocs); - } - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_DOCIDSSIZE, - EC); - uint32_t docIdsSize = val64 + 1; - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L1SKIPSIZE, - EC); - uint32_t l1SkipSize = val64; - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l2SkipSize = 0; - if (l1SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L2SKIPSIZE, - EC); - l2SkipSize = val64; - } - uint32_t l3SkipSize = 0; - if (l2SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L3SKIPSIZE, - EC); - l3SkipSize = val64; - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l4SkipSize = 0; - if (l3SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L4SKIPSIZE, - EC); - l4SkipSize = val64; - } - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_FEATURESSIZE, - EC); - _featuresSize = val64; - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - if (_dynamicK) { - UC64BE_DECODEEXPGOLOMB_NS(o, - _docIdK, - EC); - } else { - 
UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_LASTDOCID, - EC); - } - _lastDocId = _docIdLimit - 1 - val64; - if (_hasMore || hasMore) { - if (_rangeEndOffset == 0) { - assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc); - } - } - - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - oVal <<= length; - UC64BE_READBITS_NS(o, EC); - } - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= valE, false)) { - _readContext.readComprBuffer(); - } - _zcDocIds.clearReserve(docIdsSize); - _l1Skip.clearReserve(l1SkipSize); - _l2Skip.clearReserve(l2SkipSize); - _l3Skip.clearReserve(l3SkipSize); - _l4Skip.clearReserve(l4SkipSize); - _decodeContext->readBytes(_zcDocIds._valI, docIdsSize); - _zcDocIds._valE = _zcDocIds._valI + docIdsSize; - if (l1SkipSize > 0) { - _decodeContext->readBytes(_l1Skip._valI, l1SkipSize); - } - _l1Skip._valE = _l1Skip._valI + l1SkipSize; - if (l2SkipSize > 0) { - _decodeContext->readBytes(_l2Skip._valI, l2SkipSize); - } - _l2Skip._valE = _l2Skip._valI + l2SkipSize; - if (l3SkipSize > 0) { - _decodeContext->readBytes(_l3Skip._valI, l3SkipSize); - } - _l3Skip._valE = _l3Skip._valI + l3SkipSize; - if (l4SkipSize > 0) { - _decodeContext->readBytes(_l4Skip._valI, l4SkipSize); - } - _l4Skip._valE = _l4Skip._valI + l4SkipSize; - - if (l1SkipSize > 0) { - _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId; - } else { - _l1SkipDocId = _lastDocId; - } - if (l2SkipSize > 0) { - _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId; - } else { - _l2SkipDocId = _lastDocId; - } - if (l3SkipSize > 0) { - _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId; - } else { - _l3SkipDocId = _lastDocId; - } - if (l4SkipSize > 0) { - _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId; - } else { - _l4SkipDocId = _lastDocId; - } - _l1SkipDocIdPos = 0; - 
_l1SkipFeaturesPos = _decodeContext->getReadOffset(); - _l2SkipDocIdPos = 0; - _l2SkipL1SkipPos = 0; - _l2SkipFeaturesPos = _decodeContext->getReadOffset(); - _l3SkipDocIdPos = 0; - _l3SkipL1SkipPos = 0; - _l3SkipL2SkipPos = 0; - _l3SkipFeaturesPos = _decodeContext->getReadOffset(); - _l4SkipDocIdPos = 0; - _l4SkipL1SkipPos = 0; - _l4SkipL2SkipPos = 0; - _l4SkipL3SkipPos = 0; - _l4SkipFeaturesPos = _decodeContext->getReadOffset(); - _hasMore = hasMore; - // Decode context is now positioned at start of features -} - - void -Zc4PostingSeqRead::readWordStart() +Zc4PostingSeqRead::readDocIdAndFeatures(DocIdAndFeatures &features) { - typedef FeatureEncodeContextBE EC; - UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_); - uint32_t length; - uint64_t val64; - const uint64_t *valE = _decodeContext->_valE; - - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_NUMDOCS, - EC); - UC64_DECODECONTEXT_STORE(o, _decodeContext->_); - if (oCompr >= valE) { - _readContext.readComprBuffer(); - } - _numDocs = static_cast<uint32_t>(val64) + 1; - _residue = _numDocs; - _prevDocId = _hasMore ? _lastDocId : 0u; - if (_rangeEndOffset == 0) { - assert(_numDocs <= _counts._numDocs); - assert(_numDocs == _counts._numDocs || - _numDocs >= _minChunkDocs || - _hasMore); - } - - if (_numDocs >= _minSkipDocs || _hasMore) { - readWordStartWithSkip(); - // Decode context is not positioned at start of features - } else { - if (_dynamicK) { - _docIdK = EC::calcDocIdK(_numDocs, _docIdLimit); - } - _lastDocId = 0u; - // Decode context is not positioned at start of docids & features - } + _reader.read_doc_id_and_features(features); } - void Zc4PostingSeqRead::readCounts(const PostingListCounts &counts) { - assert(!_hasMore); // Previous words must have been read. 
- - _counts = counts; - - assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); - if (_counts._numDocs > 0) { - _wordStart = _decodeContext->getReadOffset(); - readWordStart(); - } + _reader.set_counts(counts); } @@ -484,16 +72,17 @@ Zc4PostingSeqRead::open(const vespalib::string &name, } bool res = _file.OpenReadOnly(name.c_str()); if (res) { - _readContext.setFile(&_file); - _readContext.setFileSize(_file.GetSize()); - DecodeContext &d = *_decodeContext; - _readContext.allocComprBuf(65536u, 32768u); + auto &readContext = _reader.get_read_context(); + readContext.setFile(&_file); + readContext.setFileSize(_file.GetSize()); + auto &d = _reader.get_decode_features(); + readContext.allocComprBuf(65536u, 32768u); d.emptyBuffer(0); - _readContext.readComprBuffer(); + readContext.readComprBuffer(); readHeader(); if (d._valI >= d._valE) { - _readContext.readComprBuffer(); + readContext.readComprBuffer(); } } else { LOG(error, "could not open %s: %s", @@ -506,9 +95,10 @@ Zc4PostingSeqRead::open(const vespalib::string &name, bool Zc4PostingSeqRead::close() { - _readContext.dropComprBuf(); + auto &readContext = _reader.get_read_context(); + readContext.dropComprBuf(); _file.Close(); - _readContext.setFile(nullptr); + readContext.setFile(nullptr); return true; } @@ -524,29 +114,30 @@ Zc4PostingSeqRead::getParams(PostingListParams ¶ms) uint32_t countMinChunkDocs = 0; countParams.get("docIdLimit", countDocIdLimit); countParams.get("minChunkDocs", countMinChunkDocs); - assert(_docIdLimit == countDocIdLimit); - assert(_minChunkDocs == countMinChunkDocs); + assert(_reader.get_posting_params()._doc_id_limit == countDocIdLimit); + assert(_reader.get_posting_params()._min_chunk_docs == countMinChunkDocs); } else { params.clear(); - params.set("docIdLimit", _docIdLimit); - params.set("minChunkDocs", _minChunkDocs); + params.set("docIdLimit", _reader.get_posting_params()._doc_id_limit); + params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs); } - 
params.set("minSkipDocs", _minSkipDocs); + params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs); } void Zc4PostingSeqRead::getFeatureParams(PostingListParams ¶ms) { - _decodeContext->getParams(params); + _reader.get_decode_features().getParams(params); } void Zc4PostingSeqRead::readHeader() { - FeatureDecodeContextBE &d = *_decodeContext; - const vespalib::string &myId = _dynamicK ? myId5 : myId4; + FeatureDecodeContextBE &d = _reader.get_decode_features(); + auto &posting_params = _reader.get_posting_params(); + const vespalib::string &myId = posting_params._dynamic_k ? myId5 : myId4; vespalib::FileHeader header; d.readHeader(header, _file.getSize()); @@ -571,9 +162,9 @@ Zc4PostingSeqRead::readHeader() (void) myId; assert(header.getTag("format.1").asString() == d.getIdentifier()); _numWords = header.getTag("numWords").asInteger(); - _minChunkDocs = header.getTag("minChunkDocs").asInteger(); - _docIdLimit = header.getTag("docIdLimit").asInteger(); - _minSkipDocs = header.getTag("minSkipDocs").asInteger(); + posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger(); + posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger(); + posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger(); assert(header.getTag("endian").asString() == "big"); // Read feature decoding specific subheader d.readHeader(header, "features."); @@ -585,38 +176,9 @@ Zc4PostingSeqRead::readHeader() const vespalib::string & -Zc4PostingSeqRead::getIdentifier() -{ - return myId4; -} - - -uint64_t -Zc4PostingSeqRead::getCurrentPostingOffset() const +Zc4PostingSeqRead::getIdentifier(bool dynamic_k) { - FeatureDecodeContextBE &d = *_decodeContext; - return d.getReadOffset() - _headerBitLen; -} - - -void -Zc4PostingSeqRead::setPostingOffset(uint64_t offset, - uint64_t endOffset, - uint64_t readAheadOffset) -{ - assert(_residue == 0); // Only to be called between posting lists - - FeatureDecodeContextBE &d = *_decodeContext; - - 
_rangeEndOffset = endOffset + _headerBitLen; - _readAheadEndOffset = readAheadOffset + _headerBitLen; - _readContext.setStopOffset(_readAheadEndOffset, false); - uint64_t newOffset = offset + _headerBitLen; - if (newOffset != d.getReadOffset()) { - _readContext.setPosition(newOffset); - assert(newOffset == d.getReadOffset()); - _readContext.readComprBuffer(); - } + return (dynamic_k ? myId5 : myId4); } @@ -809,65 +371,6 @@ getFeatureParams(PostingListParams ¶ms) } -ZcPostingSeqRead::ZcPostingSeqRead(PostingListCountFileSeqRead *countFile) - : Zc4PostingSeqRead(countFile) -{ - _dynamicK = true; -} - - -void -ZcPostingSeqRead:: -readDocIdAndFeatures(DocIdAndFeatures &features) -{ - if (_residue == 0 && !_hasMore) { - if (_rangeEndOffset != 0) { - DecodeContext &d = *_decodeContext; - uint64_t curOffset = d.getReadOffset(); - assert(curOffset <= _rangeEndOffset); - if (curOffset < _rangeEndOffset) { - readWordStart(); - } - } - if (_residue == 0) { - // Don't read past end of posting list. 
- features.clear(static_cast<uint32_t>(-1)); - return; - } - } - if (_lastDocId > 0) { - readCommonWordDocIdAndFeatures(features); - return; - } - // Interleaves docid & features - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - uint32_t length; - uint64_t val64; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - - UC64BE_DECODEEXPGOLOMB_SMALL_NS(o, - _docIdK, - EC); - uint32_t docId = _prevDocId + 1 + val64; - features._docId = docId; - _prevDocId = docId; - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= d._valE, false)) { - _readContext.readComprBuffer(); - } - _decodeContext->readFeatures(features); - --_residue; -} - - -const vespalib::string & -ZcPostingSeqRead::getIdentifier() -{ - return myId5; -} - - ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile) : Zc4PostingSeqWrite(countFile) { diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h index 96cc306cea8..01049e720a9 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h @@ -3,8 +3,10 @@ #pragma once #include "zc4_posting_writer.h" +#include "zc4_posting_reader.h" #include <vespa/searchlib/index/postinglistfile.h> #include <vespa/fastos/file.h> +#include "zc4_posting_params.h" namespace search::index { class PostingListCountFileSeqRead; @@ -19,63 +21,14 @@ class Zc4PostingSeqRead : public index::PostingListFileSeqRead Zc4PostingSeqRead &operator=(const Zc4PostingSeqRead &); protected: - typedef bitcompression::FeatureDecodeContextBE DecodeContext; - typedef bitcompression::FeatureEncodeContextBE EncodeContext; - - DecodeContext *_decodeContext; - uint32_t _docIdK; - uint32_t _prevDocId; // Previous document id - uint32_t _numDocs; // Documents in chunk or word - search::ComprFileReadContext _readContext; + Zc4PostingReader<true> _reader; FastOS_File _file; - bool _hasMore; - bool _dynamicK; // Caclulate EG 
compression parameters ? - uint32_t _lastDocId; // last document in chunk or word - uint32_t _minChunkDocs; // # of documents needed for chunking - uint32_t _minSkipDocs; // # of documents needed for skipping - uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit) - - ZcBuf _zcDocIds; // Document id deltas - ZcBuf _l1Skip; // L1 skip info - ZcBuf _l2Skip; // L2 skip info - ZcBuf _l3Skip; // L3 skip info - ZcBuf _l4Skip; // L4 skip info - uint64_t _numWords; // Number of words in file uint64_t _fileBitSize; - uint32_t _chunkNo; // Chunk number - - // Variables for validating skip information while reading - uint32_t _l1SkipDocId; - uint32_t _l1SkipDocIdPos; - uint64_t _l1SkipFeaturesPos; - uint32_t _l2SkipDocId; - uint32_t _l2SkipDocIdPos; - uint32_t _l2SkipL1SkipPos; - uint64_t _l2SkipFeaturesPos; - uint32_t _l3SkipDocId; - uint32_t _l3SkipDocIdPos; - uint32_t _l3SkipL1SkipPos; - uint32_t _l3SkipL2SkipPos; - uint64_t _l3SkipFeaturesPos; - uint32_t _l4SkipDocId; - uint32_t _l4SkipDocIdPos; - uint32_t _l4SkipL1SkipPos; - uint32_t _l4SkipL2SkipPos; - uint32_t _l4SkipL3SkipPos; - uint64_t _l4SkipFeaturesPos; - - // Variable for validating chunk information while reading - uint64_t _featuresSize; index::PostingListCountFileSeqRead *const _countFile; - uint64_t _headerBitLen; // Size of file header in bits - uint64_t _rangeEndOffset; // End offset for word pair - uint64_t _readAheadEndOffset; // Readahead end offset for word pair - uint64_t _wordStart; // last word header position - uint32_t _residue; // Number of unread documents after word header public: - Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile); + Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile, bool dynamic_k); ~Zc4PostingSeqRead(); @@ -83,11 +36,6 @@ public: typedef index::PostingListCounts PostingListCounts; typedef index::PostingListParams PostingListParams; - /** - * Read document id and features for common word. 
- */ - virtual void readCommonWordDocIdAndFeatures(DocIdAndFeatures &features); - void readDocIdAndFeatures(DocIdAndFeatures &features) override; void readCounts(const PostingListCounts &counts) override; // Fill in for next word bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override; @@ -97,28 +45,7 @@ public: void readWordStartWithSkip(); void readWordStart(); void readHeader(); - static const vespalib::string &getIdentifier(); - - // Methods used when generating posting list for common word pairs. - - /* - * Get current posting offset, measured in bits. First posting list - * starts at 0, i.e. file header is not accounted for here. - * - * @return current posting offset, measured in bits. - */ - uint64_t getCurrentPostingOffset() const override; - - /** - * Set current posting offset, measured in bits. First posting - * list starts at 0, i.e. file header is not accounted for here. - * - * @param Offset start of posting lists for word pair. - * @param endOffset end of posting lists for word pair. - * @param readAheadOffset end of posting list for either this or a - * later word pair, depending on disk seek cost. 
- */ - void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) override; + static const vespalib::string &getIdentifier(bool dynamic_k); }; @@ -161,15 +88,6 @@ public: void updateHeader(); }; - -class ZcPostingSeqRead : public Zc4PostingSeqRead -{ -public: - ZcPostingSeqRead(index::PostingListCountFileSeqRead *countFile); - void readDocIdAndFeatures(DocIdAndFeatures &features) override; - static const vespalib::string &getIdentifier(); -}; - class ZcPostingSeqWrite : public Zc4PostingSeqWrite { public: diff --git a/searchlib/src/vespa/searchlib/features/attributefeature.cpp b/searchlib/src/vespa/searchlib/features/attributefeature.cpp index b3ebd0f3822..56d02ce6d4e 100644 --- a/searchlib/src/vespa/searchlib/features/attributefeature.cpp +++ b/searchlib/src/vespa/searchlib/features/attributefeature.cpp @@ -295,10 +295,13 @@ AttributeBlueprint::setup(const search::fef::IIndexEnvironment & env, vespalib::string attrType = type::Attribute::lookup(env.getProperties(), _attrName); if (!attrType.empty()) { _tensorType = ValueType::from_spec(attrType); + if (_tensorType.is_error()) { + LOG(error, "%s: invalid type: '%s'", getName().c_str(), attrType.c_str()); + } } - FeatureType output_type = _tensorType.is_tensor() - ? FeatureType::object(_tensorType) - : FeatureType::number(); + FeatureType output_type = _tensorType.is_double() + ? 
FeatureType::number() + : FeatureType::object(_tensorType); describeOutput("value", "The value of a single value attribute, " "the value at the given index of an array attribute, " "the given key of a weighted set attribute, or" @@ -309,7 +312,7 @@ AttributeBlueprint::setup(const search::fef::IIndexEnvironment & env, describeOutput("count", "Returns the number of elements in this array or weighted set attribute."); } env.hintAttributeAccess(_attrName); - return true; + return !_tensorType.is_error(); } search::fef::Blueprint::UP diff --git a/searchlib/src/vespa/searchlib/features/constant_feature.cpp b/searchlib/src/vespa/searchlib/features/constant_feature.cpp index 4d76512ab00..ced9d95fb33 100644 --- a/searchlib/src/vespa/searchlib/features/constant_feature.cpp +++ b/searchlib/src/vespa/searchlib/features/constant_feature.cpp @@ -63,8 +63,10 @@ ConstantBlueprint::setup(const IIndexEnvironment &env, { _key = params[0].getValue(); _value = env.getConstantValue(_key); - if (!_value || _value->type().is_error()) { + if (!_value) { LOG(error, "Constant '%s' not found", _key.c_str()); + } else if (_value->type().is_error()) { + LOG(error, "Constant '%s' has invalid type", _key.c_str()); } FeatureType output_type = _value ? 
FeatureType::object(_value->type()) : diff --git a/searchlib/src/vespa/searchlib/features/queryfeature.cpp b/searchlib/src/vespa/searchlib/features/queryfeature.cpp index eb7eb427283..b9041901ced 100644 --- a/searchlib/src/vespa/searchlib/features/queryfeature.cpp +++ b/searchlib/src/vespa/searchlib/features/queryfeature.cpp @@ -98,12 +98,15 @@ QueryBlueprint::setup(const IIndexEnvironment &env, const ParameterList ¶ms) vespalib::string queryFeatureType = type::QueryFeature::lookup(env.getProperties(), _key); if (!queryFeatureType.empty()) { _valueType = ValueType::from_spec(queryFeatureType); + if (_valueType.is_error()) { + LOG(error, "%s: invalid type: '%s'", getName().c_str(), queryFeatureType.c_str()); + } } - FeatureType output_type = _valueType.is_tensor() - ? FeatureType::object(_valueType) - : FeatureType::number(); + FeatureType output_type = _valueType.is_double() + ? FeatureType::number() + : FeatureType::object(_valueType); describeOutput("out", "The value looked up in query properties using the given key.", output_type); - return true; + return !_valueType.is_error(); } namespace { diff --git a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp index 72865d042e7..b2c8c64d55a 100644 --- a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp +++ b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp @@ -239,9 +239,6 @@ RankingExpressionBlueprint::setup(const fef::IIndexEnvironment &env, LOG(error, "rank expression contains type errors: %s\n", script.c_str()); return false; } - if (root_type.is_any()) { - LOG(warning, "rank expression could produce run-time type errors: %s\n", script.c_str()); - } auto compile_issues = CompiledFunction::detect_issues(rank_function); auto interpret_issues = InterpretedFunction::detect_issues(rank_function); if (do_compile && compile_issues && !interpret_issues) { diff --git 
a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp index 513c542637d..07b4da8a85f 100644 --- a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp +++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp @@ -7,15 +7,15 @@ LOG_SETUP(".index.docidandfeatures"); namespace search::index { DocIdAndFeatures::DocIdAndFeatures() - : _docId(0), - _wordDocFeatures(), + : _doc_id(0), _elements(), - _wordPositions(), + _word_positions(), _blob(), - _bitOffset(0u), - _bitLength(0u), - _raw(false) -{ } + _bit_offset(0u), + _bit_length(0u), + _has_raw_data(false) +{ +} DocIdAndFeatures::DocIdAndFeatures(const DocIdAndFeatures &) = default; DocIdAndFeatures & DocIdAndFeatures::operator = (const DocIdAndFeatures &) = default; diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.h b/searchlib/src/vespa/searchlib/index/docidandfeatures.h index d1d44f78aa6..a063712a79e 100644 --- a/searchlib/src/vespa/searchlib/index/docidandfeatures.h +++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.h @@ -7,57 +7,25 @@ namespace search::index { -/* +/** * The following feature classes are not self contained. To reduce * memory allocator pressure, the DocIdAndFeatures class contains a * flattened representation of the features at different levels. */ -/* - * (word, doc) features. - * - * Present as member in DocIdAndFeatures. - */ -class WordDocFeatures { -public: - // TODO: add support for user features - - WordDocFeatures() { } - void clear() { } -}; - -/* - * (word, doc, field) features. - * - * Present as vector element in DocIdAndFeatures. 
- */ -class WordDocFieldFeatures { -public: - uint32_t _numElements; // Number of array indexes - // TODO: add support for user features - - WordDocFieldFeatures() - : _numElements(0u) - {} - - uint32_t getNumElements() const { return _numElements; } - void setNumElements(uint32_t numElements) { _numElements = numElements; } - void incNumElements() { ++_numElements; } -}; - -/* - * (word, doc, field, element) features. +/** + * (word, doc, element) features. * * Present as vector element in DocIdAndFeatures. */ class WordDocElementFeatures { -public: +private: uint32_t _elementId; // Array index uint32_t _numOccs; int32_t _weight; uint32_t _elementLen; - // TODO: add support for user features +public: WordDocElementFeatures() : _elementId(0u), _numOccs(0u), @@ -93,16 +61,16 @@ public: void incNumOccs() { ++_numOccs; } }; -/* - * (word, doc, field, element, wordpos) features. +/** + * (word, doc, element, wordpos) features. * * Present as vector element in DocIdAndFeatures. */ class WordDocElementWordPosFeatures { -public: +private: uint32_t _wordPos; - // TODO: add support for user features +public: WordDocElementWordPosFeatures() : _wordPos(0u) {} @@ -116,30 +84,27 @@ public: }; /** - * Class for minimal common representation of features available for a - * (word, doc) pair, used by index fusion to shuffle information from + * Class for minimal common representation of features available for a (word, doc) pair. + * + * Used in memory index and disk index posting lists and by index fusion to shuffle information from * input files to the output file without having to know all the details. 
*/ class DocIdAndFeatures { public: - uint32_t _docId; // Current Docid - // generic feature data, flattened to avoid excessive allocator usage - WordDocFeatures _wordDocFeatures; + using RawData = std::vector<uint64_t>; + +protected: + uint32_t _doc_id; // Current document id std::vector<WordDocElementFeatures> _elements; - std::vector<WordDocElementWordPosFeatures> _wordPositions; -#ifdef notyet - // user blobs (packed) - UserFeatures _userFeatures; - // TODO: Determine how to handle big endian versus little endian user - // features, and whether set of user features is contiguous in file or - // interleaved with predefined features (word position, word weight) -#endif - // raw data (file format specific, packed) - std::vector<uint64_t> _blob; // Feature data for (word, docid) pair - uint32_t _bitOffset; // Offset of feature start ([0..63]) - uint32_t _bitLength; // Length of features - bool _raw; // + std::vector<WordDocElementWordPosFeatures> _word_positions; + // Raw data (file format specific, packed) + RawData _blob; // Feature data for (word, docid) pair + uint32_t _bit_offset; // Offset of feature start ([0..63]) + uint32_t _bit_length; // Length of features + bool _has_raw_data; + +public: DocIdAndFeatures(); DocIdAndFeatures(const DocIdAndFeatures &); DocIdAndFeatures & operator = (const DocIdAndFeatures &); @@ -147,37 +112,49 @@ public: DocIdAndFeatures & operator = (DocIdAndFeatures &&) = default; ~DocIdAndFeatures(); - void clearFeatures() { - _wordDocFeatures.clear(); + void clear_features() { _elements.clear(); - _wordPositions.clear(); - _bitOffset = 0u; - _bitLength = 0u; + _word_positions.clear(); + _bit_offset = 0u; + _bit_length = 0u; _blob.clear(); } - void clearFeatures(uint32_t bitOffset) { - _wordDocFeatures.clear(); + void clear_features(uint32_t bit_offset) { _elements.clear(); - _wordPositions.clear(); - _bitOffset = bitOffset; - _bitLength = 0u; + _word_positions.clear(); + _bit_offset = bit_offset; + _bit_length = 0u; _blob.clear(); } 
- void clear(uint32_t docId) { - _docId = docId; - clearFeatures(); + void clear(uint32_t doc_id) { + _doc_id = doc_id; + clear_features(); } - void clear(uint32_t docId, uint32_t bitOffset) { - _docId = docId; - clearFeatures(bitOffset); + void clear(uint32_t doc_id, uint32_t bit_offset) { + _doc_id = doc_id; + clear_features(bit_offset); } - void setRaw(bool raw) { _raw = raw; } - bool getRaw() const { return _raw; } + uint32_t doc_id() const { return _doc_id; } + void set_doc_id(uint32_t val) { _doc_id = val; } + + const std::vector<WordDocElementFeatures>& elements() const { return _elements; } + std::vector<WordDocElementFeatures>& elements() { return _elements; } + + const std::vector<WordDocElementWordPosFeatures>& word_positions() const { return _word_positions; } + std::vector<WordDocElementWordPosFeatures>& word_positions() { return _word_positions; } + + const RawData& blob() const { return _blob; } + RawData& blob() { return _blob; } + uint32_t bit_offset() const { return _bit_offset; } + uint32_t bit_length() const { return _bit_length; } + void set_bit_length(uint32_t val) { _bit_length = val; } + bool has_raw_data() const { return _has_raw_data; } + void set_has_raw_data(bool val) { _has_raw_data = val; } }; } diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.cpp b/searchlib/src/vespa/searchlib/index/indexbuilder.cpp index 6b88c51e6cc..d585238107a 100644 --- a/searchlib/src/vespa/searchlib/index/indexbuilder.cpp +++ b/searchlib/src/vespa/searchlib/index/indexbuilder.cpp @@ -6,7 +6,8 @@ namespace search::index { IndexBuilder::IndexBuilder(const Schema &schema) : _schema(schema) -{ } +{ +} IndexBuilder::~IndexBuilder() = default; diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.h b/searchlib/src/vespa/searchlib/index/indexbuilder.h index 66ca740a20c..cf9df4bd154 100644 --- a/searchlib/src/vespa/searchlib/index/indexbuilder.h +++ b/searchlib/src/vespa/searchlib/index/indexbuilder.h @@ -5,9 +5,18 @@ namespace search::index { 
+class DocIdAndFeatures; class Schema; class WordDocElementWordPosFeatures; +/** + * Interface used to build an index for the set of index fields specified in a schema. + * + * The index should be built as follows: + * For each field add the set of unique words in sorted order. + * For each word add the set of document ids in sorted order. + * For each document id add the position information for that document. + */ class IndexBuilder { protected: const Schema &_schema; @@ -15,39 +24,12 @@ protected: public: IndexBuilder(const Schema &schema); - virtual - ~IndexBuilder(); - - virtual void - startWord(vespalib::stringref word) = 0; - - virtual void - endWord() = 0; - - virtual void - startDocument(uint32_t docId) = 0; - - virtual void - endDocument() = 0; - - virtual void - startField(uint32_t fieldId) = 0; - - virtual void - endField() = 0; - - virtual void - startElement(uint32_t elementId, int32_t weight, uint32_t elementLen) = 0; - - virtual void - endElement() = 0; - - virtual void - addOcc(const WordDocElementWordPosFeatures &features) = 0; - - // TODO: methods for attribute vectors. - - // TODO: methods for document summary. 
+ virtual ~IndexBuilder(); + virtual void startField(uint32_t fieldId) = 0; + virtual void endField() = 0; + virtual void startWord(vespalib::stringref word) = 0; + virtual void endWord() = 0; + virtual void add_document(const DocIdAndFeatures &features) = 0; }; } diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp index 0f0860f9145..52c6b85a0b8 100644 --- a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp +++ b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp @@ -6,8 +6,6 @@ namespace search::index { PostingListFileSeqRead::PostingListFileSeqRead() - : _counts(), - _residueDocs(0) { } diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h index 194ac519a19..1e7dde7f139 100644 --- a/searchlib/src/vespa/searchlib/index/postinglistfile.h +++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h @@ -19,9 +19,6 @@ class DocIdAndFeatures; * for words. */ class PostingListFileSeqRead { -protected: - PostingListCounts _counts; - unsigned int _residueDocs; // Docids left to read for word public: PostingListFileSeqRead(); @@ -63,34 +60,6 @@ public: * Get current (word, docid) feature parameters. */ virtual void getFeatureParams(PostingListParams ¶ms); - - // Methods used when generating posting list for common word pairs. - - /* - * Get current posting offset, measured in bits. First posting list - * starts at 0, i.e. file header is not accounted for here. - * - * @return current posting offset, measured in bits. - */ - virtual uint64_t getCurrentPostingOffset() const = 0; - - /** - * Set current posting offset, measured in bits. First posting - * list starts at 0, i.e. file header is not accounted for here. - * - * @param Offset start of posting lists for word pair. - * @param endOffset end of posting lists for word pair. 
- * @param readAheadOffset end of posting list for either this or a - * later word pair, depending on disk seek cost. - */ - virtual void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) = 0; - - /** - * Get counts read by last readCounts(). - */ - const PostingListCounts &getCounts() const { return _counts; } - - PostingListCounts &getCounts() { return _counts; } }; /** diff --git a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp index 974fcc01c36..1d55ed76a09 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/feature_store.cpp @@ -21,7 +21,7 @@ FeatureStore::writeFeatures(uint32_t packedIndex, const DocIdAndFeatures &featur oldOffset = 0; assert(_f.getWriteOffset() == oldOffset); } - assert(!features.getRaw()); + assert(!features.has_raw_data()); _f.writeFeatures(features); return oldOffset; } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 7d10895c32f..e79cab28dec 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -169,23 +169,10 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilder) for (; pitr.valid(); ++pitr) { uint32_t docId = pitr.getKey(); EntryRef featureRef(pitr.getData()); - indexBuilder.startDocument(docId); _featureStore.setupForReadFeatures(featureRef, decoder); decoder.readFeatures(features); - size_t poff = 0; - uint32_t wpIdx = 0u; - size_t numElements = features._elements.size(); - for (size_t i = 0; i < numElements; ++i) { - const WordDocElementFeatures & fef = features._elements[i]; - indexBuilder.startElement(fef.getElementId(), fef.getWeight(), fef.getElementLen()); - for (size_t j = 0; j < fef.getNumOccs(); ++j, ++wpIdx) { - assert(wpIdx == poff + j); - 
indexBuilder.addOcc(features._wordPositions[poff + j]); - } - poff += fef.getNumOccs(); - indexBuilder.endElement(); - } - indexBuilder.endDocument(); + features.set_doc_id(docId); + indexBuilder.add_document(features); } } else { const PostingListKeyDataType *kd = @@ -194,23 +181,10 @@ FieldIndex::dump(search::index::IndexBuilder & indexBuilder) for (; kd != kde; ++kd) { uint32_t docId = kd->_key; EntryRef featureRef(kd->getData()); - indexBuilder.startDocument(docId); _featureStore.setupForReadFeatures(featureRef, decoder); decoder.readFeatures(features); - size_t poff = 0; - uint32_t wpIdx = 0u; - size_t numElements = features._elements.size(); - for (size_t i = 0; i < numElements; ++i) { - const WordDocElementFeatures & fef = features._elements[i]; - indexBuilder.startElement(fef.getElementId(), fef.getWeight(), fef.getElementLen()); - for (size_t j = 0; j < fef.getNumOccs(); ++j, ++wpIdx) { - assert(wpIdx == poff + j); - indexBuilder.addOcc(features._wordPositions[poff + j]); - } - poff += fef.getNumOccs(); - indexBuilder.endElement(); - } - indexBuilder.endDocument(); + features.set_doc_id(docId); + indexBuilder.add_document(features); } } indexBuilder.endWord(); diff --git a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp index f0bb1eb6519..1e25878a33e 100644 --- a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp +++ b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp @@ -7,6 +7,7 @@ namespace search::diskindex { +using index::DocIdAndFeatures; using index::DummyFileHeaderContext; using index::Schema; using index::WordDocElementWordPosFeatures; @@ -17,13 +18,17 @@ struct Builder search::diskindex::IndexBuilder _ib; TuneFileIndexing _tuneFileIndexing; DummyFileHeaderContext _fileHeaderContext; + DocIdAndFeatures _features; Builder(const std::string &dir, const Schema &s, uint32_t docIdLimit, uint64_t numWordIds, bool directio) - : _ib(s) + : _ib(s), + 
_tuneFileIndexing(), + _fileHeaderContext(), + _features() { if (directio) { _tuneFileIndexing._read.setWantDirectIO(); @@ -37,11 +42,11 @@ struct Builder void addDoc(uint32_t docId) { - _ib.startDocument(docId); - _ib.startElement(0, 1, 1); - _ib.addOcc(WordDocElementWordPosFeatures(0)); - _ib.endElement(); - _ib.endDocument(); + _features.clear(docId); + _features.elements().emplace_back(0, 1, 1); + _features.elements().back().setNumOccs(1); + _features.word_positions().emplace_back(0); + _ib.add_document(_features); } void diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp index 9cbbd136148..d59417a1e78 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakememtreeocc.cpp @@ -206,7 +206,7 @@ FakeMemTreeOccMgr::add(uint32_t wordIdx, index::DocIdAndFeatures &features) _featureSizes[wordIdx] += RefType::align((r.second + 7) / 8) * 8; - _unflushed.push_back(PendingOp(wordIdx, features._docId, r.first)); + _unflushed.push_back(PendingOp(wordIdx, features.doc_id(), r.first)); if (_unflushed.size() >= 10000) flush(); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp index 1fa518af28f..8f6c16658c9 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp @@ -584,7 +584,7 @@ FakeWord::validate(FieldReader &fieldReader, for (residue = numDocs; residue > 0; --residue) { assert(fieldReader._wordNum == wordNum); DocIdAndFeatures &features(fieldReader._docIdAndFeatures); - docId = features._docId; + docId = features.doc_id(); assert(d != de); assert(d->_docId == docId); if (matchData.valid()) { @@ -598,15 +598,15 @@ FakeWord::validate(FieldReader &fieldReader, typedef WordDocElementWordPosFeatures Positions; std::vector<Elements>::const_iterator element = - 
features._elements.begin(); + features.elements().begin(); std::vector<Positions>::const_iterator position = - features._wordPositions.begin(); + features.word_positions().begin(); TermFieldMatchData *tfmd = matchData[0]; assert(tfmd != 0); - tfmd->reset(features._docId); + tfmd->reset(features.doc_id()); - uint32_t elementResidue = features._elements.size(); + uint32_t elementResidue = features.elements().size(); while (elementResidue != 0) { uint32_t positionResidue = element->getNumOccs(); while (positionResidue != 0) { diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index 3d4567ed2ab..f6c6e5a64f3 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -3,9 +3,10 @@ #include "fakezcfilterocc.h" #include "fpfactory.h" #include <vespa/searchlib/diskindex/zcposocciterators.h> -#include <vespa/searchlib/diskindex/zc4_posting_writer.h> #include <vespa/searchlib/diskindex/zc4_posting_header.h> #include <vespa/searchlib/diskindex/zc4_posting_params.h> +#include <vespa/searchlib/diskindex/zc4_posting_reader.h> +#include <vespa/searchlib/diskindex/zc4_posting_writer.h> using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataArray; @@ -125,10 +126,12 @@ void FakeZcFilterOcc::setup(const FakeWord &fw, bool doFeatures, bool dynamicK) { - if (_bigEndian) + if (_bigEndian) { setupT<true>(fw, doFeatures, dynamicK); - else + } else { setupT<false>(fw, doFeatures, dynamicK); + } + validate_read(fw, doFeatures, dynamicK); } @@ -208,7 +211,7 @@ FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_d decode_context.setPosition({ _compressed.first, 0 }); Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures); Zc4PostingHeader header; - header.read<bigEndian>(decode_context, params); + header.read(decode_context, 
params); _docIdsSize = header._doc_ids_size; _l1SkipSize = header._l1_skip_size; _l2SkipSize = header._l2_skip_size; @@ -219,6 +222,63 @@ FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_d } +void +FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +{ + if (_bigEndian) { + validate_read<true>(fw, encode_features, dynamic_k); + } else { + validate_read<false>(fw, encode_features, dynamic_k); + } +} + +template <bool bigEndian> +void +FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +{ + bitcompression::EGPosOccDecodeContextCooked<bigEndian> decode_context_dynamic_k(&_fieldsParams); + bitcompression::EG2PosOccDecodeContextCooked<bigEndian> decode_context_static_k(&_fieldsParams); + bitcompression::FeatureDecodeContext<bigEndian> &decode_context_dynamic_k_upcast = decode_context_dynamic_k; + bitcompression::FeatureDecodeContext<bigEndian> &decode_context_static_k_upcast = decode_context_static_k; + bitcompression::FeatureDecodeContext<bigEndian> &decode_context = dynamic_k ? 
decode_context_dynamic_k_upcast : decode_context_static_k_upcast; + Zc4PostingReader<bigEndian> reader(dynamic_k); + reader.set_decode_features(&decode_context); + auto ¶ms = reader.get_posting_params(); + params._min_skip_docs = 1; + params._min_chunk_docs = 1000000000; + params._doc_id_limit = _docIdLimit; + params._encode_features = encode_features; + reader.get_read_context().reference_compressed_buffer(_compressed.first, _compressed.second); + assert(decode_context.getReadOffset() == 0u); + PostingListCounts counts; + counts._bitLength = _compressedBits; + counts._numDocs = _hitDocs; + reader.set_counts(counts); + auto word_pos_iterator(fw._wordPosFeatures.begin()); + auto word_pos_iterator_end(fw._wordPosFeatures.end()); + DocIdAndPosOccFeatures check_features; + DocIdAndFeatures features; + uint32_t hits = 0; + for (const auto &doc : fw._postings) { + if (encode_features) { + fw.setupFeatures(doc, &*word_pos_iterator, check_features); + word_pos_iterator += doc._positions; + } else { + check_features.clear(doc._docId); + } + reader.read_doc_id_and_features(features); + assert(features.doc_id() == doc._docId); + assert(features.elements().size() == check_features.elements().size()); + assert(features.word_positions().size() == check_features.word_positions().size()); + ++hits; + } + if (encode_features) { + assert(word_pos_iterator == word_pos_iterator_end); + } + reader.read_doc_id_and_features(features); + assert(static_cast<int32_t>(features.doc_id()) == -1); +} + FakeZcFilterOcc::~FakeZcFilterOcc() { free(_compressedMalloc); @@ -369,7 +429,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end) DecodeContext &d = _decodeContext; Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); Zc4PostingHeader header; - header.read<true>(d, params); + header.read(d, params); assert((d.getBitOffset() & 7) == 0); const uint8_t *bcompr = d.getByteCompr(); _valI = bcompr; @@ -590,7 +650,7 @@ initRange(uint32_t begin, uint32_t end) 
DecodeContext &d = _decodeContext; Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); Zc4PostingHeader header; - header.read<true>(d, params); + header.read(d, params); _lastDocId = header._last_doc_id; assert((d.getBitOffset() & 7) == 0); const uint8_t *bcompr = d.getByteCompr(); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h index b68e3866461..36738a0f5a8 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h @@ -40,6 +40,10 @@ protected: template <bool bigEndian> void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_cunk_docs); + void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const; + template <bool bigEndian> + void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const; + public: FakeZcFilterOcc(const FakeWord &fw); FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix); diff --git a/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h b/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h index 08473f9fc6c..a341e36045e 100644 --- a/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h +++ b/searchlib/src/vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h @@ -53,9 +53,9 @@ public: _ss << "a=" << docId; if (_verbose) { _ss << "("; - auto wpi = features._wordPositions.begin(); + auto wpi = features.word_positions().begin(); bool firstElement = true; - for (auto &el : features._elements) { + for (auto &el : features.elements()) { if (!firstElement) { _ss << ","; } diff --git a/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h b/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h new file mode 100644 index 00000000000..eeb09898aa2 --- /dev/null +++ 
b/searchlib/src/vespa/searchlib/test/memoryindex/wrap_inserter.h @@ -0,0 +1,64 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/memoryindex/field_index_collection.h> +#include <vespa/searchlib/memoryindex/ordered_field_index_inserter.h> + +namespace search::memoryindex::test { + +/** + * Test class used to populate a FieldIndex. + */ +class WrapInserter { +private: + OrderedFieldIndexInserter& _inserter; + +public: + WrapInserter(FieldIndexCollection& field_indexes, uint32_t field_id) + : _inserter(field_indexes.getFieldIndex(field_id)->getInserter()) + { + } + + WrapInserter(FieldIndex& field_index) + : _inserter(field_index.getInserter()) + { + } + + WrapInserter& word(vespalib::stringref word_) { + _inserter.setNextWord(word_); + return *this; + } + + WrapInserter& add(uint32_t doc_id, const index::DocIdAndFeatures& features) { + _inserter.add(doc_id, features); + return *this; + } + + WrapInserter& add(uint32_t doc_id) { + index::DocIdAndPosOccFeatures features; + features.addNextOcc(0, 0, 1, 1); + return add(doc_id, features); + } + + WrapInserter& remove(uint32_t doc_id) { + _inserter.remove(doc_id); + return *this; + } + + WrapInserter& flush() { + _inserter.flush(); + return *this; + } + + WrapInserter& rewind() { + _inserter.rewind(); + return *this; + } + + datastore::EntryRef getWordRef() { + return _inserter.getWordRef(); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp index 155bb194f97..400a93acd26 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.cpp +++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp @@ -408,6 +408,25 @@ ComprFileReadContext::referenceWriteContext(const ComprFileWriteContext &rhs) } } +void +ComprFileReadContext::reference_compressed_buffer(void *buffer, size_t usedUnits) +{ + ComprFileDecodeContext *d = getDecodeContext(); + + 
_comprBuf = buffer; + _comprBufSize = usedUnits; + setBufferEndFilePos(static_cast<uint64_t>(usedUnits) * _unitSize); + setFileSize(static_cast<uint64_t>(usedUnits) * _unitSize); + if (d != NULL) { + d->afterRead(_comprBuf, + usedUnits, + static_cast<uint64_t>(usedUnits) * _unitSize, + false); + d->setupBits(0); + setBitOffset(-1); + assert(d->getBitPosV() == 0); + } +} ComprFileWriteContext:: ComprFileWriteContext(ComprFileEncodeContext &encodeContext) diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h index d4de1d305fa..431126dee47 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.h +++ b/searchlib/src/vespa/searchlib/util/comprfile.h @@ -137,6 +137,7 @@ public: * long as rhs is live and unchanged. */ void referenceWriteContext(const ComprFileWriteContext &rhs); + void reference_compressed_buffer(void *buffer, size_t usedUnits); }; |