diff options
16 files changed, 290 insertions, 337 deletions
diff --git a/searchcore/src/tests/proton/feed_and_search/feed_and_search_test.cpp b/searchcore/src/tests/proton/feed_and_search/feed_and_search_test.cpp index c773abef69f..0936f05b222 100644 --- a/searchcore/src/tests/proton/feed_and_search/feed_and_search_test.cpp +++ b/searchcore/src/tests/proton/feed_and_search/feed_and_search_test.cpp @@ -176,12 +176,14 @@ void Test::requireThatMemoryIndexCanBeDumpedAndSearched() { const string index_dir = "test_index"; const uint32_t docIdLimit = memory_index.getDocIdLimit(); const uint64_t num_words = memory_index.getNumWords(); - IndexBuilder index_builder(schema, index_dir, docIdLimit); search::TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - index_builder.open(num_words, MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext); - memory_index.dump(index_builder); - index_builder.close(); + { + MockFieldLengthInspector fieldLengthInspector; + IndexBuilder index_builder(schema, index_dir, docIdLimit, num_words, + fieldLengthInspector, tuneFileIndexing, fileHeaderContext); + memory_index.dump(index_builder); + } // Fusion test. Keep all documents to get an "indentical" copy. const string index_dir2 = "test_index2"; diff --git a/searchcore/src/tests/proton/index/fusionrunner_test.cpp b/searchcore/src/tests/proton/index/fusionrunner_test.cpp index ac02c644885..a58d7540d7c 100644 --- a/searchcore/src/tests/proton/index/fusionrunner_test.cpp +++ b/searchcore/src/tests/proton/index/fusionrunner_test.cpp @@ -23,7 +23,6 @@ #include <vespa/vespalib/util/gate.h> #include <vespa/vespalib/util/destructor_callbacks.h> #include <vespa/vespalib/testkit/testapp.h> -#include <vespa/vespalib/util/size_literals.h> #include <vespa/vespalib/stllike/asciistream.h> #include <filesystem> #include <set> @@ -96,7 +95,7 @@ class Test : public vespalib::TestApp { public: Test(); - ~Test(); + ~Test() override; int Main() override; }; @@ -145,17 +144,15 @@ getSchema() void Test::setUp() { std::filesystem::remove_all(std::filesystem::path(base_dir)); - _fusion_runner.reset(new FusionRunner(base_dir, getSchema(), - TuneFileAttributes(), - _fileHeaderContext)); + _fusion_runner = std::make_unique<FusionRunner>(base_dir, getSchema(), TuneFileAttributes(), _fileHeaderContext); const string selector_base = base_dir + "/index.flush.0/selector"; - _selector.reset(new FixedSourceSelector(0, selector_base)); + _selector = std::make_unique<FixedSourceSelector>(0, selector_base); _fusion_spec = FusionSpec(); } void Test::tearDown() { std::filesystem::remove_all(std::filesystem::path(base_dir)); - _selector.reset(0); + _selector.reset(nullptr); } Document::UP buildDocument(DocBuilder & doc_builder, int id, const string &word) { @@ -200,12 +197,14 @@ void Test::createIndex(const string &dir, uint32_t id, bool fusion) { addDocument(doc_builder, memory_index, *_selector, id, id + 3, "qux"); const uint32_t docIdLimit = std::min(memory_index.getDocIdLimit(), _selector->getDocIdLimit()); - IndexBuilder index_builder(schema, index_dir, docIdLimit); - TuneFileIndexing tuneFileIndexing; TuneFileAttributes tuneFileAttributes; - index_builder.open(memory_index.getNumWords(), MockFieldLengthInspector(), tuneFileIndexing, _fileHeaderContext); - memory_index.dump(index_builder); - index_builder.close(); + { + TuneFileIndexing tuneFileIndexing; + MockFieldLengthInspector fieldLengthInspector; + IndexBuilder index_builder(schema, index_dir, docIdLimit, memory_index.getNumWords(), fieldLengthInspector, + tuneFileIndexing, _fileHeaderContext); + memory_index.dump(index_builder); + } _selector->extractSaveInfo(index_dir + "/selector")->save(tuneFileAttributes, _fileHeaderContext); } diff --git a/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp b/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp index fb14534c47c..b4d32cb4376 100644 --- a/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp +++ b/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp @@ -36,11 +36,10 @@ MemoryIndexWrapper::flushToDisk(const vespalib::string &flushDir, uint32_t docId { const uint64_t numWords = _index.getNumWords(); _index.freeze(); // TODO(geirst): is this needed anymore? - IndexBuilder indexBuilder(_index.getSchema(), flushDir, docIdLimit); SerialNumFileHeaderContext fileHeaderContext(_fileHeaderContext, serialNum); - indexBuilder.open(numWords, *this, _tuneFileIndexing, fileHeaderContext); + IndexBuilder indexBuilder(_index.getSchema(), flushDir, docIdLimit, + numWords, *this, _tuneFileIndexing, fileHeaderContext); _index.dump(indexBuilder); - indexBuilder.close(); } search::SerialNum diff --git a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp index b96e63bb47e..552ce0e2f0b 100644 --- a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp +++ b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp @@ -382,7 +382,8 @@ DiskIndexTest::build_index(const IOSettings& io_settings, const EmptySettings& e if (empty_settings._empty_word) { name << "we"; } - openIndex(name.str(), io_settings._use_directio, io_settings._use_mmap, empty_settings._empty_field, empty_settings._empty_doc, empty_settings._empty_word); + openIndex(name.str(), io_settings._use_directio, io_settings._use_mmap, empty_settings._empty_field, + empty_settings._empty_doc, empty_settings._empty_word); } void diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index 57022927dbe..c45b5787c4c 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -22,8 +22,6 @@ #include <vespa/searchlib/memoryindex/field_index_collection.h> #include <vespa/searchlib/memoryindex/posting_iterator.h> #include <vespa/searchlib/test/index/mock_field_length_inspector.h> -#include <vespa/searchlib/util/filekit.h> -#include <vespa/vespalib/btree/btreenode.hpp> #include <vespa/vespalib/btree/btreenodeallocator.hpp> #include <vespa/vespalib/util/gate.h> #include <vespa/vespalib/util/destructor_callbacks.h> @@ -365,7 +363,7 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire const vespalib::string dump2dir = prefix + "dump2"; constexpr uint32_t numDocs = 12 + 1; - IndexBuilder ib(schema, dump2dir, numDocs); + const uint32_t numWords = fic.getNumUniqueWords(); MockFieldLengthInspector mock_field_length_inspector; TuneFileIndexing tuneFileIndexing; @@ -379,10 +377,11 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire if (readmmap) { tuneFileSearch._read.setWantMemoryMap(); } - ib.open(numWords, mock_field_length_inspector, tuneFileIndexing, fileHeaderContext); - fic.dump(ib); - ib.close(); - + { + IndexBuilder ib(schema, dump2dir, numDocs, numWords, + mock_field_length_inspector, tuneFileIndexing, fileHeaderContext); + fic.dump(ib); + } vespalib::ThreadStackExecutor executor(4); do { @@ -480,12 +479,10 @@ FusionTest::make_simple_index(const vespalib::string &dump_dir, const IFieldLeng inv.invertDocument(10, *doc10, {}); myPushDocument(inv); - IndexBuilder ib(_schema, dump_dir, numDocs); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - ib.open(numWords, field_length_inspector, tuneFileIndexing, fileHeaderContext); + IndexBuilder ib(_schema, dump_dir, numDocs, numWords, field_length_inspector, tuneFileIndexing, fileHeaderContext); fic.dump(ib); - ib.close(); } bool diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index d309da26feb..1a28a960b7e 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -6,10 +6,8 @@ #include <vespa/document/fieldvalue/stringfieldvalue.h> #include <vespa/document/fieldvalue/weightedsetfieldvalue.h> #include <vespa/document/repo/configbuilder.h> -#include <vespa/searchlib/diskindex/fusion.h> #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/diskindex/zcposoccrandread.h> -#include <vespa/searchlib/fef/fieldpositionsiterator.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/searchlib/index/docidandfeatures.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> @@ -69,76 +67,79 @@ using NormalFieldIndex = FieldIndex<false>; class MyBuilder : public IndexBuilder { private: std::stringstream _ss; - bool _insideWord; - bool _insideField; - bool _firstWord; bool _firstField; - bool _firstDoc; -public: - explicit MyBuilder(const Schema &schema); - ~MyBuilder() override; - - void startWord(vespalib::stringref word) override { - assert(_insideField); - assert(!_insideWord); - if (!_firstWord) - _ss << ","; - _ss << "w=" << word << "["; - _firstDoc = true; - _insideWord = true; - } - - void endWord() override { - assert(_insideWord); - _ss << "]"; - _firstWord = false; - _insideWord = false; - } - - void startField(uint32_t fieldId) override { - assert(!_insideField); - if (!_firstField) _ss << ","; - _ss << "f=" << fieldId << "["; - _firstWord = true; - _insideField = true; - } - - void endField() override { - assert(_insideField); - assert(!_insideWord); - _ss << "]"; - _firstField = false; - _insideField = false; - } + class FieldIndexBuilder : public index::FieldIndexBuilder { + public: + explicit FieldIndexBuilder(std::stringstream & ss) + : _ss(ss), + _insideWord(false), + _firstWord(true), + _firstDoc(true) + {} + ~FieldIndexBuilder() override { + assert(!_insideWord); + _ss << "]"; + } + void startWord(vespalib::stringref word) override { + assert(!_insideWord); + if (!_firstWord) + _ss << ","; + _ss << "w=" << word << "["; + _firstDoc = true; + _insideWord = true; + } - void add_document(const DocIdAndFeatures &features) override { - assert(_insideWord); - if (!_firstDoc) { - _ss << ","; + void endWord() override { + assert(_insideWord); + _ss << "]"; + _firstWord = false; + _insideWord = false; } - _ss << "d=" << features.doc_id() << "["; - bool first_elem = true; - size_t word_pos_offset = 0; - for (const auto& elem : features.elements()) { - if (!first_elem) { + void add_document(const DocIdAndFeatures &features) override { + assert(_insideWord); + if (!_firstDoc) { _ss << ","; } - _ss << "e=" << elem.getElementId() << ",w=" << elem.getWeight() << ",l=" << elem.getElementLen() << "["; - bool first_pos = true; - for (size_t i = 0; i < elem.getNumOccs(); ++i) { - if (!first_pos) { + _ss << "d=" << features.doc_id() << "["; + bool first_elem = true; + size_t word_pos_offset = 0; + for (const auto& elem : features.elements()) { + if (!first_elem) { _ss << ","; } - _ss << features.word_positions()[i + word_pos_offset].getWordPos(); - first_pos = false; + _ss << "e=" << elem.getElementId() << ",w=" << elem.getWeight() << ",l=" << elem.getElementLen() << "["; + bool first_pos = true; + for (size_t i = 0; i < elem.getNumOccs(); ++i) { + if (!first_pos) { + _ss << ","; + } + _ss << features.word_positions()[i + word_pos_offset].getWordPos(); + first_pos = false; + } + word_pos_offset += elem.getNumOccs(); + _ss << "]"; + first_elem = false; } - word_pos_offset += elem.getNumOccs(); _ss << "]"; - first_elem = false; + _firstDoc = false; } - _ss << "]"; - _firstDoc = false; + private: + std::stringstream & _ss; + bool _insideWord; + bool _firstWord; + bool _firstDoc; + }; +public: + explicit MyBuilder(const Schema &schema); + ~MyBuilder() override; + + std::unique_ptr<index::FieldIndexBuilder> + startField(uint32_t fieldId) override { + if (!_firstField) _ss << ","; + _ss << "f=" << fieldId << "["; + _firstField = false; + return std::make_unique<FieldIndexBuilder>(_ss); } std::string toStr() const { @@ -149,11 +150,7 @@ public: MyBuilder::MyBuilder(const Schema &schema) : IndexBuilder(schema), _ss(), - _insideWord(false), - _insideField(false), - _firstWord(true), - _firstField(true), - _firstDoc(true) + _firstField(true) {} MyBuilder::~MyBuilder() = default; @@ -826,18 +823,19 @@ TEST_F(FieldIndexCollectionTest, require_that_features_are_in_posting_lists) TEST_F(FieldIndexCollectionTest, require_that_basic_dumping_to_index_builder_is_working) { MyBuilder b(schema); - WordDocElementWordPosFeatures wpf; - b.startField(4); - b.startWord("a"); - DocIdAndFeatures features; - features.set_doc_id(2); - features.elements().emplace_back(0, 10, 20); - features.elements().back().setNumOccs(2); - features.word_positions().emplace_back(1); - features.word_positions().emplace_back(3); - b.add_document(features); - b.endWord(); - b.endField(); + { + WordDocElementWordPosFeatures wpf; + auto fb = b.startField(4); + fb->startWord("a"); + DocIdAndFeatures features; + features.set_doc_id(2); + features.elements().emplace_back(0, 10, 20); + features.elements().back().setNumOccs(2); + features.word_positions().emplace_back(1); + features.word_positions().emplace_back(3); + fb->add_document(features); + fb->endWord(); + } EXPECT_EQ("f=4[w=a[d=2[e=0,w=10,l=20[1,3]]]]", b.toStr()); } @@ -887,12 +885,12 @@ TEST_F(FieldIndexCollectionTest, require_that_dumping_words_with_no_docs_to_inde b.toStr()); } { - search::diskindex::IndexBuilder b(schema, "dump", 5); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - b.open(2, MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext); + MockFieldLengthInspector fieldLengthInspector; + search::diskindex::IndexBuilder b(schema, "dump", 5, 2, fieldLengthInspector, + tuneFileIndexing, fileHeaderContext); fic.dump(b); - b.close(); } } @@ -1235,12 +1233,12 @@ TEST_F(UriInverterTest, require_that_uri_indexing_is_working) EXPECT_TRUE(itr->isAtEnd()); } { - search::diskindex::IndexBuilder dib(_schema, "urldump", 11); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - dib.open(_fic.getNumUniqueWords(), MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext); + MockFieldLengthInspector fieldLengthInspector; + search::diskindex::IndexBuilder dib(_schema, "urldump", 11, _fic.getNumUniqueWords(), + fieldLengthInspector, tuneFileIndexing, fileHeaderContext); _fic.dump(dib); - dib.close(); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index 350f4dfd145..bf295acec75 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -81,8 +81,7 @@ bool DiskIndex::openDictionaries(const TuneFileSearch &tuneFileSearch) { for (SchemaUtil::IndexIterator itr(_schema); itr.isValid(); ++itr) { - vespalib::string dictName = - _indexDir + "/" + itr.getName() + "/dictionary"; + vespalib::string dictName = _indexDir + "/" + itr.getName() + "/dictionary"; auto dict = std::make_unique<PageDict4RandRead>(); if (!dict->open(dictName, tuneFileSearch._read)) { LOG(warning, "Could not open disk dictionary '%s'", dictName.c_str()); @@ -152,7 +151,10 @@ DiskIndex::openField(const vespalib::string &fieldDir, bool DiskIndex::setup(const TuneFileSearch &tuneFileSearch) { - if (!loadSchema() || !openDictionaries(tuneFileSearch)) { + if (!loadSchema() ) { + return false; + } + if (!openDictionaries(tuneFileSearch)) { return false; } for (SchemaUtil::IndexIterator itr(_schema); itr.isValid(); ++itr) { diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp index 3c3ee4166bb..5cf80d06c87 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp @@ -50,17 +50,16 @@ public: FieldWriter* writer() { return _fieldWriter.get(); } }; -} - -class IndexBuilder::FieldHandle { +class FieldHandle { private: const Schema &_schema; IndexBuilder &_builder; FileHandle _file; const uint32_t _fieldId; - const bool _valid; public: - FieldHandle(const Schema &schema, uint32_t fieldId, IndexBuilder & builder, bool valid) noexcept; + FieldHandle(const Schema &schema, uint32_t fieldId, IndexBuilder & builder, uint32_t docIdLimit, + uint64_t numWordIds, const IFieldLengthInspector & field_length_inspector, + const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext); ~FieldHandle(); void new_word(vespalib::stringref word); @@ -69,16 +68,68 @@ public: const Schema::IndexField &getSchemaField(); const vespalib::string &getName(); vespalib::string getDir(); - void open(uint32_t docIdLimit, uint64_t numWordIds, - const FieldLengthInfo &field_length_info, - const TuneFileSeqWrite &tuneFileWrite, - const FileHeaderContext &fileHeaderContext); void close(); + uint32_t getIndexId() const noexcept { return _fieldId; } +}; + +class FieldIndexBuilder : public index::FieldIndexBuilder { +public: + FieldIndexBuilder(const Schema &schema, uint32_t fieldId, IndexBuilder & builder, uint32_t docidLimit, + uint64_t numWordIds, const IFieldLengthInspector & field_length_inspector, + const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext); + ~FieldIndexBuilder() override; + void startWord(vespalib::stringref word) override; + void endWord() override; + void add_document(const DocIdAndFeatures &features) override; +private: + FieldHandle _field; + vespalib::string _curWord; + uint32_t _curDocId; + bool _inWord; + + static constexpr uint32_t noDocId() { + return std::numeric_limits<uint32_t>::max(); + } - bool getValid() const { return _valid; } - uint32_t getIndexId() const { return _fieldId; } + static constexpr uint64_t noWordNumHigh() { + return std::numeric_limits<uint64_t>::max(); + } }; +FieldIndexBuilder::FieldIndexBuilder(const Schema &schema, uint32_t fieldId, IndexBuilder & builder, uint32_t docidLimit, + uint64_t numWordIds, const IFieldLengthInspector & field_length_inspector, + const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) + : _field(schema, fieldId, builder, docidLimit, numWordIds, field_length_inspector, tuneFileWrite, fileHeaderContext), + _curWord(), + _curDocId(noDocId()), + _inWord(false) +{} + +FieldIndexBuilder::~FieldIndexBuilder() = default; + +void +FieldIndexBuilder::startWord(vespalib::stringref word) +{ + assert(!_inWord); + // TODO: Check sort order + _curWord = word; + _inWord = true; + _field.new_word(word); +} + +void +FieldIndexBuilder::endWord() +{ + assert(_inWord); + _inWord = false; +} + +void +FieldIndexBuilder::add_document(const index::DocIdAndFeatures &features) +{ + assert(_inWord); + _field.add_document(features); +} FileHandle::FileHandle() : _fieldWriter() @@ -126,175 +177,92 @@ FileHandle::close() (void) ret; } -IndexBuilder::FieldHandle::FieldHandle(const Schema &schema, uint32_t fieldId, IndexBuilder &builder, bool valid) noexcept - : _schema(schema), - _builder(builder), - _file(), - _fieldId(fieldId), - _valid(valid) +FieldHandle::FieldHandle(const Schema &schema, uint32_t fieldId, IndexBuilder &builder, uint32_t docIdLimit, + uint64_t numWordIds, const IFieldLengthInspector & field_length_inspector, + const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) + : _schema(schema), + _builder(builder), + _file(), + _fieldId(fieldId) { + std::filesystem::create_directory(std::filesystem::path(getDir())); + _file.open(getDir(), SchemaUtil::IndexIterator(_schema, getIndexId()), docIdLimit, numWordIds, + field_length_inspector.get_field_length_info(getName()), tuneFileWrite, fileHeaderContext); } -IndexBuilder::FieldHandle::~FieldHandle() = default; +FieldHandle::~FieldHandle() { + close(); +} void -IndexBuilder::FieldHandle::new_word(vespalib::stringref word) +FieldHandle::new_word(vespalib::stringref word) { - assert(_valid); _file.writer()->newWord(word); } void -IndexBuilder::FieldHandle::add_document(const index::DocIdAndFeatures &features) +FieldHandle::add_document(const index::DocIdAndFeatures &features) { _file.writer()->add(features); } const Schema::IndexField & -IndexBuilder::FieldHandle::getSchemaField() +FieldHandle::getSchemaField() { return _schema.getIndexField(_fieldId); } const vespalib::string & -IndexBuilder::FieldHandle::getName() +FieldHandle::getName() { return getSchemaField().getName(); } vespalib::string -IndexBuilder::FieldHandle::getDir() +FieldHandle::getDir() { return _builder.appendToPrefix(getName()); } void -IndexBuilder::FieldHandle::open(uint32_t docIdLimit, uint64_t numWordIds, - const FieldLengthInfo &field_length_info, - const TuneFileSeqWrite &tuneFileWrite, - const FileHeaderContext &fileHeaderContext) +FieldHandle::close() { - _file.open(getDir(), SchemaUtil::IndexIterator(_schema, getIndexId()), - docIdLimit, numWordIds, field_length_info, tuneFileWrite, fileHeaderContext); + _file.close(); + vespalib::File::sync(getDir()); } -void -IndexBuilder::FieldHandle::close() -{ - _file.close(); } -std::vector<IndexBuilder::FieldHandle> -IndexBuilder::extractFields(const Schema &schema, IndexBuilder & builder) { - std::vector<IndexBuilder::FieldHandle> fields; +std::vector<int32_t> +extractFields(const Schema &schema) { + std::vector<int32_t> fields; fields.reserve(schema.getNumIndexFields()); // TODO: Filter for text indexes for (uint32_t i = 0; i < schema.getNumIndexFields(); ++i) { const Schema::IndexField &iField = schema.getIndexField(i); // Only know how to handle string index for now. bool valid = (iField.getDataType() == DataType::STRING); - fields.emplace_back(schema, i, builder, valid); + fields.push_back( valid ? i : -1); } return fields; } -IndexBuilder::IndexBuilder(const Schema &schema, vespalib::stringref prefix, uint32_t docIdLimit) +IndexBuilder::IndexBuilder(const Schema &schema, vespalib::stringref prefix, uint32_t docIdLimit, + uint64_t numWordIds, const index::IFieldLengthInspector &field_length_inspector, + const TuneFileIndexing &tuneFileIndexing, const search::common::FileHeaderContext &fileHeaderContext) : index::IndexBuilder(schema), _schema(schema), - _fields(extractFields(schema, *this)), + _fields(extractFields(schema)), _prefix(prefix), - _curWord(), _docIdLimit(docIdLimit), - _curFieldId(-1), - _lowestOKFieldId(0u), - _curDocId(noDocId()), - _inWord(false) + _numWordIds(numWordIds), + _field_length_inspector(field_length_inspector), + _tuneFileIndexing(tuneFileIndexing), + _fileHeaderContext(fileHeaderContext) { -} - -IndexBuilder::~IndexBuilder() = default; - -IndexBuilder::FieldHandle & -IndexBuilder::currentField() { - assert(_curFieldId >= 0); - assert(_curFieldId < int32_t(_fields.size())); - return _fields[_curFieldId]; -} -void -IndexBuilder::startField(uint32_t fieldId) -{ - assert(_curDocId == noDocId()); - assert(_curFieldId == -1); - assert(fieldId < _fields.size()); - assert(fieldId >= _lowestOKFieldId); - _curFieldId = fieldId; -} - -void -IndexBuilder::endField() -{ - assert(_curDocId == noDocId()); - assert(!_inWord); - _lowestOKFieldId = currentField().getIndexId() + 1; - _curFieldId = -1; -} - -void -IndexBuilder::startWord(vespalib::stringref word) -{ - assert(!_inWord); - // TODO: Check sort order - _curWord = word; - _inWord = true; - currentField().new_word(word); -} - -void -IndexBuilder::endWord() -{ - assert(_inWord); - assert(_curFieldId != -1); - _inWord = false; -} - -void -IndexBuilder::add_document(const index::DocIdAndFeatures &features) -{ - assert(_inWord); - currentField().add_document(features); -} - -vespalib::string -IndexBuilder::appendToPrefix(vespalib::stringref name) const -{ - if (_prefix.empty()) { - return name; - } - return _prefix + "/" + name; -} - -void -IndexBuilder::open(uint64_t numWordIds, - const IFieldLengthInspector &field_length_inspector, - const TuneFileIndexing &tuneFileIndexing, - const FileHeaderContext &fileHeaderContext) -{ - std::vector<uint32_t> indexes; - if (!_prefix.empty()) { std::filesystem::create_directory(std::filesystem::path(_prefix)); } - // TODO: Filter for text indexes - for (FieldHandle & fh : _fields) { - if (!fh.getValid()) { - continue; - } - std::filesystem::create_directory(std::filesystem::path(fh.getDir())); - fh.open(_docIdLimit, numWordIds, - field_length_inspector.get_field_length_info(fh.getName()), - tuneFileIndexing._write, fileHeaderContext); - indexes.push_back(fh.getIndexId()); - } vespalib::string schemaFile = appendToPrefix("schema.txt"); if (!_schema.saveToFile(schemaFile)) { LOG(error, "Cannot save schema to \"%s\"", schemaFile.c_str()); @@ -302,16 +270,7 @@ IndexBuilder::open(uint64_t numWordIds, } } -void -IndexBuilder::close() -{ - // TODO: Filter for text indexes - for (FieldHandle & fh : _fields) { - if (fh.getValid()) { - fh.close(); - vespalib::File::sync(fh.getDir()); - } - } +IndexBuilder::~IndexBuilder() { if (!docsummary::DocumentSummary::writeDocIdLimit(_prefix, _docIdLimit)) { LOG(error, "Could not write docsum count in dir %s: %s", _prefix.c_str(), getLastErrorString().c_str()); @@ -319,4 +278,23 @@ IndexBuilder::close() } } +std::unique_ptr<index::FieldIndexBuilder> +IndexBuilder::startField(uint32_t fieldId) { + if (_fields[fieldId] >= 0) { + return std::make_unique<FieldIndexBuilder>(_schema, fieldId, *this, _docIdLimit, _numWordIds, + _field_length_inspector, _tuneFileIndexing._write, + _fileHeaderContext); + } + return {}; +} + +vespalib::string +IndexBuilder::appendToPrefix(vespalib::stringref name) const +{ + if (_prefix.empty()) { + return name; + } + return _prefix + "/" + name; +} + } diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h index c5a2f6e1536..4ef6ab4a813 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h +++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h @@ -22,43 +22,26 @@ class BitVectorCandidate; class IndexBuilder : public index::IndexBuilder { public: // Schema argument must live until IndexBuilder has been deleted. - IndexBuilder(const index::Schema &schema, vespalib::stringref prefix, uint32_t docIdLimit); + IndexBuilder(const index::Schema &schema, vespalib::stringref prefix, uint32_t docIdLimit, + uint64_t numWordIds, const index::IFieldLengthInspector &field_length_inspector, + const TuneFileIndexing &tuneFileIndexing, const search::common::FileHeaderContext &fileHeaderContext); ~IndexBuilder() override; - void startField(uint32_t fieldId) override; - void endField() override; - void startWord(vespalib::stringref word) override; - void endWord() override; - void add_document(const index::DocIdAndFeatures &features) override; + std::unique_ptr<index::FieldIndexBuilder> startField(uint32_t fieldId) override; vespalib::string appendToPrefix(vespalib::stringref name) const; - - void open(uint64_t numWordIds, const index::IFieldLengthInspector &field_length_inspector, - const TuneFileIndexing &tuneFileIndexing, - const common::FileHeaderContext &fileHandleContext); - - void close(); private: - class FieldHandle; const index::Schema &_schema; - std::vector<FieldHandle> _fields; + std::vector<int> _fields; const vespalib::string _prefix; - vespalib::string _curWord; const uint32_t _docIdLimit; - int32_t _curFieldId; - uint32_t _lowestOKFieldId; - uint32_t _curDocId; - bool _inWord; - - static std::vector<IndexBuilder::FieldHandle> extractFields(const index::Schema &schema, IndexBuilder & builder); - - static uint32_t noDocId() { - return std::numeric_limits<uint32_t>::max(); - } + const uint32_t _numWordIds; + const index::IFieldLengthInspector &_field_length_inspector; + const TuneFileIndexing &_tuneFileIndexing; + const search::common::FileHeaderContext &_fileHeaderContext; static uint64_t noWordNumHigh() { return std::numeric_limits<uint64_t>::max(); } - FieldHandle & currentField(); }; } diff --git a/searchlib/src/vespa/searchlib/index/indexbuilder.h b/searchlib/src/vespa/searchlib/index/indexbuilder.h index 37f8a9c30be..9615bfd9428 100644 --- a/searchlib/src/vespa/searchlib/index/indexbuilder.h +++ b/searchlib/src/vespa/searchlib/index/indexbuilder.h @@ -2,6 +2,7 @@ #pragma once #include <vespa/vespalib/stllike/string.h> +#include <memory> namespace search::index { @@ -9,6 +10,14 @@ class DocIdAndFeatures; class Schema; class WordDocElementWordPosFeatures; +class FieldIndexBuilder { +public: + virtual ~FieldIndexBuilder() = default; + virtual void startWord(vespalib::stringref word) = 0; + virtual void endWord() = 0; + virtual void add_document(const DocIdAndFeatures &features) = 0; +}; + /** * Interface used to build an index for the set of index fields specified in a schema. * @@ -22,14 +31,9 @@ protected: const Schema &_schema; public: - IndexBuilder(const Schema &schema); - + explicit IndexBuilder(const Schema &schema); virtual ~IndexBuilder(); - virtual void startField(uint32_t fieldId) = 0; - virtual void endField() = 0; - virtual void startWord(vespalib::stringref word) = 0; - virtual void endWord() = 0; - virtual void add_document(const DocIdAndFeatures &features) = 0; + virtual std::unique_ptr<FieldIndexBuilder> startField(uint32_t fieldId) = 0; }; } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 24582d7e692..db3a0019d94 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -149,7 +149,7 @@ FieldIndex<interleaved_features>::compactFeatures() template <bool interleaved_features> void -FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilder) +FieldIndex<interleaved_features>::dump(search::index::FieldIndexBuilder & indexBuilder) { vespalib::stringref word; FeatureStore::DecodeContextCooked decoder(nullptr); diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.h b/searchlib/src/vespa/searchlib/memoryindex/field_index.h index 9ae9d1b2aef..0b245300a7b 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.h @@ -82,7 +82,7 @@ public: void compactFeatures() override; - void dump(search::index::IndexBuilder & indexBuilder) override; + void dump(search::index::FieldIndexBuilder & indexBuilder) override; vespalib::MemoryUsage getMemoryUsage() const override; PostingListStore &getPostingListStore() { return _postingListStore; } @@ -98,8 +98,7 @@ public: /** * Should only by used by unit tests. */ - queryeval::SearchIterator::UP make_search_iterator(const vespalib::string& term, - uint32_t field_id, + queryeval::SearchIterator::UP make_search_iterator(const vespalib::string& term, uint32_t field_id, fef::TermFieldMatchDataArray match_data) const; std::unique_ptr<queryeval::SimpleLeafBlueprint> make_term_blueprint(const vespalib::string& term, diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.cpp index bd933bb118f..0264b5e968b 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.cpp @@ -6,13 +6,9 @@ #include <vespa/searchlib/bitcompression/posocccompression.h> #include <vespa/searchlib/index/i_field_length_inspector.h> #include <vespa/searchcommon/common/schema.h> -#include <vespa/vespalib/btree/btree.hpp> #include <vespa/vespalib/btree/btreeiterator.hpp> -#include <vespa/vespalib/btree/btreenode.hpp> #include <vespa/vespalib/btree/btreenodeallocator.hpp> #include <vespa/vespalib/btree/btreenodestore.hpp> -#include <vespa/vespalib/btree/btreeroot.hpp> -#include <vespa/vespalib/btree/btreestore.hpp> #include <vespa/vespalib/util/exceptions.h> namespace search { @@ -46,9 +42,10 @@ void FieldIndexCollection::dump(search::index::IndexBuilder &indexBuilder) { for (uint32_t fieldId = 0; fieldId < _numFields; ++fieldId) { - indexBuilder.startField(fieldId); - _fieldIndexes[fieldId]->dump(indexBuilder); - indexBuilder.endField(); + auto fieldIndexBuilder = indexBuilder.startField(fieldId); + if (fieldIndexBuilder) { + _fieldIndexes[fieldId]->dump(*fieldIndexBuilder); + } } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.h b/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.h index 6736ed2c2ad..34a09c5572f 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index_collection.h @@ -10,6 +10,7 @@ namespace search::index { class IFieldLengthInspector; class Schema; + class IndexBuilder; } namespace search::memoryindex { diff --git a/searchlib/src/vespa/searchlib/memoryindex/i_field_index.h b/searchlib/src/vespa/searchlib/memoryindex/i_field_index.h index ee075290dc9..b7cad9d1c38 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/i_field_index.h +++ b/searchlib/src/vespa/searchlib/memoryindex/i_field_index.h @@ -12,7 +12,7 @@ namespace search::queryeval { } namespace search::index { class FieldLengthCalculator; -class IndexBuilder; +class FieldIndexBuilder; } namespace search::memoryindex { @@ -37,7 +37,7 @@ public: virtual FieldIndexRemover& getDocumentRemover() = 0; virtual index::FieldLengthCalculator& get_calculator() = 0; virtual void compactFeatures() = 0; - virtual void dump(search::index::IndexBuilder& indexBuilder) = 0; + virtual void dump(search::index::FieldIndexBuilder& builder) = 0; virtual std::unique_ptr<queryeval::SimpleLeafBlueprint> make_term_blueprint(const vespalib::string& term, const queryeval::FieldSpec& field, diff --git a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp index a32cd284fc6..f4a603e7d30 100644 --- a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp +++ b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp @@ -35,39 +35,30 @@ class MockFieldLengthInspector : public IFieldLengthInspector { struct Builder { - search::diskindex::IndexBuilder _ib; MockFieldLengthInspector _mock_field_length_inspector; TuneFileIndexing _tuneFileIndexing; DummyFileHeaderContext _fileHeaderContext; + search::diskindex::IndexBuilder _ib; DocIdAndFeatures _features; - Builder(const std::string &dir, - const Schema &s, - uint32_t docIdLimit, - uint64_t numWordIds, - bool directio) - : _ib(s, dir, docIdLimit), - _tuneFileIndexing(), + Builder(const std::string &dir, const Schema &s, uint32_t docIdLimit, uint64_t numWordIds, bool directio) + : _tuneFileIndexing(), _fileHeaderContext(), + _ib(s, dir, docIdLimit,numWordIds, _mock_field_length_inspector, _tuneFileIndexing, _fileHeaderContext), _features() { if (directio) { _tuneFileIndexing._read.setWantDirectIO(); _tuneFileIndexing._write.setWantDirectIO(); } - _ib.open(numWordIds, _mock_field_length_inspector, _tuneFileIndexing, _fileHeaderContext); } - void addDoc(uint32_t docId) { + void addDoc(index::FieldIndexBuilder & fb, uint32_t docId) { _features.clear(docId); _features.elements().emplace_back(0, 1, 1); _features.elements().back().setNumOccs(1); _features.word_positions().emplace_back(0); - _ib.add_document(_features); - } - - void close() { - _ib.close(); + fb.add_document(_features); } }; @@ -84,37 +75,39 @@ TestDiskIndex::buildSchema() void TestDiskIndex::buildIndex(const std::string & dir, bool directio, - bool fieldEmpty, bool docEmpty, bool wordEmpty) + bool fieldEmpty, bool docEmpty, bool wordEmpty) { Builder b(dir, _schema, docEmpty ? 1 : 32, wordEmpty ? 0 : 2, directio); - if (!wordEmpty && !fieldEmpty && !docEmpty) { + + if (!fieldEmpty) { // f1 - b._ib.startField(0); - b._ib.startWord("w1"); - b.addDoc(1); - b.addDoc(3); - b._ib.endWord(); - b._ib.endField(); - // f2 - b._ib.startField(1); - b._ib.startWord("w1"); - b.addDoc(2); - b.addDoc(4); - b.addDoc(6); - b._ib.endWord(); - b._ib.startWord("w2"); - for (uint32_t docId = 1; docId < 18; ++docId) { - b.addDoc(docId); + auto fb = b._ib.startField(0); + if (!wordEmpty && !docEmpty) { + fb->startWord("w1"); + b.addDoc(*fb, 1); + b.addDoc(*fb, 3); + fb->endWord(); + } + fb = b._ib.startField(1); + if (!wordEmpty && !docEmpty) { + // f2 + fb->startWord("w1"); + b.addDoc(*fb, 2); + b.addDoc(*fb, 4); + b.addDoc(*fb, 6); + fb->endWord(); + fb->startWord("w2"); + for (uint32_t docId = 1; docId < 18; ++docId) { + b.addDoc(*fb, docId); + } + fb->endWord(); } - b._ib.endWord(); - b._ib.endField(); } - b.close(); } void TestDiskIndex::openIndex(const std::string &dir, bool directio, bool readmmap, - bool fieldEmpty, bool docEmpty, bool wordEmpty) + bool fieldEmpty, bool docEmpty, bool wordEmpty) { buildIndex(dir, directio, fieldEmpty, docEmpty, wordEmpty); TuneFileRandRead tuneFileRead; |