diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-06-07 09:47:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-07 09:47:32 +0200 |
commit | bdcbcbdde005d44ec502a2b7c5a77dd62d3bc2eb (patch) | |
tree | e1456747d7e84ffb28d2eb68ca06b837d3d2f858 | |
parent | 6955a69081747d44ba016de7c416e79454cbe090 (diff) | |
parent | 1d079165f38e46422e4b4f51189c97b3fbe9d125 (diff) |
Merge pull request #9719 from vespa-engine/toregge/save-average-field-length-in-posting-list-file-header
Save average field length in posting list file header.
31 files changed, 217 insertions, 27 deletions
diff --git a/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp b/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp index 9a343667fd6..23a87415f7f 100644 --- a/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp +++ b/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp @@ -172,7 +172,7 @@ void Test::requireThatMemoryIndexCanBeDumpedAndSearched() { const uint64_t num_words = memory_index.getNumWords(); search::TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - index_builder.open(docIdLimit, num_words, tuneFileIndexing, fileHeaderContext); + index_builder.open(docIdLimit, num_words, MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext); memory_index.dump(index_builder); index_builder.close(); diff --git a/searchcore/src/tests/proton/index/fusionrunner_test.cpp b/searchcore/src/tests/proton/index/fusionrunner_test.cpp index 25e7a4ffa6b..e6cdbf8d6cb 100644 --- a/searchcore/src/tests/proton/index/fusionrunner_test.cpp +++ b/searchcore/src/tests/proton/index/fusionrunner_test.cpp @@ -188,6 +188,7 @@ void Test::createIndex(const string &dir, uint32_t id, bool fusion) { TuneFileIndexing tuneFileIndexing; TuneFileAttributes tuneFileAttributes; index_builder.open(docIdLimit, memory_index.getNumWords(), + MockFieldLengthInspector(), tuneFileIndexing, _fileHeaderContext); memory_index.dump(index_builder); diff --git a/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp b/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp index d206388ca04..59e943e25f4 100644 --- a/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp +++ b/searchcore/src/vespa/searchcore/proton/index/memoryindexwrapper.cpp @@ -42,7 +42,7 @@ MemoryIndexWrapper::flushToDisk(const vespalib::string &flushDir, indexBuilder.setPrefix(flushDir); SerialNumFileHeaderContext fileHeaderContext(_fileHeaderContext, serialNum); - indexBuilder.open(docIdLimit, numWords, _tuneFileIndexing, 
fileHeaderContext); + indexBuilder.open(docIdLimit, numWords, *this, _tuneFileIndexing, fileHeaderContext); _index.dump(indexBuilder); indexBuilder.close(); } diff --git a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp index c562fea69c1..83b5c05effb 100644 --- a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp +++ b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp @@ -2,6 +2,7 @@ #include <vespa/log/log.h> LOG_SETUP("bitvector_test"); #include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/index/field_length_info.h> #include <vespa/searchlib/diskindex/bitvectordictionary.h> #include <vespa/searchlib/diskindex/fieldwriter.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> @@ -47,7 +48,7 @@ FieldWriterWrapper::open(const std::string &path, const common::FileHeaderContext &fileHeaderContext) { vespalib::mkdir(path, false); - return _writer.open(path, 64, 10000, false, false, schema, indexId, tuneFileWrite, fileHeaderContext); + return _writer.open(path, 64, 10000, false, false, schema, indexId, FieldLengthInfo(), tuneFileWrite, fileHeaderContext); } FieldWriterWrapper & diff --git a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp index 7e24511435b..82a6f973cf9 100644 --- a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp +++ b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp @@ -57,6 +57,7 @@ private: void requireThatLookupIsWorking(bool fieldEmpty, bool docEmpty, bool wordEmpty); void requireThatWeCanReadPostingList(); + void require_that_we_can_get_field_length_info(); void requireThatWeCanReadBitVector(); void requireThatBlueprintIsCreated(); void requireThatBlueprintCanCreateSearchIterators(); @@ -193,6 +194,20 @@ Test::requireThatWeCanReadPostingList() } void +Test::require_that_we_can_get_field_length_info() +{ + auto info = 
_index->get_field_length_info("f1"); + EXPECT_EQUAL(3.5, info.get_average_field_length()); + EXPECT_EQUAL(21u, info.get_num_samples()); + info = _index->get_field_length_info("f2"); + EXPECT_EQUAL(4.0, info.get_average_field_length()); + EXPECT_EQUAL(23u, info.get_num_samples()); + info = _index->get_field_length_info("f3"); + EXPECT_EQUAL(0.0, info.get_average_field_length()); + EXPECT_EQUAL(0u, info.get_num_samples()); +} + +void Test::requireThatWeCanReadBitVector() { { // word 'w1' @@ -323,6 +338,7 @@ Test::Main() TEST_DO(openIndex("index/1", false, false, false, false, false)); TEST_DO(requireThatLookupIsWorking(false, false, false)); TEST_DO(requireThatWeCanReadPostingList()); + TEST_DO(require_that_we_can_get_field_length_info()); TEST_DO(requireThatWeCanReadBitVector()); TEST_DO(requireThatBlueprintIsCreated()); TEST_DO(requireThatBlueprintCanCreateSearchIterators()); @@ -330,6 +346,7 @@ Test::Main() TEST_DO(openIndex("index/2", true, false, false, false, false)); TEST_DO(requireThatLookupIsWorking(false, false, false)); TEST_DO(requireThatWeCanReadPostingList()); + TEST_DO(require_that_we_can_get_field_length_info()); TEST_DO(requireThatWeCanReadBitVector()); TEST_DO(requireThatBlueprintIsCreated()); TEST_DO(requireThatBlueprintCanCreateSearchIterators()); @@ -337,6 +354,7 @@ Test::Main() TEST_DO(openIndex("index/3", false, true, false, false, false)); TEST_DO(requireThatLookupIsWorking(false, false, false)); TEST_DO(requireThatWeCanReadPostingList()); + TEST_DO(require_that_we_can_get_field_length_info()); TEST_DO(requireThatWeCanReadBitVector()); TEST_DO(requireThatBlueprintIsCreated()); TEST_DO(requireThatBlueprintCanCreateSearchIterators()); @@ -344,6 +362,7 @@ Test::Main() TEST_DO(openIndex("index/4", true, true, false, false, false)); TEST_DO(requireThatLookupIsWorking(false, false, false)); TEST_DO(requireThatWeCanReadPostingList()); + TEST_DO(require_that_we_can_get_field_length_info()); TEST_DO(requireThatWeCanReadBitVector()); 
TEST_DO(requireThatBlueprintIsCreated()); TEST_DO(requireThatBlueprintCanCreateSearchIterators()); diff --git a/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp b/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp index cb38cd23409..2b9f8a5b201 100644 --- a/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp +++ b/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp @@ -6,6 +6,7 @@ #include <vespa/searchlib/test/fakedata/fakeword.h> #include <vespa/searchlib/test/fakedata/fakewordset.h> #include <vespa/searchlib/index/docidandfeatures.h> +#include <vespa/searchlib/index/field_length_info.h> #include <vespa/searchlib/index/postinglisthandle.h> #include <vespa/searchlib/diskindex/zcposocc.h> #include <vespa/searchlib/diskindex/zcposoccrandread.h> @@ -38,6 +39,7 @@ using search::fakedata::FakeWord; using search::fakedata::FakeWordSet; using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataArray; +using search::index::FieldLengthInfo; using search::index::DummyFileHeaderContext; using search::index::PostingListCounts; using search::index::PostingListOffsetAndCounts; @@ -200,6 +202,7 @@ WrappedFieldWriter::open() minSkipDocs, minChunkDocs, _dynamicK, _encode_cheap_features, _schema, _indexId, + FieldLengthInfo(4.5, 42), tuneFileWrite, fileHeaderContext); } @@ -427,6 +430,10 @@ readField(FakeWordSet &wordSet, if (istate._fieldReader->isValid()) istate._fieldReader->read(); + auto field_length_info = istate._fieldReader->get_field_length_info(); + assert(4.5 == field_length_info.get_average_field_length()); + assert(42u == field_length_info.get_num_samples()); + TermFieldMatchData mdfield1; unsigned int wordNum = 1; @@ -503,6 +510,9 @@ randReadField(FakeWordSet &wordSet, bool openPostingRes = postingFile->open(pname, tuneFileRandRead); assert(openPostingRes); (void) openPostingRes; + auto field_length_info = postingFile->get_field_length_info(); + assert(4.5 == 
field_length_info.get_average_field_length()); + assert(42u == field_length_info.get_num_samples()); for (int loop = 0; loop < 1; ++loop) { unsigned int wordNum = 1; diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index 694af2f1ad1..16d2a04df2e 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -326,6 +326,7 @@ Test::requireThatFusionIsWorking(const vespalib::string &prefix, bool directio, uint32_t numDocs = 12 + 1; uint32_t numWords = fic.getNumUniqueWords(); bool dynamicKPosOcc = false; + MockFieldLengthInspector mock_field_length_inspector; TuneFileIndexing tuneFileIndexing; TuneFileSearch tuneFileSearch; DummyFileHeaderContext fileHeaderContext; @@ -336,7 +337,7 @@ Test::requireThatFusionIsWorking(const vespalib::string &prefix, bool directio, } if (readmmap) tuneFileSearch._read.setWantMemoryMap(); - ib.open(numDocs, numWords, tuneFileIndexing, fileHeaderContext); + ib.open(numDocs, numWords, mock_field_length_inspector, tuneFileIndexing, fileHeaderContext); fic.dump(ib); ib.close(); diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index 234cf9b5e84..05c905cdc32 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -740,7 +740,7 @@ TEST_F(FieldIndexCollectionTest, require_that_dumping_words_with_no_docs_to_inde b.setPrefix("dump"); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - b.open(5, 2, tuneFileIndexing, fileHeaderContext); + b.open(5, 2, MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext); fic.dump(b); b.close(); } @@ -1210,7 +1210,9 @@ TEST_F(UriInverterTest, require_that_uri_indexing_is_working) dib.setPrefix("urldump"); TuneFileIndexing tuneFileIndexing; 
DummyFileHeaderContext fileHeaderContext; - dib.open(11, _fic.getNumUniqueWords(), tuneFileIndexing, + dib.open(11, _fic.getNumUniqueWords(), + MockFieldLengthInspector(), + tuneFileIndexing, fileHeaderContext); _fic.dump(dib); dib.close(); diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp index 006c0b29ffb..b789bf16947 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp @@ -21,7 +21,8 @@ PosOccFieldParams::PosOccFieldParams() _hasElementWeights(false), _avgElemLen(512), _collectionType(SINGLE), - _name() + _name(), + _field_length_info() { } @@ -128,14 +129,37 @@ PosOccFieldParams::setSchemaParams(const Schema &schema, uint32_t fieldId) _name = field.getName(); } +namespace { + +vespalib::string field_length_infix = "field_length."; + +struct FieldLengthKeys { + vespalib::string _average; + vespalib::string _samples; + FieldLengthKeys(const vespalib::string &prefix); + ~FieldLengthKeys(); +}; + +FieldLengthKeys::FieldLengthKeys(const vespalib::string &prefix) + : _average(prefix + field_length_infix + "average"), + _samples(prefix + field_length_infix + "samples") +{ +} + +FieldLengthKeys::~FieldLengthKeys() = default; + +} void -PosOccFieldParams::readHeader(const vespalib::GenericHeader &header, +PosOccFieldParams::readHeader(const GenericHeader &header, const vespalib::string &prefix) { + using Tag = GenericHeader::Tag; vespalib::string nameKey(prefix + "fieldName"); vespalib::string collKey(prefix + "collectionType"); vespalib::string avgElemLenKey(prefix + "avgElemLen"); + FieldLengthKeys field_length_keys(prefix); + _name = header.getTag(nameKey).asString(); Schema::CollectionType ct = schema::collectionTypeFromName(header.getTag(collKey).asString()); switch (ct) { @@ -158,17 +182,28 @@ PosOccFieldParams::readHeader(const vespalib::GenericHeader &header, 
LOG_ABORT("Bad collection type when reading field param in header"); } _avgElemLen = header.getTag(avgElemLenKey).asInteger(); + if (header.hasTag(field_length_keys._average) && + header.hasTag(field_length_keys._samples)) { + const auto &average_field_length_tag = header.getTag(field_length_keys._average); + const auto &field_length_samples_tag = header.getTag(field_length_keys._samples); + if (average_field_length_tag.getType() == Tag::Type::TYPE_FLOAT && + field_length_samples_tag.getType() == Tag::Type::TYPE_INTEGER) { + _field_length_info = index::FieldLengthInfo(average_field_length_tag.asFloat(), field_length_samples_tag.asInteger()); + } + } } void -PosOccFieldParams::writeHeader(vespalib::GenericHeader &header, +PosOccFieldParams::writeHeader(GenericHeader &header, const vespalib::string &prefix) const { + using Tag = GenericHeader::Tag; vespalib::string nameKey(prefix + "fieldName"); vespalib::string collKey(prefix + "collectionType"); vespalib::string avgElemLenKey(prefix + "avgElemLen"); - header.putTag(GenericHeader::Tag(nameKey, _name)); + FieldLengthKeys field_length_keys(prefix); + header.putTag(Tag(nameKey, _name)); Schema::CollectionType ct(schema::CollectionType::SINGLE); switch (_collectionType) { case SINGLE: @@ -183,8 +218,10 @@ PosOccFieldParams::writeHeader(vespalib::GenericHeader &header, default: LOG_ABORT("Bad collection type when writing field param in header"); } - header.putTag(GenericHeader::Tag(collKey, schema::getTypeName(ct))); - header.putTag(GenericHeader::Tag(avgElemLenKey, _avgElemLen)); + header.putTag(Tag(collKey, schema::getTypeName(ct))); + header.putTag(Tag(avgElemLenKey, _avgElemLen)); + header.putTag(Tag(field_length_keys._average, _field_length_info.get_average_field_length())); + header.putTag(Tag(field_length_keys._samples, static_cast<int64_t>(_field_length_info.get_num_samples()))); } } diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h 
b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h index c781cec4db5..f053f558433 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h @@ -4,6 +4,7 @@ #include <cstdint> #include <vespa/vespalib/stllike/string.h> +#include <vespa/searchlib/index/field_length_info.h> namespace search::index { class PostingListParams; @@ -33,6 +34,7 @@ public: uint32_t _avgElemLen; CollectionType _collectionType; vespalib::string _name; + index::FieldLengthInfo _field_length_info; PosOccFieldParams(); @@ -43,6 +45,8 @@ public: void setSchemaParams(const Schema &schema, uint32_t fieldId); void readHeader(const vespalib::GenericHeader &header, const vespalib::string &prefix); void writeHeader(vespalib::GenericHeader &header, const vespalib::string &prefix) const; + const index::FieldLengthInfo &get_field_length_info() const { return _field_length_info; } + void set_field_length_info(const index::FieldLengthInfo &field_length_info) { _field_length_info = field_length_info; } }; } diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp index 2e2674f98c6..f6a0ac0f783 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp +++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp @@ -129,4 +129,11 @@ PosOccFieldsParams::writeHeader(vespalib::GenericHeader &header, } } +void +PosOccFieldsParams::set_field_length_info(const index::FieldLengthInfo &field_length_info) +{ + assert(!_params.empty()); + _params.front().set_field_length_info(field_length_info); +} + } diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h index f6ae886b3f0..2bc32bdd75b 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h +++ 
b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h @@ -44,6 +44,7 @@ public: void setSchemaParams(const Schema &schema, const uint32_t indexId); void readHeader(const vespalib::GenericHeader &header, const vespalib::string &prefix); void writeHeader(vespalib::GenericHeader &header, const vespalib::string &prefix) const; + void set_field_length_info(const index::FieldLengthInfo &field_length_info); }; } diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp index b6d843e4e3c..a964ae1ce6a 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp @@ -464,4 +464,15 @@ DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSp } } +FieldLengthInfo +DiskIndex::get_field_length_info(const vespalib::string& field_name) const +{ + uint32_t fieldId = _schema.getIndexFieldId(field_name); + if (fieldId != Schema::UNKNOWN_FIELD_ID) { + return _postingFiles[fieldId]->get_field_length_info(); + } else { + return FieldLengthInfo(); + } +} + } diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.h b/searchlib/src/vespa/searchlib/diskindex/diskindex.h index d83b2f56d7c..91fd33a2c4a 100644 --- a/searchlib/src/vespa/searchlib/diskindex/diskindex.h +++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.h @@ -5,6 +5,7 @@ #include "bitvectordictionary.h" #include "zcposoccrandread.h" #include <vespa/searchlib/index/dictionaryfile.h> +#include <vespa/searchlib/index/field_length_info.h> #include <vespa/searchlib/queryeval/searchable.h> #include <vespa/vespalib/stllike/string.h> #include <vespa/vespalib/stllike/cache.h> @@ -147,6 +148,8 @@ public: * Needed for the Cache::BackingStore interface. 
*/ bool read(const Key & key, LookupResultVector & result); + + index::FieldLengthInfo get_field_length_info(const vespalib::string& field_name) const; }; void swap(DiskIndex::LookupResult & a, DiskIndex::LookupResult & b); diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp index d03f7ca4149..1ee7b9ae9ae 100644 --- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp @@ -57,14 +57,15 @@ makePosOccWrite(PostingListCountFileSeqWrite *const posOccCountWrite, const PostingListParams &params, const PostingListParams &featureParams, const Schema &schema, - uint32_t indexId) + uint32_t indexId, + const index::FieldLengthInfo &field_length_info) { std::unique_ptr<PostingListFileSeqWrite> posOccWrite; if (dynamicK) { - posOccWrite = std::make_unique<ZcPosOccSeqWrite>(schema, indexId, posOccCountWrite); + posOccWrite = std::make_unique<ZcPosOccSeqWrite>(schema, indexId, field_length_info, posOccCountWrite); } else { - posOccWrite = std::make_unique<Zc4PosOccSeqWrite>(schema, indexId, posOccCountWrite); + posOccWrite = std::make_unique<Zc4PosOccSeqWrite>(schema, indexId, field_length_info, posOccCountWrite); } posOccWrite->setFeatureParams(featureParams); diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.h b/searchlib/src/vespa/searchlib/diskindex/extposocc.h index 285715849db..49852b18ad3 100644 --- a/searchlib/src/vespa/searchlib/diskindex/extposocc.h +++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.h @@ -10,6 +10,7 @@ namespace search { } namespace search::index { + class FieldLengthInfo; class PostingListParams; class PostingListCountFileSeqWrite; class PostingListCountFileSeqRead; @@ -33,7 +34,8 @@ makePosOccWrite(index::PostingListCountFileSeqWrite *const posOccCountWrite, const index::PostingListParams &params, const index::PostingListParams &featureParams, const index::Schema &schema, - uint32_t indexId); + uint32_t 
indexId, + const index::FieldLengthInfo &field_length_info); std::unique_ptr<index::PostingListFileSeqRead> makePosOccRead(const vespalib::string &name, diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index c4cd6d3a22e..d3696e2f31c 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -18,6 +18,7 @@ vespalib::string PosOccIdCooked = "PosOcc.3.Cooked"; } using vespalib::getLastErrorString; +using search::index::FieldLengthInfo; using search::index::Schema; using search::index::SchemaUtil; using search::bitcompression::PosOccFieldParams; @@ -179,6 +180,11 @@ FieldReader::getFeatureParams(PostingListParams &params) _oldposoccfile->getFeatureParams(params); } +const FieldLengthInfo & +FieldReader::get_field_length_info() const +{ + return _oldposoccfile->get_field_length_info(); +} std::unique_ptr<FieldReader> FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index, diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h index 243da21731b..ee237f5cc69 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h @@ -93,6 +93,7 @@ public: virtual void setFeatureParams(const PostingListParams &params); virtual void getFeatureParams(PostingListParams &params); uint32_t getDocIdLimit() const { return _docIdLimit; } + const index::FieldLengthInfo &get_field_length_info() const; static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema); }; diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp index 7fb575da7f3..ae308db1a4f 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp @@ -9,6 +9,8 @@ 
LOG_SETUP(".diskindex.fieldwriter"); +using search::index::FieldLengthInfo; + namespace search::diskindex { using vespalib::getLastErrorString; @@ -39,6 +41,7 @@ FieldWriter::open(const vespalib::string &prefix, bool encode_cheap_features, const Schema &schema, const uint32_t indexId, + const FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) { @@ -66,7 +69,7 @@ FieldWriter::open(const vespalib::string &prefix, _dictFile = std::make_unique<PageDict4FileSeqWrite>(); _dictFile->setParams(countParams); - _posoccfile = makePosOccWrite(_dictFile.get(), dynamicKPosOccFormat, params, featureParams, schema, indexId); + _posoccfile = makePosOccWrite(_dictFile.get(), dynamicKPosOccFormat, params, featureParams, schema, indexId, field_length_info); vespalib::string cname = _prefix + "dictionary"; // Open output dictionary file diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h index c71bc4f4132..69c763bbd77 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h @@ -71,6 +71,7 @@ public: bool dynamicKPosOccFormat, bool encode_cheap_features, const Schema &schema, uint32_t indexId, + const index::FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const search::common::FileHeaderContext &fileHeaderContext); diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp index 1000dcb605e..1ace5969b6b 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp @@ -4,6 +4,7 @@ #include "fieldreader.h" #include "dictionarywordreader.h" #include <vespa/vespalib/util/stringfmt.h> +#include <vespa/searchlib/index/field_length_info.h> #include <vespa/searchlib/util/filekit.h> #include <vespa/searchlib/util/dirtraverse.h> #include 
<vespa/vespalib/io/fileutil.h> @@ -26,6 +27,7 @@ using search::common::FileHeaderContext; using search::diskindex::DocIdMapping; using search::diskindex::WordNumMapping; using search::docsummary::DocumentSummary; +using search::index::FieldLengthInfo; using search::index::PostingListParams; using search::index::Schema; using search::index::SchemaUtil; @@ -324,13 +326,15 @@ Fusion::openInputFieldReaders(const SchemaUtil::IndexIterator &index, const Word bool -Fusion::openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter &writer) +Fusion::openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter &writer, const FieldLengthInfo &field_length_info) { vespalib::string dir = _outDir + "/" + index.getName(); if (!writer.open(dir + "/", 64, 262144, _dynamicKPosIndexFormat, index.use_experimental_posting_list_format(), index.getSchema(), - index.getIndex(), _tuneFileIndexing._write, _fileHeaderContext)) { + index.getIndex(), + field_length_info, + _tuneFileIndexing._write, _fileHeaderContext)) { throw IllegalArgumentException(make_string("Could not open output posocc + dictionary in %s", dir.c_str())); } return true; @@ -368,7 +372,11 @@ Fusion::mergeFieldPostings(const SchemaUtil::IndexIterator &index, const WordNum if (!openInputFieldReaders(index, list, readers)) { return false; } - if (!openFieldWriter(index, fieldWriter)) { + FieldLengthInfo field_length_info; + if (!readers.empty()) { + field_length_info = readers.back()->get_field_length_info(); + } + if (!openFieldWriter(index, fieldWriter, field_length_info)) { return false; } if (!setupMergeHeap(readers, fieldWriter, heap)) { diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.h b/searchlib/src/vespa/searchlib/diskindex/fusion.h index 53f9db75758..28060a9c4be 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fusion.h +++ b/searchlib/src/vespa/searchlib/diskindex/fusion.h @@ -11,6 +11,7 @@ namespace search { template <class IN> class PostingPriorityQueue; } namespace search { class 
TuneFileIndexing; } namespace search::common { class FileHeaderContext; } +namespace search::index { class FieldLengthInfo; } namespace search::diskindex { @@ -50,7 +51,7 @@ private: bool mergeField(uint32_t id); bool openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list, std::vector<std::unique_ptr<FieldReader> > & readers); - bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer); + bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer, const index::FieldLengthInfo &field_length_info); bool setupMergeHeap(const std::vector<std::unique_ptr<FieldReader> > & readers, FieldWriter &writer, PostingPriorityQueue<FieldReader> &heap); bool mergeFieldPostings(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list, uint64_t numWordIds); diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp index f9620c35908..c2e311f18a6 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp @@ -2,6 +2,8 @@ #include "indexbuilder.h" #include <vespa/searchlib/index/docidandfeatures.h> +#include <vespa/searchlib/index/field_length_info.h> +#include <vespa/searchlib/index/i_field_length_inspector.h> #include <vespa/searchlib/index/schemautil.h> #include <vespa/searchlib/common/documentsummary.h> #include <vespa/vespalib/io/fileutil.h> @@ -19,6 +21,8 @@ namespace { using common::FileHeaderContext; using index::DocIdAndFeatures; +using index::FieldLengthInfo; +using index::IFieldLengthInspector; using index::PostingListCounts; using index::Schema; using index::SchemaUtil; @@ -37,6 +41,7 @@ public: void open(vespalib::stringref dir, const SchemaUtil::IndexIterator &index, uint32_t docIdLimit, uint64_t numWordIds, + const FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext 
&fileHeaderContext); @@ -69,6 +74,7 @@ public: const vespalib::string &getName(); vespalib::string getDir(); void open(uint32_t docIdLimit, uint64_t numWordIds, + const FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext); void close(); @@ -90,6 +96,7 @@ void FileHandle::open(vespalib::stringref dir, const SchemaUtil::IndexIterator &index, uint32_t docIdLimit, uint64_t numWordIds, + const FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) { @@ -100,6 +107,7 @@ FileHandle::open(vespalib::stringref dir, if (!_fieldWriter->open(dir + "/", 64, 262144u, false, index.use_experimental_posting_list_format(), index.getSchema(), index.getIndex(), + field_length_info, tuneFileWrite, fileHeaderContext)) { LOG(error, "Could not open term writer %s for write (%s)", vespalib::string(dir).c_str(), getLastErrorString().c_str()); @@ -170,12 +178,15 @@ IndexBuilder::FieldHandle::getDir() void IndexBuilder::FieldHandle::open(uint32_t docIdLimit, uint64_t numWordIds, + const FieldLengthInfo &field_length_info, const TuneFileSeqWrite &tuneFileWrite, const FileHeaderContext &fileHeaderContext) { _file.open(getDir(), SchemaUtil::IndexIterator(*_schema, getIndexId()), - docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext); + docIdLimit, numWordIds, + field_length_info, + tuneFileWrite, fileHeaderContext); } void @@ -278,6 +289,7 @@ IndexBuilder::appendToPrefix(vespalib::stringref name) void IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds, + const IFieldLengthInspector &field_length_inspector, const TuneFileIndexing &tuneFileIndexing, const FileHeaderContext &fileHeaderContext) { @@ -294,7 +306,9 @@ IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds, continue; } vespalib::mkdir(fh.getDir(), false); - fh.open(docIdLimit, numWordIds, tuneFileIndexing._write, + fh.open(docIdLimit, numWordIds, + 
field_length_inspector.get_field_length_info(fh.getName()), + tuneFileIndexing._write, fileHeaderContext); indexes.push_back(fh.getIndexId()); } diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h index a1a77d608cd..4423c7e91a0 100644 --- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h +++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h @@ -8,6 +8,7 @@ #include <vector> namespace search::common { class FileHeaderContext; } +namespace search::index { class IFieldLengthInspector; } namespace search::diskindex { @@ -64,6 +65,7 @@ public: vespalib::string appendToPrefix(vespalib::stringref name); void open(uint32_t docIdLimit, uint64_t numWordIds, + const index::IFieldLengthInspector &field_length_inspector, const TuneFileIndexing &tuneFileIndexing, const common::FileHeaderContext &fileHandleContext); diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp index 3ae2a631cb1..e3bb400f4d3 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp @@ -12,6 +12,7 @@ namespace search::diskindex { using search::bitcompression::PosOccFieldsParams; using search::bitcompression::EG2PosOccDecodeContext; using search::bitcompression::EGPosOccDecodeContext; +using search::index::FieldLengthInfo; using search::index::PostingListCountFileSeqRead; using search::index::PostingListCountFileSeqWrite; @@ -52,9 +53,15 @@ Zc4PosOccSeqRead::getSubIdentifier() return d.getIdentifier(); } +const FieldLengthInfo & +Zc4PosOccSeqRead::get_field_length_info() const +{ + return _fieldsParams.getFieldParams()->get_field_length_info(); +} Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema, uint32_t indexId, + const FieldLengthInfo &field_length_info, PostingListCountFileSeqWrite *countFile) : Zc4PostingSeqWrite(countFile), _fieldsParams(), @@ -62,6 +69,7 @@ 
Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema, { _writer.set_encode_features(&_realEncodeFeatures); _fieldsParams.setSchemaParams(schema, indexId); + _fieldsParams.set_field_length_info(field_length_info); } @@ -102,9 +110,15 @@ ZcPosOccSeqRead::getSubIdentifier() return d.getIdentifier(); } +const FieldLengthInfo & +ZcPosOccSeqRead::get_field_length_info() const +{ + return _fieldsParams.getFieldParams()->get_field_length_info(); +} ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema, uint32_t indexId, + const FieldLengthInfo &field_length_info, PostingListCountFileSeqWrite *countFile) : ZcPostingSeqWrite(countFile), _fieldsParams(), @@ -112,6 +126,7 @@ ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema, { _writer.set_encode_features(&_realEncodeFeatures); _fieldsParams.setSchemaParams(schema, indexId); + _fieldsParams.set_field_length_info(field_length_info); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h index 27700399bfb..0615cf636de 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h @@ -19,6 +19,7 @@ public: Zc4PosOccSeqRead(index::PostingListCountFileSeqRead *countFile); void setFeatureParams(const PostingListParams &params) override; static const vespalib::string &getSubIdentifier(); + const index::FieldLengthInfo &get_field_length_info() const override; }; @@ -31,7 +32,9 @@ private: public: typedef index::Schema Schema; - Zc4PosOccSeqWrite(const Schema &schema, uint32_t indexId, index::PostingListCountFileSeqWrite *countFile); + Zc4PosOccSeqWrite(const Schema &schema, uint32_t indexId, + const index::FieldLengthInfo &field_length_info, + index::PostingListCountFileSeqWrite *countFile); }; @@ -45,6 +48,7 @@ public: ZcPosOccSeqRead(index::PostingListCountFileSeqRead *countFile); void setFeatureParams(const PostingListParams &params) override; static const vespalib::string &getSubIdentifier(); + const
index::FieldLengthInfo &get_field_length_info() const override; }; @@ -55,7 +59,9 @@ private: bitcompression::EGPosOccEncodeContext<true> _realEncodeFeatures; public: typedef index::Schema Schema; - ZcPosOccSeqWrite(const Schema &schema, uint32_t indexId, index::PostingListCountFileSeqWrite *countFile); + ZcPosOccSeqWrite(const Schema &schema, uint32_t indexId, + const index::FieldLengthInfo &field_length_info, + index::PostingListCountFileSeqWrite *countFile); }; } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp index edbd78b9b01..aa4f15bc225 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp @@ -17,6 +17,7 @@ using search::bitcompression::EGPosOccDecodeContext; using search::bitcompression::EGPosOccDecodeContextCooked; using search::bitcompression::PosOccFieldsParams; using search::bitcompression::FeatureDecodeContext; +using search::index::FieldLengthInfo; using search::index::PostingListCounts; using search::index::PostingListHandle; using search::ComprFileReadContext; @@ -246,6 +247,11 @@ ZcPosOccRandRead::getSubIdentifier() return d.getIdentifier(); } +const FieldLengthInfo & +ZcPosOccRandRead::get_field_length_info() const +{ + return _fieldsParams.getFieldParams()->get_field_length_info(); +} Zc4PosOccRandRead:: Zc4PosOccRandRead() diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h index 26b23161cb1..7ae59611e35 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h @@ -50,6 +50,7 @@ public: virtual void readHeader(); static const vespalib::string &getIdentifier(); static const vespalib::string &getSubIdentifier(); + const index::FieldLengthInfo &get_field_length_info() const override; }; class Zc4PosOccRandRead : public 
ZcPosOccRandRead diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h index 01049e720a9..6ca87f01aea 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h @@ -72,7 +72,8 @@ public: void writeDocIdAndFeatures(const DocIdAndFeatures &features) override; void flushWord() override; - bool open(const vespalib::string &name, const TuneFileSeqWrite &tuneFileWrite, + bool open(const vespalib::string &name, + const TuneFileSeqWrite &tuneFileWrite, const search::common::FileHeaderContext &fileHeaderContext) override; bool close() override; diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h index 1e7dde7f139..1569bdd13b6 100644 --- a/searchlib/src/vespa/searchlib/index/postinglistfile.h +++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h @@ -13,6 +13,7 @@ namespace search::common { class FileHeaderContext; } namespace search::index { class DocIdAndFeatures; +class FieldLengthInfo; /** * Interface for posting list files containing document ids and features @@ -60,6 +61,8 @@ public: * Get current (word, docid) feature parameters. 
*/ virtual void getFeatureParams(PostingListParams &params); + + virtual const FieldLengthInfo &get_field_length_info() const = 0; }; /** @@ -169,6 +172,8 @@ public: */ virtual bool close() = 0; + virtual const FieldLengthInfo &get_field_length_info() const = 0; + bool getMemoryMapped() const { return _memoryMapped; } protected: diff --git a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp index 1e25878a33e..5de51ca3933 100644 --- a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp +++ b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp @@ -3,19 +3,39 @@ #include "testdiskindex.h" #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/i_field_length_inspector.h> #include <vespa/vespalib/io/fileutil.h> namespace search::diskindex { using index::DocIdAndFeatures; using index::DummyFileHeaderContext; +using index::FieldLengthInfo; +using index::IFieldLengthInspector; using index::Schema; using index::WordDocElementWordPosFeatures; using index::schema::DataType; +namespace { + +class MockFieldLengthInspector : public IFieldLengthInspector { + FieldLengthInfo get_field_length_info(const vespalib::string& field_name) const override { + if (field_name == "f1") { + return FieldLengthInfo(3.5, 21); + } else if (field_name == "f2") { + return FieldLengthInfo(4.0, 23); + } else { + return FieldLengthInfo(); + } + } +}; + +} + struct Builder { search::diskindex::IndexBuilder _ib; + MockFieldLengthInspector _mock_field_length_inspector; TuneFileIndexing _tuneFileIndexing; DummyFileHeaderContext _fileHeaderContext; DocIdAndFeatures _features; @@ -35,7 +55,7 @@ struct Builder _tuneFileIndexing._write.setWantDirectIO(); } _ib.setPrefix(dir); - _ib.open(docIdLimit, numWordIds, _tuneFileIndexing, + _ib.open(docIdLimit, numWordIds, _mock_field_length_inspector, _tuneFileIndexing,
_fileHeaderContext); } |