diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-19 19:16:00 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-20 10:47:58 +0200 |
commit | 38de0304985d85dc9da58e15ad494054bff5d5dc (patch) | |
tree | ddced6a34d60e8b6d3c60df069b8da56fe9e0d01 | |
parent | 2f2c641e9aa25476d46ff97846185da05b32d6d7 (diff) |
Reconstruct interleaved features as needed.
6 files changed, 168 insertions, 35 deletions
diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index 4779ddcb10d..da29918a5fb 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -59,8 +59,8 @@ protected: const Schema & getSchema() const { return _schema; } void requireThatFusionIsWorking(const vespalib::string &prefix, bool directio, bool readmmap); - void make_empty_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector); - void merge_empty_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources); + void make_simple_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector); + void merge_simple_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources); public: FusionTest(); }; @@ -97,6 +97,72 @@ toString(FieldPositionsIterator posItr, bool hasElements = false, bool hasWeight return ss.str(); } +std::unique_ptr<Document> +make_doc10(DocBuilder &b) +{ + b.startDocument("doc::10"); + b.startIndexField("f0"). + addStr("a").addStr("b").addStr("c").addStr("d"). + addStr("e").addStr("f").addStr("z"). + endField(); + b.startIndexField("f1"). + addStr("w").addStr("x"). + addStr("y").addStr("z"). + endField(); + b.startIndexField("f2"). + startElement(4).addStr("ax").addStr("ay").addStr("z").endElement(). + startElement(5).addStr("ax").endElement(). + endField(); + b.startIndexField("f3"). + startElement(4).addStr("wx").addStr("z").endElement(). 
+ endField(); + + return b.endDocument(); +} + +Schema::IndexField +make_index_field(vespalib::stringref name, CollectionType collection_type, bool interleaved_features) +{ + Schema::IndexField index_field(name, DataType::STRING, collection_type); + index_field.set_experimental_posting_list_format(interleaved_features); + return index_field; +} + +Schema +make_schema(bool interleaved_features) +{ + Schema schema; + schema.addIndexField(make_index_field("f0", CollectionType::SINGLE, interleaved_features)); + schema.addIndexField(make_index_field("f1", CollectionType::SINGLE, interleaved_features)); + schema.addIndexField(make_index_field("f2", CollectionType::ARRAY, interleaved_features)); + schema.addIndexField(make_index_field("f3", CollectionType::WEIGHTEDSET, interleaved_features)); + return schema; +} + +void +assert_interleaved_features(DiskIndex &d, const vespalib::string &field, const vespalib::string &term, uint32_t doc_id, uint32_t exp_num_occs, uint32_t exp_field_length) +{ + using LookupResult = DiskIndex::LookupResult; + using PostingListHandle = index::PostingListHandle; + using SearchIterator = search::queryeval::SearchIterator; + + const Schema &schema = d.getSchema(); + uint32_t field_id(schema.getIndexFieldId(field)); + std::unique_ptr<LookupResult> lookup_result(d.lookup(field_id, term)); + ASSERT_TRUE(lookup_result); + std::unique_ptr<PostingListHandle> handle(d.readPostingList(*lookup_result)); + ASSERT_TRUE(handle); + TermFieldMatchData tfmd; + TermFieldMatchDataArray tfmda; + tfmda.add(&tfmd); + std::unique_ptr<SearchIterator> sbap(handle->createIterator(lookup_result->counts, tfmda)); + sbap->initFullRange(); + EXPECT_TRUE(sbap->seek(doc_id)); + sbap->unpack(doc_id); + EXPECT_EQ(exp_num_occs, tfmd.getNumOccs()); + EXPECT_EQ(exp_field_length, tfmd.getFieldLength()); +} + void validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights) { @@ -253,24 +319,7 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool 
dire DocumentInverter inv(schema, invertThreads, pushThreads, fic); Document::UP doc; - b.startDocument("doc::10"); - b.startIndexField("f0"). - addStr("a").addStr("b").addStr("c").addStr("d"). - addStr("e").addStr("f").addStr("z"). - endField(); - b.startIndexField("f1"). - addStr("w").addStr("x"). - addStr("y").addStr("z"). - endField(); - b.startIndexField("f2"). - startElement(4).addStr("ax").addStr("ay").addStr("z").endElement(). - startElement(5).addStr("ax").endElement(). - endField(); - b.startIndexField("f3"). - startElement(4).addStr("wx").addStr("z").endElement(). - endField(); - - doc = b.endDocument(); + doc = make_doc10(b); inv.invertDocument(10, *doc); invertThreads.sync(); myPushDocument(inv); @@ -400,11 +449,21 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire } void -FusionTest::make_empty_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector) +FusionTest::make_simple_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector) { FieldIndexCollection fic(_schema, field_length_inspector); - uint32_t numDocs = 1; - uint32_t numWords = 1; + uint32_t numDocs = 20; + uint32_t numWords = 1000; + DocBuilder b(_schema); + SequencedTaskExecutor invertThreads(2); + SequencedTaskExecutor pushThreads(2); + DocumentInverter inv(_schema, invertThreads, pushThreads, fic); + + inv.invertDocument(10, *make_doc10(b)); + invertThreads.sync(); + myPushDocument(inv); + pushThreads.sync(); + IndexBuilder ib(_schema); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; @@ -415,12 +474,12 @@ FusionTest::make_empty_index(const vespalib::string &dump_dir, const IFieldLengt } void -FusionTest::merge_empty_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources) +FusionTest::merge_simple_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources) { vespalib::ThreadStackExecutor executor(4, 
0x10000); TuneFileIndexing tuneFileIndexing; DummyFileHeaderContext fileHeaderContext; - SelectorArray selector(1, 0); + SelectorArray selector(20, 0); ASSERT_TRUE(Fusion::merge(_schema, dump_dir, sources, selector, false, tuneFileIndexing, fileHeaderContext, executor)); @@ -428,12 +487,8 @@ FusionTest::merge_empty_indexes(const vespalib::string &dump_dir, const std::vec FusionTest::FusionTest() : ::testing::Test(), - _schema() + _schema(make_schema(false)) { - _schema.addIndexField(Schema::IndexField("f0", DataType::STRING)); - _schema.addIndexField(Schema::IndexField("f1", DataType::STRING)); - _schema.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY)); - _schema.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET)); } TEST_F(FusionTest, require_that_normal_fusion_is_working) @@ -470,15 +525,31 @@ void clean_field_length_testdirs() TEST_F(FusionTest, require_that_average_field_length_is_preserved) { clean_field_length_testdirs(); - make_empty_index("fldump2", MockFieldLengthInspector()); - make_empty_index("fldump3", MyMockFieldLengthInspector()); - merge_empty_indexes("fldump4", {"fldump2", "fldump3"}); + make_simple_index("fldump2", MockFieldLengthInspector()); + make_simple_index("fldump3", MyMockFieldLengthInspector()); + merge_simple_indexes("fldump4", {"fldump2", "fldump3"}); DiskIndex disk_index("fldump4"); ASSERT_TRUE(disk_index.setup(TuneFileSearch())); EXPECT_EQ(3.5, disk_index.get_field_length_info("f0").get_average_field_length()); clean_field_length_testdirs(); } +TEST_F(FusionTest, require_that_interleaved_features_can_be_reconstructed) +{ + clean_field_length_testdirs(); + make_simple_index("fldump2", MockFieldLengthInspector()); + _schema = make_schema(true); // want interleaved features + merge_simple_indexes("fldump4", {"fldump2"}); + DiskIndex disk_index("fldump4"); + ASSERT_TRUE(disk_index.setup(TuneFileSearch())); + assert_interleaved_features(disk_index, "f0", "a", 10, 1, 7); + 
assert_interleaved_features(disk_index, "f1", "w", 10, 1, 4); + assert_interleaved_features(disk_index, "f2", "ax", 10, 2, 4); + assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 3); + assert_interleaved_features(disk_index, "f3", "wx", 10, 1, 2); + clean_field_length_testdirs(); +} + } } diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index d3696e2f31c..68d37c43cb2 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -14,6 +14,7 @@ LOG_SETUP(".diskindex.fieldreader"); namespace { vespalib::string PosOccIdCooked = "PosOcc.3.Cooked"; +vespalib::string interleaved_features("interleaved_features"); } @@ -192,12 +193,16 @@ FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index, { assert(index.isValid()); if (index.hasMatchingOldFields(oldSchema)) { - return std::make_unique<FieldReader>(); // The common case + if (!index.use_experimental_posting_list_format() || + index.has_matching_experimental_posting_list_format(oldSchema)) { + return std::make_unique<FieldReader>(); // The common case + } } if (!index.hasOldFields(oldSchema)) { return std::make_unique<FieldReaderEmpty>(index); // drop data } // field exists in old schema with different collection type setting + // or old field is missing wanted interleaved features. 
return std::make_unique<FieldReaderStripInfo>(index); // degraded } @@ -230,7 +235,9 @@ FieldReaderEmpty::getFeatureParams(PostingListParams &params) FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index) : _hasElements(false), - _hasElementWeights(false) + _hasElementWeights(false), + _want_interleaved_features(index.use_experimental_posting_list_format()), + _regenerate_interleaved_features(false) { PosOccFieldsParams fieldsParams; fieldsParams.setSchemaParams(index.getSchema(), index.getIndex()); @@ -247,6 +254,26 @@ FieldReaderStripInfo::allowRawFeatures() return false; } +bool +FieldReaderStripInfo::open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) +{ + if (!FieldReader::open(prefix, tuneFileRead)) { + return false; + } + if (_want_interleaved_features) { + PostingListParams params; + bool decode_interleaved_features = false; + _oldposoccfile->getParams(params); + params.get(interleaved_features, decode_interleaved_features); + if (!decode_interleaved_features) { + _regenerate_interleaved_features = true; + } + if (!_hasElements) { + _regenerate_interleaved_features = true; + } + } + return true; +} void FieldReaderStripInfo::read() @@ -283,6 +310,19 @@ FieldReaderStripInfo::read() } break; } + if (_regenerate_interleaved_features) { + // Regenerate interleaved featues from normal features. + uint32_t field_length = 0; + uint32_t num_occs = 0; + DocIdAndFeatures &features = _docIdAndFeatures; + for (const auto &element : features.elements()) { + field_length += element.getElementLen(); + num_occs += element.getNumOccs(); + } + // Note: Length of elements without occurrences is not included.
+ features.set_field_length(field_length); + features.set_num_occs(num_occs); + } } diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h index ee237f5cc69..106e10d2e80 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h @@ -117,15 +117,19 @@ public: /* * Field reader that strips information from source, e.g. remove * weights or discard nonzero elements, due to collection type change. + * It is also used to regenerate interleaved features from normal features. */ class FieldReaderStripInfo : public FieldReader { private: bool _hasElements; bool _hasElementWeights; + bool _want_interleaved_features; + bool _regenerate_interleaved_features; public: FieldReaderStripInfo(const IndexIterator &index); bool allowRawFeatures() override; + bool open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) override; void read() override; void getFeatureParams(PostingListParams &params) override; }; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp index 3f154c44cb9..b8a0813e33c 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp @@ -123,6 +123,7 @@ Zc4PostingSeqRead::getParams(PostingListParams &params) params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs); } params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs); + params.set(interleaved_features, _reader.get_posting_params()._encode_interleaved_features); } @@ -357,6 +358,7 @@ getParams(PostingListParams &params) params.set("minChunkDocs", _writer.get_min_chunk_docs()); } params.set("minSkipDocs", _writer.get_min_skip_docs()); + params.set(interleaved_features, _writer.get_encode_interleaved_features()); } diff --git a/searchlib/src/vespa/searchlib/index/schemautil.cpp
b/searchlib/src/vespa/searchlib/index/schemautil.cpp index 7f3b7c8c2a9..66edd22d72e 100644 --- a/searchlib/src/vespa/searchlib/index/schemautil.cpp +++ b/searchlib/src/vespa/searchlib/index/schemautil.cpp @@ -69,6 +69,20 @@ SchemaUtil::IndexIterator::hasMatchingOldFields(const Schema &oldSchema) const } bool +SchemaUtil::IndexIterator::has_matching_experimental_posting_list_format(const Schema &oldSchema) const +{ + assert(isValid()); + const Schema::IndexField &newField = getSchema().getIndexField(getIndex()); + const vespalib::string &fieldName = newField.getName(); + uint32_t oldFieldId = oldSchema.getIndexFieldId(fieldName); + if (oldFieldId == Schema::UNKNOWN_FIELD_ID) { + return false; + } + const Schema::IndexField &oldField = oldSchema.getIndexField(oldFieldId); + return (oldField.use_experimental_posting_list_format() == newField.use_experimental_posting_list_format()); +} + +bool SchemaUtil::validateIndexField(const Schema::IndexField &field) { bool ok = true; diff --git a/searchlib/src/vespa/searchlib/index/schemautil.h b/searchlib/src/vespa/searchlib/index/schemautil.h index 69b79ecfedd..f575b4de9f5 100644 --- a/searchlib/src/vespa/searchlib/index/schemautil.h +++ b/searchlib/src/vespa/searchlib/index/schemautil.h @@ -119,6 +119,8 @@ public: * @param oldSchema old schema, present in an input index */ bool hasMatchingOldFields(const Schema &oldSchema) const; + + bool has_matching_experimental_posting_list_format(const Schema &oldSchema) const; }; static IndexSettings getIndexSettings(const Schema &schema, const uint32_t index); |