aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-06-19 19:16:00 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-06-20 10:47:58 +0200
commit 38de0304985d85dc9da58e15ad494054bff5d5dc (patch)
tree ddced6a34d60e8b6d3c60df069b8da56fe9e0d01
parent 2f2c641e9aa25476d46ff97846185da05b32d6d7 (diff)
Reconstruct interleaved features as needed.
-rw-r--r-- searchlib/src/tests/diskindex/fusion/fusion_test.cpp 137
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp 44
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.h 4
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposting.cpp 2
-rw-r--r-- searchlib/src/vespa/searchlib/index/schemautil.cpp 14
-rw-r--r-- searchlib/src/vespa/searchlib/index/schemautil.h 2
6 files changed, 168 insertions, 35 deletions
diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
index 4779ddcb10d..da29918a5fb 100644
--- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
+++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
@@ -59,8 +59,8 @@ protected:
const Schema & getSchema() const { return _schema; }
void requireThatFusionIsWorking(const vespalib::string &prefix, bool directio, bool readmmap);
- void make_empty_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector);
- void merge_empty_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources);
+ void make_simple_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector);
+ void merge_simple_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources);
public:
FusionTest();
};
@@ -97,6 +97,72 @@ toString(FieldPositionsIterator posItr, bool hasElements = false, bool hasWeight
return ss.str();
}
+std::unique_ptr<Document>
+make_doc10(DocBuilder &b) // Builds and returns test document "doc::10" with index fields f0..f3 populated through b.
+{
+ b.startDocument("doc::10");
+ b.startIndexField("f0"). // 7 strings
+ addStr("a").addStr("b").addStr("c").addStr("d").
+ addStr("e").addStr("f").addStr("z").
+ endField();
+ b.startIndexField("f1"). // 4 strings
+ addStr("w").addStr("x").
+ addStr("y").addStr("z").
+ endField();
+ b.startIndexField("f2"). // two elements: 3 strings, then 1 string ("ax" occurs in both)
+ startElement(4).addStr("ax").addStr("ay").addStr("z").endElement().
+ startElement(5).addStr("ax").endElement().
+ endField();
+ b.startIndexField("f3"). // one element with 2 strings
+ startElement(4).addStr("wx").addStr("z").endElement().
+ endField();
+
+ return b.endDocument();
+}
+// Returns a STRING index field with the given name and collection type; interleaved_features is forwarded to set_experimental_posting_list_format().
+Schema::IndexField
+make_index_field(vespalib::stringref name, CollectionType collection_type, bool interleaved_features)
+{
+ Schema::IndexField index_field(name, DataType::STRING, collection_type);
+ index_field.set_experimental_posting_list_format(interleaved_features);
+ return index_field;
+}
+// Returns a schema with index fields f0 and f1 (SINGLE), f2 (ARRAY) and f3 (WEIGHTEDSET), all created with the same interleaved_features setting.
+Schema
+make_schema(bool interleaved_features)
+{
+ Schema schema;
+ schema.addIndexField(make_index_field("f0", CollectionType::SINGLE, interleaved_features));
+ schema.addIndexField(make_index_field("f1", CollectionType::SINGLE, interleaved_features));
+ schema.addIndexField(make_index_field("f2", CollectionType::ARRAY, interleaved_features));
+ schema.addIndexField(make_index_field("f3", CollectionType::WEIGHTEDSET, interleaved_features));
+ return schema;
+}
+// Test helper: looks up term in the named field of d, seeks the posting iterator to doc_id, unpacks, and checks the unpacked occurrence count and field length against the expected values.
+void
+assert_interleaved_features(DiskIndex &d, const vespalib::string &field, const vespalib::string &term, uint32_t doc_id, uint32_t exp_num_occs, uint32_t exp_field_length)
+{
+ using LookupResult = DiskIndex::LookupResult;
+ using PostingListHandle = index::PostingListHandle;
+ using SearchIterator = search::queryeval::SearchIterator;
+
+ const Schema &schema = d.getSchema();
+ uint32_t field_id(schema.getIndexFieldId(field));
+ std::unique_ptr<LookupResult> lookup_result(d.lookup(field_id, term));
+ ASSERT_TRUE(lookup_result); // fatal gtest assertion: on failure it returns from this helper only, not from the calling test
+ std::unique_ptr<PostingListHandle> handle(d.readPostingList(*lookup_result));
+ ASSERT_TRUE(handle); // same: guards the dereference below
+ TermFieldMatchData tfmd;
+ TermFieldMatchDataArray tfmda;
+ tfmda.add(&tfmd); // the iterator unpacks into tfmd through this array
+ std::unique_ptr<SearchIterator> sbap(handle->createIterator(lookup_result->counts, tfmda));
+ sbap->initFullRange();
+ EXPECT_TRUE(sbap->seek(doc_id)); // non-fatal: unpack below still runs if the seek misses
+ sbap->unpack(doc_id);
+ EXPECT_EQ(exp_num_occs, tfmd.getNumOccs());
+ EXPECT_EQ(exp_field_length, tfmd.getFieldLength());
+}
+
void
validateDiskIndex(DiskIndex &dw, bool f2HasElements, bool f3HasWeights)
{
@@ -253,24 +319,7 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire
DocumentInverter inv(schema, invertThreads, pushThreads, fic);
Document::UP doc;
- b.startDocument("doc::10");
- b.startIndexField("f0").
- addStr("a").addStr("b").addStr("c").addStr("d").
- addStr("e").addStr("f").addStr("z").
- endField();
- b.startIndexField("f1").
- addStr("w").addStr("x").
- addStr("y").addStr("z").
- endField();
- b.startIndexField("f2").
- startElement(4).addStr("ax").addStr("ay").addStr("z").endElement().
- startElement(5).addStr("ax").endElement().
- endField();
- b.startIndexField("f3").
- startElement(4).addStr("wx").addStr("z").endElement().
- endField();
-
- doc = b.endDocument();
+ doc = make_doc10(b);
inv.invertDocument(10, *doc);
invertThreads.sync();
myPushDocument(inv);
@@ -400,11 +449,21 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire
}
void
-FusionTest::make_empty_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector)
+FusionTest::make_simple_index(const vespalib::string &dump_dir, const IFieldLengthInspector &field_length_inspector)
{
FieldIndexCollection fic(_schema, field_length_inspector);
- uint32_t numDocs = 1;
- uint32_t numWords = 1;
+ uint32_t numDocs = 20;
+ uint32_t numWords = 1000;
+ DocBuilder b(_schema);
+ SequencedTaskExecutor invertThreads(2);
+ SequencedTaskExecutor pushThreads(2);
+ DocumentInverter inv(_schema, invertThreads, pushThreads, fic);
+
+ inv.invertDocument(10, *make_doc10(b));
+ invertThreads.sync();
+ myPushDocument(inv);
+ pushThreads.sync();
+
IndexBuilder ib(_schema);
TuneFileIndexing tuneFileIndexing;
DummyFileHeaderContext fileHeaderContext;
@@ -415,12 +474,12 @@ FusionTest::make_empty_index(const vespalib::string &dump_dir, const IFieldLengt
}
void
-FusionTest::merge_empty_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources)
+FusionTest::merge_simple_indexes(const vespalib::string &dump_dir, const std::vector<vespalib::string> &sources)
{
vespalib::ThreadStackExecutor executor(4, 0x10000);
TuneFileIndexing tuneFileIndexing;
DummyFileHeaderContext fileHeaderContext;
- SelectorArray selector(1, 0);
+ SelectorArray selector(20, 0);
ASSERT_TRUE(Fusion::merge(_schema, dump_dir, sources, selector,
false,
tuneFileIndexing, fileHeaderContext, executor));
@@ -428,12 +487,8 @@ FusionTest::merge_empty_indexes(const vespalib::string &dump_dir, const std::vec
FusionTest::FusionTest()
: ::testing::Test(),
- _schema()
+ _schema(make_schema(false))
{
- _schema.addIndexField(Schema::IndexField("f0", DataType::STRING));
- _schema.addIndexField(Schema::IndexField("f1", DataType::STRING));
- _schema.addIndexField(Schema::IndexField("f2", DataType::STRING, CollectionType::ARRAY));
- _schema.addIndexField(Schema::IndexField("f3", DataType::STRING, CollectionType::WEIGHTEDSET));
}
TEST_F(FusionTest, require_that_normal_fusion_is_working)
@@ -470,15 +525,31 @@ void clean_field_length_testdirs()
TEST_F(FusionTest, require_that_average_field_length_is_preserved)
{
clean_field_length_testdirs();
- make_empty_index("fldump2", MockFieldLengthInspector());
- make_empty_index("fldump3", MyMockFieldLengthInspector());
- merge_empty_indexes("fldump4", {"fldump2", "fldump3"});
+ make_simple_index("fldump2", MockFieldLengthInspector());
+ make_simple_index("fldump3", MyMockFieldLengthInspector());
+ merge_simple_indexes("fldump4", {"fldump2", "fldump3"});
DiskIndex disk_index("fldump4");
ASSERT_TRUE(disk_index.setup(TuneFileSearch()));
EXPECT_EQ(3.5, disk_index.get_field_length_info("f0").get_average_field_length());
clean_field_length_testdirs();
}
+TEST_F(FusionTest, require_that_interleaved_features_can_be_reconstructed) // Merges an index built without interleaved features into one that wants them and checks the regenerated values.
+{
+ clean_field_length_testdirs();
+ make_simple_index("fldump2", MockFieldLengthInspector()); // source index built with the fixture schema, i.e. make_schema(false)
+ _schema = make_schema(true); // want interleaved features
+ merge_simple_indexes("fldump4", {"fldump2"}); // merge uses the updated _schema
+ DiskIndex disk_index("fldump4");
+ ASSERT_TRUE(disk_index.setup(TuneFileSearch()));
+ assert_interleaved_features(disk_index, "f0", "a", 10, 1, 7); // f0 holds 7 strings in doc 10
+ assert_interleaved_features(disk_index, "f1", "w", 10, 1, 4); // f1 holds 4 strings in doc 10
+ assert_interleaved_features(disk_index, "f2", "ax", 10, 2, 4); // "ax" occurs in both elements: lengths 3 + 1
+ assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 3); // "ay" occurs only in the first element; elements without occurrences are not counted
+ assert_interleaved_features(disk_index, "f3", "wx", 10, 1, 2); // single element with 2 strings
+ clean_field_length_testdirs();
+}
+
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
index d3696e2f31c..68d37c43cb2 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
@@ -14,6 +14,7 @@ LOG_SETUP(".diskindex.fieldreader");
namespace {
vespalib::string PosOccIdCooked = "PosOcc.3.Cooked";
+vespalib::string interleaved_features("interleaved_features");
}
@@ -192,12 +193,16 @@ FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index,
{
assert(index.isValid());
if (index.hasMatchingOldFields(oldSchema)) {
- return std::make_unique<FieldReader>(); // The common case
+ if (!index.use_experimental_posting_list_format() ||
+ index.has_matching_experimental_posting_list_format(oldSchema)) {
+ return std::make_unique<FieldReader>(); // The common case
+ }
}
if (!index.hasOldFields(oldSchema)) {
return std::make_unique<FieldReaderEmpty>(index); // drop data
}
// field exists in old schema with different collection type setting
+ // or old field is missing wanted interleaved features.
return std::make_unique<FieldReaderStripInfo>(index); // degraded
}
@@ -230,7 +235,9 @@ FieldReaderEmpty::getFeatureParams(PostingListParams &params)
FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index)
: _hasElements(false),
- _hasElementWeights(false)
+ _hasElementWeights(false),
+ _want_interleaved_features(index.use_experimental_posting_list_format()),
+ _regenerate_interleaved_features(false)
{
PosOccFieldsParams fieldsParams;
fieldsParams.setSchemaParams(index.getSchema(), index.getIndex());
@@ -247,6 +254,26 @@ FieldReaderStripInfo::allowRawFeatures()
return false;
}
+bool
+FieldReaderStripInfo::open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) // Opens via FieldReader::open(), then decides whether read() must regenerate interleaved features.
+{
+ if (!FieldReader::open(prefix, tuneFileRead)) {
+ return false; // base class open failed
+ }
+ if (_want_interleaved_features) {
+ PostingListParams params;
+ bool decode_interleaved_features = false; // NOTE(review): presumably stays false when the old file does not report the parameter - confirm PostingListParams::get()
+ _oldposoccfile->getParams(params);
+ params.get(interleaved_features, decode_interleaved_features);
+ if (!decode_interleaved_features) {
+ _regenerate_interleaved_features = true; // old posting file does not report interleaved features
+ }
+ if (!_hasElements) {
+ _regenerate_interleaved_features = true; // NOTE(review): presumably because stripping elements in read() invalidates stored values - confirm
+ }
+ }
+ return true;
+}
void
FieldReaderStripInfo::read()
@@ -283,6 +310,19 @@ FieldReaderStripInfo::read()
}
break;
}
+ if (_regenerate_interleaved_features) {
+ // Regenerate interleaved features from normal features.
+ uint32_t field_length = 0;
+ uint32_t num_occs = 0;
+ DocIdAndFeatures &features = _docIdAndFeatures;
+ for (const auto &element : features.elements()) {
+ field_length += element.getElementLen();
+ num_occs += element.getNumOccs();
+ }
+ // Note: Length of elements without occurrences is not included.
+ features.set_field_length(field_length);
+ features.set_num_occs(num_occs);
+ }
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
index ee237f5cc69..106e10d2e80 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
@@ -117,15 +117,19 @@ public:
/*
* Field reader that strips information from source, e.g. remove
* weights or discard nonzero elements, due to collection type change.
+ * It is also used to regenerate interleaved features from normal features.
*/
class FieldReaderStripInfo : public FieldReader
{
private:
bool _hasElements;
bool _hasElementWeights;
+ bool _want_interleaved_features; // true when the new schema's field uses the experimental posting list format (interleaved features)
+ bool _regenerate_interleaved_features; // set by open(); when true, read() recomputes field length and occurrence count from the elements
public:
FieldReaderStripInfo(const IndexIterator &index);
bool allowRawFeatures() override;
+ bool open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) override; // also decides whether interleaved features must be regenerated
void read() override;
void getFeatureParams(PostingListParams &params) override;
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
index 3f154c44cb9..b8a0813e33c 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
@@ -123,6 +123,7 @@ Zc4PostingSeqRead::getParams(PostingListParams &params)
params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs);
}
params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs);
+ params.set(interleaved_features, _reader.get_posting_params()._encode_interleaved_features);
}
@@ -357,6 +358,7 @@ getParams(PostingListParams &params)
params.set("minChunkDocs", _writer.get_min_chunk_docs());
}
params.set("minSkipDocs", _writer.get_min_skip_docs());
+ params.set(interleaved_features, _writer.get_encode_interleaved_features());
}
diff --git a/searchlib/src/vespa/searchlib/index/schemautil.cpp b/searchlib/src/vespa/searchlib/index/schemautil.cpp
index 7f3b7c8c2a9..66edd22d72e 100644
--- a/searchlib/src/vespa/searchlib/index/schemautil.cpp
+++ b/searchlib/src/vespa/searchlib/index/schemautil.cpp
@@ -69,6 +69,20 @@ SchemaUtil::IndexIterator::hasMatchingOldFields(const Schema &oldSchema) const
}
bool
+SchemaUtil::IndexIterator::has_matching_experimental_posting_list_format(const Schema &oldSchema) const // True iff oldSchema has an index field with the same name and the same use_experimental_posting_list_format() setting.
+{
+ assert(isValid());
+ const Schema::IndexField &newField = getSchema().getIndexField(getIndex());
+ const vespalib::string &fieldName = newField.getName();
+ uint32_t oldFieldId = oldSchema.getIndexFieldId(fieldName); // lookup in the old schema is by field name
+ if (oldFieldId == Schema::UNKNOWN_FIELD_ID) {
+ return false; // field is not present in the old schema
+ }
+ const Schema::IndexField &oldField = oldSchema.getIndexField(oldFieldId);
+ return (oldField.use_experimental_posting_list_format() == newField.use_experimental_posting_list_format());
+}
+
+bool
SchemaUtil::validateIndexField(const Schema::IndexField &field)
{
bool ok = true;
diff --git a/searchlib/src/vespa/searchlib/index/schemautil.h b/searchlib/src/vespa/searchlib/index/schemautil.h
index 69b79ecfedd..f575b4de9f5 100644
--- a/searchlib/src/vespa/searchlib/index/schemautil.h
+++ b/searchlib/src/vespa/searchlib/index/schemautil.h
@@ -119,6 +119,8 @@ public:
* @param oldSchema old schema, present in an input index
*/
bool hasMatchingOldFields(const Schema &oldSchema) const;
+
+ bool has_matching_experimental_posting_list_format(const Schema &oldSchema) const;
};
static IndexSettings getIndexSettings(const Schema &schema, const uint32_t index);