diff options
Diffstat (limited to 'searchlib/src/tests/diskindex/fusion/fusion_test.cpp')
-rw-r--r-- | searchlib/src/tests/diskindex/fusion/fusion_test.cpp | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp new file mode 100644 index 00000000000..4191a8f8d2b --- /dev/null +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -0,0 +1,506 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("fusion_test"); +#include <vespa/searchlib/diskindex/checkpointfile.h> +#include <vespa/searchlib/diskindex/fusion.h> +#include <vespa/searchlib/diskindex/indexbuilder.h> +#include <vespa/searchlib/diskindex/zcposoccrandread.h> +#include <vespa/searchlib/fef/fieldpositionsiterator.h> +#include <vespa/searchlib/fef/termfieldmatchdata.h> +#include <vespa/searchlib/fef/termfieldmatchdataarray.h> +#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/indexbuilder.h> +#include <vespa/searchlib/index/schemautil.h> +#include <vespa/searchlib/btree/btreeroot.hpp> +#include <vespa/searchlib/btree/btreenodeallocator.hpp> +#include <vespa/searchlib/btree/btreenode.hpp> +#include <vespa/searchlib/memoryindex/dictionary.h> +#include <vespa/searchlib/memoryindex/documentinverter.h> +#include <vespa/searchlib/memoryindex/featurestore.h> +#include <vespa/searchlib/memoryindex/postingiterator.h> +#include <vespa/searchlib/memoryindex/i_document_insert_listener.h> +#include <vespa/searchlib/diskindex/diskindex.h> +#include <vespa/vespalib/objects/nbostream.h> +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/util/filekit.h> +#include <vespa/searchlib/common/sequencedtaskexecutor.h> + +namespace search +{ + + +using document::Document; +using fef::FieldPositionsIterator; +using fef::TermFieldMatchData; +using fef::TermFieldMatchDataArray; +using index::DocBuilder; +using index::DocIdAndFeatures; +using index::Schema; +using index::SchemaUtil; +using search::common::FileHeaderContext; +using search::index::DummyFileHeaderContext; +using memoryindex::Dictionary; +using memoryindex::DocumentInverter; +using queryeval::SearchIterator; + +namespace diskindex +{ + + +class Test : public vespalib::TestApp +{ +private: + Schema _schema; + const Schema & getSchema() const { return _schema; } + + void + requireThatFusionIsWorking(const vespalib::string &prefix, + bool directio, + bool readmmap); + +public: + Test(); + int Main(); +}; + + +namespace +{ + +void +myPushDocument(DocumentInverter &inv, Dictionary &d) +{ + inv.pushDocuments(d, std::shared_ptr<IDestructorCallback>()); +} + + +} + +vespalib::string +toString(FieldPositionsIterator posItr, + bool hasElements = false, bool hasWeights = false) +{ + vespalib::asciistream ss; + ss << "{"; + ss << posItr.getFieldLength() << ":"; + bool first = true; + for (; posItr.valid(); posItr.next()) { + if (!first) ss << ","; + ss << posItr.getPosition(); + first = false; + if (hasElements) { + ss << "[e=" << posItr.getElementId(); + if (hasWeights) + ss << ",w=" << posItr.getElementWeight(); + ss << ",l=" << posItr.getElementLen() << "]"; + } + } + ss << "}"; + return ss.str(); +} + + +#if 0 +vespalib::string +toString(DocIdAndFeatures &features) +{ + vespalib::asciistream ss; + ss << "{"; + std::vector<search::index::WordDocFieldElementFeatures>::const_iterator + element = features._elements.begin(); + std::vector<search::index::WordDocFieldElementWordPosFeatures>:: + const_iterator position = features._wordPositions.begin(); + for (; field != fielde; ++field) { + ss << "f=" << field->getFieldId() << "{"; + uint32_t numElements = field->getNumElements(); + while (numElements--) { + ss << "e=" << element->getElementId() << "," + << "ew=" << element->getWeight() << "," + << "el=" << element->getElementLen() << "{"; + uint32_t numOccs = element->getNumOccs(); + while (numOccs--) { + ss << position->getWordPos(); + if (numOccs != 0) + ss << ","; + } + ss << "}"; + if (numElements != 0) + ss << ","; + } + ss << "}"; + } + ss << "}"; + return ss.str(); +} +#endif + + +void +validateDiskIndex(DiskIndex &dw, + bool f2HasElements, + bool f3HasWeights) +{ + typedef DiskIndex::LookupResult LR; + typedef index::PostingListHandle PH; + typedef search::queryeval::SearchIterator SB; + + const Schema &schema(dw.getSchema()); + + { + uint32_t id1(schema.getIndexFieldId("f0")); + LR::UP lr1(dw.lookup(id1, "c")); + EXPECT_TRUE(lr1.get() != NULL); + PH::UP wh1(dw.readPostingList(*lr1)); + EXPECT_TRUE(wh1.get() != NULL); + TermFieldMatchData f0; + TermFieldMatchDataArray a; + a.add(&f0); + SB::UP sbap(wh1->createIterator(lr1->counts, a)); + sbap->initFullRange(); + EXPECT_EQUAL("{1000000:}", toString(f0.getIterator())); + EXPECT_TRUE(sbap->seek(10)); + sbap->unpack(10); + EXPECT_EQUAL("{7:2}", toString(f0.getIterator())); + } + { + uint32_t id1(schema.getIndexFieldId("f2")); + LR::UP lr1(dw.lookup(id1, "ax")); + EXPECT_TRUE(lr1.get() != NULL); + PH::UP wh1(dw.readPostingList(*lr1)); + EXPECT_TRUE(wh1.get() != NULL); + TermFieldMatchData f2; + TermFieldMatchDataArray a; + a.add(&f2); + SB::UP sbap(wh1->createIterator(lr1->counts, a)); + sbap->initFullRange(); + EXPECT_EQUAL("{1000000:}", toString(f2.getIterator())); + EXPECT_TRUE(sbap->seek(10)); + sbap->unpack(10); + if (f2HasElements) { + EXPECT_EQUAL("{3:0[e=0,l=3],0[e=1,l=1]}", + toString(f2.getIterator(), true)); + } else { + EXPECT_EQUAL("{3:0[e=0,l=3]}", + toString(f2.getIterator(), true)); + } + } + { + uint32_t id1(schema.getIndexFieldId("f3"));; + LR::UP lr1(dw.lookup(id1, "wx")); + EXPECT_TRUE(lr1.get() != NULL); + PH::UP wh1(dw.readPostingList(*lr1)); + EXPECT_TRUE(wh1.get() != NULL); + TermFieldMatchData f3; + TermFieldMatchDataArray a; + a.add(&f3); + SB::UP sbap(wh1->createIterator(lr1->counts, a)); + sbap->initFullRange(); + EXPECT_EQUAL("{1000000:}", toString(f3.getIterator())); + EXPECT_TRUE(sbap->seek(10)); + sbap->unpack(10); + if (f3HasWeights) { + EXPECT_EQUAL("{2:0[e=0,w=4,l=2]}", + toString(f3.getIterator(), true, true)); + } else { + EXPECT_EQUAL("{2:0[e=0,w=1,l=2]}", + toString(f3.getIterator(), true, true)); + } + } + { + uint32_t id1(schema.getIndexFieldId("f3"));; + LR::UP lr1(dw.lookup(id1, "zz")); + EXPECT_TRUE(lr1.get() != NULL); + PH::UP wh1(dw.readPostingList(*lr1)); + EXPECT_TRUE(wh1.get() != NULL); + TermFieldMatchData f3; + TermFieldMatchDataArray a; + a.add(&f3); + SB::UP sbap(wh1->createIterator(lr1->counts, a)); + sbap->initFullRange(); + EXPECT_EQUAL("{1000000:}", toString(f3.getIterator())); + EXPECT_TRUE(sbap->seek(11)); + sbap->unpack(11); + if (f3HasWeights) { + EXPECT_EQUAL("{1:0[e=0,w=-27,l=1]}", + toString(f3.getIterator(), true, true)); + } else { + EXPECT_EQUAL("{1:0[e=0,w=1,l=1]}", + toString(f3.getIterator(), true, true)); + } + } + { + uint32_t id1(schema.getIndexFieldId("f3"));; + LR::UP lr1(dw.lookup(id1, "zz0")); + EXPECT_TRUE(lr1.get() != NULL); + PH::UP wh1(dw.readPostingList(*lr1)); + EXPECT_TRUE(wh1.get() != NULL); + TermFieldMatchData f3; + TermFieldMatchDataArray a; + a.add(&f3); + SB::UP sbap(wh1->createIterator(lr1->counts, a)); + sbap->initFullRange(); + EXPECT_EQUAL("{1000000:}", toString(f3.getIterator())); + EXPECT_TRUE(sbap->seek(12)); + sbap->unpack(12); + if (f3HasWeights) { + EXPECT_EQUAL("{1:0[e=0,w=0,l=1]}", + toString(f3.getIterator(), true, true)); + } else { + EXPECT_EQUAL("{1:0[e=0,w=1,l=1]}", + toString(f3.getIterator(), true, true)); + } + } +} + + +void +Test::requireThatFusionIsWorking(const vespalib::string &prefix, + bool directio, + bool readmmap) +{ + Schema schema; + Schema schema2; + Schema schema3; + for (SchemaUtil::IndexIterator it(getSchema()); it.isValid(); ++it) { + const Schema::IndexField &iField = + _schema.getIndexField(it.getIndex()); + schema.addIndexField(Schema::IndexField(iField.getName(), + iField.getDataType(), + iField.getCollectionType())); + if (iField.getCollectionType() == Schema::WEIGHTEDSET) + schema2.addIndexField(Schema::IndexField(iField.getName(), + iField.getDataType(), + Schema::ARRAY)); + else + schema2.addIndexField(Schema::IndexField(iField.getName(), + iField.getDataType(), + iField.getCollectionType())); + schema3.addIndexField(Schema::IndexField(iField.getName(), + iField.getDataType(), + Schema::SINGLE)); + } + schema3.addIndexField(Schema::IndexField("f4", + Schema::STRING)); + schema.addFieldSet(Schema::FieldSet("nc0"). + addField("f0").addField("f1")); + schema2.addFieldSet(Schema::FieldSet("nc0"). + addField("f1").addField("f0")); + schema3.addFieldSet(Schema::FieldSet("nc2"). + addField("f0").addField("f1"). + addField("f2").addField("f3"). + addField("f4")); + Dictionary d(schema); + DocBuilder b(schema); + SequencedTaskExecutor invertThreads(2); + SequencedTaskExecutor pushThreads(2); + DocumentInverter inv(schema, invertThreads, pushThreads); + Document::UP doc; + + b.startDocument("doc::10"); + b.startIndexField("f0"). + addStr("a").addStr("b").addStr("c").addStr("d"). + addStr("e").addStr("f").addStr("z"). + endField(); + b.startIndexField("f1"). + addStr("w").addStr("x"). + addStr("y").addStr("z"). + endField(); + b.startIndexField("f2"). + startElement(4).addStr("ax").addStr("ay").addStr("z").endElement(). + startElement(5).addStr("ax").endElement(). + endField(); + b.startIndexField("f3"). + startElement(4).addStr("wx").addStr("z").endElement(). + endField(); + + doc = b.endDocument(); + inv.invertDocument(10, *doc); + invertThreads.sync(); + myPushDocument(inv, d); + pushThreads.sync(); + + b.startDocument("doc::11"). + startIndexField("f3"). + startElement(-27).addStr("zz").endElement(). + endField(); + doc = b.endDocument(); + inv.invertDocument(11, *doc); + invertThreads.sync(); + myPushDocument(inv, d); + pushThreads.sync(); + + b.startDocument("doc::12"). + startIndexField("f3"). + startElement(0).addStr("zz0").endElement(). + endField(); + doc = b.endDocument(); + inv.invertDocument(12, *doc); + invertThreads.sync(); + myPushDocument(inv, d); + pushThreads.sync(); + + IndexBuilder ib(schema); + vespalib::string dump2dir = prefix + "dump2"; + ib.setPrefix(dump2dir); + uint32_t numDocs = 12 + 1; + uint32_t numWords = d.getNumUniqueWords(); + bool dynamicKPosOcc = false; + TuneFileIndexing tuneFileIndexing; + TuneFileSearch tuneFileSearch; + DummyFileHeaderContext fileHeaderContext; + if (directio) { + tuneFileIndexing._read.setWantDirectIO(); + tuneFileIndexing._write.setWantDirectIO(); + tuneFileSearch._read.setWantDirectIO(); + } + if (readmmap) + tuneFileSearch._read.setWantMemoryMap(); + ib.open(numDocs, numWords, tuneFileIndexing, fileHeaderContext); + d.dump(ib); + ib.close(); + + vespalib::string tsName = dump2dir + "/.teststamp"; + typedef search::FileKit FileKit; + EXPECT_TRUE(FileKit::createStamp(tsName)); + EXPECT_TRUE(FileKit::hasStamp(tsName)); + EXPECT_TRUE(FileKit::removeStamp(tsName)); + EXPECT_FALSE(FileKit::hasStamp(tsName)); + + do { + DiskIndex dw2(prefix + "dump2"); + if (!EXPECT_TRUE(dw2.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw2, true, true)); + } while (0); + + do { + std::vector<vespalib::string> sources; + SelectorArray selector(numDocs, 0); + sources.push_back(prefix + "dump2"); + if (!EXPECT_TRUE(Fusion::merge(schema, + prefix + "dump3", + sources, selector, + dynamicKPosOcc, + tuneFileIndexing, + fileHeaderContext))) + return; + } while (0); + do { + DiskIndex dw3(prefix + "dump3"); + if (!EXPECT_TRUE(dw3.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw3, true, true)); + } while (0); + do { + std::vector<vespalib::string> sources; + SelectorArray selector(numDocs, 0); + sources.push_back(prefix + "dump3"); + if (!EXPECT_TRUE(Fusion::merge(schema2, + prefix + "dump4", + sources, selector, + dynamicKPosOcc, + tuneFileIndexing, + fileHeaderContext))) + return; + } while (0); + do { + DiskIndex dw4(prefix + "dump4"); + if (!EXPECT_TRUE(dw4.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw4, true, false)); + } while (0); + do { + std::vector<vespalib::string> sources; + SelectorArray selector(numDocs, 0); + sources.push_back(prefix + "dump3"); + if (!EXPECT_TRUE(Fusion::merge(schema3, + prefix + "dump5", + sources, selector, + dynamicKPosOcc, + tuneFileIndexing, + fileHeaderContext))) + return; + } while (0); + do { + DiskIndex dw5(prefix + "dump5"); + if (!EXPECT_TRUE(dw5.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw5, false, false)); + } while (0); + do { + std::vector<vespalib::string> sources; + SelectorArray selector(numDocs, 0); + sources.push_back(prefix + "dump3"); + if (!EXPECT_TRUE(Fusion::merge(schema, + prefix + "dump6", + sources, selector, + !dynamicKPosOcc, + tuneFileIndexing, + fileHeaderContext))) + return; + } while (0); + do { + DiskIndex dw6(prefix + "dump6"); + if (!EXPECT_TRUE(dw6.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw6, true, true)); + } while (0); + do { + std::vector<vespalib::string> sources; + SelectorArray selector(numDocs, 0); + sources.push_back(prefix + "dump2"); + if (!EXPECT_TRUE(Fusion::merge(schema, + prefix + "dump3", + sources, selector, + dynamicKPosOcc, + tuneFileIndexing, + fileHeaderContext))) + return; + } while (0); + do { + DiskIndex dw3(prefix + "dump3"); + if (!EXPECT_TRUE(dw3.setup(tuneFileSearch))) + break; + TEST_DO(validateDiskIndex(dw3, true, true)); + } while (0); +} + + +Test::Test() + : _schema() +{ + _schema.addIndexField(Schema::IndexField("f0", Schema::STRING)); + _schema.addIndexField(Schema::IndexField("f1", Schema::STRING)); + _schema.addIndexField(Schema::IndexField("f2", Schema::STRING, + Schema::ARRAY)); + _schema.addIndexField(Schema::IndexField("f3", Schema::STRING, + Schema::WEIGHTEDSET)); +} + + +int +Test::Main() +{ + TEST_INIT("fusion_test"); + + if (_argc > 0) { + DummyFileHeaderContext::setCreator(_argv[0]); + } + + TEST_DO(requireThatFusionIsWorking("", false, false)); + TEST_DO(requireThatFusionIsWorking("d", true, false)); + TEST_DO(requireThatFusionIsWorking("m", false, true)); + TEST_DO(requireThatFusionIsWorking("dm", true, true)); + + TEST_DONE(); +} + +} + + +} + + +TEST_APPHOOK(search::diskindex::Test); |