diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /searchlib/src/tests/postinglistbm |
Publish
Diffstat (limited to 'searchlib/src/tests/postinglistbm')
-rw-r--r-- | searchlib/src/tests/postinglistbm/.gitignore | 10 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/CMakeLists.txt | 10 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/andstress.cpp | 536 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/andstress.h | 43 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/postinglistbm.cpp | 491 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/skip.txt | 75 |
6 files changed, 1165 insertions, 0 deletions
diff --git a/searchlib/src/tests/postinglistbm/.gitignore b/searchlib/src/tests/postinglistbm/.gitignore new file mode 100644 index 00000000000..ac71dde13e2 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/.gitignore @@ -0,0 +1,10 @@ +*.core +*.ilk +*.pdb +.depend +Makefile +core +core.* +postinglistbm +postinglistbm.exe +searchlib_postinglistbm_app diff --git a/searchlib/src/tests/postinglistbm/CMakeLists.txt b/searchlib/src/tests/postinglistbm/CMakeLists.txt new file mode 100644 index 00000000000..403c12da1b1 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_postinglistbm_app + SOURCES + postinglistbm.cpp + andstress.cpp + DEPENDS + searchlib_test + searchlib +) +vespa_add_test(NAME searchlib_postinglistbm_app NO_VALGRIND COMMAND searchlib_postinglistbm_app -q -a) diff --git a/searchlib/src/tests/postinglistbm/andstress.cpp b/searchlib/src/tests/postinglistbm/andstress.cpp new file mode 100644 index 00000000000..f3fabde0d61 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/andstress.cpp @@ -0,0 +1,536 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".andstress"); +#include "andstress.h" +#include <vector> + +#include <vespa/searchlib/common/bitvector.h> +#include <vespa/searchlib/test/fakedata/fakeword.h> +#include <vespa/searchlib/test/fakedata/fakewordset.h> +#include <vespa/searchlib/test/fakedata/fakeposting.h> +#include <vespa/searchlib/test/fakedata/fakefilterocc.h> +#include <vespa/searchlib/test/fakedata/fakeegcompr64filterocc.h> +#include <vespa/searchlib/test/fakedata/fakezcfilterocc.h> +#include <vespa/searchlib/test/fakedata/fakezcbfilterocc.h> +#include <vespa/searchlib/test/fakedata/fpfactory.h> + +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataArray; +using search::queryeval::SearchIterator; +using namespace search::fakedata; + +namespace postinglistbm +{ + +class AndStressWorker; + +class AndStressMaster +{ +private: + AndStressMaster(const AndStressMaster &); + + AndStressMaster & + operator=(const AndStressMaster &); + + search::Rand48 &_rnd; + unsigned int _numDocs; + unsigned int _commonDocFreq; + std::vector<std::string> _postingTypes; + unsigned int _loops; + unsigned int _skipCommonPairsRate; + uint32_t _stride; + bool _unpack; + + FastOS_ThreadPool *_threadPool; + std::vector<AndStressWorker *> _workers; + unsigned int _workersDone; + + FakeWordSet &_wordSet; + + std::vector<std::vector<FakePosting::SP> > _postings; + + FastOS_Cond _taskCond; + unsigned int _taskIdx; + uint32_t _numTasks; + +public: + typedef std::pair<FakePosting *, FakePosting *> Task; +private: + std::vector<Task> _tasks; +public: + AndStressMaster(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingType, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack); + + ~AndStressMaster(void); + + void + run(void); + + void + makePostingsHelper(FPFactory *postingFactory, + 
const std::string &postingFormat, + bool validate, bool verbose); + + void + dropPostings(void); + + void + dropTasks(void); + + void + resetTasks(void); // Prepare for rerun + + void + setupTasks(unsigned int numTasks); + + Task * + getTask(void); + + unsigned int + getNumDocs(void) const + { + return _numDocs; + } + + bool + getUnpack(void) const + { + return _unpack; + } + + double + runWorkers(const std::string &postingFormat); +}; + + +class AndStressWorker : public FastOS_Runnable +{ +private: + AndStressWorker(const AndStressWorker &); + + AndStressWorker & + operator=(const AndStressWorker &); + + AndStressMaster &_master; + unsigned int _id; +public: + AndStressWorker(AndStressMaster &master, unsigned int id); + + ~AndStressWorker(void); + + virtual void + Run(FastOS_ThreadInterface *thisThread, void *arg); +}; + + +template <class P> +FakePosting * +makePosting(FakeWord &fw) +{ + return new P(fw); +} + + +AndStressMaster::AndStressMaster(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack) + : _rnd(rnd), + _numDocs(numDocs), + _commonDocFreq(commonDocFreq), + _postingTypes(postingTypes), + _loops(loops), + _skipCommonPairsRate(skipCommonPairsRate), + _stride(stride), + _unpack(unpack), + _threadPool(NULL), + _workers(), + _workersDone(0), + _wordSet(wordSet), + _postings(FakeWordSet::NUM_WORDCLASSES), + _taskCond(), + _taskIdx(0), + _numTasks(numTasks), + _tasks() +{ + LOG(info, "AndStressMaster::AndStressMaster"); + + _threadPool = new FastOS_ThreadPool(128 * 1024, 400); +} + +template <class C> +static void +clearPtrVector(std::vector<C> &v) +{ + for (unsigned int i = 0; i < v.size(); ++i) + delete v[i]; + v.clear(); +} + + +AndStressMaster::~AndStressMaster(void) +{ + LOG(info, "AndStressMaster::~AndStressMaster"); + + _threadPool->Close(); + delete 
_threadPool; + _threadPool = NULL; + clearPtrVector(_workers); + dropPostings(); +} + + +void +AndStressMaster::dropPostings(void) +{ + for (unsigned int i = 0; i < _postings.size(); ++i) + _postings[i].clear(); + dropTasks(); +} + + +void +AndStressMaster::dropTasks(void) +{ + _tasks.clear(); + _taskIdx = 0; +} + + +void +AndStressMaster::resetTasks(void) +{ + _taskIdx = 0; +} + + +static void +makeSomePostings(FPFactory *postingFactory, + std::vector<FakeWord *> &w, + std::vector<FakePosting::SP> &p, + uint32_t stride, + bool validate, + bool verbose) +{ + for (unsigned int i = 0; i < w.size(); ++i) { + FakePosting::SP np(postingFactory->make(*w[i])); + if (validate) { + TermFieldMatchData md; + TermFieldMatchDataArray tfmda; + tfmda.add(&md); + + std::unique_ptr<SearchIterator> sb(np->createIterator(tfmda)); + if (np->hasWordPositions()) { + if (stride != 0) + w[i]->validate(sb.get(), tfmda, stride, verbose); + else + w[i]->validate(sb.get(), tfmda, verbose); + } else + w[i]->validate(sb.get(), verbose); + } + p.push_back(np); + } +} + +void +AndStressMaster::makePostingsHelper(FPFactory *postingFactory, + const std::string &postingFormat, + bool validate, bool verbose) +{ + FastOS_Time tv; + double before; + double after; + + tv.SetNow(); + before = tv.Secs(); + postingFactory->setup(_wordSet); + for (unsigned int i = 0; i < _wordSet._words.size(); ++i) + makeSomePostings(postingFactory, + _wordSet._words[i], _postings[i], + _stride, + validate, + verbose); + tv.SetNow(); + after = tv.Secs(); + LOG(info, + "AndStressMaster::makePostingsHelper elapsed %10.6f s for %s format", + after - before, + postingFormat.c_str()); +} + + +void +AndStressMaster::setupTasks(unsigned int numTasks) +{ + unsigned int wordclass1; + unsigned int wordclass2; + unsigned int word1idx; + unsigned int word2idx; + + for (unsigned int i = 0; i < numTasks; ++i) { + wordclass1 = _rnd.lrand48() % _postings.size(); + wordclass2 = _rnd.lrand48() % _postings.size(); + while (wordclass1 == 
FakeWordSet::COMMON_WORD && + wordclass2 == FakeWordSet::COMMON_WORD && + (_rnd.lrand48() % _skipCommonPairsRate) != 0) { + wordclass1 = _rnd.lrand48() % _postings.size(); + wordclass2 = _rnd.lrand48() % _postings.size(); + } + word1idx = _rnd.lrand48() % _postings[wordclass1].size(); + word2idx = _rnd.lrand48() % _postings[wordclass2].size(); + FakePosting::SP p1 = _postings[wordclass1][word1idx]; + FakePosting::SP p2 = _postings[wordclass2][word2idx]; + _tasks.push_back(std::make_pair(p1.get(), p2.get())); + } +} + + +AndStressMaster::Task * +AndStressMaster::getTask(void) +{ + Task *result = NULL; + _taskCond.Lock(); + if (_taskIdx < _tasks.size()) { + result = &_tasks[_taskIdx]; + ++_taskIdx; + } else { + _workersDone++; + if (_workersDone == _workers.size()) + _taskCond.Broadcast(); + } + _taskCond.Unlock(); + return result; +} + +void +AndStressMaster::run(void) +{ + LOG(info, "AndStressMaster::run"); + + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = _postingTypes.end() ; + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + std::unique_ptr<FPFactory> ff(getFPFactory(*pti, _wordSet.getSchema())); + makePostingsHelper(ff.get(), *pti, true, false); + setupTasks(_numTasks); + double totalTime = 0; + for (unsigned int loop = 0; loop < _loops; ++loop) { + totalTime += runWorkers(*pti); + resetTasks(); + } + LOG(info, "AndStressMaster::average run elapsed %10.6f s for workers %s format", + totalTime / _loops, pti->c_str()); + dropPostings(); + } + FastOS_Thread::Sleep(250); +} + + +double +AndStressMaster::runWorkers(const std::string &postingFormat) +{ + FastOS_Time tv; + double before; + double after; + + tv.SetNow(); + before = tv.Secs(); + unsigned int numWorkers = 8; + for (unsigned int i = 0; i < numWorkers; ++i) + _workers.push_back(new AndStressWorker(*this, i)); + + for (unsigned int i = 0; i < _workers.size(); ++i) + _threadPool->NewThread(_workers[i]); + _taskCond.Lock(); + while (_workersDone < 
_workers.size()) + _taskCond.Wait(); + _taskCond.Unlock(); + tv.SetNow(); + after = tv.Secs(); + LOG(info, + "AndStressMaster::run elapsed %10.6f s for workers %s format", + after - before, + postingFormat.c_str()); + clearPtrVector(_workers); + _workersDone = 0; + return after - before; +} + + +AndStressWorker::AndStressWorker(AndStressMaster &master, unsigned int id) + : _master(master), + _id(id) +{ + LOG(debug, "AndStressWorker::AndStressWorker, id=%u", id); +} + +AndStressWorker::~AndStressWorker(void) +{ + LOG(debug, "AndStressWorker::~AndStressWorker, id=%u", _id); +} + + +static int +highLevelAndPairPostingScan(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScanUnpack(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, + uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + sb1.unpack(docId); + sb2.unpack(docId); + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + +void +testFakePair(FakePosting &f1, FakePosting &f2, unsigned int numDocs, + bool unpack) 
+{ + TermFieldMatchData md1; + TermFieldMatchDataArray tfmda1; + tfmda1.add(&md1); + std::unique_ptr<SearchIterator> sb1(f1.createIterator(tfmda1)); + + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda1.add(&md2); + std::unique_ptr<SearchIterator> sb2(f2.createIterator(tfmda2)); + + int hits = 0; + uint64_t scanUnpackTime = 0; + if (unpack) + hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(), + numDocs, &scanUnpackTime); + else + hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(), + numDocs, &scanUnpackTime); +#if 0 + printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n", + f1.getName().c_str(), + f2.getName().c_str(), + hits, + scanUnpackTime); +#else + (void)hits; +#endif +} + +void +AndStressWorker::Run(FastOS_ThreadInterface *thisThread, void *arg) +{ + (void) thisThread; + (void) arg; + LOG(debug, "AndStressWorker::Run, id=%u", _id); + + bool unpack = _master.getUnpack(); + for (;;) { + AndStressMaster::Task *task = _master.getTask(); + if (task == NULL) + break; + testFakePair(*task->first, *task->second, _master.getNumDocs(), + unpack); + } +} + + +AndStress::AndStress(void) +{ + LOG(debug, "Andstress::AndStress"); +} + + +AndStress::~AndStress(void) +{ + LOG(debug, "Andstress::~AndStress"); +} + +void +AndStress::run(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack) +{ + LOG(debug, "Andstress::run"); + AndStressMaster master(rnd, wordSet, + numDocs, commonDocFreq, postingTypes, loops, + skipCommonPairsRate, + numTasks, + stride, + unpack); + master.run(); +} + +} diff --git a/searchlib/src/tests/postinglistbm/andstress.h b/searchlib/src/tests/postinglistbm/andstress.h new file mode 100644 index 00000000000..458866b09d5 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/andstress.h @@ -0,0 +1,43 @@ +// 
Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + + +#include <vector> +namespace search +{ +class Rand48; + +namespace fakedata +{ + +class FakeWordSet; + +} + +} + +namespace postinglistbm +{ + +class AndStress +{ +public: + AndStress(void); + + ~AndStress(void); + + void + run(search::Rand48 &rnd, + search::fakedata::FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack); +}; + +} // namespace postinglistbm + diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp new file mode 100644 index 00000000000..fc93eb42dcd --- /dev/null +++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp @@ -0,0 +1,491 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 2002-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("postinglistbm"); +#include <vespa/searchlib/common/bitvector.h> +#include <vespa/searchlib/common/resultset.h> +#include <vespa/searchlib/util/rand48.h> +#include "andstress.h" +#include <vespa/searchlib/test/fakedata/fakeword.h> +#include <vespa/searchlib/test/fakedata/fakeposting.h> +#include <vespa/searchlib/test/fakedata/fakewordset.h> +#include <vespa/searchlib/test/fakedata/fpfactory.h> +#include <vespa/searchlib/index/docidandfeatures.h> + +using search::ResultSet; +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataArray; +using search::queryeval::SearchIterator; +using search::index::Schema; +using namespace search::fakedata; + +// needed to resolve external symbol from httpd.h on AIX +void FastS_block_usr2() {} + + +namespace postinglistbm +{ + +class PostingListBM : public FastOS_Application +{ +private: + bool _verbose; + uint32_t _numDocs; + uint32_t _commonDocFreq; + uint32_t _numWordsPerClass; + std::vector<std::string> _postingTypes; + uint32_t _loops; + unsigned int _skipCommonPairsRate; + FakeWordSet _wordSet; + uint32_t _stride; + bool _unpack; +public: + search::Rand48 _rnd; + +private: + void Usage(void); + + void + badPostingType(const std::string &postingType); + + void + testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw); +public: + PostingListBM(void); + ~PostingListBM(void); + int Main(void); +}; + + +void +PostingListBM::Usage(void) +{ + printf("postinglistbm " + "[-C <skipCommonPairsRate>] " + "[-a] " + "[-c <commonDoqFreq>] " + "[-d <numDocs>] " + "[-l <numLoops>] " + "[-s <stride>] " + "[-t <postingType>] " + "[-u] " + "[-q] " + "[-v]\n"); +} + + +void +PostingListBM::badPostingType(const std::string &postingType) +{ + printf("Bad posting list type: %s\n", postingType.c_str()); + 
printf("Supported types: "); + + std::vector<std::string> postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = postingTypes.end(); + bool first = true; + + for (pti = postingTypes.begin(); pti != ptie; ++pti) { + if (first) + first = false; + else + printf(", "); + printf("%s", pti->c_str()); + } + printf("\n"); +} + + +PostingListBM::PostingListBM(void) + : _verbose(false), + _numDocs(10000000), + _commonDocFreq(50000), + _numWordsPerClass(100), + _postingTypes(), + _loops(1), + _skipCommonPairsRate(1), + _wordSet(), + _stride(0), + _unpack(false), + _rnd() +{ +} + + +PostingListBM::~PostingListBM(void) +{ +} + + +static int +highLevelSinglePostingScan(SearchIterator &sb, uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelSinglePostingScanUnpack(SearchIterator &sb, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + sb.unpack(docId); + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScan(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if 
(sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScanUnpack(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, + uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb1.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + sb1.unpack(docId); + sb2.unpack(docId); + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +void +PostingListBM::testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw); + ff->setup(v); + FakePosting::SP f(ff->make(fw)); + + printf("%s.bitsize=%d+%d+%d+%d+%d\n", + f->getName().c_str(), + static_cast<int>(f->bitSize()), + static_cast<int>(f->l1SkipBitSize()), + static_cast<int>(f->l2SkipBitSize()), + static_cast<int>(f->l3SkipBitSize()), + static_cast<int>(f->l4SkipBitSize())); + TermFieldMatchData md; + TermFieldMatchDataArray tfmda; + tfmda.add(&md); + + std::unique_ptr<SearchIterator> sb(f->createIterator(tfmda)); + if (f->hasWordPositions()) + fw.validate(sb.get(), tfmda, _verbose); + else + fw.validate(sb.get(), _verbose); + uint64_t scanTime = 0; + uint64_t scanUnpackTime = 0; + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda2.add(&md2); + + std::unique_ptr<SearchIterator> 
sb2(f->createIterator(tfmda2)); + int hits1 = highLevelSinglePostingScan(*sb2.get(), fw.getDocIdLimit(), + &scanTime); + TermFieldMatchData md3; + TermFieldMatchDataArray tfmda3; + tfmda3.add(&md3); + + std::unique_ptr<SearchIterator> sb3(f->createIterator(tfmda3)); + int hits2 = highLevelSinglePostingScanUnpack(*sb3.get(), fw.getDocIdLimit(), + &scanUnpackTime); + printf("testFake '%s' hits1=%d, hits2=%d, scanTime=%" PRIu64 + ", scanUnpackTime=%" PRIu64 "\n", + f->getName().c_str(), + hits1, hits2, scanTime, scanUnpackTime); +} + + +void +testFakePair(const std::string &postingType, + const Schema &schema, + bool unpack, + const FakeWord &fw1, const FakeWord &fw2) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw1); + v.push_back(&fw2); + ff->setup(v); + FakePosting::SP f1(ff->make(fw1)); + FakePosting::SP f2(ff->make(fw2)); + + TermFieldMatchData md1; + TermFieldMatchDataArray tfmda1; + tfmda1.add(&md1); + std::unique_ptr<SearchIterator> sb1(f1->createIterator(tfmda1)); + + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda1.add(&md2); + std::unique_ptr<SearchIterator> sb2(f2->createIterator(tfmda2)); + + int hits = 0; + uint64_t scanUnpackTime = 0; + if (unpack) + hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + else + hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n", + f1->getName().c_str(), + f2->getName().c_str(), + hits, + scanUnpackTime); +} + + +int +PostingListBM::Main(void) +{ + int argi; + char c; + const char *optArg; + bool doandstress; + + doandstress = false; + argi = 1; + bool hasElements = false; + bool hasElementWeights = false; + bool quick = false; + + while ((c = GetOpt("C:ac:d:l:s:t:uvw:T:q", optArg, argi)) != -1) { + switch(c) { + case 'C': + _skipCommonPairsRate = atoi(optArg); + 
break; + case 'T': + if (strcmp(optArg, "single") == 0) { + hasElements = false; + hasElementWeights = false; + } else if (strcmp(optArg, "array") == 0) { + hasElements = true; + hasElementWeights = false; + } else if (strcmp(optArg, "weightedSet") == 0) { + hasElements = true; + hasElementWeights = true; + } else { + printf("Bad collection type: %s\n", optArg); + return 1; + } + break; + case 'a': + doandstress = true; + break; + case 'c': + _commonDocFreq = atoi(optArg); + break; + case 'd': + _numDocs = atoi(optArg); + break; + case 'l': + _loops = atoi(optArg); + break; + case 's': + _stride = atoi(optArg); + break; + case 't': + do { + Schema schema; + Schema::IndexField indexField("field0", + Schema::STRING, + Schema::SINGLE); + schema.addIndexField(indexField); + std::unique_ptr<FPFactory> ff(getFPFactory(optArg, schema)); + if (ff.get() == NULL) { + badPostingType(optArg); + return 1; + } + } while (0); + _postingTypes.push_back(optArg); + break; + case 'u': + _unpack = true; + break; + case 'v': + _verbose = true; + break; + case 'w': + _numWordsPerClass = atoi(optArg); + break; + case 'q': + quick = true; + _numDocs = 36000; + _commonDocFreq = 10000; + _numWordsPerClass = 5; + break; + default: + Usage(); + return 1; + } + } + + if (_commonDocFreq > _numDocs) { + Usage(); + return 1; + } + + _wordSet.setupParams(hasElements, hasElementWeights); + + uint32_t w1dfreq = 10; + uint32_t w4dfreq = 790000; + uint32_t w5dfreq = 290000; + uint32_t w4w5od = 100000; + uint32_t numTasks = 40000; + if (quick) { + w1dfreq = 2; + w4dfreq = 19000; + w5dfreq = 5000; + w4w5od = 1000; + numTasks = 40; + } + + + FakeWord word1(_numDocs, w1dfreq, w1dfreq / 2, "word1", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word2(_numDocs, 1000, 500, "word2", word1, 4, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word3(_numDocs, _commonDocFreq, _commonDocFreq / 2, + "word3", word1, 10, _rnd, + _wordSet.getFieldsParams(), 
_wordSet.getPackedIndex()); + FakeWord word4(_numDocs, w4dfreq, w4dfreq / 2, + "word4", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word5(_numDocs, w5dfreq, w5dfreq / 2, + "word5", word4, w4w5od, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + + if (_postingTypes.empty()) + _postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = _postingTypes.end() ; + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFake(*pti, _wordSet.getSchema(), word1); + testFake(*pti, _wordSet.getSchema(), word2); + testFake(*pti, _wordSet.getSchema(), word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word1, word3); + testFakePair(*pti, _wordSet.getSchema(), false, word2, word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word4, word5); + } + + if (doandstress) { + _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass); + } + if (doandstress) { + AndStress andstress; + andstress.run(_rnd, _wordSet, + _numDocs, _commonDocFreq, _postingTypes, _loops, + _skipCommonPairsRate, + numTasks, + _stride, + _unpack); + } + return 0; +} + +} // namespace postinglistbm + +int +main(int argc, char **argv) +{ + postinglistbm::PostingListBM app; + + setvbuf(stdout, NULL, _IOLBF, 32768); + app._rnd.srand48(32); + return app.Entry(argc, argv); + + return 0; +} diff --git a/searchlib/src/tests/postinglistbm/skip.txt b/searchlib/src/tests/postinglistbm/skip.txt new file mode 100644 index 00000000000..9804bce3c33 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/skip.txt @@ -0,0 +1,75 @@ +B tree view: + + Leaf Nodes: segments of docid delta list + Interior Nodes: Segments of skip info lists + + Interior Nodes 1 level above leaf nodes: L1 skip info + Interior Nodes 2 level above leaf nodes: L2 skip info + +Example 
posting list, with stride 4 for L1 skip and L2 skip: + +DocIdPos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 17 18 +DocId: 1 11 21 31|41 51 61 71|81 91 101 111|121 131 141 151|161 171 181 + +(Assume continued with every 10. docid present) + +Old L1 skip info, pointing to start of leaf nodes, with first docid in +leaf node pre-decoded (i.e. containing copy of first docid entry in leaf node): + +L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 +DocId: 41 81 121 161|201 241 281 321|361 401 441 481|521 561 601 641|681 +DocIdPos: 5 9 13 17| 21 25 29 33| 37 41 45 49| 53 57 61 65| 69 + +Old L2 skip info, pointing to start of interior nodes 1 level above leaf nodes +and containing copies of previous L1 skip entry: + +L2Pos: 0 1 2 3 +DocId: 161 321 481 641 +DocIdPos: 17 33 49 65 +L1Pos: 4 8 12 16 + +Reason for change of skip info view: Avoiding null skips, simplifying code. + +Skip from docId 1 to docId 115 first skips to DocId 81 before ending +up at DocId 121. If next seek is to below 161, a null skip to docid +121 is performed since docId delta unpacking caught up with supposedly +next L1 skip docid. With L1 skip stride being N, 1/N of longer seeks +will unpack N extra docids, eating up the advantage of first docid in +leaf node being pre-decoded. + +If a seek to docId 115 is followed by a seek to docId 121, an unpack +of docId 121 and a seek to a higher docid, this causes, with the old L1 +skip info, features for docId 81, 91, 101, 111 to be decoded with the +result ignored before the features for docId 121 are decoded. For the +next seek, the null skip of DocId is also associated with a backwards +skip for features, so if the next feature to be decoded was for docId +141 then features for docId 121 will be decoded again and ignored. + +New L1 skip info, pointing to start of leaf nodes, without first docid +in leaf node pre-decoded (i.e.
containing copy of last docid entry in +previous leaf node): + +L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 +DocId: 31 71 111 151|191 231 271 311|351 391 431 471|511 551 591 631|671 +DocIdPos: 4 8 12 16| 20 24 28 32| 36 40 44 48| 52 56 60 64| 68 + +New L2 skip info, pointing to start of interior nodes 1 level above leaf nodes +and containing copies of previous L1 skip entry: + +L2Pos: 0 1 2 3 +DocId: 151 311 471 631 +DocIdPos: 16 32 48 64 +L1Pos: 4 8 12 16 + +1 DocId delta is unpacked when using L1 or L2 skip, to get first docid +in leaf node. With old skip info, this wasn't needed. + +With new skip info, docid delta unpacking should never catch up with +next L1 skip docid (can become equal, but that's no longer sufficient +for triggering a skip). + +For each level upwards in skip info, one extra number is needed per element in +the skip info. + +For feature position (split docid/features), one extra number is needed per +element in the skip info. |