summaryrefslogtreecommitdiffstats
path: root/searchlib/src/tests/postinglistbm
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /searchlib/src/tests/postinglistbm
Publish
Diffstat (limited to 'searchlib/src/tests/postinglistbm')
-rw-r--r--searchlib/src/tests/postinglistbm/.gitignore10
-rw-r--r--searchlib/src/tests/postinglistbm/CMakeLists.txt10
-rw-r--r--searchlib/src/tests/postinglistbm/andstress.cpp536
-rw-r--r--searchlib/src/tests/postinglistbm/andstress.h43
-rw-r--r--searchlib/src/tests/postinglistbm/postinglistbm.cpp491
-rw-r--r--searchlib/src/tests/postinglistbm/skip.txt75
6 files changed, 1165 insertions, 0 deletions
diff --git a/searchlib/src/tests/postinglistbm/.gitignore b/searchlib/src/tests/postinglistbm/.gitignore
new file mode 100644
index 00000000000..ac71dde13e2
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/.gitignore
@@ -0,0 +1,10 @@
+*.core
+*.ilk
+*.pdb
+.depend
+Makefile
+core
+core.*
+postinglistbm
+postinglistbm.exe
+searchlib_postinglistbm_app
diff --git a/searchlib/src/tests/postinglistbm/CMakeLists.txt b/searchlib/src/tests/postinglistbm/CMakeLists.txt
new file mode 100644
index 00000000000..403c12da1b1
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_postinglistbm_app
+ SOURCES
+ postinglistbm.cpp
+ andstress.cpp
+ DEPENDS
+ searchlib_test
+ searchlib
+)
+vespa_add_test(NAME searchlib_postinglistbm_app NO_VALGRIND COMMAND searchlib_postinglistbm_app -q -a)
diff --git a/searchlib/src/tests/postinglistbm/andstress.cpp b/searchlib/src/tests/postinglistbm/andstress.cpp
new file mode 100644
index 00000000000..f3fabde0d61
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/andstress.cpp
@@ -0,0 +1,536 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP(".andstress");
+#include "andstress.h"
+#include <vector>
+
+#include <vespa/searchlib/common/bitvector.h>
+#include <vespa/searchlib/test/fakedata/fakeword.h>
+#include <vespa/searchlib/test/fakedata/fakewordset.h>
+#include <vespa/searchlib/test/fakedata/fakeposting.h>
+#include <vespa/searchlib/test/fakedata/fakefilterocc.h>
+#include <vespa/searchlib/test/fakedata/fakeegcompr64filterocc.h>
+#include <vespa/searchlib/test/fakedata/fakezcfilterocc.h>
+#include <vespa/searchlib/test/fakedata/fakezcbfilterocc.h>
+#include <vespa/searchlib/test/fakedata/fpfactory.h>
+
+using search::fef::TermFieldMatchData;
+using search::fef::TermFieldMatchDataArray;
+using search::queryeval::SearchIterator;
+using namespace search::fakedata;
+
+namespace postinglistbm
+{
+
+class AndStressWorker;
+
+class AndStressMaster
+{
+private:
+ AndStressMaster(const AndStressMaster &);
+
+ AndStressMaster &
+ operator=(const AndStressMaster &);
+
+ search::Rand48 &_rnd;
+ unsigned int _numDocs;
+ unsigned int _commonDocFreq;
+ std::vector<std::string> _postingTypes;
+ unsigned int _loops;
+ unsigned int _skipCommonPairsRate;
+ uint32_t _stride;
+ bool _unpack;
+
+ FastOS_ThreadPool *_threadPool;
+ std::vector<AndStressWorker *> _workers;
+ unsigned int _workersDone;
+
+ FakeWordSet &_wordSet;
+
+ std::vector<std::vector<FakePosting::SP> > _postings;
+
+ FastOS_Cond _taskCond;
+ unsigned int _taskIdx;
+ uint32_t _numTasks;
+
+public:
+ typedef std::pair<FakePosting *, FakePosting *> Task;
+private:
+ std::vector<Task> _tasks;
+public:
+ AndStressMaster(search::Rand48 &rnd,
+ FakeWordSet &wordSet,
+ unsigned int numDocs,
+ unsigned int commonDocFreq,
+ const std::vector<std::string> &postingType,
+ unsigned int loops,
+ unsigned int skipCommonPairsRate,
+ uint32_t numTasks,
+ uint32_t stride,
+ bool unpack);
+
+ ~AndStressMaster(void);
+
+ void
+ run(void);
+
+ void
+ makePostingsHelper(FPFactory *postingFactory,
+ const std::string &postingFormat,
+ bool validate, bool verbose);
+
+ void
+ dropPostings(void);
+
+ void
+ dropTasks(void);
+
+ void
+ resetTasks(void); // Prepare for rerun
+
+ void
+ setupTasks(unsigned int numTasks);
+
+ Task *
+ getTask(void);
+
+ unsigned int
+ getNumDocs(void) const
+ {
+ return _numDocs;
+ }
+
+ bool
+ getUnpack(void) const
+ {
+ return _unpack;
+ }
+
+ double
+ runWorkers(const std::string &postingFormat);
+};
+
+
+class AndStressWorker : public FastOS_Runnable
+{
+private:
+ AndStressWorker(const AndStressWorker &);
+
+ AndStressWorker &
+ operator=(const AndStressWorker &);
+
+ AndStressMaster &_master;
+ unsigned int _id;
+public:
+ AndStressWorker(AndStressMaster &master, unsigned int id);
+
+ ~AndStressWorker(void);
+
+ virtual void
+ Run(FastOS_ThreadInterface *thisThread, void *arg);
+};
+
+
+template <class P>
+FakePosting *
+makePosting(FakeWord &fw)
+{
+ return new P(fw);
+}
+
+
+AndStressMaster::AndStressMaster(search::Rand48 &rnd,
+ FakeWordSet &wordSet,
+ unsigned int numDocs,
+ unsigned int commonDocFreq,
+ const std::vector<std::string> &postingTypes,
+ unsigned int loops,
+ unsigned int skipCommonPairsRate,
+ uint32_t numTasks,
+ uint32_t stride,
+ bool unpack)
+ : _rnd(rnd),
+ _numDocs(numDocs),
+ _commonDocFreq(commonDocFreq),
+ _postingTypes(postingTypes),
+ _loops(loops),
+ _skipCommonPairsRate(skipCommonPairsRate),
+ _stride(stride),
+ _unpack(unpack),
+ _threadPool(NULL),
+ _workers(),
+ _workersDone(0),
+ _wordSet(wordSet),
+ _postings(FakeWordSet::NUM_WORDCLASSES),
+ _taskCond(),
+ _taskIdx(0),
+ _numTasks(numTasks),
+ _tasks()
+{
+ LOG(info, "AndStressMaster::AndStressMaster");
+
+ _threadPool = new FastOS_ThreadPool(128 * 1024, 400);
+}
+
+template <class C>
+static void
+clearPtrVector(std::vector<C> &v)
+{
+ for (unsigned int i = 0; i < v.size(); ++i)
+ delete v[i];
+ v.clear();
+}
+
+
+AndStressMaster::~AndStressMaster(void)
+{
+ LOG(info, "AndStressMaster::~AndStressMaster");
+
+ _threadPool->Close();
+ delete _threadPool;
+ _threadPool = NULL;
+ clearPtrVector(_workers);
+ dropPostings();
+}
+
+
+void
+AndStressMaster::dropPostings(void)
+{
+ for (unsigned int i = 0; i < _postings.size(); ++i)
+ _postings[i].clear();
+ dropTasks();
+}
+
+
+void
+AndStressMaster::dropTasks(void)
+{
+ _tasks.clear();
+ _taskIdx = 0;
+}
+
+
+void
+AndStressMaster::resetTasks(void)
+{
+ _taskIdx = 0;
+}
+
+
+static void
+makeSomePostings(FPFactory *postingFactory,
+ std::vector<FakeWord *> &w,
+ std::vector<FakePosting::SP> &p,
+ uint32_t stride,
+ bool validate,
+ bool verbose)
+{
+ for (unsigned int i = 0; i < w.size(); ++i) {
+ FakePosting::SP np(postingFactory->make(*w[i]));
+ if (validate) {
+ TermFieldMatchData md;
+ TermFieldMatchDataArray tfmda;
+ tfmda.add(&md);
+
+ std::unique_ptr<SearchIterator> sb(np->createIterator(tfmda));
+ if (np->hasWordPositions()) {
+ if (stride != 0)
+ w[i]->validate(sb.get(), tfmda, stride, verbose);
+ else
+ w[i]->validate(sb.get(), tfmda, verbose);
+ } else
+ w[i]->validate(sb.get(), verbose);
+ }
+ p.push_back(np);
+ }
+}
+
+void
+AndStressMaster::makePostingsHelper(FPFactory *postingFactory,
+ const std::string &postingFormat,
+ bool validate, bool verbose)
+{
+ FastOS_Time tv;
+ double before;
+ double after;
+
+ tv.SetNow();
+ before = tv.Secs();
+ postingFactory->setup(_wordSet);
+ for (unsigned int i = 0; i < _wordSet._words.size(); ++i)
+ makeSomePostings(postingFactory,
+ _wordSet._words[i], _postings[i],
+ _stride,
+ validate,
+ verbose);
+ tv.SetNow();
+ after = tv.Secs();
+ LOG(info,
+ "AndStressMaster::makePostingsHelper elapsed %10.6f s for %s format",
+ after - before,
+ postingFormat.c_str());
+}
+
+
+void
+AndStressMaster::setupTasks(unsigned int numTasks)
+{
+ unsigned int wordclass1;
+ unsigned int wordclass2;
+ unsigned int word1idx;
+ unsigned int word2idx;
+
+ for (unsigned int i = 0; i < numTasks; ++i) {
+ wordclass1 = _rnd.lrand48() % _postings.size();
+ wordclass2 = _rnd.lrand48() % _postings.size();
+ while (wordclass1 == FakeWordSet::COMMON_WORD &&
+ wordclass2 == FakeWordSet::COMMON_WORD &&
+ (_rnd.lrand48() % _skipCommonPairsRate) != 0) {
+ wordclass1 = _rnd.lrand48() % _postings.size();
+ wordclass2 = _rnd.lrand48() % _postings.size();
+ }
+ word1idx = _rnd.lrand48() % _postings[wordclass1].size();
+ word2idx = _rnd.lrand48() % _postings[wordclass2].size();
+ FakePosting::SP p1 = _postings[wordclass1][word1idx];
+ FakePosting::SP p2 = _postings[wordclass2][word2idx];
+ _tasks.push_back(std::make_pair(p1.get(), p2.get()));
+ }
+}
+
+
+AndStressMaster::Task *
+AndStressMaster::getTask(void)
+{
+ Task *result = NULL;
+ _taskCond.Lock();
+ if (_taskIdx < _tasks.size()) {
+ result = &_tasks[_taskIdx];
+ ++_taskIdx;
+ } else {
+ _workersDone++;
+ if (_workersDone == _workers.size())
+ _taskCond.Broadcast();
+ }
+ _taskCond.Unlock();
+ return result;
+}
+
+void
+AndStressMaster::run(void)
+{
+ LOG(info, "AndStressMaster::run");
+
+ std::vector<std::string>::const_iterator pti;
+ std::vector<std::string>::const_iterator ptie = _postingTypes.end() ;
+
+ for (pti = _postingTypes.begin(); pti != ptie; ++pti) {
+ std::unique_ptr<FPFactory> ff(getFPFactory(*pti, _wordSet.getSchema()));
+ makePostingsHelper(ff.get(), *pti, true, false);
+ setupTasks(_numTasks);
+ double totalTime = 0;
+ for (unsigned int loop = 0; loop < _loops; ++loop) {
+ totalTime += runWorkers(*pti);
+ resetTasks();
+ }
+ LOG(info, "AndStressMaster::average run elapsed %10.6f s for workers %s format",
+ totalTime / _loops, pti->c_str());
+ dropPostings();
+ }
+ FastOS_Thread::Sleep(250);
+}
+
+
+double
+AndStressMaster::runWorkers(const std::string &postingFormat)
+{
+ FastOS_Time tv;
+ double before;
+ double after;
+
+ tv.SetNow();
+ before = tv.Secs();
+ unsigned int numWorkers = 8;
+ for (unsigned int i = 0; i < numWorkers; ++i)
+ _workers.push_back(new AndStressWorker(*this, i));
+
+ for (unsigned int i = 0; i < _workers.size(); ++i)
+ _threadPool->NewThread(_workers[i]);
+ _taskCond.Lock();
+ while (_workersDone < _workers.size())
+ _taskCond.Wait();
+ _taskCond.Unlock();
+ tv.SetNow();
+ after = tv.Secs();
+ LOG(info,
+ "AndStressMaster::run elapsed %10.6f s for workers %s format",
+ after - before,
+ postingFormat.c_str());
+ clearPtrVector(_workers);
+ _workersDone = 0;
+ return after - before;
+}
+
+
+AndStressWorker::AndStressWorker(AndStressMaster &master, unsigned int id)
+ : _master(master),
+ _id(id)
+{
+ LOG(debug, "AndStressWorker::AndStressWorker, id=%u", id);
+}
+
+AndStressWorker::~AndStressWorker(void)
+{
+ LOG(debug, "AndStressWorker::~AndStressWorker, id=%u", _id);
+}
+
+
+static int
+highLevelAndPairPostingScan(SearchIterator &sb1,
+ SearchIterator &sb2,
+ uint32_t numDocs, uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb1.initFullRange();
+ sb2.initFullRange();
+ uint32_t docId = sb1.getDocId();
+ while (docId < numDocs) {
+ if (sb1.seek(docId)) {
+ if (sb2.seek(docId)) {
+ ++hits;
+ ++docId;
+ } else if (docId < sb2.getDocId())
+ docId = sb2.getDocId();
+ else
+ ++docId;
+ } else if (docId < sb1.getDocId())
+ docId= sb1.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+
+static int
+highLevelAndPairPostingScanUnpack(SearchIterator &sb1,
+ SearchIterator &sb2,
+ uint32_t numDocs,
+ uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb1.initFullRange();
+ sb2.initFullRange();
+ uint32_t docId = sb1.getDocId();
+ while (docId < numDocs) {
+ if (sb1.seek(docId)) {
+ if (sb2.seek(docId)) {
+ ++hits;
+ sb1.unpack(docId);
+ sb2.unpack(docId);
+ ++docId;
+ } else if (docId < sb2.getDocId())
+ docId = sb2.getDocId();
+ else
+ ++docId;
+ } else if (docId < sb1.getDocId())
+ docId= sb1.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+void
+testFakePair(FakePosting &f1, FakePosting &f2, unsigned int numDocs,
+ bool unpack)
+{
+ TermFieldMatchData md1;
+ TermFieldMatchDataArray tfmda1;
+ tfmda1.add(&md1);
+ std::unique_ptr<SearchIterator> sb1(f1.createIterator(tfmda1));
+
+ TermFieldMatchData md2;
+ TermFieldMatchDataArray tfmda2;
+    tfmda2.add(&md2);
+ std::unique_ptr<SearchIterator> sb2(f2.createIterator(tfmda2));
+
+ int hits = 0;
+ uint64_t scanUnpackTime = 0;
+ if (unpack)
+ hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(),
+ numDocs, &scanUnpackTime);
+ else
+ hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(),
+ numDocs, &scanUnpackTime);
+#if 0
+ printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n",
+ f1.getName().c_str(),
+ f2.getName().c_str(),
+ hits,
+ scanUnpackTime);
+#else
+ (void)hits;
+#endif
+}
+
+void
+AndStressWorker::Run(FastOS_ThreadInterface *thisThread, void *arg)
+{
+ (void) thisThread;
+ (void) arg;
+ LOG(debug, "AndStressWorker::Run, id=%u", _id);
+
+ bool unpack = _master.getUnpack();
+ for (;;) {
+ AndStressMaster::Task *task = _master.getTask();
+ if (task == NULL)
+ break;
+ testFakePair(*task->first, *task->second, _master.getNumDocs(),
+ unpack);
+ }
+}
+
+
+AndStress::AndStress(void)
+{
+ LOG(debug, "Andstress::AndStress");
+}
+
+
+AndStress::~AndStress(void)
+{
+ LOG(debug, "Andstress::~AndStress");
+}
+
+void
+AndStress::run(search::Rand48 &rnd,
+ FakeWordSet &wordSet,
+ unsigned int numDocs,
+ unsigned int commonDocFreq,
+ const std::vector<std::string> &postingTypes,
+ unsigned int loops,
+ unsigned int skipCommonPairsRate,
+ uint32_t numTasks,
+ uint32_t stride,
+ bool unpack)
+{
+ LOG(debug, "Andstress::run");
+ AndStressMaster master(rnd, wordSet,
+ numDocs, commonDocFreq, postingTypes, loops,
+ skipCommonPairsRate,
+ numTasks,
+ stride,
+ unpack);
+ master.run();
+}
+
+}
diff --git a/searchlib/src/tests/postinglistbm/andstress.h b/searchlib/src/tests/postinglistbm/andstress.h
new file mode 100644
index 00000000000..458866b09d5
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/andstress.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+
+#include <vector>
+namespace search
+{
+class Rand48;
+
+namespace fakedata
+{
+
+class FakeWordSet;
+
+}
+
+}
+
+namespace postinglistbm
+{
+
+class AndStress
+{
+public:
+ AndStress(void);
+
+ ~AndStress(void);
+
+ void
+ run(search::Rand48 &rnd,
+ search::fakedata::FakeWordSet &wordSet,
+ unsigned int numDocs,
+ unsigned int commonDocFreq,
+ const std::vector<std::string> &postingTypes,
+ unsigned int loops,
+ unsigned int skipCommonPairsRate,
+ uint32_t numTasks,
+ uint32_t stride,
+ bool unpack);
+};
+
+} // namespace postinglistbm
+
diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp
new file mode 100644
index 00000000000..fc93eb42dcd
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp
@@ -0,0 +1,491 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 2002-2003 Fast Search & Transfer ASA
+// Copyright (C) 2003 Overture Services Norway AS
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP("postinglistbm");
+#include <vespa/searchlib/common/bitvector.h>
+#include <vespa/searchlib/common/resultset.h>
+#include <vespa/searchlib/util/rand48.h>
+#include "andstress.h"
+#include <vespa/searchlib/test/fakedata/fakeword.h>
+#include <vespa/searchlib/test/fakedata/fakeposting.h>
+#include <vespa/searchlib/test/fakedata/fakewordset.h>
+#include <vespa/searchlib/test/fakedata/fpfactory.h>
+#include <vespa/searchlib/index/docidandfeatures.h>
+
+using search::ResultSet;
+using search::fef::TermFieldMatchData;
+using search::fef::TermFieldMatchDataArray;
+using search::queryeval::SearchIterator;
+using search::index::Schema;
+using namespace search::fakedata;
+
+// needed to resolve external symbol from httpd.h on AIX
+void FastS_block_usr2() {}
+
+
+namespace postinglistbm
+{
+
+class PostingListBM : public FastOS_Application
+{
+private:
+ bool _verbose;
+ uint32_t _numDocs;
+ uint32_t _commonDocFreq;
+ uint32_t _numWordsPerClass;
+ std::vector<std::string> _postingTypes;
+ uint32_t _loops;
+ unsigned int _skipCommonPairsRate;
+ FakeWordSet _wordSet;
+ uint32_t _stride;
+ bool _unpack;
+public:
+ search::Rand48 _rnd;
+
+private:
+ void Usage(void);
+
+ void
+ badPostingType(const std::string &postingType);
+
+ void
+ testFake(const std::string &postingType,
+ const Schema &schema,
+ const FakeWord &fw);
+public:
+ PostingListBM(void);
+ ~PostingListBM(void);
+ int Main(void);
+};
+
+
+void
+PostingListBM::Usage(void)
+{
+ printf("postinglistbm "
+ "[-C <skipCommonPairsRate>] "
+ "[-a] "
+           "[-c <commonDocFreq>] "
+ "[-d <numDocs>] "
+ "[-l <numLoops>] "
+ "[-s <stride>] "
+ "[-t <postingType>] "
+ "[-u] "
+ "[-q] "
+ "[-v]\n");
+}
+
+
+void
+PostingListBM::badPostingType(const std::string &postingType)
+{
+ printf("Bad posting list type: %s\n", postingType.c_str());
+ printf("Supported types: ");
+
+ std::vector<std::string> postingTypes = getPostingTypes();
+ std::vector<std::string>::const_iterator pti;
+ std::vector<std::string>::const_iterator ptie = postingTypes.end();
+ bool first = true;
+
+ for (pti = postingTypes.begin(); pti != ptie; ++pti) {
+ if (first)
+ first = false;
+ else
+ printf(", ");
+ printf("%s", pti->c_str());
+ }
+ printf("\n");
+}
+
+
+PostingListBM::PostingListBM(void)
+ : _verbose(false),
+ _numDocs(10000000),
+ _commonDocFreq(50000),
+ _numWordsPerClass(100),
+ _postingTypes(),
+ _loops(1),
+ _skipCommonPairsRate(1),
+ _wordSet(),
+ _stride(0),
+ _unpack(false),
+ _rnd()
+{
+}
+
+
+PostingListBM::~PostingListBM(void)
+{
+}
+
+
+static int
+highLevelSinglePostingScan(SearchIterator &sb, uint32_t numDocs, uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb.initFullRange();
+ uint32_t docId = sb.getDocId();
+ while (docId < numDocs) {
+ if (sb.seek(docId)) {
+ ++hits;
+ ++docId;
+ } else if (docId < sb.getDocId())
+ docId= sb.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+
+static int
+highLevelSinglePostingScanUnpack(SearchIterator &sb,
+ uint32_t numDocs, uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb.initFullRange();
+ uint32_t docId = sb.getDocId();
+ while (docId < numDocs) {
+ if (sb.seek(docId)) {
+ ++hits;
+ sb.unpack(docId);
+ ++docId;
+ } else if (docId < sb.getDocId())
+ docId= sb.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+
+static int
+highLevelAndPairPostingScan(SearchIterator &sb1,
+ SearchIterator &sb2,
+ uint32_t numDocs, uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb1.initFullRange();
+ sb2.initFullRange();
+ uint32_t docId = sb1.getDocId();
+ while (docId < numDocs) {
+ if (sb1.seek(docId)) {
+ if (sb2.seek(docId)) {
+ ++hits;
+ ++docId;
+ } else if (docId < sb2.getDocId())
+ docId = sb2.getDocId();
+ else
+ ++docId;
+ } else if (docId < sb1.getDocId())
+ docId= sb1.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+
+static int
+highLevelAndPairPostingScanUnpack(SearchIterator &sb1,
+ SearchIterator &sb2,
+ uint32_t numDocs,
+ uint64_t *cycles)
+{
+ uint32_t hits = 0;
+ uint64_t before = fastos::ClockSystem::now();
+ sb1.initFullRange();
+    sb2.initFullRange();
+ uint32_t docId = sb1.getDocId();
+ while (docId < numDocs) {
+ if (sb1.seek(docId)) {
+ if (sb2.seek(docId)) {
+ ++hits;
+ sb1.unpack(docId);
+ sb2.unpack(docId);
+ ++docId;
+ } else if (docId < sb2.getDocId())
+ docId = sb2.getDocId();
+ else
+ ++docId;
+ } else if (docId < sb1.getDocId())
+ docId= sb1.getDocId();
+ else
+ ++docId;
+ }
+ uint64_t after = fastos::ClockSystem::now();
+ *cycles = after - before;
+ return hits;
+}
+
+
+void
+PostingListBM::testFake(const std::string &postingType,
+ const Schema &schema,
+ const FakeWord &fw)
+{
+ std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema));
+ std::vector<const FakeWord *> v;
+ v.push_back(&fw);
+ ff->setup(v);
+ FakePosting::SP f(ff->make(fw));
+
+ printf("%s.bitsize=%d+%d+%d+%d+%d\n",
+ f->getName().c_str(),
+ static_cast<int>(f->bitSize()),
+ static_cast<int>(f->l1SkipBitSize()),
+ static_cast<int>(f->l2SkipBitSize()),
+ static_cast<int>(f->l3SkipBitSize()),
+ static_cast<int>(f->l4SkipBitSize()));
+ TermFieldMatchData md;
+ TermFieldMatchDataArray tfmda;
+ tfmda.add(&md);
+
+ std::unique_ptr<SearchIterator> sb(f->createIterator(tfmda));
+ if (f->hasWordPositions())
+ fw.validate(sb.get(), tfmda, _verbose);
+ else
+ fw.validate(sb.get(), _verbose);
+ uint64_t scanTime = 0;
+ uint64_t scanUnpackTime = 0;
+ TermFieldMatchData md2;
+ TermFieldMatchDataArray tfmda2;
+ tfmda2.add(&md2);
+
+ std::unique_ptr<SearchIterator> sb2(f->createIterator(tfmda2));
+ int hits1 = highLevelSinglePostingScan(*sb2.get(), fw.getDocIdLimit(),
+ &scanTime);
+ TermFieldMatchData md3;
+ TermFieldMatchDataArray tfmda3;
+ tfmda3.add(&md3);
+
+ std::unique_ptr<SearchIterator> sb3(f->createIterator(tfmda3));
+ int hits2 = highLevelSinglePostingScanUnpack(*sb3.get(), fw.getDocIdLimit(),
+ &scanUnpackTime);
+ printf("testFake '%s' hits1=%d, hits2=%d, scanTime=%" PRIu64
+ ", scanUnpackTime=%" PRIu64 "\n",
+ f->getName().c_str(),
+ hits1, hits2, scanTime, scanUnpackTime);
+}
+
+
+void
+testFakePair(const std::string &postingType,
+ const Schema &schema,
+ bool unpack,
+ const FakeWord &fw1, const FakeWord &fw2)
+{
+ std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema));
+ std::vector<const FakeWord *> v;
+ v.push_back(&fw1);
+ v.push_back(&fw2);
+ ff->setup(v);
+ FakePosting::SP f1(ff->make(fw1));
+ FakePosting::SP f2(ff->make(fw2));
+
+ TermFieldMatchData md1;
+ TermFieldMatchDataArray tfmda1;
+ tfmda1.add(&md1);
+ std::unique_ptr<SearchIterator> sb1(f1->createIterator(tfmda1));
+
+ TermFieldMatchData md2;
+ TermFieldMatchDataArray tfmda2;
+    tfmda2.add(&md2);
+ std::unique_ptr<SearchIterator> sb2(f2->createIterator(tfmda2));
+
+ int hits = 0;
+ uint64_t scanUnpackTime = 0;
+ if (unpack)
+ hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(),
+ fw1.getDocIdLimit(), &scanUnpackTime);
+ else
+ hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(),
+ fw1.getDocIdLimit(), &scanUnpackTime);
+ printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n",
+ f1->getName().c_str(),
+ f2->getName().c_str(),
+ hits,
+ scanUnpackTime);
+}
+
+
+int
+PostingListBM::Main(void)
+{
+ int argi;
+ char c;
+ const char *optArg;
+ bool doandstress;
+
+ doandstress = false;
+ argi = 1;
+ bool hasElements = false;
+ bool hasElementWeights = false;
+ bool quick = false;
+
+ while ((c = GetOpt("C:ac:d:l:s:t:uvw:T:q", optArg, argi)) != -1) {
+ switch(c) {
+ case 'C':
+ _skipCommonPairsRate = atoi(optArg);
+ break;
+ case 'T':
+ if (strcmp(optArg, "single") == 0) {
+ hasElements = false;
+ hasElementWeights = false;
+ } else if (strcmp(optArg, "array") == 0) {
+ hasElements = true;
+ hasElementWeights = false;
+ } else if (strcmp(optArg, "weightedSet") == 0) {
+ hasElements = true;
+ hasElementWeights = true;
+ } else {
+ printf("Bad collection type: %s\n", optArg);
+ return 1;
+ }
+ break;
+ case 'a':
+ doandstress = true;
+ break;
+ case 'c':
+ _commonDocFreq = atoi(optArg);
+ break;
+ case 'd':
+ _numDocs = atoi(optArg);
+ break;
+ case 'l':
+ _loops = atoi(optArg);
+ break;
+ case 's':
+ _stride = atoi(optArg);
+ break;
+ case 't':
+ do {
+ Schema schema;
+ Schema::IndexField indexField("field0",
+ Schema::STRING,
+ Schema::SINGLE);
+ schema.addIndexField(indexField);
+ std::unique_ptr<FPFactory> ff(getFPFactory(optArg, schema));
+ if (ff.get() == NULL) {
+ badPostingType(optArg);
+ return 1;
+ }
+ } while (0);
+ _postingTypes.push_back(optArg);
+ break;
+ case 'u':
+ _unpack = true;
+ break;
+ case 'v':
+ _verbose = true;
+ break;
+ case 'w':
+ _numWordsPerClass = atoi(optArg);
+ break;
+ case 'q':
+ quick = true;
+ _numDocs = 36000;
+ _commonDocFreq = 10000;
+ _numWordsPerClass = 5;
+ break;
+ default:
+ Usage();
+ return 1;
+ }
+ }
+
+ if (_commonDocFreq > _numDocs) {
+ Usage();
+ return 1;
+ }
+
+ _wordSet.setupParams(hasElements, hasElementWeights);
+
+ uint32_t w1dfreq = 10;
+ uint32_t w4dfreq = 790000;
+ uint32_t w5dfreq = 290000;
+ uint32_t w4w5od = 100000;
+ uint32_t numTasks = 40000;
+ if (quick) {
+ w1dfreq = 2;
+ w4dfreq = 19000;
+ w5dfreq = 5000;
+ w4w5od = 1000;
+ numTasks = 40;
+ }
+
+
+ FakeWord word1(_numDocs, w1dfreq, w1dfreq / 2, "word1", _rnd,
+ _wordSet.getFieldsParams(), _wordSet.getPackedIndex());
+ FakeWord word2(_numDocs, 1000, 500, "word2", word1, 4, _rnd,
+ _wordSet.getFieldsParams(), _wordSet.getPackedIndex());
+ FakeWord word3(_numDocs, _commonDocFreq, _commonDocFreq / 2,
+ "word3", word1, 10, _rnd,
+ _wordSet.getFieldsParams(), _wordSet.getPackedIndex());
+ FakeWord word4(_numDocs, w4dfreq, w4dfreq / 2,
+ "word4", _rnd,
+ _wordSet.getFieldsParams(), _wordSet.getPackedIndex());
+ FakeWord word5(_numDocs, w5dfreq, w5dfreq / 2,
+ "word5", word4, w4w5od, _rnd,
+ _wordSet.getFieldsParams(), _wordSet.getPackedIndex());
+
+ if (_postingTypes.empty())
+ _postingTypes = getPostingTypes();
+ std::vector<std::string>::const_iterator pti;
+ std::vector<std::string>::const_iterator ptie = _postingTypes.end() ;
+
+ for (pti = _postingTypes.begin(); pti != ptie; ++pti) {
+ testFake(*pti, _wordSet.getSchema(), word1);
+ testFake(*pti, _wordSet.getSchema(), word2);
+ testFake(*pti, _wordSet.getSchema(), word3);
+ }
+
+ for (pti = _postingTypes.begin(); pti != ptie; ++pti) {
+ testFakePair(*pti, _wordSet.getSchema(), false, word1, word3);
+ testFakePair(*pti, _wordSet.getSchema(), false, word2, word3);
+ }
+
+ for (pti = _postingTypes.begin(); pti != ptie; ++pti) {
+ testFakePair(*pti, _wordSet.getSchema(), false, word4, word5);
+ }
+
+ if (doandstress) {
+ _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass);
+ }
+ if (doandstress) {
+ AndStress andstress;
+ andstress.run(_rnd, _wordSet,
+ _numDocs, _commonDocFreq, _postingTypes, _loops,
+ _skipCommonPairsRate,
+ numTasks,
+ _stride,
+ _unpack);
+ }
+ return 0;
+}
+
+} // namespace postinglistbm
+
+int
+main(int argc, char **argv)
+{
+ postinglistbm::PostingListBM app;
+
+ setvbuf(stdout, NULL, _IOLBF, 32768);
+ app._rnd.srand48(32);
+ return app.Entry(argc, argv);
+
+ return 0;
+}
diff --git a/searchlib/src/tests/postinglistbm/skip.txt b/searchlib/src/tests/postinglistbm/skip.txt
new file mode 100644
index 00000000000..9804bce3c33
--- /dev/null
+++ b/searchlib/src/tests/postinglistbm/skip.txt
@@ -0,0 +1,75 @@
+B tree view:
+
+ Leaf Nodes: segments of docid delta list
+ Interior Nodes: Segments of skip info lists
+
+  Interior Nodes 1 level above leaf nodes: L1 skip info
+  Interior Nodes 2 levels above leaf nodes: L2 skip info
+
+Example posting list, with stride 4 for L1 skip and L2 skip:
+
+DocIdPos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 17 18
+DocId: 1 11 21 31|41 51 61 71|81 91 101 111|121 131 141 151|161 171 181
+
+(Assume continued with every 10. docid present)
+
+Old L1 skip info, pointing to start of leaf nodes, with first docid in
+leaf node pre-decoded (i.e. containing copy of first docid entry in leaf node):
+
+L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16
+DocId: 41 81 121 161|201 241 281 321|361 401 441 481|521 561 601 641|681
+DocIdPos: 5 9 13 17| 21 25 29 33| 37 41 45 49| 53 57 61 65| 69
+
+Old L2 skip info, pointing to start of interior nodes 1 level above leaf nodes
+and containing copies of previous L1 skip entry:
+
+L2Pos: 0 1 2 3
+DocId: 161 321 481 641
+DocIdPos: 17 33 49 65
+L1Pos: 4 8 12 16
+
+Reason for change of skip info view: Avoiding null skips, simplifying code.
+
+Skip from docId 1 to docId 115 first skips to DocId 81 before ending
+up at DocId 121. If next seek is to below 161, a null skip to docid
+121 is performed since docId delta unpacking caught up with supposedly
+next L1 skip docid. With L1 skip stride being N, 1/N of longer seeks
+will unpack N extra docids, eating up the advantage of first docid in
+leaf node being pre-decoded.
+
+If a seek to docId 115 is followed by a seek to docId 121, an unpack
+of docId 121 and a seek to a higher docid, this causes, with the old L1
+skip info, features for docId 81, 91, 101, 111 to be decoded with the
+result ignored before the features for docId 121 are decoded. For the
+next seek, the null skip of DocId is also associated with a backwards
+skip for features, so if the next feature to be decoded was for docId
+141 then features for docId 121 will be decoded again and ignored.
+
+New L1 skip info, pointing to start of leaf nodes, without first docid
+in leaf node pre-decoded (i.e. containing copy of last docid entry in
+previous leaf node):
+
+L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16
+DocId: 31 71 111 151|191 231 271 311|351 391 431 471|511 551 591 631|671
+DocIdPos: 4 8 12 16| 20 24 28 32| 36 40 44 48| 52 56 60 64| 68
+
+New L2 skip info, pointing to start of interior nodes 1 level above leaf nodes
+and containing copies of previous L1 skip entry:
+
+L2Pos: 0 1 2 3
+DocId: 151 311 471 631
+DocIdPos: 16 32 48 64
+L1Pos: 4 8 12 16
+
+1 DocId delta is unpacked when using L1 or L2 skip, to get first docid
+in leaf node. With old skip info, this wasn't needed.
+
+With new skip info, docid delta unpacking should never catch up with
+next L1 skip docid (can become equal, but that's no longer sufficient
+for triggering a skip).
+
+For each level upwards in skip info, one extra number is needed per element in
+the skip info.
+
+For feature position (split docid/features), one extra number is needed per
+element in the skip info.