diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /searchlib/src/tests/postinglistbm |
Publish
Diffstat (limited to 'searchlib/src/tests/postinglistbm')
-rw-r--r-- | searchlib/src/tests/postinglistbm/.gitignore | 10 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/CMakeLists.txt | 10 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/andstress.cpp | 536 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/andstress.h | 43 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/postinglistbm.cpp | 491 | ||||
-rw-r--r-- | searchlib/src/tests/postinglistbm/skip.txt | 75 |
6 files changed, 1165 insertions, 0 deletions
diff --git a/searchlib/src/tests/postinglistbm/.gitignore b/searchlib/src/tests/postinglistbm/.gitignore new file mode 100644 index 00000000000..ac71dde13e2 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/.gitignore @@ -0,0 +1,10 @@ +*.core +*.ilk +*.pdb +.depend +Makefile +core +core.* +postinglistbm +postinglistbm.exe +searchlib_postinglistbm_app diff --git a/searchlib/src/tests/postinglistbm/CMakeLists.txt b/searchlib/src/tests/postinglistbm/CMakeLists.txt new file mode 100644 index 00000000000..403c12da1b1 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_postinglistbm_app + SOURCES + postinglistbm.cpp + andstress.cpp + DEPENDS + searchlib_test + searchlib +) +vespa_add_test(NAME searchlib_postinglistbm_app NO_VALGRIND COMMAND searchlib_postinglistbm_app -q -a) diff --git a/searchlib/src/tests/postinglistbm/andstress.cpp b/searchlib/src/tests/postinglistbm/andstress.cpp new file mode 100644 index 00000000000..f3fabde0d61 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/andstress.cpp @@ -0,0 +1,536 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".andstress"); +#include "andstress.h" +#include <vector> + +#include <vespa/searchlib/common/bitvector.h> +#include <vespa/searchlib/test/fakedata/fakeword.h> +#include <vespa/searchlib/test/fakedata/fakewordset.h> +#include <vespa/searchlib/test/fakedata/fakeposting.h> +#include <vespa/searchlib/test/fakedata/fakefilterocc.h> +#include <vespa/searchlib/test/fakedata/fakeegcompr64filterocc.h> +#include <vespa/searchlib/test/fakedata/fakezcfilterocc.h> +#include <vespa/searchlib/test/fakedata/fakezcbfilterocc.h> +#include <vespa/searchlib/test/fakedata/fpfactory.h> + +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataArray; +using search::queryeval::SearchIterator; +using namespace search::fakedata; + +namespace postinglistbm +{ + +class AndStressWorker; + +class AndStressMaster +{ +private: + AndStressMaster(const AndStressMaster &); + + AndStressMaster & + operator=(const AndStressMaster &); + + search::Rand48 &_rnd; + unsigned int _numDocs; + unsigned int _commonDocFreq; + std::vector<std::string> _postingTypes; + unsigned int _loops; + unsigned int _skipCommonPairsRate; + uint32_t _stride; + bool _unpack; + + FastOS_ThreadPool *_threadPool; + std::vector<AndStressWorker *> _workers; + unsigned int _workersDone; + + FakeWordSet &_wordSet; + + std::vector<std::vector<FakePosting::SP> > _postings; + + FastOS_Cond _taskCond; + unsigned int _taskIdx; + uint32_t _numTasks; + +public: + typedef std::pair<FakePosting *, FakePosting *> Task; +private: + std::vector<Task> _tasks; +public: + AndStressMaster(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingType, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack); + + ~AndStressMaster(void); + + void + run(void); + + void + makePostingsHelper(FPFactory *postingFactory, + 
const std::string &postingFormat, + bool validate, bool verbose); + + void + dropPostings(void); + + void + dropTasks(void); + + void + resetTasks(void); // Prepare for rerun + + void + setupTasks(unsigned int numTasks); + + Task * + getTask(void); + + unsigned int + getNumDocs(void) const + { + return _numDocs; + } + + bool + getUnpack(void) const + { + return _unpack; + } + + double + runWorkers(const std::string &postingFormat); +}; + + +class AndStressWorker : public FastOS_Runnable +{ +private: + AndStressWorker(const AndStressWorker &); + + AndStressWorker & + operator=(const AndStressWorker &); + + AndStressMaster &_master; + unsigned int _id; +public: + AndStressWorker(AndStressMaster &master, unsigned int id); + + ~AndStressWorker(void); + + virtual void + Run(FastOS_ThreadInterface *thisThread, void *arg); +}; + + +template <class P> +FakePosting * +makePosting(FakeWord &fw) +{ + return new P(fw); +} + + +AndStressMaster::AndStressMaster(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack) + : _rnd(rnd), + _numDocs(numDocs), + _commonDocFreq(commonDocFreq), + _postingTypes(postingTypes), + _loops(loops), + _skipCommonPairsRate(skipCommonPairsRate), + _stride(stride), + _unpack(unpack), + _threadPool(NULL), + _workers(), + _workersDone(0), + _wordSet(wordSet), + _postings(FakeWordSet::NUM_WORDCLASSES), + _taskCond(), + _taskIdx(0), + _numTasks(numTasks), + _tasks() +{ + LOG(info, "AndStressMaster::AndStressMaster"); + + _threadPool = new FastOS_ThreadPool(128 * 1024, 400); +} + +template <class C> +static void +clearPtrVector(std::vector<C> &v) +{ + for (unsigned int i = 0; i < v.size(); ++i) + delete v[i]; + v.clear(); +} + + +AndStressMaster::~AndStressMaster(void) +{ + LOG(info, "AndStressMaster::~AndStressMaster"); + + _threadPool->Close(); + delete 
_threadPool; + _threadPool = NULL; + clearPtrVector(_workers); + dropPostings(); +} + + +void +AndStressMaster::dropPostings(void) +{ + for (unsigned int i = 0; i < _postings.size(); ++i) + _postings[i].clear(); + dropTasks(); +} + + +void +AndStressMaster::dropTasks(void) +{ + _tasks.clear(); + _taskIdx = 0; +} + + +void +AndStressMaster::resetTasks(void) +{ + _taskIdx = 0; +} + + +static void +makeSomePostings(FPFactory *postingFactory, + std::vector<FakeWord *> &w, + std::vector<FakePosting::SP> &p, + uint32_t stride, + bool validate, + bool verbose) +{ + for (unsigned int i = 0; i < w.size(); ++i) { + FakePosting::SP np(postingFactory->make(*w[i])); + if (validate) { + TermFieldMatchData md; + TermFieldMatchDataArray tfmda; + tfmda.add(&md); + + std::unique_ptr<SearchIterator> sb(np->createIterator(tfmda)); + if (np->hasWordPositions()) { + if (stride != 0) + w[i]->validate(sb.get(), tfmda, stride, verbose); + else + w[i]->validate(sb.get(), tfmda, verbose); + } else + w[i]->validate(sb.get(), verbose); + } + p.push_back(np); + } +} + +void +AndStressMaster::makePostingsHelper(FPFactory *postingFactory, + const std::string &postingFormat, + bool validate, bool verbose) +{ + FastOS_Time tv; + double before; + double after; + + tv.SetNow(); + before = tv.Secs(); + postingFactory->setup(_wordSet); + for (unsigned int i = 0; i < _wordSet._words.size(); ++i) + makeSomePostings(postingFactory, + _wordSet._words[i], _postings[i], + _stride, + validate, + verbose); + tv.SetNow(); + after = tv.Secs(); + LOG(info, + "AndStressMaster::makePostingsHelper elapsed %10.6f s for %s format", + after - before, + postingFormat.c_str()); +} + + +void +AndStressMaster::setupTasks(unsigned int numTasks) +{ + unsigned int wordclass1; + unsigned int wordclass2; + unsigned int word1idx; + unsigned int word2idx; + + for (unsigned int i = 0; i < numTasks; ++i) { + wordclass1 = _rnd.lrand48() % _postings.size(); + wordclass2 = _rnd.lrand48() % _postings.size(); + while (wordclass1 == 
FakeWordSet::COMMON_WORD && + wordclass2 == FakeWordSet::COMMON_WORD && + (_rnd.lrand48() % _skipCommonPairsRate) != 0) { + wordclass1 = _rnd.lrand48() % _postings.size(); + wordclass2 = _rnd.lrand48() % _postings.size(); + } + word1idx = _rnd.lrand48() % _postings[wordclass1].size(); + word2idx = _rnd.lrand48() % _postings[wordclass2].size(); + FakePosting::SP p1 = _postings[wordclass1][word1idx]; + FakePosting::SP p2 = _postings[wordclass2][word2idx]; + _tasks.push_back(std::make_pair(p1.get(), p2.get())); + } +} + + +AndStressMaster::Task * +AndStressMaster::getTask(void) +{ + Task *result = NULL; + _taskCond.Lock(); + if (_taskIdx < _tasks.size()) { + result = &_tasks[_taskIdx]; + ++_taskIdx; + } else { + _workersDone++; + if (_workersDone == _workers.size()) + _taskCond.Broadcast(); + } + _taskCond.Unlock(); + return result; +} + +void +AndStressMaster::run(void) +{ + LOG(info, "AndStressMaster::run"); + + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = _postingTypes.end() ; + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + std::unique_ptr<FPFactory> ff(getFPFactory(*pti, _wordSet.getSchema())); + makePostingsHelper(ff.get(), *pti, true, false); + setupTasks(_numTasks); + double totalTime = 0; + for (unsigned int loop = 0; loop < _loops; ++loop) { + totalTime += runWorkers(*pti); + resetTasks(); + } + LOG(info, "AndStressMaster::average run elapsed %10.6f s for workers %s format", + totalTime / _loops, pti->c_str()); + dropPostings(); + } + FastOS_Thread::Sleep(250); +} + + +double +AndStressMaster::runWorkers(const std::string &postingFormat) +{ + FastOS_Time tv; + double before; + double after; + + tv.SetNow(); + before = tv.Secs(); + unsigned int numWorkers = 8; + for (unsigned int i = 0; i < numWorkers; ++i) + _workers.push_back(new AndStressWorker(*this, i)); + + for (unsigned int i = 0; i < _workers.size(); ++i) + _threadPool->NewThread(_workers[i]); + _taskCond.Lock(); + while (_workersDone < 
_workers.size()) + _taskCond.Wait(); + _taskCond.Unlock(); + tv.SetNow(); + after = tv.Secs(); + LOG(info, + "AndStressMaster::run elapsed %10.6f s for workers %s format", + after - before, + postingFormat.c_str()); + clearPtrVector(_workers); + _workersDone = 0; + return after - before; +} + + +AndStressWorker::AndStressWorker(AndStressMaster &master, unsigned int id) + : _master(master), + _id(id) +{ + LOG(debug, "AndStressWorker::AndStressWorker, id=%u", id); +} + +AndStressWorker::~AndStressWorker(void) +{ + LOG(debug, "AndStressWorker::~AndStressWorker, id=%u", _id); +} + + +static int +highLevelAndPairPostingScan(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScanUnpack(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, + uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + sb1.unpack(docId); + sb2.unpack(docId); + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + +void +testFakePair(FakePosting &f1, FakePosting &f2, unsigned int numDocs, + bool unpack) 
+{ + TermFieldMatchData md1; + TermFieldMatchDataArray tfmda1; + tfmda1.add(&md1); + std::unique_ptr<SearchIterator> sb1(f1.createIterator(tfmda1)); + + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda1.add(&md2); + std::unique_ptr<SearchIterator> sb2(f2.createIterator(tfmda2)); + + int hits = 0; + uint64_t scanUnpackTime = 0; + if (unpack) + hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(), + numDocs, &scanUnpackTime); + else + hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(), + numDocs, &scanUnpackTime); +#if 0 + printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n", + f1.getName().c_str(), + f2.getName().c_str(), + hits, + scanUnpackTime); +#else + (void)hits; +#endif +} + +void +AndStressWorker::Run(FastOS_ThreadInterface *thisThread, void *arg) +{ + (void) thisThread; + (void) arg; + LOG(debug, "AndStressWorker::Run, id=%u", _id); + + bool unpack = _master.getUnpack(); + for (;;) { + AndStressMaster::Task *task = _master.getTask(); + if (task == NULL) + break; + testFakePair(*task->first, *task->second, _master.getNumDocs(), + unpack); + } +} + + +AndStress::AndStress(void) +{ + LOG(debug, "Andstress::AndStress"); +} + + +AndStress::~AndStress(void) +{ + LOG(debug, "Andstress::~AndStress"); +} + +void +AndStress::run(search::Rand48 &rnd, + FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack) +{ + LOG(debug, "Andstress::run"); + AndStressMaster master(rnd, wordSet, + numDocs, commonDocFreq, postingTypes, loops, + skipCommonPairsRate, + numTasks, + stride, + unpack); + master.run(); +} + +} diff --git a/searchlib/src/tests/postinglistbm/andstress.h b/searchlib/src/tests/postinglistbm/andstress.h new file mode 100644 index 00000000000..458866b09d5 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/andstress.h @@ -0,0 +1,43 @@ +// 
Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + + +#include <vector> +namespace search +{ +class Rand48; + +namespace fakedata +{ + +class FakeWordSet; + +} + +} + +namespace postinglistbm +{ + +class AndStress +{ +public: + AndStress(void); + + ~AndStress(void); + + void + run(search::Rand48 &rnd, + search::fakedata::FakeWordSet &wordSet, + unsigned int numDocs, + unsigned int commonDocFreq, + const std::vector<std::string> &postingTypes, + unsigned int loops, + unsigned int skipCommonPairsRate, + uint32_t numTasks, + uint32_t stride, + bool unpack); +}; + +} // namespace postinglistbm + diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp new file mode 100644 index 00000000000..fc93eb42dcd --- /dev/null +++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp @@ -0,0 +1,491 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 2002-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("postinglistbm"); +#include <vespa/searchlib/common/bitvector.h> +#include <vespa/searchlib/common/resultset.h> +#include <vespa/searchlib/util/rand48.h> +#include "andstress.h" +#include <vespa/searchlib/test/fakedata/fakeword.h> +#include <vespa/searchlib/test/fakedata/fakeposting.h> +#include <vespa/searchlib/test/fakedata/fakewordset.h> +#include <vespa/searchlib/test/fakedata/fpfactory.h> +#include <vespa/searchlib/index/docidandfeatures.h> + +using search::ResultSet; +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataArray; +using search::queryeval::SearchIterator; +using search::index::Schema; +using namespace search::fakedata; + +// needed to resolve external symbol from httpd.h on AIX +void FastS_block_usr2() {} + + +namespace postinglistbm +{ + +class PostingListBM : public FastOS_Application +{ +private: + bool _verbose; + uint32_t _numDocs; + uint32_t _commonDocFreq; + uint32_t _numWordsPerClass; + std::vector<std::string> _postingTypes; + uint32_t _loops; + unsigned int _skipCommonPairsRate; + FakeWordSet _wordSet; + uint32_t _stride; + bool _unpack; +public: + search::Rand48 _rnd; + +private: + void Usage(void); + + void + badPostingType(const std::string &postingType); + + void + testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw); +public: + PostingListBM(void); + ~PostingListBM(void); + int Main(void); +}; + + +void +PostingListBM::Usage(void) +{ + printf("postinglistbm " + "[-C <skipCommonPairsRate>] " + "[-a] " + "[-c <commonDoqFreq>] " + "[-d <numDocs>] " + "[-l <numLoops>] " + "[-s <stride>] " + "[-t <postingType>] " + "[-u] " + "[-q] " + "[-v]\n"); +} + + +void +PostingListBM::badPostingType(const std::string &postingType) +{ + printf("Bad posting list type: %s\n", postingType.c_str()); + 
printf("Supported types: "); + + std::vector<std::string> postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = postingTypes.end(); + bool first = true; + + for (pti = postingTypes.begin(); pti != ptie; ++pti) { + if (first) + first = false; + else + printf(", "); + printf("%s", pti->c_str()); + } + printf("\n"); +} + + +PostingListBM::PostingListBM(void) + : _verbose(false), + _numDocs(10000000), + _commonDocFreq(50000), + _numWordsPerClass(100), + _postingTypes(), + _loops(1), + _skipCommonPairsRate(1), + _wordSet(), + _stride(0), + _unpack(false), + _rnd() +{ +} + + +PostingListBM::~PostingListBM(void) +{ +} + + +static int +highLevelSinglePostingScan(SearchIterator &sb, uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelSinglePostingScanUnpack(SearchIterator &sb, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + sb.unpack(docId); + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScan(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if 
(sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScanUnpack(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, + uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb1.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + sb1.unpack(docId); + sb2.unpack(docId); + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +void +PostingListBM::testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw); + ff->setup(v); + FakePosting::SP f(ff->make(fw)); + + printf("%s.bitsize=%d+%d+%d+%d+%d\n", + f->getName().c_str(), + static_cast<int>(f->bitSize()), + static_cast<int>(f->l1SkipBitSize()), + static_cast<int>(f->l2SkipBitSize()), + static_cast<int>(f->l3SkipBitSize()), + static_cast<int>(f->l4SkipBitSize())); + TermFieldMatchData md; + TermFieldMatchDataArray tfmda; + tfmda.add(&md); + + std::unique_ptr<SearchIterator> sb(f->createIterator(tfmda)); + if (f->hasWordPositions()) + fw.validate(sb.get(), tfmda, _verbose); + else + fw.validate(sb.get(), _verbose); + uint64_t scanTime = 0; + uint64_t scanUnpackTime = 0; + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda2.add(&md2); + + std::unique_ptr<SearchIterator> 
sb2(f->createIterator(tfmda2)); + int hits1 = highLevelSinglePostingScan(*sb2.get(), fw.getDocIdLimit(), + &scanTime); + TermFieldMatchData md3; + TermFieldMatchDataArray tfmda3; + tfmda3.add(&md3); + + std::unique_ptr<SearchIterator> sb3(f->createIterator(tfmda3)); + int hits2 = highLevelSinglePostingScanUnpack(*sb3.get(), fw.getDocIdLimit(), + &scanUnpackTime); + printf("testFake '%s' hits1=%d, hits2=%d, scanTime=%" PRIu64 + ", scanUnpackTime=%" PRIu64 "\n", + f->getName().c_str(), + hits1, hits2, scanTime, scanUnpackTime); +} + + +void +testFakePair(const std::string &postingType, + const Schema &schema, + bool unpack, + const FakeWord &fw1, const FakeWord &fw2) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw1); + v.push_back(&fw2); + ff->setup(v); + FakePosting::SP f1(ff->make(fw1)); + FakePosting::SP f2(ff->make(fw2)); + + TermFieldMatchData md1; + TermFieldMatchDataArray tfmda1; + tfmda1.add(&md1); + std::unique_ptr<SearchIterator> sb1(f1->createIterator(tfmda1)); + + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda1.add(&md2); + std::unique_ptr<SearchIterator> sb2(f2->createIterator(tfmda2)); + + int hits = 0; + uint64_t scanUnpackTime = 0; + if (unpack) + hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + else + hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n", + f1->getName().c_str(), + f2->getName().c_str(), + hits, + scanUnpackTime); +} + + +int +PostingListBM::Main(void) +{ + int argi; + char c; + const char *optArg; + bool doandstress; + + doandstress = false; + argi = 1; + bool hasElements = false; + bool hasElementWeights = false; + bool quick = false; + + while ((c = GetOpt("C:ac:d:l:s:t:uvw:T:q", optArg, argi)) != -1) { + switch(c) { + case 'C': + _skipCommonPairsRate = atoi(optArg); + 
break; + case 'T': + if (strcmp(optArg, "single") == 0) { + hasElements = false; + hasElementWeights = false; + } else if (strcmp(optArg, "array") == 0) { + hasElements = true; + hasElementWeights = false; + } else if (strcmp(optArg, "weightedSet") == 0) { + hasElements = true; + hasElementWeights = true; + } else { + printf("Bad collection type: %s\n", optArg); + return 1; + } + break; + case 'a': + doandstress = true; + break; + case 'c': + _commonDocFreq = atoi(optArg); + break; + case 'd': + _numDocs = atoi(optArg); + break; + case 'l': + _loops = atoi(optArg); + break; + case 's': + _stride = atoi(optArg); + break; + case 't': + do { + Schema schema; + Schema::IndexField indexField("field0", + Schema::STRING, + Schema::SINGLE); + schema.addIndexField(indexField); + std::unique_ptr<FPFactory> ff(getFPFactory(optArg, schema)); + if (ff.get() == NULL) { + badPostingType(optArg); + return 1; + } + } while (0); + _postingTypes.push_back(optArg); + break; + case 'u': + _unpack = true; + break; + case 'v': + _verbose = true; + break; + case 'w': + _numWordsPerClass = atoi(optArg); + break; + case 'q': + quick = true; + _numDocs = 36000; + _commonDocFreq = 10000; + _numWordsPerClass = 5; + break; + default: + Usage(); + return 1; + } + } + + if (_commonDocFreq > _numDocs) { + Usage(); + return 1; + } + + _wordSet.setupParams(hasElements, hasElementWeights); + + uint32_t w1dfreq = 10; + uint32_t w4dfreq = 790000; + uint32_t w5dfreq = 290000; + uint32_t w4w5od = 100000; + uint32_t numTasks = 40000; + if (quick) { + w1dfreq = 2; + w4dfreq = 19000; + w5dfreq = 5000; + w4w5od = 1000; + numTasks = 40; + } + + + FakeWord word1(_numDocs, w1dfreq, w1dfreq / 2, "word1", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word2(_numDocs, 1000, 500, "word2", word1, 4, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word3(_numDocs, _commonDocFreq, _commonDocFreq / 2, + "word3", word1, 10, _rnd, + _wordSet.getFieldsParams(), 
_wordSet.getPackedIndex()); + FakeWord word4(_numDocs, w4dfreq, w4dfreq / 2, + "word4", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word5(_numDocs, w5dfreq, w5dfreq / 2, + "word5", word4, w4w5od, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + + if (_postingTypes.empty()) + _postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = _postingTypes.end() ; + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFake(*pti, _wordSet.getSchema(), word1); + testFake(*pti, _wordSet.getSchema(), word2); + testFake(*pti, _wordSet.getSchema(), word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word1, word3); + testFakePair(*pti, _wordSet.getSchema(), false, word2, word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word4, word5); + } + + if (doandstress) { + _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass); + } + if (doandstress) { + AndStress andstress; + andstress.run(_rnd, _wordSet, + _numDocs, _commonDocFreq, _postingTypes, _loops, + _skipCommonPairsRate, + numTasks, + _stride, + _unpack); + } + return 0; +} + +} // namespace postinglistbm + +int +main(int argc, char **argv) +{ + postinglistbm::PostingListBM app; + + setvbuf(stdout, NULL, _IOLBF, 32768); + app._rnd.srand48(32); + return app.Entry(argc, argv); + + return 0; +} diff --git a/searchlib/src/tests/postinglistbm/skip.txt b/searchlib/src/tests/postinglistbm/skip.txt new file mode 100644 index 00000000000..9804bce3c33 --- /dev/null +++ b/searchlib/src/tests/postinglistbm/skip.txt @@ -0,0 +1,75 @@ +B tree view: + + Leaf Nodes: segments of docid delta list + Interior Nodes: Segments of skip info lists + + Interior Nodes 1 level above leaf nodes: L1 skip info + Interior Nodes 2 level above leaf nodes: L2 skip info + +Example 
posting list, with stride 4 for L1 skip and L2 skip: + +DocIdPos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 17 18 +DocId: 1 11 21 31|41 51 61 71|81 91 101 111|121 131 141 151|161 171 181 + +(Assume continued with every 10. docid present) + +Old L1 skip info, pointing to start of leaf nodes, with first docid in +leaf node pre-decoded (i.e. containing copy of first docid entry in leaf node): + +L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 +DocId: 41 81 121 161|201 241 281 321|361 401 441 481|521 561 601 641|681 +DocIdPos: 5 9 13 17| 21 25 29 33| 37 41 45 49| 53 57 61 65| 69 + +Old L2 skip info, pointing to start of interior nodes 1 level above leaf nodes +and containing copies of previous L1 skip entry: + +L2Pos: 0 1 2 3 +DocId: 161 321 481 641 +DocIdPos: 17 33 49 65 +L1Pos: 4 8 12 16 + +Reason for change of skip info view: Avoiding null skips, simplifying code. + +Skip from docId 1 to docId 115 first skips to DocId 81 before ending +up at DocId 121. If next seek is to below 161, a null skip to docid +121 is performed since docId delta unpacking caught up with supposedly +next L1 skip docid. With L1 skip stride being N, 1/N of longer seeks +will unpack N extra docids, eating up the advantage of first docid in +leaf node being pre-decoded. + +If a seek to docId 115 is followed by a seek to docId 121, an unpack +of docId 121 and a seek to a higher docid, this causes, with the old L1 +skip info, features for docId 81, 91, 101, 111 to be decoded with the +result ignored before the features for docId 121 are decoded. For the +next seek, the null skip of DocId is also associated with a backwards +skip for features, so if the next feature to be decoded was for docId +141 then features for docId 121 will be decoded again and ignored. + +New L1 skip info, pointing to start of leaf nodes, without first docid +in leaf node pre-decoded (i.e.
containing copy of last docid entry in +previous leaf node): + +L1Pos: 0 1 2 3| 4 5 6 7| 8 9 10 11| 12 13 14 15| 16 +DocId: 31 71 111 151|191 231 271 311|351 391 431 471|511 551 591 631|671 +DocIdPos: 4 8 12 16| 20 24 28 32| 36 40 44 48| 52 56 60 64| 68 + +New L2 skip info, pointing to start of interior nodes 1 level above leaf nodes +and containing copies of previous L1 skip entry: + +L2Pos: 0 1 2 3 +DocId: 151 311 471 631 +DocIdPos: 16 32 48 64 +L1Pos: 4 8 12 16 + +1 DocId delta is unpacked when using L1 or L2 skip, to get first docid +in leaf node. With old skip info, this wasn't needed. + +With new skip info, docid delta unpacking should never catch up with +next L1 skip docid (can become equal, but that's no longer sufficient +for triggering a skip). + +For each level upwards in skip info, one extra number is needed per element in +the skip info. + +For feature position (split docid/features), one extra number is needed per +element in the skip info. |