diff options
Diffstat (limited to 'searchlib/src/tests/postinglistbm/postinglistbm.cpp')
-rw-r--r-- | searchlib/src/tests/postinglistbm/postinglistbm.cpp | 491 |
1 files changed, 491 insertions, 0 deletions
diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp new file mode 100644 index 00000000000..fc93eb42dcd --- /dev/null +++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp @@ -0,0 +1,491 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2002-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("postinglistbm"); +#include <vespa/searchlib/common/bitvector.h> +#include <vespa/searchlib/common/resultset.h> +#include <vespa/searchlib/util/rand48.h> +#include "andstress.h" +#include <vespa/searchlib/test/fakedata/fakeword.h> +#include <vespa/searchlib/test/fakedata/fakeposting.h> +#include <vespa/searchlib/test/fakedata/fakewordset.h> +#include <vespa/searchlib/test/fakedata/fpfactory.h> +#include <vespa/searchlib/index/docidandfeatures.h> + +using search::ResultSet; +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataArray; +using search::queryeval::SearchIterator; +using search::index::Schema; +using namespace search::fakedata; + +// needed to resolve external symbol from httpd.h on AIX +void FastS_block_usr2() {} + + +namespace postinglistbm +{ + +class PostingListBM : public FastOS_Application +{ +private: + bool _verbose; + uint32_t _numDocs; + uint32_t _commonDocFreq; + uint32_t _numWordsPerClass; + std::vector<std::string> _postingTypes; + uint32_t _loops; + unsigned int _skipCommonPairsRate; + FakeWordSet _wordSet; + uint32_t _stride; + bool _unpack; +public: + search::Rand48 _rnd; + +private: + void Usage(void); + + void + badPostingType(const std::string &postingType); + + void + testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw); +public: + PostingListBM(void); + ~PostingListBM(void); + int Main(void); +}; + + +void +PostingListBM::Usage(void) +{ + printf("postinglistbm " + "[-C <skipCommonPairsRate>] " + "[-a] " + "[-c <commonDoqFreq>] " + "[-d <numDocs>] " + "[-l <numLoops>] " + "[-s <stride>] " + "[-t <postingType>] " + "[-u] " + "[-q] " + "[-v]\n"); +} + + +void +PostingListBM::badPostingType(const std::string &postingType) +{ + printf("Bad posting list type: %s\n", postingType.c_str()); + printf("Supported types: "); + + std::vector<std::string> postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = postingTypes.end(); + bool first = true; + + for (pti = postingTypes.begin(); pti != ptie; ++pti) { + if (first) + first = false; + else + printf(", "); + printf("%s", pti->c_str()); + } + printf("\n"); +} + + +PostingListBM::PostingListBM(void) + : _verbose(false), + _numDocs(10000000), + _commonDocFreq(50000), + _numWordsPerClass(100), + _postingTypes(), + _loops(1), + _skipCommonPairsRate(1), + _wordSet(), + _stride(0), + _unpack(false), + _rnd() +{ +} + + +PostingListBM::~PostingListBM(void) +{ +} + + +static int +highLevelSinglePostingScan(SearchIterator &sb, uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelSinglePostingScanUnpack(SearchIterator &sb, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb.initFullRange(); + uint32_t docId = sb.getDocId(); + while (docId < numDocs) { + if (sb.seek(docId)) { + ++hits; + sb.unpack(docId); + ++docId; + } else if (docId < sb.getDocId()) + docId= sb.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScan(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb2.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +static int +highLevelAndPairPostingScanUnpack(SearchIterator &sb1, + SearchIterator &sb2, + uint32_t numDocs, + uint64_t *cycles) +{ + uint32_t hits = 0; + uint64_t before = fastos::ClockSystem::now(); + sb1.initFullRange(); + sb1.initFullRange(); + uint32_t docId = sb1.getDocId(); + while (docId < numDocs) { + if (sb1.seek(docId)) { + if (sb2.seek(docId)) { + ++hits; + sb1.unpack(docId); + sb2.unpack(docId); + ++docId; + } else if (docId < sb2.getDocId()) + docId = sb2.getDocId(); + else + ++docId; + } else if (docId < sb1.getDocId()) + docId= sb1.getDocId(); + else + ++docId; + } + uint64_t after = fastos::ClockSystem::now(); + *cycles = after - before; + return hits; +} + + +void +PostingListBM::testFake(const std::string &postingType, + const Schema &schema, + const FakeWord &fw) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw); + ff->setup(v); + FakePosting::SP f(ff->make(fw)); + + printf("%s.bitsize=%d+%d+%d+%d+%d\n", + f->getName().c_str(), + static_cast<int>(f->bitSize()), + static_cast<int>(f->l1SkipBitSize()), + static_cast<int>(f->l2SkipBitSize()), + static_cast<int>(f->l3SkipBitSize()), + static_cast<int>(f->l4SkipBitSize())); + TermFieldMatchData md; + TermFieldMatchDataArray tfmda; + tfmda.add(&md); + + std::unique_ptr<SearchIterator> sb(f->createIterator(tfmda)); + if (f->hasWordPositions()) + fw.validate(sb.get(), tfmda, _verbose); + else + fw.validate(sb.get(), _verbose); + uint64_t scanTime = 0; + uint64_t scanUnpackTime = 0; + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda2.add(&md2); + + std::unique_ptr<SearchIterator> sb2(f->createIterator(tfmda2)); + int hits1 = highLevelSinglePostingScan(*sb2.get(), fw.getDocIdLimit(), + &scanTime); + TermFieldMatchData md3; + TermFieldMatchDataArray tfmda3; + tfmda3.add(&md3); + + std::unique_ptr<SearchIterator> sb3(f->createIterator(tfmda3)); + int hits2 = highLevelSinglePostingScanUnpack(*sb3.get(), fw.getDocIdLimit(), + &scanUnpackTime); + printf("testFake '%s' hits1=%d, hits2=%d, scanTime=%" PRIu64 + ", scanUnpackTime=%" PRIu64 "\n", + f->getName().c_str(), + hits1, hits2, scanTime, scanUnpackTime); +} + + +void +testFakePair(const std::string &postingType, + const Schema &schema, + bool unpack, + const FakeWord &fw1, const FakeWord &fw2) +{ + std::unique_ptr<FPFactory> ff(getFPFactory(postingType, schema)); + std::vector<const FakeWord *> v; + v.push_back(&fw1); + v.push_back(&fw2); + ff->setup(v); + FakePosting::SP f1(ff->make(fw1)); + FakePosting::SP f2(ff->make(fw2)); + + TermFieldMatchData md1; + TermFieldMatchDataArray tfmda1; + tfmda1.add(&md1); + std::unique_ptr<SearchIterator> sb1(f1->createIterator(tfmda1)); + + TermFieldMatchData md2; + TermFieldMatchDataArray tfmda2; + tfmda1.add(&md2); + std::unique_ptr<SearchIterator> sb2(f2->createIterator(tfmda2)); + + int hits = 0; + uint64_t scanUnpackTime = 0; + if (unpack) + hits = highLevelAndPairPostingScanUnpack(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + else + hits = highLevelAndPairPostingScan(*sb1.get(), *sb2.get(), + fw1.getDocIdLimit(), &scanUnpackTime); + printf("Fakepair %s AND %s => %d hits, %" PRIu64 " cycles\n", + f1->getName().c_str(), + f2->getName().c_str(), + hits, + scanUnpackTime); +} + + +int +PostingListBM::Main(void) +{ + int argi; + char c; + const char *optArg; + bool doandstress; + + doandstress = false; + argi = 1; + bool hasElements = false; + bool hasElementWeights = false; + bool quick = false; + + while ((c = GetOpt("C:ac:d:l:s:t:uvw:T:q", optArg, argi)) != -1) { + switch(c) { + case 'C': + _skipCommonPairsRate = atoi(optArg); + break; + case 'T': + if (strcmp(optArg, "single") == 0) { + hasElements = false; + hasElementWeights = false; + } else if (strcmp(optArg, "array") == 0) { + hasElements = true; + hasElementWeights = false; + } else if (strcmp(optArg, "weightedSet") == 0) { + hasElements = true; + hasElementWeights = true; + } else { + printf("Bad collection type: %s\n", optArg); + return 1; + } + break; + case 'a': + doandstress = true; + break; + case 'c': + _commonDocFreq = atoi(optArg); + break; + case 'd': + _numDocs = atoi(optArg); + break; + case 'l': + _loops = atoi(optArg); + break; + case 's': + _stride = atoi(optArg); + break; + case 't': + do { + Schema schema; + Schema::IndexField indexField("field0", + Schema::STRING, + Schema::SINGLE); + schema.addIndexField(indexField); + std::unique_ptr<FPFactory> ff(getFPFactory(optArg, schema)); + if (ff.get() == NULL) { + badPostingType(optArg); + return 1; + } + } while (0); + _postingTypes.push_back(optArg); + break; + case 'u': + _unpack = true; + break; + case 'v': + _verbose = true; + break; + case 'w': + _numWordsPerClass = atoi(optArg); + break; + case 'q': + quick = true; + _numDocs = 36000; + _commonDocFreq = 10000; + _numWordsPerClass = 5; + break; + default: + Usage(); + return 1; + } + } + + if (_commonDocFreq > _numDocs) { + Usage(); + return 1; + } + + _wordSet.setupParams(hasElements, hasElementWeights); + + uint32_t w1dfreq = 10; + uint32_t w4dfreq = 790000; + uint32_t w5dfreq = 290000; + uint32_t w4w5od = 100000; + uint32_t numTasks = 40000; + if (quick) { + w1dfreq = 2; + w4dfreq = 19000; + w5dfreq = 5000; + w4w5od = 1000; + numTasks = 40; + } + + + FakeWord word1(_numDocs, w1dfreq, w1dfreq / 2, "word1", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word2(_numDocs, 1000, 500, "word2", word1, 4, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word3(_numDocs, _commonDocFreq, _commonDocFreq / 2, + "word3", word1, 10, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word4(_numDocs, w4dfreq, w4dfreq / 2, + "word4", _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + FakeWord word5(_numDocs, w5dfreq, w5dfreq / 2, + "word5", word4, w4w5od, _rnd, + _wordSet.getFieldsParams(), _wordSet.getPackedIndex()); + + if (_postingTypes.empty()) + _postingTypes = getPostingTypes(); + std::vector<std::string>::const_iterator pti; + std::vector<std::string>::const_iterator ptie = _postingTypes.end() ; + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFake(*pti, _wordSet.getSchema(), word1); + testFake(*pti, _wordSet.getSchema(), word2); + testFake(*pti, _wordSet.getSchema(), word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word1, word3); + testFakePair(*pti, _wordSet.getSchema(), false, word2, word3); + } + + for (pti = _postingTypes.begin(); pti != ptie; ++pti) { + testFakePair(*pti, _wordSet.getSchema(), false, word4, word5); + } + + if (doandstress) { + _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass); + } + if (doandstress) { + AndStress andstress; + andstress.run(_rnd, _wordSet, + _numDocs, _commonDocFreq, _postingTypes, _loops, + _skipCommonPairsRate, + numTasks, + _stride, + _unpack); + } + return 0; +} + +} // namespace postinglistbm + +int +main(int argc, char **argv) +{ + postinglistbm::PostingListBM app; + + setvbuf(stdout, NULL, _IOLBF, 32768); + app._rnd.srand48(32); + return app.Entry(argc, argv); + + return 0; +} |