diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-05-08 07:40:36 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2019-05-08 07:40:36 +0000 |
commit | fd554ae1f32a636009a2bebf5039410ea0acc984 (patch) | |
tree | 5aad24e0b9d913101c5c61660da524003cd95c12 /searchlib/src | |
parent | 0c379910129e489b8bbd3ecde16690120068a669 (diff) |
Make it possible to specify doc freq for medium and rare words.
Diffstat (limited to 'searchlib/src')
5 files changed, 53 insertions, 26 deletions
diff --git a/searchlib/src/tests/postinglistbm/andstress.cpp b/searchlib/src/tests/postinglistbm/andstress.cpp index adca7892464..fcc234ef4e0 100644 --- a/searchlib/src/tests/postinglistbm/andstress.cpp +++ b/searchlib/src/tests/postinglistbm/andstress.cpp @@ -39,7 +39,6 @@ private: search::Rand48 &_rnd; unsigned int _numDocs; - unsigned int _commonDocFreq; std::vector<std::string> _postingTypes; unsigned int _loops; unsigned int _skipCommonPairsRate; @@ -68,8 +67,6 @@ private: public: AndStressMaster(search::Rand48 &rnd, FakeWordSet &wordSet, - unsigned int numDocs, - unsigned int commonDocFreq, const std::vector<std::string> &postingType, unsigned int loops, unsigned int skipCommonPairsRate, @@ -117,8 +114,6 @@ makePosting(FakeWord &fw) AndStressMaster::AndStressMaster(search::Rand48 &rnd, FakeWordSet &wordSet, - unsigned int numDocs, - unsigned int commonDocFreq, const std::vector<std::string> &postingTypes, unsigned int loops, unsigned int skipCommonPairsRate, @@ -126,8 +121,7 @@ AndStressMaster::AndStressMaster(search::Rand48 &rnd, uint32_t stride, bool unpack) : _rnd(rnd), - _numDocs(numDocs), - _commonDocFreq(commonDocFreq), + _numDocs(wordSet.numDocs()), _postingTypes(postingTypes), _loops(loops), _skipCommonPairsRate(skipCommonPairsRate), @@ -399,8 +393,6 @@ AndStress::~AndStress() void AndStress::run(search::Rand48 &rnd, FakeWordSet &wordSet, - unsigned int numDocs, - unsigned int commonDocFreq, const std::vector<std::string> &postingTypes, unsigned int loops, unsigned int skipCommonPairsRate, @@ -410,7 +402,7 @@ AndStress::run(search::Rand48 &rnd, { LOG(debug, "Andstress::run"); AndStressMaster master(rnd, wordSet, - numDocs, commonDocFreq, postingTypes, loops, + postingTypes, loops, skipCommonPairsRate, numTasks, stride, diff --git a/searchlib/src/tests/postinglistbm/andstress.h b/searchlib/src/tests/postinglistbm/andstress.h index fdc99fb42ba..7a7fe2f66f2 100644 --- a/searchlib/src/tests/postinglistbm/andstress.h +++ b/searchlib/src/tests/postinglistbm/andstress.h @@ -21,8 +21,6 @@ public: void run(search::Rand48 &rnd, search::fakedata::FakeWordSet &wordSet, - unsigned int numDocs, - unsigned int commonDocFreq, const std::vector<std::string> &postingTypes, unsigned int loops, unsigned int skipCommonPairsRate, diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp index 0a6f99ede11..890ca8fd6eb 100644 --- a/searchlib/src/tests/postinglistbm/postinglistbm.cpp +++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp @@ -30,6 +30,8 @@ class PostingListBM : public FastOS_Application { private: uint32_t _numDocs; uint32_t _commonDocFreq; + uint32_t _mediumDocFreq; + uint32_t _rareDocFreq; uint32_t _numWordsPerClass; std::vector<std::string> _postingTypes; uint32_t _loops; @@ -54,6 +56,8 @@ usage() "[-C <skipCommonPairsRate>] " "[-T {string, array, weightedSet}] " "[-c <commonDoqFreq>] " + "[-m <mediumDoqFreq>] " + "[-r <rareDoqFreq>] " "[-d <numDocs>] " "[-l <numLoops>] " "[-s <stride>] " @@ -84,6 +88,8 @@ badPostingType(const std::string &postingType) PostingListBM::PostingListBM() : _numDocs(10000000), _commonDocFreq(50000), + _mediumDocFreq(1000), + _rareDocFreq(10), _numWordsPerClass(100), _postingTypes(), _loops(1), @@ -109,7 +115,7 @@ PostingListBM::Main() bool hasElementWeights = false; bool quick = false; - while ((c = GetOpt("C:c:d:l:s:t:uw:T:q", optArg, argi)) != -1) { + while ((c = GetOpt("C:c:m:r:d:l:s:t:uw:T:q", optArg, argi)) != -1) { switch(c) { case 'C': _skipCommonPairsRate = atoi(optArg); @@ -132,6 +138,12 @@ PostingListBM::Main() case 'c': _commonDocFreq = atoi(optArg); break; + case 'm': + _mediumDocFreq = atoi(optArg); + break; + case 'r': + _rareDocFreq = atoi(optArg); + break; case 'd': _numDocs = atoi(optArg); break; @@ -190,11 +202,11 @@ PostingListBM::Main() _postingTypes = getPostingTypes(); } - _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass); + _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _mediumDocFreq, _rareDocFreq, _numWordsPerClass); AndStress andstress; andstress.run(_rnd, _wordSet, - _numDocs, _commonDocFreq, _postingTypes, _loops, + _postingTypes, _loops, _skipCommonPairsRate, numTasks, _stride, diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp index 5c87bf88e9c..09c8e86d979 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp @@ -31,7 +31,8 @@ applyDocIdBiasToVector(FakeWordVector& words, uint32_t docIdBias) FakeWordSet::FakeWordSet() : _words(NUM_WORDCLASSES), _schema(), - _fieldsParams() + _fieldsParams(), + _numDocs(0) { setupParams(false, false); } @@ -40,7 +41,8 @@ FakeWordSet::FakeWordSet(bool hasElements, bool hasElementWeights) : _words(NUM_WORDCLASSES), _schema(), - _fieldsParams() + _fieldsParams(), + _numDocs(0) { setupParams(hasElements, hasElementWeights); } @@ -75,9 +77,20 @@ FakeWordSet::setupParams(bool hasElements, void FakeWordSet::setupWords(search::Rand48 &rnd, - unsigned int numDocs, - unsigned int commonDocFreq, - unsigned int numWordsPerWordClass) + uint32_t numDocs, + uint32_t commonDocFreq, + uint32_t numWordsPerWordClass) +{ + setupWords(rnd, numDocs, commonDocFreq, 1000, 10, numWordsPerWordClass); +} + +void +FakeWordSet::setupWords(search::Rand48 &rnd, + uint32_t numDocs, + uint32_t commonDocFreq, + uint32_t mediumDocFreq, + uint32_t rareDocFreq, + uint32_t numWordsPerWordClass) { std::string common = "common"; std::string medium = "medium"; @@ -86,11 +99,13 @@ FakeWordSet::setupWords(search::Rand48 &rnd, double before; double after; + _numDocs = numDocs; + LOG(info, "enter setupWords"); tv.SetNow(); before = tv.Secs(); uint32_t packedIndex = _fieldsParams.size() - 1; - for (unsigned int i = 0; i < numWordsPerWordClass; ++i) { + for (uint32_t i = 0; i < numWordsPerWordClass; ++i) { std::ostringstream vi; vi << (i + 1); @@ -99,12 +114,12 @@ FakeWordSet::setupWords(search::Rand48 &rnd, _fieldsParams[packedIndex], packedIndex)); - _words[MEDIUM_WORD].push_back(std::make_unique<FakeWord>(numDocs, 1000, 500, + _words[MEDIUM_WORD].push_back(std::make_unique<FakeWord>(numDocs, mediumDocFreq, mediumDocFreq / 2, medium + vi.str(), rnd, _fieldsParams[packedIndex], packedIndex)); - _words[RARE_WORD].push_back(std::make_unique<FakeWord>(numDocs, 10, 5, + _words[RARE_WORD].push_back(std::make_unique<FakeWord>(numDocs, rareDocFreq, rareDocFreq / 2, rare + vi.str(), rnd, _fieldsParams[packedIndex], packedIndex)); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h index 0b7ee4db6fe..d404c664a34 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h @@ -32,6 +32,7 @@ private: std::vector<FakeWordVector> _words; Schema _schema; std::vector<PosOccFieldsParams> _fieldsParams; + uint32_t _numDocs; public: FakeWordSet(); @@ -45,9 +46,16 @@ public: bool hasElementWeights); void setupWords(search::Rand48 &rnd, - unsigned int numDocs, - unsigned int commonDocFreq, - unsigned int numWordsPerWordClass); + uint32_t numDocs, + uint32_t commonDocFreq, + uint32_t numWordsPerWordClass); + + void setupWords(search::Rand48 &rnd, + uint32_t numDocs, + uint32_t commonDocFreq, + uint32_t mediumDocFreq, + uint32_t rareDocFreq, + uint32_t numWordsPerWordClass); const std::vector<FakeWordVector>& words() const { return _words; } @@ -69,6 +77,8 @@ public: return _schema; } + uint32_t numDocs() const { return _numDocs; } + void addDocIdBias(uint32_t docIdBias); }; |