about | summary | refs | log | tree | commit | diff | stats
path: root/searchlib/src
diff options
context:
space:
mode:
author: Geir Storli <geirst@verizonmedia.com> 2019-05-08 07:40:36 +0000
committer: Geir Storli <geirst@verizonmedia.com> 2019-05-08 07:40:36 +0000
commit: fd554ae1f32a636009a2bebf5039410ea0acc984 (patch)
tree: 5aad24e0b9d913101c5c61660da524003cd95c12 /searchlib/src
parent: 0c379910129e489b8bbd3ecde16690120068a669 (diff)
Make it possible to specify doc freq for medium and rare words.
Diffstat (limited to 'searchlib/src')
-rw-r--r-- searchlib/src/tests/postinglistbm/andstress.cpp | 12
-rw-r--r-- searchlib/src/tests/postinglistbm/andstress.h | 2
-rw-r--r-- searchlib/src/tests/postinglistbm/postinglistbm.cpp | 18
-rw-r--r-- searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp | 31
-rw-r--r-- searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h | 16
5 files changed, 53 insertions, 26 deletions
diff --git a/searchlib/src/tests/postinglistbm/andstress.cpp b/searchlib/src/tests/postinglistbm/andstress.cpp
index adca7892464..fcc234ef4e0 100644
--- a/searchlib/src/tests/postinglistbm/andstress.cpp
+++ b/searchlib/src/tests/postinglistbm/andstress.cpp
@@ -39,7 +39,6 @@ private:
search::Rand48 &_rnd;
unsigned int _numDocs;
- unsigned int _commonDocFreq;
std::vector<std::string> _postingTypes;
unsigned int _loops;
unsigned int _skipCommonPairsRate;
@@ -68,8 +67,6 @@ private:
public:
AndStressMaster(search::Rand48 &rnd,
FakeWordSet &wordSet,
- unsigned int numDocs,
- unsigned int commonDocFreq,
const std::vector<std::string> &postingType,
unsigned int loops,
unsigned int skipCommonPairsRate,
@@ -117,8 +114,6 @@ makePosting(FakeWord &fw)
AndStressMaster::AndStressMaster(search::Rand48 &rnd,
FakeWordSet &wordSet,
- unsigned int numDocs,
- unsigned int commonDocFreq,
const std::vector<std::string> &postingTypes,
unsigned int loops,
unsigned int skipCommonPairsRate,
@@ -126,8 +121,7 @@ AndStressMaster::AndStressMaster(search::Rand48 &rnd,
uint32_t stride,
bool unpack)
: _rnd(rnd),
- _numDocs(numDocs),
- _commonDocFreq(commonDocFreq),
+ _numDocs(wordSet.numDocs()),
_postingTypes(postingTypes),
_loops(loops),
_skipCommonPairsRate(skipCommonPairsRate),
@@ -399,8 +393,6 @@ AndStress::~AndStress()
void
AndStress::run(search::Rand48 &rnd,
FakeWordSet &wordSet,
- unsigned int numDocs,
- unsigned int commonDocFreq,
const std::vector<std::string> &postingTypes,
unsigned int loops,
unsigned int skipCommonPairsRate,
@@ -410,7 +402,7 @@ AndStress::run(search::Rand48 &rnd,
{
LOG(debug, "Andstress::run");
AndStressMaster master(rnd, wordSet,
- numDocs, commonDocFreq, postingTypes, loops,
+ postingTypes, loops,
skipCommonPairsRate,
numTasks,
stride,
diff --git a/searchlib/src/tests/postinglistbm/andstress.h b/searchlib/src/tests/postinglistbm/andstress.h
index fdc99fb42ba..7a7fe2f66f2 100644
--- a/searchlib/src/tests/postinglistbm/andstress.h
+++ b/searchlib/src/tests/postinglistbm/andstress.h
@@ -21,8 +21,6 @@ public:
void run(search::Rand48 &rnd,
search::fakedata::FakeWordSet &wordSet,
- unsigned int numDocs,
- unsigned int commonDocFreq,
const std::vector<std::string> &postingTypes,
unsigned int loops,
unsigned int skipCommonPairsRate,
diff --git a/searchlib/src/tests/postinglistbm/postinglistbm.cpp b/searchlib/src/tests/postinglistbm/postinglistbm.cpp
index 0a6f99ede11..890ca8fd6eb 100644
--- a/searchlib/src/tests/postinglistbm/postinglistbm.cpp
+++ b/searchlib/src/tests/postinglistbm/postinglistbm.cpp
@@ -30,6 +30,8 @@ class PostingListBM : public FastOS_Application {
private:
uint32_t _numDocs;
uint32_t _commonDocFreq;
+ uint32_t _mediumDocFreq;
+ uint32_t _rareDocFreq;
uint32_t _numWordsPerClass;
std::vector<std::string> _postingTypes;
uint32_t _loops;
@@ -54,6 +56,8 @@ usage()
"[-C <skipCommonPairsRate>] "
"[-T {string, array, weightedSet}] "
"[-c <commonDoqFreq>] "
+ "[-m <mediumDoqFreq>] "
+ "[-r <rareDoqFreq>] "
"[-d <numDocs>] "
"[-l <numLoops>] "
"[-s <stride>] "
@@ -84,6 +88,8 @@ badPostingType(const std::string &postingType)
PostingListBM::PostingListBM()
: _numDocs(10000000),
_commonDocFreq(50000),
+ _mediumDocFreq(1000),
+ _rareDocFreq(10),
_numWordsPerClass(100),
_postingTypes(),
_loops(1),
@@ -109,7 +115,7 @@ PostingListBM::Main()
bool hasElementWeights = false;
bool quick = false;
- while ((c = GetOpt("C:c:d:l:s:t:uw:T:q", optArg, argi)) != -1) {
+ while ((c = GetOpt("C:c:m:r:d:l:s:t:uw:T:q", optArg, argi)) != -1) {
switch(c) {
case 'C':
_skipCommonPairsRate = atoi(optArg);
@@ -132,6 +138,12 @@ PostingListBM::Main()
case 'c':
_commonDocFreq = atoi(optArg);
break;
+ case 'm':
+ _mediumDocFreq = atoi(optArg);
+ break;
+ case 'r':
+ _rareDocFreq = atoi(optArg);
+ break;
case 'd':
_numDocs = atoi(optArg);
break;
@@ -190,11 +202,11 @@ PostingListBM::Main()
_postingTypes = getPostingTypes();
}
- _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _numWordsPerClass);
+ _wordSet.setupWords(_rnd, _numDocs, _commonDocFreq, _mediumDocFreq, _rareDocFreq, _numWordsPerClass);
AndStress andstress;
andstress.run(_rnd, _wordSet,
- _numDocs, _commonDocFreq, _postingTypes, _loops,
+ _postingTypes, _loops,
_skipCommonPairsRate,
numTasks,
_stride,
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp
index 5c87bf88e9c..09c8e86d979 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp
@@ -31,7 +31,8 @@ applyDocIdBiasToVector(FakeWordVector& words, uint32_t docIdBias)
FakeWordSet::FakeWordSet()
: _words(NUM_WORDCLASSES),
_schema(),
- _fieldsParams()
+ _fieldsParams(),
+ _numDocs(0)
{
setupParams(false, false);
}
@@ -40,7 +41,8 @@ FakeWordSet::FakeWordSet(bool hasElements,
bool hasElementWeights)
: _words(NUM_WORDCLASSES),
_schema(),
- _fieldsParams()
+ _fieldsParams(),
+ _numDocs(0)
{
setupParams(hasElements, hasElementWeights);
}
@@ -75,9 +77,20 @@ FakeWordSet::setupParams(bool hasElements,
void
FakeWordSet::setupWords(search::Rand48 &rnd,
- unsigned int numDocs,
- unsigned int commonDocFreq,
- unsigned int numWordsPerWordClass)
+ uint32_t numDocs,
+ uint32_t commonDocFreq,
+ uint32_t numWordsPerWordClass)
+{
+ setupWords(rnd, numDocs, commonDocFreq, 1000, 10, numWordsPerWordClass);
+}
+
+void
+FakeWordSet::setupWords(search::Rand48 &rnd,
+ uint32_t numDocs,
+ uint32_t commonDocFreq,
+ uint32_t mediumDocFreq,
+ uint32_t rareDocFreq,
+ uint32_t numWordsPerWordClass)
{
std::string common = "common";
std::string medium = "medium";
@@ -86,11 +99,13 @@ FakeWordSet::setupWords(search::Rand48 &rnd,
double before;
double after;
+ _numDocs = numDocs;
+
LOG(info, "enter setupWords");
tv.SetNow();
before = tv.Secs();
uint32_t packedIndex = _fieldsParams.size() - 1;
- for (unsigned int i = 0; i < numWordsPerWordClass; ++i) {
+ for (uint32_t i = 0; i < numWordsPerWordClass; ++i) {
std::ostringstream vi;
vi << (i + 1);
@@ -99,12 +114,12 @@ FakeWordSet::setupWords(search::Rand48 &rnd,
_fieldsParams[packedIndex],
packedIndex));
- _words[MEDIUM_WORD].push_back(std::make_unique<FakeWord>(numDocs, 1000, 500,
+ _words[MEDIUM_WORD].push_back(std::make_unique<FakeWord>(numDocs, mediumDocFreq, mediumDocFreq / 2,
medium + vi.str(), rnd,
_fieldsParams[packedIndex],
packedIndex));
- _words[RARE_WORD].push_back(std::make_unique<FakeWord>(numDocs, 10, 5,
+ _words[RARE_WORD].push_back(std::make_unique<FakeWord>(numDocs, rareDocFreq, rareDocFreq / 2,
rare + vi.str(), rnd,
_fieldsParams[packedIndex],
packedIndex));
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h
index 0b7ee4db6fe..d404c664a34 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h
@@ -32,6 +32,7 @@ private:
std::vector<FakeWordVector> _words;
Schema _schema;
std::vector<PosOccFieldsParams> _fieldsParams;
+ uint32_t _numDocs;
public:
FakeWordSet();
@@ -45,9 +46,16 @@ public:
bool hasElementWeights);
void setupWords(search::Rand48 &rnd,
- unsigned int numDocs,
- unsigned int commonDocFreq,
- unsigned int numWordsPerWordClass);
+ uint32_t numDocs,
+ uint32_t commonDocFreq,
+ uint32_t numWordsPerWordClass);
+
+ void setupWords(search::Rand48 &rnd,
+ uint32_t numDocs,
+ uint32_t commonDocFreq,
+ uint32_t mediumDocFreq,
+ uint32_t rareDocFreq,
+ uint32_t numWordsPerWordClass);
const std::vector<FakeWordVector>& words() const { return _words; }
@@ -69,6 +77,8 @@ public:
return _schema;
}
+ uint32_t numDocs() const { return _numDocs; }
+
void addDocIdBias(uint32_t docIdBias);
};