aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.cpp
blob: 4bb6971290bf0516786df232d5d4990496eaf8cb (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "fakewordset.h"
#include "fakeword.h"
#include <vespa/vespalib/util/time.h>
#include <vespa/searchlib/bitcompression/posocc_fields_params.h>
#include <sstream>

#include <vespa/log/log.h>
LOG_SETUP(".fakewordset");

namespace search::fakedata {

using FakeWordVector = FakeWordSet::FakeWordVector;
using index::PostingListParams;
using index::SchemaUtil;
using index::schema::CollectionType;
using index::schema::DataType;

namespace {

// Forwards docIdBias to addDocIdBias() on every word held in the given vector.
void
applyDocIdBiasToVector(FakeWordVector& words, uint32_t docIdBias)
{
    for (auto it = words.begin(), last = words.end(); it != last; ++it) {
        (*it)->addDocIdBias(docIdBias);
    }
}

}

// Default construction is the same as requesting neither elements nor
// element weights, so hand over to the two-argument constructor.
FakeWordSet::FakeWordSet()
    : FakeWordSet(false, false)
{
}

FakeWordSet::FakeWordSet(bool hasElements,
                         bool hasElementWeights)
    : _words(NUM_WORDCLASSES),
      _schema(),
      _fieldsParams(),
      _numDocs(0)
{
    setupParams(hasElements, hasElementWeights);
}

// Compiler-generated destructor, defined out of line in this translation unit.
// NOTE(review): presumably kept here so FakeWord is a complete type when the
// owned words are destroyed - confirm against the header.
FakeWordSet::~FakeWordSet() = default;

void
FakeWordSet::setupParams(bool hasElements,
                         bool hasElementWeights)
{
    _schema.clear();

    assert(hasElements || !hasElementWeights);
    Schema::CollectionType collectionType(CollectionType::SINGLE);
    if (hasElements) {
        if (hasElementWeights) {
            collectionType = CollectionType::WEIGHTEDSET;
        } else {
            collectionType = CollectionType::ARRAY;
        }
    }
    Schema::IndexField indexField("field0", DataType::STRING, collectionType);
    indexField.setAvgElemLen(512u);
    _schema.addIndexField(indexField);
    _fieldsParams.resize(_schema.getNumIndexFields());
    SchemaUtil::IndexIterator it(_schema);
    for(; it.isValid(); ++it) {
        _fieldsParams[it.getIndex()].
            setSchemaParams(_schema, it.getIndex());
    }
}

/**
 * Convenience overload of setupWords() that uses a document frequency of
 * 1000 for medium words and 10 for rare words; every other argument is
 * forwarded unchanged.
 */
void
FakeWordSet::setupWords(vespalib::Rand48 &rnd,
                        uint32_t numDocs,
                        uint32_t commonDocFreq,
                        uint32_t numWordsPerWordClass)
{
    constexpr uint32_t defaultMediumDocFreq = 1000;
    constexpr uint32_t defaultRareDocFreq = 10;
    setupWords(rnd, numDocs, commonDocFreq,
               defaultMediumDocFreq, defaultRareDocFreq,
               numWordsPerWordClass);
}

/**
 * Append numWordsPerWordClass fake words to each of the common, medium and
 * rare word classes and record numDocs in _numDocs.
 *
 * The i-th word created in a class (counting from 1) is named
 * "<class><i>", e.g. "common1", "medium1", "rare1". Per iteration the words
 * are constructed in the order common, medium, rare, so the sequence of
 * draws from rnd is the same for a given seed.
 *
 * Every word is created against the last entry of _fieldsParams.
 *
 * @param rnd                   random generator handed to each FakeWord.
 * @param numDocs               passed to each FakeWord and stored in _numDocs.
 * @param commonDocFreq         document frequency for the common words.
 * @param mediumDocFreq         document frequency for the medium words.
 * @param rareDocFreq           document frequency for the rare words.
 * @param numWordsPerWordClass  number of words to add to each class.
 */
void
FakeWordSet::setupWords(vespalib::Rand48 &rnd,
                        uint32_t numDocs,
                        uint32_t commonDocFreq,
                        uint32_t mediumDocFreq,
                        uint32_t rareDocFreq,
                        uint32_t numWordsPerWordClass)
{
    const std::string common = "common";
    const std::string medium = "medium";
    const std::string rare = "rare";
    _numDocs = numDocs;

    LOG(info, "enter setupWords");
    vespalib::Timer tv;

    // size() - 1 on an empty vector would wrap around to a huge index;
    // setupParams() is expected to have added at least one field.
    assert(!_fieldsParams.empty());
    const uint32_t packedIndex = static_cast<uint32_t>(_fieldsParams.size() - 1);
    auto& fieldsParams = _fieldsParams[packedIndex];

    auto& commonWords = _words[COMMON_WORD];
    auto& mediumWords = _words[MEDIUM_WORD];
    auto& rareWords = _words[RARE_WORD];

    // The final sizes are known up front, so grow each vector once.
    commonWords.reserve(commonWords.size() + numWordsPerWordClass);
    mediumWords.reserve(mediumWords.size() + numWordsPerWordClass);
    rareWords.reserve(rareWords.size() + numWordsPerWordClass);

    for (uint32_t i = 0; i < numWordsPerWordClass; ++i) {
        // Word names are numbered from 1; std::to_string formats plain
        // decimal digits without constructing a stream per iteration.
        const std::string suffix = std::to_string(i + 1);

        commonWords.push_back(std::make_unique<FakeWord>(numDocs, commonDocFreq, commonDocFreq / 2,
                                                         common + suffix, rnd,
                                                         fieldsParams,
                                                         packedIndex));

        mediumWords.push_back(std::make_unique<FakeWord>(numDocs, mediumDocFreq, mediumDocFreq / 2,
                                                         medium + suffix, rnd,
                                                         fieldsParams,
                                                         packedIndex));

        rareWords.push_back(std::make_unique<FakeWord>(numDocs, rareDocFreq, rareDocFreq / 2,
                                                       rare + suffix, rnd,
                                                       fieldsParams,
                                                       packedIndex));
    }

    LOG(info, "leave setupWords, elapsed %10.6f s", vespalib::to_s(tv.elapsed()));
}

int
FakeWordSet::getNumWords() const
{
    int ret = 0;
    for (const auto& words : _words) {
        ret += words.size();
    }
    return ret;
}

void
FakeWordSet::addDocIdBias(uint32_t docIdBias)
{
    for (auto& words : _words) {
        applyDocIdBiasToVector(words, docIdBias);
    }
}

}