aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/test/fakedata/fakewordset.h
blob: c6646f2e61fe438c99833f319efd62c1613ae4c1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <cstdint>
#include <memory>
#include <vector>
#include <vespa/searchlib/bitcompression/compression.h>
#include <vespa/searchlib/bitcompression/posocccompression.h>
#include <vespa/searchlib/bitcompression/posocc_fields_params.h>
#include <vespa/searchcommon/common/schema.h>


// Forward declaration only: this header refers to vespalib::Rand48 solely by reference (setupWords()).
namespace vespalib { class Rand48; }

namespace search::fakedata {

// Forward declaration: this header only holds FakeWord through std::unique_ptr (FakeWordSet::FakeWordPtr).
class FakeWord;

/**
 * Contains lists of fake words for 3 word classes categorized based on number of occurrences.
 *
 * The set owns its words (std::unique_ptr<FakeWord>) and is therefore not copyable.
 * Besides the words it carries a Schema and a list of PosOccFieldsParams.
 *
 * NOTE(review): the constructors, setupParams(), setupWords(), getNumWords() and
 * addDocIdBias() are defined out of line; their descriptions below are based on the
 * declarations in this header only and should be confirmed against the implementation.
 */
class FakeWordSet {
public:
    using PosOccFieldsParams = bitcompression::PosOccFieldsParams;
    using Schema = index::Schema;
    using FakeWordPtr = std::unique_ptr<FakeWord>;
    using FakeWordVector = std::vector<FakeWordPtr>;

    /**
     * Word classes. NUM_WORDCLASSES is the number of real classes (3), not a class itself.
     * Presumably used as index into words() -- confirm against the implementation.
     */
    enum {
        COMMON_WORD,
        MEDIUM_WORD,
        RARE_WORD,
        NUM_WORDCLASSES,
    };

private:
    std::vector<FakeWordVector> _words;            // owned fake words, one vector per word class
    Schema _schema;                                // exposed through getSchema()
    std::vector<PosOccFieldsParams> _fieldsParams; // exposed through getAllFieldsParams()
    uint32_t _numDocs;                             // exposed through numDocs()

public:
    FakeWordSet();

    /**
     * @param hasElements       same meaning as for setupParams()
     * @param hasElementWeights same meaning as for setupParams()
     */
    FakeWordSet(bool hasElements,
                bool hasElementWeights);

    // Owns std::unique_ptr<FakeWord> elements, so copying can never work.
    // Deleting the copy operations explicitly gives a clear diagnostic instead of
    // an error from deep inside std::vector when a copy is attempted.
    FakeWordSet(const FakeWordSet &) = delete;
    FakeWordSet &operator=(const FakeWordSet &) = delete;

    // Defined out of line: FakeWord is only forward declared in this header.
    ~FakeWordSet();

    /**
     * Set up the schema / field parameters.
     *
     * @param hasElements       presumably whether the fake field has multiple elements
     * @param hasElementWeights presumably whether the elements carry weights
     */
    void setupParams(bool hasElements,
                     bool hasElementWeights);

    /**
     * Generate the fake words; only the document frequency of the common word
     * class is given (presumably the other classes are derived from it).
     *
     * @param rnd                  random number generator used while generating words
     * @param numDocs              number of documents
     * @param commonDocFreq        document frequency for common words
     * @param numWordsPerWordClass number of words to generate per word class
     */
    void setupWords(vespalib::Rand48 &rnd,
                    uint32_t numDocs,
                    uint32_t commonDocFreq,
                    uint32_t numWordsPerWordClass);

    /**
     * Generate the fake words with an explicit document frequency per word class.
     *
     * @param rnd                  random number generator used while generating words
     * @param numDocs              number of documents
     * @param commonDocFreq        document frequency for common words
     * @param mediumDocFreq        document frequency for medium words
     * @param rareDocFreq          document frequency for rare words
     * @param numWordsPerWordClass number of words to generate per word class
     */
    void setupWords(vespalib::Rand48 &rnd,
                    uint32_t numDocs,
                    uint32_t commonDocFreq,
                    uint32_t mediumDocFreq,
                    uint32_t rareDocFreq,
                    uint32_t numWordsPerWordClass);

    /** @return the word lists, one FakeWordVector per word class. */
    const std::vector<FakeWordVector>& words() const { return _words; }

    /** @return number of words (defined out of line). */
    int getNumWords() const;

    /**
     * @return the last entry of the field parameter list.
     *
     * Precondition: the field parameter list is non-empty; calling back() on an
     * empty vector is undefined behaviour.
     */
    const PosOccFieldsParams& getFieldsParams() const {
        return _fieldsParams.back();
    }

    /**
     * @return index of the last entry of the field parameter list, i.e. the
     *         entry returned by getFieldsParams().
     *
     * Precondition: the field parameter list is non-empty; otherwise the
     * unsigned subtraction wraps around.
     */
    uint32_t getPackedIndex() const {
        // size() is size_t; make the narrowing to uint32_t explicit.
        return static_cast<uint32_t>(_fieldsParams.size() - 1);
    }

    /** @return all field parameter entries. */
    const std::vector<PosOccFieldsParams>& getAllFieldsParams() const {
        return _fieldsParams;
    }

    /** @return the schema held by this word set. */
    const Schema& getSchema() const {
        return _schema;
    }

    /** @return the stored number of documents. */
    uint32_t numDocs() const { return _numDocs; }

    /**
     * Apply a doc id bias (defined out of line).
     *
     * @param docIdBias presumably an offset added to the doc ids of the words -- confirm
     */
    void addDocIdBias(uint32_t docIdBias);
};

}