aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/diskindex/bitvectorfile.h
blob: f6316bd7db7fd13e13653dcad74b741156610676 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include "bitvectoridxfile.h"
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/searchlib/common/tunefileinfo.h>
#include <vespa/vespalib/stllike/string.h>
#include <vespa/vespalib/stllike/allocator.h>
#include <vector>

class Fast_BufferedFile;

namespace search::diskindex {

/*
 * Writer that extends BitVectorIdxFileWrite with an additional owned
 * Fast_BufferedFile (_datFile) and the header length recorded for that
 * file (_datHeaderLen).
 *
 * Instances can be neither copied nor moved.
 */
class BitVectorFileWrite : public BitVectorIdxFileWrite
{
private:
    using Parent = BitVectorIdxFileWrite;
    // Owned handle for the data file.
    std::unique_ptr<Fast_BufferedFile> _datFile;
    // NOTE(review): presumably the size of the data file header as written
    // by makeDatHeader() - confirm against the implementation file.
    uint32_t                           _datHeaderLen;

public:
    // Copy and move are disabled.  The move operations are declared with a
    // plain (non-const) rvalue reference, which is the canonical signature
    // for move constructors and move assignment operators.
    BitVectorFileWrite(const BitVectorFileWrite &) = delete;
    BitVectorFileWrite(BitVectorFileWrite &&) = delete;
    BitVectorFileWrite& operator=(const BitVectorFileWrite &) = delete;
    BitVectorFileWrite& operator=(BitVectorFileWrite &&) = delete;
    BitVectorFileWrite(BitVectorKeyScope scope);
    ~BitVectorFileWrite() override;

    /*
     * Override of the base class open().
     *
     * name              - file name (NOTE(review): possibly a base name
     *                     that suffixes are appended to - confirm).
     * docIdLimit        - document id limit for the bitvectors written.
     * tuneFileWrite     - tuning parameters for sequential writes.
     * fileHeaderContext - context used when producing file headers.
     */
    void open(const vespalib::string &name, uint32_t docIdLimit,
            const TuneFileSeqWrite &tuneFileWrite,
            const common::FileHeaderContext &fileHeaderContext) override;


    // NOTE(review): presumably writes the given bitvector as the entry for
    // word number wordNum - implementation is not visible in this header.
    void addWordSingle(uint64_t wordNum, const BitVector &bitVector);

    // Overrides of the base class life cycle operations.
    void flush() override;
    void sync() override;
    void close() override;

    // Header handling for the data file; defined out of line.
    void makeDatHeader(const common::FileHeaderContext &fileHeaderContext);
    void updateDatHeader(uint64_t fileBitSize);
};


/*
 * Buffer document ids for a candidate bitvector.
 *
 * Document ids are collected in an array for as long as fewer than
 * _bitVectorLimit of them have been added.  The add() call made when
 * _numDocs has reached _bitVectorLimit moves the buffered ids into the
 * bitvector, and every later id is set directly in the bitvector.
 */
class BitVectorCandidate
{
private:
    std::vector<uint32_t, vespalib::allocator_large<uint32_t>> _array;
    BitVector::UP  _bv;
    uint64_t       _numDocs;
    const uint32_t _bitVectorLimit;

public:
    /*
     * Create an empty candidate with a bitvector sized by docIdLimit and
     * array storage reserved for bitVectorLimit document ids.
     */
    BitVectorCandidate(uint32_t docIdLimit, uint32_t bitVectorLimit)
        : _array(),
          _bv(BitVector::create(docIdLimit)),
          _numDocs(0u),
          _bitVectorLimit(bitVectorLimit)
    {
        _array.reserve(bitVectorLimit);
    }

    /*
     * Create an empty candidate, using the limit that
     * BitVectorFileWrite::getBitVectorLimit() returns for docIdLimit.
     */
    BitVectorCandidate(uint32_t docIdLimit)
        : BitVectorCandidate(docIdLimit, BitVectorFileWrite::getBitVectorLimit(docIdLimit))
    { }

    ~BitVectorCandidate();

    /*
     * Drop all buffered document ids.  The bitvector is only cleared when
     * it has been populated, i.e. when the limit has been crossed.
     */
    void clear() {
        const bool bitVectorPopulated = getCrossedBitVectorLimit();
        if (__builtin_expect(bitVectorPopulated, false)) {
            _bv->clear();
        }
        _array.clear();
        _numDocs = 0;
    }

    /*
     * Merge the buffered document ids into obv and then reset this
     * candidate to the empty state.
     */
    void flush(BitVector &obv) {
        if (__builtin_expect(!getCrossedBitVectorLimit(), true)) {
            // Common case: the ids are still held in the array.
            for (uint32_t buffered : _array) {
                obv.setBit(buffered);
            }
        } else {
            obv.orWith(*_bv);
        }
        clear();
    }

    /*
     * Register one more document id.
     */
    void add(uint32_t docId) {
        if (_numDocs < _bitVectorLimit) {
            // Still below the limit: keep the id in the array.
            _array.push_back(docId);
            ++_numDocs;
            return;
        }
        if (__builtin_expect(_numDocs == _bitVectorLimit, false)) {
            // Switch-over point: move the buffered ids into the bitvector.
            for (uint32_t buffered : _array) {
                _bv->setBit(buffered);
            }
            _array.clear();
        }
        _bv->setBit(docId);
        ++_numDocs;
    }

    /*
     * Number of document ids added since the last clear().  Duplicates,
     * if any, are counted.
     */
    uint64_t getNumDocs() const { return _numDocs; }

    /*
     * True when no document id has been added since the last clear().
     */
    bool empty() const { return _numDocs == 0u; }

    /*
     * True when more than _bitVectorLimit document ids have been added,
     * which means the ids now live in the bitvector instead of the array.
     */
    bool getCrossedBitVectorLimit() const {
        return _bitVectorLimit < _numDocs;
    }

    /*
     * Access the underlying bitvector.
     */
    BitVector &getBitVector() { return *_bv; }
};

}