aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
blob: 0e7f4a2145f2020ef1a53bba31065d7bd98ed0fa (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include "zcbuf.h"
#include <vespa/searchlib/bitcompression/compression.h>
#include <vector>

// Forward declarations: this header only refers to these types by reference
// (member reference, constructor parameter and setter parameter), so their
// full definitions are not needed here.
namespace search::index {
class PostingListCounts;
class PostingListParams;
}

namespace search::diskindex {

/*
 * Base class for writing posting lists that might have basic skip info.
 *
 * Holds state shared by posting list writers: tuning parameters
 * (chunk/skip thresholds, document id limit), the buffered per-document
 * entries for the current word, the document id delta buffer, four levels
 * of skip buffers (L1-L4) and two write contexts.
 *
 * The constructor and destructor are protected and copy/move operations
 * are deleted, so the class can only be used as a (non-copyable,
 * non-movable) base class and cannot be destroyed through a base pointer
 * by outside code.
 */
class Zc4PostingWriterBase
{
public:
    /*
     * One buffered entry in _docIds: a document id together with the
     * field length, number of occurrences and features size recorded
     * for that document.
     */
    struct DocIdAndFeatureSize {
        uint32_t _doc_id;
        uint32_t _field_length;
        uint32_t _num_occs;
        uint32_t _features_size;
        // Plain member-wise initialization; cannot throw.
        DocIdAndFeatureSize(uint32_t doc_id, uint32_t field_length, uint32_t num_occs, uint32_t features_size) noexcept
            : _doc_id(doc_id),
              _field_length(field_length),
              _num_occs(num_occs),
              _features_size(features_size)
        {
        }
    };
protected:
    // NOTE: member declaration order determines initialization order for
    // the out-of-line constructor; do not reorder without updating it.
    uint32_t _minChunkDocs; // # of documents needed for chunking
    uint32_t _minSkipDocs;  // # of documents needed for skipping
    uint32_t _docIdLimit;   // Limit for document ids (docId < docIdLimit)

    // Unpacked document ids for word and feature sizes
    std::vector<DocIdAndFeatureSize> _docIds;

    uint64_t _featureOffset;        // Bit offset of next feature
    uint64_t _writePos; // Bit position for start of current word
    bool _dynamicK;     // Calculate EG compression parameters ?
    bool _encode_interleaved_features; // Encode interleaved features ?
    ZcBuf _zcDocIds;    // Document id deltas
    ZcBuf _l1Skip;      // L1 skip info
    ZcBuf _l2Skip;      // L2 skip info
    ZcBuf _l3Skip;      // L3 skip info
    ZcBuf _l4Skip;      // L4 skip info

    uint64_t _numWords; // Number of words in file
    // Held by reference: the referenced counts object is owned elsewhere
    // and must outlive this writer.
    index::PostingListCounts &_counts;
    search::ComprFileWriteContext _writeContext;
    search::ComprFileWriteContext _featureWriteContext;

    // Non-copyable and non-movable.
    Zc4PostingWriterBase(const Zc4PostingWriterBase &) = delete;
    Zc4PostingWriterBase(Zc4PostingWriterBase &&) = delete;
    Zc4PostingWriterBase &operator=(const Zc4PostingWriterBase &) = delete;
    Zc4PostingWriterBase &operator=(Zc4PostingWriterBase &&) = delete;
    // explicit: a PostingListCounts reference must never be implicitly
    // converted into a writer base. Defined out of line; presumably binds
    // _counts to counts -- confirm in the implementation file.
    explicit Zc4PostingWriterBase(index::PostingListCounts &counts);
    // Protected and non-virtual: derived objects are not meant to be
    // destroyed through a pointer to this base class.
    ~Zc4PostingWriterBase();
    // Defined out of line. NOTE(review): presumably fills the L1-L4 skip
    // buffers from the buffered _docIds, taking feature sizes into account
    // when encode_features is true -- confirm in the implementation file.
    void calc_skip_info(bool encode_features);
    // Defined out of line. NOTE(review): presumably resets the buffers
    // filled by calc_skip_info() -- confirm in the implementation file.
    void clear_skip_info();

public:
    // Accessors for the two write contexts (returned by mutable reference).
    ComprFileWriteContext &get_write_context() { return _writeContext; }
    ComprFileWriteContext &get_feature_write_context() { return _featureWriteContext; }
    // Read-only accessors for the tuning parameters and counters above.
    uint32_t get_min_chunk_docs() const { return _minChunkDocs; }
    uint32_t get_min_skip_docs() const { return _minSkipDocs; }
    uint32_t get_docid_limit() const { return _docIdLimit; }
    uint64_t get_num_words() const { return _numWords; }
    bool get_dynamic_k() const { return _dynamicK; }
    bool get_encode_interleaved_features() const { return _encode_interleaved_features; }
    // Setters for the two boolean encoding switches.
    void set_dynamic_k(bool dynamicK) { _dynamicK = dynamicK; }
    void set_encode_interleaved_features(bool encode_interleaved_features) { _encode_interleaved_features = encode_interleaved_features; }
    // Defined out of line. NOTE(review): presumably updates the tuning
    // members from params -- confirm which keys are read in the
    // implementation file.
    void set_posting_list_params(const index::PostingListParams &params);
};

}