aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/memoryindex/compact_words_store.h
blob: 41e1ed8c18c85d97c95eac1fb114e6550f887c62 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <vespa/vespalib/datastore/datastore.h>
#include <vespa/vespalib/datastore/entryref.h>
#include <vespa/vespalib/util/array.h>
#include <vespa/vespalib/util/memoryusage.h>
#include <vespa/vespalib/stllike/hash_map.h>
#include <atomic>

namespace search::memoryindex {

/**
 * Class used to store the {wordRef, docId} tuples that are inserted into a FieldIndex and its posting lists.
 *
 * These tuples are later used when removing all traces of a document from the posting lists in that index.
 *
 * Structure, as declared below:
 *  - a Builder collects the word references for one docId,
 *  - a Store holds inserted builders and hands back an EntryRef for each,
 *  - a hash map (_docs) maps docId to such an EntryRef,
 *  - an Iterator walks the word references belonging to one docId.
 */
class CompactWordsStore {
public:
    /**
     * Builder used to collect all words (as wordRefs) for a docId in a field.
     *
     * The collected references are exposed through words() and consumed by
     * Store::insert() / CompactWordsStore::insert().
     */
    class Builder {
    public:
        using UP = std::unique_ptr<Builder>;
        using WordRefVector = vespalib::Array<vespalib::datastore::EntryRef>;

    private:
        uint32_t   _docId;        // document this builder collects words for
        WordRefVector _words;     // word references collected so far

    public:
        /**
         * Create a builder for the given document id.
         *
         * NOTE(review): this single-argument constructor is not marked explicit,
         * so a uint32_t converts implicitly to a Builder.
         */
        Builder(uint32_t docId_);
        ~Builder();
        /**
         * Add a word reference to this builder.
         *
         * @return reference to a Builder (presumably *this, to allow chaining - confirm in the .cpp).
         */
        Builder &insert(vespalib::datastore::EntryRef wordRef);
        /** Document id this builder was created for. */
        uint32_t docId() const { return _docId; }
        /** Read-only view of the word references held by this builder. */
        const WordRefVector &words() const { return _words; }
    };

    /**
     * Iterator over all words (as wordRefs) for a docId in a field.
     *
     * Typical use is a loop of the form
     *   for (auto itr = store.get(docId); itr.valid(); ++itr) { use(itr.wordRef()); }
     */
    class Iterator {
    private:
        const uint32_t *_buf;            // non-owning pointer into the backing buffer; nullptr if there is none
        uint32_t        _remainingWords; // presumably the number of words not yet visited - confirm in the .cpp
        uint32_t        _wordRef;        // raw value of the current word reference
        bool            _valid;          // whether the iterator currently points at a word

        // Helper for stepping to the next word; defined out of line.
        inline void nextWord();

    public:
        /** Create an iterator that is not constructed from a buffer. */
        Iterator();
        /**
         * Create an iterator reading from the given buffer.
         *
         * The expected buffer layout is defined by the implementation and is not
         * visible in this header.
         *
         * NOTE(review): this single-argument constructor is not marked explicit,
         * so a const uint32_t * converts implicitly to an Iterator.
         */
        Iterator(const uint32_t *buf);
        /** True while the iterator points at a word reference. */
        bool valid() const { return _valid; }
        /** Step to the next word reference. */
        Iterator &operator++();
        /** Current word reference, wrapped as an EntryRef. */
        vespalib::datastore::EntryRef wordRef() const { return vespalib::datastore::EntryRef(_wordRef); }
        /** True if this iterator has a non-null backing buffer. */
        bool hasBackingBuf() const { return _buf != nullptr; }
    };

    /**
     * Store for all unique words (as wordRefs) among all documents.
     *
     * Wraps a data store with a single uint32_t buffer type.
     */
    class Store {
    public:
        using DataStoreType = vespalib::datastore::DataStoreT<vespalib::datastore::EntryRefT<22>>;
        using RefType = DataStoreType::RefType;

    private:
        DataStoreType               _store;   // underlying data store
        vespalib::datastore::BufferType<uint32_t> _type; // buffer type describing uint32_t entries
        const uint32_t              _typeId;  // id of _type; presumably assigned when _type is registered in _store - confirm

    public:
        Store();
        ~Store();
        /**
         * Store the content of the given builder.
         *
         * @return reference that can later be passed to get().
         */
        vespalib::datastore::EntryRef insert(const Builder &builder);
        /**
         * Create an iterator for a reference.
         *
         * NOTE(review): the parameter is named wordRef, but it is presumably a
         * reference previously returned by insert() - confirm against the .cpp.
         */
        Iterator get(vespalib::datastore::EntryRef wordRef) const;
        /** Memory usage as reported by the underlying data store. */
        vespalib::MemoryUsage getMemoryUsage() const { return _store.getMemoryUsage(); }
    };

    // Maps a document id to an EntryRef value.
    using DocumentWordsMap = vespalib::hash_map<uint32_t, vespalib::datastore::EntryRef>;

private:
    DocumentWordsMap _docs;                    // docId -> EntryRef
    // Byte counters for _docs, kept as atomics.
    // NOTE(review): presumably so getMemoryUsage() can be called from a thread
    // other than the one mutating _docs - confirm.
    std::atomic<size_t> _docs_used_bytes;
    std::atomic<size_t> _docs_allocated_bytes;
    Store            _wordsStore;              // backing store for the per-document word references

    // Presumably refreshes the two _docs_*_bytes counters from _docs - confirm in the .cpp.
    void update_docs_memory_usage();

public:
    CompactWordsStore();
    ~CompactWordsStore();
    /** Insert the words collected in the builder for the builder's docId. */
    void insert(const Builder &builder);
    /** Remove the entry for the given docId. */
    void remove(uint32_t docId);
    /**
     * Get an iterator over the words stored for the given docId.
     *
     * Check Iterator::valid() before use; behavior for an unknown docId is
     * defined in the .cpp (not visible here).
     */
    Iterator get(uint32_t docId) const;
    // NOTE(review): the effect of commit() is not visible in this header - see the .cpp.
    void commit();
    /** Memory usage of this store. */
    vespalib::MemoryUsage getMemoryUsage() const;
};

}