searchlib/src/vespa/searchlib/memoryindex/memory_index.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include <vespa/searchcommon/common/schema.h>
#include <vespa/vespalib/util/idestructorcallback.h>
#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/queryeval/searchable.h>
#include <vespa/vespalib/stllike/hash_set.h>
#include <vespa/vespalib/util/memoryusage.h>
#include <atomic>
#include <mutex>

// Forward declarations: the types below are only used by reference in this header,
// so their full definitions do not have to be included here.
namespace search::index {
    class IFieldLengthInspector;
    class IndexBuilder;
}

namespace vespalib { class ISequencedTaskExecutor; }
namespace vespalib::slime { struct Cursor; }
namespace document { class Document; }

namespace search::memoryindex {

// Forward declarations of the components that MemoryIndex holds through std::unique_ptr members.
class DocumentInverterCollection;
class DocumentInverterContext;
class FieldIndexCollection;

/**
 * In-memory index over a set of text and uri fields, built on lock-free B-Trees in the underlying components.
 *
 * Every field is handled by its own FieldIndex, which holds the posting lists for all unique words in that field.
 *
 * Feeding documents into (or out of) the underlying field indexes is a two-step process:
 *   1) Call the async functions insertDocument() / removeDocuments().
 *      These add tasks to the 'invert threads' executor that invert / remove the fields of the documents.
 *   2) Call the async function commit().
 *      This adds tasks to the 'push threads' executor that push the changes into the field indexes.
 *      A completion callback is signaled once the commit has completed.
 *
 * Searching for a given term in a given field is done via createBlueprint().
 */
class MemoryIndex : public queryeval::Searchable {
private:
    using ISequencedTaskExecutor = vespalib::ISequencedTaskExecutor;
    using LidVector = std::vector<uint32_t>;
    using OnWriteDoneType = const std::shared_ptr<vespalib::IDestructorCallback> &;

    index::Schema _schema;
    ISequencedTaskExecutor &_invertThreads;
    ISequencedTaskExecutor &_pushThreads;
    std::unique_ptr<FieldIndexCollection> _fieldIndexes;
    std::unique_ptr<DocumentInverterContext> _inverter_context;
    std::unique_ptr<DocumentInverterCollection> _inverters;
    bool _frozen;
    uint32_t _maxDocId;
    std::atomic<uint32_t> _numDocs;
    mutable std::mutex _lock;
    std::vector<bool> _hiddenFields;
    index::Schema::SP _prunedSchema;
    vespalib::hash_set<uint32_t> _indexedDocs; // documents in memory index
    const uint64_t _staticMemoryFootprint;

    // Raises _maxDocId to docId when docId is larger; a smaller or equal docId leaves it untouched.
    void updateMaxDocId(uint32_t docId) {
        if (docId <= _maxDocId) {
            return;
        }
        _maxDocId = docId;
    }

    // Bumps the document counter by one.
    // This is a relaxed load followed by a relaxed store, not an atomic read-modify-write.
    // NOTE(review): presumably relies on a single writer thread - confirm against the callers.
    void incNumDocs() {
        const auto current = _numDocs.load(std::memory_order_relaxed);
        _numDocs.store(current + 1, std::memory_order_relaxed);
    }

    // Lowers the document counter by one, never going below zero.
    // Same relaxed load / relaxed store scheme as incNumDocs().
    void decNumDocs() {
        const auto current = _numDocs.load(std::memory_order_relaxed);
        if (current == 0) {
            return;
        }
        _numDocs.store(current - 1, std::memory_order_relaxed);
    }

public:
    using UP = std::unique_ptr<MemoryIndex>;
    using SP = std::shared_ptr<MemoryIndex>;

    /**
     * Construct a memory index from the given schema.
     *
     * @param schema        the schema telling which text and uri fields to keep in the index.
     * @param inspector     the inspector used to look up initial field length info for all index fields.
     * @param invertThreads the executor whose threads do the document inverting.
     * @param pushThreads   the executor whose threads push changes (inverted documents)
     *                      into the corresponding field indexes.
     */
    MemoryIndex(const index::Schema& schema,
                const index::IFieldLengthInspector& inspector,
                ISequencedTaskExecutor& invertThreads,
                ISequencedTaskExecutor& pushThreads);

    // Neither copyable nor movable.
    MemoryIndex(const MemoryIndex &) = delete;
    MemoryIndex(MemoryIndex &&) = delete;
    MemoryIndex &operator=(const MemoryIndex &) = delete;
    MemoryIndex &operator=(MemoryIndex &&) = delete;
    ~MemoryIndex() override;

    /** Returns the schema stored in this index. */
    const index::Schema &getSchema() const { return _schema; }

    /** Returns the current value of the frozen flag (see freeze()). */
    bool isFrozen() const { return _frozen; }

    /**
     * Insert a document into the underlying field indexes.
     *
     * An older version of the document that is already in the index will be removed first.
     * This function is async; the change does not take effect until commit() is called.
     */
    void insertDocument(uint32_t docId, const document::Document &doc, OnWriteDoneType on_write_done);

    /**
     * Remove the documents with the given local document ids from the underlying field indexes.
     *
     * This function is async; the change does not take effect until commit() is called.
     */
    void removeDocuments(LidVector lids);

    /**
     * Commit all inserts and removes done since the previous commit, making them searchable.
     *
     * Once the commit has completed, 'on_write_done' goes out of scope, which schedules the completion callback.
     */
    void commit(OnWriteDoneType on_write_done);

    /**
     * Freeze this index.
     *
     * Index updates arriving after this point will be discarded.
     * Extra information kept to wash the posting lists will be discarded as well.
     */
    void freeze();

    /**
     * Dump the contents of this index into the given index builder.
     */
    void dump(index::IndexBuilder &indexBuilder);

    // Implements Searchable (single field variant).
    std::unique_ptr<queryeval::Blueprint> createBlueprint(const queryeval::IRequestContext & requestContext,
                                                          const queryeval::FieldSpec &field,
                                                          const query::Node &term) override;

    // Implements Searchable (field list variant) by forwarding straight to the base class implementation.
    std::unique_ptr<queryeval::Blueprint> createBlueprint(const queryeval::IRequestContext & requestContext,
                                                          const queryeval::FieldSpecList &fields,
                                                          const query::Node &term) override
    {
        return queryeval::Searchable::createBlueprint(requestContext, fields, term);
    }

    /** Returns one past the highest document id recorded in _maxDocId; used to get the docId range. */
    virtual uint32_t getDocIdLimit() const {
        return _maxDocId + 1;
    }

    /** Returns the current value of the document counter (relaxed atomic load). */
    virtual uint32_t getNumDocs() const {
        return _numDocs.load(std::memory_order_relaxed);
    }

    virtual uint64_t getNumWords() const;

    void pruneRemovedFields(const index::Schema &schema);

    index::Schema::SP getPrunedSchema() const;

    /**
     * Returns an approximation of how much memory the index uses.
     */
    vespalib::MemoryUsage getMemoryUsage() const;

    /** Returns the static memory footprint value stored at construction time. */
    uint64_t getStaticMemoryFootprint() const { return _staticMemoryFootprint; }

    index::FieldLengthInfo get_field_length_info(const vespalib::string& field_name) const;

    void insert_write_context_state(vespalib::slime::Cursor& object) const;
};

}