searchcore/src/vespa/searchcore/proton/metrics/documentdb_tagged_metrics.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "documentdb_tagged_metrics.h"
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/util/exceptions.h>

#include <vespa/log/log.h>
LOG_SETUP(".proton.metrics.documentdb_tagged_metrics");

namespace proton {

using matching::MatchingStats;

// Sets up the "job" metric set: one child metric per job kind, followed by
// the "total" metric. Every child is constructed with this set as its parent
// argument; `parent` is forwarded to the MetricSet base.
DocumentDBTaggedMetrics::JobMetrics::JobMetrics(metrics::MetricSet* parent)
    : MetricSet("job", {},
                "Job load average for various jobs in a document database",
                parent),
      attributeFlush("attribute_flush", {},
                     "Flushing of attribute vector(s) to disk",
                     this),
      memoryIndexFlush("memory_index_flush", {},
                       "Flushing of memory index to disk",
                       this),
      diskIndexFusion("disk_index_fusion", {},
                      "Fusion of disk indexes",
                      this),
      documentStoreFlush("document_store_flush", {},
                         "Flushing of document store to disk",
                         this),
      documentStoreCompact("document_store_compact", {},
                           "Compaction of document store on disk",
                           this),
      bucketMove("bucket_move", {},
                 "Moving of buckets between 'ready' and 'notready' sub databases",
                 this),
      lidSpaceCompact("lid_space_compact", {},
                      "Compaction of lid space in document meta store and attribute vectors",
                      this),
      removedDocumentsPrune("removed_documents_prune", {},
                            "Pruning of removed documents in 'removed' sub database",
                            this),
      total("total", {},
            "The job load average total of all job metrics",
            this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::JobMetrics::~JobMetrics() = default;

// Creates the metric set for one document sub database.
//   name   - name given to this metric set (passed straight to MetricSet).
//   parent - forwarded to the MetricSet base constructor.
// The lid space, document store and attribute children are each constructed
// with this set as their parent argument.
DocumentDBTaggedMetrics::SubDBMetrics::SubDBMetrics(const vespalib::string &name,
                                                    MetricSet *parent)
    : MetricSet(name, {}, "Sub database metrics", parent),
      lidSpace(this),
      documentStore(this),
      attributes(this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::SubDBMetrics::~SubDBMetrics() = default;

// Creates the "lid_space" metric set with its six child metrics. The two
// factor metrics carry their defining formula in the description text:
//   lid_bloat_factor         = (lid_limit - used_lids) / lid_limit
//   lid_fragmentation_factor = (highest_used_lid - used_lids) / highest_used_lid
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::SubDBMetrics::LidSpaceMetrics::LidSpaceMetrics(MetricSet *parent)
    : MetricSet("lid_space", {},
                "Local document id (lid) space metrics for this document sub DB",
                parent),
      lidLimit("lid_limit", {},
               "The size of the allocated lid space",
               this),
      usedLids("used_lids", {},
               "The number of lids used",
               this),
      lowestFreeLid("lowest_free_lid", {},
                    "The lowest free lid",
                    this),
      highestUsedLid("highest_used_lid", {},
                     "The highest used lid",
                     this),
      lidBloatFactor("lid_bloat_factor", {},
                     "The bloat factor of this lid space, indicating the total amount of holes in the allocated lid space "
                     "((lid_limit - used_lids) / lid_limit)",
                     this),
      lidFragmentationFactor("lid_fragmentation_factor", {},
                             "The fragmentation factor of this lid space, indicating the amount of holes in the currently used part of the lid space "
                             "((highest_used_lid - used_lids) / highest_used_lid)",
                             this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::SubDBMetrics::LidSpaceMetrics::~LidSpaceMetrics() = default;

// Creates the "cache" metric set describing the document store cache:
// memory usage, element count, hit rate, lookups and invalidations.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::SubDBMetrics::DocumentStoreMetrics::CacheMetrics::CacheMetrics(MetricSet *parent)
    : MetricSet("cache", {},
                "Document store cache metrics",
                parent),
      memoryUsage("memory_usage", {},
                  "Memory usage of the cache (in bytes)",
                  this),
      elements("elements", {},
               "Number of elements in the cache",
               this),
      hitRate("hit_rate", {},
              "Rate of hits in the cache compared to number of lookups",
              this),
      lookups("lookups", {},
              "Number of lookups in the cache (hits + misses)",
              this),
      invalidations("invalidations", {},
                    "Number of invalidations (erased elements) in the cache. ",
                    this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::SubDBMetrics::DocumentStoreMetrics::CacheMetrics::~CacheMetrics() = default;

// Creates the "document_store" metric set: disk usage, disk bloat and max
// bucket spread metrics, plus the nested memory usage and cache children.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::SubDBMetrics::DocumentStoreMetrics::DocumentStoreMetrics(MetricSet *parent)
    : MetricSet("document_store", {},
                "Document store metrics for this document sub DB",
                parent),
      diskUsage("disk_usage", {},
                "Disk space usage in bytes",
                this),
      diskBloat("disk_bloat", {},
                "Disk space bloat in bytes",
                this),
      maxBucketSpread("max_bucket_spread", {},
                      "Max bucket spread in underlying files (sum(unique buckets in each chunk)/unique buckets in file)",
                      this),
      memoryUsage(this),
      cache(this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::SubDBMetrics::DocumentStoreMetrics::~DocumentStoreMetrics() = default;

// Creates the "attribute" metric set for the document db; its two children
// (resource usage and total memory usage) are constructed with this set as
// their parent argument. `parent` is forwarded to the MetricSet base.
DocumentDBTaggedMetrics::AttributeMetrics::AttributeMetrics(MetricSet *parent)
    : MetricSet("attribute", {},
                "Attribute vector metrics for this document db",
                parent),
      resourceUsage(this),
      totalMemoryUsage(this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::AttributeMetrics::~AttributeMetrics() = default;

// Creates the "resource_usage" metric set for attribute vectors. Per the
// description strings, address_space is a ratio in [0, 1] and feeding_blocked
// is a 0/1 flag. `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::AttributeMetrics::ResourceUsageMetrics::ResourceUsageMetrics(MetricSet *parent)
    : MetricSet("resource_usage", {},
                "Metrics for various attribute vector resources usage",
                parent),
      address_space("address_space", {},
                    "The max relative address space used among "
                    "components in all attribute vectors in this document db (value in the range [0, 1])",
                    this),
      feedingBlocked("feeding_blocked", {},
                     "Whether feeding is blocked due to attribute resource limits being reached (value is either 0 or 1)",
                     this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::AttributeMetrics::ResourceUsageMetrics::~ResourceUsageMetrics() = default;

// Creates the "index" metric set: disk usage, the nested memory usage child
// and the number of documents in the memory index.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::IndexMetrics::IndexMetrics(MetricSet *parent)
    : MetricSet("index", {},
                "Index metrics (memory and disk) for this document db",
                parent),
      diskUsage("disk_usage", {},
                "Disk space usage in bytes",
                this),
      memoryUsage(this),
      docsInMemory("docs_in_memory", {},
                   "Number of documents in memory index",
                   this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::IndexMetrics::~IndexMetrics() = default;

// Folds one MatchingStats snapshot into the document-db level matching
// metrics: each counter is incremented by the snapshot's count, and each of
// the two timing metrics receives the snapshot's (avg, count, min, max) as a
// single batch.
void
DocumentDBTaggedMetrics::MatchingMetrics::update(const MatchingStats &stats)
{
    // Counters.
    docsMatched.inc(stats.docsMatched());
    docsRanked.inc(stats.docsRanked());
    docsReRanked.inc(stats.docsReRanked());
    softDoomedQueries.inc(stats.softDoomed());
    queries.inc(stats.queries());

    // Timings.
    querySetupTime.addValueBatch(stats.querySetupTimeAvg(),
                                 stats.querySetupTimeCount(),
                                 stats.querySetupTimeMin(),
                                 stats.querySetupTimeMax());
    queryLatency.addValueBatch(stats.queryLatencyAvg(),
                               stats.queryLatencyCount(),
                               stats.queryLatencyMin(),
                               stats.queryLatencyMax());
}

// Creates the "matching" metric set with the document-db level counters
// (matched / ranked / re-ranked documents, queries, soft-doomed queries) and
// the two timing metrics (query setup time, query latency).
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::MatchingMetrics::MatchingMetrics(MetricSet *parent)
    : MetricSet("matching", {},
                "Matching metrics",
                parent),
      docsMatched("docs_matched", {},
                  "Number of documents matched",
                  this),
      docsRanked("docs_ranked", {},
                 "Number of documents ranked (first phase)",
                 this),
      docsReRanked("docs_reranked", {},
                   "Number of documents re-ranked (second phase)",
                   this),
      queries("queries", {},
              "Number of queries executed",
              this),
      softDoomedQueries("soft_doomed_queries", {},
                        "Number of queries hitting the soft timeout",
                        this),
      querySetupTime("query_setup_time", {},
                     "Average time (sec) spent setting up and tearing down queries",
                     this),
      queryLatency("query_latency", {},
                   "Total average latency (sec) when matching and ranking a query",
                   this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::MatchingMetrics::~MatchingMetrics() = default;

// Creates the "rank_profile" metric set for a single rank profile and one
// DocIdPartition child per docid partition.
//   name               - rank profile name; passed to MetricSet as the value
//                        of the {"rankProfile", name} pair (presumably a
//                        metric dimension/tag - confirm against MetricSet).
//   numDocIdPartitions - number of DocIdPartition children to create up front.
//   parent             - forwarded to the MetricSet base constructor.
// The soft doom factor metric starts at MatchingStats::INITIAL_SOFT_DOOM_FACTOR.
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::RankProfileMetrics(const vespalib::string &name,
                                                                                 size_t numDocIdPartitions,
                                                                                 MetricSet *parent)
    : MetricSet("rank_profile", {{"rankProfile", name}}, "Rank profile metrics", parent),
      docsMatched("docs_matched", {}, "Number of documents matched", this),
      docsRanked("docs_ranked", {}, "Number of documents ranked (first phase)", this),
      docsReRanked("docs_reranked", {}, "Number of documents re-ranked (second phase)", this),
      queries("queries", {}, "Number of queries executed", this),
      limitedQueries("limited_queries", {}, "Number of queries limited in match phase", this),
      softDoomedQueries("soft_doomed_queries", {}, "Number of queries hitting the soft timeout", this),
      softDoomFactor("soft_doom_factor", {}, "Factor used to compute soft-timeout", this),
      matchTime("match_time", {}, "Average time (sec) for matching a query (1st phase)", this),
      groupingTime("grouping_time", {}, "Average time (sec) spent on grouping", this),
      rerankTime("rerank_time", {}, "Average time (sec) spent on 2nd phase ranking", this),
      querySetupTime("query_setup_time", {}, "Average time (sec) spent setting up and tearing down queries", this),
      queryLatency("query_latency", {}, "Total average latency (sec) when matching and ranking a query", this)
{
    softDoomFactor.set(MatchingStats::INITIAL_SOFT_DOOM_FACTOR);
    for (size_t i = 0; i < numDocIdPartitions; ++i) {
        // %zu is the printf conversion matching size_t; the previous %ld
        // mismatched the argument type. The produced names ("docid_part00",
        // "docid_part01", ...) are unchanged.
        vespalib::string partition(vespalib::make_string("docid_part%02zu", i));
        partitions.push_back(std::make_unique<DocIdPartition>(partition, this));
    }
}

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::~RankProfileMetrics() = default;

// Creates the "docid_partition" metric set for one docid partition.
//   name   - partition name; passed to MetricSet as the value of the
//            {"docidPartition", name} pair.
//   parent - forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::DocIdPartition::DocIdPartition(const vespalib::string &name,
                                                                                             MetricSet *parent)
    : MetricSet("docid_partition", {{"docidPartition", name}},
                "DocId Partition profile metrics",
                parent),
      docsMatched("docs_matched", {},
                  "Number of documents matched",
                  this),
      docsRanked("docs_ranked", {},
                 "Number of documents ranked (first phase)",
                 this),
      docsReRanked("docs_reranked", {},
                   "Number of documents re-ranked (second phase)",
                   this),
      activeTime("active_time", {},
                 "Time (sec) spent doing actual work",
                 this),
      waitTime("wait_time", {},
               "Time (sec) spent waiting for other external threads and resources",
               this)
{
}

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::DocIdPartition::~DocIdPartition() = default;

// Folds the statistics of one partition into this partition's metrics:
// the three document counters are incremented, and the active / wait time
// metrics each receive the snapshot's (avg, count, min, max) as one batch.
void
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::DocIdPartition::update(const MatchingStats::Partition &stats)
{
    // Counters.
    docsMatched.inc(stats.docsMatched());
    docsRanked.inc(stats.docsRanked());
    docsReRanked.inc(stats.docsReRanked());

    // Timings.
    activeTime.addValueBatch(stats.active_time_avg(),
                             stats.active_time_count(),
                             stats.active_time_min(),
                             stats.active_time_max());
    waitTime.addValueBatch(stats.wait_time_avg(),
                           stats.wait_time_count(),
                           stats.wait_time_min(),
                           stats.wait_time_max());
}

// Folds one MatchingStats snapshot into this rank profile's metrics and into
// the per-partition child metrics.
//
// The first parameter is an unnamed metrics::MetricLockGuard that the body
// never touches; presumably it exists so callers must hold the metric lock
// (confirm against the callers / MetricLockGuard documentation).
//
// If `stats` reports more partitions than there are DocIdPartition children,
// the missing children are created before the per-partition update runs.
void
DocumentDBTaggedMetrics::MatchingMetrics::RankProfileMetrics::update(const metrics::MetricLockGuard &,
                                                                     const MatchingStats &stats)
{
    docsMatched.inc(stats.docsMatched());
    docsRanked.inc(stats.docsRanked());
    docsReRanked.inc(stats.docsReRanked());
    queries.inc(stats.queries());
    limitedQueries.inc(stats.limited_queries());
    softDoomedQueries.inc(stats.softDoomed());
    softDoomFactor.set(stats.softDoomFactor());
    matchTime.addValueBatch(stats.matchTimeAvg(), stats.matchTimeCount(),
                            stats.matchTimeMin(), stats.matchTimeMax());
    groupingTime.addValueBatch(stats.groupingTimeAvg(), stats.groupingTimeCount(),
                               stats.groupingTimeMin(), stats.groupingTimeMax());
    rerankTime.addValueBatch(stats.rerankTimeAvg(), stats.rerankTimeCount(),
                             stats.rerankTimeMin(), stats.rerankTimeMax());
    querySetupTime.addValueBatch(stats.querySetupTimeAvg(), stats.querySetupTimeCount(),
                                 stats.querySetupTimeMin(), stats.querySetupTimeMax());
    queryLatency.addValueBatch(stats.queryLatencyAvg(), stats.queryLatencyCount(),
                               stats.queryLatencyMin(), stats.queryLatencyMax());

    // Both loops below are no-ops when numPartitions is 0, so no separate
    // "> 0" guard is needed.
    const size_t numPartitions = stats.getNumPartitions();
    // Captured before growing: the log must report the size that was in
    // effect previously, not the size after the push_back below.
    const size_t previousPartitions = partitions.size();
    for (size_t i = previousPartitions; i < numPartitions; ++i) {
        // This loop is to handle live reconfigs that changes how many partitions(number of threads) might be used per query.
        // %zu is the printf conversion matching size_t (was %ld).
        vespalib::string partition(vespalib::make_string("docid_part%02zu", i));
        partitions.push_back(std::make_unique<DocIdPartition>(partition, this));
        LOG(info, "Number of partitions has been increased to '%zu' from '%zu' previously configured. Adding part %zu",
            numPartitions, previousPartitions, i);
    }
    for (size_t i = 0; i < numPartitions; ++i) {
        partitions[i]->update(stats.getPartition(i));
    }
}

// Creates the "session_cache" metric set with one child per session cache
// kind ("search" and "grouping"), each constructed with this set as parent.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::SessionCacheMetrics::SessionCacheMetrics(metrics::MetricSet *parent)
    : metrics::MetricSet("session_cache", {},
                         "Metrics for session caches (search / grouping requests)",
                         parent),
      search("search", this),
      grouping("grouping", this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::SessionCacheMetrics::~SessionCacheMetrics() = default;

// Creates the "documents" metric set holding the document counts for this
// document db: active, ready, total and removed.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::DocumentsMetrics::DocumentsMetrics(metrics::MetricSet *parent)
    : metrics::MetricSet("documents", {},
                         "Metrics for various document counts in this document db",
                         parent),
      active("active", {},
             "The number of active / searchable documents in this document db",
             this),
      ready("ready", {},
            "The number of ready documents in this document db",
            this),
      total("total", {},
            "The total number of documents in this documents db (ready + not-ready)",
            this),
      removed("removed", {},
              "The number of removed documents in this document db",
              this)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::DocumentsMetrics::~DocumentsMetrics() = default;

// Creates the "bucket_move" metric set with its single child metric, the
// number of buckets still pending a move.
// `parent` is forwarded to the MetricSet base constructor.
DocumentDBTaggedMetrics::BucketMoveMetrics::BucketMoveMetrics(metrics::MetricSet *parent)
    : metrics::MetricSet("bucket_move", {},
                         "Metrics for bucket move job in this document db",
                         parent),
      bucketsPending("buckets_pending", {},
                     "The number of buckets left to move",
                     this)
{
}

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::BucketMoveMetrics::~BucketMoveMetrics() = default;

// Creates the top-level "documentdb" metric set for one document type.
//   docTypeName    - passed to MetricSet as the value of the
//                    {"documenttype", docTypeName} pair.
//   maxNumThreads_ - stored verbatim in the maxNumThreads member.
// The base MetricSet gets a null parent; every child set / metric below is
// constructed with this set as its parent argument.
DocumentDBTaggedMetrics::DocumentDBTaggedMetrics(const vespalib::string &docTypeName,
                                                 size_t maxNumThreads_)
    : MetricSet("documentdb", {{"documenttype", docTypeName}},
                "Document DB metrics",
                nullptr),
      job(this),
      attribute(this),
      index(this),
      ready("ready", this),
      notReady("notready", this),
      removed("removed", this),
      threadingService("threading_service", this),
      matching(this),
      sessionCache(this),
      documents(this),
      bucketMove(this),
      feeding(this),
      totalMemoryUsage(this),
      totalDiskUsage("disk_usage", {},
                     "The total disk usage (in bytes) for this document db",
                     this),
      // NOTE(review): the description below spells "maintenace"; it is kept
      // as-is because the text is emitted at runtime.
      heart_beat_age("heart_beat_age", {},
                     "How long ago (in seconds) heart beat maintenace job was run",
                     this),
      maxNumThreads(maxNumThreads_)
{ }

// Destructor is defaulted here, out of line.
DocumentDBTaggedMetrics::~DocumentDBTaggedMetrics() = default;

}