aboutsummaryrefslogtreecommitdiffstats
path: root/metrics/src/vespa/metrics/metricmanager.h
blob: b2e176d8139258e74a49c0e970a90b4bd7e335ed (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * \class metrics::MetricManager
 *
 * A metrics-enabled application should have a single MetricManager.
 * You can register a number of MetricSets in the MetricManager. Each
 * metric in the metric sets can be used by zero or more consumers,
 * configurable through the config subscription started by init().
 *
 * The consumers get their data by fetching a snapshot with
 * getMetricSnapshot() and visiting it with visit(), which restricts the
 * visit to the metrics configured for the given consumer name.
 *
 * Locking strategy:
 *
 * There are three locks in this class:
 *
 * Config lock:
 *   - This protects the class on config changes. It protects the _config and
 *     _consumerConfig members.
 *
 * Thread monitor (waiter):
 *   - This lock is held by the worker thread while it is doing a work cycle,
 *     and it uses this monitor to sleep. It is used to make shutdown quick by
 *     interrupting the thread, and to let functions called by clients make
 *     changes while the worker thread is idle.
 *   - The log period is protected by the thread monitor.
 *   - The update hooks are protected by the thread monitor.
 *
 * Metric lock:
 *  - The metric lock protects the active metric set when adding or removing
 *    metrics. Clients need to grab this lock before altering active metrics.
 *    The metric manager needs to grab this lock every time it visits active
 *    metrics.
 *  - The metric lock protects the snapshots. The snapshot writer is the metric
 *    worker thread and will grab the lock while editing them. Readers that
 *    aren't the worker thread itself must grab the lock to be sure.
 *
 * If multiple locks are taken, the allowed locking order is:
 *   1. Thread monitor.
 *   2. Metric lock.
 *   3. Config lock.
 */
#pragma once

#include "metricset.h"
#include "metricsnapshot.h"
#include "memoryconsumption.h"
#include "valuemetric.h"
#include "updatehook.h"
#include <vespa/vespalib/stllike/hash_set.h>
#include <vespa/vespalib/util/jsonwriter.h>
#include <vespa/metrics/config-metricsmanager.h>
#include <vespa/config/subscription/configsubscriber.h>
#include <vespa/config/subscription/configuri.h>
#include <map>
#include <list>
#include <thread>

namespace metrics {

/**
 * Holds the active metric set, the snapshots derived from it, the registered
 * update hooks and the worker thread state (_thread / run()).
 *
 * Most accessors take a MetricLockGuard, obtained from getMetricLock(), which
 * they pass to assertMetricLockLocked() as evidence that the caller holds the
 * metric lock.
 */
class MetricManager
{
public:

    /**
     * Time source. getTime() is virtual, and a Timer instance can be handed to
     * the manager through the MetricManager(std::unique_ptr<Timer>)
     * constructor.
     */
    struct Timer {
        virtual ~Timer() = default;
        /** Returns the current time. Defined out of line. */
        virtual time_point getTime() const;
        /** Forwards to getTime(); no unit conversion is performed here. */
        time_point getTimeInMilliSecs() const { return getTime(); }
    };

    /**
     * Spec saved from config for a consumer: the set of metric paths the
     * consumer includes.
     *
     * Only the move constructor and move assignment operator are declared, so
     * the type is movable but not copyable.
     */
    struct ConsumerSpec {
        // Paths (as returned by Metric::getPath()) of the included metrics.
        vespalib::hash_set<Metric::String> includedMetrics;

        ConsumerSpec(ConsumerSpec &&) noexcept = default;
        ConsumerSpec & operator= (ConsumerSpec &&) noexcept = default;
        ConsumerSpec();
        ~ConsumerSpec();

        /** Returns true if the path of the given metric is in includedMetrics. */
        bool contains(const Metric& m) const {
            return (includedMetrics.find(m.getPath()) != includedMetrics.end());
        }

        /** String representation of this spec. Defined out of line. */
        vespalib::string toString() const;

        // Presumably accounts this spec's memory in the given
        // MemoryConsumption; confirm against the implementation.
        void addMemoryUsage(MemoryConsumption&) const;
    };

private:
    // The active metric set; exposed through getActiveMetrics() and modified
    // by registerMetric() / unregisterMetric().
    MetricSnapshot _activeMetrics;
    // Config subscription state. init() is documented to start the
    // subscription.
    std::unique_ptr<config::ConfigSubscriber> _configSubscriber;
    std::unique_ptr<config::ConfigHandle<MetricsmanagerConfig>> _configHandle;
    std::unique_ptr<MetricsmanagerConfig> _config;
    // Consumer specs keyed by a Metric::String; presumably the consumer name
    // used by getConsumerSpec() - confirm.
    std::map<Metric::String, ConsumerSpec> _consumerConfig;
    // Registered update hooks. Hooks are passed in by reference
    // (addMetricUpdateHook), so these pointers are presumably non-owning.
    std::list<UpdateHook*> _periodicUpdateHooks;
    std::list<UpdateHook*> _snapshotUpdateHooks;
    // Mutex that the guard returned from getMetricLock() is constructed from.
    mutable std::mutex _waiter;
    // NOTE(review): presumably waited on together with _waiter by the worker
    // thread and notified by timeChangedNotification()/stop() - confirm in
    // the .cpp.
    mutable std::condition_variable _cond;
    // Snapshot sets; presumably what getMetricSnapshotSet() selects from.
    std::vector<std::shared_ptr<MetricSnapshotSet>> _snapshots;
    // Returned (dereferenced) by getTotalMetricSnapshot().
    std::shared_ptr<MetricSnapshot> _totalMetrics;
    // Time source; see the constructor taking a Timer.
    std::unique_ptr<Timer> _timer;
    // Read with relaxed ordering by getLastProcessedTime().
    std::atomic<time_point> _lastProcessedTime;
    // Should be added to config, but won't now due to problems with
    // upgrading
    bool _snapshotUnsetMetrics;
    bool _consumerConfigChanged;

    // Metrics about the manager itself. Going by the names these track hook,
    // reset and snapshot latencies and sleep times; confirm in the .cpp.
    MetricSet _metricManagerMetrics;
    LongAverageMetric _periodicHookLatency;
    LongAverageMetric _snapshotHookLatency;
    LongAverageMetric _resetLatency;
    LongAverageMetric _snapshotLatency;
    LongAverageMetric _sleepTimes;
    // Stop flag, accessed only through request_stop() / stop_requested().
    std::atomic<bool> _stop_requested;
    std::thread       _thread;

    // Both helpers use relaxed memory ordering: they only set/read the flag
    // and establish no ordering with other memory operations.
    void request_stop() { _stop_requested.store(true, std::memory_order_relaxed); }
    bool stop_requested() const { return _stop_requested.load(std::memory_order_relaxed); }
    
public:
    MetricManager();
    /** Constructs a manager using the given time source. */
    explicit MetricManager(std::unique_ptr<Timer> timer);
    ~MetricManager();

    void stop();

    /**
     * Sets the _snapshotUnsetMetrics flag. The flag is a plain bool and is
     * written here without taking any lock.
     */
    void snapshotUnsetMetrics(bool doIt) { _snapshotUnsetMetrics = doIt; }

    /**
     * Add a metric update hook. This will always be called prior to
     * snapshotting and metric logging, to make the metrics the best as they can
     * be at those occasions.
     *
     * Note on the hook period: the period is not a parameter of this function;
     * it is presumably a property of the UpdateHook itself (confirm against
     * updatehook.h). A period of 0 means the hook is only called before
     * snapshotting or logging, while another value will give callbacks each
     * period seconds. Expensive metrics to calculate will typically only want
     * to do it before snapshotting, while inexpensive metrics might want to
     * log their value every 5 seconds or so. Any value of period >= the
     * smallest snapshot time will behave identically as if period is set to 0.
     */
    void addMetricUpdateHook(UpdateHook&);

    /** Remove a metric update hook so it won't get any more updates. */
    void removeMetricUpdateHook(UpdateHook&);

    /**
     * Force a metric update for all update hooks. Useful if you want to ensure
     * nice values before reporting something.
     * This function can not be called from an update hook callback.
     */
    void updateMetrics();

    /**
     * Force event logging to happen now.
     * This function can not be called from an update hook callback.
     */
    void forceEventLogging();

    /**
     * Register a new metric to be included in the active metric set. You need
     * to have grabbed the metric lock in order to do this. (You also need to
     * grab that lock if you alter registration of already registered metric
     * set.) This function can not be called from an update hook callback.
     *
     * @param l Guard for the metric lock; checked with assertMetricLockLocked().
     * @param m Metric to register in the active metric set.
     */
    void registerMetric(const MetricLockGuard& l, Metric& m) {
        assertMetricLockLocked(l);
        _activeMetrics.getMetrics().registerMetric(m);
    }

    /**
     * Unregister a metric from the active metric set. You need to have grabbed
     * the metric lock in order to do this. (You also need to grab that lock
     * if you alter registration of already registered metric set.)
     * This function can not be called from an update hook callback.
     *
     * @param l Guard for the metric lock; checked with assertMetricLockLocked().
     * @param m Metric to unregister from the active metric set.
     */
    void unregisterMetric(const MetricLockGuard& l, Metric& m) {
        assertMetricLockLocked(l);
        _activeMetrics.getMetrics().unregisterMetric(m);
    }

    /**
     * Reset all metrics including all snapshots.
     * This function can not be called from an update hook callback.
     */
    void reset(system_time currentTime);

    /**
     * Read configuration. Before reading config, all metrics should be set
     * up first. By doing this, the metrics manager can optimize reporting
     * of consumers. init() will start a config subscription. It should
     * not be called multiple times.
     *
     * @param uri         Config URI to subscribe with.
     * @param startThread Presumably whether the worker thread is started as
     *                    part of init - confirm in the implementation. The
     *                    single-argument overload passes true.
     */
    void init(const config::ConfigUri & uri, bool startThread);
    void init(const config::ConfigUri & uri) {
        init(uri, true);
    }

    /**
     * Visit a given snapshot for a given consumer. (Empty consumer name means
     * all metrics). This function can be used for various printing by using
     * various writer visitors in the metrics module, or your own.
     */
    void visit(const MetricLockGuard & guard, const MetricSnapshot&,
               MetricVisitor&, const std::string& consumer) const;

    /**
     * The metric lock protects against changes in metric structure. After
     * metric manager init, you need to take this lock if you want to add or
     * remove metrics from registered metric sets, to avoid that happening at
     * the same time as metrics are being visited. Also, when accessing
     * snapshots, you need to have this lock to prevent metric manager to alter
     * snapshots while you are accessing them.
     *
     * The returned guard is constructed from the _waiter mutex.
     */
    MetricLockGuard getMetricLock() const {
        return {_waiter};
    }

    /** While accessing the active metrics you should have the metric lock. */
    MetricSnapshot& getActiveMetrics(const MetricLockGuard& l) {
        assertMetricLockLocked(l);
        return _activeMetrics;
    }
    const MetricSnapshot& getActiveMetrics(const MetricLockGuard& l) const {
        assertMetricLockLocked(l);
        return _activeMetrics;
    }

    /** While accessing the total metrics you should have the metric lock. */
    const MetricSnapshot& getTotalMetricSnapshot(const MetricLockGuard& l) const {
        assertMetricLockLocked(l);
        return *_totalMetrics;
    }
    /**
     * While accessing snapshots you should have the metric lock.
     * The two-argument overload forwards with getInProgressSet = false.
     */
    const MetricSnapshot& getMetricSnapshot( const MetricLockGuard& guard, vespalib::duration period) const {
        return getMetricSnapshot(guard, period, false);
    }
    const MetricSnapshot& getMetricSnapshot( const MetricLockGuard&, vespalib::duration period, bool getInProgressSet) const;
    const MetricSnapshotSet& getMetricSnapshotSet(const MetricLockGuard&, vespalib::duration period) const;

    /** Returns snapshot periods as durations; requires the metric lock guard. */
    std::vector<time_point::duration> getSnapshotPeriods(const MetricLockGuard& l) const;

    // Public only for testing. The returned pointer is only valid while holding the lock.
    const ConsumerSpec * getConsumerSpec(const MetricLockGuard & guard, const Metric::String& consumer) const;

    /**
     * If you add or remove metrics from the active metric sets, normally,
     * snapshots will be recreated next snapshot period. However, if you want
     * to see the effects of such changes in status pages ahead of that, you
     * can call this function in order to check whether snapshots needs to be
     * regenerated and regenerate them if needed.
     */
    void checkMetricsAltered(const MetricLockGuard &);

    /** Used by unit tests to verify that we have processed for a given time. */
    time_point getLastProcessedTime() const { return _lastProcessedTime.load(std::memory_order_relaxed); }

    /** Used by unit tests to wake waiters after altering time. */
    void timeChangedNotification() const;

    MemoryConsumption::UP getMemoryConsumption(const MetricLockGuard & guard) const;

    bool isInitialized() const;

    [[nodiscard]] bool any_snapshots_taken(const MetricLockGuard&) const noexcept;

private:
    void takeSnapshots(const MetricLockGuard &, system_time timeToProcess);

    // Test fixtures that need access to private members.
    friend struct MetricManagerTest;
    friend struct SnapshotTest;

    void configure(const MetricLockGuard & guard, std::unique_ptr<MetricsmanagerConfig> conf);
    // Presumably the entry point of the worker thread (_thread) - confirm in
    // the .cpp.
    void run();
    time_point tick(const MetricLockGuard & guard, time_point currentTime);
    /**
     * Utility function for updating periodic metrics.
     *
     * @param updateTime Update metrics timed to update at this time.
     * @param outOfSchedule Force calls to all hooks. Don't screw up normal
     *                      schedule though. If not time to update yet, update
     *                      without adjusting schedule for next update.
     * @return Time of next hook to be called in the future.
     */
    time_point updatePeriodicMetrics(const MetricLockGuard & guard, time_point updateTime, bool outOfSchedule);
    void updateSnapshotMetrics(const MetricLockGuard & guard);

    void handleMetricsAltered(const MetricLockGuard & guard);

    // A snapshot period paired with a string; presumably the snapshot's
    // name - confirm in createSnapshotPeriods().
    using SnapSpec = std::pair<time_point::duration, std::string>;
    static std::vector<SnapSpec> createSnapshotPeriods( const MetricsmanagerConfig& config);
    // Called by the inline accessors above before they touch guarded state.
    void assertMetricLockLocked(const MetricLockGuard& g) const;
};

} // metrics