summaryrefslogtreecommitdiffstats
path: root/slobrok
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-03-21 11:46:14 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-03-21 12:21:26 +0000
commit1562ae1726453a0789aabb4baad2754020579b8d (patch)
treed702db3a02c9b035d4a14ee94b295f27859d6743 /slobrok
parent723d6cacbdce4c45e01c92cb3e2eeb71f7b513f2 (diff)
Wire Prometheus metric export to state V1 APIs
Extends metric producer classes with the requested exposition format. As a consequence, the State API server has been changed to allow emitting other content types than just `application/json`. Add custom Prometheus rendering for Slobrok, as it does its own domain-specific metric tracking. However, since it has non-destructive sampling properties, we can actually use proper `counter` types.
Diffstat (limited to 'slobrok')
-rw-r--r--slobrok/src/vespa/slobrok/server/metrics_producer.cpp96
-rw-r--r--slobrok/src/vespa/slobrok/server/metrics_producer.h9
2 files changed, 78 insertions, 27 deletions
diff --git a/slobrok/src/vespa/slobrok/server/metrics_producer.cpp b/slobrok/src/vespa/slobrok/server/metrics_producer.cpp
index f25a0681397..b7eca808a75 100644
--- a/slobrok/src/vespa/slobrok/server/metrics_producer.cpp
+++ b/slobrok/src/vespa/slobrok/server/metrics_producer.cpp
@@ -2,6 +2,7 @@
#include "metrics_producer.h"
#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/fnet/task.h>
#include <vespa/fnet/transport.h>
@@ -11,9 +12,12 @@ using namespace std::chrono;
namespace {
-time_t
-secondsSinceEpoch() {
- return duration_cast<seconds>(system_clock::now().time_since_epoch()).count();
+[[nodiscard]] constexpr seconds seconds_since_epoch(std::chrono::system_clock::time_point tp) noexcept {
+ return duration_cast<seconds>(tp.time_since_epoch());
+}
+
+[[nodiscard]] constexpr milliseconds ms_since_epoch(std::chrono::system_clock::time_point tp) noexcept {
+ return duration_cast<milliseconds>(tp.time_since_epoch());
}
class MetricsSnapshotter : public FNET_Task
@@ -32,7 +36,7 @@ public:
Schedule(60.0);
}
- ~MetricsSnapshotter() { Kill(); }
+ ~MetricsSnapshotter() override { Kill(); }
};
class MetricSnapshot
@@ -45,7 +49,7 @@ private:
double _snapLen;
public:
- MetricSnapshot(uint32_t prevTime, uint32_t currTime);
+ MetricSnapshot(system_clock::time_point prevTime, system_clock::time_point currTime);
void addCount(const char *name, const char *desc, uint32_t count);
vespalib::string asString() const {
@@ -53,15 +57,15 @@ public:
}
};
-MetricSnapshot::MetricSnapshot(uint32_t prevTime, uint32_t currTime)
+MetricSnapshot::MetricSnapshot(system_clock::time_point prevTime, system_clock::time_point currTime)
: _data(),
_metrics(_data.setObject()),
_snapshot(_metrics.setObject("snapshot")),
_values(_metrics.setArray("values")),
- _snapLen(currTime - prevTime)
+ _snapLen(static_cast<double>(seconds_since_epoch(currTime).count() - seconds_since_epoch(prevTime).count()))
{
- _snapshot.setLong("from", prevTime);
- _snapshot.setLong("to", currTime);
+ _snapshot.setLong("from", seconds_since_epoch(prevTime).count());
+ _snapshot.setLong("to", seconds_since_epoch(currTime).count());
if (_snapLen < 1.0) {
_snapLen = 1.0;
}
@@ -81,8 +85,8 @@ MetricSnapshot::addCount(const char *name, const char *desc, uint32_t count)
}
vespalib::string
-makeSnapshot(const RPCHooks::Metrics &prev, const RPCHooks::Metrics &curr,
- uint32_t prevTime, uint32_t currTime)
+make_json_snapshot(const RPCHooks::Metrics &prev, const RPCHooks::Metrics &curr,
+ system_clock::time_point prevTime, system_clock::time_point currTime)
{
MetricSnapshot snapshot(prevTime, currTime);
snapshot.addCount("slobrok.heartbeats.failed",
@@ -103,44 +107,90 @@ makeSnapshot(const RPCHooks::Metrics &prev, const RPCHooks::Metrics &curr,
return snapshot.asString();
}
+void emit_prometheus_counter(vespalib::asciistream &out, vespalib::stringref name,
+ vespalib::stringref description, uint64_t value,
+ system_clock::time_point now)
+{
+ // Prometheus naming conventions state that "_total" should be used for counter metrics.
+ out << "# HELP " << name << "_total " << description << '\n';
+ out << "# TYPE " << name << "_total counter\n";
+ out << name << "_total " << value << ' ' << ms_since_epoch(now).count() << '\n';
+}
+
+void emit_prometheus_gauge(vespalib::asciistream &out, vespalib::stringref name,
+ vespalib::stringref description, uint64_t value,
+ system_clock::time_point now)
+{
+ // Gauge metrics do not appear to have any convention for name suffixes, so emit name verbatim.
+ out << "# HELP " << name << ' ' << description << '\n';
+ out << "# TYPE " << name << " gauge\n";
+ out << name << ' ' << value << ' ' << ms_since_epoch(now).count() << '\n';
+}
+
+vespalib::string
+make_prometheus_snapshot(const RPCHooks::Metrics &curr, system_clock::time_point now)
+{
+ vespalib::asciistream out;
+ emit_prometheus_counter(out, "slobrok_heartbeats_failed",
+ "count of failed heartbeat requests",
+ curr.heartBeatFails, now);
+ emit_prometheus_counter(out, "slobrok_requests_register",
+ "count of register requests received",
+ curr.registerReqs, now);
+ emit_prometheus_counter(out, "slobrok_requests_mirror",
+ "count of mirroring requests received",
+ curr.mirrorReqs, now);
+ emit_prometheus_counter(out, "slobrok_requests_admin",
+ "count of administrative requests received",
+ curr.adminReqs, now);
+ emit_prometheus_gauge(out, "slobrok_missing_consensus",
+ "number of seconds without full consensus with all other brokers",
+ curr.missingConsensusTime, now);
+ return out.str();
+}
+
} // namespace <unnamed>
-MetricsProducer::MetricsProducer(const RPCHooks &hooks,
- FNET_Transport &transport)
+MetricsProducer::MetricsProducer(const RPCHooks &hooks, FNET_Transport &transport)
: _rpcHooks(hooks),
_lastMetrics(RPCHooks::Metrics::zero()),
_producer(),
- _startTime(secondsSinceEpoch()),
+ _startTime(system_clock::now()),
_lastSnapshotStart(_startTime),
- _snapshotter(new MetricsSnapshotter(transport, *this))
+ _snapshotter(std::make_unique<MetricsSnapshotter>(transport, *this))
{
}
MetricsProducer::~MetricsProducer() = default;
vespalib::string
-MetricsProducer::getMetrics(const vespalib::string &consumer)
+MetricsProducer::getMetrics(const vespalib::string &consumer, ExpositionFormat format)
{
- return _producer.getMetrics(consumer);
+ return _producer.getMetrics(consumer, format);
}
vespalib::string
-MetricsProducer::getTotalMetrics(const vespalib::string &)
+MetricsProducer::getTotalMetrics(const vespalib::string &, ExpositionFormat format)
{
- uint32_t now = secondsSinceEpoch();
+ const auto now = system_clock::now();
RPCHooks::Metrics current = _rpcHooks.getMetrics();
- RPCHooks::Metrics start = RPCHooks::Metrics::zero();
- return makeSnapshot(start, current, _startTime, now);
+ if (format == ExpositionFormat::Prometheus) {
+ return make_prometheus_snapshot(current, now);
+ } else {
+ RPCHooks::Metrics start = RPCHooks::Metrics::zero();
+ return make_json_snapshot(start, current, _startTime, now);
+ }
}
void
MetricsProducer::snapshot()
{
- uint32_t now = secondsSinceEpoch();
+ const auto now = system_clock::now();
RPCHooks::Metrics current = _rpcHooks.getMetrics();
- _producer.setMetrics(makeSnapshot(_lastMetrics, current, _lastSnapshotStart, now));
+ _producer.setMetrics(make_json_snapshot(_lastMetrics, current, _lastSnapshotStart, now), ExpositionFormat::JSON);
+ _producer.setMetrics(make_prometheus_snapshot(current, now), ExpositionFormat::Prometheus);
_lastMetrics = current;
_lastSnapshotStart = now;
}
diff --git a/slobrok/src/vespa/slobrok/server/metrics_producer.h b/slobrok/src/vespa/slobrok/server/metrics_producer.h
index fd1fc70651b..0a9dd589a15 100644
--- a/slobrok/src/vespa/slobrok/server/metrics_producer.h
+++ b/slobrok/src/vespa/slobrok/server/metrics_producer.h
@@ -4,6 +4,7 @@
#include "rpchooks.h"
#include <vespa/vespalib/net/http/metrics_producer.h>
#include <vespa/vespalib/net/http/simple_metrics_producer.h>
+#include <chrono>
class FNET_Transport;
@@ -15,13 +16,13 @@ private:
const RPCHooks &_rpcHooks;
RPCHooks::Metrics _lastMetrics;
vespalib::SimpleMetricsProducer _producer;
- uint32_t _startTime;
- uint32_t _lastSnapshotStart;
+ std::chrono::system_clock::time_point _startTime;
+ std::chrono::system_clock::time_point _lastSnapshotStart;
std::unique_ptr<FNET_Task> _snapshotter;
public:
- vespalib::string getMetrics(const vespalib::string &consumer) override;
- vespalib::string getTotalMetrics(const vespalib::string &consumer) override;
+ vespalib::string getMetrics(const vespalib::string &consumer, ExpositionFormat format) override;
+ vespalib::string getTotalMetrics(const vespalib::string &consumer, ExpositionFormat format) override;
void snapshot();