aboutsummaryrefslogtreecommitdiffstats
path: root/metrics-proxy
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2021-12-14 18:00:53 +0100
committerHenning Baldersheim <balder@yahoo-inc.com>2021-12-14 18:00:53 +0100
commit99f16ba7d600cb63699845859e79aa9f773c8bef (patch)
tree65a3c74562c10fde4becef983f847676216446c9 /metrics-proxy
parent2b2334c29f65f476240a13056bfd6061dbebc1ab (diff)
1 - Make the first sample period at least 5s long.
2 - Ensure that total CPU usage is sampled first on startup, and last on all remaining samples. This avoids the large skew you get when the sample interval is shorter than the time used to collect the samples, and should eliminate the sampling noise seen when metrics-proxy is started.
Diffstat (limited to 'metrics-proxy')
-rw-r--r--metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java40
-rw-r--r--metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java7
2 files changed, 30 insertions, 17 deletions
diff --git a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java
index c548d187569..27f86b0d503 100644
--- a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java
+++ b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java
@@ -5,6 +5,9 @@ import ai.vespa.metricsproxy.metric.Metric;
import ai.vespa.metricsproxy.metric.Metrics;
import ai.vespa.metricsproxy.metric.model.MetricId;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.HashMap;
import java.util.logging.Level;
import java.io.BufferedReader;
@@ -35,7 +38,7 @@ public class SystemPoller {
private static final MetricId MEMORY_VIRT = MetricId.toMetricId("memory_virt");
private static final MetricId MEMORY_RSS = MetricId.toMetricId("memory_rss");
- private final int pollingIntervalSecs;
+ private final Duration interval;
private final List<VespaService> services;
private final Map<VespaService, Long> lastCpuJiffiesMetrics = new ConcurrentHashMap<>();
private final Timer systemPollTimer;
@@ -70,9 +73,9 @@ public class SystemPoller {
long getJiffies(VespaService service);
}
- public SystemPoller(List<VespaService> services, int pollingIntervalSecs) {
+ public SystemPoller(List<VespaService> services, Duration interval) {
this.services = services;
- this.pollingIntervalSecs = pollingIntervalSecs;
+ this.interval = interval;
systemPollTimer = new Timer("systemPollTimer", true);
jiffiesInterface = new GetJiffies() {
@Override
@@ -138,7 +141,7 @@ public class SystemPoller {
* Poll services for system metrics
*/
void poll() {
- long startTime = System.currentTimeMillis();
+ Instant startTime = Instant.now();
/* Don't do any work if there are no known services */
if (services.isEmpty()) {
@@ -149,11 +152,11 @@ public class SystemPoller {
log.log(Level.FINE, () -> "Monitoring system metrics for " + services.size() + " services");
boolean someAlive = services.stream().anyMatch(VespaService::isAlive);
- lastTotalCpuJiffies = updateMetrics(lastTotalCpuJiffies, startTime/1000, jiffiesInterface, services, lastCpuJiffiesMetrics);
+ lastTotalCpuJiffies = updateMetrics(lastTotalCpuJiffies, interval.getSeconds(), jiffiesInterface, services, lastCpuJiffiesMetrics);
// If none of the services were alive, reschedule in a short time
if (!someAlive) {
- reschedule(System.currentTimeMillis() - startTime);
+ reschedule(Duration.between(startTime, Instant.now()));
} else {
schedule();
}
@@ -161,6 +164,10 @@ public class SystemPoller {
static JiffiesAndCpus updateMetrics(JiffiesAndCpus prevTotalJiffies, long timeStamp, GetJiffies getJiffies,
List<VespaService> services, Map<VespaService, Long> lastCpuJiffiesMetrics) {
+ Map<VespaService, Long> currentServiceJiffies = new HashMap<>();
+ for (VespaService s : services) {
+ currentServiceJiffies.put(s, getJiffies.getJiffies(s));
+ }
JiffiesAndCpus sysJiffies = getJiffies.getTotalSystemJiffies();
JiffiesAndCpus sysJiffiesDiff = sysJiffies.diff(prevTotalJiffies);
log.log(Level.FINE, () -> "Total jiffies: " + sysJiffies.jiffies + " - " + prevTotalJiffies.jiffies + " = " + sysJiffiesDiff.jiffies);
@@ -173,7 +180,7 @@ public class SystemPoller {
metrics.add(new Metric(MEMORY_VIRT, size[memoryTypeVirtual], timeStamp));
metrics.add(new Metric(MEMORY_RSS, size[memoryTypeResident], timeStamp));
- long procJiffies = getJiffies.getJiffies(s);
+ long procJiffies = currentServiceJiffies.get(s);
long last = lastCpuJiffiesMetrics.get(s);
long diff = procJiffies - last;
@@ -253,24 +260,27 @@ public class SystemPoller {
: new JiffiesAndCpus();
}
- private void schedule(long time) {
+ void schedule(Duration time) {
try {
- systemPollTimer.schedule(new PollTask(this), time);
+ systemPollTimer.schedule(new PollTask(this), time.toMillis());
} catch(IllegalStateException e){
log.info("Tried to schedule task, but timer was already shut down.");
}
}
- public void schedule() {
- schedule(pollingIntervalSecs * 1000L);
+ void schedule() {
+ schedule(interval);
}
- private void reschedule(long skew) {
- long sleep = (pollingIntervalSecs * 1000L) - skew;
+ private void reschedule(Duration skew) {
+ Duration sleep = interval.minus(skew);
// Don't sleep less than 1 min
- sleep = Math.max(60 * 1000, sleep);
- schedule(sleep);
+ if ( sleep.compareTo(Duration.ofMinutes(1)) < 0) {
+ schedule(Duration.ofMinutes(1));
+ } else {
+ schedule(sleep);
+ }
}
diff --git a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java
index 38a0ea5ed2d..05914c40469 100644
--- a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java
+++ b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java
@@ -4,6 +4,8 @@ package ai.vespa.metricsproxy.service;
import ai.vespa.metricsproxy.core.MonitoringConfig;
import com.yahoo.container.di.componentgraph.Provider;
+import java.time.Duration;
+
/**
* @author gjoranv
*/
@@ -17,8 +19,9 @@ public class SystemPollerProvider implements Provider<SystemPoller> {
*/
public SystemPollerProvider (VespaServices services, MonitoringConfig monitoringConfig) {
if (runningOnLinux()) {
- poller = new SystemPoller(services.getVespaServices(), 60 * monitoringConfig.intervalMinutes());
- poller.poll();
+ Duration interval = Duration.ofMinutes(monitoringConfig.intervalMinutes());
+ poller = new SystemPoller(services.getVespaServices(), interval);
+ poller.schedule(Duration.ofSeconds(5));
} else {
poller = null;
}