From 99f16ba7d600cb63699845859e79aa9f773c8bef Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 14 Dec 2021 18:00:53 +0100 Subject: 1 - Make the first sample period at least 5s long. 2 - Ensure that total cpu usage is sampled first on startup, and last on all remaining samples. This is to avoid the large skew you get when sample interval is shorter than the time used to collect the samples. This should eliminate the sampling noise when metricsproxy is started. --- .../vespa/metricsproxy/service/SystemPoller.java | 40 ++++++++++++++-------- .../metricsproxy/service/SystemPollerProvider.java | 7 ++-- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'metrics-proxy') diff --git a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java index c548d187569..27f86b0d503 100644 --- a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java +++ b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java @@ -5,6 +5,9 @@ import ai.vespa.metricsproxy.metric.Metric; import ai.vespa.metricsproxy.metric.Metrics; import ai.vespa.metricsproxy.metric.model.MetricId; +import java.time.Duration; +import java.time.Instant; +import java.util.HashMap; import java.util.logging.Level; import java.io.BufferedReader; @@ -35,7 +38,7 @@ public class SystemPoller { private static final MetricId MEMORY_VIRT = MetricId.toMetricId("memory_virt"); private static final MetricId MEMORY_RSS = MetricId.toMetricId("memory_rss"); - private final int pollingIntervalSecs; + private final Duration interval; private final List services; private final Map lastCpuJiffiesMetrics = new ConcurrentHashMap<>(); private final Timer systemPollTimer; @@ -70,9 +73,9 @@ public class SystemPoller { long getJiffies(VespaService service); } - public SystemPoller(List services, int pollingIntervalSecs) { + public SystemPoller(List services, Duration interval) { 
this.services = services; - this.pollingIntervalSecs = pollingIntervalSecs; + this.interval = interval; systemPollTimer = new Timer("systemPollTimer", true); jiffiesInterface = new GetJiffies() { @Override @@ -138,7 +141,7 @@ public class SystemPoller { * Poll services for system metrics */ void poll() { - long startTime = System.currentTimeMillis(); + Instant startTime = Instant.now(); /* Don't do any work if there are no known services */ if (services.isEmpty()) { @@ -149,11 +152,11 @@ public class SystemPoller { log.log(Level.FINE, () -> "Monitoring system metrics for " + services.size() + " services"); boolean someAlive = services.stream().anyMatch(VespaService::isAlive); - lastTotalCpuJiffies = updateMetrics(lastTotalCpuJiffies, startTime/1000, jiffiesInterface, services, lastCpuJiffiesMetrics); + lastTotalCpuJiffies = updateMetrics(lastTotalCpuJiffies, startTime.getEpochSecond(), jiffiesInterface, services, lastCpuJiffiesMetrics); // If none of the services were alive, reschedule in a short time if (!someAlive) { - reschedule(System.currentTimeMillis() - startTime); + reschedule(Duration.between(startTime, Instant.now())); } else { schedule(); } @@ -161,6 +164,10 @@ public class SystemPoller { static JiffiesAndCpus updateMetrics(JiffiesAndCpus prevTotalJiffies, long timeStamp, GetJiffies getJiffies, List services, Map lastCpuJiffiesMetrics) { + Map currentServiceJiffies = new HashMap<>(); + for (VespaService s : services) { + currentServiceJiffies.put(s, getJiffies.getJiffies(s)); + } JiffiesAndCpus sysJiffies = getJiffies.getTotalSystemJiffies(); JiffiesAndCpus sysJiffiesDiff = sysJiffies.diff(prevTotalJiffies); log.log(Level.FINE, () -> "Total jiffies: " + sysJiffies.jiffies + " - " + prevTotalJiffies.jiffies + " = " + sysJiffiesDiff.jiffies); @@ -173,7 +180,7 @@ public class SystemPoller { metrics.add(new Metric(MEMORY_VIRT, size[memoryTypeVirtual], timeStamp)); metrics.add(new Metric(MEMORY_RSS, size[memoryTypeResident], timeStamp)); - long procJiffies = 
getJiffies.getJiffies(s); + long procJiffies = currentServiceJiffies.get(s); long last = lastCpuJiffiesMetrics.get(s); long diff = procJiffies - last; @@ -253,24 +260,27 @@ public class SystemPoller { : new JiffiesAndCpus(); } - private void schedule(long time) { + void schedule(Duration time) { try { - systemPollTimer.schedule(new PollTask(this), time); + systemPollTimer.schedule(new PollTask(this), time.toMillis()); } catch(IllegalStateException e){ log.info("Tried to schedule task, but timer was already shut down."); } } - public void schedule() { - schedule(pollingIntervalSecs * 1000L); + void schedule() { + schedule(interval); } - private void reschedule(long skew) { - long sleep = (pollingIntervalSecs * 1000L) - skew; + private void reschedule(Duration skew) { + Duration sleep = interval.minus(skew); // Don't sleep less than 1 min - sleep = Math.max(60 * 1000, sleep); - schedule(sleep); + if ( sleep.compareTo(Duration.ofMinutes(1)) < 0) { + schedule(Duration.ofMinutes(1)); + } else { + schedule(sleep); + } } diff --git a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java index 38a0ea5ed2d..05914c40469 100644 --- a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java +++ b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPollerProvider.java @@ -4,6 +4,8 @@ package ai.vespa.metricsproxy.service; import ai.vespa.metricsproxy.core.MonitoringConfig; import com.yahoo.container.di.componentgraph.Provider; +import java.time.Duration; + /** * @author gjoranv */ @@ -17,8 +19,9 @@ public class SystemPollerProvider implements Provider { */ public SystemPollerProvider (VespaServices services, MonitoringConfig monitoringConfig) { if (runningOnLinux()) { - poller = new SystemPoller(services.getVespaServices(), 60 * monitoringConfig.intervalMinutes()); - poller.poll(); + Duration interval = 
Duration.ofMinutes(monitoringConfig.intervalMinutes()); + poller = new SystemPoller(services.getVespaServices(), interval); + poller.schedule(Duration.ofSeconds(5)); } else { poller = null; } -- cgit v1.2.3