diff options
Diffstat (limited to 'metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java')
-rw-r--r-- | metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java | 259 |
1 files changed, 259 insertions, 0 deletions
diff --git a/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java new file mode 100644 index 00000000000..9f6614668a5 --- /dev/null +++ b/metrics-proxy/src/main/java/ai/vespa/metricsproxy/service/SystemPoller.java @@ -0,0 +1,259 @@ +/* + * Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + */ + +package ai.vespa.metricsproxy.service; + +import ai.vespa.metricsproxy.metric.Metric; +import ai.vespa.metricsproxy.metric.Metrics; +import com.yahoo.log.LogLevel; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import java.util.concurrent.ConcurrentHashMap; +import java.util.logging.Logger; + +/** + * Class to get data from the system and update the services at given intervals. + * TODO: rewrite to use ScheduledExecutorService or just call poll() directly. + * + * @author Eirik Nygaard + */ +public class SystemPoller implements ServiceListener { + final private static Logger log = Logger.getLogger(SystemPoller.class.getPackage().getName()); + + private final int pollingIntervalSecs; + private volatile List<VespaService> services; + + private final int memoryTypeVirtual = 0; + private final int memoryTypeResident = 1; + private final Map<VespaService, Long> lastCpuJiffiesMetrics = new ConcurrentHashMap<>(); + private final Timer systemPollTimer; + + private long lastTotalCpuJiffies = -1; + + public SystemPoller(List<VespaService> services, int pollingIntervalSecs) { + this.services = services; + this.pollingIntervalSecs = pollingIntervalSecs; + systemPollTimer = new Timer("systemPollTimer", true); + } + + @Override + public void setServices(List<VespaService> services) { + log.log(LogLevel.DEBUG, "Setting services in SystemPoller to: " + services); + this.services = services; + } + + void stop() { + systemPollTimer.cancel(); + } + + /** + * Return memory usage for a given process, both resident and virtual is + * returned. + * + * @param service The instance to get memory usage for + * @return array[0] = memoryResident, array[1] = memoryVirtual (kB units) + */ + long[] getMemoryUsage(VespaService service) { + long size[] = new long[2]; + BufferedReader br; + int pid = service.getPid(); + + size[0] = 0; + size[1] = 0; + try { + br = new BufferedReader(new FileReader("/proc/" + pid + "/smaps")); + } catch (FileNotFoundException ex) { + markDead(service); + return size; + } + String line; + try { + while ((line = br.readLine()) != null) { + String[] elems = line.split("\\s+"); + /* Memory size is given in kB - convert to bytes by multiply with 1024*/ + if (line.startsWith("Rss:")) { + size[memoryTypeResident] += Long.parseLong(elems[1]) * 1024; + } else if (line.startsWith("Size:")) { + size[memoryTypeVirtual] += Long.parseLong(elems[1]) * 1024; + } + } + + br.close(); + } catch (IOException ex) { + log.log(LogLevel.DEBUG, "Unable to read line from smaps file", ex); + return size; + } + + return size; + } + + /** + * Mark a service as dead. + * + * @param service The service to mark as dead. + */ + private static void markDead(VespaService service) { + service.setAlive(false); + } + + /** + * Poll services for system metrics + */ + void poll() { + long startTime = System.currentTimeMillis(); + boolean someAlive = false; + + /* Don't do any work if there are no known services */ + if (services.isEmpty()) { + schedule(); + return; + } + + log.log(LogLevel.DEBUG, "Monitoring system metrics for " + services.size() + " services"); + + long sysJiffies = getNormalizedSystemJiffies(); + for (VespaService s : services) { + + + if(s.isAlive()) { + someAlive = true; + } + + Metrics metrics = new Metrics(); + log.log(LogLevel.DEBUG, "Current size of system metrics for service " + s + " is " + metrics.size()); + + long[] size = getMemoryUsage(s); + log.log(LogLevel.DEBUG, "Updating memory metric for service " + s); + + metrics.add(new Metric("memory_virt", size[memoryTypeVirtual], startTime / 1000)); + metrics.add(new Metric("memory_rss", size[memoryTypeResident], startTime / 1000)); + + long procJiffies = getPidJiffies(s); + if (lastTotalCpuJiffies >= 0 && lastCpuJiffiesMetrics.containsKey(s)) { + long last = lastCpuJiffiesMetrics.get(s); + long diff = procJiffies - last; + + if (diff >= 0) { + metrics.add(new Metric("cpu", 100 * ((double) diff) / (sysJiffies - lastTotalCpuJiffies), startTime / 1000)); + } + } + lastCpuJiffiesMetrics.put(s, procJiffies); + s.setSystemMetrics(metrics); + } + + lastTotalCpuJiffies = sysJiffies; + + // If none of the services were alive, reschedule in a short time + if (!someAlive) { + reschedule(System.currentTimeMillis() - startTime); + } else { + schedule(); + } + } + + long getPidJiffies(VespaService service) { + BufferedReader in; + String line; + String[] elems; + int pid = service.getPid(); + + try { + in = new BufferedReader(new FileReader("/proc/" + pid + "/stat")); + } catch (FileNotFoundException ex) { + log.log(LogLevel.DEBUG, "Unable to find pid in proc directory " + pid); + service.setAlive(false); + return 0; + } + + try { + line = in.readLine(); + in.close(); + } catch (IOException ex) { + log.log(LogLevel.DEBUG, "Unable to read line from process stat file", ex); + return 0; + } + + elems = line.split(" "); + + /* Add user mode and kernel mode jiffies for the given process */ + return Long.parseLong(elems[13]) + Long.parseLong(elems[14]); + } + + long getNormalizedSystemJiffies() { + BufferedReader in; + String line; + ArrayList<CpuJiffies> jiffies = new ArrayList<>(); + CpuJiffies total = null; + + try { + in = new BufferedReader(new FileReader("/proc/stat")); + } catch (FileNotFoundException ex) { + log.log(LogLevel.ERROR, "Unable to open stat file", ex); + return 0; + } + try { + while ((line = in.readLine()) != null) { + if (line.startsWith("cpu ")) { + total = new CpuJiffies(line); + } else if (line.startsWith("cpu")) { + jiffies.add(new CpuJiffies(line)); + } + } + + in.close(); + } catch (IOException ex) { + log.log(LogLevel.ERROR, "Unable to read line from stat file", ex); + return 0; + } + + /* Normalize so that a process that uses an entire CPU core will get 100% util */ + if (total != null) { + return total.getTotalJiffies() / jiffies.size(); + } else { + return 0; + } + } + + private void schedule(long time) { + try { + systemPollTimer.schedule(new PollTask(this), time); + } catch(IllegalStateException e){ + log.info("Tried to schedule task, but timer was already shut down."); + } + } + + public void schedule() { + schedule(pollingIntervalSecs * 1000); + } + + private void reschedule(long skew) { + long sleep = (pollingIntervalSecs * 1000) - skew; + + // Don't sleep less than 1 min + sleep = Math.max(60 * 1000, sleep); + schedule(sleep); + } + + + private static class PollTask extends TimerTask { + private final SystemPoller poller; + + PollTask(SystemPoller poller) { + this.poller = poller; + } + + @Override + public void run() { + poller.poll(); + } + } +} |