diff options
Diffstat (limited to 'node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java')
-rw-r--r-- | node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java | 222 |
1 files changed, 23 insertions, 199 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 90eda96d445..77c08133e82 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -1,7 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeagent; -import com.fasterxml.jackson.core.JsonProcessingException; import com.yahoo.config.provision.DockerImage; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.zone.ZoneApi; @@ -12,13 +11,8 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.hosted.dockerapi.Container; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; -import com.yahoo.vespa.hosted.dockerapi.ContainerStats; import com.yahoo.vespa.hosted.dockerapi.exception.ContainerNotFoundException; import com.yahoo.vespa.hosted.dockerapi.exception.DockerException; -import com.yahoo.vespa.hosted.dockerapi.exception.DockerExecTimeoutException; -import com.yahoo.vespa.hosted.dockerapi.metrics.DimensionMetrics; -import com.yahoo.vespa.hosted.dockerapi.metrics.Dimensions; -import com.yahoo.vespa.hosted.dockerapi.metrics.Metrics; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeAttributes; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeOwner; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; @@ -31,11 +25,8 @@ import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.maintenance.acl.AclMaintainer; import com.yahoo.vespa.hosted.node.admin.maintenance.identity.CredentialsMaintainer; import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException; -import com.yahoo.vespa.hosted.node.admin.util.SecretAgentCheckConfig; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; @@ -68,16 +59,16 @@ public class NodeAgentImpl implements NodeAgent { private final Optional<CredentialsMaintainer> credentialsMaintainer; private final Optional<AclMaintainer> aclMaintainer; private final Optional<HealthChecker> healthChecker; - private final DoubleFlag containerCpuCap; + private Thread loopThread; + private ContainerState containerState = UNKNOWN; + private NodeSpec lastNode; + private int numberOfUnhandledException = 0; private long currentRebootGeneration = 0; private Optional<Long> currentRestartGeneration = Optional.empty(); - private final Thread loopThread; - - /** * ABSENT means container is definitely absent - A container that was absent will not suddenly appear without * NodeAgent explicitly starting it. @@ -92,10 +83,6 @@ public class NodeAgentImpl implements NodeAgent { UNKNOWN } - private ContainerState containerState = UNKNOWN; - - private NodeSpec lastNode = null; - private CpuUsageReporter lastCpuMetric = new CpuUsageReporter(); // Created in NodeAdminImpl public NodeAgentImpl( @@ -116,11 +103,15 @@ public class NodeAgentImpl implements NodeAgent { this.credentialsMaintainer = credentialsMaintainer; this.aclMaintainer = aclMaintainer; this.healthChecker = healthChecker; + this.containerCpuCap = Flags.CONTAINER_CPU_CAP.bindTo(flagSource); + } - this.containerCpuCap = Flags.CONTAINER_CPU_CAP.bindTo(flagSource) - .with(FetchVector.Dimension.HOSTNAME, contextSupplier.currentContext().node().hostname()); + @Override + public void start(NodeAgentContext initialContext) { + if (loopThread != null) + throw new IllegalStateException("Can not re-start a node agent."); - this.loopThread = new Thread(() -> { + loopThread = new Thread(() -> { while (!terminated.get()) { try { NodeAgentContext context = contextSupplier.nextContext(); @@ -128,19 +119,15 @@ public class NodeAgentImpl implements NodeAgent { } catch (InterruptedException ignored) { } } }); - this.loopThread.setName("tick-" + contextSupplier.currentContext().hostname()); - } - - @Override - public void start() { + loopThread.setName("tick-" + initialContext.hostname()); loopThread.start(); } @Override - public void stopForRemoval() { - if (!terminated.compareAndSet(false, true)) { - throw new RuntimeException("Can not re-stop a node agent."); - } + public void stopForRemoval(NodeAgentContext context) { + if (!terminated.compareAndSet(false, true)) + throw new IllegalStateException("Can not re-stop a node agent."); + contextSupplier.interrupt(); do { @@ -149,7 +136,7 @@ public class NodeAgentImpl implements NodeAgent { } catch (InterruptedException ignored) { } } while (loopThread.isAlive()); - contextSupplier.currentContext().log(logger, "Stopped"); + context.log(logger, "Stopped"); } void startServicesIfNeeded(NodeAgentContext context) { @@ -209,7 +196,6 @@ public class NodeAgentImpl implements NodeAgent { ContainerData containerData = createContainerData(context); dockerOperations.createContainer(context, containerData, getContainerResources(context)); dockerOperations.startContainer(context); - lastCpuMetric = new CpuUsageReporter(); hasStartedServices = true; // Automatically started with the container hasResumedNode = false; @@ -255,8 +241,7 @@ public class NodeAgentImpl implements NodeAgent { } } - private void stopServices() { - NodeAgentContext context = contextSupplier.currentContext(); + private void stopServices(NodeAgentContext context) { context.log(logger, "Stopping services"); if (containerState == ABSENT) return; try { @@ -268,13 +253,11 @@ public class NodeAgentImpl implements NodeAgent { } @Override - public void stopForHostSuspension() { - NodeAgentContext context = contextSupplier.currentContext(); + public void stopForHostSuspension(NodeAgentContext context) { getContainer(context).ifPresent(container -> removeContainer(context, container, "suspending host", true)); } - public void suspend() { - NodeAgentContext context = contextSupplier.currentContext(); + public void suspend(NodeAgentContext context) { context.log(logger, "Suspending services on node"); if (containerState == ABSENT) return; try { @@ -331,9 +314,9 @@ public class NodeAgentImpl implements NodeAgent { try { if (context.node().state() != NodeState.dirty) { - suspend(); + suspend(context); } - stopServices(); + stopServices(context); } catch (Exception e) { context.log(logger, LogLevel.WARNING, "Failed stopping services, ignoring", e); } @@ -365,6 +348,7 @@ public class NodeAgentImpl implements NodeAgent { .map(NodeOwner::asApplicationId) .map(appId -> containerCpuCap.with(FetchVector.Dimension.APPLICATION_ID, appId.serializedForm())) .orElse(containerCpuCap) + .with(FetchVector.Dimension.HOSTNAME, context.node().hostname()) .value() * context.node().vcpus(); return ContainerResources.from(cpuCap, context.node().vcpus(), context.node().memoryGb()); @@ -415,12 +399,6 @@ public class NodeAgentImpl implements NodeAgent { currentRestartGeneration.map(current -> current < node.currentRestartGeneration().get()).orElse(false)) currentRestartGeneration = node.currentRestartGeneration(); - // Every time the node spec changes, we should clear the metrics for this container as the dimensions - // will change and we will be reporting duplicate metrics. - if (container.map(c -> c.state.isRunning()).orElse(false)) { - storageMaintainer.writeMetricsConfig(context); - } - lastNode = node; } @@ -513,100 +491,6 @@ public class NodeAgentImpl implements NodeAgent { } } - @SuppressWarnings("unchecked") - public void updateContainerNodeMetrics() { - if (containerState != UNKNOWN) return; - final NodeAgentContext context = contextSupplier.currentContext(); - final NodeSpec node = context.node(); - - Optional<ContainerStats> containerStats = dockerOperations.getContainerStats(context); - if (!containerStats.isPresent()) return; - - Dimensions.Builder dimensionsBuilder = new Dimensions.Builder() - .add("host", context.hostname().value()) - .add("role", SecretAgentCheckConfig.nodeTypeToRole(context.nodeType())) - .add("state", node.state().toString()); - node.parentHostname().ifPresent(parent -> dimensionsBuilder.add("parentHostname", parent)); - node.allowedToBeDown().ifPresent(allowed -> - dimensionsBuilder.add("orchestratorState", allowed ? "ALLOWED_TO_BE_DOWN" : "NO_REMARKS")); - Dimensions dimensions = dimensionsBuilder.build(); - - ContainerStats stats = containerStats.get(); - final String APP = Metrics.APPLICATION_NODE; - final int totalNumCpuCores = stats.getCpuStats().getOnlineCpus(); - final long memoryTotalBytes = stats.getMemoryStats().getLimit(); - final long memoryTotalBytesUsage = stats.getMemoryStats().getUsage(); - final long memoryTotalBytesCache = stats.getMemoryStats().getCache(); - final long diskTotalBytes = (long) (node.diskGb() * BYTES_IN_GB); - final Optional<Long> diskTotalBytesUsed = storageMaintainer.getDiskUsageFor(context); - - lastCpuMetric.updateCpuDeltas(stats.getCpuStats()); - - // Ratio of CPU cores allocated to this container to total number of CPU cores on this host - final double allocatedCpuRatio = node.vcpus() / totalNumCpuCores; - double cpuUsageRatioOfAllocated = lastCpuMetric.getCpuUsageRatio() / allocatedCpuRatio; - double cpuKernelUsageRatioOfAllocated = lastCpuMetric.getCpuKernelUsageRatio() / allocatedCpuRatio; - double cpuThrottledTimeRate = lastCpuMetric.getThrottledTimeRate(); - double cpuThrottledCpuTimeRate = lastCpuMetric.getThrottledCpuTimeRate(); - - long memoryTotalBytesUsed = memoryTotalBytesUsage - memoryTotalBytesCache; - double memoryUsageRatio = (double) memoryTotalBytesUsed / memoryTotalBytes; - double memoryTotalUsageRatio = (double) memoryTotalBytesUsage / memoryTotalBytes; - Optional<Double> diskUsageRatio = diskTotalBytesUsed.map(used -> (double) used / diskTotalBytes); - - List<DimensionMetrics> metrics = new ArrayList<>(); - DimensionMetrics.Builder systemMetricsBuilder = new DimensionMetrics.Builder(APP, dimensions) - .withMetric("mem.limit", memoryTotalBytes) - .withMetric("mem.used", memoryTotalBytesUsed) - .withMetric("mem.util", 100 * memoryUsageRatio) - .withMetric("mem_total.used", memoryTotalBytesUsage) - .withMetric("mem_total.util", 100 * memoryTotalUsageRatio) - .withMetric("cpu.util", 100 * cpuUsageRatioOfAllocated) - .withMetric("cpu.sys.util", 100 * cpuKernelUsageRatioOfAllocated) - .withMetric("cpu.throttled_time.rate", cpuThrottledTimeRate) - .withMetric("cpu.throttled_cpu_time.rate", cpuThrottledCpuTimeRate) - .withMetric("cpu.vcpus", node.vcpus()) - .withMetric("disk.limit", diskTotalBytes); - - diskTotalBytesUsed.ifPresent(diskUsed -> systemMetricsBuilder.withMetric("disk.used", diskUsed)); - diskUsageRatio.ifPresent(diskRatio -> systemMetricsBuilder.withMetric("disk.util", 100 * diskRatio)); - metrics.add(systemMetricsBuilder.build()); - - stats.getNetworks().forEach((interfaceName, interfaceStats) -> { - Dimensions netDims = dimensionsBuilder.add("interface", interfaceName).build(); - DimensionMetrics networkMetrics = new DimensionMetrics.Builder(APP, netDims) - .withMetric("net.in.bytes", interfaceStats.getRxBytes()) - .withMetric("net.in.errors", interfaceStats.getRxErrors()) - .withMetric("net.in.dropped", interfaceStats.getRxDropped()) - .withMetric("net.out.bytes", interfaceStats.getTxBytes()) - .withMetric("net.out.errors", interfaceStats.getTxErrors()) - .withMetric("net.out.dropped", interfaceStats.getTxDropped()) - .build(); - metrics.add(networkMetrics); - }); - - pushMetricsToContainer(context, metrics); - } - - private void pushMetricsToContainer(NodeAgentContext context, List<DimensionMetrics> metrics) { - StringBuilder params = new StringBuilder(); - try { - for (DimensionMetrics dimensionMetrics : metrics) { - params.append(dimensionMetrics.toSecretAgentReport()); - } - String wrappedMetrics = "s:" + params.toString(); - - // Push metrics to the metrics proxy in each container. - // TODO Remove port selection logic when all hosted apps have upgraded to Vespa 7. - int port = context.node().currentVespaVersion().map(version -> version.getMajor() == 6).orElse(false) ? 19091 : 19095; - String[] command = {"vespa-rpc-invoke", "-t", "2", "tcp/localhost:" + port, "setExtraMetrics", wrappedMetrics}; - dockerOperations.executeCommandInContainerAsRoot(context, 5L, command); - } catch (JsonProcessingException | DockerExecTimeoutException e) { - context.log(logger, LogLevel.WARNING, "Failed to push metrics to container", e); - } - - } - private Optional<Container> getContainer(NodeAgentContext context) { if (containerState == ABSENT) return Optional.empty(); Optional<Container> container = dockerOperations.getContainer(context); @@ -621,66 +505,6 @@ public class NodeAgentImpl implements NodeAgent { return temp; } - class CpuUsageReporter { - private static final double PERIOD_IN_NANOSECONDS = 1_000d * ContainerResources.CPU_PERIOD_US; - private long containerKernelUsage = 0; - private long totalContainerUsage = 0; - private long totalSystemUsage = 0; - private long throttledTime = 0; - private long throttlingActivePeriods = 0; - private long throttledPeriods = 0; - - private long deltaContainerKernelUsage; - private long deltaContainerUsage; - private long deltaSystemUsage; - private long deltaThrottledTime; - private long deltaThrottlingActivePeriods; - private long deltaThrottledPeriods; - - private void updateCpuDeltas(ContainerStats.CpuStats cpuStats) { - // Do not calculate delta during the first tick - that will result in a metric value that is - // average since container start - if (totalSystemUsage != 0) { - deltaSystemUsage = cpuStats.getSystemCpuUsage() - totalSystemUsage; - deltaContainerUsage = cpuStats.getTotalUsage() - totalContainerUsage; - deltaContainerKernelUsage = cpuStats.getUsageInKernelMode() - containerKernelUsage; - deltaThrottledTime = cpuStats.getThrottledTime() - throttledTime; - deltaThrottlingActivePeriods = cpuStats.getThrottlingActivePeriods() - throttlingActivePeriods; - deltaThrottledPeriods = cpuStats.getThrottledPeriods() - throttledPeriods; - } - - totalSystemUsage = cpuStats.getSystemCpuUsage(); - totalContainerUsage = cpuStats.getTotalUsage(); - containerKernelUsage = cpuStats.getUsageInKernelMode(); - throttledTime = cpuStats.getThrottledTime(); - throttlingActivePeriods = cpuStats.getThrottlingActivePeriods(); - throttledPeriods = cpuStats.getThrottledPeriods(); - } - - /** - * Returns the CPU usage ratio for the docker container that this NodeAgent is managing - * in the time between the last two times updateCpuDeltas() was called. This is calculated - * by dividing the CPU time used by the container with the CPU time used by the entire system. - */ - double getCpuUsageRatio() { - return deltaSystemUsage == 0 ? Double.NaN : (double) deltaContainerUsage / deltaSystemUsage; - } - - double getCpuKernelUsageRatio() { - return deltaSystemUsage == 0 ? Double.NaN : (double) deltaContainerKernelUsage / deltaSystemUsage; - } - - double getThrottledTimeRate() { - return deltaThrottlingActivePeriods == 0 ? Double.NaN : - (double) deltaThrottledPeriods / deltaThrottlingActivePeriods; - } - - double getThrottledCpuTimeRate() { - return deltaThrottlingActivePeriods == 0 ? Double.NaN : - deltaThrottledTime / (PERIOD_IN_NANOSECONDS * deltaThrottlingActivePeriods); - } - } - // TODO: Also skip orchestration if we're downgrading in test/staging // How to implement: // - test/staging: We need to figure out whether we're in test/staging, zone is available in Environment |