diff options
3 files changed, 30 insertions, 22 deletions
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java index 5a13c06909c..6b1d63dd12f 100644 --- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java +++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java @@ -43,6 +43,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.logging.Logger; import java.util.stream.Collectors; @@ -278,7 +279,7 @@ public class DockerImpl implements Docker { public Optional<ContainerStats> getContainerStats(ContainerName containerName) { try { DockerStatsCallback statsCallback = dockerClient.statsCmd(containerName.asString()).exec(new DockerStatsCallback()); - statsCallback.awaitCompletion(10, TimeUnit.SECONDS); + statsCallback.awaitCompletion(5, TimeUnit.SECONDS); return statsCallback.stats.map(stats -> new ContainerStatsImpl( stats.getNetworks(), stats.getCpuStats(), stats.getMemoryStats(), stats.getBlkioStats())); @@ -446,16 +447,27 @@ public class DockerImpl implements Docker { } } + // docker-java currently (3.0.8) does not support getting docker stats with stream=false, therefore we need + // to subscribe to the stream and complete as soon we get the first result. private class DockerStatsCallback extends ResultCallbackTemplate<DockerStatsCallback, Statistics> { private Optional<Statistics> stats = Optional.empty(); + private final CountDownLatch completed = new CountDownLatch(1); @Override public void onNext(Statistics stats) { if (stats != null) { this.stats = Optional.of(stats); + completed.countDown(); onComplete(); } } + + @Override + public boolean awaitCompletion(long timeout, TimeUnit timeUnit) throws InterruptedException { + // For some reason it takes as long to execute onComplete as the awaitCompletion timeout is, therefore + // we have own awaitCompletion that completes as soon as we get the first result. + return completed.await(timeout, timeUnit); + } } private DockerClient initDockerConnection(final DockerConfig config, boolean fallbackTo123orErrors) { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index ad9352896d6..f1b2a1a434f 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -17,13 +17,13 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent; import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger; import com.yahoo.vespa.hosted.provision.Node; -import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -39,15 +39,15 @@ import java.util.stream.Stream; */ public class NodeAdminImpl implements NodeAdmin { private static final PrefixLogger logger = PrefixLogger.getNodeAdminLogger(NodeAdmin.class); - private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + private final ScheduledExecutorService aclScheduler = Executors.newScheduledThreadPool(1); + private final ScheduledExecutorService metricsScheduler = Executors.newScheduledThreadPool(1); private final DockerOperations dockerOperations; private final Function<String, NodeAgent> nodeAgentFactory; private final Optional<StorageMaintainer> storageMaintainer; - private final Optional<AclMaintainer> aclMaintainer; private AtomicBoolean frozen = new AtomicBoolean(false); - private final Map<String, NodeAgent> nodeAgents = new HashMap<>(); + private final Map<String, NodeAgent> nodeAgents = new ConcurrentHashMap<>(); private final int nodeAgentScanIntervalMillis; @@ -61,7 +61,6 @@ public class NodeAdminImpl implements NodeAdmin { this.dockerOperations = dockerOperations; this.nodeAgentFactory = nodeAgentFactory; this.storageMaintainer = storageMaintainer; - this.aclMaintainer = aclMaintainer; this.nodeAgentScanIntervalMillis = nodeAgentScanIntervalMillis; Dimensions dimensions = new Dimensions.Builder() @@ -72,15 +71,15 @@ public class NodeAdminImpl implements NodeAdmin { this.numberOfContainersInLoadImageState = metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.image.loading"); this.numberOfUnhandledExceptionsInNodeAgent = metricReceiver.declareCounter(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.unhandled_exceptions"); - scheduler.scheduleWithFixedDelay(() -> { + metricsScheduler.scheduleAtFixedRate(() -> { try { nodeAgents.values().forEach(nodeAgent -> nodeAgent.updateContainerNodeMetrics(nodeAgents.size())); } catch (Throwable e) { logger.warning("Metric fetcher scheduler failed", e); } - }, 0, 30000, TimeUnit.MILLISECONDS); + }, 0, 30, TimeUnit.SECONDS); - this.aclMaintainer.ifPresent(maintainer -> scheduler.scheduleAtFixedRate(maintainer, 30, 60, TimeUnit.SECONDS)); + aclMaintainer.ifPresent(maintainer -> aclScheduler.scheduleAtFixedRate(maintainer, 30, 60, TimeUnit.SECONDS)); } public void refreshContainersToRun(final List<ContainerNodeSpec> containersToRun) { @@ -176,10 +175,15 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void shutdown() { - scheduler.shutdown(); + metricsScheduler.shutdown(); + aclScheduler.shutdown(); try { - if (! scheduler.awaitTermination(30, TimeUnit.SECONDS)) { - throw new RuntimeException("Did not manage to shutdown node-agent metrics update metricsFetcherScheduler."); + boolean metricsSchedulerShutdown = metricsScheduler.awaitTermination(30, TimeUnit.SECONDS); + boolean aclSchedulerShutdown = aclScheduler.awaitTermination(30, TimeUnit.SECONDS); + if (! (metricsSchedulerShutdown && aclSchedulerShutdown)) { + throw new RuntimeException("Failed shuttingdown all scheduler(s), shutdown status:\n" + + "\tMetrics Scheduler: " + metricsSchedulerShutdown + "\n" + + "\tACL Scheduler: " + aclSchedulerShutdown); } } catch (InterruptedException e) { throw new RuntimeException(e); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 60987501d4b..aa6f762543f 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -508,21 +508,14 @@ public class NodeAgentImpl implements NodeAgent { @SuppressWarnings("unchecked") public void updateContainerNodeMetrics(int numAllocatedContainersOnHost) { - logger.debug("Gathering metrics"); ContainerNodeSpec nodeSpec; synchronized (monitor) { nodeSpec = lastNodeSpec; } - if (nodeSpec == null || !vespaVersion.isPresent()) { - logger.debug("Not updating container metrics, nodeSpec=" + nodeSpec + ", vespaVersion=" + vespaVersion); - return; - } + if (nodeSpec == null || !vespaVersion.isPresent()) return; Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName); - if ( ! containerStats.isPresent()) { - logger.debug("Failed to get docker stats from daemon"); - return; - } + if ( ! containerStats.isPresent()) return; Docker.ContainerStats stats = containerStats.get(); Dimensions.Builder dimensionsBuilder = new Dimensions.Builder() @@ -558,7 +551,6 @@ public class NodeAgentImpl implements NodeAgent { double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime); double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost; metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentageOfAllocated); - logger.debug("Updated CPU busy metric with: " + cpuPercentageOfAllocated); addIfNotNull(dimensions, "node.cpu.throttled_time", stats.getCpuStats().get("throttling_data"), "throttled_time"); addIfNotNull(dimensions, "node.memory.limit", stats.getMemoryStats(), "limit"); |