aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java14
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java26
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java12
3 files changed, 30 insertions, 22 deletions
diff --git a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
index 5a13c06909c..6b1d63dd12f 100644
--- a/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
+++ b/docker-api/src/main/java/com/yahoo/vespa/hosted/dockerapi/DockerImpl.java
@@ -43,6 +43,7 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -278,7 +279,7 @@ public class DockerImpl implements Docker {
public Optional<ContainerStats> getContainerStats(ContainerName containerName) {
try {
DockerStatsCallback statsCallback = dockerClient.statsCmd(containerName.asString()).exec(new DockerStatsCallback());
- statsCallback.awaitCompletion(10, TimeUnit.SECONDS);
+ statsCallback.awaitCompletion(5, TimeUnit.SECONDS);
return statsCallback.stats.map(stats -> new ContainerStatsImpl(
stats.getNetworks(), stats.getCpuStats(), stats.getMemoryStats(), stats.getBlkioStats()));
@@ -446,16 +447,27 @@ public class DockerImpl implements Docker {
}
}
+ // docker-java currently (3.0.8) does not support getting docker stats with stream=false, therefore we need
+ // to subscribe to the stream and complete as soon we get the first result.
private class DockerStatsCallback extends ResultCallbackTemplate<DockerStatsCallback, Statistics> {
private Optional<Statistics> stats = Optional.empty();
+ private final CountDownLatch completed = new CountDownLatch(1);
@Override
public void onNext(Statistics stats) {
if (stats != null) {
this.stats = Optional.of(stats);
+ completed.countDown();
onComplete();
}
}
+
+ @Override
+ public boolean awaitCompletion(long timeout, TimeUnit timeUnit) throws InterruptedException {
+ // For some reason it takes as long to execute onComplete as the awaitCompletion timeout is, therefore
+ // we have own awaitCompletion that completes as soon as we get the first result.
+ return completed.await(timeout, timeUnit);
+ }
}
private DockerClient initDockerConnection(final DockerConfig config, boolean fallbackTo123orErrors) {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index ad9352896d6..f1b2a1a434f 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -17,13 +17,13 @@ import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent;
import com.yahoo.vespa.hosted.node.admin.util.PrefixLogger;
import com.yahoo.vespa.hosted.provision.Node;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
@@ -39,15 +39,15 @@ import java.util.stream.Stream;
*/
public class NodeAdminImpl implements NodeAdmin {
private static final PrefixLogger logger = PrefixLogger.getNodeAdminLogger(NodeAdmin.class);
- private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
+ private final ScheduledExecutorService aclScheduler = Executors.newScheduledThreadPool(1);
+ private final ScheduledExecutorService metricsScheduler = Executors.newScheduledThreadPool(1);
private final DockerOperations dockerOperations;
private final Function<String, NodeAgent> nodeAgentFactory;
private final Optional<StorageMaintainer> storageMaintainer;
- private final Optional<AclMaintainer> aclMaintainer;
private AtomicBoolean frozen = new AtomicBoolean(false);
- private final Map<String, NodeAgent> nodeAgents = new HashMap<>();
+ private final Map<String, NodeAgent> nodeAgents = new ConcurrentHashMap<>();
private final int nodeAgentScanIntervalMillis;
@@ -61,7 +61,6 @@ public class NodeAdminImpl implements NodeAdmin {
this.dockerOperations = dockerOperations;
this.nodeAgentFactory = nodeAgentFactory;
this.storageMaintainer = storageMaintainer;
- this.aclMaintainer = aclMaintainer;
this.nodeAgentScanIntervalMillis = nodeAgentScanIntervalMillis;
Dimensions dimensions = new Dimensions.Builder()
@@ -72,15 +71,15 @@ public class NodeAdminImpl implements NodeAdmin {
this.numberOfContainersInLoadImageState = metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.image.loading");
this.numberOfUnhandledExceptionsInNodeAgent = metricReceiver.declareCounter(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "nodes.unhandled_exceptions");
- scheduler.scheduleWithFixedDelay(() -> {
+ metricsScheduler.scheduleAtFixedRate(() -> {
try {
nodeAgents.values().forEach(nodeAgent -> nodeAgent.updateContainerNodeMetrics(nodeAgents.size()));
} catch (Throwable e) {
logger.warning("Metric fetcher scheduler failed", e);
}
- }, 0, 30000, TimeUnit.MILLISECONDS);
+ }, 0, 30, TimeUnit.SECONDS);
- this.aclMaintainer.ifPresent(maintainer -> scheduler.scheduleAtFixedRate(maintainer, 30, 60, TimeUnit.SECONDS));
+ aclMaintainer.ifPresent(maintainer -> aclScheduler.scheduleAtFixedRate(maintainer, 30, 60, TimeUnit.SECONDS));
}
public void refreshContainersToRun(final List<ContainerNodeSpec> containersToRun) {
@@ -176,10 +175,15 @@ public class NodeAdminImpl implements NodeAdmin {
@Override
public void shutdown() {
- scheduler.shutdown();
+ metricsScheduler.shutdown();
+ aclScheduler.shutdown();
try {
- if (! scheduler.awaitTermination(30, TimeUnit.SECONDS)) {
- throw new RuntimeException("Did not manage to shutdown node-agent metrics update metricsFetcherScheduler.");
+ boolean metricsSchedulerShutdown = metricsScheduler.awaitTermination(30, TimeUnit.SECONDS);
+ boolean aclSchedulerShutdown = aclScheduler.awaitTermination(30, TimeUnit.SECONDS);
+ if (! (metricsSchedulerShutdown && aclSchedulerShutdown)) {
+ throw new RuntimeException("Failed shuttingdown all scheduler(s), shutdown status:\n" +
+ "\tMetrics Scheduler: " + metricsSchedulerShutdown + "\n" +
+ "\tACL Scheduler: " + aclSchedulerShutdown);
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 60987501d4b..aa6f762543f 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -508,21 +508,14 @@ public class NodeAgentImpl implements NodeAgent {
@SuppressWarnings("unchecked")
public void updateContainerNodeMetrics(int numAllocatedContainersOnHost) {
- logger.debug("Gathering metrics");
ContainerNodeSpec nodeSpec;
synchronized (monitor) {
nodeSpec = lastNodeSpec;
}
- if (nodeSpec == null || !vespaVersion.isPresent()) {
- logger.debug("Not updating container metrics, nodeSpec=" + nodeSpec + ", vespaVersion=" + vespaVersion);
- return;
- }
+ if (nodeSpec == null || !vespaVersion.isPresent()) return;
Optional<Docker.ContainerStats> containerStats = dockerOperations.getContainerStats(containerName);
- if ( ! containerStats.isPresent()) {
- logger.debug("Failed to get docker stats from daemon");
- return;
- }
+ if ( ! containerStats.isPresent()) return;
Docker.ContainerStats stats = containerStats.get();
Dimensions.Builder dimensionsBuilder = new Dimensions.Builder()
@@ -558,7 +551,6 @@ public class NodeAgentImpl implements NodeAgent {
double cpuPercentageOfHost = lastCpuMetric.getCpuUsagePercentage(currentCpuContainerTotalTime, currentCpuSystemTotalTime);
double cpuPercentageOfAllocated = numAllocatedContainersOnHost * cpuPercentageOfHost;
metricReceiver.declareGauge(MetricReceiverWrapper.APPLICATION_DOCKER, dimensions, "node.cpu.busy.pct").sample(cpuPercentageOfAllocated);
- logger.debug("Updated CPU busy metric with: " + cpuPercentageOfAllocated);
addIfNotNull(dimensions, "node.cpu.throttled_time", stats.getCpuStats().get("throttling_data"), "throttled_time");
addIfNotNull(dimensions, "node.memory.limit", stats.getMemoryStats(), "limit");