diff options
author | Valerij Fredriksen <valerij92@gmail.com> | 2019-02-09 11:01:23 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerij92@gmail.com> | 2019-02-09 11:07:51 +0100 |
commit | 40dcf524be4ec1bda36631447cfb70d30e7f5654 (patch) | |
tree | c1a8389ad4b21958379e049230c7d73cc7de7005 /node-admin/src | |
parent | 7bf67793bac909c047f994e922f19d647a464fec (diff) |
Do not run metricsscheduler when suspended
Diffstat (limited to 'node-admin/src')
4 files changed, 44 insertions, 17 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java index 37d79d97e74..456391c65c2 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java @@ -18,6 +18,9 @@ public interface NodeAdmin { */ void refreshContainersToRun(final List<NodeSpec> containersToRun); + /** Gather node agent and its docker container metrics and forward them to the {@code MetricReceiverWrapper} */ + void updateNodeAgentMetrics(); + /** * Attempts to freeze/unfreeze all NodeAgents and itself. To freeze a NodeAgent means that * they will not pick up any changes from NodeRepository. diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index 288003ade3c..2b37dcdf69c 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -42,8 +42,6 @@ public class NodeAdminImpl implements NodeAdmin { private final ScheduledExecutorService aclScheduler = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("aclscheduler")); - private final ScheduledExecutorService metricsScheduler = - Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("metricsscheduler")); private final NodeAgentWithSchedulerFactory nodeAgentWithSchedulerFactory; private final NodeAgentContextFactory nodeAgentContextFactory; @@ -121,13 +119,15 @@ public class NodeAdminImpl implements NodeAdmin { } } - private void updateNodeAgentMetrics() { + @Override + public void updateNodeAgentMetrics() { int numberContainersWaitingImage = 0; int numberOfNewUnhandledExceptions = 0; for (NodeAgentWithScheduler nodeAgentWithScheduler : nodeAgentWithSchedulerByHostname.values()) { if (nodeAgentWithScheduler.isDownloadingImage()) numberContainersWaitingImage++; numberOfNewUnhandledExceptions += nodeAgentWithScheduler.getAndResetNumberOfUnhandledExceptions(); + nodeAgentWithScheduler.updateContainerNodeMetrics(); } numberOfContainersInLoadImageState.sample(numberContainersWaitingImage); @@ -186,15 +186,6 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void start() { - metricsScheduler.scheduleAtFixedRate(() -> { - try { - updateNodeAgentMetrics(); - nodeAgentWithSchedulerByHostname.values().forEach(NodeAgent::updateContainerNodeMetrics); - } catch (Throwable e) { - logger.warning("Metric fetcher scheduler failed", e); - } - }, 10, 55, TimeUnit.SECONDS); - aclMaintainer.ifPresent(maintainer -> { int delay = 120; // WARNING: Reducing this will increase the load on config servers. aclScheduler.scheduleWithFixedDelay(() -> { @@ -205,7 +196,6 @@ public class NodeAdminImpl implements NodeAdmin { @Override public void stop() { - metricsScheduler.shutdown(); aclScheduler.shutdown(); // Stop all node-agents in parallel, will block until the last NodeAgent is stopped @@ -213,12 +203,11 @@ public class NodeAdminImpl implements NodeAdmin { do { try { - metricsScheduler.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); aclScheduler.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); } catch (InterruptedException e) { logger.info("Was interrupted while waiting for metricsScheduler and aclScheduler to shutdown"); } - } while (!metricsScheduler.isTerminated() || !aclScheduler.isTerminated()); + } while (!aclScheduler.isTerminated()); } // Set-difference. Returns minuend minus subtrahend. diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index 13d3f3307d2..18c3a836e41 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeadmin; +import com.yahoo.concurrent.ThreadFactoryFactory; import com.yahoo.config.provision.HostName; import com.yahoo.log.LogLevel; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; @@ -10,11 +11,17 @@ import com.yahoo.vespa.hosted.provision.Node; import java.time.Duration; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN; import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.TRANSITIONING; @@ -27,6 +34,9 @@ public class NodeAdminStateUpdater { private static final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); private static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5); + private final ScheduledExecutorService metricsScheduler = + Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("metricsscheduler")); + private final NodeRepository nodeRepository; private final Orchestrator orchestrator; private final NodeAdmin nodeAdmin; @@ -34,7 +44,7 @@ public class NodeAdminStateUpdater { public enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED } - private State currentState = SUSPENDED_NODE_ADMIN; + private volatile State currentState = SUSPENDED_NODE_ADMIN; public NodeAdminStateUpdater( NodeRepository nodeRepository, @@ -49,6 +59,31 @@ public class NodeAdminStateUpdater { public void start() { nodeAdmin.start(); + + EnumSet<State> suspendedStates = EnumSet.of(SUSPENDED_NODE_ADMIN, SUSPENDED); + metricsScheduler.scheduleAtFixedRate(() -> { + try { + if (suspendedStates.contains(currentState)) return; + nodeAdmin.updateNodeAgentMetrics(); + } catch (Throwable e) { + log.log(Level.WARNING, "Metric fetcher scheduler failed", e); + } + }, 10, 55, TimeUnit.SECONDS); + } + + public void stop() { + metricsScheduler.shutdown(); + + // Stop all node-agents in parallel, will block until the last NodeAgent is stopped + nodeAdmin.stop(); + + do { + try { + metricsScheduler.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS); + } catch (InterruptedException e) { + log.info("Was interrupted while waiting for metricsScheduler and shutdown"); + } + } while (!metricsScheduler.isTerminated()); } /** diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index e475e9a53c2..0254f58e7eb 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -139,7 +139,7 @@ public class DockerTester implements AutoCloseable { @Override public void close() { // First, stop NodeAdmin and all the NodeAgents - nodeAdmin.stop(); + nodeAdminStateUpdater.stop(); terminated = true; do { |