diff options
author | Håkon Hallingstad <hakon@oath.com> | 2018-11-01 09:33:23 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@oath.com> | 2018-11-01 09:33:23 +0100 |
commit | 1604202eb303fa8577e915d76b19c93435bbeafd (patch) | |
tree | f41fdfd6be48cc8ae9beb40935df0eb6a122a3a2 | |
parent | 10b69f509f0e91c8dddcf8d695a5dcafa36f41ba (diff) |
Define HealthChecker interface
4 files changed, 37 insertions, 49 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java new file mode 100644 index 00000000000..e0b761ec816 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java @@ -0,0 +1,14 @@ +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +/** + * Interface for verifying the health of the node. + * + * @author hakonhall + */ +public interface HealthChecker extends AutoCloseable { + /** Verify the health of an active node, just before updating the node repo and calling Orchestrator resume. */ + void verifyHealth(NodeAgentContext context); + + @Override + void close(); +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 4e8cd00cf70..25b1a48b491 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -79,8 +79,10 @@ public class NodeAgentImpl implements NodeAgent { private int numberOfUnhandledException = 0; private Instant lastConverge; + private Node.State lastState = null; private final Thread loopThread; + private final Optional<HealthChecker> healthChecker; private final ScheduledExecutorService filebeatRestarter = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter")); @@ -119,7 +121,8 @@ public class NodeAgentImpl implements NodeAgent { final Clock clock, final Duration timeBetweenEachConverge, final Optional<AthenzCredentialsMaintainer> athenzCredentialsMaintainer, - final Optional<AclMaintainer> aclMaintainer) { + final Optional<AclMaintainer> aclMaintainer, + final Optional<HealthChecker> healthChecker) { this.context = context; this.nodeRepository = nodeRepository; this.orchestrator = orchestrator; @@ -130,6 +133,7 @@ public class NodeAgentImpl implements NodeAgent { this.lastConverge = clock.instant(); this.athenzCredentialsMaintainer = athenzCredentialsMaintainer; this.aclMaintainer = aclMaintainer; + this.healthChecker = healthChecker; this.loopThread = new Thread(() -> { try { @@ -193,15 +197,11 @@ public class NodeAgentImpl implements NodeAgent { } } while (loopThread.isAlive() || !filebeatRestarter.isTerminated()); + healthChecker.ifPresent(HealthChecker::close); + context.log(logger, "Stopped"); } - /** - * Verifies that service is healthy, otherwise throws an exception. The default implementation does - * nothing, override if it's necessary to verify that a service is healthy before resuming. - */ - protected void verifyHealth(NodeSpec node) { } - void startServicesIfNeeded() { if (!hasStartedServices) { context.log(logger, "Starting services"); @@ -430,7 +430,6 @@ public class NodeAgentImpl implements NodeAgent { isFrozenCopy = isFrozen; } - doAtTickStart(isFrozen); boolean converged = false; if (isFrozenCopy) { @@ -452,14 +451,22 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, LogLevel.ERROR, "Unhandled exception, ignoring.", e); } } + } - doAtTickEnd(converged); + private String stateDescription(Node.State state) { + return state == null ? "[absent]" : state.toString(); } // Public for testing void converge() { final Optional<NodeSpec> optionalNode = nodeRepository.getOptionalNode(context.hostname().value()); + Node.State newState = optionalNode.map(NodeSpec::getState).orElse(null); + if (newState != lastState) { + context.log(logger, LogLevel.INFO, "State changed: " + stateDescription(lastState) + " -> " + stateDescription(newState)); + lastState = newState; + } + // We just removed the node from node repo, so this is expected until NodeAdmin stop this NodeAgent if (!optionalNode.isPresent() && expectNodeNotInNodeRepo) return; @@ -508,13 +515,10 @@ public class NodeAgentImpl implements NodeAgent { aclMaintainer.ifPresent(AclMaintainer::converge); } - verifyHealth(node); startServicesIfNeeded(); resumeNodeIfNeeded(node); - athenzCredentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context)); - - doBeforeConverge(node); + healthChecker.ifPresent(checker -> checker.verifyHealth(context)); // Because it's more important to stop a bad release from rolling out in prod, // we put the resume call last. So if we fail after updating the node repo attributes @@ -551,39 +555,6 @@ public class NodeAgentImpl implements NodeAgent { } } - /** - * Execute at start of tick - * - * WARNING: MUST NOT throw an exception - * - * @param frozen whether the agent is frozen - */ - protected void doAtTickStart(boolean frozen) {} - - /** - * Execute at end of tick - * - * WARNING: MUST NOT throw an exception - * - * @param converged Whether the tick converged: converge() was called without exception - */ - protected void doAtTickEnd(boolean converged) {} - - /** - * Execute at end of a (so far) successful converge of an active node - * - * Method a subclass can override to execute code: - * - Called right before the node repo is updated with converged attributes, and - * Orchestrator resume() is called - * - The only way to avoid a successful converge and the update to the node repo - * and Orchestrator is to throw an exception - * - The method is only called in a tick if the node is active, not frozen, and - * there are no prior phases of the converge that fails - * - * @throws RuntimeException to fail the convergence - */ - protected void doBeforeConverge(NodeSpec node) {} - private void stopFilebeatSchedulerIfNeeded() { if (currentFilebeatRestarter.isPresent()) { currentFilebeatRestarter.get().cancel(true); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index 57835a3a759..e22606104f1 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -95,7 +95,7 @@ public class DockerTester implements AutoCloseable { MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl( new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository, - orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty()); + orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty(), Optional.empty()); nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC()); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator, nodeAdmin, HOST_HOSTNAME); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 0e72ef15f6a..5dbc3ecd1e9 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -70,6 +70,7 @@ public class NodeAgentImplTest { private final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class); private final MetricReceiverWrapper metricReceiver = new MetricReceiverWrapper(MetricReceiver.nullImplementation); private final AclMaintainer aclMaintainer = mock(AclMaintainer.class); + private final HealthChecker healthChecker = mock(HealthChecker.class); private final ContainerStats emptyContainerStats = new ContainerStats(Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); private final AthenzCredentialsMaintainer athenzCredentialsMaintainer = mock(AthenzCredentialsMaintainer.class); @@ -191,12 +192,13 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).startServices(any()); verify(orchestrator, never()).suspend(any(String.class)); - final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer); + final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer, healthChecker); inOrder.verify(dockerOperations, times(1)).pullImageAsyncIfNeeded(eq(dockerImage)); inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); inOrder.verify(dockerOperations, times(1)).startContainer(eq(context)); inOrder.verify(aclMaintainer, times(1)).converge(); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); + inOrder.verify(healthChecker, times(1)).verifyHealth(eq(context)); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes().withDockerImage(dockerImage)); inOrder.verify(orchestrator).resume(hostName); @@ -707,7 +709,8 @@ public class NodeAgentImplTest { doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); return new NodeAgentImpl(context, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer)); + storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer), + Optional.of(healthChecker)); } private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { |