diff options
author | HÃ¥kon Hallingstad <hakon@oath.com> | 2018-11-01 11:28:21 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-11-01 11:28:21 +0100 |
commit | 5011d0ab2fcf67437e27c6293290e697d9e1e0aa (patch) | |
tree | abd0a166358246fbad5a781473710dd1211b5486 /node-admin/src | |
parent | b5b0326938349a7c21513b00b613dd2ba637357a (diff) | |
parent | 14efb76a6659b5d05907a7693c3330831b4d700c (diff) |
Merge pull request #7523 from vespa-engine/hakonhall/define-healthchecker-interface
Define HealthChecker interface
Diffstat (limited to 'node-admin/src')
4 files changed, 52 insertions, 49 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java new file mode 100644 index 00000000000..e0b761ec816 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java @@ -0,0 +1,14 @@ +package com.yahoo.vespa.hosted.node.admin.nodeagent; + +/** + * Interface for verifying the health of the node. + * + * @author hakonhall + */ +public interface HealthChecker extends AutoCloseable { + /** Verify the health of an active node, just before updating the node repo and calling Orchestrator resume. */ + void verifyHealth(NodeAgentContext context); + + @Override + void close(); +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index 4e8cd00cf70..01b42b7ed81 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -41,6 +41,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; +import java.util.function.Function; import java.util.logging.Logger; import static com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.ABSENT; @@ -81,6 +82,7 @@ public class NodeAgentImpl implements NodeAgent { private Instant lastConverge; private final Thread loopThread; + private final Optional<HealthChecker> healthChecker; private final ScheduledExecutorService filebeatRestarter = Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter")); @@ -119,7 +121,8 @@ public class NodeAgentImpl implements NodeAgent { final Clock clock, final Duration timeBetweenEachConverge, final Optional<AthenzCredentialsMaintainer> athenzCredentialsMaintainer, - final Optional<AclMaintainer> aclMaintainer) { + final Optional<AclMaintainer> aclMaintainer, + final Optional<HealthChecker> healthChecker) { this.context = context; this.nodeRepository = nodeRepository; this.orchestrator = orchestrator; @@ -130,6 +133,7 @@ public class NodeAgentImpl implements NodeAgent { this.lastConverge = clock.instant(); this.athenzCredentialsMaintainer = athenzCredentialsMaintainer; this.aclMaintainer = aclMaintainer; + this.healthChecker = healthChecker; this.loopThread = new Thread(() -> { try { @@ -196,12 +200,6 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, "Stopped"); } - /** - * Verifies that service is healthy, otherwise throws an exception. The default implementation does - * nothing, override if it's necessary to verify that a service is healthy before resuming. - */ - protected void verifyHealth(NodeSpec node) { } - void startServicesIfNeeded() { if (!hasStartedServices) { context.log(logger, "Starting services"); @@ -430,7 +428,6 @@ public class NodeAgentImpl implements NodeAgent { isFrozenCopy = isFrozen; } - doAtTickStart(isFrozen); boolean converged = false; if (isFrozenCopy) { @@ -452,8 +449,6 @@ public class NodeAgentImpl implements NodeAgent { context.log(logger, LogLevel.ERROR, "Unhandled exception, ignoring.", e); } } - - doAtTickEnd(converged); } // Public for testing @@ -461,7 +456,10 @@ public class NodeAgentImpl implements NodeAgent { final Optional<NodeSpec> optionalNode = nodeRepository.getOptionalNode(context.hostname().value()); // We just removed the node from node repo, so this is expected until NodeAdmin stop this NodeAgent - if (!optionalNode.isPresent() && expectNodeNotInNodeRepo) return; + if (!optionalNode.isPresent() && expectNodeNotInNodeRepo) { + context.log(logger, LogLevel.INFO, "Node removed from node repo (as expected)"); + return; + } final NodeSpec node = optionalNode.orElseThrow(() -> new IllegalStateException(String.format("Node '%s' missing from node repository", context.hostname()))); @@ -469,13 +467,14 @@ public class NodeAgentImpl implements NodeAgent { Optional<Container> container = getContainer(); if (!node.equals(lastNode)) { + logChangesToNodeSpec(lastNode, node); + // Every time the node spec changes, we should clear the metrics for this container as the dimensions // will change and we will be reporting duplicate metrics. if (container.map(c -> c.state.isRunning()).orElse(false)) { storageMaintainer.writeMetricsConfig(context, node); } - context.log(logger, LogLevel.DEBUG, "Loading new node spec: " + node.toString()); lastNode = node; } @@ -508,13 +507,10 @@ public class NodeAgentImpl implements NodeAgent { aclMaintainer.ifPresent(AclMaintainer::converge); } - verifyHealth(node); startServicesIfNeeded(); resumeNodeIfNeeded(node); - athenzCredentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context)); - - doBeforeConverge(node); + healthChecker.ifPresent(checker -> checker.verifyHealth(context)); // Because it's more important to stop a bad release from rolling out in prod, // we put the resume call last. So if we fail after updating the node repo attributes @@ -551,38 +547,28 @@ public class NodeAgentImpl implements NodeAgent { } } - /** - * Execute at start of tick - * - * WARNING: MUST NOT throw an exception - * - * @param frozen whether the agent is frozen - */ - protected void doAtTickStart(boolean frozen) {} + private void logChangesToNodeSpec(NodeSpec lastNode, NodeSpec node) { + StringBuilder builder = new StringBuilder(); + appendIfDifferent(builder, "state", lastNode, node, NodeSpec::getState); + if (builder.length() > 0) { + context.log(logger, LogLevel.INFO, "Changes to node: " + builder.toString()); + } + } - /** - * Execute at end of tick - * - * WARNING: MUST NOT throw an exception - * - * @param converged Whether the tick converged: converge() was called without exception - */ - protected void doAtTickEnd(boolean converged) {} + private static <T> String fieldDescription(T value) { + return value == null ? "[absent]" : value.toString(); + } - /** - * Execute at end of a (so far) successful converge of an active node - * - * Method a subclass can override to execute code: - * - Called right before the node repo is updated with converged attributes, and - * Orchestrator resume() is called - * - The only way to avoid a successful converge and the update to the node repo - * and Orchestrator is to throw an exception - * - The method is only called in a tick if the node is active, not frozen, and - * there are no prior phases of the converge that fails - * - * @throws RuntimeException to fail the convergence - */ - protected void doBeforeConverge(NodeSpec node) {} + private <T> void appendIfDifferent(StringBuilder builder, String name, NodeSpec oldNode, NodeSpec newNode, Function<NodeSpec, T> getter) { + T oldValue = oldNode == null ? null : getter.apply(oldNode); + T newValue = getter.apply(newNode); + if (!Objects.equals(oldValue, newValue)) { + if (builder.length() > 0) { + builder.append(", "); + } + builder.append(name).append(" ").append(fieldDescription(oldValue)).append(" -> ").append(fieldDescription(newValue)); + } + } private void stopFilebeatSchedulerIfNeeded() { if (currentFilebeatRestarter.isPresent()) { diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index 57835a3a759..e22606104f1 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -95,7 +95,7 @@ public class DockerTester implements AutoCloseable { MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl( new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository, - orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty()); + orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty(), Optional.empty()); nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC()); nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator, nodeAdmin, HOST_HOSTNAME); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java index 0e72ef15f6a..5dbc3ecd1e9 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java @@ -70,6 +70,7 @@ public class NodeAgentImplTest { private final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class); private final MetricReceiverWrapper metricReceiver = new MetricReceiverWrapper(MetricReceiver.nullImplementation); private final AclMaintainer aclMaintainer = mock(AclMaintainer.class); + private final HealthChecker healthChecker = mock(HealthChecker.class); private final ContainerStats emptyContainerStats = new ContainerStats(Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); private final AthenzCredentialsMaintainer athenzCredentialsMaintainer = mock(AthenzCredentialsMaintainer.class); @@ -191,12 +192,13 @@ public class NodeAgentImplTest { verify(dockerOperations, never()).startServices(any()); verify(orchestrator, never()).suspend(any(String.class)); - final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer); + final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer, healthChecker); inOrder.verify(dockerOperations, times(1)).pullImageAsyncIfNeeded(eq(dockerImage)); inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any()); inOrder.verify(dockerOperations, times(1)).startContainer(eq(context)); inOrder.verify(aclMaintainer, times(1)).converge(); inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context)); + inOrder.verify(healthChecker, times(1)).verifyHealth(eq(context)); inOrder.verify(nodeRepository).updateNodeAttributes( hostName, new NodeAttributes().withDockerImage(dockerImage)); inOrder.verify(orchestrator).resume(hostName); @@ -707,7 +709,8 @@ public class NodeAgentImplTest { doNothing().when(storageMaintainer).writeMetricsConfig(any(), any()); return new NodeAgentImpl(context, nodeRepository, orchestrator, dockerOperations, - storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer)); + storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer), + Optional.of(healthChecker)); } private void mockGetContainer(DockerImage dockerImage, boolean isRunning) { |