aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2018-11-01 09:33:23 +0100
committerHåkon Hallingstad <hakon@oath.com>2018-11-01 09:33:23 +0100
commit1604202eb303fa8577e915d76b19c93435bbeafd (patch)
treef41fdfd6be48cc8ae9beb40935df0eb6a122a3a2
parent10b69f509f0e91c8dddcf8d695a5dcafa36f41ba (diff)
Define HealthChecker interface
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java14
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java63
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java2
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java7
4 files changed, 37 insertions, 49 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java
new file mode 100644
index 00000000000..e0b761ec816
--- /dev/null
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/HealthChecker.java
@@ -0,0 +1,14 @@
+package com.yahoo.vespa.hosted.node.admin.nodeagent;
+
+/**
+ * Interface for verifying the health of the node.
+ *
+ * @author hakonhall
+ */
+public interface HealthChecker extends AutoCloseable {
+ /** Verify the health of an active node, just before updating the node repo and calling Orchestrator resume. */
+ void verifyHealth(NodeAgentContext context);
+
+ @Override
+ void close();
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index 4e8cd00cf70..25b1a48b491 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -79,8 +79,10 @@ public class NodeAgentImpl implements NodeAgent {
private int numberOfUnhandledException = 0;
private Instant lastConverge;
+ private Node.State lastState = null;
private final Thread loopThread;
+ private final Optional<HealthChecker> healthChecker;
private final ScheduledExecutorService filebeatRestarter =
Executors.newScheduledThreadPool(1, ThreadFactoryFactory.getDaemonThreadFactory("filebeatrestarter"));
@@ -119,7 +121,8 @@ public class NodeAgentImpl implements NodeAgent {
final Clock clock,
final Duration timeBetweenEachConverge,
final Optional<AthenzCredentialsMaintainer> athenzCredentialsMaintainer,
- final Optional<AclMaintainer> aclMaintainer) {
+ final Optional<AclMaintainer> aclMaintainer,
+ final Optional<HealthChecker> healthChecker) {
this.context = context;
this.nodeRepository = nodeRepository;
this.orchestrator = orchestrator;
@@ -130,6 +133,7 @@ public class NodeAgentImpl implements NodeAgent {
this.lastConverge = clock.instant();
this.athenzCredentialsMaintainer = athenzCredentialsMaintainer;
this.aclMaintainer = aclMaintainer;
+ this.healthChecker = healthChecker;
this.loopThread = new Thread(() -> {
try {
@@ -193,15 +197,11 @@ public class NodeAgentImpl implements NodeAgent {
}
} while (loopThread.isAlive() || !filebeatRestarter.isTerminated());
+ healthChecker.ifPresent(HealthChecker::close);
+
context.log(logger, "Stopped");
}
- /**
- * Verifies that service is healthy, otherwise throws an exception. The default implementation does
- * nothing, override if it's necessary to verify that a service is healthy before resuming.
- */
- protected void verifyHealth(NodeSpec node) { }
-
void startServicesIfNeeded() {
if (!hasStartedServices) {
context.log(logger, "Starting services");
@@ -430,7 +430,6 @@ public class NodeAgentImpl implements NodeAgent {
isFrozenCopy = isFrozen;
}
- doAtTickStart(isFrozen);
boolean converged = false;
if (isFrozenCopy) {
@@ -452,14 +451,22 @@ public class NodeAgentImpl implements NodeAgent {
context.log(logger, LogLevel.ERROR, "Unhandled exception, ignoring.", e);
}
}
+ }
- doAtTickEnd(converged);
+ private String stateDescription(Node.State state) {
+ return state == null ? "[absent]" : state.toString();
}
// Public for testing
void converge() {
final Optional<NodeSpec> optionalNode = nodeRepository.getOptionalNode(context.hostname().value());
+ Node.State newState = optionalNode.map(NodeSpec::getState).orElse(null);
+ if (newState != lastState) {
+ context.log(logger, LogLevel.INFO, "State changed: " + stateDescription(lastState) + " -> " + stateDescription(newState));
+ lastState = newState;
+ }
+
// We just removed the node from node repo, so this is expected until NodeAdmin stop this NodeAgent
if (!optionalNode.isPresent() && expectNodeNotInNodeRepo) return;
@@ -508,13 +515,10 @@ public class NodeAgentImpl implements NodeAgent {
aclMaintainer.ifPresent(AclMaintainer::converge);
}
- verifyHealth(node);
startServicesIfNeeded();
resumeNodeIfNeeded(node);
-
athenzCredentialsMaintainer.ifPresent(maintainer -> maintainer.converge(context));
-
- doBeforeConverge(node);
+ healthChecker.ifPresent(checker -> checker.verifyHealth(context));
// Because it's more important to stop a bad release from rolling out in prod,
// we put the resume call last. So if we fail after updating the node repo attributes
@@ -551,39 +555,6 @@ public class NodeAgentImpl implements NodeAgent {
}
}
- /**
- * Execute at start of tick
- *
- * WARNING: MUST NOT throw an exception
- *
- * @param frozen whether the agent is frozen
- */
- protected void doAtTickStart(boolean frozen) {}
-
- /**
- * Execute at end of tick
- *
- * WARNING: MUST NOT throw an exception
- *
- * @param converged Whether the tick converged: converge() was called without exception
- */
- protected void doAtTickEnd(boolean converged) {}
-
- /**
- * Execute at end of a (so far) successful converge of an active node
- *
- * Method a subclass can override to execute code:
- * - Called right before the node repo is updated with converged attributes, and
- * Orchestrator resume() is called
- * - The only way to avoid a successful converge and the update to the node repo
- * and Orchestrator is to throw an exception
- * - The method is only called in a tick if the node is active, not frozen, and
- * there are no prior phases of the converge that fails
- *
- * @throws RuntimeException to fail the convergence
- */
- protected void doBeforeConverge(NodeSpec node) {}
-
private void stopFilebeatSchedulerIfNeeded() {
if (currentFilebeatRestarter.isPresent()) {
currentFilebeatRestarter.get().cancel(true);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
index 57835a3a759..e22606104f1 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
@@ -95,7 +95,7 @@ public class DockerTester implements AutoCloseable {
MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation);
Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(
new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository,
- orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty());
+ orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty(), Optional.empty());
nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC());
nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator,
nodeAdmin, HOST_HOSTNAME);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
index 0e72ef15f6a..5dbc3ecd1e9 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImplTest.java
@@ -70,6 +70,7 @@ public class NodeAgentImplTest {
private final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class);
private final MetricReceiverWrapper metricReceiver = new MetricReceiverWrapper(MetricReceiver.nullImplementation);
private final AclMaintainer aclMaintainer = mock(AclMaintainer.class);
+ private final HealthChecker healthChecker = mock(HealthChecker.class);
private final ContainerStats emptyContainerStats = new ContainerStats(Collections.emptyMap(),
Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
private final AthenzCredentialsMaintainer athenzCredentialsMaintainer = mock(AthenzCredentialsMaintainer.class);
@@ -191,12 +192,13 @@ public class NodeAgentImplTest {
verify(dockerOperations, never()).startServices(any());
verify(orchestrator, never()).suspend(any(String.class));
- final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer);
+ final InOrder inOrder = inOrder(dockerOperations, orchestrator, nodeRepository, aclMaintainer, healthChecker);
inOrder.verify(dockerOperations, times(1)).pullImageAsyncIfNeeded(eq(dockerImage));
inOrder.verify(dockerOperations, times(1)).createContainer(eq(context), eq(node), any());
inOrder.verify(dockerOperations, times(1)).startContainer(eq(context));
inOrder.verify(aclMaintainer, times(1)).converge();
inOrder.verify(dockerOperations, times(1)).resumeNode(eq(context));
+ inOrder.verify(healthChecker, times(1)).verifyHealth(eq(context));
inOrder.verify(nodeRepository).updateNodeAttributes(
hostName, new NodeAttributes().withDockerImage(dockerImage));
inOrder.verify(orchestrator).resume(hostName);
@@ -707,7 +709,8 @@ public class NodeAgentImplTest {
doNothing().when(storageMaintainer).writeMetricsConfig(any(), any());
return new NodeAgentImpl(context, nodeRepository, orchestrator, dockerOperations,
- storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer));
+ storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.of(athenzCredentialsMaintainer), Optional.of(aclMaintainer),
+ Optional.of(healthChecker));
}
private void mockGetContainer(DockerImage dockerImage, boolean isRunning) {