summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2018-10-29 09:37:19 +0100
committerGitHub <noreply@github.com>2018-10-29 09:37:19 +0100
commit5662bb226bbd4e7cf0f61a9ae269f57d75ac3a02 (patch)
treee238428ccb2bd93ed1cc9816ab1cae1358335a66 /node-admin
parent1794069cc28286f7fa0602a6d8e38d346808836d (diff)
parent2d677c94c9a727c565e879262dc99706d78fa0b3 (diff)
Merge pull request #7465 from vespa-engine/freva/move-nasu
NodeAdmin: Remove Thread from NASU
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java28
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java7
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java16
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java138
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java269
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java7
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java14
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java18
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java62
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java7
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java263
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java197
12 files changed, 387 insertions, 639 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java
deleted file mode 100644
index 9888cca9c7e..00000000000
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.component;
-
-import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
-
-/**
- * An AdminComponent cannot assume anything about the environment until enable()
- * is called: Required YUM packages may not have been installed, services
- * not started, etc. An enabled AdminComponent can be disabled to disengage from
- * the environment.
- */
-public interface AdminComponent {
- /**
- * Enable component. May be called more than once.
- */
- void enable();
-
- /**
- * @return NodeAdminStateUpdater used by the REST API
- */
- NodeAdminStateUpdater getNodeAdminStateUpdater();
-
- /**
- * Disable component. May be called more than once.
- * Must be compatible with component deconstruct().
- */
- void disable();
-}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
index 16992bcb13a..37d79d97e74 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java
@@ -5,7 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
import java.time.Duration;
import java.util.List;
-import java.util.Map;
/**
* NodeAdmin manages the life cycle of NodeAgents.
@@ -47,12 +46,6 @@ public interface NodeAdmin {
void stopNodeAgentServices(List<String> nodes);
/**
- * Returns a map containing all relevant NodeAdmin variables and their current values.
- * Do not try to parse output or use in tests.
- */
- Map<String, Object> debugInfo();
-
- /**
* Start node-admin schedulers.
*/
void start();
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
index b1c1a99a90a..f8114a68d07 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java
@@ -15,7 +15,6 @@ import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.HashSet;
-import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -143,21 +142,6 @@ public class NodeAdminImpl implements NodeAdmin {
});
}
- public int getNumberOfNodeAgents() {
- return nodeAgentsByHostname.keySet().size();
- }
-
- @Override
- public Map<String, Object> debugInfo() {
- Map<String, Object> debug = new LinkedHashMap<>();
- debug.put("isFrozen", isFrozen);
-
- List<Map<String, Object>> nodeAgentDebugs = nodeAgentsByHostname.values().stream()
- .map(NodeAgent::debugInfo).collect(Collectors.toList());
- debug.put("NodeAgents", nodeAgentDebugs);
- return debug;
- }
-
@Override
public void start() {
metricsScheduler.scheduleAtFixedRate(() -> {
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
new file mode 100644
index 00000000000..a12104c6e98
--- /dev/null
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -0,0 +1,138 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.node.admin.nodeadmin;
+
+import com.yahoo.config.provision.HostName;
+import com.yahoo.log.LogLevel;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
+import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
+import com.yahoo.vespa.hosted.provision.Node;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.TRANSITIONING;
+
+/**
+ * Pulls information from node repository and forwards containers to run to node admin.
+ *
+ * @author dybis, stiankri
+ */
+public class NodeAdminStateUpdater {
+ private static final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName());
+ private static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5);
+
+ private final NodeRepository nodeRepository;
+ private final Orchestrator orchestrator;
+ private final NodeAdmin nodeAdmin;
+ private final String hostHostname;
+
+ public enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED }
+
+ private State currentState = SUSPENDED_NODE_ADMIN;
+
+ public NodeAdminStateUpdater(
+ NodeRepository nodeRepository,
+ Orchestrator orchestrator,
+ NodeAdmin nodeAdmin,
+ HostName hostHostname) {
+ this.nodeRepository = nodeRepository;
+ this.orchestrator = orchestrator;
+ this.nodeAdmin = nodeAdmin;
+ this.hostHostname = hostHostname.value();
+ }
+
+ public void start() {
+ nodeAdmin.start();
+ }
+
+ public void converge(State wantedState) {
+ try {
+ convergeState(wantedState);
+ } finally {
+ if (wantedState != RESUMED && currentState == TRANSITIONING) {
+ Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
+ if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
+ // We have spent too much time trying to freeze and node admin is still not frozen.
+ // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
+ log.info("Timed out trying to freeze, will force unfreezed ticks");
+ fetchContainersToRunFromNodeRepository();
+ nodeAdmin.setFrozen(false);
+ }
+ } else if (currentState == RESUMED) {
+ fetchContainersToRunFromNodeRepository();
+ }
+ }
+ }
+
+ /**
+ * This method attempts to converge node-admin w/agents to a {@link State}
+ * with respect to: freeze, Orchestrator, and services running.
+ */
+ private void convergeState(State wantedState) {
+ if (currentState == wantedState) return;
+ currentState = TRANSITIONING;
+
+ boolean wantFrozen = wantedState != RESUMED;
+ if (!nodeAdmin.setFrozen(wantFrozen)) {
+ throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
+ }
+
+ boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active;
+ switch (wantedState) {
+ case RESUMED:
+ if (hostIsActiveInNR) orchestrator.resume(hostHostname);
+ break;
+ case SUSPENDED_NODE_ADMIN:
+ if (hostIsActiveInNR) orchestrator.suspend(hostHostname);
+ break;
+ case SUSPENDED:
+ // Fetch active nodes from node repo before suspending nodes.
+ // It is only possible to suspend active nodes,
+ // the orchestrator will fail if trying to suspend nodes in other states.
+ // Even though state is frozen we need to interact with node repo, but
+ // the data from node repo should not be used for anything else.
+ // We should also suspend host's hostname to suspend node-admin
+ List<String> nodesInActiveState = getNodesInActiveState();
+
+ List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
+ if (hostIsActiveInNR) nodesToSuspend.add(hostHostname);
+ if (!nodesToSuspend.isEmpty()) {
+ orchestrator.suspend(hostHostname, nodesToSuspend);
+ log.info("Orchestrator allows suspension of " + nodesToSuspend);
+ }
+
+ // The node agent services are stopped by this thread, which is OK only
+ // because the node agents are frozen (see above).
+ nodeAdmin.stopNodeAgentServices(nodesInActiveState);
+ break;
+ default:
+ throw new IllegalStateException("Unknown wanted state " + wantedState);
+ }
+
+ log.info("State changed from " + currentState + " to " + wantedState);
+ currentState = wantedState;
+ }
+
+ private void fetchContainersToRunFromNodeRepository() {
+ try {
+ final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname);
+ nodeAdmin.refreshContainersToRun(containersToRun);
+ } catch (Exception e) {
+ log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
+ }
+ }
+
+ private List<String> getNodesInActiveState() {
+ return nodeRepository.getNodes(hostHostname)
+ .stream()
+ .filter(node -> node.getState() == Node.State.active)
+ .map(NodeSpec::getHostname)
+ .collect(Collectors.toList());
+ }
+}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
deleted file mode 100644
index 296745c8e37..00000000000
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
+++ /dev/null
@@ -1,269 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.nodeadmin;
-
-import com.yahoo.config.provision.HostName;
-import com.yahoo.log.LogLevel;
-import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
-import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
-import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
-import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException;
-import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
-import com.yahoo.vespa.hosted.node.admin.configserver.HttpException;
-import com.yahoo.vespa.hosted.provision.Node;
-
-import java.time.Clock;
-import java.time.Duration;
-import java.time.Instant;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.logging.Logger;
-import java.util.stream.Collectors;
-
-import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.RESUMED;
-import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN;
-import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.TRANSITIONING;
-
-/**
- * Pulls information from node repository and forwards containers to run to node admin.
- *
- * @author dybis, stiankri
- */
-public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
- static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5);
- static final String TRANSITION_EXCEPTION_MESSAGE = "NodeAdminStateUpdater has not run since current wanted state was set";
-
- private final AtomicBoolean terminated = new AtomicBoolean(false);
- private State currentState = SUSPENDED_NODE_ADMIN;
- private State wantedState = RESUMED;
- private boolean workToDoNow = true;
-
- private final Object monitor = new Object();
- private RuntimeException lastConvergenceException;
-
- private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName());
- private final Thread loopThread;
-
- private final NodeRepository nodeRepository;
- private final Orchestrator orchestrator;
- private final NodeAdmin nodeAdmin;
- private final Clock clock;
- private final String hostHostname;
- private final Duration nodeAdminConvergeStateInterval;
-
- private Instant lastTick;
-
- public NodeAdminStateUpdaterImpl(
- NodeRepository nodeRepository,
- Orchestrator orchestrator,
- NodeAdmin nodeAdmin,
- HostName hostHostname,
- Clock clock,
- Duration nodeAdminConvergeStateInterval) {
- this.nodeRepository = nodeRepository;
- this.orchestrator = orchestrator;
- this.nodeAdmin = nodeAdmin;
- this.hostHostname = hostHostname.value();
- this.clock = clock;
- this.nodeAdminConvergeStateInterval = nodeAdminConvergeStateInterval;
- this.lastTick = clock.instant();
-
- this.loopThread = new Thread(() -> {
- nodeAdmin.start();
-
- while (! terminated.get()) {
- tick();
- }
- });
- this.loopThread.setName("tick-NodeAdminStateUpdater");
- }
-
- @Override
- public Map<String, Object> getDebugPage() {
- Map<String, Object> debug = new LinkedHashMap<>();
- synchronized (monitor) {
- debug.put("hostHostname", hostHostname);
- debug.put("wantedState", wantedState);
- debug.put("currentState", currentState);
- debug.put("NodeAdmin", nodeAdmin.debugInfo());
- }
- return debug;
- }
-
- @Override
- public void setResumeStateAndCheckIfResumed(State wantedState) {
- synchronized (monitor) {
- if (this.wantedState != wantedState) {
- log.info("Wanted state change: " + this.wantedState + " -> " + wantedState);
- this.wantedState = wantedState;
- setLastConvergenceException(null);
- signalWorkToBeDone();
- }
-
- if (currentState != wantedState) {
- throw Optional.ofNullable(lastConvergenceException)
- .orElseGet(() -> new RuntimeException(TRANSITION_EXCEPTION_MESSAGE));
- }
- }
- }
-
- void signalWorkToBeDone() {
- synchronized (monitor) {
- if (! workToDoNow) {
- workToDoNow = true;
- monitor.notifyAll();
- }
- }
- }
-
- void tick() {
- State wantedStateCopy;
- synchronized (monitor) {
- while (! workToDoNow) {
- Duration timeSinceLastConverge = Duration.between(lastTick, clock.instant());
- long remainder = nodeAdminConvergeStateInterval.minus(timeSinceLastConverge).toMillis();
- if (remainder > 0) {
- try {
- monitor.wait(remainder);
- } catch (InterruptedException e) {
- log.info("Interrupted, but ignoring this: NodeAdminStateUpdater");
- }
- } else break;
- }
- lastTick = clock.instant();
- workToDoNow = false;
-
- // wantedState may change asynchronously, so we grab a copy of it here
- wantedStateCopy = this.wantedState;
- }
-
- try {
- convergeState(wantedStateCopy);
- setLastConvergenceException(null);
- } catch (OrchestratorException | ConvergenceException | HttpException e) {
- setLastConvergenceException(e);
- log.info("Unable to converge to " + wantedStateCopy + ": " + e.getMessage());
- } catch (RuntimeException e) {
- setLastConvergenceException(e);
- log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e);
- }
-
- if (wantedStateCopy != RESUMED && currentState == TRANSITIONING) {
- Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
- if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
- // We have spent too much time trying to freeze and node admin is still not frozen.
- // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
- log.info("Timed out trying to freeze, will force unfreezed ticks");
- fetchContainersToRunFromNodeRepository();
- nodeAdmin.setFrozen(false);
- }
- } else if (currentState == RESUMED) {
- fetchContainersToRunFromNodeRepository();
- }
- }
-
- private void setLastConvergenceException(RuntimeException exception) {
- synchronized (monitor) {
- lastConvergenceException = exception;
- }
- }
-
- /**
- * This method attempts to converge node-admin w/agents to a {@link State}
- * with respect to: freeze, Orchestrator, and services running.
- */
- private void convergeState(State wantedState) {
- if (currentState == wantedState) return;
- synchronized (monitor) {
- currentState = TRANSITIONING;
- }
-
- boolean wantFrozen = wantedState != RESUMED;
- if (!nodeAdmin.setFrozen(wantFrozen)) {
- throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
- }
-
- boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active;
- switch (wantedState) {
- case RESUMED:
- if (hostIsActiveInNR) orchestrator.resume(hostHostname);
- break;
- case SUSPENDED_NODE_ADMIN:
- if (hostIsActiveInNR) orchestrator.suspend(hostHostname);
- break;
- case SUSPENDED:
- // Fetch active nodes from node repo before suspending nodes.
- // It is only possible to suspend active nodes,
- // the orchestrator will fail if trying to suspend nodes in other states.
- // Even though state is frozen we need to interact with node repo, but
- // the data from node repo should not be used for anything else.
- // We should also suspend host's hostname to suspend node-admin
- List<String> nodesInActiveState = getNodesInActiveState();
-
- List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
- if (hostIsActiveInNR) nodesToSuspend.add(hostHostname);
- if (!nodesToSuspend.isEmpty()) {
- orchestrator.suspend(hostHostname, nodesToSuspend);
- log.info("Orchestrator allows suspension of " + nodesToSuspend);
- }
-
- // The node agent services are stopped by this thread, which is OK only
- // because the node agents are frozen (see above).
- nodeAdmin.stopNodeAgentServices(nodesInActiveState);
- break;
- default:
- throw new IllegalStateException("Unknown wanted state " + wantedState);
- }
-
- log.info("State changed from " + currentState + " to " + wantedState);
- synchronized (monitor) {
- // Writes to currentState must be synchronized. Reads doesn't have to since this thread
- // is the only one modifying it.
- currentState = wantedState;
- }
- }
-
- private void fetchContainersToRunFromNodeRepository() {
- try {
- final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname);
- nodeAdmin.refreshContainersToRun(containersToRun);
- } catch (Exception e) {
- log.log(LogLevel.WARNING, "Failed to update which containers should be running", e);
- }
- }
-
- private List<String> getNodesInActiveState() {
- return nodeRepository.getNodes(hostHostname)
- .stream()
- .filter(node -> node.getState() == Node.State.active)
- .map(NodeSpec::getHostname)
- .collect(Collectors.toList());
- }
-
- public void start() {
- loopThread.start();
- }
-
- public void stop() {
- if (!terminated.compareAndSet(false, true)) {
- throw new RuntimeException("Can not re-stop a node agent.");
- }
-
- // First we need to stop NodeAdminStateUpdaterImpl thread to make sure no new NodeAgents are spawned
- signalWorkToBeDone();
-
- do {
- try {
- loopThread.join();
- } catch (InterruptedException e1) {
- log.info("Interrupted while waiting for NodeAdminStateUpdater thread and specVerfierScheduler to shutdown");
- }
- } while (loopThread.isAlive());
-
- // Finally, stop NodeAdmin and all the NodeAgents
- nodeAdmin.stop();
- }
-}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
index 7c789e10d19..947e7c85d66 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java
@@ -1,8 +1,6 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.nodeagent;
-import java.util.Map;
-
/**
* Responsible for management of a single node over its lifecycle.
* May own its own resources, threads etc. Runs independently, but receives signals
@@ -30,11 +28,6 @@ public interface NodeAgent {
void suspend();
/**
- * Returns a map containing all relevant NodeAgent variables and their current values.
- */
- Map<String, Object> debugInfo();
-
- /**
* Starts the agent. After this method is called, the agent will asynchronously maintain the node, continuously
* striving to make the current state equal to the wanted state.
*/
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
index a220557ca9c..805069ac8c9 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java
@@ -30,7 +30,6 @@ import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
-import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@@ -57,7 +56,6 @@ public class NodeAgentImpl implements NodeAgent {
private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName());
-
private final AtomicBoolean terminated = new AtomicBoolean(false);
private boolean isFrozen = true;
private boolean wantFrozen = false;
@@ -156,18 +154,6 @@ public class NodeAgentImpl implements NodeAgent {
}
@Override
- public Map<String, Object> debugInfo() {
- Map<String, Object> debug = new LinkedHashMap<>();
- debug.put("hostname", context.hostname());
- debug.put("isFrozen", isFrozen);
- debug.put("wantFrozen", wantFrozen);
- debug.put("terminated", terminated);
- debug.put("workToDoNow", workToDoNow);
- debug.put("nodeRepoState", lastNode.getState().name());
- return debug;
- }
-
- @Override
public void start() {
context.log(logger, "Starting with interval " + timeBetweenEachConverge.toMillis() + " ms");
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java
deleted file mode 100644
index 8a926b511e6..00000000000
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.provider;
-
-import javax.annotation.concurrent.ThreadSafe;
-
-@ThreadSafe
-public interface NodeAdminStateUpdater extends NodeAdminDebugHandler {
- enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED}
-
- /**
- * Set the wanted state, and assert whether the current state equals it.
- * Typically, this method should be called repeatedly until current state
- * has converged.
- *
- * @throws RuntimeException (or a subclass) if the state has not converged yet.
- */
- void setResumeStateAndCheckIfResumed(State wantedState);
-}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
index 6ed51657d6e..57835a3a759 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java
@@ -14,7 +14,7 @@ import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations;
import com.yahoo.vespa.hosted.node.admin.docker.DockerOperationsImpl;
import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer;
import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl;
-import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl;
@@ -32,6 +32,7 @@ import java.time.Duration;
import java.util.Collections;
import java.util.Optional;
import java.util.function.Function;
+import java.util.logging.Logger;
import static com.yahoo.vespa.hosted.node.admin.task.util.file.IOExceptionUtil.uncheck;
import static org.mockito.ArgumentMatchers.any;
@@ -45,21 +46,26 @@ import static org.mockito.Mockito.when;
*/
// Need to deconstruct nodeAdminStateUpdater
public class DockerTester implements AutoCloseable {
- private static final Duration NODE_AGENT_SCAN_INTERVAL = Duration.ofMillis(100);
- private static final Duration NODE_ADMIN_CONVERGE_STATE_INTERVAL = Duration.ofMillis(10);
+ private static final Logger log = Logger.getLogger(DockerTester.class.getName());
+ private static final Duration INTERVAL = Duration.ofMillis(10);
private static final Path PATH_TO_VESPA_HOME = Paths.get("/opt/vespa");
static final String NODE_PROGRAM = PATH_TO_VESPA_HOME.resolve("bin/vespa-nodectl").toString();
static final HostName HOST_HOSTNAME = HostName.from("host.test.yahoo.com");
+ private final Thread loopThread;
+
final Docker docker = spy(new DockerMock());
final NodeRepoMock nodeRepository = spy(new NodeRepoMock());
final Orchestrator orchestrator = mock(Orchestrator.class);
final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class);
final InOrder inOrder = Mockito.inOrder(docker, nodeRepository, orchestrator, storageMaintainer);
- final NodeAdminStateUpdaterImpl nodeAdminStateUpdater;
+ final NodeAdminStateUpdater nodeAdminStateUpdater;
final NodeAdminImpl nodeAdmin;
+ private boolean terminated = false;
+ private volatile NodeAdminStateUpdater.State wantedState = NodeAdminStateUpdater.State.RESUMED;
+
DockerTester() {
when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.empty());
@@ -89,11 +95,28 @@ public class DockerTester implements AutoCloseable {
MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation);
Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl(
new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository,
- orchestrator, dockerOperations, storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.empty(), Optional.empty());
+ orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty());
nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC());
- nodeAdminStateUpdater = new NodeAdminStateUpdaterImpl(nodeRepository, orchestrator,
- nodeAdmin, HOST_HOSTNAME, clock, NODE_ADMIN_CONVERGE_STATE_INTERVAL);
- nodeAdminStateUpdater.start();
+ nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator,
+ nodeAdmin, HOST_HOSTNAME);
+
+ this.loopThread = new Thread(() -> {
+ nodeAdminStateUpdater.start();
+
+ while (! terminated) {
+ try {
+ nodeAdminStateUpdater.converge(wantedState);
+ } catch (RuntimeException e) {
+ log.info(e.getMessage());
+ }
+ try {
+ Thread.sleep(INTERVAL.toMillis());
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ });
+ loopThread.start();
}
/** Adds a node to node-repository mock that is running on this host */
@@ -103,12 +126,27 @@ public class DockerTester implements AutoCloseable {
.build());
}
- @Override
- public void close() {
- nodeAdminStateUpdater.stop();
+ void setWantedState(NodeAdminStateUpdater.State wantedState) {
+ this.wantedState = wantedState;
}
- public <T> T inOrder(T t) {
+ <T> T inOrder(T t) {
return inOrder.verify(t, timeout(1000));
}
+
+ @Override
+ public void close() {
+ terminated = true;
+
+ do {
+ try {
+ loopThread.join();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ } while (loopThread.isAlive());
+
+ // Finally, stop NodeAdmin and all the NodeAgents
+ nodeAdmin.stop();
+ }
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
index 75fefde427d..7e17e2df530 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
@@ -6,7 +6,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.dockerapi.ContainerResources;
import com.yahoo.vespa.hosted.dockerapi.DockerImage;
import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
-import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
+import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater;
import com.yahoo.vespa.hosted.provision.Node;
import org.junit.Test;
@@ -16,10 +16,7 @@ import java.util.OptionalLong;
import static com.yahoo.vespa.hosted.node.admin.integrationTests.DockerTester.HOST_HOSTNAME;
import static com.yahoo.vespa.hosted.node.admin.integrationTests.DockerTester.NODE_PROGRAM;
import static org.junit.Assert.assertTrue;
-import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.verify;
/**
* Tests rebooting of Docker host
@@ -40,7 +37,7 @@ public class RebootTest {
eq(dockerImage), eq(ContainerResources.from(0, 0)), eq(new ContainerName("host1")), eq(hostname));
try {
- tester.nodeAdminStateUpdater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
+ tester.setWantedState(NodeAdminStateUpdater.State.SUSPENDED);
} catch (RuntimeException ignored) { }
tester.inOrder(tester.orchestrator).suspend(
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
deleted file mode 100644
index 6afeca6eeb5..00000000000
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
+++ /dev/null
@@ -1,263 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hosted.node.admin.nodeadmin;
-
-import com.yahoo.config.provision.HostName;
-import com.yahoo.config.provision.NodeType;
-import com.yahoo.test.ManualClock;
-import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
-import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
-import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
-import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException;
-import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
-import com.yahoo.vespa.hosted.provision.Node;
-import org.junit.Test;
-
-import java.time.Duration;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
-import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
-import static org.mockito.ArgumentMatchers.any;
-import static org.mockito.ArgumentMatchers.anyBoolean;
-import static org.mockito.ArgumentMatchers.eq;
-import static org.mockito.Mockito.doNothing;
-import static org.mockito.Mockito.doThrow;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.never;
-import static org.mockito.Mockito.reset;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-import static org.mockito.Mockito.when;
-
-/**
- * Basic test of NodeAdminStateUpdaterImpl
- *
- * @author freva
- */
-public class NodeAdminStateUpdaterImplTest {
- private final NodeRepository nodeRepository = mock(NodeRepository.class);
- private final Orchestrator orchestrator = mock(Orchestrator.class);
- private final NodeAdmin nodeAdmin = mock(NodeAdmin.class);
- private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com");
- private final ManualClock clock = new ManualClock();
- private final Duration convergeStateInterval = Duration.ofSeconds(30);
-
- private final NodeAdminStateUpdaterImpl refresher = spy(new NodeAdminStateUpdaterImpl(
- nodeRepository, orchestrator, nodeAdmin, hostHostname, clock, convergeStateInterval));
-
-
- @Test
- public void testStateConvergence() {
- mockNodeRepo(Node.State.active, 4);
- List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
- .map(NodeSpec::getHostname)
- .collect(Collectors.toList());
- List<String> suspendHostnames = new ArrayList<>(activeHostnames);
- suspendHostnames.add(hostHostname.value());
-
- // Initially everything is frozen to force convergence
- assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
- when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- doNothing().when(orchestrator).resume(hostHostname.value());
- tickAfter(0); // The first tick should unfreeze
- verify(orchestrator, times(1)).resume(hostHostname.value()); // Resume host
- verify(orchestrator, times(1)).resume(hostHostname.value());
-
- // Everything is running and we want to continue running, therefore we have converged
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- tickAfter(35);
- tickAfter(35);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state
- verify(orchestrator, times(1)).resume(hostHostname.value()); // Already resumed
-
- // Lets try to suspend node admin only, immediately we get false back, and need to wait until next
- // tick before any change can happen
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
- verify(refresher, times(1)).signalWorkToBeDone();
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); // Still no change
- verify(refresher, times(1)).signalWorkToBeDone(); // We already notified of work, dont need to do it again
-
- when(nodeAdmin.setFrozen(eq(true))).thenReturn(false);
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- tickAfter(0);
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen");
- verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state
-
- // First orchestration failure happens within the freeze convergence timeout,
- // and so should not call setFrozen(false)
- final String exceptionMessage = "Cannot allow to suspend because some reason";
- verify(nodeAdmin, times(1)).setFrozen(eq(false));
- when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- doThrow(new RuntimeException(exceptionMessage))
- .when(orchestrator).suspend(eq(hostHostname.value()));
- tickAfter(35);
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
- verify(refresher, times(1)).signalWorkToBeDone();
- verify(nodeAdmin, times(1)).setFrozen(eq(false));
-
- // The second orchestration failure happens after the freeze convergence timeout,
- // and so SHOULD call setFrozen(false)
- when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1));
- doThrow(new RuntimeException(exceptionMessage)).doNothing()
- .when(orchestrator).suspend(eq(hostHostname.value()));
- tickAfter(35);
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
- verify(refresher, times(1)).signalWorkToBeDone();
- verify(nodeAdmin, times(2)).setFrozen(eq(false)); // +1, since freeze convergence have timed out
-
- tickAfter(35);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN);
- verify(nodeAdmin, times(2)).setFrozen(eq(false));
-
- // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services
- verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
- doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0); // Change in wanted state, no need to wait
- verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
- verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
- // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
- verify(nodeAdmin, times(2)).setFrozen(eq(false));
-
- // Finally we are successful in transitioning to frozen
- tickAfter(35);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
-
- // We are in desired state, no changes will happen
- reset(nodeAdmin);
- tickAfter(35);
- tickAfter(35);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
- verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
- verifyNoMoreInteractions(nodeAdmin);
-
- // Lets try going back to resumed
- when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet
- assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(35);
- assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen");
-
- doThrow(new OrchestratorException("Cannot allow to suspend " + hostHostname.value())).doNothing()
- .when(orchestrator).resume(hostHostname.value());
- tickAfter(35);
- assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com");
- tickAfter(35);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- }
-
- @Test
- public void half_transition_revert() {
- final String exceptionMsg = "Cannot allow to suspend because some reason";
- mockNodeRepo(Node.State.active, 3);
-
- // Initially everything is frozen to force convergence
- when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- doNothing().when(orchestrator).resume(hostHostname.value());
- tickAfter(0); // The first tick should unfreeze
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(nodeAdmin, times(1)).setFrozen(eq(false));
-
- // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value()));
-
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(30);
- verify(nodeAdmin, times(1)).setFrozen(eq(true));
- tickAfter(30);
- verify(nodeAdmin, times(2)).setFrozen(eq(true));
- verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks
- verify(nodeAdmin, times(1)).refreshContainersToRun(any());
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg);
-
- // Only resume and fetch containers when subsystem freeze duration expires
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
- tickAfter(30);
- verify(nodeAdmin, times(2)).setFrozen(eq(false));
- verify(nodeAdmin, times(2)).refreshContainersToRun(any());
-
- // We change our mind, want to remain resumed
- assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(30);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze!
- }
-
- @Test
- public void do_not_orchestrate_host_when_not_active() {
- when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
- when(nodeAdmin.setFrozen(anyBoolean())).thenReturn(true);
- mockNodeRepo(Node.State.ready, 3);
-
- // Resume and suspend only require that node-agents are frozen and permission from
- // orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze.
- tickAfter(0);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
- verify(orchestrator, never()).resume(eq(hostHostname.value()));
-
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN);
- verify(orchestrator, never()).suspend(eq(hostHostname.value()));
-
- // When doing batch suspend, only suspend the containers if the host is not active
- List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
- .map(NodeSpec::getHostname)
- .collect(Collectors.toList());
- assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE);
- tickAfter(0);
- refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
- verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames));
- }
-
- private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) {
- try {
- refresher.setResumeStateAndCheckIfResumed(targetState);
- fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error");
- } catch (RuntimeException e) {
- assertEquals(reason, e.getMessage());
- }
- }
-
- private void mockNodeRepo(Node.State hostState, int numberOfNodes) {
- List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes)
- .mapToObj(i -> new NodeSpec.Builder()
- .hostname("host" + i + ".test.yahoo.com")
- .state(Node.State.active)
- .nodeType(NodeType.tenant)
- .flavor("docker")
- .minCpuCores(1)
- .minMainMemoryAvailableGb(1)
- .minDiskAvailableGb(1)
- .build())
- .collect(Collectors.toList());
-
- when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun);
-
- when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder()
- .hostname(hostHostname.value())
- .state(hostState)
- .nodeType(NodeType.tenant)
- .flavor("default")
- .minCpuCores(1)
- .minMainMemoryAvailableGb(1)
- .minDiskAvailableGb(1)
- .build());
- }
-
- private void tickAfter(int seconds) {
- clock.advance(Duration.ofSeconds(seconds));
- refresher.tick();
- }
-}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
new file mode 100644
index 00000000000..437195ca6d5
--- /dev/null
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -0,0 +1,197 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.node.admin.nodeadmin;
+
+import com.yahoo.config.provision.HostName;
+import com.yahoo.config.provision.NodeType;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec;
+import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository;
+import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator;
+import com.yahoo.vespa.hosted.provision.Node;
+import org.junit.Test;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyBoolean;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Basic test of NodeAdminStateUpdater
+ *
+ * @author freva
+ */
+public class NodeAdminStateUpdaterTest {
+ private final NodeRepository nodeRepository = mock(NodeRepository.class);
+ private final Orchestrator orchestrator = mock(Orchestrator.class);
+ private final NodeAdmin nodeAdmin = mock(NodeAdmin.class);
+ private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com");
+
+ private final NodeAdminStateUpdater refresher = spy(new NodeAdminStateUpdater(
+ nodeRepository, orchestrator, nodeAdmin, hostHostname));
+
+
+ @Test
+ public void state_convergence() {
+ mockNodeRepo(Node.State.active, 4);
+ List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
+ .map(NodeSpec::getHostname)
+ .collect(Collectors.toList());
+ List<String> suspendHostnames = new ArrayList<>(activeHostnames);
+ suspendHostnames.add(hostHostname.value());
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
+
+ {
+ // Initially everything is frozen to force convergence
+ assertResumeStateError(RESUMED, "NodeAdmin is not yet unfrozen");
+ when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
+ refresher.converge(RESUMED);
+ verify(orchestrator, times(1)).resume(hostHostname.value());
+
+ // We are already resumed, so this should return without resuming again
+ refresher.converge(RESUMED);
+ verify(orchestrator, times(1)).resume(hostHostname.value());
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+
+ // Lets try to suspend node admin only
+ when(nodeAdmin.setFrozen(eq(true))).thenReturn(false);
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen");
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ }
+
+ {
+ // First orchestration failure happens within the freeze convergence timeout,
+ // and so should not call setFrozen(false)
+ final String exceptionMessage = "Cannot allow to suspend because some reason";
+ when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
+ doThrow(new RuntimeException(exceptionMessage)).doNothing()
+ .when(orchestrator).suspend(eq(hostHostname.value()));
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMessage);
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+
+ refresher.converge(SUSPENDED_NODE_ADMIN);
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ }
+
+ {
+ // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services
+ final String exceptionMessage = "Failed to stop services";
+ verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
+ doThrow(new RuntimeException(exceptionMessage)).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
+ assertResumeStateError(SUSPENDED, exceptionMessage);
+ verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
+ // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+
+ // Finally we are successful in transitioning to frozen
+ refresher.converge(SUSPENDED);
+ }
+ }
+
+ @Test
+ public void half_transition_revert() {
+ final String exceptionMsg = "Cannot allow to suspend because some reason";
+ mockNodeRepo(Node.State.active, 3);
+
+ // Initially everything is frozen to force convergence
+ when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
+ refresher.converge(RESUMED);
+ verify(nodeAdmin, times(1)).setFrozen(eq(false));
+
+ // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
+ when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
+ doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value()));
+
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(nodeAdmin, times(1)).setFrozen(eq(true));
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(nodeAdmin, times(2)).setFrozen(eq(true));
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(nodeAdmin, times(3)).setFrozen(eq(true));
+ verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks
+ verify(nodeAdmin, times(1)).refreshContainersToRun(any());
+
+ // Only resume and fetch containers when subsystem freeze duration expires
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
+ assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ verify(nodeAdmin, times(2)).refreshContainersToRun(any());
+
+ // We change our mind, want to remain resumed
+ refresher.converge(RESUMED);
+ verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze!
+ }
+
+ @Test
+ public void do_not_orchestrate_host_when_not_active() {
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
+ when(nodeAdmin.setFrozen(anyBoolean())).thenReturn(true);
+ mockNodeRepo(Node.State.ready, 3);
+
+ // Resume and suspend only require that node-agents are frozen and permission from
+ // orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze.
+ refresher.converge(RESUMED);
+ verify(orchestrator, never()).resume(eq(hostHostname.value()));
+
+ refresher.converge(SUSPENDED_NODE_ADMIN);
+ verify(orchestrator, never()).suspend(eq(hostHostname.value()));
+
+ // When doing batch suspend, only suspend the containers if the host is not active
+ List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
+ .map(NodeSpec::getHostname)
+ .collect(Collectors.toList());
+ refresher.converge(SUSPENDED);
+ verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames));
+ }
+
+ private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) {
+ try {
+ refresher.converge(targetState);
+ fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error");
+ } catch (RuntimeException e) {
+ assertEquals(reason, e.getMessage());
+ }
+ }
+
+ private void mockNodeRepo(Node.State hostState, int numberOfNodes) {
+ List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes)
+ .mapToObj(i -> new NodeSpec.Builder()
+ .hostname("host" + i + ".test.yahoo.com")
+ .state(Node.State.active)
+ .nodeType(NodeType.tenant)
+ .flavor("docker")
+ .minCpuCores(1)
+ .minMainMemoryAvailableGb(1)
+ .minDiskAvailableGb(1)
+ .build())
+ .collect(Collectors.toList());
+
+ when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun);
+
+ when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder()
+ .hostname(hostHostname.value())
+ .state(hostState)
+ .nodeType(NodeType.tenant)
+ .flavor("default")
+ .minCpuCores(1)
+ .minMainMemoryAvailableGb(1)
+ .minDiskAvailableGb(1)
+ .build());
+ }
+}