diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2018-10-29 09:37:19 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-10-29 09:37:19 +0100 |
commit | 5662bb226bbd4e7cf0f61a9ae269f57d75ac3a02 (patch) | |
tree | e238428ccb2bd93ed1cc9816ab1cae1358335a66 /node-admin | |
parent | 1794069cc28286f7fa0602a6d8e38d346808836d (diff) | |
parent | 2d677c94c9a727c565e879262dc99706d78fa0b3 (diff) |
Merge pull request #7465 from vespa-engine/freva/move-nasu
NodeAdmin: Remove Thread from NASU
Diffstat (limited to 'node-admin')
12 files changed, 387 insertions, 639 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java deleted file mode 100644 index 9888cca9c7e..00000000000 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/component/AdminComponent.java +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.node.admin.component; - -import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; - -/** - * An AdminComponent cannot assume anything about the environment until enable() - * is called: Required YUM packages may not have been installed, services - * not started, etc. An enabled AdminComponent can be disabled to disengage from - * the environment. - */ -public interface AdminComponent { - /** - * Enable component. May be called more than once. - */ - void enable(); - - /** - * @return NodeAdminStateUpdater used by the REST API - */ - NodeAdminStateUpdater getNodeAdminStateUpdater(); - - /** - * Disable component. May be called more than once. - * Must be compatible with component deconstruct(). - */ - void disable(); -} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java index 16992bcb13a..37d79d97e74 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdmin.java @@ -5,7 +5,6 @@ import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; import java.time.Duration; import java.util.List; -import java.util.Map; /** * NodeAdmin manages the life cycle of NodeAgents. @@ -47,12 +46,6 @@ public interface NodeAdmin { void stopNodeAgentServices(List<String> nodes); /** - * Returns a map containing all relevant NodeAdmin variables and their current values. - * Do not try to parse output or use in tests. - */ - Map<String, Object> debugInfo(); - - /** * Start node-admin schedulers. */ void start(); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java index b1c1a99a90a..f8114a68d07 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminImpl.java @@ -15,7 +15,6 @@ import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -143,21 +142,6 @@ public class NodeAdminImpl implements NodeAdmin { }); } - public int getNumberOfNodeAgents() { - return nodeAgentsByHostname.keySet().size(); - } - - @Override - public Map<String, Object> debugInfo() { - Map<String, Object> debug = new LinkedHashMap<>(); - debug.put("isFrozen", isFrozen); - - List<Map<String, Object>> nodeAgentDebugs = nodeAgentsByHostname.values().stream() - .map(NodeAgent::debugInfo).collect(Collectors.toList()); - debug.put("NodeAgents", nodeAgentDebugs); - return debug; - } - @Override public void start() { metricsScheduler.scheduleAtFixedRate(() -> { diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java new file mode 100644 index 00000000000..a12104c6e98 --- /dev/null +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -0,0 +1,138 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +import com.yahoo.config.provision.HostName; +import com.yahoo.log.LogLevel; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; +import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; +import com.yahoo.vespa.hosted.provision.Node; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; +import java.util.stream.Collectors; + +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.TRANSITIONING; + +/** + * Pulls information from node repository and forwards containers to run to node admin. + * + * @author dybis, stiankri + */ +public class NodeAdminStateUpdater { + private static final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); + private static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5); + + private final NodeRepository nodeRepository; + private final Orchestrator orchestrator; + private final NodeAdmin nodeAdmin; + private final String hostHostname; + + public enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED } + + private State currentState = SUSPENDED_NODE_ADMIN; + + public NodeAdminStateUpdater( + NodeRepository nodeRepository, + Orchestrator orchestrator, + NodeAdmin nodeAdmin, + HostName hostHostname) { + this.nodeRepository = nodeRepository; + this.orchestrator = orchestrator; + this.nodeAdmin = nodeAdmin; + this.hostHostname = hostHostname.value(); + } + + public void start() { + nodeAdmin.start(); + } + + public void converge(State wantedState) { + try { + convergeState(wantedState); + } finally { + if (wantedState != RESUMED && currentState == TRANSITIONING) { + Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration(); + if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) { + // We have spent too much time trying to freeze and node admin is still not frozen. + // To avoid node agents stalling for too long, we'll force unfrozen ticks now. + log.info("Timed out trying to freeze, will force unfreezed ticks"); + fetchContainersToRunFromNodeRepository(); + nodeAdmin.setFrozen(false); + } + } else if (currentState == RESUMED) { + fetchContainersToRunFromNodeRepository(); + } + } + } + + /** + * This method attempts to converge node-admin w/agents to a {@link State} + * with respect to: freeze, Orchestrator, and services running. + */ + private void convergeState(State wantedState) { + if (currentState == wantedState) return; + currentState = TRANSITIONING; + + boolean wantFrozen = wantedState != RESUMED; + if (!nodeAdmin.setFrozen(wantFrozen)) { + throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); + } + + boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active; + switch (wantedState) { + case RESUMED: + if (hostIsActiveInNR) orchestrator.resume(hostHostname); + break; + case SUSPENDED_NODE_ADMIN: + if (hostIsActiveInNR) orchestrator.suspend(hostHostname); + break; + case SUSPENDED: + // Fetch active nodes from node repo before suspending nodes. + // It is only possible to suspend active nodes, + // the orchestrator will fail if trying to suspend nodes in other states. + // Even though state is frozen we need to interact with node repo, but + // the data from node repo should not be used for anything else. + // We should also suspend host's hostname to suspend node-admin + List<String> nodesInActiveState = getNodesInActiveState(); + + List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); + if (hostIsActiveInNR) nodesToSuspend.add(hostHostname); + if (!nodesToSuspend.isEmpty()) { + orchestrator.suspend(hostHostname, nodesToSuspend); + log.info("Orchestrator allows suspension of " + nodesToSuspend); + } + + // The node agent services are stopped by this thread, which is OK only + // because the node agents are frozen (see above). + nodeAdmin.stopNodeAgentServices(nodesInActiveState); + break; + default: + throw new IllegalStateException("Unknown wanted state " + wantedState); + } + + log.info("State changed from " + currentState + " to " + wantedState); + currentState = wantedState; + } + + private void fetchContainersToRunFromNodeRepository() { + try { + final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname); + nodeAdmin.refreshContainersToRun(containersToRun); + } catch (Exception e) { + log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); + } + } + + private List<String> getNodesInActiveState() { + return nodeRepository.getNodes(hostHostname) + .stream() + .filter(node -> node.getState() == Node.State.active) + .map(NodeSpec::getHostname) + .collect(Collectors.toList()); + } +} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java deleted file mode 100644 index 296745c8e37..00000000000 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.node.admin.nodeadmin; - -import com.yahoo.config.provision.HostName; -import com.yahoo.log.LogLevel; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; -import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; -import com.yahoo.vespa.hosted.node.admin.configserver.HttpException; -import com.yahoo.vespa.hosted.provision.Node; - -import java.time.Clock; -import java.time.Duration; -import java.time.Instant; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.logging.Logger; -import java.util.stream.Collectors; - -import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.RESUMED; -import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN; -import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.State.TRANSITIONING; - -/** - * Pulls information from node repository and forwards containers to run to node admin. - * - * @author dybis, stiankri - */ -public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { - static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5); - static final String TRANSITION_EXCEPTION_MESSAGE = "NodeAdminStateUpdater has not run since current wanted state was set"; - - private final AtomicBoolean terminated = new AtomicBoolean(false); - private State currentState = SUSPENDED_NODE_ADMIN; - private State wantedState = RESUMED; - private boolean workToDoNow = true; - - private final Object monitor = new Object(); - private RuntimeException lastConvergenceException; - - private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); - private final Thread loopThread; - - private final NodeRepository nodeRepository; - private final Orchestrator orchestrator; - private final NodeAdmin nodeAdmin; - private final Clock clock; - private final String hostHostname; - private final Duration nodeAdminConvergeStateInterval; - - private Instant lastTick; - - public NodeAdminStateUpdaterImpl( - NodeRepository nodeRepository, - Orchestrator orchestrator, - NodeAdmin nodeAdmin, - HostName hostHostname, - Clock clock, - Duration nodeAdminConvergeStateInterval) { - this.nodeRepository = nodeRepository; - this.orchestrator = orchestrator; - this.nodeAdmin = nodeAdmin; - this.hostHostname = hostHostname.value(); - this.clock = clock; - this.nodeAdminConvergeStateInterval = nodeAdminConvergeStateInterval; - this.lastTick = clock.instant(); - - this.loopThread = new Thread(() -> { - nodeAdmin.start(); - - while (! terminated.get()) { - tick(); - } - }); - this.loopThread.setName("tick-NodeAdminStateUpdater"); - } - - @Override - public Map<String, Object> getDebugPage() { - Map<String, Object> debug = new LinkedHashMap<>(); - synchronized (monitor) { - debug.put("hostHostname", hostHostname); - debug.put("wantedState", wantedState); - debug.put("currentState", currentState); - debug.put("NodeAdmin", nodeAdmin.debugInfo()); - } - return debug; - } - - @Override - public void setResumeStateAndCheckIfResumed(State wantedState) { - synchronized (monitor) { - if (this.wantedState != wantedState) { - log.info("Wanted state change: " + this.wantedState + " -> " + wantedState); - this.wantedState = wantedState; - setLastConvergenceException(null); - signalWorkToBeDone(); - } - - if (currentState != wantedState) { - throw Optional.ofNullable(lastConvergenceException) - .orElseGet(() -> new RuntimeException(TRANSITION_EXCEPTION_MESSAGE)); - } - } - } - - void signalWorkToBeDone() { - synchronized (monitor) { - if (! workToDoNow) { - workToDoNow = true; - monitor.notifyAll(); - } - } - } - - void tick() { - State wantedStateCopy; - synchronized (monitor) { - while (! workToDoNow) { - Duration timeSinceLastConverge = Duration.between(lastTick, clock.instant()); - long remainder = nodeAdminConvergeStateInterval.minus(timeSinceLastConverge).toMillis(); - if (remainder > 0) { - try { - monitor.wait(remainder); - } catch (InterruptedException e) { - log.info("Interrupted, but ignoring this: NodeAdminStateUpdater"); - } - } else break; - } - lastTick = clock.instant(); - workToDoNow = false; - - // wantedState may change asynchronously, so we grab a copy of it here - wantedStateCopy = this.wantedState; - } - - try { - convergeState(wantedStateCopy); - setLastConvergenceException(null); - } catch (OrchestratorException | ConvergenceException | HttpException e) { - setLastConvergenceException(e); - log.info("Unable to converge to " + wantedStateCopy + ": " + e.getMessage()); - } catch (RuntimeException e) { - setLastConvergenceException(e); - log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e); - } - - if (wantedStateCopy != RESUMED && currentState == TRANSITIONING) { - Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration(); - if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) { - // We have spent too much time trying to freeze and node admin is still not frozen. - // To avoid node agents stalling for too long, we'll force unfrozen ticks now. - log.info("Timed out trying to freeze, will force unfreezed ticks"); - fetchContainersToRunFromNodeRepository(); - nodeAdmin.setFrozen(false); - } - } else if (currentState == RESUMED) { - fetchContainersToRunFromNodeRepository(); - } - } - - private void setLastConvergenceException(RuntimeException exception) { - synchronized (monitor) { - lastConvergenceException = exception; - } - } - - /** - * This method attempts to converge node-admin w/agents to a {@link State} - * with respect to: freeze, Orchestrator, and services running. - */ - private void convergeState(State wantedState) { - if (currentState == wantedState) return; - synchronized (monitor) { - currentState = TRANSITIONING; - } - - boolean wantFrozen = wantedState != RESUMED; - if (!nodeAdmin.setFrozen(wantFrozen)) { - throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen")); - } - - boolean hostIsActiveInNR = nodeRepository.getNode(hostHostname).getState() == Node.State.active; - switch (wantedState) { - case RESUMED: - if (hostIsActiveInNR) orchestrator.resume(hostHostname); - break; - case SUSPENDED_NODE_ADMIN: - if (hostIsActiveInNR) orchestrator.suspend(hostHostname); - break; - case SUSPENDED: - // Fetch active nodes from node repo before suspending nodes. - // It is only possible to suspend active nodes, - // the orchestrator will fail if trying to suspend nodes in other states. - // Even though state is frozen we need to interact with node repo, but - // the data from node repo should not be used for anything else. - // We should also suspend host's hostname to suspend node-admin - List<String> nodesInActiveState = getNodesInActiveState(); - - List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); - if (hostIsActiveInNR) nodesToSuspend.add(hostHostname); - if (!nodesToSuspend.isEmpty()) { - orchestrator.suspend(hostHostname, nodesToSuspend); - log.info("Orchestrator allows suspension of " + nodesToSuspend); - } - - // The node agent services are stopped by this thread, which is OK only - // because the node agents are frozen (see above). - nodeAdmin.stopNodeAgentServices(nodesInActiveState); - break; - default: - throw new IllegalStateException("Unknown wanted state " + wantedState); - } - - log.info("State changed from " + currentState + " to " + wantedState); - synchronized (monitor) { - // Writes to currentState must be synchronized. Reads doesn't have to since this thread - // is the only one modifying it. - currentState = wantedState; - } - } - - private void fetchContainersToRunFromNodeRepository() { - try { - final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname); - nodeAdmin.refreshContainersToRun(containersToRun); - } catch (Exception e) { - log.log(LogLevel.WARNING, "Failed to update which containers should be running", e); - } - } - - private List<String> getNodesInActiveState() { - return nodeRepository.getNodes(hostHostname) - .stream() - .filter(node -> node.getState() == Node.State.active) - .map(NodeSpec::getHostname) - .collect(Collectors.toList()); - } - - public void start() { - loopThread.start(); - } - - public void stop() { - if (!terminated.compareAndSet(false, true)) { - throw new RuntimeException("Can not re-stop a node agent."); - } - - // First we need to stop NodeAdminStateUpdaterImpl thread to make sure no new NodeAgents are spawned - signalWorkToBeDone(); - - do { - try { - loopThread.join(); - } catch (InterruptedException e1) { - log.info("Interrupted while waiting for NodeAdminStateUpdater thread and specVerfierScheduler to shutdown"); - } - } while (loopThread.isAlive()); - - // Finally, stop NodeAdmin and all the NodeAgents - nodeAdmin.stop(); - } -} diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java index 7c789e10d19..947e7c85d66 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgent.java @@ -1,8 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.node.admin.nodeagent; -import java.util.Map; - /** * Responsible for management of a single node over its lifecycle. * May own its own resources, threads etc. Runs independently, but receives signals @@ -30,11 +28,6 @@ public interface NodeAgent { void suspend(); /** - * Returns a map containing all relevant NodeAgent variables and their current values. - */ - Map<String, Object> debugInfo(); - - /** * Starts the agent. After this method is called, the agent will asynchronously maintain the node, continuously * striving to make the current state equal to the wanted state. */ diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java index a220557ca9c..805069ac8c9 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeagent/NodeAgentImpl.java @@ -30,7 +30,6 @@ import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -57,7 +56,6 @@ public class NodeAgentImpl implements NodeAgent { private static final Logger logger = Logger.getLogger(NodeAgentImpl.class.getName()); - private final AtomicBoolean terminated = new AtomicBoolean(false); private boolean isFrozen = true; private boolean wantFrozen = false; @@ -156,18 +154,6 @@ public class NodeAgentImpl implements NodeAgent { } @Override - public Map<String, Object> debugInfo() { - Map<String, Object> debug = new LinkedHashMap<>(); - debug.put("hostname", context.hostname()); - debug.put("isFrozen", isFrozen); - debug.put("wantFrozen", wantFrozen); - debug.put("terminated", terminated); - debug.put("workToDoNow", workToDoNow); - debug.put("nodeRepoState", lastNode.getState().name()); - return debug; - } - - @Override public void start() { context.log(logger, "Starting with interval " + timeBetweenEachConverge.toMillis() + " ms"); diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java deleted file mode 100644 index 8a926b511e6..00000000000 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.node.admin.provider; - -import javax.annotation.concurrent.ThreadSafe; - -@ThreadSafe -public interface NodeAdminStateUpdater extends NodeAdminDebugHandler { - enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED} - - /** - * Set the wanted state, and assert whether the current state equals it. - * Typically, this method should be called repeatedly until current state - * has converged. - * - * @throws RuntimeException (or a subclass) if the state has not converged yet. - */ - void setResumeStateAndCheckIfResumed(State wantedState); -} diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java index 6ed51657d6e..57835a3a759 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/DockerTester.java @@ -14,7 +14,7 @@ import com.yahoo.vespa.hosted.node.admin.docker.DockerOperations; import com.yahoo.vespa.hosted.node.admin.docker.DockerOperationsImpl; import com.yahoo.vespa.hosted.node.admin.maintenance.StorageMaintainer; import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminImpl; -import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgent; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContextImpl; import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl; @@ -32,6 +32,7 @@ import java.time.Duration; import java.util.Collections; import java.util.Optional; import java.util.function.Function; +import java.util.logging.Logger; import static com.yahoo.vespa.hosted.node.admin.task.util.file.IOExceptionUtil.uncheck; import static org.mockito.ArgumentMatchers.any; @@ -45,21 +46,26 @@ import static org.mockito.Mockito.when; */ // Need to deconstruct nodeAdminStateUpdater public class DockerTester implements AutoCloseable { - private static final Duration NODE_AGENT_SCAN_INTERVAL = Duration.ofMillis(100); - private static final Duration NODE_ADMIN_CONVERGE_STATE_INTERVAL = Duration.ofMillis(10); + private static final Logger log = Logger.getLogger(DockerTester.class.getName()); + private static final Duration INTERVAL = Duration.ofMillis(10); private static final Path PATH_TO_VESPA_HOME = Paths.get("/opt/vespa"); static final String NODE_PROGRAM = PATH_TO_VESPA_HOME.resolve("bin/vespa-nodectl").toString(); static final HostName HOST_HOSTNAME = HostName.from("host.test.yahoo.com"); + private final Thread loopThread; + final Docker docker = spy(new DockerMock()); final NodeRepoMock nodeRepository = spy(new NodeRepoMock()); final Orchestrator orchestrator = mock(Orchestrator.class); final StorageMaintainer storageMaintainer = mock(StorageMaintainer.class); final InOrder inOrder = Mockito.inOrder(docker, nodeRepository, orchestrator, storageMaintainer); - final NodeAdminStateUpdaterImpl nodeAdminStateUpdater; + final NodeAdminStateUpdater nodeAdminStateUpdater; final NodeAdminImpl nodeAdmin; + private boolean terminated = false; + private volatile NodeAdminStateUpdater.State wantedState = NodeAdminStateUpdater.State.RESUMED; + DockerTester() { when(storageMaintainer.getDiskUsageFor(any())).thenReturn(Optional.empty()); @@ -89,11 +95,28 @@ public class DockerTester implements AutoCloseable { MetricReceiverWrapper mr = new MetricReceiverWrapper(MetricReceiver.nullImplementation); Function<String, NodeAgent> nodeAgentFactory = (hostName) -> new NodeAgentImpl( new NodeAgentContextImpl.Builder(hostName).fileSystem(fileSystem).build(), nodeRepository, - orchestrator, dockerOperations, storageMaintainer, clock, NODE_AGENT_SCAN_INTERVAL, Optional.empty(), Optional.empty()); + orchestrator, dockerOperations, storageMaintainer, clock, INTERVAL, Optional.empty(), Optional.empty()); nodeAdmin = new NodeAdminImpl(nodeAgentFactory, Optional.empty(), mr, Clock.systemUTC()); - nodeAdminStateUpdater = new NodeAdminStateUpdaterImpl(nodeRepository, orchestrator, - nodeAdmin, HOST_HOSTNAME, clock, NODE_ADMIN_CONVERGE_STATE_INTERVAL); - nodeAdminStateUpdater.start(); + nodeAdminStateUpdater = new NodeAdminStateUpdater(nodeRepository, orchestrator, + nodeAdmin, HOST_HOSTNAME); + + this.loopThread = new Thread(() -> { + nodeAdminStateUpdater.start(); + + while (! terminated) { + try { + nodeAdminStateUpdater.converge(wantedState); + } catch (RuntimeException e) { + log.info(e.getMessage()); + } + try { + Thread.sleep(INTERVAL.toMillis()); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + }); + loopThread.start(); } /** Adds a node to node-repository mock that is running on this host */ @@ -103,12 +126,27 @@ public class DockerTester implements AutoCloseable { .build()); } - @Override - public void close() { - nodeAdminStateUpdater.stop(); + void setWantedState(NodeAdminStateUpdater.State wantedState) { + this.wantedState = wantedState; } - public <T> T inOrder(T t) { + <T> T inOrder(T t) { return inOrder.verify(t, timeout(1000)); } + + @Override + public void close() { + terminated = true; + + do { + try { + loopThread.join(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } while (loopThread.isAlive()); + + // Finally, stop NodeAdmin and all the NodeAgents + nodeAdmin.stop(); + } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java index 75fefde427d..7e17e2df530 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java @@ -6,7 +6,7 @@ import com.yahoo.vespa.hosted.dockerapi.ContainerName; import com.yahoo.vespa.hosted.dockerapi.ContainerResources; import com.yahoo.vespa.hosted.dockerapi.DockerImage; import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; -import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; +import com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater; import com.yahoo.vespa.hosted.provision.Node; import org.junit.Test; @@ -16,10 +16,7 @@ import java.util.OptionalLong; import static com.yahoo.vespa.hosted.node.admin.integrationTests.DockerTester.HOST_HOSTNAME; import static com.yahoo.vespa.hosted.node.admin.integrationTests.DockerTester.NODE_PROGRAM; import static org.junit.Assert.assertTrue; -import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.verify; /** * Tests rebooting of Docker host @@ -40,7 +37,7 @@ public class RebootTest { eq(dockerImage), eq(ContainerResources.from(0, 0)), eq(new ContainerName("host1")), eq(hostname)); try { - tester.nodeAdminStateUpdater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); + tester.setWantedState(NodeAdminStateUpdater.State.SUSPENDED); } catch (RuntimeException ignored) { } tester.inOrder(tester.orchestrator).suspend( diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java deleted file mode 100644 index 6afeca6eeb5..00000000000 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.node.admin.nodeadmin; - -import com.yahoo.config.provision.HostName; -import com.yahoo.config.provision.NodeType; -import com.yahoo.test.ManualClock; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; -import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; -import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.OrchestratorException; -import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; -import com.yahoo.vespa.hosted.provision.Node; -import org.junit.Test; - -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.ArgumentMatchers.anyBoolean; -import static org.mockito.ArgumentMatchers.eq; -import static org.mockito.Mockito.doNothing; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.reset; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.verifyNoMoreInteractions; -import static org.mockito.Mockito.when; - -/** - * Basic test of NodeAdminStateUpdaterImpl - * - * @author freva - */ -public class NodeAdminStateUpdaterImplTest { - private final NodeRepository nodeRepository = mock(NodeRepository.class); - private final Orchestrator orchestrator = mock(Orchestrator.class); - private final NodeAdmin nodeAdmin = mock(NodeAdmin.class); - private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com"); - private final ManualClock clock = new ManualClock(); - private final Duration convergeStateInterval = Duration.ofSeconds(30); - - private final NodeAdminStateUpdaterImpl refresher = spy(new NodeAdminStateUpdaterImpl( - nodeRepository, orchestrator, nodeAdmin, hostHostname, clock, convergeStateInterval)); - - - @Test - public void testStateConvergence() { - mockNodeRepo(Node.State.active, 4); - List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() - .map(NodeSpec::getHostname) - .collect(Collectors.toList()); - List<String> suspendHostnames = new ArrayList<>(activeHostnames); - suspendHostnames.add(hostHostname.value()); - - // Initially everything is frozen to force convergence - assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); - when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); - doNothing().when(orchestrator).resume(hostHostname.value()); - tickAfter(0); // The first tick should unfreeze - verify(orchestrator, times(1)).resume(hostHostname.value()); // Resume host - verify(orchestrator, times(1)).resume(hostHostname.value()); - - // Everything is running and we want to continue running, therefore we have converged - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - tickAfter(35); - tickAfter(35); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state - verify(orchestrator, times(1)).resume(hostHostname.value()); // Already resumed - - // Lets try to suspend node admin only, immediately we get false back, and need to wait until next - // tick before any change can happen - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); - verify(refresher, times(1)).signalWorkToBeDone(); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); // Still no change - verify(refresher, times(1)).signalWorkToBeDone(); // We already notified of work, dont need to do it again - - when(nodeAdmin.setFrozen(eq(true))).thenReturn(false); - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - tickAfter(0); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen"); - verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state - - // First orchestration failure happens within the freeze convergence timeout, - // and so should not call setFrozen(false) - final String exceptionMessage = "Cannot allow to suspend because some reason"; - verify(nodeAdmin, times(1)).setFrozen(eq(false)); - when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - doThrow(new RuntimeException(exceptionMessage)) - .when(orchestrator).suspend(eq(hostHostname.value())); - tickAfter(35); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); - verify(refresher, times(1)).signalWorkToBeDone(); - verify(nodeAdmin, times(1)).setFrozen(eq(false)); - - // The second orchestration failure happens after the freeze convergence timeout, - // and so SHOULD call setFrozen(false) - when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1)); - doThrow(new RuntimeException(exceptionMessage)).doNothing() - .when(orchestrator).suspend(eq(hostHostname.value())); - tickAfter(35); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); - verify(refresher, times(1)).signalWorkToBeDone(); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // +1, since freeze convergence have timed out - - tickAfter(35); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); - - // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services - verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); - doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); // Change in wanted state, no need to wait - verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); - verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state - // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick - verify(nodeAdmin, times(2)).setFrozen(eq(false)); - - // Finally we are successful in transitioning to frozen - tickAfter(35); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); - - // We are in desired state, no changes will happen - reset(nodeAdmin); - tickAfter(35); - tickAfter(35); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); - verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state - verifyNoMoreInteractions(nodeAdmin); - - // Lets try going back to resumed - when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet - assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(35); - assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen"); - - doThrow(new OrchestratorException("Cannot allow to suspend " + hostHostname.value())).doNothing() - .when(orchestrator).resume(hostHostname.value()); - tickAfter(35); - assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com"); - tickAfter(35); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - } - - @Test - public void half_transition_revert() { - final String exceptionMsg = "Cannot allow to suspend because some reason"; - mockNodeRepo(Node.State.active, 3); - - // Initially everything is frozen to force convergence - when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); - doNothing().when(orchestrator).resume(hostHostname.value()); - tickAfter(0); // The first tick should unfreeze - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(nodeAdmin, times(1)).setFrozen(eq(false)); - - // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value())); - - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(30); - verify(nodeAdmin, times(1)).setFrozen(eq(true)); - tickAfter(30); - verify(nodeAdmin, times(2)).setFrozen(eq(true)); - verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks - verify(nodeAdmin, times(1)).refreshContainersToRun(any()); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg); - - // Only resume and fetch containers when subsystem freeze duration expires - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); - tickAfter(30); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); - verify(nodeAdmin, times(2)).refreshContainersToRun(any()); - - // We change our mind, want to remain resumed - assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(30); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze! - } - - @Test - public void do_not_orchestrate_host_when_not_active() { - when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); - when(nodeAdmin.setFrozen(anyBoolean())).thenReturn(true); - mockNodeRepo(Node.State.ready, 3); - - // Resume and suspend only require that node-agents are frozen and permission from - // orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze. - tickAfter(0); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); - verify(orchestrator, never()).resume(eq(hostHostname.value())); - - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN); - verify(orchestrator, never()).suspend(eq(hostHostname.value())); - - // When doing batch suspend, only suspend the containers if the host is not active - List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() - .map(NodeSpec::getHostname) - .collect(Collectors.toList()); - assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE); - tickAfter(0); - refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); - verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames)); - } - - private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) { - try { - refresher.setResumeStateAndCheckIfResumed(targetState); - fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error"); - } catch (RuntimeException e) { - assertEquals(reason, e.getMessage()); - } - } - - private void mockNodeRepo(Node.State hostState, int numberOfNodes) { - List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes) - .mapToObj(i -> new NodeSpec.Builder() - .hostname("host" + i + ".test.yahoo.com") - .state(Node.State.active) - .nodeType(NodeType.tenant) - .flavor("docker") - .minCpuCores(1) - .minMainMemoryAvailableGb(1) - .minDiskAvailableGb(1) - .build()) - .collect(Collectors.toList()); - - when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun); - - when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder() - .hostname(hostHostname.value()) - .state(hostState) - .nodeType(NodeType.tenant) - .flavor("default") - .minCpuCores(1) - .minMainMemoryAvailableGb(1) - .minDiskAvailableGb(1) - .build()); - } - - private void tickAfter(int seconds) { - clock.advance(Duration.ofSeconds(seconds)); - refresher.tick(); - } -} diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java new file mode 100644 index 00000000000..437195ca6d5 --- /dev/null +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java @@ -0,0 +1,197 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.node.admin.nodeadmin; + +import com.yahoo.config.provision.HostName; +import com.yahoo.config.provision.NodeType; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeSpec; +import com.yahoo.vespa.hosted.node.admin.configserver.noderepository.NodeRepository; +import com.yahoo.vespa.hosted.node.admin.configserver.orchestrator.Orchestrator; +import com.yahoo.vespa.hosted.provision.Node; +import org.junit.Test; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Basic test of NodeAdminStateUpdater + * + * @author freva + */ +public class NodeAdminStateUpdaterTest { + private final NodeRepository nodeRepository = mock(NodeRepository.class); + private final Orchestrator orchestrator = mock(Orchestrator.class); + private final NodeAdmin nodeAdmin = mock(NodeAdmin.class); + private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com"); + + private final NodeAdminStateUpdater refresher = spy(new NodeAdminStateUpdater( + nodeRepository, orchestrator, nodeAdmin, hostHostname)); + + + @Test + public void state_convergence() { + mockNodeRepo(Node.State.active, 4); + List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() + .map(NodeSpec::getHostname) + .collect(Collectors.toList()); + List<String> suspendHostnames = new ArrayList<>(activeHostnames); + suspendHostnames.add(hostHostname.value()); + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); + + { + // Initially everything is frozen to force convergence + assertResumeStateError(RESUMED, "NodeAdmin is not yet unfrozen"); + when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); + refresher.converge(RESUMED); + verify(orchestrator, times(1)).resume(hostHostname.value()); + + // We are already resumed, so this should return without resuming again + refresher.converge(RESUMED); + verify(orchestrator, times(1)).resume(hostHostname.value()); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + + // Lets try to suspend node admin only + when(nodeAdmin.setFrozen(eq(true))).thenReturn(false); + assertResumeStateError(SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen"); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + } + + { + // First orchestration failure happens within the freeze convergence timeout, + // and so should not call setFrozen(false) + final String exceptionMessage = "Cannot allow to suspend because some reason"; + when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); + doThrow(new RuntimeException(exceptionMessage)).doNothing() + .when(orchestrator).suspend(eq(hostHostname.value())); + assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMessage); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + + refresher.converge(SUSPENDED_NODE_ADMIN); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + } + + { + // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services + final String exceptionMessage = "Failed to stop services"; + verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); + doThrow(new RuntimeException(exceptionMessage)).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); + assertResumeStateError(SUSPENDED, exceptionMessage); + verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames)); + // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + + // Finally we are successful in transitioning to frozen + refresher.converge(SUSPENDED); + } + } + + @Test + public void half_transition_revert() { + final String exceptionMsg = "Cannot allow to suspend because some reason"; + mockNodeRepo(Node.State.active, 3); + + // Initially everything is frozen to force convergence + when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); + refresher.converge(RESUMED); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); + + // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); + when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); + doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value())); + + assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg); + verify(nodeAdmin, times(1)).setFrozen(eq(true)); + assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg); + verify(nodeAdmin, times(2)).setFrozen(eq(true)); + assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg); + verify(nodeAdmin, times(3)).setFrozen(eq(true)); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks + verify(nodeAdmin, times(1)).refreshContainersToRun(any()); + + // Only resume and fetch containers when subsystem freeze duration expires + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); + assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); + verify(nodeAdmin, times(2)).refreshContainersToRun(any()); + + // We change our mind, want to remain resumed + refresher.converge(RESUMED); + verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze! + } + + @Test + public void do_not_orchestrate_host_when_not_active() { + when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1)); + when(nodeAdmin.setFrozen(anyBoolean())).thenReturn(true); + mockNodeRepo(Node.State.ready, 3); + + // Resume and suspend only require that node-agents are frozen and permission from + // orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze. + refresher.converge(RESUMED); + verify(orchestrator, never()).resume(eq(hostHostname.value())); + + refresher.converge(SUSPENDED_NODE_ADMIN); + verify(orchestrator, never()).suspend(eq(hostHostname.value())); + + // When doing batch suspend, only suspend the containers if the host is not active + List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream() + .map(NodeSpec::getHostname) + .collect(Collectors.toList()); + refresher.converge(SUSPENDED); + verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames)); + } + + private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) { + try { + refresher.converge(targetState); + fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error"); + } catch (RuntimeException e) { + assertEquals(reason, e.getMessage()); + } + } + + private void mockNodeRepo(Node.State hostState, int numberOfNodes) { + List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes) + .mapToObj(i -> new NodeSpec.Builder() + .hostname("host" + i + ".test.yahoo.com") + .state(Node.State.active) + .nodeType(NodeType.tenant) + .flavor("docker") + .minCpuCores(1) + .minMainMemoryAvailableGb(1) + .minDiskAvailableGb(1) + .build()) + .collect(Collectors.toList()); + + when(nodeRepository.getNodes(eq(hostHostname.value()))).thenReturn(containersToRun); + + when(nodeRepository.getNode(eq(hostHostname.value()))).thenReturn(new NodeSpec.Builder() + .hostname(hostHostname.value()) + .state(hostState) + .nodeType(NodeType.tenant) + .flavor("default") + .minCpuCores(1) + .minMainMemoryAvailableGb(1) + .minDiskAvailableGb(1) + .build()); + } +} |