diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2017-05-22 13:44:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-05-22 13:44:02 +0200 |
commit | 967a893b873f46ab863186fd0d147c7226c29fb8 (patch) | |
tree | 89d6bc99d51cb120b43627c80edc804f1e219ba4 /node-admin | |
parent | 885e83247645e05abfa38e18ca76601541cc50d9 (diff) | |
parent | 0238778be02f2b781d4f451c24f1d3a723c13a09 (diff) |
Merge pull request #2489 from yahoo/freva/freeze-node-agents-before-orchestrator-suspend
Freeze node agents before Orchestrator suspend
Diffstat (limited to 'node-admin')
3 files changed, 20 insertions, 11 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index 0d31d7bf72d..d391f756978 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -131,11 +131,12 @@ public class NodeAdminStateUpdater extends AbstractComponent { * This method attempts to converge node-admin towards one of the {@link State} */ private void convergeState(State wantedState) { - if (wantedState == RESUMED) { - if (!nodeAdmin.setFrozen(false)) { - throw new RuntimeException("NodeAdmin has not yet converged to unfrozen"); - } + boolean wantFrozen = wantedState != RESUMED; + if (!nodeAdmin.setFrozen(wantFrozen)) { + throw new RuntimeException("NodeAdmin has not yet converged to " + (wantFrozen ? "frozen" : "unfrozen")); + } + if (wantedState == RESUMED) { orchestrator.resume(dockerHostHostName); if (wantedState == updateAndGetCurrentState(RESUMED)) return; } @@ -156,10 +157,14 @@ public class NodeAdminStateUpdater extends AbstractComponent { if (currentState == RESUMED) { List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); nodesToSuspend.add(dockerHostHostName); - orchestrator.suspend(dockerHostHostName, nodesToSuspend); - if (!nodeAdmin.setFrozen(true)) { - throw new RuntimeException("NodeAdmin has not yet converged to frozen"); + try { + orchestrator.suspend(dockerHostHostName, nodesToSuspend); + } catch (RuntimeException e) { + // Because upgrades may take a while, we should unfreeze everything in the mean time + // and try again next tick, since wantedState is still to suspend + nodeAdmin.setFrozen(false); + throw e; } if (wantedState == updateAndGetCurrentState(SUSPENDED_NODE_ADMIN)) return; @@ -178,6 +183,8 @@ public class NodeAdminStateUpdater extends AbstractComponent { private void fetchContainersToRunFromNodeRepository() { synchronized (monitor) { + // Refresh containers to run even if we would like to suspend but have failed to do so yet, + // because it may take a long time to get permission to suspend. if (currentState != RESUMED) { logger.info("Frozen, skipping fetching info from node repository"); return; diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java index 64dded68612..d7271f71003 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java @@ -156,9 +156,10 @@ public class RunInContainerTest { doThrow(new OrchestratorException("Cannot suspend because...")).when(orchestrator) .suspend("localhost.test.yahoo.com", Arrays.asList("host1.test.yahoo.com", parentHostname)); - // Orchestrator doesn't allow to suspend either the container or the node-admin + // Initially we are denied to suspend because we have to freeze all the node-agents assertFalse(doPutCall("suspend/node-admin")); Thread.sleep(50); + // At this point they should be frozen, but Orchestrator doesn't allow to suspend either the container or the node-admin assertFalse(doPutCall("suspend/node-admin")); doNothing().when(orchestrator) diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java index 2ce7e2a3699..f9b98cb672a 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java @@ -97,18 +97,19 @@ public class NodeAdminStateUpdaterTest { tickAfter(35); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); verify(refresher, times(1)).signalWorkToBeDone(); - verify(nodeAdmin, times(1)).setFrozen(eq(false)); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Orchestrator denied request to suspend, resume tickAfter(35); assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); - verify(nodeAdmin, times(1)).setFrozen(eq(false)); + verify(nodeAdmin, times(2)).setFrozen(eq(false)); // At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); tickAfter(0); // Change in wanted state, no need to wait verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state - verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Make sure we dont roll back + // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick + verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Finally we are successful in transitioning to frozen tickAfter(35); |