summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2017-05-22 13:44:02 +0200
committerGitHub <noreply@github.com>2017-05-22 13:44:02 +0200
commit967a893b873f46ab863186fd0d147c7226c29fb8 (patch)
tree89d6bc99d51cb120b43627c80edc804f1e219ba4 /node-admin
parent885e83247645e05abfa38e18ca76601541cc50d9 (diff)
parent0238778be02f2b781d4f451c24f1d3a723c13a09 (diff)
Merge pull request #2489 from yahoo/freva/freeze-node-agents-before-orchestrator-suspend
Freeze node agents before Orchestrator suspend
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java21
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java3
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java7
3 files changed, 20 insertions, 11 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index 0d31d7bf72d..d391f756978 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -131,11 +131,12 @@ public class NodeAdminStateUpdater extends AbstractComponent {
* This method attempts to converge node-admin towards one of the {@link State}
*/
private void convergeState(State wantedState) {
- if (wantedState == RESUMED) {
- if (!nodeAdmin.setFrozen(false)) {
- throw new RuntimeException("NodeAdmin has not yet converged to unfrozen");
- }
+ boolean wantFrozen = wantedState != RESUMED;
+ if (!nodeAdmin.setFrozen(wantFrozen)) {
+ throw new RuntimeException("NodeAdmin has not yet converged to " + (wantFrozen ? "frozen" : "unfrozen"));
+ }
+ if (wantedState == RESUMED) {
orchestrator.resume(dockerHostHostName);
if (wantedState == updateAndGetCurrentState(RESUMED)) return;
}
@@ -156,10 +157,14 @@ public class NodeAdminStateUpdater extends AbstractComponent {
if (currentState == RESUMED) {
List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
nodesToSuspend.add(dockerHostHostName);
- orchestrator.suspend(dockerHostHostName, nodesToSuspend);
- if (!nodeAdmin.setFrozen(true)) {
- throw new RuntimeException("NodeAdmin has not yet converged to frozen");
+ try {
+ orchestrator.suspend(dockerHostHostName, nodesToSuspend);
+ } catch (RuntimeException e) {
+ // Because upgrades may take a while, we should unfreeze everything in the mean time
+ // and try again next tick, since wantedState is still to suspend
+ nodeAdmin.setFrozen(false);
+ throw e;
}
if (wantedState == updateAndGetCurrentState(SUSPENDED_NODE_ADMIN)) return;
@@ -178,6 +183,8 @@ public class NodeAdminStateUpdater extends AbstractComponent {
private void fetchContainersToRunFromNodeRepository() {
synchronized (monitor) {
+ // Refresh containers to run even if we would like to suspend but have failed to do so yet,
+ // because it may take a long time to get permission to suspend.
if (currentState != RESUMED) {
logger.info("Frozen, skipping fetching info from node repository");
return;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
index 64dded68612..d7271f71003 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RunInContainerTest.java
@@ -156,9 +156,10 @@ public class RunInContainerTest {
doThrow(new OrchestratorException("Cannot suspend because...")).when(orchestrator)
.suspend("localhost.test.yahoo.com", Arrays.asList("host1.test.yahoo.com", parentHostname));
- // Orchestrator doesn't allow to suspend either the container or the node-admin
+ // Initially we are denied to suspend because we have to freeze all the node-agents
assertFalse(doPutCall("suspend/node-admin"));
Thread.sleep(50);
+ // At this point they should be frozen, but Orchestrator doesn't allow to suspend either the container or the node-admin
assertFalse(doPutCall("suspend/node-admin"));
doNothing().when(orchestrator)
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
index 2ce7e2a3699..f9b98cb672a 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -97,18 +97,19 @@ public class NodeAdminStateUpdaterTest {
tickAfter(35);
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(refresher, times(1)).signalWorkToBeDone();
- verify(nodeAdmin, times(1)).setFrozen(eq(false));
+ verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Orchestrator denied request to suspend, resume
tickAfter(35);
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
- verify(nodeAdmin, times(1)).setFrozen(eq(false));
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
// At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services
doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
tickAfter(0); // Change in wanted state, no need to wait
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
- verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Make sure we dont roll back
+ // Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
+ verify(nodeAdmin, times(2)).setFrozen(eq(false));
// Finally we are successful in transitioning to frozen
tickAfter(35);