diff options
author | valerijf <valerijf@yahoo-inc.com> | 2017-04-10 11:59:27 +0200 |
---|---|---|
committer | valerijf <valerijf@yahoo-inc.com> | 2017-04-10 11:59:27 +0200 |
commit | 6c2039bef54af450c696b2a5be1958231ceb4599 (patch) | |
tree | 62bf584ce8416b74078bf7b8da2629266b4c1c48 /node-admin | |
parent | 6b92af3f370a84a87472af361590512e9822f39a (diff) |
Ask orchestrator to suspend before freezing
Diffstat (limited to 'node-admin')
2 files changed, 21 insertions, 29 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java index 3b4f3bc471c..bdc5994c89e 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java @@ -131,25 +131,16 @@ public class NodeAdminStateUpdater extends AbstractComponent { } /** - * This method attempts to converge NodeAgent's and NodeAdmin's frozen state with their orchestrator - * state. When trying to suspend node-admin, this method will first attempt to freeze all NodeAgents and - * NodeAdmin, then asking orchestrator for permission to suspend all active nodes on this host, including - * node-admin itself, if the request is denied, this method will unfreeze NodeAgents and NodeAdmin. + * This method attempts to converge node-admin towards one of the {@link State} */ private void convergeState(State wantedState) { - boolean wantFrozen = wantedState != RESUMED; - if (!nodeAdmin.setFrozen(wantFrozen)) { - throw new RuntimeException("NodeAdmin has not yet converged to " + (wantFrozen ? "frozen" : "unfrozen")); - } - - // To get to resumed state, we only need to converge NodeAdmins frozen state if (wantedState == RESUMED) { - orchestrator.resume(dockerHostHostName); - - synchronized (monitor) { - currentState = RESUMED; + if (!nodeAdmin.setFrozen(false)) { + throw new RuntimeException("NodeAdmin has not yet converged to unfrozen"); } - return; + + orchestrator.resume(dockerHostHostName); + if (wantedState == updateAndGetCurrentState(RESUMED)) return; } // Fetch active nodes from node repo before suspending nodes. @@ -162,28 +153,29 @@ public class NodeAdminStateUpdater extends AbstractComponent { try { nodesInActiveState = getNodesInActiveState(); } catch (IOException e) { - throw new RuntimeException("Failed to get nodes from node repo:" + e.getMessage()); + throw new RuntimeException("Failed to get nodes from node repo: " + e.getMessage()); } if (currentState == RESUMED) { List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState); nodesToSuspend.add(dockerHostHostName); - try { - orchestrator.suspend(dockerHostHostName, nodesToSuspend); - } catch (Exception e) { - nodeAdmin.setFrozen(false); - throw e; - } + orchestrator.suspend(dockerHostHostName, nodesToSuspend); - synchronized (monitor) { - currentState = SUSPENDED_NODE_ADMIN; + if (!nodeAdmin.setFrozen(true)) { + throw new RuntimeException("NodeAdmin has not yet converged to frozen"); } - if (wantedState == currentState) return; + + if (wantedState == updateAndGetCurrentState(SUSPENDED_NODE_ADMIN)) return; } nodeAdmin.stopNodeAgentServices(nodesInActiveState); + updateAndGetCurrentState(SUSPENDED); + } + + private State updateAndGetCurrentState(State currentState) { synchronized (monitor) { - currentState = SUSPENDED; + this.currentState = currentState; + return currentState; } } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java index 25fa23b8951..2ce7e2a3699 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java @@ -97,18 +97,18 @@ public class NodeAdminStateUpdaterTest { tickAfter(35); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); verify(refresher, times(1)).signalWorkToBeDone(); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Roll back + verify(nodeAdmin, times(1)).setFrozen(eq(false)); tickAfter(35); assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); - verify(nodeAdmin, times(2)).setFrozen(eq(false)); + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); tickAfter(0); // Change in wanted state, no need to wait verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state - verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure we dont roll back + verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Make sure we dont roll back // Finally we are successful in transitioning to frozen tickAfter(35); |