summaryrefslogtreecommitdiffstats
path: root/node-admin/src
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2019-01-10 15:43:37 +0100
committerHåkon Hallingstad <hakon@oath.com>2019-01-10 15:43:37 +0100
commitf92963f0122018659efaab30411c087a6fa2b7e3 (patch)
treeecc37b397efecabd5047a48e449b9eb9af3e0eb2 /node-admin/src
parente87ecf42627da4fc1b38c6d8eeceadd1784558bc (diff)
Adjust node agents to run BEFORE trying to resume
Diffstat (limited to 'node-admin/src')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java33
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java51
2 files changed, 41 insertions, 43 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index a12104c6e98..13d3f3307d2 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -51,30 +51,21 @@ public class NodeAdminStateUpdater {
nodeAdmin.start();
}
- public void converge(State wantedState) {
- try {
- convergeState(wantedState);
- } finally {
- if (wantedState != RESUMED && currentState == TRANSITIONING) {
- Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
- if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
- // We have spent too much time trying to freeze and node admin is still not frozen.
- // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
- log.info("Timed out trying to freeze, will force unfreezed ticks");
- fetchContainersToRunFromNodeRepository();
- nodeAdmin.setFrozen(false);
- }
- } else if (currentState == RESUMED) {
- fetchContainersToRunFromNodeRepository();
- }
- }
- }
-
/**
* This method attempts to converge node-admin w/agents to a {@link State}
* with respect to: freeze, Orchestrator, and services running.
*/
- private void convergeState(State wantedState) {
+ public void converge(State wantedState) {
+ if (wantedState == RESUMED) {
+ adjustNodeAgentsToRunFromNodeRepository();
+ } else if (currentState == TRANSITIONING && nodeAdmin.subsystemFreezeDuration().compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
+ // We have spent too much time trying to freeze and node admin is still not frozen.
+ // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
+ adjustNodeAgentsToRunFromNodeRepository();
+ nodeAdmin.setFrozen(false);
+ throw new ConvergenceException("Timed out trying to freeze all nodes: will force an unfrozen tick");
+ }
+
if (currentState == wantedState) return;
currentState = TRANSITIONING;
@@ -119,7 +110,7 @@ public class NodeAdminStateUpdater {
currentState = wantedState;
}
- private void fetchContainersToRunFromNodeRepository() {
+ private void adjustNodeAgentsToRunFromNodeRepository() {
try {
final List<NodeSpec> containersToRun = nodeRepository.getNodes(hostHostname);
nodeAdmin.refreshContainersToRun(containersToRun);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
index 437195ca6d5..74ba5561c8e 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -42,7 +42,7 @@ public class NodeAdminStateUpdaterTest {
private final NodeAdmin nodeAdmin = mock(NodeAdmin.class);
private final HostName hostHostname = HostName.from("basehost1.test.yahoo.com");
- private final NodeAdminStateUpdater refresher = spy(new NodeAdminStateUpdater(
+ private final NodeAdminStateUpdater updater = spy(new NodeAdminStateUpdater(
nodeRepository, orchestrator, nodeAdmin, hostHostname));
@@ -58,19 +58,19 @@ public class NodeAdminStateUpdaterTest {
{
// Initially everything is frozen to force convergence
- assertResumeStateError(RESUMED, "NodeAdmin is not yet unfrozen");
+ assertConvergeError(RESUMED, "NodeAdmin is not yet unfrozen");
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- refresher.converge(RESUMED);
+ updater.converge(RESUMED);
verify(orchestrator, times(1)).resume(hostHostname.value());
// We are already resumed, so this should return without resuming again
- refresher.converge(RESUMED);
+ updater.converge(RESUMED);
verify(orchestrator, times(1)).resume(hostHostname.value());
verify(nodeAdmin, times(2)).setFrozen(eq(false));
// Lets try to suspend node admin only
when(nodeAdmin.setFrozen(eq(true))).thenReturn(false);
- assertResumeStateError(SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen");
+ assertConvergeError(SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen");
verify(nodeAdmin, times(2)).setFrozen(eq(false));
}
@@ -81,10 +81,10 @@ public class NodeAdminStateUpdaterTest {
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
doThrow(new RuntimeException(exceptionMessage)).doNothing()
.when(orchestrator).suspend(eq(hostHostname.value()));
- assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMessage);
+ assertConvergeError(SUSPENDED_NODE_ADMIN, exceptionMessage);
verify(nodeAdmin, times(2)).setFrozen(eq(false));
- refresher.converge(SUSPENDED_NODE_ADMIN);
+ updater.converge(SUSPENDED_NODE_ADMIN);
verify(nodeAdmin, times(2)).setFrozen(eq(false));
}
@@ -93,13 +93,13 @@ public class NodeAdminStateUpdaterTest {
final String exceptionMessage = "Failed to stop services";
verify(orchestrator, times(0)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
doThrow(new RuntimeException(exceptionMessage)).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
- assertResumeStateError(SUSPENDED, exceptionMessage);
+ assertConvergeError(SUSPENDED, exceptionMessage);
verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(suspendHostnames));
// Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
verify(nodeAdmin, times(2)).setFrozen(eq(false));
// Finally we are successful in transitioning to frozen
- refresher.converge(SUSPENDED);
+ updater.converge(SUSPENDED);
}
}
@@ -110,31 +110,38 @@ public class NodeAdminStateUpdaterTest {
// Initially everything is frozen to force convergence
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
- refresher.converge(RESUMED);
+ updater.converge(RESUMED);
verify(nodeAdmin, times(1)).setFrozen(eq(false));
+ verify(nodeAdmin, times(1)).refreshContainersToRun(any());
// Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
doThrow(new RuntimeException(exceptionMsg)).when(orchestrator).suspend(eq(hostHostname.value()));
- assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ assertConvergeError(SUSPENDED_NODE_ADMIN, exceptionMsg);
verify(nodeAdmin, times(1)).setFrozen(eq(true));
- assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(orchestrator, times(1)).suspend(eq(hostHostname.value()));
+ assertConvergeError(SUSPENDED_NODE_ADMIN, exceptionMsg);
verify(nodeAdmin, times(2)).setFrozen(eq(true));
- assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ verify(orchestrator, times(2)).suspend(eq(hostHostname.value()));
+ assertConvergeError(SUSPENDED_NODE_ADMIN, exceptionMsg);
verify(nodeAdmin, times(3)).setFrozen(eq(true));
- verify(nodeAdmin, times(1)).setFrozen(eq(false)); // No new unfreezes during last 2 ticks
+ verify(orchestrator, times(3)).suspend(eq(hostHostname.value()));
+
+ // No new unfreezes nor refresh while trying to freeze
+ verify(nodeAdmin, times(1)).setFrozen(eq(false));
verify(nodeAdmin, times(1)).refreshContainersToRun(any());
// Only resume and fetch containers when subsystem freeze duration expires
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofHours(1));
- assertResumeStateError(SUSPENDED_NODE_ADMIN, exceptionMsg);
+ assertConvergeError(SUSPENDED_NODE_ADMIN, "Timed out trying to freeze all nodes: will force an unfrozen tick");
verify(nodeAdmin, times(2)).setFrozen(eq(false));
+ verify(orchestrator, times(3)).suspend(eq(hostHostname.value())); // no new suspend calls
verify(nodeAdmin, times(2)).refreshContainersToRun(any());
// We change our mind, want to remain resumed
- refresher.converge(RESUMED);
+ updater.converge(RESUMED);
verify(nodeAdmin, times(3)).setFrozen(eq(false)); // Make sure that we unfreeze!
}
@@ -146,24 +153,24 @@ public class NodeAdminStateUpdaterTest {
// Resume and suspend only require that node-agents are frozen and permission from
// orchestrator to resume/suspend host. Therefore, if host is not active, we only need to freeze.
- refresher.converge(RESUMED);
+ updater.converge(RESUMED);
verify(orchestrator, never()).resume(eq(hostHostname.value()));
- refresher.converge(SUSPENDED_NODE_ADMIN);
+ updater.converge(SUSPENDED_NODE_ADMIN);
verify(orchestrator, never()).suspend(eq(hostHostname.value()));
// When doing batch suspend, only suspend the containers if the host is not active
List<String> activeHostnames = nodeRepository.getNodes(hostHostname.value()).stream()
.map(NodeSpec::getHostname)
.collect(Collectors.toList());
- refresher.converge(SUSPENDED);
+ updater.converge(SUSPENDED);
verify(orchestrator, times(1)).suspend(eq(hostHostname.value()), eq(activeHostnames));
}
- private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) {
+ private void assertConvergeError(NodeAdminStateUpdater.State targetState, String reason) {
try {
- refresher.converge(targetState);
- fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error");
+ updater.converge(targetState);
+ fail("Expected converging to " + targetState + " to fail with \"" + reason + "\", but it succeeded without error");
} catch (RuntimeException e) {
assertEquals(reason, e.getMessage());
}