summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@oath.com>2017-12-01 11:35:21 +0100
committerValerij Fredriksen <valerijf@oath.com>2017-12-01 11:35:21 +0100
commit336ae2c005c07c1f55295414e921e88a8ec81679 (patch)
tree7c52e4d75eee584863a0b147b6f64851e7be4cd3 /node-admin
parenta1be8bde80468ccf2be3b9147ce68cfc281debfb (diff)
Add TRANSITIONING state in NodeAdminStateUpdater
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java14
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java70
2 files changed, 59 insertions, 25 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index 906e6b55640..e380e7a2e26 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -34,6 +34,7 @@ import java.util.stream.Collectors;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.TRANSITIONING;
/**
* Pulls information from node repository and forwards containers to run to node admin.
@@ -110,15 +111,15 @@ public class NodeAdminStateUpdater {
return this.getClass().getSimpleName() + "@" + Integer.toString(System.identityHashCode(this));
}
- public enum State { RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED}
+ public enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED}
public Map<String, Object> getDebugPage() {
Map<String, Object> debug = new LinkedHashMap<>();
synchronized (monitor) {
debug.put("dockerHostHostName", dockerHostHostName);
+ debug.put("wantedState", wantedState);
+ debug.put("currentState", currentState);
debug.put("NodeAdmin", nodeAdmin.debugInfo());
- debug.put("Wanted State: ", wantedState);
- debug.put("Current State: ", currentState);
}
return debug;
}
@@ -185,7 +186,7 @@ public class NodeAdminStateUpdater {
log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e);
}
- if (wantedStateCopy != RESUMED && currentState == RESUMED) {
+ if (wantedStateCopy != RESUMED && currentState == TRANSITIONING) {
Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
// We have spent too much time trying to freeze and node admin is still not frozen.
@@ -203,8 +204,9 @@ public class NodeAdminStateUpdater {
* with respect to: freeze, Orchestrator, and services running.
*/
private void convergeState(State wantedState) {
- if (currentState == wantedState) {
- return;
+ if (currentState == wantedState) return;
+ synchronized (monitor) {
+ currentState = TRANSITIONING;
}
boolean wantFrozen = wantedState != RESUMED;
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
index ce062702a3b..1058278a02c 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -13,8 +13,9 @@ import org.junit.Test;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -50,26 +51,13 @@ public class NodeAdminStateUpdaterTest {
@Test
public void testStateConvergence() throws IOException {
- List<ContainerNodeSpec> containersToRun = new ArrayList<>();
- for (int i = 0; i < 10; i++) {
- containersToRun.add(
- new ContainerNodeSpec.Builder()
- .hostname("host" + i + ".test.yahoo.com")
- .nodeState(i % 3 == 0 ? Node.State.active : Node.State.ready)
- .nodeType("tenant")
- .nodeFlavor("docker")
- .minCpuCores(1)
- .minMainMemoryAvailableGb(1)
- .minDiskAvailableGb(1)
- .build());
- }
- List<String> activeHostnames = Arrays.asList(
- "host0.test.yahoo.com", "host3.test.yahoo.com", "host6.test.yahoo.com", "host9.test.yahoo.com");
+ mockNodeRepo(4);
+ List<String> activeHostnames = nodeRepository.getContainersToRun(parentHostname).stream()
+ .map(node -> node.hostname)
+ .collect(Collectors.toList());
List<String> suspendHostnames = new ArrayList<>(activeHostnames);
suspendHostnames.add(parentHostname);
- when(nodeRepository.getContainersToRun(eq(parentHostname))).thenReturn(containersToRun);
-
// Initially everything is frozen to force convergence
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
@@ -148,7 +136,7 @@ public class NodeAdminStateUpdaterTest {
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
verifyNoMoreInteractions(nodeAdmin);
-
+
// Lets try going back to resumed
when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
@@ -163,6 +151,50 @@ public class NodeAdminStateUpdaterTest {
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
}
+ @Test
+ public void half_transition_revert() throws IOException {
+ mockNodeRepo(3);
+
+ // Initially everything is frozen to force convergence
+ when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
+ doNothing().when(orchestrator).resume(parentHostname);
+ tickAfter(0); // The first tick should unfreeze
+ assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ verify(nodeAdmin, times(1)).setFrozen(eq(false));
+
+ // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
+ when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
+ when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
+ doThrow(new RuntimeException("Cannot allow to suspend because some reason"))
+ .when(orchestrator).suspend(eq(parentHostname));
+
+ assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ tickAfter(0);
+ verify(nodeAdmin, times(1)).setFrozen(eq(true));
+
+ // We change our mind, want to remain resumed
+ assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ tickAfter(0);
+ assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze!
+ }
+
+ private void mockNodeRepo(int numberOfNodes) throws IOException {
+ List<ContainerNodeSpec> containersToRun = IntStream.range(0, numberOfNodes)
+ .mapToObj(i -> new ContainerNodeSpec.Builder()
+ .hostname("host" + i + ".test.yahoo.com")
+ .nodeState(Node.State.active)
+ .nodeType("tenant")
+ .nodeFlavor("docker")
+ .minCpuCores(1)
+ .minMainMemoryAvailableGb(1)
+ .minDiskAvailableGb(1)
+ .build())
+ .collect(Collectors.toList());
+
+ when(nodeRepository.getContainersToRun(eq(parentHostname))).thenReturn(containersToRun);
+ }
+
private void tickAfter(int seconds) {
clock.advance(Duration.ofSeconds(seconds));
refresher.tick();