aboutsummaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@yahoo-inc.com>2017-07-11 16:15:21 +0200
committerHåkon Hallingstad <hakon@yahoo-inc.com>2017-07-11 16:15:21 +0200
commitcac86d1d5ae9a4583eb1e93e2fa1517f6bff58dd (patch)
tree45406407174adc144b4e349561e30d3b00c88fba /node-admin
parent9f9bd607d5dd9ac225f098cccf9dace3cb5931a6 (diff)
Avoid getting permission to suspend Docker nodes when upgrading node admin
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java127
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java13
2 files changed, 71 insertions, 69 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
index e3473fc09b0..24d2d3cd190 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdater.java
@@ -22,7 +22,6 @@ import java.util.logging.Logger;
import java.util.stream.Collectors;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.RESUMED;
-import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED;
import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN;
/**
@@ -101,7 +100,7 @@ public class NodeAdminStateUpdater extends AbstractComponent {
}
void tick() {
- State wantedState = null;
+ State wantedStateCopy;
synchronized (monitor) {
while (! workToDoNow) {
long remainder = delaysBetweenEachTickMillis - Duration.between(lastTick, clock.instant()).toMillis();
@@ -116,30 +115,25 @@ public class NodeAdminStateUpdater extends AbstractComponent {
lastTick = clock.instant();
workToDoNow = false;
- if (currentState != this.wantedState) {
- wantedState = this.wantedState;
- }
+ // wantedState may change asynchronously, so we grab a copy of it here
+ wantedStateCopy = this.wantedState;
}
- if (wantedState != null) { // There is a state we want to be in, but aren't right now
- boolean converged = false;
- try {
- convergeState(wantedState);
- converged = true;
- } catch (OrchestratorException | ConvergenceException e) {
- log.info("Unable to converge to " + wantedState + ": " + e.getMessage());
- } catch (Exception e) {
- log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedState, e);
- }
+ try {
+ convergeState(wantedStateCopy);
+ } catch (OrchestratorException | ConvergenceException e) {
+ log.info("Unable to converge to " + wantedStateCopy + ": " + e.getMessage());
+ } catch (Exception e) {
+ log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e);
+ }
- if (wantedState != RESUMED && !converged) {
- Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
- if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
- // We have spent too long time trying to freeze and node admin is still not frozen.
- // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
- log.info("Timed out trying to freeze, will force unfreezed ticks");
- nodeAdmin.setFrozen(false);
- }
+ if (wantedStateCopy != RESUMED && currentState == RESUMED) {
+ Duration subsystemFreezeDuration = nodeAdmin.subsystemFreezeDuration();
+ if (subsystemFreezeDuration.compareTo(FREEZE_CONVERGENCE_TIMEOUT) > 0) {
+ // We have spent too much time trying to freeze and node admin is still not frozen.
+ // To avoid node agents stalling for too long, we'll force unfrozen ticks now.
+ log.info("Timed out trying to freeze, will force unfreezed ticks");
+ nodeAdmin.setFrozen(false);
}
}
@@ -147,50 +141,53 @@ public class NodeAdminStateUpdater extends AbstractComponent {
}
/**
- * This method attempts to converge node-admin towards one of the {@link State}
+ * This method attempts to converge node-admin w/agents to a {@link State}
+ * with respect to: freeze, Orchestrator, and services running.
*/
private void convergeState(State wantedState) {
+ if (currentState == wantedState) {
+ return;
+ }
+
boolean wantFrozen = wantedState != RESUMED;
if (!nodeAdmin.setFrozen(wantFrozen)) {
throw new ConvergenceException("NodeAdmin is not yet " + (wantFrozen ? "frozen" : "unfrozen"));
}
- if (wantedState == RESUMED) {
- orchestrator.resume(dockerHostHostName);
- if (wantedState == updateAndGetCurrentState(RESUMED)) return;
- }
-
- // Fetch active nodes from node repo before suspending nodes.
- // It is only possible to suspend active nodes,
- // the orchestrator will fail if trying to suspend nodes in other states.
- // Even though state is frozen we need to interact with node repo, but
- // the data from node repo should not be used for anything else.
- // We should also suspend host's hostname to suspend node-admin
- List<String> nodesInActiveState;
- try {
- nodesInActiveState = getNodesInActiveState();
- } catch (IOException e) {
- throw new RuntimeException("Failed to get nodes from node repo", e);
- }
-
- if (currentState == RESUMED) {
- List<String> nodesToSuspend = new ArrayList<>(nodesInActiveState);
- nodesToSuspend.add(dockerHostHostName);
- orchestrator.suspend(dockerHostHostName, nodesToSuspend);
-
- if (wantedState == updateAndGetCurrentState(SUSPENDED_NODE_ADMIN)) return;
+ switch (wantedState) {
+ case RESUMED:
+ orchestrator.resume(dockerHostHostName);
+ break;
+ case SUSPENDED_NODE_ADMIN:
+ orchestrator.suspend(dockerHostHostName);
+ break;
+ case SUSPENDED:
+ // Fetch active nodes from node repo before suspending nodes.
+ // It is only possible to suspend active nodes,
+ // the orchestrator will fail if trying to suspend nodes in other states.
+ // Even though state is frozen we need to interact with node repo, but
+ // the data from node repo should not be used for anything else.
+ // We should also suspend host's hostname to suspend node-admin
+ List<String> nodesInActiveState = getNodesInActiveState();
+
+ List<String> nodesToSuspend = new ArrayList<>();
+ nodesToSuspend.addAll(nodesInActiveState);
+ nodesToSuspend.add(dockerHostHostName);
+ orchestrator.suspend(dockerHostHostName, nodesToSuspend);
+
+ // The node agent services are stopped by this thread, which is OK only
+ // because the node agents are frozen (see above).
+ nodeAdmin.stopNodeAgentServices(nodesInActiveState);
+ break;
+ default:
+ throw new IllegalStateException("Unknown wanted state " + wantedState);
}
- nodeAdmin.stopNodeAgentServices(nodesInActiveState);
- updateAndGetCurrentState(SUSPENDED);
- }
-
- private State updateAndGetCurrentState(State currentState) {
+ log.info("State changed from " + currentState + " to " + wantedState);
synchronized (monitor) {
- log.info("State change: " + this.currentState + " -> " + currentState
- + (currentState == wantedState ? " [converged]" : ""));
- this.currentState = currentState;
- return currentState;
+ // Writes to currentState must be synchronized. Reads doesn't have to since this thread
+ // is the only one modifying it.
+ currentState = wantedState;
}
}
@@ -221,12 +218,16 @@ public class NodeAdminStateUpdater extends AbstractComponent {
}
}
- private List<String> getNodesInActiveState() throws IOException {
- return nodeRepository.getContainersToRun()
- .stream()
- .filter(nodespec -> nodespec.nodeState == Node.State.active)
- .map(nodespec -> nodespec.hostname)
- .collect(Collectors.toList());
+ private List<String> getNodesInActiveState() {
+ try {
+ return nodeRepository.getContainersToRun()
+ .stream()
+ .filter(nodespec -> nodespec.nodeState == Node.State.active)
+ .map(nodespec -> nodespec.hostname)
+ .collect(Collectors.toList());
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to get nodes from node repo", e);
+ }
}
public void start(long stateConvergeInterval) {
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
index d68095e770a..f1f269adf5f 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterTest.java
@@ -97,8 +97,8 @@ public class NodeAdminStateUpdaterTest {
verify(nodeAdmin, times(1)).setFrozen(eq(false));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- doThrow(new RuntimeException("Cannot allow to suspend because some reason")).doNothing()
- .when(orchestrator).suspend(eq(parentHostname), eq(suspendHostnames));
+ doThrow(new RuntimeException("Cannot allow to suspend because some reason"))
+ .when(orchestrator).suspend(eq(parentHostname));
tickAfter(35);
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(refresher, times(1)).signalWorkToBeDone();
@@ -109,7 +109,7 @@ public class NodeAdminStateUpdaterTest {
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdater.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1));
doThrow(new RuntimeException("Cannot allow to suspend because some reason")).doNothing()
- .when(orchestrator).suspend(eq(parentHostname), eq(suspendHostnames));
+ .when(orchestrator).suspend(eq(parentHostname));
tickAfter(35);
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(refresher, times(1)).signalWorkToBeDone();
@@ -119,11 +119,13 @@ public class NodeAdminStateUpdaterTest {
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
verify(nodeAdmin, times(2)).setFrozen(eq(false));
- // At this point orchestrator says its OK to suspend, but something goes wrong when we try to stop services
+ // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services
+ verify(orchestrator, times(0)).suspend(eq(parentHostname), eq(suspendHostnames));
doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
tickAfter(0); // Change in wanted state, no need to wait
+ verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(suspendHostnames));
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
// Make sure we dont roll back if we fail to stop services - we will try to stop again next tick
verify(nodeAdmin, times(2)).setFrozen(eq(false));
@@ -139,8 +141,7 @@ public class NodeAdminStateUpdaterTest {
assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
verifyNoMoreInteractions(nodeAdmin);
-
-
+
// Lets try going back to resumed
when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet
assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));