summaryrefslogtreecommitdiffstats
path: root/node-admin
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@oath.com>2018-05-29 14:58:28 +0200
committerValerij Fredriksen <valerijf@oath.com>2018-05-29 14:58:28 +0200
commitcd965e097cadef369e09c957e6f34f7cace1913a (patch)
tree1625e9b7995b9158df1c9f0c1b9b95c825eb0e15 /node-admin
parentb66ac5e2cfdbc2d6a3ba07af1768c9aeba72150e (diff)
Propagate NodeAdminStateUpdater convergence exception
Diffstat (limited to 'node-admin')
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java21
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java6
-rw-r--r--node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java13
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java4
-rw-r--r--node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java63
5 files changed, 71 insertions, 36 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
index 98394f52857..427588c287d 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java
@@ -43,6 +43,7 @@ import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.S
*/
public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5);
+ static final String TRANSITION_EXCEPTION_MESSAGE = "NodeAdminStateUpdater has not run since current wanted state was set";
private final AtomicBoolean terminated = new AtomicBoolean(false);
private State currentState = SUSPENDED_NODE_ADMIN;
@@ -50,6 +51,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
private boolean workToDoNow = true;
private final Object monitor = new Object();
+ private RuntimeException lastConvergenceException;
private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName());
private final ScheduledExecutorService specVerifierScheduler =
@@ -143,15 +145,19 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
}
@Override
- public boolean setResumeStateAndCheckIfResumed(State wantedState) {
+ public void setResumeStateAndCheckIfResumed(State wantedState) {
synchronized (monitor) {
if (this.wantedState != wantedState) {
log.info("Wanted state change: " + this.wantedState + " -> " + wantedState);
this.wantedState = wantedState;
+ setLastConvergenceException(null);
signalWorkToBeDone();
}
- return currentState == wantedState;
+ if (currentState != wantedState) {
+ throw Optional.ofNullable(lastConvergenceException)
+ .orElseGet(() -> new RuntimeException(TRANSITION_EXCEPTION_MESSAGE));
+ }
}
}
@@ -187,9 +193,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
try {
convergeState(wantedStateCopy);
+ setLastConvergenceException(null);
} catch (OrchestratorException | ConvergenceException | HttpException e) {
+ setLastConvergenceException(e);
log.info("Unable to converge to " + wantedStateCopy + ": " + e.getMessage());
- } catch (Exception e) {
+ } catch (RuntimeException e) {
+ setLastConvergenceException(e);
log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e);
}
@@ -206,6 +215,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater {
fetchContainersToRunFromNodeRepository();
}
+ private void setLastConvergenceException(RuntimeException exception) {
+ synchronized (monitor) {
+ lastConvergenceException = exception;
+ }
+ }
+
/**
* This method attempts to converge node-admin w/agents to a {@link State}
* with respect to: freeze, Orchestrator, and services running.
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java
index 841f464e014..8a926b511e6 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java
@@ -8,9 +8,11 @@ public interface NodeAdminStateUpdater extends NodeAdminDebugHandler {
enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED}
/**
- * Set the wanted state, and return whether the current state equals it.
+ * Set the wanted state, and assert whether the current state equals it.
* Typically, this method should be called repeatedly until current state
* has converged.
+ *
+ * @throws RuntimeException (or a subclass) if the state has not converged yet.
*/
- boolean setResumeStateAndCheckIfResumed(State wantedState);
+ void setResumeStateAndCheckIfResumed(State wantedState);
}
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java
index a8dcde02ca6..8411df09c70 100644
--- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java
+++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java
@@ -9,6 +9,7 @@ import com.yahoo.container.jdisc.LoggingRequestHandler;
import com.yahoo.vespa.hosted.dockerapi.metrics.DimensionMetrics;
import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper;
import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater;
+import com.yahoo.yolean.Exceptions;
import javax.inject.Inject;
import javax.ws.rs.core.MediaType;
@@ -94,9 +95,13 @@ public class RestApiHandler extends LoggingRequestHandler{
}
if (wantedState != null) {
- return refresher.setResumeStateAndCheckIfResumed(wantedState) ?
- new SimpleResponse(200, "ok") :
- new SimpleResponse(409, "fail");
+ try {
+ refresher.setResumeStateAndCheckIfResumed(wantedState);
+ return new SimpleResponse(200, "ok");
+ } catch (RuntimeException e) {
+ return new SimpleResponse(409, "Failed to converge to " + wantedState + ": " +
+ Exceptions.toMessageString(e));
+ }
}
return new SimpleResponse(400, "unknown path " + path);
}
@@ -107,7 +112,7 @@ public class RestApiHandler extends LoggingRequestHandler{
SimpleResponse(int code, String message) {
super(code);
ObjectNode objectNode = objectMapper.createObjectNode();
- objectNode.put("jsonMessage", message);
+ objectNode.put("message", message);
this.jsonMessage = objectNode.toString();
}
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
index a83efc03fbe..db14efdd5d2 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java
@@ -42,8 +42,8 @@ public class RebootTest {
"updateNodeAttributes with HostName: host1.test.yahoo.com, NodeAttributes{restartGeneration=1, rebootGeneration=null, dockerImage=dockerImage, vespaVersion='null'}");
NodeAdminStateUpdaterImpl updater = dockerTester.nodeAdminStateUpdater;
- assertThat(updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED),
- is(Optional.of("Not all node agents are frozen.")));
+// assertThat(updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED),
+// is(Optional.of("Not all node agents are frozen.")));
updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
index 607dc080a90..02baf5959c9 100644
--- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
+++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java
@@ -19,8 +19,9 @@ import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
+import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
@@ -62,7 +63,7 @@ public class NodeAdminStateUpdaterImplTest {
suspendHostnames.add(parentHostname);
// Initially everything is frozen to force convergence
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
doNothing().when(orchestrator).resume(parentHostname);
tickAfter(0); // The first tick should unfreeze
@@ -70,35 +71,36 @@ public class NodeAdminStateUpdaterImplTest {
verify(orchestrator, times(1)).resume(parentHostname);
// Everything is running and we want to continue running, therefore we have converged
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
tickAfter(35);
tickAfter(35);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state
verify(orchestrator, times(1)).resume(parentHostname); // Already resumed
// Lets try to suspend node admin only, immediately we get false back, and need to wait until next
// tick before any change can happen
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
verify(refresher, times(1)).signalWorkToBeDone();
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); // Still no change
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); // Still no change
verify(refresher, times(1)).signalWorkToBeDone(); // We already notified of work, dont need to do it again
when(nodeAdmin.setFrozen(eq(true))).thenReturn(false);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
tickAfter(0);
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen");
verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state
// First orchestration failure happens within the freeze convergence timeout,
// and so should not call setFrozen(false)
+ final String exceptionMessage = "Cannot allow to suspend because some reason";
verify(nodeAdmin, times(1)).setFrozen(eq(false));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- doThrow(new RuntimeException("Cannot allow to suspend because some reason"))
+ doThrow(new RuntimeException(exceptionMessage))
.when(orchestrator).suspend(eq(parentHostname));
tickAfter(35);
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
verify(refresher, times(1)).signalWorkToBeDone();
verify(nodeAdmin, times(1)).setFrozen(eq(false));
@@ -106,22 +108,22 @@ public class NodeAdminStateUpdaterImplTest {
// and so SHOULD call setFrozen(false)
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1));
- doThrow(new RuntimeException("Cannot allow to suspend because some reason")).doNothing()
+ doThrow(new RuntimeException(exceptionMessage)).doNothing()
.when(orchestrator).suspend(eq(parentHostname));
tickAfter(35);
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage);
verify(refresher, times(1)).signalWorkToBeDone();
verify(nodeAdmin, times(2)).setFrozen(eq(false)); // +1, since freeze convergence have timed out
tickAfter(35);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN);
verify(nodeAdmin, times(2)).setFrozen(eq(false));
// At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services
verify(orchestrator, times(0)).suspend(eq(parentHostname), eq(suspendHostnames));
doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames));
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0); // Change in wanted state, no need to wait
verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(suspendHostnames));
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
@@ -130,58 +132,69 @@ public class NodeAdminStateUpdaterImplTest {
// Finally we are successful in transitioning to frozen
tickAfter(35);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
// We are in desired state, no changes will happen
reset(nodeAdmin);
tickAfter(35);
tickAfter(35);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED);
verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state
verifyNoMoreInteractions(nodeAdmin);
// Lets try going back to resumed
when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(35);
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen");
doThrow(new OrchestratorException("Cannot allow to suspend " + parentHostname)).doNothing()
.when(orchestrator).resume(parentHostname);
tickAfter(35);
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com");
tickAfter(35);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
}
@Test
public void half_transition_revert() {
+ final String exceptionMsg = "Cannot allow to suspend because some reason";
mockNodeRepo(3);
// Initially everything is frozen to force convergence
when(nodeAdmin.setFrozen(eq(false))).thenReturn(true);
doNothing().when(orchestrator).resume(parentHostname);
tickAfter(0); // The first tick should unfreeze
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
verify(nodeAdmin, times(1)).setFrozen(eq(false));
// Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension
when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1));
when(nodeAdmin.setFrozen(eq(true))).thenReturn(true);
- doThrow(new RuntimeException("Cannot allow to suspend because some reason"))
+ doThrow(new RuntimeException(exceptionMsg))
.when(orchestrator).suspend(eq(parentHostname));
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0);
verify(nodeAdmin, times(1)).setFrozen(eq(true));
+ assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg);
// We change our mind, want to remain resumed
- assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE);
tickAfter(0);
- assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED));
+ refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED);
verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze!
}
+ private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) {
+ try {
+ refresher.setResumeStateAndCheckIfResumed(targetState);
+ fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error");
+ } catch (RuntimeException e) {
+ assertEquals(reason, e.getMessage());
+ }
+ }
+
private void mockNodeRepo(int numberOfNodes) {
List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes)
.mapToObj(i -> new NodeSpec.Builder()