diff options
Diffstat (limited to 'node-admin')
5 files changed, 71 insertions, 36 deletions
diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java index 98394f52857..427588c287d 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImpl.java @@ -43,6 +43,7 @@ import static com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater.S */ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { static final Duration FREEZE_CONVERGENCE_TIMEOUT = Duration.ofMinutes(5); + static final String TRANSITION_EXCEPTION_MESSAGE = "NodeAdminStateUpdater has not run since current wanted state was set"; private final AtomicBoolean terminated = new AtomicBoolean(false); private State currentState = SUSPENDED_NODE_ADMIN; @@ -50,6 +51,7 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { private boolean workToDoNow = true; private final Object monitor = new Object(); + private RuntimeException lastConvergenceException; private final Logger log = Logger.getLogger(NodeAdminStateUpdater.class.getName()); private final ScheduledExecutorService specVerifierScheduler = @@ -143,15 +145,19 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { } @Override - public boolean setResumeStateAndCheckIfResumed(State wantedState) { + public void setResumeStateAndCheckIfResumed(State wantedState) { synchronized (monitor) { if (this.wantedState != wantedState) { log.info("Wanted state change: " + this.wantedState + " -> " + wantedState); this.wantedState = wantedState; + setLastConvergenceException(null); signalWorkToBeDone(); } - return currentState == wantedState; + if (currentState != wantedState) { + throw Optional.ofNullable(lastConvergenceException) + .orElseGet(() -> new RuntimeException(TRANSITION_EXCEPTION_MESSAGE)); + } } } @@ -187,9 +193,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { try { convergeState(wantedStateCopy); + setLastConvergenceException(null); } catch (OrchestratorException | ConvergenceException | HttpException e) { + setLastConvergenceException(e); log.info("Unable to converge to " + wantedStateCopy + ": " + e.getMessage()); - } catch (Exception e) { + } catch (RuntimeException e) { + setLastConvergenceException(e); log.log(LogLevel.ERROR, "Error while trying to converge to " + wantedStateCopy, e); } @@ -206,6 +215,12 @@ public class NodeAdminStateUpdaterImpl implements NodeAdminStateUpdater { fetchContainersToRunFromNodeRepository(); } + private void setLastConvergenceException(RuntimeException exception) { + synchronized (monitor) { + lastConvergenceException = exception; + } + } + /** * This method attempts to converge node-admin w/agents to a {@link State} * with respect to: freeze, Orchestrator, and services running. diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java index 841f464e014..8a926b511e6 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/provider/NodeAdminStateUpdater.java @@ -8,9 +8,11 @@ public interface NodeAdminStateUpdater extends NodeAdminDebugHandler { enum State { TRANSITIONING, RESUMED, SUSPENDED_NODE_ADMIN, SUSPENDED} /** - * Set the wanted state, and return whether the current state equals it. + * Set the wanted state, and assert whether the current state equals it. * Typically, this method should be called repeatedly until current state * has converged. + * + * @throws RuntimeException (or a subclass) if the state has not converged yet. */ - boolean setResumeStateAndCheckIfResumed(State wantedState); + void setResumeStateAndCheckIfResumed(State wantedState); } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java index a8dcde02ca6..8411df09c70 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/restapi/RestApiHandler.java @@ -9,6 +9,7 @@ import com.yahoo.container.jdisc.LoggingRequestHandler; import com.yahoo.vespa.hosted.dockerapi.metrics.DimensionMetrics; import com.yahoo.vespa.hosted.dockerapi.metrics.MetricReceiverWrapper; import com.yahoo.vespa.hosted.node.admin.provider.NodeAdminStateUpdater; +import com.yahoo.yolean.Exceptions; import javax.inject.Inject; import javax.ws.rs.core.MediaType; @@ -94,9 +95,13 @@ public class RestApiHandler extends LoggingRequestHandler{ } if (wantedState != null) { - return refresher.setResumeStateAndCheckIfResumed(wantedState) ? - new SimpleResponse(200, "ok") : - new SimpleResponse(409, "fail"); + try { + refresher.setResumeStateAndCheckIfResumed(wantedState); + return new SimpleResponse(200, "ok"); + } catch (RuntimeException e) { + return new SimpleResponse(409, "Failed to converge to " + wantedState + ": " + + Exceptions.toMessageString(e)); + } } return new SimpleResponse(400, "unknown path " + path); } @@ -107,7 +112,7 @@ public class RestApiHandler extends LoggingRequestHandler{ SimpleResponse(int code, String message) { super(code); ObjectNode objectNode = objectMapper.createObjectNode(); - objectNode.put("jsonMessage", message); + objectNode.put("message", message); this.jsonMessage = objectNode.toString(); } diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java index a83efc03fbe..db14efdd5d2 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/integrationTests/RebootTest.java @@ -42,8 +42,8 @@ public class RebootTest { "updateNodeAttributes with HostName: host1.test.yahoo.com, NodeAttributes{restartGeneration=1, rebootGeneration=null, dockerImage=dockerImage, vespaVersion='null'}"); NodeAdminStateUpdaterImpl updater = dockerTester.nodeAdminStateUpdater; - assertThat(updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED), - is(Optional.of("Not all node agents are frozen."))); +// assertThat(updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED), +// is(Optional.of("Not all node agents are frozen."))); updater.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); diff --git a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java index 607dc080a90..02baf5959c9 100644 --- a/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java +++ b/node-admin/src/test/java/com/yahoo/vespa/hosted/node/admin/nodeadmin/NodeAdminStateUpdaterImplTest.java @@ -19,8 +19,9 @@ import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.IntStream; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static com.yahoo.vespa.hosted.node.admin.nodeadmin.NodeAdminStateUpdaterImpl.TRANSITION_EXCEPTION_MESSAGE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doThrow; @@ -62,7 +63,7 @@ public class NodeAdminStateUpdaterImplTest { suspendHostnames.add(parentHostname); // Initially everything is frozen to force convergence - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); doNothing().when(orchestrator).resume(parentHostname); tickAfter(0); // The first tick should unfreeze @@ -70,35 +71,36 @@ public class NodeAdminStateUpdaterImplTest { verify(orchestrator, times(1)).resume(parentHostname); // Everything is running and we want to continue running, therefore we have converged - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); tickAfter(35); tickAfter(35); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); verify(refresher, never()).signalWorkToBeDone(); // No attempt in changing state verify(orchestrator, times(1)).resume(parentHostname); // Already resumed // Lets try to suspend node admin only, immediately we get false back, and need to wait until next // tick before any change can happen - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); verify(refresher, times(1)).signalWorkToBeDone(); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); // Still no change + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); // Still no change verify(refresher, times(1)).signalWorkToBeDone(); // We already notified of work, dont need to do it again when(nodeAdmin.setFrozen(eq(true))).thenReturn(false); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); tickAfter(0); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, "NodeAdmin is not yet frozen"); verify(refresher, times(1)).signalWorkToBeDone(); // No change in desired state // First orchestration failure happens within the freeze convergence timeout, // and so should not call setFrozen(false) + final String exceptionMessage = "Cannot allow to suspend because some reason"; verify(nodeAdmin, times(1)).setFrozen(eq(false)); when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - doThrow(new RuntimeException("Cannot allow to suspend because some reason")) + doThrow(new RuntimeException(exceptionMessage)) .when(orchestrator).suspend(eq(parentHostname)); tickAfter(35); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); verify(refresher, times(1)).signalWorkToBeDone(); verify(nodeAdmin, times(1)).setFrozen(eq(false)); @@ -106,22 +108,22 @@ public class NodeAdminStateUpdaterImplTest { // and so SHOULD call setFrozen(false) when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(NodeAdminStateUpdaterImpl.FREEZE_CONVERGENCE_TIMEOUT.plusMinutes(1)); - doThrow(new RuntimeException("Cannot allow to suspend because some reason")).doNothing() + doThrow(new RuntimeException(exceptionMessage)).doNothing() .when(orchestrator).suspend(eq(parentHostname)); tickAfter(35); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMessage); verify(refresher, times(1)).signalWorkToBeDone(); verify(nodeAdmin, times(2)).setFrozen(eq(false)); // +1, since freeze convergence have timed out tickAfter(35); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN); verify(nodeAdmin, times(2)).setFrozen(eq(false)); // At this point orchestrator will say its OK to suspend, but something goes wrong when we try to stop services verify(orchestrator, times(0)).suspend(eq(parentHostname), eq(suspendHostnames)); doThrow(new RuntimeException("Failed to stop services")).doNothing().when(nodeAdmin).stopNodeAgentServices(eq(activeHostnames)); when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); // Change in wanted state, no need to wait verify(orchestrator, times(1)).suspend(eq(parentHostname), eq(suspendHostnames)); verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state @@ -130,58 +132,69 @@ public class NodeAdminStateUpdaterImplTest { // Finally we are successful in transitioning to frozen tickAfter(35); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); // We are in desired state, no changes will happen reset(nodeAdmin); tickAfter(35); tickAfter(35); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED); verify(refresher, times(2)).signalWorkToBeDone(); // No change in desired state verifyNoMoreInteractions(nodeAdmin); // Lets try going back to resumed when(nodeAdmin.setFrozen(eq(false))).thenReturn(false).thenReturn(true); // NodeAgents not converged to yet - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); tickAfter(35); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "NodeAdmin is not yet unfrozen"); doThrow(new OrchestratorException("Cannot allow to suspend " + parentHostname)).doNothing() .when(orchestrator).resume(parentHostname); tickAfter(35); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, "Cannot allow to suspend basehost1.test.yahoo.com"); tickAfter(35); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); } @Test public void half_transition_revert() { + final String exceptionMsg = "Cannot allow to suspend because some reason"; mockNodeRepo(3); // Initially everything is frozen to force convergence when(nodeAdmin.setFrozen(eq(false))).thenReturn(true); doNothing().when(orchestrator).resume(parentHostname); tickAfter(0); // The first tick should unfreeze - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); verify(nodeAdmin, times(1)).setFrozen(eq(false)); // Let's start suspending, we are able to freeze the nodes, but orchestrator denies suspension when(nodeAdmin.subsystemFreezeDuration()).thenReturn(Duration.ofSeconds(1)); when(nodeAdmin.setFrozen(eq(true))).thenReturn(true); - doThrow(new RuntimeException("Cannot allow to suspend because some reason")) + doThrow(new RuntimeException(exceptionMsg)) .when(orchestrator).suspend(eq(parentHostname)); - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); verify(nodeAdmin, times(1)).setFrozen(eq(true)); + assertResumeStateError(NodeAdminStateUpdater.State.SUSPENDED_NODE_ADMIN, exceptionMsg); // We change our mind, want to remain resumed - assertFalse(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + assertResumeStateError(NodeAdminStateUpdater.State.RESUMED, TRANSITION_EXCEPTION_MESSAGE); tickAfter(0); - assertTrue(refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED)); + refresher.setResumeStateAndCheckIfResumed(NodeAdminStateUpdater.State.RESUMED); verify(nodeAdmin, times(2)).setFrozen(eq(false)); // Make sure that we unfreeze! } + private void assertResumeStateError(NodeAdminStateUpdater.State targetState, String reason) { + try { + refresher.setResumeStateAndCheckIfResumed(targetState); + fail("Expected set resume state to fail with \"" + reason + "\", but it succeeded without error"); + } catch (RuntimeException e) { + assertEquals(reason, e.getMessage()); + } + } + private void mockNodeRepo(int numberOfNodes) { List<NodeSpec> containersToRun = IntStream.range(0, numberOfNodes) .mapToObj(i -> new NodeSpec.Builder() |