diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-11-04 15:45:23 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-11-04 15:45:23 +0100 |
commit | a4eea5536f95e89890a1fdf6c347a4d8079144c2 (patch) | |
tree | 2bb6a8f7afa38feec568fd118c287d225a409f94 /clustercontroller-core/src/test/java/com/yahoo | |
parent | d99e553e2b8358203dcbb0edde53ae14b51042a7 (diff) |
Add non-converged nodes to task deadline exceeded messages
Makes it easier for an external observer to understand what set of nodes
is causing the cluster state to not converge.
Diffstat (limited to 'clustercontroller-core/src/test/java/com/yahoo')
2 files changed, 53 insertions, 14 deletions
```diff
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
index 2c8220c0dba..5f9e0d56cfa 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
@@ -1263,25 +1263,24 @@ public class StateChangeTest extends FleetControllerTest {
 
     private static abstract class MockTask extends RemoteClusterControllerTask {
         boolean invoked = false;
-        boolean leadershipLost = false;
-        boolean deadlineExceeded = false;
+        Failure failure;
 
         boolean isInvoked() { return invoked; }
 
-        boolean isLeadershipLost() { return leadershipLost; }
+        boolean isLeadershipLost() {
+            return (failure != null) && (failure.getCondition() == FailureCondition.LEADERSHIP_LOST);
+        }
 
-        boolean isDeadlineExceeded() { return deadlineExceeded; }
+        boolean isDeadlineExceeded() {
+            return (failure != null) && (failure.getCondition() == FailureCondition.DEADLINE_EXCEEDED);
+        }
 
         @Override
         public boolean hasVersionAckDependency() { return true; }
 
         @Override
-        public void handleFailure(FailureCondition condition) {
-            if (condition == FailureCondition.LEADERSHIP_LOST) {
-                this.leadershipLost = true;
-            } else if (condition == FailureCondition.DEADLINE_EXCEEDED) {
-                this.deadlineExceeded = true;
-            }
+        public void handleFailure(Failure failure) {
+            this.failure = failure;
         }
     }
 
@@ -1600,4 +1599,43 @@ public class StateChangeTest extends FleetControllerTest {
         assertTrue(task.isDeadlineExceeded());
     }
 
+    private void doTestTaskDeadlineExceeded(boolean deferredActivation, String expectedMessage) throws Exception {
+        FleetControllerOptions options = defaultOptions();
+        options.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60));
+        options.enableTwoPhaseClusterStateActivation = deferredActivation;
+        options.maxDivergentNodesPrintedInTaskErrorMessages = 10;
+        RemoteTaskFixture fixture = createFixtureWith(options);
+
+        MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects();
+        communicator.setShouldDeferDistributorClusterStateAcks(true);
+        fixture.processScheduledTask();
+
+        assertTrue(task.isInvoked());
+        assertFalse(task.isCompleted());
+        assertFalse(task.isDeadlineExceeded());
+        timer.advanceTime(60_001);
+        ctrl.tick();
+        assertTrue(task.isCompleted());
+        assertTrue(task.isDeadlineExceeded());
+        // If we're not using two-phase activation for this test, all storage nodes have ACKed
+        // the bundle, but the distributors are explicitly deferred. If we used two-phase activation,
+        // all distributors and storage nodes will be listed here.
+        assertEquals(expectedMessage, task.failure.getMessage());
+    }
+
+    @Test
+    public void task_not_completed_within_deadline_lists_nodes_not_converged_in_error_message() throws Exception {
+        doTestTaskDeadlineExceeded(false, "the following nodes have not converged to " +
+                "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " +
+                "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9");
+    }
+
+    @Test
+    public void task_not_completed_within_deadline_with_deferred_activation_checks_activation_version() throws Exception {
+        doTestTaskDeadlineExceeded(true, "the following nodes have not converged to " +
+                "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " +
+                "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9 " +
+                "(... and 10 more)");
+    }
+
 }
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
index 067830fd470..824a7af55c4 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
@@ -428,24 +428,25 @@ public class SetNodeStateTest extends StateRestApiTest {
         expectedException.expect(UnknownMasterException.class);
 
         SetNodeStateRequest request = createDummySetNodeStateRequest();
-        request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+        request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
         request.getResult();
     }
 
     @Test
     public void leadership_loss_marks_request_as_failed_for_early_out_response() {
         SetNodeStateRequest request = createDummySetNodeStateRequest();
-        request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+        request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
         assertTrue(request.isFailed());
     }
 
     @Test
     public void deadline_exceeded_fails_set_node_state_request() throws Exception {
-        expectedException.expectMessage("Task exceeded its version wait deadline");
+        expectedException.expectMessage("Task exceeded its version wait deadline: gremlins in the computer");
         expectedException.expect(DeadlineExceededException.class);
 
         SetNodeStateRequest request = createDummySetNodeStateRequest();
-        request.handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED);
+        request.handleFailure(RemoteClusterControllerTask.Failure.of(
+                RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED, "gremlins in the computer"));
         request.getResult();
     }
 
```