diff options
Diffstat (limited to 'clustercontroller-core/src')
6 files changed, 142 insertions, 26 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index cbff11af730..4a619fa47f2 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -426,12 +426,14 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private void failAllVersionDependentTasks() { tasksPendingStateRecompute.forEach(task -> { - task.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + task.handleFailure(RemoteClusterControllerTask.Failure.of( + RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST)); task.notifyCompleted(); }); tasksPendingStateRecompute.clear(); taskCompletionQueue.forEach(task -> { - task.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + task.getTask().handleFailure(RemoteClusterControllerTask.Failure.of( + RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST)); task.getTask().notifyCompleted(); }); taskCompletionQueue.clear(); @@ -747,6 +749,43 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd return context; } + private static long effectiveActivatedStateVersion(NodeInfo nodeInfo, ClusterStateBundle bundle) { + return bundle.deferredActivation() + ? nodeInfo.getClusterStateVersionActivationAcked() + : nodeInfo.getClusterStateVersionBundleAcknowledged(); + } + + private List<Node> enumerateNodesNotYetAckedAtLeastVersion(long version) { + var bundle = systemStateBroadcaster.getClusterStateBundle(); + if (bundle == null) { + return List.of(); + } + return cluster.getNodeInfo().stream(). + filter(n -> effectiveActivatedStateVersion(n, bundle) < version). + map(NodeInfo::getNode). + collect(Collectors.toList()); + } + + private static <E> String stringifyListWithLimits(List<E> list, int limit) { + if (list.size() > limit) { + var sub = list.subList(0, limit); + return String.format("%s (... and %d more)", + sub.stream().map(E::toString).collect(Collectors.joining(", ")), + list.size() - limit); + } else { + return list.stream().map(E::toString).collect(Collectors.joining(", ")); + } + } + + private String buildNodesNotYetConvergedMessage(long taskConvergeVersion) { + var nodes = enumerateNodesNotYetAckedAtLeastVersion(taskConvergeVersion); + if (nodes.isEmpty()) { + return ""; + } + return String.format("the following nodes have not converged to at least version %d: %s", + taskConvergeVersion, stringifyListWithLimits(nodes, options.maxDivergentNodesPrintedInTaskErrorMessages)); + } + private boolean completeSatisfiedVersionDependentTasks() { int publishedVersion = systemStateBroadcaster.lastClusterStateVersionInSync(); long queueSizeBefore = taskCompletionQueue.size(); @@ -766,9 +805,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd taskCompletion.getTask().notifyCompleted(); taskCompletionQueue.remove(); } else if (taskCompletion.getDeadlineTimePointMs() <= now) { - log.log(LogLevel.WARNING, () -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure", - taskCompletion.getTask().getClass().getName())); - taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED); + var details = buildNodesNotYetConvergedMessage(taskCompletion.getMinimumVersion()); + log.log(LogLevel.WARNING, () -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure (details: %s)", + taskCompletion.getTask().getClass().getName(), details)); + taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.Failure.of( + RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED, details)); taskCompletion.getTask().notifyCompleted(); taskCompletionQueue.remove(); } else { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index 5e9e91e1cb6..553b3332ee8 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -126,6 +126,8 @@ public class FleetControllerOptions implements Cloneable { // TODO: Choose a default value public double minMergeCompletionRatio = 1.0; + public int maxDivergentNodesPrintedInTaskErrorMessages = 10; + // TODO: Replace usage of this by usage where the nodes are explicitly passed (below) public FleetControllerOptions(String clusterName) { this.clusterName = clusterName; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java index 8382e127e13..9322ba9ec58 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java @@ -48,6 +48,30 @@ public abstract class RemoteClusterControllerTask { DEADLINE_EXCEEDED } + public static class Failure { + private final FailureCondition condition; + private final String message; + + private Failure(FailureCondition condition, String message) { + this.condition = condition; + this.message = message; + } + public static Failure of(FailureCondition condition, String message) { + return new Failure(condition, message); + } + public static Failure of(FailureCondition condition) { + return new Failure(condition, ""); + } + + public FailureCondition getCondition() { + return condition; + } + + public String getMessage() { + return message; + } + } + /** * If the task completion has been deferred due to hasVersionAckDependency(), * this method will be invoked if a failure occurs before the version has @@ -64,9 +88,10 @@ public abstract class RemoteClusterControllerTask { * before the dependent cluster version has been published. * * The task implementation is responsible for communicating the appropriate - * error semantics to the caller who initially scheduled the task. + * error semantics to the caller who initially scheduled the task. If additional + * details are available, Failure.getMessage() will return a non-empty string. */ - public void handleFailure(FailureCondition condition) {} + public void handleFailure(Failure failure) {} public Optional<Instant> getDeadline() { return Optional.empty(); diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java index 4ef62ad3fdf..c1b653074a5 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java @@ -61,12 +61,21 @@ public abstract class Request<Result> extends RemoteClusterControllerTask { } } + private static String failureStringWithPossibleMessage(String prefix, String message) { + if (message != null && !message.isEmpty()) { + return String.format("%s: %s", prefix, message); + } + return prefix; + } + @Override - public void handleFailure(FailureCondition condition) { - if (condition == FailureCondition.LEADERSHIP_LOST) { - failure = new UnknownMasterException("Leadership lost before request could complete"); - } else if (condition == FailureCondition.DEADLINE_EXCEEDED) { - failure = new DeadlineExceededException("Task exceeded its version wait deadline"); + public void handleFailure(Failure failure) { + if (failure.getCondition() == FailureCondition.LEADERSHIP_LOST) { + this.failure = new UnknownMasterException(failureStringWithPossibleMessage( + "Leadership lost before request could complete", failure.getMessage())); + } else if (failure.getCondition() == FailureCondition.DEADLINE_EXCEEDED) { + this.failure = new DeadlineExceededException(failureStringWithPossibleMessage( + "Task exceeded its version wait deadline", failure.getMessage())); } } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index 2c8220c0dba..5f9e0d56cfa 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -1263,25 +1263,24 @@ public class StateChangeTest extends FleetControllerTest { private static abstract class MockTask extends RemoteClusterControllerTask { boolean invoked = false; - boolean leadershipLost = false; - boolean deadlineExceeded = false; + Failure failure; boolean isInvoked() { return invoked; } - boolean isLeadershipLost() { return leadershipLost; } + boolean isLeadershipLost() { + return (failure != null) && (failure.getCondition() == FailureCondition.LEADERSHIP_LOST); + } - boolean isDeadlineExceeded() { return deadlineExceeded; } + boolean isDeadlineExceeded() { + return (failure != null) && (failure.getCondition() == FailureCondition.DEADLINE_EXCEEDED); + } @Override public boolean hasVersionAckDependency() { return true; } @Override - public void handleFailure(FailureCondition condition) { - if (condition == FailureCondition.LEADERSHIP_LOST) { - this.leadershipLost = true; - } else if (condition == FailureCondition.DEADLINE_EXCEEDED) { - this.deadlineExceeded = true; - } + public void handleFailure(Failure failure) { + this.failure = failure; } } @@ -1600,4 +1599,43 @@ public class StateChangeTest extends FleetControllerTest { assertTrue(task.isDeadlineExceeded()); } + private void doTestTaskDeadlineExceeded(boolean deferredActivation, String expectedMessage) throws Exception { + FleetControllerOptions options = defaultOptions(); + options.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60)); + options.enableTwoPhaseClusterStateActivation = deferredActivation; + options.maxDivergentNodesPrintedInTaskErrorMessages = 10; + RemoteTaskFixture fixture = createFixtureWith(options); + + MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); + communicator.setShouldDeferDistributorClusterStateAcks(true); + fixture.processScheduledTask(); + + assertTrue(task.isInvoked()); + assertFalse(task.isCompleted()); + assertFalse(task.isDeadlineExceeded()); + timer.advanceTime(60_001); + ctrl.tick(); + assertTrue(task.isCompleted()); + assertTrue(task.isDeadlineExceeded()); + // If we're not using two-phase activation for this test, all storage nodes have ACKed + // the bundle, but the distributors are explicitly deferred. If we used two-phase activation, + // all distributors and storage nodes will be listed here. + assertEquals(expectedMessage, task.failure.getMessage()); + } + + @Test + public void task_not_completed_within_deadline_lists_nodes_not_converged_in_error_message() throws Exception { + doTestTaskDeadlineExceeded(false, "the following nodes have not converged to " + + "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " + + "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9"); + } + + @Test + public void task_not_completed_within_deadline_with_deferred_activation_checks_activation_version() throws Exception { + doTestTaskDeadlineExceeded(true, "the following nodes have not converged to " + + "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " + + "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9 " + + "(... and 10 more)"); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java index 067830fd470..824a7af55c4 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java @@ -428,24 +428,25 @@ public class SetNodeStateTest extends StateRestApiTest { expectedException.expect(UnknownMasterException.class); SetNodeStateRequest request = createDummySetNodeStateRequest(); - request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST)); request.getResult(); } @Test public void leadership_loss_marks_request_as_failed_for_early_out_response() { SetNodeStateRequest request = createDummySetNodeStateRequest(); - request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST)); assertTrue(request.isFailed()); } @Test public void deadline_exceeded_fails_set_node_state_request() throws Exception { - expectedException.expectMessage("Task exceeded its version wait deadline"); + expectedException.expectMessage("Task exceeded its version wait deadline: gremlins in the computer"); expectedException.expect(DeadlineExceededException.class); SetNodeStateRequest request = createDummySetNodeStateRequest(); - request.handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED); + request.handleFailure(RemoteClusterControllerTask.Failure.of( + RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED, "gremlins in the computer")); request.getResult(); } |