summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java51
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java2
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java29
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java19
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java58
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java9
6 files changed, 142 insertions, 26 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index cbff11af730..4a619fa47f2 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -426,12 +426,14 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
private void failAllVersionDependentTasks() {
tasksPendingStateRecompute.forEach(task -> {
- task.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+ task.handleFailure(RemoteClusterControllerTask.Failure.of(
+ RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
task.notifyCompleted();
});
tasksPendingStateRecompute.clear();
taskCompletionQueue.forEach(task -> {
- task.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+ task.getTask().handleFailure(RemoteClusterControllerTask.Failure.of(
+ RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
task.getTask().notifyCompleted();
});
taskCompletionQueue.clear();
@@ -747,6 +749,43 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
return context;
}
+    /**
+     * Picks the cluster state version that counts as "converged to" for the given node:
+     * the activation-acked version when the bundle uses deferred activation, otherwise
+     * the bundle-acknowledged version.
+     */
+    private static long effectiveActivatedStateVersion(NodeInfo nodeInfo, ClusterStateBundle bundle) {
+        if (!bundle.deferredActivation()) {
+            return nodeInfo.getClusterStateVersionBundleAcknowledged();
+        }
+        return nodeInfo.getClusterStateVersionActivationAcked();
+    }
+
+    /**
+     * Lists the nodes whose effective activated state version is lower than {@code version}.
+     * Returns an empty list when the state broadcaster has no cluster state bundle.
+     */
+    private List<Node> enumerateNodesNotYetAckedAtLeastVersion(long version) {
+        var currentBundle = systemStateBroadcaster.getClusterStateBundle();
+        if (currentBundle != null) {
+            return cluster.getNodeInfo().stream()
+                    .filter(info -> effectiveActivatedStateVersion(info, currentBundle) < version)
+                    .map(NodeInfo::getNode)
+                    .collect(Collectors.toList());
+        }
+        return List.of();
+    }
+
+    /**
+     * Renders the elements of {@code list} as a comma-separated string, printing at most
+     * {@code limit} elements. When elements are left out, a "(... and N more)" suffix
+     * states how many were omitted.
+     *
+     * A negative limit is treated as zero instead of being handed to subList(), which
+     * would throw. Null elements are rendered as "null" rather than raising a
+     * NullPointerException.
+     */
+    private static <E> String stringifyListWithLimits(List<E> list, int limit) {
+        // The limit comes from a plain mutable options field, so clamp it into [0, list.size()].
+        int printed = Math.min(Math.max(limit, 0), list.size());
+        String joined = list.stream()
+                .limit(printed)
+                .map(e -> String.valueOf(e))
+                .collect(Collectors.joining(", "));
+        int omitted = list.size() - printed;
+        if (omitted == 0) {
+            return joined;
+        }
+        return String.format("%s (... and %d more)", joined, omitted);
+    }
+
+    /**
+     * Builds a message naming the nodes that have not converged to at least
+     * {@code taskConvergeVersion}, or returns the empty string when there are none.
+     * The number of nodes printed is capped by options.maxDivergentNodesPrintedInTaskErrorMessages.
+     */
+    private String buildNodesNotYetConvergedMessage(long taskConvergeVersion) {
+        var laggingNodes = enumerateNodesNotYetAckedAtLeastVersion(taskConvergeVersion);
+        if (!laggingNodes.isEmpty()) {
+            var printedNodes = stringifyListWithLimits(laggingNodes, options.maxDivergentNodesPrintedInTaskErrorMessages);
+            return String.format("the following nodes have not converged to at least version %d: %s",
+                                 taskConvergeVersion, printedNodes);
+        }
+        return "";
+    }
+
private boolean completeSatisfiedVersionDependentTasks() {
int publishedVersion = systemStateBroadcaster.lastClusterStateVersionInSync();
long queueSizeBefore = taskCompletionQueue.size();
@@ -766,9 +805,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
taskCompletion.getTask().notifyCompleted();
taskCompletionQueue.remove();
} else if (taskCompletion.getDeadlineTimePointMs() <= now) {
- log.log(LogLevel.WARNING, () -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure",
- taskCompletion.getTask().getClass().getName()));
- taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED);
+ var details = buildNodesNotYetConvergedMessage(taskCompletion.getMinimumVersion());
+ log.log(LogLevel.WARNING, () -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure (details: %s)",
+ taskCompletion.getTask().getClass().getName(), details));
+ taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.Failure.of(
+ RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED, details));
taskCompletion.getTask().notifyCompleted();
taskCompletionQueue.remove();
} else {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index 5e9e91e1cb6..553b3332ee8 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -126,6 +126,8 @@ public class FleetControllerOptions implements Cloneable {
// TODO: Choose a default value
public double minMergeCompletionRatio = 1.0;
+ public int maxDivergentNodesPrintedInTaskErrorMessages = 10;
+
// TODO: Replace usage of this by usage where the nodes are explicitly passed (below)
public FleetControllerOptions(String clusterName) {
this.clusterName = clusterName;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
index 8382e127e13..9322ba9ec58 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
@@ -48,6 +48,30 @@ public abstract class RemoteClusterControllerTask {
DEADLINE_EXCEEDED
}
+    /**
+     * Describes why a task failed: the failure condition plus an optional
+     * human-readable detail message.
+     */
+    public static class Failure {
+        private final FailureCondition condition;
+        private final String message;
+
+        private Failure(FailureCondition condition, String message) {
+            this.condition = condition;
+            // Normalize a missing message to "" so getMessage() never returns null,
+            // matching what the single-argument factory stores.
+            this.message = (message != null) ? message : "";
+        }
+
+        /** Creates a failure with the given condition and detail message; a null message is treated as empty. */
+        public static Failure of(FailureCondition condition, String message) {
+            return new Failure(condition, message);
+        }
+
+        /** Creates a failure with the given condition and no detail message. */
+        public static Failure of(FailureCondition condition) {
+            return new Failure(condition, "");
+        }
+
+        public FailureCondition getCondition() {
+            return condition;
+        }
+
+        /** Returns the detail message, or the empty string if there is none. Never null. */
+        public String getMessage() {
+            return message;
+        }
+    }
+
/**
* If the task completion has been deferred due to hasVersionAckDependency(),
* this method will be invoked if a failure occurs before the version has
@@ -64,9 +88,10 @@ public abstract class RemoteClusterControllerTask {
* before the dependent cluster version has been published.
*
* The task implementation is responsible for communicating the appropriate
- * error semantics to the caller who initially scheduled the task.
+ * error semantics to the caller who initially scheduled the task. If additional
+ * details are available, Failure.getMessage() will return a non-empty string.
*/
- public void handleFailure(FailureCondition condition) {}
+ public void handleFailure(Failure failure) {}
public Optional<Instant> getDeadline() {
return Optional.empty();
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
index 4ef62ad3fdf..c1b653074a5 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
@@ -61,12 +61,21 @@ public abstract class Request<Result> extends RemoteClusterControllerTask {
}
}
+    /** Returns "prefix: message" when message is non-null and non-empty, otherwise just the prefix. */
+    private static String failureStringWithPossibleMessage(String prefix, String message) {
+        boolean hasDetails = (message != null) && !message.isEmpty();
+        return hasDetails ? String.format("%s: %s", prefix, message) : prefix;
+    }
+
@Override
- public void handleFailure(FailureCondition condition) {
- if (condition == FailureCondition.LEADERSHIP_LOST) {
- failure = new UnknownMasterException("Leadership lost before request could complete");
- } else if (condition == FailureCondition.DEADLINE_EXCEEDED) {
- failure = new DeadlineExceededException("Task exceeded its version wait deadline");
+ public void handleFailure(Failure failure) {
+ if (failure.getCondition() == FailureCondition.LEADERSHIP_LOST) {
+ this.failure = new UnknownMasterException(failureStringWithPossibleMessage(
+ "Leadership lost before request could complete", failure.getMessage()));
+ } else if (failure.getCondition() == FailureCondition.DEADLINE_EXCEEDED) {
+ this.failure = new DeadlineExceededException(failureStringWithPossibleMessage(
+ "Task exceeded its version wait deadline", failure.getMessage()));
}
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
index 2c8220c0dba..5f9e0d56cfa 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
@@ -1263,25 +1263,24 @@ public class StateChangeTest extends FleetControllerTest {
private static abstract class MockTask extends RemoteClusterControllerTask {
boolean invoked = false;
- boolean leadershipLost = false;
- boolean deadlineExceeded = false;
+ Failure failure;
boolean isInvoked() { return invoked; }
- boolean isLeadershipLost() { return leadershipLost; }
+ boolean isLeadershipLost() {
+ return (failure != null) && (failure.getCondition() == FailureCondition.LEADERSHIP_LOST);
+ }
- boolean isDeadlineExceeded() { return deadlineExceeded; }
+ boolean isDeadlineExceeded() {
+ return (failure != null) && (failure.getCondition() == FailureCondition.DEADLINE_EXCEEDED);
+ }
@Override
public boolean hasVersionAckDependency() { return true; }
@Override
- public void handleFailure(FailureCondition condition) {
- if (condition == FailureCondition.LEADERSHIP_LOST) {
- this.leadershipLost = true;
- } else if (condition == FailureCondition.DEADLINE_EXCEEDED) {
- this.deadlineExceeded = true;
- }
+ public void handleFailure(Failure failure) {
+ this.failure = failure;
}
}
@@ -1600,4 +1599,43 @@ public class StateChangeTest extends FleetControllerTest {
assertTrue(task.isDeadlineExceeded());
}
+    /**
+     * Schedules a version-dependent task while distributor cluster state ACKs are deferred,
+     * advances the timer by 60_001 (the wait limit is configured as 60 seconds) and verifies
+     * that the task completes as deadline-exceeded with the expected failure message.
+     */
+    private void doTestTaskDeadlineExceeded(boolean deferredActivation, String expectedMessage) throws Exception {
+        FleetControllerOptions opts = defaultOptions();
+        opts.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60));
+        opts.enableTwoPhaseClusterStateActivation = deferredActivation;
+        opts.maxDivergentNodesPrintedInTaskErrorMessages = 10;
+        RemoteTaskFixture taskFixture = createFixtureWith(opts);
+
+        MockTask deferredTask = taskFixture.scheduleVersionDependentTaskWithSideEffects();
+        communicator.setShouldDeferDistributorClusterStateAcks(true);
+        taskFixture.processScheduledTask();
+
+        // The task has run, but is neither completed nor failed yet.
+        assertTrue(deferredTask.isInvoked());
+        assertFalse(deferredTask.isCompleted());
+        assertFalse(deferredTask.isDeadlineExceeded());
+
+        timer.advanceTime(60_001);
+        ctrl.tick();
+
+        assertTrue(deferredTask.isCompleted());
+        assertTrue(deferredTask.isDeadlineExceeded());
+        // Without two-phase activation every storage node has ACKed the bundle and only the
+        // explicitly deferred distributors are reported. With two-phase activation both the
+        // distributors and the storage nodes are reported.
+        assertEquals(expectedMessage, deferredTask.failure.getMessage());
+    }
+
+    @Test
+    public void task_not_completed_within_deadline_lists_nodes_not_converged_in_error_message() throws Exception {
+        // Single-phase activation: the message is expected to list the ten distributors only.
+        String expectedMessage = "the following nodes have not converged to " +
+                "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " +
+                "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9";
+        doTestTaskDeadlineExceeded(false, expectedMessage);
+    }
+
+    @Test
+    public void task_not_completed_within_deadline_with_deferred_activation_checks_activation_version() throws Exception {
+        // Two-phase activation: more nodes lag than the print limit of 10, so the message
+        // is expected to end with the "(... and 10 more)" suffix.
+        String expectedMessage = "the following nodes have not converged to " +
+                "at least version 4: distributor.0, distributor.1, distributor.2, distributor.3, " +
+                "distributor.4, distributor.5, distributor.6, distributor.7, distributor.8, distributor.9 " +
+                "(... and 10 more)";
+        doTestTaskDeadlineExceeded(true, expectedMessage);
+    }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
index 067830fd470..824a7af55c4 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
@@ -428,24 +428,25 @@ public class SetNodeStateTest extends StateRestApiTest {
expectedException.expect(UnknownMasterException.class);
SetNodeStateRequest request = createDummySetNodeStateRequest();
- request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+ request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
request.getResult();
}
@Test
public void leadership_loss_marks_request_as_failed_for_early_out_response() {
SetNodeStateRequest request = createDummySetNodeStateRequest();
- request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
+ request.handleFailure(RemoteClusterControllerTask.Failure.of(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST));
assertTrue(request.isFailed());
}
@Test
public void deadline_exceeded_fails_set_node_state_request() throws Exception {
- expectedException.expectMessage("Task exceeded its version wait deadline");
+ expectedException.expectMessage("Task exceeded its version wait deadline: gremlins in the computer");
expectedException.expect(DeadlineExceededException.class);
SetNodeStateRequest request = createDummySetNodeStateRequest();
- request.handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED);
+ request.handleFailure(RemoteClusterControllerTask.Failure.of(
+ RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED, "gremlins in the computer"));
request.getResult();
}