aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tor Brede Vekterli <vekterli@yahoo-inc.com> 2017-09-21 17:41:07 +0200
committer Tor Brede Vekterli <vekterli@yahoo-inc.com> 2017-09-25 16:05:23 +0200
commit 8c6befb4a9fb5357d33208631cc15989dab771f7 (patch)
tree 9501dda9ae920968e0a109a0922a45b3833c16aa
parent debb34547c76429e8a345299e4765824903f784c (diff)
Add configurable deadline for cluster controller tasks
Prevents an unstable cluster from potentially holding up all container request processing threads indefinitely. Deadline errors are translated into HTTP 504 errors to REST API clients.
-rw-r--r-- clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java 3
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java 22
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java 12
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java 22
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java 9
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java 9
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java 37
-rw-r--r-- clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java 16
-rw-r--r-- clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/errors/DeadlineExceededException.java 8
-rw-r--r-- clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/server/RestApiHandler.java 6
-rw-r--r-- clustercontroller-utils/src/test/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/StateRestAPITest.java 13
-rw-r--r-- configdefinitions/src/vespa/fleetcontroller.def 12
12 files changed, 153 insertions, 16 deletions
diff --git a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
index 0936f9d781a..8b80b0f7be5 100644
--- a/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
+++ b/clustercontroller-apps/src/main/java/com/yahoo/vespa/clustercontroller/apps/clustercontroller/ClusterControllerClusterConfigurer.java
@@ -10,6 +10,8 @@ import com.yahoo.cloud.config.SlobroksConfig;
import com.yahoo.vespa.config.content.StorDistributionConfig;
import com.yahoo.cloud.config.ZookeepersConfig;
+import java.time.Duration;
+
/**
* When the cluster controller is reconfigured, a new instance of this is created, which will propagate configured
* options to receivers such as the fleet controller.
@@ -70,6 +72,7 @@ public class ClusterControllerClusterConfigurer {
options.maxSlobrokDisconnectGracePeriod = (int) (config.max_slobrok_disconnect_grace_period() * 1000);
options.distributionBits = config.ideal_distribution_bits();
options.minNodeRatioPerGroup = config.min_node_ratio_per_group();
+ options.setMaxDeferredTaskVersionWaitTime(Duration.ofMillis((int)(config.max_deferred_task_version_wait_time_sec() * 1000)));
}
private void configure(SlobroksConfig config) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 030d792a63b..457fe024535 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -400,12 +400,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
private void failAllVersionDependentTasks() {
tasksPendingStateRecompute.forEach(task -> {
- task.handleLeadershipLost();
+ task.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
task.notifyCompleted();
});
tasksPendingStateRecompute.clear();
taskCompletionQueue.forEach(task -> {
- task.getTask().handleLeadershipLost();
+ task.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
task.getTask().notifyCompleted();
});
taskCompletionQueue.clear();
@@ -695,13 +695,27 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
private boolean completeSatisfiedVersionDependentTasks() {
int publishedVersion = systemStateBroadcaster.lastClusterStateVersionInSync();
long queueSizeBefore = taskCompletionQueue.size();
+ // Note: although version monotonicity of tasks in the queue should always hold,
+ // deadline monotonicity is not guaranteed, because the task timeout duration may
+ // be reconfigured. Since only the head of the queue is inspected, a task enqueued
+ // with a shorter deadline may effectively wait at least as long as tasks enqueued
+ // under a previous configuration. The current clock implementation is also
+ // susceptible to skew.
+ final long now = timer.getCurrentTimeInMillis();
while (!taskCompletionQueue.isEmpty()) {
VersionDependentTaskCompletion taskCompletion = taskCompletionQueue.peek();
+ // TODO expose and use monotonic clock instead of system clock
if (publishedVersion >= taskCompletion.getMinimumVersion()) {
log.fine(() -> String.format("Deferred task of type '%s' has minimum version %d, published is %d; completing",
taskCompletion.getTask().getClass().getName(), taskCompletion.getMinimumVersion(), publishedVersion));
taskCompletion.getTask().notifyCompleted();
taskCompletionQueue.remove();
+ } else if (taskCompletion.getDeadlineTimePointMs() <= now) {
+ log.fine(() -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure",
+ taskCompletion.getTask().getClass().getName()));
+ taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED);
+ taskCompletion.getTask().notifyCompleted();
+ taskCompletionQueue.remove();
} else {
break;
}
@@ -796,10 +810,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
* has been ACKed by all distributors in the system, those tasks will be marked as completed.
*/
private void scheduleVersionDependentTasksForFutureCompletion(int completeAtVersion) {
+ // TODO expose and use monotonic clock instead of system clock
+ final long deadlineTimePointMs = timer.getCurrentTimeInMillis() + options.getMaxDeferredTaskVersionWaitTime().toMillis();
for (RemoteClusterControllerTask task : tasksPendingStateRecompute) {
log.finest(() -> String.format("Adding task of type '%s' to be completed at version %d",
task.getClass().getName(), completeAtVersion));
- taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task));
+ taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task, deadlineTimePointMs));
}
tasksPendingStateRecompute.clear();
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
index a885d432597..d8c853f45cb 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java
@@ -7,6 +7,7 @@ import com.yahoo.vdslib.distribution.Distribution;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vespa.clustercontroller.core.status.statuspage.StatusPageServer;
+import java.time.Duration;
import java.util.*;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
@@ -116,6 +117,8 @@ public class FleetControllerOptions implements Cloneable {
// TODO: Get rid of this by always getting nodes by distribution.getNodes()
public Set<ConfiguredNode> nodes;
+ private Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30);
+
// TODO: Replace usage of this by usage where the nodes are explicitly passed (below)
public FleetControllerOptions(String clusterName) {
this.clusterName = clusterName;
@@ -139,6 +142,14 @@ public class FleetControllerOptions implements Cloneable {
this.nodes = distribution.getNodes();
}
+ public Duration getMaxDeferredTaskVersionWaitTime() {
+ return maxDeferredTaskVersionWaitTime;
+ }
+
+ public void setMaxDeferredTaskVersionWaitTime(Duration maxDeferredTaskVersionWaitTime) {
+ this.maxDeferredTaskVersionWaitTime = maxDeferredTaskVersionWaitTime;
+ }
+
public FleetControllerOptions clone() {
try {
// TODO: This should deep clone
@@ -213,6 +224,7 @@ public class FleetControllerOptions implements Cloneable {
sb.append("<tr><td><nobr>Maximum event log size</nobr></td><td align=\"right\">").append(eventLogMaxSize).append("</td></tr>");
sb.append("<tr><td><nobr>Maximum node event log size</nobr></td><td align=\"right\">").append(eventNodeLogMaxSize).append("</td></tr>");
sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(distributionBits).append("</td></tr>");
+ sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>");
sb.append("</table>");
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
index e4519a02632..d082158edc7 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
@@ -39,16 +39,30 @@ public abstract class RemoteClusterControllerTask {
*/
public boolean isFailed() { return false; }
+ public enum FailureCondition {
+ LEADERSHIP_LOST,
+ DEADLINE_EXCEEDED
+ }
+
/**
- * If the task response has been deferred due to hasVersionAckDependency(),
- * handleLeadershipLost() will be invoked on the task if the cluster controller
+ * If the task completion has been deferred due to hasVersionAckDependency(),
+ * this method will be invoked if a failure occurs before the version has
+ * been successfully ACKed.
+ *
+ * LEADERSHIP_LOST will be the failure condition if the cluster controller
* discovers it has lost leadership in the time between task execution and
- * deferred response send time.
+ * deferred completion time.
+ *
+ * DEADLINE_EXCEEDED will be the failure condition if the completion has been
+ * deferred for more than a configurable amount of time.
*
* This method will also be invoked if the controller is signalled to shut down
* before the dependent cluster version has been published.
+ *
+ * The task implementation is responsible for communicating the appropriate
+ * error semantics to the caller who initially scheduled the task.
*/
- public void handleLeadershipLost() {}
+ public void handleFailure(FailureCondition condition) {}
public boolean isCompleted() {
synchronized (monitor) {
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java
index d7b8f96005a..5d6a4f66467 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java
@@ -8,15 +8,18 @@ import java.util.Objects;
* completion depends on side effects by the task becoming visible in
* the cluster before a response can be sent. Each such task is associated
* with a particular cluster state version number representing a lower bound
- * on the published state containing the side effect.
+ * on the published state containing the side effect. Each task is also
+ * associated with a completion deadline.
*/
class VersionDependentTaskCompletion {
private final long minimumVersion;
private final RemoteClusterControllerTask task;
+ private final long deadlineTimePointMs;
- VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task) {
+ VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task, long deadlineTimePointMs) {
this.minimumVersion = minimumVersion;
this.task = task;
+ this.deadlineTimePointMs = deadlineTimePointMs;
}
long getMinimumVersion() {
@@ -27,6 +30,8 @@ class VersionDependentTaskCompletion {
return task;
}
+ long getDeadlineTimePointMs() { return deadlineTimePointMs; }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
index 27ab43b2b5a..5ac15e75127 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.clustercontroller.core.restapiv2;
import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask;
+import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.DeadlineExceededException;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.InternalFailure;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.StateRestApiException;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.UnknownMasterException;
@@ -61,8 +62,12 @@ public abstract class Request<Result> extends RemoteClusterControllerTask {
}
@Override
- public void handleLeadershipLost() {
- failure = new UnknownMasterException("Leadership lost before request could complete");
+ public void handleFailure(FailureCondition condition) {
+ if (condition == FailureCondition.LEADERSHIP_LOST) {
+ failure = new UnknownMasterException("Leadership lost before request could complete");
+ } else if (condition == FailureCondition.DEADLINE_EXCEEDED) {
+ failure = new DeadlineExceededException("Task exceeded its version wait deadline");
+ }
}
@Override
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
index d9815241920..b10a8101c37 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
@@ -16,6 +16,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import java.time.Duration;
import java.util.*;
import java.util.logging.Logger;
@@ -1248,16 +1249,25 @@ public class StateChangeTest extends FleetControllerTest {
private static abstract class MockTask extends RemoteClusterControllerTask {
boolean invoked = false;
boolean leadershipLost = false;
+ boolean deadlineExceeded = false;
boolean isInvoked() { return invoked; }
boolean isLeadershipLost() { return leadershipLost; }
+ boolean isDeadlineExceeded() { return deadlineExceeded; }
+
@Override
public boolean hasVersionAckDependency() { return true; }
@Override
- public void handleLeadershipLost() { this.leadershipLost = true; }
+ public void handleFailure(FailureCondition condition) {
+ if (condition == FailureCondition.LEADERSHIP_LOST) {
+ this.leadershipLost = true;
+ } else if (condition == FailureCondition.DEADLINE_EXCEEDED) {
+ this.deadlineExceeded = true;
+ }
+ }
}
// We create an explicit mock task class instead of using mock() simply because of
@@ -1541,4 +1551,29 @@ public class StateChangeTest extends FleetControllerTest {
assertTrue(task.isCompleted());
}
+ @Test
+ public void task_not_completed_within_deadline_is_failed_with_deadline_exceeded_error() throws Exception {
+ FleetControllerOptions options = defaultOptions();
+ options.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60));
+ RemoteTaskFixture fixture = createFixtureWith(options);
+
+ MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects();
+ communicator.setShouldDeferDistributorClusterStateAcks(true);
+ fixture.processScheduledTask();
+
+ assertTrue(task.isInvoked());
+ assertFalse(task.isCompleted());
+ assertFalse(task.isDeadlineExceeded());
+
+ timer.advanceTime(59_000);
+ ctrl.tick();
+ assertFalse(task.isCompleted());
+ assertFalse(task.isDeadlineExceeded());
+
+ timer.advanceTime(1_001);
+ ctrl.tick();
+ assertTrue(task.isCompleted());
+ assertTrue(task.isDeadlineExceeded());
+ }
+
}
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
index 416c57ce5d7..4fb244666a4 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
@@ -2,7 +2,9 @@
package com.yahoo.vespa.clustercontroller.core.restapiv2;
import com.yahoo.vdslib.state.NodeType;
+import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask;
import com.yahoo.vespa.clustercontroller.core.restapiv2.requests.SetNodeStateRequest;
+import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.DeadlineExceededException;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.InvalidContentException;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.MissingUnitException;
import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.OperationNotSupportedForUnitException;
@@ -403,15 +405,25 @@ public class SetNodeStateTest extends StateRestApiTest {
expectedException.expect(UnknownMasterException.class);
SetNodeStateRequest request = createDummySetNodeStateRequest();
- request.handleLeadershipLost();
+ request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
request.getResult();
}
@Test
public void leadership_loss_marks_request_as_failed_for_early_out_response() {
SetNodeStateRequest request = createDummySetNodeStateRequest();
- request.handleLeadershipLost();
+ request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST);
assertTrue(request.isFailed());
}
+ @Test
+ public void deadline_exceeded_fails_set_node_state_request() throws Exception {
+ expectedException.expectMessage("Task exceeded its version wait deadline");
+ expectedException.expect(DeadlineExceededException.class);
+
+ SetNodeStateRequest request = createDummySetNodeStateRequest();
+ request.handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED);
+ request.getResult();
+ }
+
}
diff --git a/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/errors/DeadlineExceededException.java b/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/errors/DeadlineExceededException.java
new file mode 100644
index 00000000000..55032d515ce
--- /dev/null
+++ b/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/errors/DeadlineExceededException.java
@@ -0,0 +1,8 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.clustercontroller.utils.staterestapi.errors;
+
+public class DeadlineExceededException extends StateRestApiException {
+ public DeadlineExceededException(String description) {
+ super(description);
+ }
+}
diff --git a/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/server/RestApiHandler.java b/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/server/RestApiHandler.java
index 395958063f5..71e6bc36de1 100644
--- a/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/server/RestApiHandler.java
+++ b/clustercontroller-utils/src/main/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/server/RestApiHandler.java
@@ -89,6 +89,12 @@ public class RestApiHandler implements HttpRequestHandler {
result.setHttpCode(503, "Service Unavailable");
result.setJson(jsonWriter.createErrorJson(exception.getMessage()));
return result;
+ } catch (DeadlineExceededException exception) {
+ logRequestException(request, exception, Level.WARNING);
+ JsonHttpResult result = new JsonHttpResult();
+ result.setHttpCode(504, "Gateway Timeout");
+ result.setJson(jsonWriter.createErrorJson(exception.getMessage()));
+ return result;
} catch (StateRestApiException exception) {
logRequestException(request, exception, Level.WARNING);
JsonHttpResult result = new JsonHttpResult();
diff --git a/clustercontroller-utils/src/test/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/StateRestAPITest.java b/clustercontroller-utils/src/test/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/StateRestAPITest.java
index 8328ecc491f..3492c632e60 100644
--- a/clustercontroller-utils/src/test/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/StateRestAPITest.java
+++ b/clustercontroller-utils/src/test/java/com/yahoo/vespa/clustercontroller/utils/staterestapi/StateRestAPITest.java
@@ -526,4 +526,17 @@ public class StateRestAPITest {
} catch (IllegalArgumentException e) {
}
}
+
+ @Test
+ public void deadline_exceeded_exception_returns_http_504_error() throws Exception {
+ setupDummyStateApi();
+ stateApi.induceException(new DeadlineExceededException("argh!"));
+ HttpResult result = execute(new HttpRequest().setPath("/cluster/v2"));
+
+ assertEquals(result.toString(true), 504, result.getHttpReturnCode());
+ assertEquals(result.toString(true), "Gateway Timeout", result.getHttpReturnCodeDescription());
+ assertEquals(result.toString(true), "application/json", result.getHeader("Content-Type"));
+ String expected = "{\"message\":\"argh!\"}";
+ assertEquals(expected, result.getContent().toString());
+ }
}
diff --git a/configdefinitions/src/vespa/fleetcontroller.def b/configdefinitions/src/vespa/fleetcontroller.def
index 8f42b5e7075..028bece1db2 100644
--- a/configdefinitions/src/vespa/fleetcontroller.def
+++ b/configdefinitions/src/vespa/fleetcontroller.def
@@ -107,7 +107,7 @@ min_distributor_up_ratio double default=0.01
min_storage_up_ratio double default=0.01
## Seconds to sleep after doing a work cycle where we did no work. Some
-## events do not interrupt the sleeping, such as slobrok changes, so shouldnt
+## events do not interrupt the sleeping, such as slobrok changes, so shouldn't
## set this too high
cycle_wait_time double default=0.1
@@ -122,7 +122,7 @@ min_time_before_first_system_state_broadcast double default=5.0
## always have a pending operation with very low cost. Keeping a low timeout is
## good to detect issues like packet loss. The default tries to balance the two
## by not resending too often, but detecting packet loss within a minute at
-## least. If we can guarantuee RPC layer to fail on packet loss within
+## least. If we can guarantee RPC layer to fail on packet loss within
## reasonable time we should increase this default.
get_node_state_request_timeout double default=120.0
@@ -146,3 +146,11 @@ ideal_distribution_bits int default=16
## availability has been restored above the given threshold.
## Default is 0, i.e. functionality is for all intents and purposes disabled.
min_node_ratio_per_group double default=0.0
+
+## If a cluster controller task has a dependency on a given cluster state
+## version being published and ACKed by the cluster, it will be put on a wait
+## queue while holding up the container thread associated with the task.
+## This config specifies the maximum time a task can be held in this queue
+## before being automatically failed out, if a version has not been ACKed
+## within this duration.
+max_deferred_task_version_wait_time_sec double default=30.0
\ No newline at end of file