diff options
author | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2017-10-12 17:04:54 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-12 17:04:54 +0200 |
commit | 50b3993733265a0455d78b12989c9f0391527a5d (patch) | |
tree | 023027060d9b243c6771b5d69e1f96b15c2ec4fc /clustercontroller-core/src/main | |
parent | 0f2f3accdc4803781920285fa9656c820d41a20a (diff) | |
parent | 8c6befb4a9fb5357d33208631cc15989dab771f7 (diff) |
Merge pull request #3525 from vespa-engine/vekterli/re-enable-synchronous-set-node-state
Re-enable synchronous set node state with additional safeguards
Diffstat (limited to 'clustercontroller-core/src/main')
6 files changed, 83 insertions, 16 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 76d59622fe7..a5419c64818 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -400,12 +400,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private void failAllVersionDependentTasks() { tasksPendingStateRecompute.forEach(task -> { - task.handleLeadershipLost(); + task.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); task.notifyCompleted(); }); tasksPendingStateRecompute.clear(); taskCompletionQueue.forEach(task -> { - task.getTask().handleLeadershipLost(); + task.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); task.getTask().notifyCompleted(); }); taskCompletionQueue.clear(); @@ -665,8 +665,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd final RemoteClusterControllerTask task = remoteTasks.poll(); log.finest(() -> String.format("Processing remote task of type '%s'", task.getClass().getName())); task.doRemoteFleetControllerTask(context); - // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). - if (!isMaster() || !task.hasVersionAckDependency()) { + if (taskMayBeCompletedImmediately(task)) { log.finest(() -> String.format("Done processing remote task of type '%s'", task.getClass().getName())); task.notifyCompleted(); } else { @@ -678,6 +677,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd return false; } + private boolean taskMayBeCompletedImmediately(RemoteClusterControllerTask task) { + // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). + return (!task.hasVersionAckDependency() || task.isFailed() || !masterElectionHandler.isMaster()); + } + private RemoteClusterControllerTask.Context createRemoteTaskProcessingContext() { final RemoteClusterControllerTask.Context context = new RemoteClusterControllerTask.Context(); context.cluster = cluster; @@ -691,13 +695,27 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private boolean completeSatisfiedVersionDependentTasks() { int publishedVersion = systemStateBroadcaster.lastClusterStateVersionInSync(); long queueSizeBefore = taskCompletionQueue.size(); + // Note: although version monotonicity of tasks in queue always should hold, + // deadline monotonicity is not guaranteed to do so due to reconfigs of task + // timeout durations. Means that tasks enqueued with shorter deadline duration + // might be observed as having at least the same timeout as tasks enqueued during + // a previous configuration. Current clock implementation is also susceptible to + // skewing. + final long now = timer.getCurrentTimeInMillis(); while (!taskCompletionQueue.isEmpty()) { VersionDependentTaskCompletion taskCompletion = taskCompletionQueue.peek(); + // TODO expose and use monotonic clock instead of system clock if (publishedVersion >= taskCompletion.getMinimumVersion()) { log.fine(() -> String.format("Deferred task of type '%s' has minimum version %d, published is %d; completing", taskCompletion.getTask().getClass().getName(), taskCompletion.getMinimumVersion(), publishedVersion)); taskCompletion.getTask().notifyCompleted(); taskCompletionQueue.remove(); + } else if (taskCompletion.getDeadlineTimePointMs() <= now) { + log.fine(() -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure", + taskCompletion.getTask().getClass().getName())); + taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED); + taskCompletion.getTask().notifyCompleted(); + taskCompletionQueue.remove(); } else { break; } @@ -792,10 +810,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd * has been ACKed by all distributors in the system, those tasks will be marked as completed. */ private void scheduleVersionDependentTasksForFutureCompletion(int completeAtVersion) { + // TODO expose and use monotonic clock instead of system clock + final long deadlineTimePointMs = timer.getCurrentTimeInMillis() + options.getMaxDeferredTaskVersionWaitTime().toMillis(); for (RemoteClusterControllerTask task : tasksPendingStateRecompute) { log.finest(() -> String.format("Adding task of type '%s' to be completed at version %d", task.getClass().getName(), completeAtVersion)); - taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task)); + taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task, deadlineTimePointMs)); } tasksPendingStateRecompute.clear(); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index a885d432597..d8c853f45cb 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -7,6 +7,7 @@ import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vespa.clustercontroller.core.status.statuspage.StatusPageServer; +import java.time.Duration; import java.util.*; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; @@ -116,6 +117,8 @@ public class FleetControllerOptions implements Cloneable { // TODO: Get rid of this by always getting nodes by distribution.getNodes() public Set<ConfiguredNode> nodes; + private Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30); + // TODO: Replace usage of this by usage where the nodes are explicitly passed (below) public FleetControllerOptions(String clusterName) { this.clusterName = clusterName; @@ -139,6 +142,14 @@ public class FleetControllerOptions implements Cloneable { this.nodes = distribution.getNodes(); } + public Duration getMaxDeferredTaskVersionWaitTime() { + return maxDeferredTaskVersionWaitTime; + } + + public void setMaxDeferredTaskVersionWaitTime(Duration maxDeferredTaskVersionWaitTime) { + this.maxDeferredTaskVersionWaitTime = maxDeferredTaskVersionWaitTime; + } + public FleetControllerOptions clone() { try { // TODO: This should deep clone @@ -213,6 +224,7 @@ public class FleetControllerOptions implements Cloneable { sb.append("<tr><td><nobr>Maximum event log size</nobr></td><td align=\"right\">").append(eventLogMaxSize).append("</td></tr>"); sb.append("<tr><td><nobr>Maximum node event log size</nobr></td><td align=\"right\">").append(eventNodeLogMaxSize).append("</td></tr>"); sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(distributionBits).append("</td></tr>"); + sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>"); sb.append("</table>"); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java index 0fb9e5798c8..d082158edc7 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java @@ -32,15 +32,37 @@ public abstract class RemoteClusterControllerTask { public boolean hasVersionAckDependency() { return false; } /** - * If the task response has been deferred due to hasVersionAckDependency(), - * handleLeadershipLost() will be invoked on the task if the cluster controller + * If true, signals that a task has failed and can be immediately marked as + * complete without waiting for a version ACK. The task implementation has + * the responsibility of communicating any failure to the caller, and ensuring + * that the lack of version waiting does not violate any invariants. + */ + public boolean isFailed() { return false; } + + public enum FailureCondition { + LEADERSHIP_LOST, + DEADLINE_EXCEEDED + } + + /** + * If the task completion has been deferred due to hasVersionAckDependency(), + * this method will be invoked if a failure occurs before the version has + * been successfully ACKed. + * + * LEADERSHIP_LOST will be the failure condition if the cluster controller * discovers it has lost leadership in the time between task execution and - * deferred response send time. + * deferred completion time. + * + * DEADLINE_EXCEEDED will be the failure condition if the completion has been + * deferred for more than a configurable amount of time. * * This method will also be invoked if the controller is signalled to shut down * before the dependent cluster version has been published. + * + * The task implementation is responsible for communicating the appropriate + * error semantics to the caller who initially scheduled the task. */ - public void handleLeadershipLost() {} + public void handleFailure(FailureCondition condition) {} public boolean isCompleted() { synchronized (monitor) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java index d7b8f96005a..5d6a4f66467 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java @@ -8,15 +8,18 @@ import java.util.Objects; * completion depends on side effects by the task becoming visible in * the cluster before a response can be sent. Each such task is associated * with a particular cluster state version number representing a lower bound - * on the published state containing the side effect. + * on the published state containing the side effect. Each task is also + * associated with a completion deadline. */ class VersionDependentTaskCompletion { private final long minimumVersion; private final RemoteClusterControllerTask task; + private final long deadlineTimePointMs; - VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task) { + VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task, long deadlineTimePointMs) { this.minimumVersion = minimumVersion; this.task = task; + this.deadlineTimePointMs = deadlineTimePointMs; } long getMinimumVersion() { @@ -27,6 +30,8 @@ class VersionDependentTaskCompletion { return task; } + long getDeadlineTimePointMs() { return deadlineTimePointMs; } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java index 924614df327..5ac15e75127 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2; import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask; +import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.DeadlineExceededException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.InternalFailure; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.StateRestApiException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.UnknownMasterException; @@ -61,8 +62,17 @@ public abstract class Request<Result> extends RemoteClusterControllerTask { } @Override - public void handleLeadershipLost() { - failure = new UnknownMasterException("Leadership lost before request could complete"); + public void handleFailure(FailureCondition condition) { + if (condition == FailureCondition.LEADERSHIP_LOST) { + failure = new UnknownMasterException("Leadership lost before request could complete"); + } else if (condition == FailureCondition.DEADLINE_EXCEEDED) { + failure = new DeadlineExceededException("Task exceeded its version wait deadline"); + } + } + + @Override + public boolean isFailed() { + return (failure != null); } public abstract Result calculateResult(Context context) throws StateRestApiException, OtherMasterIndexException; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index 2c094769961..cf41707db75 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -67,9 +67,7 @@ public class SetNodeStateRequest extends Request<SetResponse> { @Override public boolean hasVersionAckDependency() { - // FIXME this is a temporary change while edge cases in interactions between controller - // and orchestration are sorted out. - return false; + return (responseWait == SetUnitStateRequest.ResponseWait.WAIT_UNTIL_CLUSTER_ACKED); } static SetResponse setWantedState( |