diff options
author | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2017-10-12 17:04:54 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-12 17:04:54 +0200 |
commit | 50b3993733265a0455d78b12989c9f0391527a5d (patch) | |
tree | 023027060d9b243c6771b5d69e1f96b15c2ec4fc /clustercontroller-core | |
parent | 0f2f3accdc4803781920285fa9656c820d41a20a (diff) | |
parent | 8c6befb4a9fb5357d33208631cc15989dab771f7 (diff) |
Merge pull request #3525 from vespa-engine/vekterli/re-enable-synchronous-set-node-state
Re-enable synchronous set node state with additional safeguards
Diffstat (limited to 'clustercontroller-core')
8 files changed, 165 insertions, 21 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 76d59622fe7..a5419c64818 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -400,12 +400,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private void failAllVersionDependentTasks() { tasksPendingStateRecompute.forEach(task -> { - task.handleLeadershipLost(); + task.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); task.notifyCompleted(); }); tasksPendingStateRecompute.clear(); taskCompletionQueue.forEach(task -> { - task.getTask().handleLeadershipLost(); + task.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); task.getTask().notifyCompleted(); }); taskCompletionQueue.clear(); @@ -665,8 +665,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd final RemoteClusterControllerTask task = remoteTasks.poll(); log.finest(() -> String.format("Processing remote task of type '%s'", task.getClass().getName())); task.doRemoteFleetControllerTask(context); - // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). - if (!isMaster() || !task.hasVersionAckDependency()) { + if (taskMayBeCompletedImmediately(task)) { log.finest(() -> String.format("Done processing remote task of type '%s'", task.getClass().getName())); task.notifyCompleted(); } else { @@ -678,6 +677,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd return false; } + private boolean taskMayBeCompletedImmediately(RemoteClusterControllerTask task) { + // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). + return (!task.hasVersionAckDependency() || task.isFailed() || !masterElectionHandler.isMaster()); + } + private RemoteClusterControllerTask.Context createRemoteTaskProcessingContext() { final RemoteClusterControllerTask.Context context = new RemoteClusterControllerTask.Context(); context.cluster = cluster; @@ -691,13 +695,27 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd private boolean completeSatisfiedVersionDependentTasks() { int publishedVersion = systemStateBroadcaster.lastClusterStateVersionInSync(); long queueSizeBefore = taskCompletionQueue.size(); + // Note: although version monotonicity of tasks in queue always should hold, + // deadline monotonicity is not guaranteed to do so due to reconfigs of task + // timeout durations. Means that tasks enqueued with shorter deadline duration + // might be observed as having at least the same timeout as tasks enqueued during + // a previous configuration. Current clock implementation is also susceptible to + // skewing. + final long now = timer.getCurrentTimeInMillis(); while (!taskCompletionQueue.isEmpty()) { VersionDependentTaskCompletion taskCompletion = taskCompletionQueue.peek(); + // TODO expose and use monotonic clock instead of system clock if (publishedVersion >= taskCompletion.getMinimumVersion()) { log.fine(() -> String.format("Deferred task of type '%s' has minimum version %d, published is %d; completing", taskCompletion.getTask().getClass().getName(), taskCompletion.getMinimumVersion(), publishedVersion)); taskCompletion.getTask().notifyCompleted(); taskCompletionQueue.remove(); + } else if (taskCompletion.getDeadlineTimePointMs() <= now) { + log.fine(() -> String.format("Deferred task of type '%s' has exceeded wait deadline; completing with failure", + taskCompletion.getTask().getClass().getName())); + taskCompletion.getTask().handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED); + taskCompletion.getTask().notifyCompleted(); + taskCompletionQueue.remove(); } else { break; } @@ -792,10 +810,12 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd * has been ACKed by all distributors in the system, those tasks will be marked as completed. */ private void scheduleVersionDependentTasksForFutureCompletion(int completeAtVersion) { + // TODO expose and use monotonic clock instead of system clock + final long deadlineTimePointMs = timer.getCurrentTimeInMillis() + options.getMaxDeferredTaskVersionWaitTime().toMillis(); for (RemoteClusterControllerTask task : tasksPendingStateRecompute) { log.finest(() -> String.format("Adding task of type '%s' to be completed at version %d", task.getClass().getName(), completeAtVersion)); - taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task)); + taskCompletionQueue.add(new VersionDependentTaskCompletion(completeAtVersion, task, deadlineTimePointMs)); } tasksPendingStateRecompute.clear(); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java index a885d432597..d8c853f45cb 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetControllerOptions.java @@ -7,6 +7,7 @@ import com.yahoo.vdslib.distribution.Distribution; import com.yahoo.vdslib.state.NodeType; import com.yahoo.vespa.clustercontroller.core.status.statuspage.StatusPageServer; +import java.time.Duration; import java.util.*; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; @@ -116,6 +117,8 @@ public class FleetControllerOptions implements Cloneable { // TODO: Get rid of this by always getting nodes by distribution.getNodes() public Set<ConfiguredNode> nodes; + private Duration maxDeferredTaskVersionWaitTime = Duration.ofSeconds(30); + // TODO: Replace usage of this by usage where the nodes are explicitly passed (below) public FleetControllerOptions(String clusterName) { this.clusterName = clusterName; @@ -139,6 +142,14 @@ public class FleetControllerOptions implements Cloneable { this.nodes = distribution.getNodes(); } + public Duration getMaxDeferredTaskVersionWaitTime() { + return maxDeferredTaskVersionWaitTime; + } + + public void setMaxDeferredTaskVersionWaitTime(Duration maxDeferredTaskVersionWaitTime) { + this.maxDeferredTaskVersionWaitTime = maxDeferredTaskVersionWaitTime; + } + public FleetControllerOptions clone() { try { // TODO: This should deep clone @@ -213,6 +224,7 @@ public class FleetControllerOptions implements Cloneable { sb.append("<tr><td><nobr>Maximum event log size</nobr></td><td align=\"right\">").append(eventLogMaxSize).append("</td></tr>"); sb.append("<tr><td><nobr>Maximum node event log size</nobr></td><td align=\"right\">").append(eventNodeLogMaxSize).append("</td></tr>"); sb.append("<tr><td><nobr>Wanted distribution bits</nobr></td><td align=\"right\">").append(distributionBits).append("</td></tr>"); + sb.append("<tr><td><nobr>Max deferred task version wait time</nobr></td><td align=\"right\">").append(maxDeferredTaskVersionWaitTime.toMillis()).append("ms</td></tr>"); sb.append("</table>"); } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java index 0fb9e5798c8..d082158edc7 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java @@ -32,15 +32,37 @@ public abstract class RemoteClusterControllerTask { public boolean hasVersionAckDependency() { return false; } /** - * If the task response has been deferred due to hasVersionAckDependency(), - * handleLeadershipLost() will be invoked on the task if the cluster controller + * If true, signals that a task has failed and can be immediately marked as + * complete without waiting for a version ACK. The task implementation has + * the responsibility of communicating any failure to the caller, and ensuring + * that the lack of version waiting does not violate any invariants. + */ + public boolean isFailed() { return false; } + + public enum FailureCondition { + LEADERSHIP_LOST, + DEADLINE_EXCEEDED + } + + /** + * If the task completion has been deferred due to hasVersionAckDependency(), + * this method will be invoked if a failure occurs before the version has + * been successfully ACKed. + * + * LEADERSHIP_LOST will be the failure condition if the cluster controller * discovers it has lost leadership in the time between task execution and - * deferred response send time. + * deferred completion time. + * + * DEADLINE_EXCEEDED will be the failure condition if the completion has been + * deferred for more than a configurable amount of time. * * This method will also be invoked if the controller is signalled to shut down * before the dependent cluster version has been published. + * + * The task implementation is responsible for communicating the appropriate + * error semantics to the caller who initially scheduled the task. */ - public void handleLeadershipLost() {} + public void handleFailure(FailureCondition condition) {} public boolean isCompleted() { synchronized (monitor) { diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java index d7b8f96005a..5d6a4f66467 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/VersionDependentTaskCompletion.java @@ -8,15 +8,18 @@ import java.util.Objects; * completion depends on side effects by the task becoming visible in * the cluster before a response can be sent. Each such task is associated * with a particular cluster state version number representing a lower bound - * on the published state containing the side effect. + * on the published state containing the side effect. Each task is also + * associated with a completion deadline. */ class VersionDependentTaskCompletion { private final long minimumVersion; private final RemoteClusterControllerTask task; + private final long deadlineTimePointMs; - VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task) { + VersionDependentTaskCompletion(long minimumVersion, RemoteClusterControllerTask task, long deadlineTimePointMs) { this.minimumVersion = minimumVersion; this.task = task; + this.deadlineTimePointMs = deadlineTimePointMs; } long getMinimumVersion() { @@ -27,6 +30,8 @@ class VersionDependentTaskCompletion { return task; } + long getDeadlineTimePointMs() { return deadlineTimePointMs; } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java index 924614df327..5ac15e75127 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2; import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask; +import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.DeadlineExceededException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.InternalFailure; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.StateRestApiException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.UnknownMasterException; @@ -61,8 +62,17 @@ public abstract class Request<Result> extends RemoteClusterControllerTask { } @Override - public void handleLeadershipLost() { - failure = new UnknownMasterException("Leadership lost before request could complete"); + public void handleFailure(FailureCondition condition) { + if (condition == FailureCondition.LEADERSHIP_LOST) { + failure = new UnknownMasterException("Leadership lost before request could complete"); + } else if (condition == FailureCondition.DEADLINE_EXCEEDED) { + failure = new DeadlineExceededException("Task exceeded its version wait deadline"); + } + } + + @Override + public boolean isFailed() { + return (failure != null); } public abstract Result calculateResult(Context context) throws StateRestApiException, OtherMasterIndexException; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index 2c094769961..cf41707db75 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -67,9 +67,7 @@ public class SetNodeStateRequest extends Request<SetResponse> { @Override public boolean hasVersionAckDependency() { - // FIXME this is a temporary change while edge cases in interactions between controller - // and orchestration are sorted out. - return false; + return (responseWait == SetUnitStateRequest.ResponseWait.WAIT_UNTIL_CLUSTER_ACKED); } static SetResponse setWantedState( diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index 6a8ffae8f3f..b10a8101c37 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -16,6 +16,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import java.time.Duration; import java.util.*; import java.util.logging.Logger; @@ -1248,16 +1249,25 @@ public class StateChangeTest extends FleetControllerTest { private static abstract class MockTask extends RemoteClusterControllerTask { boolean invoked = false; boolean leadershipLost = false; + boolean deadlineExceeded = false; boolean isInvoked() { return invoked; } boolean isLeadershipLost() { return leadershipLost; } + boolean isDeadlineExceeded() { return deadlineExceeded; } + @Override public boolean hasVersionAckDependency() { return true; } @Override - public void handleLeadershipLost() { this.leadershipLost = true; } + public void handleFailure(FailureCondition condition) { + if (condition == FailureCondition.LEADERSHIP_LOST) { + this.leadershipLost = true; + } else if (condition == FailureCondition.DEADLINE_EXCEEDED) { + this.deadlineExceeded = true; + } + } } // We create an explicit mock task class instead of using mock() simply because of @@ -1275,6 +1285,11 @@ public class StateChangeTest extends FleetControllerTest { } } + private static class FailingMockSynchronousTaskWithSideEffects extends MockSynchronousTaskWithSideEffects { + @Override + public boolean isFailed() { return true; } + } + private static class MockNoOpSynchronousTask extends MockTask { @Override public void doRemoteFleetControllerTask(Context ctx) { @@ -1310,6 +1325,10 @@ public class StateChangeTest extends FleetControllerTest { return scheduleTask(new MockNoOpSynchronousTask()); } + MockTask scheduleFailingVersionDependentTaskWithSideEffects() throws Exception { + return scheduleTask(new FailingMockSynchronousTaskWithSideEffects()); + } + void markStorageNodeDown(int index) throws Exception { communicator.setNodeState(new Node(NodeType.STORAGE, index), State.DOWN, "foo"); // Auto-ACKed ctrl.tick(); @@ -1396,6 +1415,15 @@ public class StateChangeTest extends FleetControllerTest { } @Test + public void failing_task_is_immediately_completed() throws Exception { + RemoteTaskFixture fixture = createDefaultFixture(); + MockTask task = fixture.scheduleFailingVersionDependentTaskWithSideEffects(); + + assertTrue(task.isInvoked()); + assertTrue(task.isCompleted()); + } + + @Test public void no_op_synchronous_remote_task_can_complete_immediately_if_current_state_already_acked() throws Exception { RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime()); fixture.markStorageNodeDown(0); @@ -1523,4 +1551,29 @@ public class StateChangeTest extends FleetControllerTest { assertTrue(task.isCompleted()); } + @Test + public void task_not_completed_within_deadline_is_failed_with_deadline_exceeded_error() throws Exception { + FleetControllerOptions options = defaultOptions(); + options.setMaxDeferredTaskVersionWaitTime(Duration.ofSeconds(60)); + RemoteTaskFixture fixture = createFixtureWith(options); + + MockTask task = fixture.scheduleVersionDependentTaskWithSideEffects(); + communicator.setShouldDeferDistributorClusterStateAcks(true); + fixture.processScheduledTask(); + + assertTrue(task.isInvoked()); + assertFalse(task.isCompleted()); + assertFalse(task.isDeadlineExceeded()); + + timer.advanceTime(59_000); + ctrl.tick(); + assertFalse(task.isCompleted()); + assertFalse(task.isDeadlineExceeded()); + + timer.advanceTime(1_001); + ctrl.tick(); + assertTrue(task.isCompleted()); + assertTrue(task.isDeadlineExceeded()); + } + } diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java index 0f35b96c31d..4fb244666a4 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java @@ -2,7 +2,9 @@ package com.yahoo.vespa.clustercontroller.core.restapiv2; import com.yahoo.vdslib.state.NodeType; +import com.yahoo.vespa.clustercontroller.core.RemoteClusterControllerTask; import com.yahoo.vespa.clustercontroller.core.restapiv2.requests.SetNodeStateRequest; +import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.DeadlineExceededException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.InvalidContentException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.MissingUnitException; import com.yahoo.vespa.clustercontroller.utils.staterestapi.errors.OperationNotSupportedForUnitException; @@ -374,11 +376,16 @@ public class SetNodeStateTest extends StateRestApiTest { .setNewState("user", "maintenance", "whatever reason.")); } - // FIXME requests should be tagged as version dependent; temporary workaround @Test - public void set_node_state_requests_are_by_default_not_tagged_as_having_version_ack_dependency() { + public void set_node_state_requests_are_by_default_tagged_as_having_version_ack_dependency() { SetNodeStateRequest request = createDummySetNodeStateRequest(); - assertFalse(request.hasVersionAckDependency()); + assertTrue(request.hasVersionAckDependency()); + } + + @Test + public void set_node_state_requests_not_initially_marked_as_failed() { + SetNodeStateRequest request = createDummySetNodeStateRequest(); + assertFalse(request.isFailed()); } @Test @@ -398,7 +405,24 @@ public class SetNodeStateTest extends StateRestApiTest { expectedException.expect(UnknownMasterException.class); SetNodeStateRequest request = createDummySetNodeStateRequest(); - request.handleLeadershipLost(); + request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + request.getResult(); + } + + @Test + public void leadership_loss_marks_request_as_failed_for_early_out_response() { + SetNodeStateRequest request = createDummySetNodeStateRequest(); + request.handleFailure(RemoteClusterControllerTask.FailureCondition.LEADERSHIP_LOST); + assertTrue(request.isFailed()); + } + + @Test + public void deadline_exceeded_fails_set_node_state_request() throws Exception { + expectedException.expectMessage("Task exceeded its version wait deadline"); + expectedException.expect(DeadlineExceededException.class); + + SetNodeStateRequest request = createDummySetNodeStateRequest(); + request.handleFailure(RemoteClusterControllerTask.FailureCondition.DEADLINE_EXCEEDED); request.getResult(); } |