From debb34547c76429e8a345299e4765824903f784c Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Thu, 21 Sep 2017 13:19:44 +0200 Subject: Immediately complete failed remote tasks We check both for master status and task failure, as we otherwise place a potentially dangerous silent dependency on the task always failing itself if the controller is not a master. --- .../vespa/clustercontroller/core/FleetController.java | 8 ++++++-- .../core/RemoteClusterControllerTask.java | 8 ++++++++ .../clustercontroller/core/restapiv2/Request.java | 5 +++++ .../core/restapiv2/requests/SetNodeStateRequest.java | 4 +--- .../vespa/clustercontroller/core/StateChangeTest.java | 18 ++++++++++++++++++ .../core/restapiv2/SetNodeStateTest.java | 18 +++++++++++++++--- 6 files changed, 53 insertions(+), 8 deletions(-) (limited to 'clustercontroller-core') diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java index 120b482e11d..030d792a63b 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java @@ -665,8 +665,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd final RemoteClusterControllerTask task = remoteTasks.poll(); log.finest(() -> String.format("Processing remote task of type '%s'", task.getClass().getName())); task.doRemoteFleetControllerTask(context); - // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). - if (!isMaster() || !task.hasVersionAckDependency()) { + if (taskMayBeCompletedImmediately(task)) { log.finest(() -> String.format("Done processing remote task of type '%s'", task.getClass().getName())); task.notifyCompleted(); } else { @@ -678,6 +677,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd return false; } + private boolean taskMayBeCompletedImmediately(RemoteClusterControllerTask task) { + // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions). + return (!task.hasVersionAckDependency() || task.isFailed() || !masterElectionHandler.isMaster()); + } + private RemoteClusterControllerTask.Context createRemoteTaskProcessingContext() { final RemoteClusterControllerTask.Context context = new RemoteClusterControllerTask.Context(); context.cluster = cluster; diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java index 0fb9e5798c8..e4519a02632 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java @@ -31,6 +31,14 @@ public abstract class RemoteClusterControllerTask { */ public boolean hasVersionAckDependency() { return false; } + /** + * If true, signals that a task has failed and can be immediately marked as + * complete without waiting for a version ACK. The task implementation has + * the responsibility of communicating any failure to the caller, and ensuring + * that the lack of version waiting does not violate any invariants. + */ + public boolean isFailed() { return false; } + /** * If the task response has been deferred due to hasVersionAckDependency(), * handleLeadershipLost() will be invoked on the task if the cluster controller diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java index 924614df327..27ab43b2b5a 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java @@ -65,6 +65,11 @@ public abstract class Request extends RemoteClusterControllerTask { failure = new UnknownMasterException("Leadership lost before request could complete"); } + @Override + public boolean isFailed() { + return (failure != null); + } + public abstract Result calculateResult(Context context) throws StateRestApiException, OtherMasterIndexException; } diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java index 2c094769961..cf41707db75 100644 --- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java +++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java @@ -67,9 +67,7 @@ public class SetNodeStateRequest extends Request { @Override public boolean hasVersionAckDependency() { - // FIXME this is a temporary change while edge cases in interactions between controller - // and orchestration are sorted out. - return false; + return (responseWait == SetUnitStateRequest.ResponseWait.WAIT_UNTIL_CLUSTER_ACKED); } static SetResponse setWantedState( diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java index 6a8ffae8f3f..d9815241920 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java @@ -1275,6 +1275,11 @@ public class StateChangeTest extends FleetControllerTest { } } + private static class FailingMockSynchronousTaskWithSideEffects extends MockSynchronousTaskWithSideEffects { + @Override + public boolean isFailed() { return true; } + } + private static class MockNoOpSynchronousTask extends MockTask { @Override public void doRemoteFleetControllerTask(Context ctx) { @@ -1310,6 +1315,10 @@ public class StateChangeTest extends FleetControllerTest { return scheduleTask(new MockNoOpSynchronousTask()); } + MockTask scheduleFailingVersionDependentTaskWithSideEffects() throws Exception { + return scheduleTask(new FailingMockSynchronousTaskWithSideEffects()); + } + void markStorageNodeDown(int index) throws Exception { communicator.setNodeState(new Node(NodeType.STORAGE, index), State.DOWN, "foo"); // Auto-ACKed ctrl.tick(); @@ -1395,6 +1404,15 @@ public class StateChangeTest extends FleetControllerTest { assertTrue(task.isCompleted()); // Now finally acked by all nodes } + @Test + public void failing_task_is_immediately_completed() throws Exception { + RemoteTaskFixture fixture = createDefaultFixture(); + MockTask task = fixture.scheduleFailingVersionDependentTaskWithSideEffects(); + + assertTrue(task.isInvoked()); + assertTrue(task.isCompleted()); + } + @Test public void no_op_synchronous_remote_task_can_complete_immediately_if_current_state_already_acked() throws Exception { RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime()); diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java index 0f35b96c31d..416c57ce5d7 100644 --- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java +++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java @@ -374,11 +374,16 @@ public class SetNodeStateTest extends StateRestApiTest { .setNewState("user", "maintenance", "whatever reason.")); } - // FIXME requests should be tagged as version dependent; temporary workaround @Test - public void set_node_state_requests_are_by_default_not_tagged_as_having_version_ack_dependency() { + public void set_node_state_requests_are_by_default_tagged_as_having_version_ack_dependency() { SetNodeStateRequest request = createDummySetNodeStateRequest(); - assertFalse(request.hasVersionAckDependency()); + assertTrue(request.hasVersionAckDependency()); + } + + @Test + public void set_node_state_requests_not_initially_marked_as_failed() { + SetNodeStateRequest request = createDummySetNodeStateRequest(); + assertFalse(request.isFailed()); } @Test @@ -402,4 +407,11 @@ public class SetNodeStateTest extends StateRestApiTest { request.getResult(); } + @Test + public void leadership_loss_marks_request_as_failed_for_early_out_response() { + SetNodeStateRequest request = createDummySetNodeStateRequest(); + request.handleLeadershipLost(); + assertTrue(request.isFailed()); + } + } -- cgit v1.2.3