summaryrefslogtreecommitdiffstats
path: root/clustercontroller-core
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahoo-inc.com>2017-09-21 13:19:44 +0200
committerTor Brede Vekterli <vekterli@yahoo-inc.com>2017-09-25 16:05:23 +0200
commitdebb34547c76429e8a345299e4765824903f784c (patch)
tree34ac382d6296f63f2086032bdf37956f8caad1ff /clustercontroller-core
parentd2a0cce57df4d1bcfa41ed4cbfadc705636df142 (diff)
Immediately complete failed remote tasks
We check both for master status and task failure, as we otherwise place a potentially dangerous silent dependency on the task always failing itself if the controller is not a master.
Diffstat (limited to 'clustercontroller-core')
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java8
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java8
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java5
-rw-r--r--clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java4
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java18
-rw-r--r--clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java18
6 files changed, 53 insertions, 8 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
index 120b482e11d..030d792a63b 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/FleetController.java
@@ -665,8 +665,7 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
final RemoteClusterControllerTask task = remoteTasks.poll();
log.finest(() -> String.format("Processing remote task of type '%s'", task.getClass().getName()));
task.doRemoteFleetControllerTask(context);
- // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions).
- if (!isMaster() || !task.hasVersionAckDependency()) {
+ if (taskMayBeCompletedImmediately(task)) {
log.finest(() -> String.format("Done processing remote task of type '%s'", task.getClass().getName()));
task.notifyCompleted();
} else {
@@ -678,6 +677,11 @@ public class FleetController implements NodeStateOrHostInfoChangeHandler, NodeAd
return false;
}
+ private boolean taskMayBeCompletedImmediately(RemoteClusterControllerTask task) {
+ // We cannot introduce a version barrier for tasks when we're not the master (and therefore will not publish new versions).
+ return (!task.hasVersionAckDependency() || task.isFailed() || !masterElectionHandler.isMaster());
+ }
+
private RemoteClusterControllerTask.Context createRemoteTaskProcessingContext() {
final RemoteClusterControllerTask.Context context = new RemoteClusterControllerTask.Context();
context.cluster = cluster;
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
index 0fb9e5798c8..e4519a02632 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/RemoteClusterControllerTask.java
@@ -32,6 +32,14 @@ public abstract class RemoteClusterControllerTask {
public boolean hasVersionAckDependency() { return false; }
/**
+ * If true, signals that a task has failed and can be immediately marked as
+ * complete without waiting for a version ACK. The task implementation has
+ * the responsibility of communicating any failure to the caller, and ensuring
+ * that the lack of version waiting does not violate any invariants.
+ */
+ public boolean isFailed() { return false; }
+
+ /**
* If the task response has been deferred due to hasVersionAckDependency(),
* handleLeadershipLost() will be invoked on the task if the cluster controller
* discovers it has lost leadership in the time between task execution and
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
index 924614df327..27ab43b2b5a 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/Request.java
@@ -65,6 +65,11 @@ public abstract class Request<Result> extends RemoteClusterControllerTask {
failure = new UnknownMasterException("Leadership lost before request could complete");
}
+ @Override
+ public boolean isFailed() {
+ return (failure != null);
+ }
+
public abstract Result calculateResult(Context context) throws StateRestApiException, OtherMasterIndexException;
}
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
index 2c094769961..cf41707db75 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/restapiv2/requests/SetNodeStateRequest.java
@@ -67,9 +67,7 @@ public class SetNodeStateRequest extends Request<SetResponse> {
@Override
public boolean hasVersionAckDependency() {
- // FIXME this is a temporary change while edge cases in interactions between controller
- // and orchestration are sorted out.
- return false;
+ return (responseWait == SetUnitStateRequest.ResponseWait.WAIT_UNTIL_CLUSTER_ACKED);
}
static SetResponse setWantedState(
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
index 6a8ffae8f3f..d9815241920 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/StateChangeTest.java
@@ -1275,6 +1275,11 @@ public class StateChangeTest extends FleetControllerTest {
}
}
+ private static class FailingMockSynchronousTaskWithSideEffects extends MockSynchronousTaskWithSideEffects {
+ @Override
+ public boolean isFailed() { return true; }
+ }
+
private static class MockNoOpSynchronousTask extends MockTask {
@Override
public void doRemoteFleetControllerTask(Context ctx) {
@@ -1310,6 +1315,10 @@ public class StateChangeTest extends FleetControllerTest {
return scheduleTask(new MockNoOpSynchronousTask());
}
+ MockTask scheduleFailingVersionDependentTaskWithSideEffects() throws Exception {
+ return scheduleTask(new FailingMockSynchronousTaskWithSideEffects());
+ }
+
void markStorageNodeDown(int index) throws Exception {
communicator.setNodeState(new Node(NodeType.STORAGE, index), State.DOWN, "foo"); // Auto-ACKed
ctrl.tick();
@@ -1396,6 +1405,15 @@ public class StateChangeTest extends FleetControllerTest {
}
@Test
+ public void failing_task_is_immediately_completed() throws Exception {
+ RemoteTaskFixture fixture = createDefaultFixture();
+ MockTask task = fixture.scheduleFailingVersionDependentTaskWithSideEffects();
+
+ assertTrue(task.isInvoked());
+ assertTrue(task.isCompleted());
+ }
+
+ @Test
public void no_op_synchronous_remote_task_can_complete_immediately_if_current_state_already_acked() throws Exception {
RemoteTaskFixture fixture = createFixtureWith(optionsWithZeroTransitionTime());
fixture.markStorageNodeDown(0);
diff --git a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
index 0f35b96c31d..416c57ce5d7 100644
--- a/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
+++ b/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/restapiv2/SetNodeStateTest.java
@@ -374,11 +374,16 @@ public class SetNodeStateTest extends StateRestApiTest {
.setNewState("user", "maintenance", "whatever reason."));
}
- // FIXME requests should be tagged as version dependent; temporary workaround
@Test
- public void set_node_state_requests_are_by_default_not_tagged_as_having_version_ack_dependency() {
+ public void set_node_state_requests_are_by_default_tagged_as_having_version_ack_dependency() {
SetNodeStateRequest request = createDummySetNodeStateRequest();
- assertFalse(request.hasVersionAckDependency());
+ assertTrue(request.hasVersionAckDependency());
+ }
+
+ @Test
+ public void set_node_state_requests_not_initially_marked_as_failed() {
+ SetNodeStateRequest request = createDummySetNodeStateRequest();
+ assertFalse(request.isFailed());
}
@Test
@@ -402,4 +407,11 @@ public class SetNodeStateTest extends StateRestApiTest {
request.getResult();
}
+ @Test
+ public void leadership_loss_marks_request_as_failed_for_early_out_response() {
+ SetNodeStateRequest request = createDummySetNodeStateRequest();
+ request.handleLeadershipLost();
+ assertTrue(request.isFailed());
+ }
+
}