summaryrefslogtreecommitdiffstats
path: root/orchestrator
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2018-06-25 13:33:02 +0200
committerHåkon Hallingstad <hakon@oath.com>2018-06-25 13:33:02 +0200
commit0f908eb4216f4f45a24979986dd03ef31421516c (patch)
tree1f5bd02fd1d5ee2b593df5346510ac81ab070d60 /orchestrator
parent948c3a15a1ceb5a7d58ab7ff3d1e33ea0996f2da (diff)
Avoid fatal first CC request timeout
If the first setNodeState to the "first" cluster controller times out, then we'd like to leave enough time to try the second CC. This avoids making a single CC a single point of failure. The strategy is to set a timeout of 50% of the remaining time, so if everything times out the timeouts would roughly be 50%, 25%, and 12.5% of original timeout. An alternative strategy would be to use 33% for each, which would be more democratic.
Diffstat (limited to 'orchestrator')
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java16
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java21
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java3
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java5
4 files changed, 36 insertions, 9 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java
index e795e16813f..f6bbea09642 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java
@@ -26,11 +26,19 @@ public class OrchestratorContext {
return timeBudget.originalTimeout().get().getSeconds();
}
+ public float getSuboperationTimeoutInSeconds() {
+ return getSuboperationTimeoutInSeconds(60);
+ }
+
/**
- * Get number of seconds until the deadline, or empty if there's no deadline, or throw
- * an {@link UncheckedTimeoutException} if timed out.
+ * Get timeout for a suboperation that should take up {@code shareOfRemaining} of the
+ * remaining time, or throw an {@link UncheckedTimeoutException} if timed out.
*/
- public float getSuboperationTimeoutInSeconds() {
- return (float) (timeBudget.timeLeftOrThrow().get().toMillis() / 1000.0);
+ public float getSuboperationTimeoutInSeconds(float shareOfRemaining) {
+ if (!(0f <= shareOfRemaining && shareOfRemaining <= 1.0f)) {
+ throw new IllegalArgumentException("Share of remaining time must be between 0 and 1: " + shareOfRemaining);
+ }
+
+ return shareOfRemaining * timeBudget.timeLeftOrThrow().get().toMillis() / 1000.0f;
}
}
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
index ba469c1617e..efbc0cd0fcf 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
@@ -15,6 +15,23 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{
public static final String REQUEST_REASON = "Orchestrator";
+ // On setNodeState calls against the CC ensemble.
+ //
+ // We'd like to set a timeout for the request to the first CC such that if the first
+ // CC is faulty, there's sufficient time to send the request to the second and third CC.
+ // The timeouts would be:
+ // timeout(1. request) = SHARE_REMAINING_TIME * T
+ // timeout(2. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME)
+ // timeout(3. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME)^2
+ //
+ // Using a share of 50% gives approximately:
+ // timeout(1. request) = T * 0.5
+ // timeout(2. request) = T * 0.25
+ // timeout(3. request) = T * 0.125
+ //
+ // which seems fine
+ public static final float SHARE_REMAINING_TIME = 0.6f;
+
private final JaxRsStrategy<ClusterControllerJaxRsApi> clusterControllerApi;
private final String clusterName;
@@ -40,7 +57,7 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{
return clusterControllerApi.apply(api -> api.setNodeState(
clusterName,
storageNodeIndex,
- context.getSuboperationTimeoutInSeconds(),
+ context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME),
stateRequest)
);
} catch (IOException e) {
@@ -69,7 +86,7 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{
try {
return clusterControllerApi.apply(api -> api.setClusterState(
clusterName,
- context.getSuboperationTimeoutInSeconds(),
+ context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME),
stateRequest));
} catch (IOException e) {
final String message = String.format(
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java
index c1dfc5d38f2..228174a9b3d 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java
@@ -6,6 +6,7 @@ import com.yahoo.vespa.jaxrs.client.LocalPassThroughJaxRsStrategy;
import com.yahoo.vespa.orchestrator.OrchestratorContext;
import org.junit.Test;
+import static org.mockito.Matchers.anyFloat;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
@@ -27,7 +28,7 @@ public class ClusterControllerClientTest {
final ClusterControllerNodeState wantedState = ClusterControllerNodeState.MAINTENANCE;
OrchestratorContext context = mock(OrchestratorContext.class);
- when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f);
+ when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f);
clusterControllerClient.setNodeState(context, STORAGE_NODE_INDEX, wantedState);
final ClusterControllerStateRequest expectedNodeStateRequest = new ClusterControllerStateRequest(
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java
index 87f07688c34..4dabae14add 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java
@@ -16,6 +16,7 @@ import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.fail;
import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyFloat;
import static org.mockito.Matchers.anyInt;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.argThat;
@@ -66,7 +67,7 @@ public class SingleInstanceClusterControllerClientFactoryTest {
public void testCreateClientWithSingleClusterControllerInstance() throws Exception {
final List<HostName> clusterControllers = Arrays.asList(HOST_NAME_1);
- when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f);
+ when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f);
clientFactory.createClient(clusterControllers, "clusterName")
.setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE);
@@ -94,7 +95,7 @@ public class SingleInstanceClusterControllerClientFactoryTest {
public void testCreateClientWithThreeClusterControllerInstances() throws Exception {
final List<HostName> clusterControllers = Arrays.asList(HOST_NAME_1, HOST_NAME_2, HOST_NAME_3);
- when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f);
+ when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f);
clientFactory.createClient(clusterControllers, "clusterName")
.setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE);