diff options
author | Håkon Hallingstad <hakon@oath.com> | 2018-06-25 13:33:02 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@oath.com> | 2018-06-25 13:33:02 +0200 |
commit | 0f908eb4216f4f45a24979986dd03ef31421516c (patch) | |
tree | 1f5bd02fd1d5ee2b593df5346510ac81ab070d60 /orchestrator/src | |
parent | 948c3a15a1ceb5a7d58ab7ff3d1e33ea0996f2da (diff) |
Avoid fatal first CC request timeout
If the first setNodeState to the "first" cluster controller times out, then
we'd like to leave enough time to try the second CC. This avoids making a
single CC a single point of failure.
The strategy is to set a timeout equal to a fixed share of the remaining time
(SHARE_REMAINING_TIME, currently 60%), so if every request times out, the successive
timeouts would roughly be 60%, 24%, and 9.6% of the original timeout.
An alternative strategy would be to use 33% for each, which would be more
democratic.
Diffstat (limited to 'orchestrator/src')
4 files changed, 36 insertions, 9 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java index e795e16813f..f6bbea09642 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java @@ -26,11 +26,19 @@ public class OrchestratorContext { return timeBudget.originalTimeout().get().getSeconds(); } + public float getSuboperationTimeoutInSeconds() { + return getSuboperationTimeoutInSeconds(60); + } + /** - * Get number of seconds until the deadline, or empty if there's no deadline, or throw - * an {@link UncheckedTimeoutException} if timed out. + * Get timeout for a suboperation that should take up {@code shareOfRemaining} of the + * remaining time, or throw an {@link UncheckedTimeoutException} if timed out. */ - public float getSuboperationTimeoutInSeconds() { - return (float) (timeBudget.timeLeftOrThrow().get().toMillis() / 1000.0); + public float getSuboperationTimeoutInSeconds(float shareOfRemaining) { + if (!(0f <= shareOfRemaining && shareOfRemaining <= 1.0f)) { + throw new IllegalArgumentException("Share of remaining time must be between 0 and 1: " + shareOfRemaining); + } + + return shareOfRemaining * timeBudget.timeLeftOrThrow().get().toMillis() / 1000.0f; } } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java index ba469c1617e..efbc0cd0fcf 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java @@ -15,6 +15,23 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ public static final String REQUEST_REASON = 
"Orchestrator"; + // On setNodeState calls against the CC ensemble. + // + // We'd like to set a timeout for the request to the first CC such that if the first + // CC is faulty, there's sufficient time to send the request to the second and third CC. + // The timeouts would be: + // timeout(1. request) = SHARE_REMAINING_TIME * T + // timeout(2. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME) + // timeout(3. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME)^2 + // + // Using a share of 50% gives approximately: + // timeout(1. request) = T * 0.5 + // timeout(2. request) = T * 0.25 + // timeout(3. request) = T * 0.125 + // + // which seems fine + public static final float SHARE_REMAINING_TIME = 0.6f; + private final JaxRsStrategy<ClusterControllerJaxRsApi> clusterControllerApi; private final String clusterName; @@ -40,7 +57,7 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ return clusterControllerApi.apply(api -> api.setNodeState( clusterName, storageNodeIndex, - context.getSuboperationTimeoutInSeconds(), + context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME), stateRequest) ); } catch (IOException e) { @@ -69,7 +86,7 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ try { return clusterControllerApi.apply(api -> api.setClusterState( clusterName, - context.getSuboperationTimeoutInSeconds(), + context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME), stateRequest)); } catch (IOException e) { final String message = String.format( diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java index c1dfc5d38f2..228174a9b3d 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java +++ 
b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java @@ -6,6 +6,7 @@ import com.yahoo.vespa.jaxrs.client.LocalPassThroughJaxRsStrategy; import com.yahoo.vespa.orchestrator.OrchestratorContext; import org.junit.Test; +import static org.mockito.Matchers.anyFloat; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; @@ -27,7 +28,7 @@ public class ClusterControllerClientTest { final ClusterControllerNodeState wantedState = ClusterControllerNodeState.MAINTENANCE; OrchestratorContext context = mock(OrchestratorContext.class); - when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f); + when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); clusterControllerClient.setNodeState(context, STORAGE_NODE_INDEX, wantedState); final ClusterControllerStateRequest expectedNodeStateRequest = new ClusterControllerStateRequest( diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java index 87f07688c34..4dabae14add 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java @@ -16,6 +16,7 @@ import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.fail; import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyFloat; import static org.mockito.Matchers.anyInt; import static org.mockito.Matchers.anyString; import static org.mockito.Matchers.argThat; @@ -66,7 +67,7 @@ public class SingleInstanceClusterControllerClientFactoryTest { public void 
testCreateClientWithSingleClusterControllerInstance() throws Exception { final List<HostName> clusterControllers = Arrays.asList(HOST_NAME_1); - when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f); + when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); clientFactory.createClient(clusterControllers, "clusterName") .setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE); @@ -94,7 +95,7 @@ public class SingleInstanceClusterControllerClientFactoryTest { public void testCreateClientWithThreeClusterControllerInstances() throws Exception { final List<HostName> clusterControllers = Arrays.asList(HOST_NAME_1, HOST_NAME_2, HOST_NAME_3); - when(context.getSuboperationTimeoutInSeconds()).thenReturn(1.0f); + when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); clientFactory.createClient(clusterControllers, "clusterName") .setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE); |