diff options
author | Håkon Hallingstad <hakon@oath.com> | 2018-10-30 15:10:15 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@oath.com> | 2018-10-30 15:10:15 +0100 |
commit | eaea833fd78b37557b5d18a112fb63d1f04da3be (patch) | |
tree | 5f8c03d7dfd6dd374667067f34bf592457dc38a0 /orchestrator/src/main | |
parent | 47972b55d383e899e5eb3d2af24e631ae04d4c4c (diff) |
Retry twice if only 1 CC
Caught in the systemtests: If there's only one CC, there will only be 1 request
with timeout ~5s, whereas today the real timeout is 10s. This appears to make a
difference to the systemtests as converging to a cluster state may take several
seconds. There are 2 solutions:
1. Allocate ~10s to CC call, or
2. Make another ~5s call to CC if the first one fails.
(2) is simpler to implement for now. To implement (1), the timeout calculation
could receive the number of backends as a parameter, but that would make the
already complex logic here even worse. Or, we could only reserve enough time
for 1 call (abandon 2 calls logic). TBD later.
Diffstat (limited to 'orchestrator/src/main')
3 files changed, 11 insertions, 9 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java index d7e63ccfc76..46440682fa0 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java @@ -47,10 +47,11 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ timeouts); } catch (IOException | UncheckedTimeoutException e) { String message = String.format( - "Giving up setting %s for storage node with index %d in cluster %s", + "Giving up setting %s for storage node with index %d in cluster %s: %s", stateRequest, storageNodeIndex, - clusterName); + clusterName, + e.getMessage()); throw new IOException(message, e); } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java index 5b0685e88a0..44ecc7ac167 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java @@ -41,11 +41,11 @@ import java.time.Duration; */ public class ClusterControllerClientTimeouts implements JaxRsTimeouts { // In data center connect timeout - static final Duration CONNECT_TIMEOUT = Duration.ofMillis(50); + static final Duration CONNECT_TIMEOUT = Duration.ofMillis(100); // Per call overhead - static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(50); - // In data center kernel and network overhead. - static final Duration NETWORK_OVERHEAD_PER_CALL = CONNECT_TIMEOUT; + static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(100); + // In-process kernel overhead, network overhead, server kernel overhead, and server in-process overhead. + static final Duration DOWNSTREAM_OVERHEAD_PER_CALL = CONNECT_TIMEOUT.plus(Duration.ofMillis(100)); // Minimum time reserved for post-RPC processing to finish BEFORE the deadline, including ZK write. static final Duration IN_PROCESS_OVERHEAD = Duration.ofMillis(100); // Number of JAX-RS RPC calls to account for within the time budget. @@ -100,7 +100,7 @@ public class ClusterControllerClientTimeouts implements JaxRsTimeouts { public Duration getServerTimeoutOrThrow() { // readTimeout = networkOverhead + serverTimeout - Duration serverTimeout = getReadTimeoutOrThrow().minus(NETWORK_OVERHEAD_PER_CALL); + Duration serverTimeout = getReadTimeoutOrThrow().minus(DOWNSTREAM_OVERHEAD_PER_CALL); if (serverTimeout.toMillis() < MIN_SERVER_TIMEOUT.toMillis()) { throw new UncheckedTimeoutException("Server would be given too little time to complete: " + serverTimeout + ". Original timeout was " + timeBudget.originalTimeout().get()); diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java index 7a0b96c2231..33e74235862 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java @@ -41,12 +41,13 @@ public class RetryingClusterControllerClientFactory implements ClusterController jaxRsClientFactory, CLUSTERCONTROLLER_SCHEME) .apiWithRetries(ClusterControllerJaxRsApi.class, CLUSTERCONTROLLER_API_PATH) - // Use max iteration 1. The JaxRsStrategyFactory will try host 1, 2, then 3: + // Use max iteration 1: The JaxRsStrategyFactory will try host 1, 2, then 3: // - If host 1 responds, it will redirect to master if necessary. Otherwise // - If host 2 responds, it will redirect to master if necessary. Otherwise // - If host 3 responds, it may redirect to master if necessary (if they're up // after all), but more likely there's no quorum and this will fail too. - .setMaxIterations(1); + // If there's only 1 CC, we'll try that one twice. + .setMaxIterations(clusterControllers.size() > 1 ? 1 : 2); return new ClusterControllerClientImpl(jaxRsApi, clusterName); } } |