Retry twice if only 1 CC

Caught in the systemtests: If there's only one CC, there will only be 1 request with timeout ~5s, whereas today the real timeout is 10s. This appears to make a difference to the systemtests as converging to a cluster state may take several seconds. There are 2 solutions: 1. Allocate ~10s to CC call, or 2. Make another ~5s call to CC if the first one fails. (2) is simpler to implement for now. To implement (1), the timeout calculation could receive the number of backends as a parameter, but that would make the already complex logic here even worse. Or, we could only reserve enough time for 1 call (abandon 2 calls logic). TBD later.
author: Håkon Hallingstad <hakon@oath.com> 2018-10-30 15:10:15 +0100
committer: Håkon Hallingstad <hakon@oath.com> 2018-10-30 15:10:15 +0100
commit: eaea833fd78b37557b5d18a112fb63d1f04da3be (patch)
tree: 5f8c03d7dfd6dd374667067f34bf592457dc38a0 /orchestrator/src/main
parent: 47972b55d383e899e5eb3d2af24e631ae04d4c4c (diff)
3 files changed, 11 insertions, 9 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
index d7e63ccfc76..46440682fa0 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
@@ -47,10 +47,11 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{
                     timeouts);
         } catch (IOException | UncheckedTimeoutException e) {
             String message = String.format(
-                    "Giving up setting %s for storage node with index %d in cluster %s",
+                    "Giving up setting %s for storage node with index %d in cluster %s: %s",
                     stateRequest,
                     storageNodeIndex,
-                    clusterName);
+                    clusterName,
+                    e.getMessage());
 
             throw new IOException(message, e);
         }
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
index 5b0685e88a0..44ecc7ac167 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
@@ -41,11 +41,11 @@ import java.time.Duration;
  */
 public class ClusterControllerClientTimeouts implements JaxRsTimeouts {
     // In data center connect timeout
-    static final Duration CONNECT_TIMEOUT = Duration.ofMillis(50);
+    static final Duration CONNECT_TIMEOUT = Duration.ofMillis(100);
     // Per call overhead
-    static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(50);
-    // In data center kernel and network overhead.
-    static final Duration NETWORK_OVERHEAD_PER_CALL = CONNECT_TIMEOUT;
+    static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(100);
+    // In-process kernel overhead, network overhead, server kernel overhead, and server in-process overhead.
+    static final Duration DOWNSTREAM_OVERHEAD_PER_CALL = CONNECT_TIMEOUT.plus(Duration.ofMillis(100));
     // Minimum time reserved for post-RPC processing to finish BEFORE the deadline, including ZK write.
     static final Duration IN_PROCESS_OVERHEAD = Duration.ofMillis(100);
     // Number of JAX-RS RPC calls to account for within the time budget.
@@ -100,7 +100,7 @@ public class ClusterControllerClientTimeouts implements JaxRsTimeouts {
 
     public Duration getServerTimeoutOrThrow() {
         // readTimeout = networkOverhead + serverTimeout
-        Duration serverTimeout = getReadTimeoutOrThrow().minus(NETWORK_OVERHEAD_PER_CALL);
+        Duration serverTimeout = getReadTimeoutOrThrow().minus(DOWNSTREAM_OVERHEAD_PER_CALL);
         if (serverTimeout.toMillis() < MIN_SERVER_TIMEOUT.toMillis()) {
             throw new UncheckedTimeoutException("Server would be given too little time to complete: " +
                     serverTimeout + ". Original timeout was " + timeBudget.originalTimeout().get());
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
index 7a0b96c2231..33e74235862 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
@@ -41,12 +41,13 @@ public class RetryingClusterControllerClientFactory implements ClusterController
                         jaxRsClientFactory,
                         CLUSTERCONTROLLER_SCHEME)
                         .apiWithRetries(ClusterControllerJaxRsApi.class, CLUSTERCONTROLLER_API_PATH)
-                        // Use max iteration 1. The JaxRsStrategyFactory will try host 1, 2, then 3:
+                        // Use max iteration 1: The JaxRsStrategyFactory will try host 1, 2, then 3:
                         //  - If host 1 responds, it will redirect to master if necessary. Otherwise
                         //  - If host 2 responds, it will redirect to master if necessary. Otherwise
                         //  - If host 3 responds, it may redirect to master if necessary (if they're up
                         //    after all), but more likely there's no quorum and this will fail too.
-                        .setMaxIterations(1);
+                        // If there's only 1 CC, we'll try that one twice.
+                        .setMaxIterations(clusterControllers.size() > 1 ? 1 : 2);
         return new ClusterControllerClientImpl(jaxRsApi, clusterName);
     }
 }
author	Håkon Hallingstad <hakon@oath.com>	2018-10-30 15:10:15 +0100
committer	Håkon Hallingstad <hakon@oath.com>	2018-10-30 15:10:15 +0100
commit	eaea833fd78b37557b5d18a112fb63d1f04da3be (patch)
tree	5f8c03d7dfd6dd374667067f34bf592457dc38a0 /orchestrator/src/main
parent	47972b55d383e899e5eb3d2af24e631ae04d4c4c (diff)