summaryrefslogtreecommitdiffstats
path: root/orchestrator
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@oath.com>2018-10-30 15:10:15 +0100
committerHåkon Hallingstad <hakon@oath.com>2018-10-30 15:10:15 +0100
commiteaea833fd78b37557b5d18a112fb63d1f04da3be (patch)
tree5f8c03d7dfd6dd374667067f34bf592457dc38a0 /orchestrator
parent47972b55d383e899e5eb3d2af24e631ae04d4c4c (diff)
Retry twice if only 1 CC
Caught in the system tests: if there's only one CC, there will be only one request with a timeout of ~5s, whereas today the real timeout is 10s. This appears to make a difference in the system tests, as converging to a cluster state may take several seconds. There are two solutions: 1. Allocate ~10s to the CC call, or 2. Make another ~5s call to the CC if the first one fails. (2) is simpler to implement for now. To implement (1), the timeout calculation could receive the number of backends as a parameter, but that would make the already complex logic here even worse. Or, we could reserve only enough time for one call (abandoning the two-call logic). TBD later.
Diffstat (limited to 'orchestrator')
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java5
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java10
-rw-r--r--orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java5
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java20
-rw-r--r--orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java2
5 files changed, 22 insertions, 20 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
index d7e63ccfc76..46440682fa0 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java
@@ -47,10 +47,11 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{
timeouts);
} catch (IOException | UncheckedTimeoutException e) {
String message = String.format(
- "Giving up setting %s for storage node with index %d in cluster %s",
+ "Giving up setting %s for storage node with index %d in cluster %s: %s",
stateRequest,
storageNodeIndex,
- clusterName);
+ clusterName,
+ e.getMessage());
throw new IOException(message, e);
}
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
index 5b0685e88a0..44ecc7ac167 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java
@@ -41,11 +41,11 @@ import java.time.Duration;
*/
public class ClusterControllerClientTimeouts implements JaxRsTimeouts {
// In data center connect timeout
- static final Duration CONNECT_TIMEOUT = Duration.ofMillis(50);
+ static final Duration CONNECT_TIMEOUT = Duration.ofMillis(100);
// Per call overhead
- static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(50);
- // In data center kernel and network overhead.
- static final Duration NETWORK_OVERHEAD_PER_CALL = CONNECT_TIMEOUT;
+ static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(100);
+ // In-process kernel overhead, network overhead, server kernel overhead, and server in-process overhead.
+ static final Duration DOWNSTREAM_OVERHEAD_PER_CALL = CONNECT_TIMEOUT.plus(Duration.ofMillis(100));
// Minimum time reserved for post-RPC processing to finish BEFORE the deadline, including ZK write.
static final Duration IN_PROCESS_OVERHEAD = Duration.ofMillis(100);
// Number of JAX-RS RPC calls to account for within the time budget.
@@ -100,7 +100,7 @@ public class ClusterControllerClientTimeouts implements JaxRsTimeouts {
public Duration getServerTimeoutOrThrow() {
// readTimeout = networkOverhead + serverTimeout
- Duration serverTimeout = getReadTimeoutOrThrow().minus(NETWORK_OVERHEAD_PER_CALL);
+ Duration serverTimeout = getReadTimeoutOrThrow().minus(DOWNSTREAM_OVERHEAD_PER_CALL);
if (serverTimeout.toMillis() < MIN_SERVER_TIMEOUT.toMillis()) {
throw new UncheckedTimeoutException("Server would be given too little time to complete: " +
serverTimeout + ". Original timeout was " + timeBudget.originalTimeout().get());
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
index 7a0b96c2231..33e74235862 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java
@@ -41,12 +41,13 @@ public class RetryingClusterControllerClientFactory implements ClusterController
jaxRsClientFactory,
CLUSTERCONTROLLER_SCHEME)
.apiWithRetries(ClusterControllerJaxRsApi.class, CLUSTERCONTROLLER_API_PATH)
- // Use max iteration 1. The JaxRsStrategyFactory will try host 1, 2, then 3:
+ // Use max iteration 1: The JaxRsStrategyFactory will try host 1, 2, then 3:
// - If host 1 responds, it will redirect to master if necessary. Otherwise
// - If host 2 responds, it will redirect to master if necessary. Otherwise
// - If host 3 responds, it may redirect to master if necessary (if they're up
// after all), but more likely there's no quorum and this will fail too.
- .setMaxIterations(1);
+ // If there's only 1 CC, we'll try that one twice.
+ .setMaxIterations(clusterControllers.size() > 1 ? 1 : 2);
return new ClusterControllerClientImpl(jaxRsApi, clusterName);
}
}
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java
index 9fe65447ac1..ee81a89d76c 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java
@@ -13,27 +13,27 @@ import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTim
import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.IN_PROCESS_OVERHEAD;
import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.IN_PROCESS_OVERHEAD_PER_CALL;
import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.MIN_SERVER_TIMEOUT;
-import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.NETWORK_OVERHEAD_PER_CALL;
+import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.DOWNSTREAM_OVERHEAD_PER_CALL;
import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.NUM_CALLS;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
public class ClusterControllerClientTimeoutsTest {
- // The minimum time left for any invocation of prepareForImmediateJaxRsCall().
+ // The minimum time left that allows for a single RPC to CC.
private static final Duration MINIMUM_TIME_LEFT = IN_PROCESS_OVERHEAD_PER_CALL
.plus(CONNECT_TIMEOUT)
- .plus(NETWORK_OVERHEAD_PER_CALL)
+ .plus(DOWNSTREAM_OVERHEAD_PER_CALL)
.plus(MIN_SERVER_TIMEOUT);
static {
- assertEquals(Duration.ofMillis(160), MINIMUM_TIME_LEFT);
+ assertEquals(Duration.ofMillis(410), MINIMUM_TIME_LEFT);
}
- // The minimum time left (= original time) which is required to allow any requests to the CC.
+ // The minimum time left (= original time) that allows for NUM_CALLS RPCs to CC.
private static final Duration MINIMUM_ORIGINAL_TIMEOUT = MINIMUM_TIME_LEFT
.multipliedBy(NUM_CALLS)
.plus(IN_PROCESS_OVERHEAD);
static {
- assertEquals(Duration.ofMillis(420), MINIMUM_ORIGINAL_TIMEOUT);
+ assertEquals(Duration.ofMillis(920), MINIMUM_ORIGINAL_TIMEOUT);
}
private final ManualClock clock = new ManualClock();
@@ -92,9 +92,9 @@ public class ClusterControllerClientTimeoutsTest {
}
private void assertStandardTimeouts() {
- assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeoutOrThrow());
- assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeoutOrThrow());
- assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeoutOrThrow());
+ assertEquals(Duration.ofMillis(100), timeouts.getConnectTimeoutOrThrow());
+ assertEquals(Duration.ofMillis(1250), timeouts.getReadTimeoutOrThrow());
+ assertEquals(Duration.ofMillis(1050), timeouts.getServerTimeoutOrThrow());
}
@Test
@@ -138,7 +138,7 @@ public class ClusterControllerClientTimeoutsTest {
fail();
} catch (UncheckedTimeoutException e) {
assertEquals(
- "Server would be given too little time to complete: PT0.0095S. Original timeout was PT0.419S",
+ "Server would be given too little time to complete: PT0.0095S. Original timeout was PT0.919S",
e.getMessage());
}
}
diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java
index 1f505991476..3b0b1a43085 100644
--- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java
+++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java
@@ -44,7 +44,7 @@ public class RetryingClusterControllerClientFactoryTest {
ArgumentCaptor<ClusterControllerStateRequest> requestCaptor = ArgumentCaptor.forClass(ClusterControllerStateRequest.class);
- verify(api, times(1)).setNodeState(eq(clusterName), eq(storageNode), eq(4.8f), requestCaptor.capture());
+ verify(api, times(1)).setNodeState(eq(clusterName), eq(storageNode), eq(4.55f), requestCaptor.capture());
ClusterControllerStateRequest request = requestCaptor.getValue();
assertEquals(ClusterControllerStateRequest.Condition.SAFE, request.condition);
Map<String, Object> expectedState = new HashMap<>();