1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
package com.yahoo.vespa.orchestrator.controller;
import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.yahoo.time.TimeBudget;
import com.yahoo.vespa.jaxrs.client.JaxRsTimeouts;
import java.time.Duration;
/**
* Calculates various timeouts associated with a REST call from the Orchestrator to the Cluster Controller.
*
* <p>Timeout handling of HTTP messaging is fundamentally flawed in various Java implementations.
* We would like to specify a max time for the whole operation (connect, send request, and receive response).
* The Jersey JAX-RS implementation and the Apache HTTP client library provide a way to set the connect timeout C
* and read timeout R. So if the operation takes NR reads, and the writes take TW time,
* the theoretical max time is: T = C + R * NR + TW. With both NR and TW unknown, there's no way to
* set a proper C and R.</p>
*
* <p>The various timeouts are set according to the following considerations:</p>
*
* <ol>
* <li>Some time is reserved for the execution in this process, e.g. execution leading to the REST call,
* handling of the response, exception handling, etc, such that we can finish processing this request
* before the {@link #timeBudget} deadline. This is typically in the order of ms.</li>
* <li>A timeout will be passed to the Cluster Controller backend. We'll give a timeout such that if one
* CC times out, the next CC will be given exactly the same timeout. This may or may not be a good strategy:
* (A) There's typically a 3rd CC. But if the first 2 fail with timeout, the chance the last is OK
* is negligible. (B) If picking the CC is random, then giving the full timeout to the first
* should be sufficient since a later retry will hit the healthy CC. (C) Because we have been using
* DROP in networking rules, clients may time out (host out of app or whatever). This would suggest
* allowing more than 1 full request.</li>
* <li>The timeout passed to the CC backend should be such that if it honors that, the Orchestrator
* should not time out. This means some kernel and network overhead should be subtracted from the timeout
* passed to the CC.</li>
* <li>We're only able to set the connect and read/write timeouts(!) Since we're communicating within
* a data center, assume connections are in the order of ms, while a single read may stall for close to
* the CC timeout.</li>
* </ol>
*
* @author hakonhall
*/
public class ClusterControllerClientTimeouts implements JaxRsTimeouts {

    /** Connect timeout, chosen for connections made within a data center. */
    static final Duration CONNECT_TIMEOUT = Duration.ofMillis(100);

    /** Overhead in this process that is charged to every individual call. */
    static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(100);

    /** In-process kernel overhead, network overhead, server kernel overhead, and server in-process overhead. */
    static final Duration DOWNSTREAM_OVERHEAD_PER_CALL = CONNECT_TIMEOUT.plus(Duration.ofMillis(100));

    /** Minimum time reserved for post-RPC processing to finish BEFORE the deadline, including ZK write. */
    static final Duration IN_PROCESS_OVERHEAD = Duration.ofMillis(100);

    /** Number of JAX-RS RPC calls to account for within the time budget. */
    static final int NUM_CALLS = 2;

    /** The smallest timeout we are willing to hand to the server. */
    static final Duration MIN_SERVER_TIMEOUT = Duration.ofMillis(10);

    private final String clusterName;
    private final TimeBudget timeBudget;
    private final Duration maxClientTimeout;

    /**
     * Creates a timeouts instance.
     *
     * <p>The {@link #timeBudget} SHOULD be the time budget for a single logical call to the Cluster Controller.
     * A logical call to the CC may in fact call the CC several times, if the first ones are down and/or not
     * the master.</p>
     *
     * <p>The original timeout of the budget is read with {@code originalTimeout().get()}, so a budget
     * without an original timeout is presumably not supported here (verify against {@code TimeBudget}).</p>
     *
     * @param clusterName The name of the content cluster this request is for.
     * @param timeBudget  The time budget for a single logical call to the Cluster Controller.
     */
    public ClusterControllerClientTimeouts(String clusterName, TimeBudget timeBudget) {
        this.clusterName = clusterName;
        this.timeBudget = timeBudget;

        // originalTimeout = inProcessOverhead + numCalls * maxClientTimeout
        Duration originalTimeout = timeBudget.originalTimeout().get();
        Duration sharedBetweenCalls = originalTimeout.minus(IN_PROCESS_OVERHEAD);
        this.maxClientTimeout = sharedBetweenCalls.dividedBy(NUM_CALLS);
    }

    /** Returns the fixed {@link #CONNECT_TIMEOUT}; this method never throws. */
    @Override
    public Duration getConnectTimeoutOrThrow() {
        return CONNECT_TIMEOUT;
    }

    /**
     * Returns the read timeout to use for the next call.
     *
     * <p>The client timeout for the call is the smaller of the time left in the budget and the per-call
     * maximum computed at construction. The read timeout is what remains of the client timeout after
     * subtracting {@link #IN_PROCESS_OVERHEAD_PER_CALL} and {@link #CONNECT_TIMEOUT}.</p>
     *
     * @throws UncheckedTimeoutException if the time left, the client timeout, or the resulting read timeout
     *                                   is not at least 1 ms (the checks are made on whole milliseconds)
     */
    @Override
    public Duration getReadTimeoutOrThrow() {
        Duration remaining = timeBudget.timeLeft().get();
        if (remaining.toMillis() <= 0) {
            throw new UncheckedTimeoutException("Exceeded the timeout " + timeBudget.originalTimeout().get() +
                    " against content cluster '" + clusterName + "' by " + remaining.negated());
        }

        // Never spend more than the per-call share, and never more than what is actually left.
        Duration clientTimeout = remaining.compareTo(maxClientTimeout) < 0 ? remaining : maxClientTimeout;
        requirePositive(remaining, clientTimeout);

        // clientTimeout = overheadPerCall + connectTimeout + readTimeout
        Duration readTimeout = clientTimeout.minus(IN_PROCESS_OVERHEAD_PER_CALL).minus(CONNECT_TIMEOUT);
        requirePositive(remaining, readTimeout);

        return readTimeout;
    }

    /**
     * Returns the timeout to pass on to the server: the read timeout minus
     * {@link #DOWNSTREAM_OVERHEAD_PER_CALL}.
     *
     * @throws UncheckedTimeoutException if the read timeout cannot be computed (see
     *                                   {@link #getReadTimeoutOrThrow()}), or if the server timeout would be
     *                                   below {@link #MIN_SERVER_TIMEOUT} (compared on whole milliseconds)
     */
    public Duration getServerTimeoutOrThrow() {
        // readTimeout = networkOverhead + serverTimeout
        Duration serverTimeout = getReadTimeoutOrThrow().minus(DOWNSTREAM_OVERHEAD_PER_CALL);
        if (serverTimeout.toMillis() >= MIN_SERVER_TIMEOUT.toMillis()) {
            return serverTimeout;
        }

        throw new UncheckedTimeoutException("Server would be given too little time to complete: " +
                serverTimeout + ". Original timeout was " + timeBudget.originalTimeout().get());
    }

    /**
     * Throws unless {@code candidate} is at least 1 ms long.
     *
     * @param timeLeft  the time left in the budget, only used in the error message
     * @param candidate the duration that must be positive
     * @throws UncheckedTimeoutException if {@code candidate} is 0 ms or less, in whole milliseconds
     */
    private void requirePositive(Duration timeLeft, Duration candidate) {
        if (candidate.toMillis() > 0) {
            return;
        }

        throw new UncheckedTimeoutException("Too little time left (" + timeLeft +
                ") to call content cluster '" + clusterName +
                "', original timeout was " + timeBudget.originalTimeout().get());
    }
}
|