From 629cfc4d314f761ca632ae61639905cc9d1e3fa6 Mon Sep 17 00:00:00 2001 From: HÃ¥kon Hallingstad Date: Tue, 23 Oct 2018 15:44:56 +0200 Subject: Enforce CC timeouts in Orchestrator 2 --- orchestrator/pom.xml | 6 + .../com/yahoo/vespa/orchestrator/Orchestrator.java | 9 -- .../vespa/orchestrator/OrchestratorContext.java | 70 +++++++--- .../yahoo/vespa/orchestrator/OrchestratorImpl.java | 37 ++--- .../controller/ClusterControllerClientImpl.java | 41 ++---- .../ClusterControllerClientTimeouts.java | 125 +++++++++++++++++ .../RetryingClusterControllerClientFactory.java | 34 +++-- ...ngleInstanceClusterControllerClientFactory.java | 54 ------- .../yahoo/vespa/orchestrator/model/NodeGroup.java | 2 +- .../orchestrator/status/InMemoryStatusService.java | 7 +- .../vespa/orchestrator/status/StatusService.java | 8 +- .../status/ZookeeperStatusService.java | 29 ++-- .../vespa/orchestrator/OrchestratorImplTest.java | 16 ++- .../controller/ClusterControllerClientTest.java | 8 +- .../ClusterControllerClientTimeoutsTest.java | 155 +++++++++++++++++++++ ...RetryingClusterControllerClientFactoryTest.java | 54 +++++++ ...InstanceClusterControllerClientFactoryTest.java | 116 --------------- .../orchestrator/resources/HostResourceTest.java | 10 +- .../status/ZookeeperStatusServiceTest.java | 21 +-- 19 files changed, 496 insertions(+), 306 deletions(-) create mode 100644 orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java delete mode 100644 orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactory.java create mode 100644 orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java create mode 100644 orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java delete mode 100644 orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java (limited to 'orchestrator') diff --git a/orchestrator/pom.xml b/orchestrator/pom.xml index ae05a1908c9..37d308111ae 100644 --- a/orchestrator/pom.xml +++ b/orchestrator/pom.xml @@ -116,6 +116,12 @@ ${project.version} provided + + com.yahoo.vespa + testutil + ${project.version} + test + org.apache.curator curator-test diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java index ab61a51c418..e37f5ad4d70 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/Orchestrator.java @@ -73,15 +73,6 @@ public interface Orchestrator { */ void acquirePermissionToRemove(HostName hostName) throws OrchestrationException; - /** - * Suspend normal operations for a group of nodes in the same application. - * - * @param nodeGroup The group of nodes in an application. - * @throws HostStateChangeDeniedException if the request cannot be meet due to policy constraints. - * @throws HostNameNotFoundException if any hostnames in the node group is not recognized - */ - void suspendGroup(NodeGroup nodeGroup) throws HostStateChangeDeniedException, HostNameNotFoundException; - /** * Suspend several hosts. On failure, all hosts are resumed before exiting the method with an exception. */ diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java index 6577b4b96cc..25c6c780130 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorContext.java @@ -1,40 +1,74 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator; -import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.time.TimeBudget; +import com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts; import java.time.Clock; import java.time.Duration; +import java.time.Instant; +import java.util.Optional; /** - * Context for the Orchestrator, e.g. timeout management. + * Context for an operation (or suboperation) of the Orchestrator that needs to pass through to the backend, + * e.g. timeout management and probing. * - * @author hakon + * @author hakonhall */ public class OrchestratorContext { - private static final Duration DEFAULT_TIMEOUT = Duration.ofSeconds(10); + private static final Duration DEFAULT_TIMEOUT_FOR_SINGLE_OP = Duration.ofSeconds(10); + private static final Duration DEFAULT_TIMEOUT_FOR_BATCH_OP = Duration.ofSeconds(60); - private TimeBudget timeBudget; + private final Clock clock; + private final TimeBudget timeBudget; + private boolean probe; - public OrchestratorContext(Clock clock) { - this.timeBudget = TimeBudget.fromNow(clock, DEFAULT_TIMEOUT); + public static OrchestratorContext createContextForMultiAppOp(Clock clock) { + return new OrchestratorContext(clock, TimeBudget.fromNow(clock, DEFAULT_TIMEOUT_FOR_BATCH_OP), true); } - /** Get the original timeout in seconds. */ - public long getOriginalTimeoutInSeconds() { - return timeBudget.originalTimeout().get().getSeconds(); + public static OrchestratorContext createContextForSingleAppOp(Clock clock) { + return new OrchestratorContext(clock, TimeBudget.fromNow(clock, DEFAULT_TIMEOUT_FOR_SINGLE_OP), true); } - /** - * Get timeout for a suboperation that should take up {@code shareOfRemaining} of the - * remaining time, or throw an {@link UncheckedTimeoutException} if timed out. - */ - public float getSuboperationTimeoutInSeconds(float shareOfRemaining) { - if (!(0f <= shareOfRemaining && shareOfRemaining <= 1.0f)) { - throw new IllegalArgumentException("Share of remaining time must be between 0 and 1: " + shareOfRemaining); + private OrchestratorContext(Clock clock, TimeBudget timeBudget, boolean probe) { + this.clock = clock; + this.timeBudget = timeBudget; + this.probe = probe; + } + + public Duration getTimeLeft() { + return timeBudget.timeLeftOrThrow().get(); + } + + public ClusterControllerClientTimeouts getClusterControllerTimeouts(String clusterName) { + return new ClusterControllerClientTimeouts(clusterName, timeBudget.timeLeftAsTimeBudget()); + } + + + /** Mark this operation as a non-committal probe. */ + public OrchestratorContext markAsProbe() { + this.probe = true; + return this; + } + + /** Whether the operation is a no-op probe to test whether it would have succeeded, if it had been committal. */ + public boolean isProbe() { + return probe; + } + + /** Create an OrchestratorContext to use within an application lock. */ + public OrchestratorContext createSubcontextForApplication() { + Instant now = clock.instant(); + Instant deadline = timeBudget.deadline().get(); + Instant maxDeadline = now.plus(DEFAULT_TIMEOUT_FOR_SINGLE_OP); + if (maxDeadline.compareTo(deadline) < 0) { + deadline = maxDeadline; } - return shareOfRemaining * timeBudget.timeLeftOrThrow().get().toMillis() / 1000.0f; + return new OrchestratorContext( + clock, + TimeBudget.from(clock, now, Optional.of(Duration.between(now, deadline))), + probe); } } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorImpl.java index ad8a35312e4..3a2673d9dbc 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/OrchestratorImpl.java @@ -105,7 +105,9 @@ public class OrchestratorImpl implements Orchestrator { @Override public void setNodeStatus(HostName hostName, HostStatus status) throws OrchestrationException { ApplicationInstanceReference reference = getApplicationInstance(hostName).reference(); - try (MutableStatusRegistry statusRegistry = statusService.lockApplicationInstance_forCurrentThreadOnly(reference)) { + OrchestratorContext context = OrchestratorContext.createContextForSingleAppOp(clock); + try (MutableStatusRegistry statusRegistry = statusService + .lockApplicationInstance_forCurrentThreadOnly(reference, context.getTimeLeft())) { statusRegistry.setHostState(hostName, status); } } @@ -127,10 +129,10 @@ public class OrchestratorImpl implements Orchestrator { ApplicationInstance appInstance = getApplicationInstance(hostName); - OrchestratorContext context = new OrchestratorContext(clock); + OrchestratorContext context = OrchestratorContext.createContextForSingleAppOp(clock); try (MutableStatusRegistry statusRegistry = statusService.lockApplicationInstance_forCurrentThreadOnly( appInstance.reference(), - context.getOriginalTimeoutInSeconds())) { + context.getTimeLeft())) { final HostStatus currentHostState = statusRegistry.getHostStatus(hostName); if (HostStatus.NO_REMARKS == currentHostState) { @@ -148,7 +150,7 @@ public class OrchestratorImpl implements Orchestrator { public void suspend(HostName hostName) throws HostStateChangeDeniedException, HostNameNotFoundException { ApplicationInstance appInstance = getApplicationInstance(hostName); NodeGroup nodeGroup = new NodeGroup(appInstance, hostName); - suspendGroup(nodeGroup); + suspendGroup(OrchestratorContext.createContextForSingleAppOp(clock), nodeGroup); } @Override @@ -156,10 +158,10 @@ public class OrchestratorImpl implements Orchestrator { ApplicationInstance appInstance = getApplicationInstance(hostName); NodeGroup nodeGroup = new NodeGroup(appInstance, hostName); - OrchestratorContext context = new OrchestratorContext(clock); + OrchestratorContext context = OrchestratorContext.createContextForSingleAppOp(clock); try (MutableStatusRegistry statusRegistry = statusService.lockApplicationInstance_forCurrentThreadOnly( appInstance.reference(), - context.getOriginalTimeoutInSeconds())) { + context.getTimeLeft())) { ApplicationApi applicationApi = new ApplicationApiImpl( nodeGroup, statusRegistry, @@ -169,16 +171,19 @@ public class OrchestratorImpl implements Orchestrator { } } - // Public for testing purposes - @Override - public void suspendGroup(NodeGroup nodeGroup) throws HostStateChangeDeniedException, HostNameNotFoundException { + /** + * Suspend normal operations for a group of nodes in the same application. + * + * @param nodeGroup The group of nodes in an application. + * @throws HostStateChangeDeniedException if the request cannot be meet due to policy constraints. + */ + void suspendGroup(OrchestratorContext context, NodeGroup nodeGroup) throws HostStateChangeDeniedException { ApplicationInstanceReference applicationReference = nodeGroup.getApplicationReference(); - OrchestratorContext context = new OrchestratorContext(clock); try (MutableStatusRegistry hostStatusRegistry = statusService.lockApplicationInstance_forCurrentThreadOnly( applicationReference, - context.getOriginalTimeoutInSeconds())) { + context.getTimeLeft())) { ApplicationInstanceStatus appStatus = statusService.forApplicationInstance(applicationReference).getApplicationInstanceStatus(); if (appStatus == ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN) { return; @@ -224,14 +229,12 @@ public class OrchestratorImpl implements Orchestrator { throw new BatchHostNameNotFoundException(parentHostname, hostNames, e); } + OrchestratorContext context = OrchestratorContext.createContextForMultiAppOp(clock); for (NodeGroup nodeGroup : nodeGroupsOrderedByApplication) { try { - suspendGroup(nodeGroup); + suspendGroup(context.createSubcontextForApplication(), nodeGroup); } catch (HostStateChangeDeniedException e) { throw new BatchHostStateChangeDeniedException(parentHostname, nodeGroup, e); - } catch (HostNameNotFoundException e) { - // Should never get here since since we would have received HostNameNotFoundException earlier. - throw new BatchHostNameNotFoundException(parentHostname, hostNames, e); } catch (RuntimeException e) { throw new BatchInternalErrorException(parentHostname, nodeGroup, e); } @@ -301,12 +304,12 @@ public class OrchestratorImpl implements Orchestrator { private void setApplicationStatus(ApplicationId appId, ApplicationInstanceStatus status) throws ApplicationStateChangeDeniedException, ApplicationIdNotFoundException{ - OrchestratorContext context = new OrchestratorContext(clock); + OrchestratorContext context = OrchestratorContext.createContextForSingleAppOp(clock); ApplicationInstanceReference appRef = OrchestratorUtil.toApplicationInstanceReference(appId, instanceLookupService); try (MutableStatusRegistry statusRegistry = statusService.lockApplicationInstance_forCurrentThreadOnly( appRef, - context.getOriginalTimeoutInSeconds())) { + context.getTimeLeft())) { // Short-circuit if already in wanted state if (status == statusRegistry.getApplicationInstanceStatus()) return; diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java index 467b534f809..50782ea081f 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientImpl.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator.controller; +import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.vespa.jaxrs.client.JaxRsStrategy; import com.yahoo.vespa.orchestrator.OrchestratorContext; @@ -15,23 +16,6 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ public static final String REQUEST_REASON = "Orchestrator"; - // On setNodeState calls against the CC ensemble. - // - // We'd like to set a timeout for the request to the first CC such that if the first - // CC is faulty, there's sufficient time to send the request to the second and third CC. - // The timeouts would be: - // timeout(1. request) = SHARE_REMAINING_TIME * T - // timeout(2. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME) - // timeout(3. request) = SHARE_REMAINING_TIME * T * (1 - SHARE_REMAINING_TIME)^2 - // - // Using a share of 50% gives approximately: - // timeout(1. request) = T * 0.5 - // timeout(2. request) = T * 0.25 - // timeout(3. request) = T * 0.125 - // - // which seems fine - public static final float SHARE_REMAINING_TIME = 0.5f; - private final JaxRsStrategy clusterControllerApi; private final String clusterName; @@ -52,15 +36,16 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ ClusterControllerNodeState wantedState) throws IOException { ClusterControllerStateRequest.State state = new ClusterControllerStateRequest.State(wantedState, REQUEST_REASON); ClusterControllerStateRequest stateRequest = new ClusterControllerStateRequest(state, ClusterControllerStateRequest.Condition.SAFE); + ClusterControllerClientTimeouts timeouts = context.getClusterControllerTimeouts(clusterName); try { return clusterControllerApi.apply(api -> api.setNodeState( clusterName, storageNodeIndex, - context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME), - stateRequest) - ); - } catch (IOException e) { + timeouts.getServerTimeout().toMillis() / 1000.0f, + stateRequest), + timeouts); + } catch (IOException | UncheckedTimeoutException e) { String message = String.format( "Giving up setting %s for storage node with index %d in cluster %s", stateRequest, @@ -79,16 +64,18 @@ public class ClusterControllerClientImpl implements ClusterControllerClient{ @Override public ClusterControllerStateResponse setApplicationState( OrchestratorContext context, - final ClusterControllerNodeState wantedState) throws IOException { - final ClusterControllerStateRequest.State state = new ClusterControllerStateRequest.State(wantedState, REQUEST_REASON); - final ClusterControllerStateRequest stateRequest = new ClusterControllerStateRequest(state, ClusterControllerStateRequest.Condition.FORCE); + ClusterControllerNodeState wantedState) throws IOException { + ClusterControllerStateRequest.State state = new ClusterControllerStateRequest.State(wantedState, REQUEST_REASON); + ClusterControllerStateRequest stateRequest = new ClusterControllerStateRequest(state, ClusterControllerStateRequest.Condition.FORCE); + ClusterControllerClientTimeouts timeouts = context.getClusterControllerTimeouts(clusterName); try { return clusterControllerApi.apply(api -> api.setClusterState( clusterName, - context.getSuboperationTimeoutInSeconds(SHARE_REMAINING_TIME), - stateRequest)); - } catch (IOException e) { + timeouts.getServerTimeout().toMillis() / 1000.0f, + stateRequest), + timeouts); + } catch (IOException | UncheckedTimeoutException e) { final String message = String.format( "Giving up setting %s for cluster %s", stateRequest, diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java new file mode 100644 index 00000000000..6c08ae53e70 --- /dev/null +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeouts.java @@ -0,0 +1,125 @@ +package com.yahoo.vespa.orchestrator.controller; + +import com.google.common.util.concurrent.UncheckedTimeoutException; +import com.yahoo.time.TimeBudget; +import com.yahoo.vespa.jaxrs.client.JaxRsTimeouts; + +import java.time.Duration; + +/** + * Calculates various timeouts associated with a REST call from the Orchestrator to the Cluster Controller. + * + *

Timeout handling of HTTP messaging is fundamentally flawed in various Java implementations. + * We would like to specify a max time for the whole operation (connect, send request, and receive response). + * Jersey JAX-RS implementation and the Apache HTTP client library provides a way to set the connect timeout C + * and read timeout R. So if the operation takes NR reads, and the writes takes TW time, + * the theoretical max time is: T = C + R * NR + TW. With both NR and TW unknown, there's no way to + * set a proper C and R.

+ * + *

The various timeouts is set according to the following considerations:

+ * + *
    + *
  1. Some time is reserved for the execution in this process, e.g. execution leading to the REST call, + * handling of the response, exception handling, etc, such that we can finish processing this request + * before the {@link #timeBudget} deadline. This is typically in the order of ms.
  2. + *
  3. A timeout will be passed to the Cluster Controller backend. We'll give a timeout such that if one + * CC times out, the next CC will be given exactly the same timeout. This may or may not be a good strategy: + * (A) There's typically a 3rd CC. But if the first 2 fails with timeout, the chance the last is OK + * is negligible. (B) If picking the CC is random, then giving the full timeout to the first + * should be sufficient since a later retry will hit the healthy CC. (C) Because we have been using + * DROP in networking rules, clients may time out (host out of app or whatever). This would suggest + * allowing more than 1 full request.
  4. + *
  5. The timeout passed to the CC backend should be such that if it honors that, the Orchestrator + * should not time out. This means some kernel and network overhead should be subtracted from the timeout + * passed to the CC.
  6. + *
  7. We're only able to set the connect and read/write timeouts(!) Since we're communicating within + * data center, assume connections are in the order of ms, while a single read may stall close up to the CC + * timeout.
  8. + *
+ * + * @author hakonhall + */ +public class ClusterControllerClientTimeouts implements JaxRsTimeouts { + // In data center connect timeout + static final Duration CONNECT_TIMEOUT = Duration.ofMillis(50); + // Per call overhead + static final Duration IN_PROCESS_OVERHEAD_PER_CALL = Duration.ofMillis(50); + // In data center kernel and network overhead. + static final Duration NETWORK_OVERHEAD_PER_CALL = CONNECT_TIMEOUT; + // Minimum time reserved for post-RPC processing to finish BEFORE the deadline, including ZK write. + static final Duration IN_PROCESS_OVERHEAD = Duration.ofMillis(100); + // Number of JAX-RS RPC calls to account for within the time budget. + static final int NUM_CALLS = 2; + // Minimum server-side timeout + static final Duration MIN_SERVER_TIMEOUT = Duration.ofMillis(10); + + private final String clusterName; + private final TimeBudget timeBudget; + private final Duration maxClientTimeout; + + /** + * Creates a timeouts instance. + * + * The {@link #timeBudget} SHOULD be the time budget for a single logical call to the Cluster Controller. + * A logical call to CC may in fact call the CC several times, if the first onces are down and/or not + * the master. + * + * @param clusterName The name of the content cluster this request is for. + * @param timeBudget The time budget for a single logical call to the the Cluster Controller. + */ + public ClusterControllerClientTimeouts(String clusterName, TimeBudget timeBudget) { + this.clusterName = clusterName; + this.timeBudget = timeBudget; + + // timeLeft = inProcessOverhead + numCalls * clientTimeout + maxClientTimeout = timeBudget.originalTimeout().get().minus(IN_PROCESS_OVERHEAD).dividedBy(NUM_CALLS); + } + + @Override + public Duration getConnectTimeout() { + return CONNECT_TIMEOUT; + } + + @Override + public Duration getReadTimeout() { + Duration timeLeft = timeBudget.timeLeft().get(); + if (timeLeft.toMillis() <= 0) { + throw new UncheckedTimeoutException("Exceeded the timeout " + timeBudget.originalTimeout().get() + + " against content cluster '" + clusterName + "' by " + timeLeft.negated()); + } + + Duration clientTimeout = min(timeLeft, maxClientTimeout); + verifyPositive(timeLeft, maxClientTimeout); + + // clientTimeout = overheadPerCall + connectTimeout + readTimeout + Duration readTimeout = clientTimeout.minus(IN_PROCESS_OVERHEAD_PER_CALL).minus(CONNECT_TIMEOUT); + verifyPositive(timeLeft, readTimeout); + + return readTimeout; + } + + public Duration getServerTimeout() { + // readTimeout = networkOverhead + serverTimeout + Duration serverTimeout = getReadTimeout().minus(NETWORK_OVERHEAD_PER_CALL); + if (serverTimeout.toMillis() < MIN_SERVER_TIMEOUT.toMillis()) { + throw new UncheckedTimeoutException("Server would be given too little time to complete: " + + serverTimeout + ". Original timeout was " + timeBudget.originalTimeout().get()); + } + + return serverTimeout; + } + + private static Duration min(Duration a, Duration b) { + return a.compareTo(b) < 0 ? a : b; + } + + private void verifyLargerThan(Duration timeLeft, Duration availableDuration) { + if (availableDuration.toMillis() <= 0) { + throw new UncheckedTimeoutException("Too little time left (" + timeLeft + + ") to call content cluster '" + clusterName + + "', original timeout was " + timeBudget.originalTimeout().get()); + } + } + + private void verifyPositive(Duration timeLeft, Duration duration) { verifyLargerThan(timeLeft, duration); } +} diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java index 5571eedeec6..7a0b96c2231 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactory.java @@ -8,9 +8,8 @@ import com.yahoo.vespa.jaxrs.client.JaxRsStrategy; import com.yahoo.vespa.jaxrs.client.JaxRsStrategyFactory; import com.yahoo.vespa.jaxrs.client.JerseyJaxRsClientFactory; +import java.util.HashSet; import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; /** * @author bakksjo @@ -21,14 +20,12 @@ public class RetryingClusterControllerClientFactory implements ClusterController public static final int HARDCODED_CLUSTERCONTROLLER_PORT = 19050; public static final String CLUSTERCONTROLLER_API_PATH = "/"; public static final String CLUSTERCONTROLLER_SCHEME = "http"; - private static final int CLUSTER_CONTROLLER_CONNECT_TIMEOUT_MS = 1000; - private static final int CLUSTER_CONTROLLER_READ_TIMEOUT_MS = 1000; private JaxRsClientFactory jaxRsClientFactory; @Inject public RetryingClusterControllerClientFactory() { - this(new JerseyJaxRsClientFactory(CLUSTER_CONTROLLER_CONNECT_TIMEOUT_MS, CLUSTER_CONTROLLER_READ_TIMEOUT_MS)); + this(new JerseyJaxRsClientFactory()); } public RetryingClusterControllerClientFactory(JaxRsClientFactory jaxRsClientFactory) { @@ -36,19 +33,20 @@ public class RetryingClusterControllerClientFactory implements ClusterController } @Override - public ClusterControllerClient createClient(List clusterControllers, - String clusterName) { - Set clusterControllerSet = clusterControllers.stream().collect(Collectors.toSet()); - JaxRsStrategy jaxRsApi - = new JaxRsStrategyFactory(clusterControllerSet, HARDCODED_CLUSTERCONTROLLER_PORT, jaxRsClientFactory, CLUSTERCONTROLLER_SCHEME) - .apiWithRetries(ClusterControllerJaxRsApi.class, CLUSTERCONTROLLER_API_PATH) - // Use max iteration 1. The JaxRsStrategyFactory will try host 1, 2, then 3: - // - If host 1 responds, it will redirect to master if necessary. Otherwise - // - If host 2 responds, it will redirect to master if necessary. Otherwise - // - If host 3 responds, it may redirect to master if necessary (if they're up - // after all), but more likely there's no quorum and this will fail too. - .setMaxIterations(1); + public ClusterControllerClient createClient(List clusterControllers, String clusterName) { + JaxRsStrategy jaxRsApi = + new JaxRsStrategyFactory( + new HashSet<>(clusterControllers), + HARDCODED_CLUSTERCONTROLLER_PORT, + jaxRsClientFactory, + CLUSTERCONTROLLER_SCHEME) + .apiWithRetries(ClusterControllerJaxRsApi.class, CLUSTERCONTROLLER_API_PATH) + // Use max iteration 1. The JaxRsStrategyFactory will try host 1, 2, then 3: + // - If host 1 responds, it will redirect to master if necessary. Otherwise + // - If host 2 responds, it will redirect to master if necessary. Otherwise + // - If host 3 responds, it may redirect to master if necessary (if they're up + // after all), but more likely there's no quorum and this will fail too. + .setMaxIterations(1); return new ClusterControllerClientImpl(jaxRsApi, clusterName); } - } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactory.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactory.java deleted file mode 100644 index 7459f0a6b11..00000000000 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactory.java +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.orchestrator.controller; - -import com.yahoo.log.LogLevel; -import com.yahoo.vespa.applicationmodel.HostName; -import com.yahoo.vespa.jaxrs.client.JaxRsClientFactory; -import com.yahoo.vespa.jaxrs.client.JaxRsStrategy; -import com.yahoo.vespa.jaxrs.client.NoRetryJaxRsStrategy; - -import java.util.List; -import java.util.logging.Logger; - -/** - * @author bakksjo - */ -public class SingleInstanceClusterControllerClientFactory implements ClusterControllerClientFactory { - - public static final int CLUSTERCONTROLLER_HARDCODED_PORT = 19050; - public static final String CLUSTERCONTROLLER_HARDCODED_SCHEME = "http"; - public static final String CLUSTERCONTROLLER_API_PATH = "/"; - - private static final Logger log = Logger.getLogger(SingleInstanceClusterControllerClientFactory.class.getName()); - - private JaxRsClientFactory jaxRsClientFactory; - - public SingleInstanceClusterControllerClientFactory(JaxRsClientFactory jaxRsClientFactory) { - this.jaxRsClientFactory = jaxRsClientFactory; - } - - @Override - public ClusterControllerClient createClient(List clusterControllers, - String clusterName) { - if (clusterControllers.isEmpty()) { - throw new IllegalArgumentException("No cluster controller instances found"); - } - HostName controllerHostName = clusterControllers.iterator().next(); - int port = CLUSTERCONTROLLER_HARDCODED_PORT; // TODO: Get this from service monitor. - - log.log(LogLevel.DEBUG, () -> - "For cluster '" + clusterName + "' with controllers " + clusterControllers - + ", creating api client for " + controllerHostName.s() + ":" + port); - - JaxRsStrategy strategy = new NoRetryJaxRsStrategy<>( - controllerHostName, - port, - jaxRsClientFactory, - ClusterControllerJaxRsApi.class, - CLUSTERCONTROLLER_API_PATH, - CLUSTERCONTROLLER_HARDCODED_SCHEME); - - return new ClusterControllerClientImpl(strategy, clusterName); - } - -} diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/NodeGroup.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/NodeGroup.java index ed506c82079..74b4b534acc 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/NodeGroup.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/model/NodeGroup.java @@ -43,7 +43,7 @@ public class NodeGroup { } public List getHostNames() { - return hostNames.stream().collect(Collectors.toList()).stream().sorted().collect(Collectors.toList()); + return hostNames.stream().sorted().collect(Collectors.toList()); } public String toCommaSeparatedString() { diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/InMemoryStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/InMemoryStatusService.java index c5ae553a98c..bd5eb6f3e29 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/InMemoryStatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/InMemoryStatusService.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.orchestrator.status; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; import com.yahoo.vespa.applicationmodel.HostName; +import java.time.Duration; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -42,15 +43,11 @@ public class InMemoryStatusService implements StatusService { }; } - @Override - public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly(ApplicationInstanceReference applicationInstanceReference) { - return lockApplicationInstance_forCurrentThreadOnly(applicationInstanceReference, 10); - } @Override public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly( ApplicationInstanceReference applicationInstanceReference, - long timeoutSeconds) { + Duration timeout) { Lock lock = instanceLockService.get(applicationInstanceReference); return new InMemoryMutableStatusRegistry(lock, applicationInstanceReference); } diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/StatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/StatusService.java index c47be096242..768e5290412 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/StatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/StatusService.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.orchestrator.status; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; +import java.time.Duration; import java.util.Set; /** @@ -24,7 +25,7 @@ public interface StatusService { * possibly inconsistent snapshot values. It is not recommended that this method is used for anything other * than monitoring, logging, debugging, etc. It should never be used for multi-step operations (e.g. * read-then-write) where consistency is required. For those cases, use - * {@link #lockApplicationInstance_forCurrentThreadOnly(ApplicationInstanceReference)}. + * {@link #lockApplicationInstance_forCurrentThreadOnly(ApplicationInstanceReference, Duration)}. */ ReadOnlyStatusRegistry forApplicationInstance(ApplicationInstanceReference applicationInstanceReference); @@ -52,12 +53,9 @@ public interface StatusService { * this case, subsequent mutating operations will fail, but previous mutating operations are NOT rolled back. * This may leave the registry in an inconsistent state (as judged by the client code). */ - MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly(ApplicationInstanceReference applicationInstanceReference); - - /** Lock application instance with timeout. */ MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly( ApplicationInstanceReference applicationInstanceReference, - long timeoutSeconds); + Duration timeoutSeconds); /** * Returns all application instances that are allowed to be down. The intention is to use this diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java index deece6a4a65..7df29e038c1 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java @@ -56,21 +56,6 @@ public class ZookeeperStatusService implements StatusService { }; } - /** - * 1) locks the status service for an application instance. - * 2) fails all operations in this thread when the session is lost, - * since session loss might cause the lock to be lost. - * Since it only fails operations in this thread, - * all operations depending on a lock, including the locking itself, must be done in this thread. - * Note that since it is the thread that fails, all status operations in this thread will fail - * even if they're not supposed to be guarded by this lock - * (i.e. the request is for another applicationInstanceReference) - */ - @Override - public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly(ApplicationInstanceReference applicationInstanceReference) { - return lockApplicationInstance_forCurrentThreadOnly(applicationInstanceReference, 10); - } - @Override public Set getAllSuspendedApplications() { try { @@ -93,13 +78,23 @@ public class ZookeeperStatusService implements StatusService { } } + /** + * 1) locks the status service for an application instance. + * 2) fails all operations in this thread when the session is lost, + * since session loss might cause the lock to be lost. + * Since it only fails operations in this thread, + * all operations depending on a lock, including the locking itself, must be done in this thread. + * Note that since it is the thread that fails, all status operations in this thread will fail + * even if they're not supposed to be guarded by this lock + * (i.e. the request is for another applicationInstanceReference) + */ @Override public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly( ApplicationInstanceReference applicationInstanceReference, - long timeoutSeconds) { + Duration timeout) { String lockPath = applicationInstanceLock2Path(applicationInstanceReference); Lock lock = new Lock(lockPath, curator); - lock.acquire(Duration.ofSeconds(timeoutSeconds)); + lock.acquire(timeout); try { return new ZkMutableStatusRegistry(lock, applicationInstanceReference); diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java index 76d9398c44e..80174d05a54 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/OrchestratorImplTest.java @@ -43,6 +43,8 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.inOrder; import static org.mockito.Mockito.mock; @@ -245,6 +247,8 @@ public class OrchestratorImplTest { // A spy is preferential because suspendAll() relies on delegating the hard work to suspend() and resume(). OrchestratorImpl orchestrator = spy(this.orchestrator); + OrchestratorContext context = mock(OrchestratorContext.class); + orchestrator.suspendAll( new HostName("parentHostname"), Arrays.asList( @@ -257,9 +261,9 @@ public class OrchestratorImplTest { // TEST6: tenant-id-3:application-instance-3:default // TEST1: test-tenant-id:application:instance InOrder order = inOrder(orchestrator); - order.verify(orchestrator).suspendGroup(DummyInstanceLookupService.TEST3_NODE_GROUP); - order.verify(orchestrator).suspendGroup(DummyInstanceLookupService.TEST6_NODE_GROUP); - order.verify(orchestrator).suspendGroup(DummyInstanceLookupService.TEST1_NODE_GROUP); + order.verify(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST3_NODE_GROUP)); + order.verify(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST6_NODE_GROUP)); + order.verify(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST1_NODE_GROUP)); order.verifyNoMoreInteractions(); } @@ -272,7 +276,7 @@ public class OrchestratorImplTest { DummyInstanceLookupService.TEST6_HOST_NAME, "some-constraint", "error message"); - doThrow(supensionFailure).when(orchestrator).suspendGroup(DummyInstanceLookupService.TEST6_NODE_GROUP); + doThrow(supensionFailure).when(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST6_NODE_GROUP)); try { orchestrator.suspendAll( @@ -291,8 +295,8 @@ public class OrchestratorImplTest { } InOrder order = inOrder(orchestrator); - order.verify(orchestrator).suspendGroup(DummyInstanceLookupService.TEST3_NODE_GROUP); - order.verify(orchestrator).suspendGroup(DummyInstanceLookupService.TEST6_NODE_GROUP); + order.verify(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST3_NODE_GROUP)); + order.verify(orchestrator).suspendGroup(any(), eq(DummyInstanceLookupService.TEST6_NODE_GROUP)); order.verifyNoMoreInteractions(); } diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java index 228174a9b3d..91909391fe7 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTest.java @@ -6,7 +6,9 @@ import com.yahoo.vespa.jaxrs.client.LocalPassThroughJaxRsStrategy; import com.yahoo.vespa.orchestrator.OrchestratorContext; import org.junit.Test; -import static org.mockito.Matchers.anyFloat; +import java.time.Duration; + +import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; @@ -28,7 +30,9 @@ public class ClusterControllerClientTest { final ClusterControllerNodeState wantedState = ClusterControllerNodeState.MAINTENANCE; OrchestratorContext context = mock(OrchestratorContext.class); - when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); + ClusterControllerClientTimeouts timeouts = mock(ClusterControllerClientTimeouts.class); + when(context.getClusterControllerTimeouts(any())).thenReturn(timeouts); + when(timeouts.getServerTimeout()).thenReturn(Duration.ofSeconds(1)); clusterControllerClient.setNodeState(context, STORAGE_NODE_INDEX, wantedState); final ClusterControllerStateRequest expectedNodeStateRequest = new ClusterControllerStateRequest( diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java new file mode 100644 index 00000000000..36dd4c4a83f --- /dev/null +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/ClusterControllerClientTimeoutsTest.java @@ -0,0 +1,155 @@ +package com.yahoo.vespa.orchestrator.controller; + +import com.google.common.util.concurrent.UncheckedTimeoutException; +import com.yahoo.test.ManualClock; +import com.yahoo.time.TimeBudget; +import org.junit.Before; +import org.junit.Test; + +import java.time.Duration; +import java.util.Optional; + +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.CONNECT_TIMEOUT; +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.IN_PROCESS_OVERHEAD; +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.IN_PROCESS_OVERHEAD_PER_CALL; +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.MIN_SERVER_TIMEOUT; +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.NETWORK_OVERHEAD_PER_CALL; +import static com.yahoo.vespa.orchestrator.controller.ClusterControllerClientTimeouts.NUM_CALLS; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class ClusterControllerClientTimeoutsTest { + // The minimum time left for any invocation of prepareForImmediateJaxRsCall(). + private static final Duration MINIMUM_TIME_LEFT = IN_PROCESS_OVERHEAD_PER_CALL + .plus(CONNECT_TIMEOUT) + .plus(NETWORK_OVERHEAD_PER_CALL) + .plus(MIN_SERVER_TIMEOUT); + static { + assertEquals(Duration.ofMillis(160), MINIMUM_TIME_LEFT); + } + + // The minimum time left (= original time) which is required to allow any requests to the CC. + private static final Duration MINIMUM_ORIGINAL_TIMEOUT = MINIMUM_TIME_LEFT + .multipliedBy(NUM_CALLS) + .plus(IN_PROCESS_OVERHEAD); + static { + assertEquals(Duration.ofMillis(420), MINIMUM_ORIGINAL_TIMEOUT); + } + + private final ManualClock clock = new ManualClock(); + + private Duration originalTimeout; + private TimeBudget timeBudget; + private ClusterControllerClientTimeouts timeouts; + + private void makeTimeouts(Duration originalTimeout) { + this.originalTimeout = originalTimeout; + this.timeBudget = TimeBudget.from(clock, clock.instant(), Optional.of(originalTimeout)); + this.timeouts = new ClusterControllerClientTimeouts("clustername", timeBudget); + } + + @Before + public void setUp() { + makeTimeouts(Duration.ofSeconds(3)); + } + + @Test + public void makes2RequestsWithMaxProcessingTime() { + assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeout()); + assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeout()); + assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeout()); + + Duration maxProcessingTime = IN_PROCESS_OVERHEAD_PER_CALL + .plus(CONNECT_TIMEOUT) + .plus(timeouts.getReadTimeout()); + assertEquals(1450, maxProcessingTime.toMillis()); + clock.advance(maxProcessingTime); + + assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeout()); + assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeout()); + assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeout()); + + clock.advance(maxProcessingTime); + + try { + timeouts.getServerTimeout(); + fail(); + } catch (UncheckedTimeoutException e) { + assertEquals( + "Too little time left (PT0.1S) to call content cluster 'clustername', original timeout was PT3S", + e.getMessage()); + } + } + + @Test + public void makesAtLeast3RequestsWithShortProcessingTime() { + assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeout()); + assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeout()); + assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeout()); + + Duration shortPocessingTime = Duration.ofMillis(200); + clock.advance(shortPocessingTime); + + assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeout()); + assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeout()); + assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeout()); + + clock.advance(shortPocessingTime); + + assertEquals(Duration.ofMillis(50), timeouts.getConnectTimeout()); + assertEquals(Duration.ofMillis(1350), timeouts.getReadTimeout()); + assertEquals(Duration.ofMillis(1300), timeouts.getServerTimeout()); + } + + @Test + public void alreadyTimedOut() { + clock.advance(Duration.ofSeconds(4)); + + try { + timeouts.getServerTimeout(); + fail(); + } catch (UncheckedTimeoutException e) { + assertEquals( + "Exceeded the timeout PT3S against content cluster 'clustername' by PT1S", + e.getMessage()); + } + } + + @Test + public void justTooLittleTime() { + clock.advance(originalTimeout.minus(MINIMUM_TIME_LEFT).plus(Duration.ofMillis(1))); + try { + timeouts.getServerTimeout(); + fail(); + } catch (UncheckedTimeoutException e) { + assertEquals( + "Server would be given too little time to complete: PT0.009S. Original timeout was PT3S", + e.getMessage()); + } + } + + @Test + public void justEnoughTime() { + clock.advance(originalTimeout.minus(MINIMUM_TIME_LEFT)); + timeouts.getServerTimeout(); + } + + @Test + public void justTooLittleInitialTime() { + makeTimeouts(MINIMUM_ORIGINAL_TIMEOUT.minus(Duration.ofMillis(1))); + try { + timeouts.getServerTimeout(); + fail(); + } catch (UncheckedTimeoutException e) { + assertEquals( + "Server would be given too little time to complete: PT0.0095S. Original timeout was PT0.419S", + e.getMessage()); + } + } + + @Test + public void justEnoughInitialTime() { + makeTimeouts(MINIMUM_ORIGINAL_TIMEOUT); + timeouts.getServerTimeout(); + } +} \ No newline at end of file diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java new file mode 100644 index 00000000000..1f505991476 --- /dev/null +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/RetryingClusterControllerClientFactoryTest.java @@ -0,0 +1,54 @@ +package com.yahoo.vespa.orchestrator.controller; + +import com.yahoo.test.ManualClock; +import com.yahoo.vespa.applicationmodel.HostName; +import com.yahoo.vespa.jaxrs.client.JaxRsClientFactory; +import com.yahoo.vespa.orchestrator.OrchestratorContext; +import org.junit.Test; +import org.mockito.ArgumentCaptor; + +import java.io.IOException; +import java.time.Clock; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class RetryingClusterControllerClientFactoryTest { + private final Clock clock = new ManualClock(); + + @Test + public void verifyJerseyCallForSetNodeState() throws IOException { + JaxRsClientFactory clientFactory = mock(JaxRsClientFactory.class); + ClusterControllerJaxRsApi api = mock(ClusterControllerJaxRsApi.class); + when(clientFactory.createClient(any())).thenReturn(api); + RetryingClusterControllerClientFactory factory = new RetryingClusterControllerClientFactory(clientFactory); + String clusterName = "clustername"; + HostName host1 = new HostName("host1"); + HostName host2 = new HostName("host2"); + HostName host3 = new HostName("host3"); + List clusterControllers = Arrays.asList(host1, host2, host3); + ClusterControllerClient client = factory.createClient(clusterControllers, clusterName); + OrchestratorContext context = OrchestratorContext.createContextForSingleAppOp(clock); + int storageNode = 2; + ClusterControllerNodeState wantedState = ClusterControllerNodeState.MAINTENANCE; + client.setNodeState(context, storageNode, wantedState); + + ArgumentCaptor requestCaptor = ArgumentCaptor.forClass(ClusterControllerStateRequest.class); + + verify(api, times(1)).setNodeState(eq(clusterName), eq(storageNode), eq(4.8f), requestCaptor.capture()); + ClusterControllerStateRequest request = requestCaptor.getValue(); + assertEquals(ClusterControllerStateRequest.Condition.SAFE, request.condition); + Map expectedState = new HashMap<>(); + expectedState.put("user", new ClusterControllerStateRequest.State(wantedState, "Orchestrator")); + assertEquals(expectedState, request.state); + } +} \ No newline at end of file diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java deleted file mode 100644 index 4dabae14add..00000000000 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/controller/SingleInstanceClusterControllerClientFactoryTest.java +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.orchestrator.controller; - -import com.yahoo.vespa.applicationmodel.ConfigId; -import com.yahoo.vespa.applicationmodel.HostName; -import com.yahoo.vespa.jaxrs.client.JaxRsClientFactory; -import com.yahoo.vespa.orchestrator.OrchestratorContext; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; -import java.util.List; - -import static org.hamcrest.CoreMatchers.anyOf; -import static org.hamcrest.CoreMatchers.equalTo; -import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.fail; -import static org.mockito.Matchers.any; -import static org.mockito.Matchers.anyFloat; -import static org.mockito.Matchers.anyInt; -import static org.mockito.Matchers.anyString; -import static org.mockito.Matchers.argThat; -import static org.mockito.Matchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; - -public class SingleInstanceClusterControllerClientFactoryTest { - private static final int PORT = SingleInstanceClusterControllerClientFactory.CLUSTERCONTROLLER_HARDCODED_PORT; - private static final String PATH = SingleInstanceClusterControllerClientFactory.CLUSTERCONTROLLER_API_PATH; - - private static final HostName HOST_NAME_1 = new HostName("host1"); - private static final HostName HOST_NAME_2 = new HostName("host2"); - private static final HostName HOST_NAME_3 = new HostName("host3"); - - OrchestratorContext context = mock(OrchestratorContext.class); - private final ClusterControllerJaxRsApi mockApi = mock(ClusterControllerJaxRsApi.class); - private final JaxRsClientFactory jaxRsClientFactory = mock(JaxRsClientFactory.class); - private final ClusterControllerClientFactory clientFactory - = new SingleInstanceClusterControllerClientFactory(jaxRsClientFactory); - - @Before - public void setup() { - when( - jaxRsClientFactory.createClient( - eq(ClusterControllerJaxRsApi.class), - any(HostName.class), - anyInt(), - anyString(), - anyString())) - .thenReturn(mockApi); - } - - @Test - public void testCreateClientWithNoClusterControllerInstances() throws Exception { - final List clusterControllers = Arrays.asList(); - - try { - clientFactory.createClient(clusterControllers, "clusterName"); - fail(); - } catch (IllegalArgumentException e) { - // As expected. - } - } - - @Test - public void testCreateClientWithSingleClusterControllerInstance() throws Exception { - final List clusterControllers = Arrays.asList(HOST_NAME_1); - - when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); - clientFactory.createClient(clusterControllers, "clusterName") - .setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE); - - verify(jaxRsClientFactory).createClient( - ClusterControllerJaxRsApi.class, - HOST_NAME_1, - PORT, - PATH, - "http"); - } - - @Test - public void testCreateClientWithoutClusterControllerInstances() throws Exception { - final List clusterControllers = Arrays.asList(); - - try { - clientFactory.createClient(clusterControllers, "clusterName"); - fail(); - } catch (IllegalArgumentException e) { - // As expected. - } - } - - @Test - public void testCreateClientWithThreeClusterControllerInstances() throws Exception { - final List clusterControllers = Arrays.asList(HOST_NAME_1, HOST_NAME_2, HOST_NAME_3); - - when(context.getSuboperationTimeoutInSeconds(anyFloat())).thenReturn(1.0f); - clientFactory.createClient(clusterControllers, "clusterName") - .setNodeState(context, 0, ClusterControllerNodeState.MAINTENANCE); - - verify(jaxRsClientFactory).createClient( - eq(ClusterControllerJaxRsApi.class), - argThat(is(anyOf( - equalTo(HOST_NAME_1), - equalTo(HOST_NAME_2), - equalTo(HOST_NAME_3)))), - eq(PORT), - eq(PATH), - eq("http")); - } - - private static ConfigId clusterControllerConfigId(final int index) { - return new ConfigId("admin/cluster-controllers/" + index); - } -} diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java index 45ba862c8f1..a9b8127e7fe 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/resources/HostResourceTest.java @@ -1,7 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.orchestrator.resources; -import com.yahoo.jdisc.Timer; import com.yahoo.vespa.applicationmodel.ApplicationInstance; import com.yahoo.vespa.applicationmodel.ApplicationInstanceId; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; @@ -32,6 +31,7 @@ import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.vespa.orchestrator.status.HostStatus; import com.yahoo.vespa.orchestrator.status.MutableStatusRegistry; import com.yahoo.vespa.orchestrator.status.StatusService; +import org.junit.Before; import org.junit.Test; import javax.ws.rs.BadRequestException; @@ -41,6 +41,7 @@ import javax.ws.rs.core.UriBuilder; import javax.ws.rs.core.UriInfo; import java.net.URI; import java.time.Clock; +import java.time.Instant; import java.util.Arrays; import java.util.Collections; import java.util.Optional; @@ -74,7 +75,7 @@ public class HostResourceTest { static { when(EVERY_HOST_IS_UP_HOST_STATUS_SERVICE.forApplicationInstance(eq(APPLICATION_INSTANCE_REFERENCE))) .thenReturn(EVERY_HOST_IS_UP_MUTABLE_HOST_STATUS_REGISTRY); - when(EVERY_HOST_IS_UP_HOST_STATUS_SERVICE.lockApplicationInstance_forCurrentThreadOnly(eq(APPLICATION_INSTANCE_REFERENCE))) + when(EVERY_HOST_IS_UP_HOST_STATUS_SERVICE.lockApplicationInstance_forCurrentThreadOnly(eq(APPLICATION_INSTANCE_REFERENCE), any())) .thenReturn(EVERY_HOST_IS_UP_MUTABLE_HOST_STATUS_REGISTRY); when(EVERY_HOST_IS_UP_MUTABLE_HOST_STATUS_REGISTRY.getHostStatus(any())) .thenReturn(HostStatus.NO_REMARKS); @@ -150,6 +151,11 @@ public class HostResourceTest { private final UriInfo uriInfo = mock(UriInfo.class); + @Before + public void setUp() { + when(clock.instant()).thenReturn(Instant.now()); + } + @Test public void returns_200_on_success() { HostResource hostResource = diff --git a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusServiceTest.java b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusServiceTest.java index 2e914718e20..44847666670 100644 --- a/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusServiceTest.java +++ b/orchestrator/src/test/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusServiceTest.java @@ -17,6 +17,7 @@ import org.junit.After; import org.junit.Before; import org.junit.Test; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -81,7 +82,8 @@ public class ZookeeperStatusServiceTest { @Test public void setting_host_state_is_idempotent() { try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, + Duration.ofSeconds(10))) { //shuffling to catch "clean database" failures for all cases. for (HostStatus hostStatus: shuffledList(HostStatus.values())) { @@ -105,11 +107,12 @@ public class ZookeeperStatusServiceTest { final CompletableFuture lockedSuccessfullyFuture; try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, + Duration.ofSeconds(10))) { lockedSuccessfullyFuture = CompletableFuture.runAsync(() -> { try (MutableStatusRegistry statusRegistry2 = zookeeperStatusService2 - .lockApplicationInstance_forCurrentThreadOnly(TestIds.APPLICATION_INSTANCE_REFERENCE)) + .lockApplicationInstance_forCurrentThreadOnly(TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(10))) { } }); @@ -131,13 +134,13 @@ public class ZookeeperStatusServiceTest { ZookeeperStatusService zookeeperStatusService2 = new ZookeeperStatusService(curator); try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(10))) { //must run in separate thread, since having 2 locks in the same thread fails CompletableFuture resultOfZkOperationAfterLockFailure = CompletableFuture.runAsync(() -> { try { zookeeperStatusService2.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE,1); + TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(1)); fail("Both zookeeper host status services locked simultaneously for the same application instance"); } catch (RuntimeException e) { } @@ -211,7 +214,7 @@ public class ZookeeperStatusServiceTest { // Suspend try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(10))) { statusRegistry.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); } @@ -223,7 +226,7 @@ public class ZookeeperStatusServiceTest { // Resume try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(10))) { statusRegistry.setApplicationInstanceStatus(ApplicationInstanceStatus.NO_REMARKS); } @@ -241,12 +244,12 @@ public class ZookeeperStatusServiceTest { assertThat(suspendedApps.size(), is(0)); try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE)) { + TestIds.APPLICATION_INSTANCE_REFERENCE, Duration.ofSeconds(10))) { statusRegistry.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); } try (MutableStatusRegistry statusRegistry = zookeeperStatusService.lockApplicationInstance_forCurrentThreadOnly( - TestIds.APPLICATION_INSTANCE_REFERENCE2)) { + TestIds.APPLICATION_INSTANCE_REFERENCE2, Duration.ofSeconds(10))) { statusRegistry.setApplicationInstanceStatus(ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN); } -- cgit v1.2.3