summary | refs | log | tree | commit | diff | stats
path: root/controller-server
diff options
context:
space:
mode:
author Jon Marius Venstad <venstad@gmail.com> 2020-03-20 10:12:08 +0100
committer Jon Marius Venstad <venstad@gmail.com> 2020-03-20 10:12:08 +0100
commit 2dbcd43a93179d09ee92450f1bc7b80398cd1489 (patch)
tree 0b49f614b9bd1dee823b675a2ace2e6aa33287f3 /controller-server
parent cc659eb6a33016e412f89b797ea09b10fa4c5f3a (diff)
Use system-dependent timeouts in deployment jobs
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java 63
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java 4
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java 10
3 files changed, 48 insertions, 29 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index d7dbdecf41b..596e19bfe65 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -13,6 +13,7 @@ import com.yahoo.config.provision.AthenzService;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.NodeResources;
+import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.zone.RoutingMethod;
import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.log.LogLevel;
@@ -91,6 +92,7 @@ import static com.yahoo.vespa.hosted.controller.deployment.Step.deployReal;
import static com.yahoo.vespa.hosted.controller.deployment.Step.deployTester;
import static com.yahoo.vespa.hosted.controller.deployment.Step.installTester;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Objects.requireNonNull;
import static java.util.logging.Level.INFO;
import static java.util.logging.Level.WARNING;
import static java.util.stream.Collectors.joining;
@@ -117,22 +119,16 @@ public class InternalStepRunner implements StepRunner {
static final NodeResources DEFAULT_TESTER_RESOURCES_AWS =
new NodeResources(2, 8, 50, 0.3, NodeResources.DiskSpeed.any);
- static final Duration capacityTimeout = Duration.ofMinutes(5);
- static final Duration endpointTimeout = Duration.ofMinutes(15);
- static final Duration endpointCertificateTimeout = Duration.ofMinutes(15);
- static final Duration testerTimeout = Duration.ofMinutes(30);
- static final Duration nodesDownTimeout = Duration.ofMinutes(60);
- static final Duration noNodesDownTimeout = Duration.ofMinutes(120);
- static final Duration certificateTimeout = Duration.ofMinutes(300);
-
private final Controller controller;
private final TestConfigSerializer testConfigSerializer;
private final DeploymentFailureMails mails;
+ private final Timeouts timeouts;
public InternalStepRunner(Controller controller) {
this.controller = controller;
this.testConfigSerializer = new TestConfigSerializer(controller.system());
this.mails = new DeploymentFailureMails(controller.zoneRegistry());
+ this.timeouts = Timeouts.of(controller.system()); // timeouts are derived from the controller's system, replacing the removed static Duration constants
}
@Override
@@ -263,8 +259,8 @@ public class InternalStepRunner implements StepRunner {
? Optional.of(deploymentFailed) : Optional.empty();
switch (e.getErrorCode()) {
case CERTIFICATE_NOT_READY:
- if (startTime.plus(endpointCertificateTimeout).isBefore(controller.clock().instant())) {
- logger.log("Deployment failed to find provisioned endpoint certificate after " + endpointCertificateTimeout);
+ if (startTime.plus(timeouts.endpointCertificate()).isBefore(controller.clock().instant())) {
+ logger.log("Deployment failed to find provisioned endpoint certificate after " + timeouts.endpointCertificate());
return Optional.of(RunStatus.endpointCertificateTimeout);
}
return result;
@@ -279,7 +275,7 @@ public class InternalStepRunner implements StepRunner {
return result;
case OUT_OF_CAPACITY:
logger.log(e.getServerMessage());
- return controller.system().isCd() && startTime.plus(capacityTimeout).isAfter(controller.clock().instant())
+ return controller.system().isCd() && startTime.plus(timeouts.capacity()).isAfter(controller.clock().instant())
? Optional.empty()
: Optional.of(outOfCapacity);
case INVALID_APPLICATION_PACKAGE:
@@ -294,8 +290,8 @@ public class InternalStepRunner implements StepRunner {
switch (e.type()) {
case CERT_NOT_AVAILABLE:
// Same as CERTIFICATE_NOT_READY above, only from the controller
- if (startTime.plus(endpointCertificateTimeout).isBefore(controller.clock().instant())) {
- logger.log("Deployment failed to find provisioned endpoint certificate after " + endpointCertificateTimeout);
+ if (startTime.plus(timeouts.endpointCertificate()).isBefore(controller.clock().instant())) {
+ logger.log("Deployment failed to find provisioned endpoint certificate after " + timeouts.endpointCertificate());
return Optional.of(RunStatus.endpointCertificateTimeout);
}
return Optional.empty();
@@ -352,25 +348,25 @@ public class InternalStepRunner implements StepRunner {
return Optional.of(running);
}
}
- else if (timedOut(id, deployment.get(), endpointTimeout)) {
- logger.log(WARNING, "Endpoints failed to show up within " + endpointTimeout.toMinutes() + " minutes!");
+ else if (timedOut(id, deployment.get(), timeouts.endpoint())) {
+ logger.log(WARNING, "Endpoints failed to show up within " + timeouts.endpoint().toMinutes() + " minutes!");
return Optional.of(error);
}
}
String failureReason = null;
- NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(nodesDownTimeout));
+ NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(timeouts.nodesDown()));
if ( ! suspendedTooLong.isEmpty()) {
- failureReason = "Some nodes have been suspended for more than " + nodesDownTimeout.toMinutes() + " minutes:\n" +
+ failureReason = "Some nodes have been suspended for more than " + timeouts.nodesDown().toMinutes() + " minutes:\n" +
suspendedTooLong.asList().stream().map(node -> node.node().hostname().value()).collect(joining("\n"));
}
if (run.noNodesDownSince()
- .map(since -> since.isBefore(controller.clock().instant().minus(noNodesDownTimeout)))
+ .map(since -> since.isBefore(controller.clock().instant().minus(timeouts.noNodesDown())))
.orElse(false)) {
if (summary.needPlatformUpgrade() > 0 || summary.needReboot() > 0 || summary.needRestart() > 0)
- failureReason = "No nodes allowed to suspend to progress installation for " + noNodesDownTimeout.toMinutes() + " minutes.";
+ failureReason = "No nodes allowed to suspend to progress installation for " + timeouts.noNodesDown().toMinutes() + " minutes.";
else
failureReason = "Nodes not able to start with new application package.";
}
@@ -442,8 +438,8 @@ public class InternalStepRunner implements StepRunner {
return Optional.of(running);
}
- if (run.stepInfo(installTester).get().startTime().get().plus(testerTimeout).isBefore(controller.clock().instant())) {
- logger.log(WARNING, "Installation of tester failed to complete within " + testerTimeout.toMinutes() + " minutes!");
+ if (run.stepInfo(installTester).get().startTime().get().plus(timeouts.tester()).isBefore(controller.clock().instant())) {
+ logger.log(WARNING, "Installation of tester failed to complete within " + timeouts.tester().toMinutes() + " minutes!");
return Optional.of(error);
}
@@ -807,7 +803,7 @@ public class InternalStepRunner implements StepRunner {
X509Certificate certificate = X509CertificateBuilder.fromKeypair(keyPair,
subject,
controller.clock().instant(),
- controller.clock().instant().plus(certificateTimeout),
+ controller.clock().instant().plus(timeouts.testerCertificate()),
SignatureAlgorithm.SHA512_WITH_RSA,
BigInteger.valueOf(1))
.build();
@@ -928,4 +924,27 @@ public class InternalStepRunner implements StepRunner {
}
+
+ static class Timeouts { // Timeouts used by the deployment job steps; some values differ between CD and non-CD systems.
+
+ private final SystemName system; // the system whose isCd() answer selects the values below
+
+ private Timeouts(SystemName system) {
+ this.system = requireNonNull(system); // rejects a null system up front, since every system-dependent accessor dereferences it
+ }
+
+ public static Timeouts of(SystemName system) { // factory; the constructor is private
+ return new Timeouts(system);
+ }
+
+ Duration capacity() { return Duration.ofMinutes(system.isCd() ? 5 : 0); } // 5 minutes in CD, otherwise 0; window after start in which OUT_OF_CAPACITY does not yet yield the outOfCapacity status
+ Duration endpoint() { return Duration.ofMinutes(15); } // how long endpoints may take to show up before the step reports an error
+ Duration endpointCertificate() { return Duration.ofMinutes(15); } // how long to wait for a provisioned endpoint certificate before endpointCertificateTimeout
+ Duration tester() { return Duration.ofMinutes(30); } // how long installation of the tester may take before the step reports an error
+ Duration nodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 60); } // 30 minutes in CD, otherwise 60; nodes suspended longer than this give a failure reason
+ Duration noNodesDown() { return Duration.ofMinutes(system.isCd() ? 30 : 120); } // 30 minutes in CD, otherwise 120; limit on time since run.noNodesDownSince()
+ Duration testerCertificate() { return Duration.ofMinutes(300); } // added to the current instant as the end time given to the tester certificate builder
+
+ }
+
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
index 706a7cec2a9..a04dc6fb579 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
@@ -367,7 +367,7 @@ public class DeploymentContext {
triggerJobs();
RunId id = currentRun(job).id();
doDeploy(job);
- tester.clock().advance(InternalStepRunner.noNodesDownTimeout.plusSeconds(1));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(tester.controller().system()).noNodesDown().plusSeconds(1));
runner.advance(currentRun(job));
assertTrue(jobs.run(id).get().hasFailed());
assertTrue(jobs.run(id).get().hasEnded());
@@ -381,7 +381,7 @@ public class DeploymentContext {
RunId id = currentRun(job).id();
doDeploy(job);
doUpgrade(job);
- tester.clock().advance(InternalStepRunner.noNodesDownTimeout.plusSeconds(1));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(tester.controller().system()).noNodesDown().plusSeconds(1));
runner.advance(currentRun(job));
assertTrue(jobs.run(id).get().hasFailed());
assertTrue(jobs.run(id).get().hasEnded());
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index 21b6e729e41..0e949419792 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -162,7 +162,7 @@ public class InternalStepRunnerTest {
tester.runner().run();
assertEquals(unfinished, tester.jobs().run(id).get().stepStatuses().get(Step.installReal));
- tester.clock().advance(InternalStepRunner.noNodesDownTimeout.plus(Duration.ofSeconds(1)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).noNodesDown().plus(Duration.ofSeconds(1)));
tester.runner().run();
assertEquals(installationFailed, tester.jobs().run(id).get().status());
}
@@ -190,7 +190,7 @@ public class InternalStepRunnerTest {
tester.configServer().convergeServices(app.testerId().id(), JobType.stagingTest.zone(system()));
tester.runner().run();
- tester.clock().advance(InternalStepRunner.endpointTimeout.plus(Duration.ofSeconds(1)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).endpoint().plus(Duration.ofSeconds(1)));
tester.runner().run();
assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
}
@@ -214,7 +214,7 @@ public class InternalStepRunnerTest {
Node systemTestNode = tester.configServer().nodeRepository().list(JobType.systemTest.zone(system()),
app.instanceId()).iterator().next();
- tester.clock().advance(InternalStepRunner.noNodesDownTimeout.minus(Duration.ofSeconds(1)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).noNodesDown().minus(Duration.ofSeconds(1)));
tester.configServer().nodeRepository().putByHostname(JobType.systemTest.zone(system()),
new Node.Builder(systemTestNode)
.serviceState(Node.ServiceState.allowedDown)
@@ -229,7 +229,7 @@ public class InternalStepRunnerTest {
assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal));
- tester.clock().advance(InternalStepRunner.nodesDownTimeout.minus(Duration.ofSeconds(3)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).nodesDown().minus(Duration.ofSeconds(3)));
tester.runner().run();
assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
@@ -439,7 +439,7 @@ public class InternalStepRunnerTest {
trusted.add(tester.jobs().run(id).get().testerCertificate().get());
assertEquals(trusted, tester.configServer().application(app.instanceId(), id.type().zone(system())).get().applicationPackage().trustedCertificates());
- tester.clock().advance(InternalStepRunner.certificateTimeout.plus(Duration.ofSeconds(1)));
+ tester.clock().advance(InternalStepRunner.Timeouts.of(system()).testerCertificate().plus(Duration.ofSeconds(1)));
tester.runner().run();
assertEquals(RunStatus.aborted, tester.jobs().run(id).get().status());
}