summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Andreas Eriksen <andreer@verizonmedia.com> 2020-02-19 09:21:12 +0100
committer GitHub <noreply@github.com> 2020-02-19 09:21:12 +0100
commit 85866b95d2d8eae5b6fdaad59433c96cfd810d23 (patch)
tree 7f8e69cb83bc530708686225cd3673d872d245f4
parent 740bf5fde068d1b81b9f1acd69be4d4b0cfba427 (diff)
parent 19a6d9e3819c0c18dca6c8b776d83f7171665f28 (diff)
Merge pull request #12248 from vespa-engine/andreer/fail-faster-and-record-when-missing-endpoint-cert
fail faster and record when missing endpoint cert
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java 9
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java 35
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java 1
5 files changed, 33 insertions, 17 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 060ffd63fb3..9d666c6f7b5 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -76,6 +76,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Nod
import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.reserved;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.endpointCertificateTimeout;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.outOfCapacity;
@@ -115,6 +116,7 @@ public class InternalStepRunner implements StepRunner {
new NodeResources(2, 8, 50, 0.3, NodeResources.DiskSpeed.any);
static final Duration endpointTimeout = Duration.ofMinutes(15);
+ static final Duration endpointCertificateTimeout = Duration.ofMinutes(15);
static final Duration testerTimeout = Duration.ofMinutes(30);
static final Duration installationTimeout = Duration.ofMinutes(60);
static final Duration certificateTimeout = Duration.ofMinutes(300);
@@ -273,9 +275,14 @@ public class InternalStepRunner implements StepRunner {
Optional<RunStatus> result = startTime.isBefore(controller.clock().instant().minus(Duration.ofHours(1)))
? Optional.of(deploymentFailed) : Optional.empty();
switch (e.getErrorCode()) {
+ case CERTIFICATE_NOT_READY:
+ if (startTime.plus(endpointCertificateTimeout).isBefore(controller.clock().instant())) {
+ logger.log("Deployment failed to find provisioned endpoint certificate after " + endpointCertificateTimeout);
+ return Optional.of(RunStatus.endpointCertificateTimeout);
+ }
+ return result;
case ACTIVATION_CONFLICT:
case APPLICATION_LOCK_FAILURE:
- case CERTIFICATE_NOT_READY:
logger.log("Deployment failed with possibly transient error " + e.getErrorCode() +
", will retry: " + e.getMessage());
return result;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
index 5df914bad80..80924c3c0aa 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
@@ -16,6 +16,7 @@ public class JobMetrics {
public static final String start = "deployment.start";
public static final String outOfCapacity = "deployment.outOfCapacity";
+ public static final String endpointCertificateTimeout = "deployment.endpointCertificateTimeout";
public static final String deploymentFailure = "deployment.deploymentFailure";
public static final String convergenceFailure = "deployment.convergenceFailure";
public static final String testFailure = "deployment.testFailure";
@@ -50,6 +51,7 @@ public class JobMetrics {
static String valueOf(RunStatus status) {
switch (status) {
case outOfCapacity: return outOfCapacity;
+ case endpointCertificateTimeout: return endpointCertificateTimeout;
case deploymentFailed: return deploymentFailure;
case installationFailed: return convergenceFailure;
case testFailure: return testFailure;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
index 4d0b7ef3b90..fba3f7ae6e9 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
@@ -17,6 +17,9 @@ public enum RunStatus {
/** Deployment of the real application was rejected. */
deploymentFailed,
+ /** Deployment timed out waiting for endpoint certificate. */
+ endpointCertificateTimeout,
+
/** Installation of the real application timed out. */
installationFailed,
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index 9e674134347..1aa229984a8 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -31,6 +31,7 @@ import java.util.TreeMap;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.endpointCertificateTimeout;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.outOfCapacity;
@@ -346,14 +347,15 @@ class RunSerializer {
static String valueOf(RunStatus status) {
switch (status) {
- case running : return "running";
- case outOfCapacity : return "outOfCapacity";
- case deploymentFailed : return "deploymentFailed";
- case installationFailed : return "installationFailed";
- case testFailure : return "testFailure";
- case error : return "error";
- case success : return "success";
- case aborted : return "aborted";
+ case running : return "running";
+ case outOfCapacity : return "outOfCapacity";
+ case endpointCertificateTimeout : return "endpointCertificateTimeout";
+ case deploymentFailed : return "deploymentFailed";
+ case installationFailed : return "installationFailed";
+ case testFailure : return "testFailure";
+ case error : return "error";
+ case success : return "success";
+ case aborted : return "aborted";
default: throw new AssertionError("No value defined for '" + status + "'!");
}
@@ -361,14 +363,15 @@ class RunSerializer {
static RunStatus runStatusOf(String status) {
switch (status) {
- case "running" : return running;
- case "outOfCapacity" : return outOfCapacity;
- case "deploymentFailed" : return deploymentFailed;
- case "installationFailed" : return installationFailed;
- case "testFailure" : return testFailure;
- case "error" : return error;
- case "success" : return success;
- case "aborted" : return aborted;
+ case "running" : return running;
+ case "outOfCapacity" : return outOfCapacity;
+ case "endpointCertificateTimeout" : return endpointCertificateTimeout;
+ case "deploymentFailed" : return deploymentFailed;
+ case "installationFailed" : return installationFailed;
+ case "testFailure" : return testFailure;
+ case "error" : return error;
+ case "success" : return success;
+ case "aborted" : return aborted;
default: throw new IllegalArgumentException("No run status defined by '" + status + "'!");
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
index ce4ae7af6b4..c36d4494a82 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java
@@ -407,6 +407,7 @@ public class JobRunnerTest {
assertEquals(1, metric.getMetric(context::equals, JobMetrics.convergenceFailure).get().intValue());
assertEquals(1, metric.getMetric(context::equals, JobMetrics.deploymentFailure).get().intValue());
assertEquals(1, metric.getMetric(context::equals, JobMetrics.outOfCapacity).get().intValue());
+ assertEquals(1, metric.getMetric(context::equals, JobMetrics.endpointCertificateTimeout).get().intValue());
assertEquals(1, metric.getMetric(context::equals, JobMetrics.testFailure).get().intValue());
}