summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
authorjonmv <venstad@gmail.com>2022-10-12 16:37:29 +0200
committerjonmv <venstad@gmail.com>2022-10-12 16:37:29 +0200
commit9b1bdb9b3b0feaee96d7d4d6122c657e6b7d4d00 (patch)
treea84ff1998e4cfed80364ec7594711a9895c22c45 /controller-server
parentf329a9d5e0a323b0485dcae52d90987b675808bc (diff)
Do not retry invalid applications (for the next 100 years)
Diffstat (limited to 'controller-server')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java9
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java3
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java30
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java5
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java60
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java28
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java10
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelperTest.java3
9 files changed, 87 insertions, 67 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
index 9bc2c5a5595..0fe9a84f5fa 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java
@@ -54,6 +54,7 @@ import static com.yahoo.config.application.api.DeploymentSpec.RevisionTarget.nex
import static com.yahoo.config.provision.Environment.prod;
import static com.yahoo.config.provision.Environment.staging;
import static com.yahoo.config.provision.Environment.test;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication;
import static java.util.Comparator.comparing;
import static java.util.Comparator.naturalOrder;
import static java.util.Comparator.reverseOrder;
@@ -1027,10 +1028,11 @@ public class DeploymentStatus {
Versions lastVersions = job.lastCompleted().get().versions();
Versions toRun = Versions.from(change, status.application, dependent.flatMap(status::deploymentFor), status.fallbackPlatform(change, job.id()));
if ( ! toRun.targetsMatch(lastVersions)) return Optional.empty();
- if ( job.id().type().environment().isTest()
+ if ( job.id().type().environment().isTest()
&& ! dependent.map(JobId::type).map(status::findCloud).map(List.of(CloudName.AWS, CloudName.GCP)::contains).orElse(true)
- && job.isNodeAllocationFailure()) return Optional.empty();
+ && job.isNodeAllocationFailure()) return Optional.empty();
+ if (job.lastStatus().get() == invalidApplication) return Optional.of(status.now.plus(Duration.ofDays(36524))); // 100 years
Instant firstFailing = job.firstFailing().get().end().get();
Instant lastCompleted = job.lastCompleted().get().end().get();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index e685e5d167e..dcdfea6e594 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -73,6 +73,7 @@ import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Nod
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.noTests;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset;
@@ -257,6 +258,8 @@ public class InternalStepRunner implements StepRunner {
? result
: Optional.of(nodeAllocationFailure);
case INVALID_APPLICATION_PACKAGE:
+ logger.log(WARNING, e.getMessage());
+ return Optional.of(invalidApplication);
case BAD_REQUEST:
logger.log(WARNING, e.getMessage());
return Optional.of(deploymentFailed);
@@ -838,9 +841,12 @@ public class InternalStepRunner implements StepRunner {
case nodeAllocationFailure:
if ( ! run.id().type().environment().isTest()) updater.accept("could not allocate the requested capacity to your tenant. Please contact Vespa Cloud support.");
return;
- case deploymentFailed:
+ case invalidApplication:
updater.accept("invalid application configuration. Please review warnings and errors in the deployment job log.");
return;
+ case deploymentFailed:
+ updater.accept("failure processing application configuration. Please review warnings and errors in the deployment job log.");
+ return;
case installationFailed:
updater.accept("nodes were not able to deploy to the new configuration. Please check the Vespa log for errors, and contact Vespa Cloud support if unable to resolve these.");
return;
@@ -867,6 +873,7 @@ public class InternalStepRunner implements StepRunner {
case nodeAllocationFailure:
return run.id().type().isProduction() ? Optional.of(mails.nodeAllocationFailure(run.id(), recipients)) : Optional.empty();
case deploymentFailed:
+ case invalidApplication:
return Optional.of(mails.deploymentFailure(run.id(), recipients));
case installationFailed:
return Optional.of(mails.installationFailure(run.id(), recipients));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
index 36036d6d36d..4c84f311458 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
@@ -5,7 +5,6 @@ import com.google.common.collect.ImmutableSortedMap;
import com.yahoo.component.Version;
import com.yahoo.component.VersionCompatibility;
import com.yahoo.concurrent.UncheckedTimeoutException;
-import com.yahoo.config.application.api.DeploymentSpec.UpgradePolicy;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.zone.ZoneId;
@@ -127,7 +126,7 @@ public class JobController {
this.curator = controller.curator();
this.logs = new BufferedLogStore(curator, controller.serviceRegistry().runDataStore());
this.cloud = controller.serviceRegistry().testerCloud();
- this.metric = new JobMetrics(controller.metric(), controller::system);
+ this.metric = new JobMetrics(controller.metric());
}
public TesterCloud cloud() { return cloud; }
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
index 14fce806152..d1fa00d1c41 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobMetrics.java
@@ -19,6 +19,7 @@ public class JobMetrics {
public static final String nodeAllocationFailure = "deployment.nodeAllocationFailure";
public static final String endpointCertificateTimeout = "deployment.endpointCertificateTimeout";
public static final String deploymentFailure = "deployment.deploymentFailure";
+ public static final String invalidApplication = "deployment.invalidApplication";
public static final String convergenceFailure = "deployment.convergenceFailure";
public static final String testFailure = "deployment.testFailure";
public static final String noTests = "deployment.noTests";
@@ -27,11 +28,9 @@ public class JobMetrics {
public static final String success = "deployment.success";
private final Metric metric;
- private final Supplier<SystemName> system;
- public JobMetrics(Metric metric, Supplier<SystemName> system) {
+ public JobMetrics(Metric metric) {
this.metric = metric;
- this.system = system;
}
public void jobStarted(JobId id) {
@@ -51,18 +50,19 @@ public class JobMetrics {
}
static String valueOf(RunStatus status) {
- switch (status) {
- case nodeAllocationFailure: return nodeAllocationFailure;
- case endpointCertificateTimeout: return endpointCertificateTimeout;
- case deploymentFailed: return deploymentFailure;
- case installationFailed: return convergenceFailure;
- case testFailure: return testFailure;
- case noTests: return noTests;
- case error: return error;
- case aborted: return abort;
- case success: return success;
- default: throw new IllegalArgumentException("Unexpected run status '" + status + "'");
- }
+ return switch (status) {
+ case nodeAllocationFailure -> nodeAllocationFailure;
+ case endpointCertificateTimeout -> endpointCertificateTimeout;
+ case invalidApplication -> invalidApplication;
+ case deploymentFailed -> deploymentFailure;
+ case installationFailed -> convergenceFailure;
+ case testFailure -> testFailure;
+ case noTests -> noTests;
+ case error -> error;
+ case aborted -> abort;
+ case success -> success;
+ default -> throw new IllegalArgumentException("Unexpected run status '" + status + "'");
+ };
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
index 9ca634b19fd..aa727b602e1 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/RunStatus.java
@@ -14,7 +14,10 @@ public enum RunStatus {
/** Deployment was rejected due node allocation failure. */
nodeAllocationFailure,
- /** Deployment of the real application was rejected. */
+ /** Deployment of the real application was rejected because the package is faulty. */
+ invalidApplication,
+
+ /** Deployment of the real application was rejected, for other reasons. */
deploymentFailed,
/** Deployment timed out waiting for endpoint certificate */
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index fcc6d99aec2..49d108d08df 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -34,6 +34,7 @@ import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentF
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.endpointCertificateTimeout;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.noTests;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.reset;
@@ -329,39 +330,38 @@ class RunSerializer {
}
static String valueOf(RunStatus status) {
- switch (status) {
- case running : return "running";
- case nodeAllocationFailure : return "nodeAllocationFailure";
- case endpointCertificateTimeout : return "endpointCertificateTimeout";
- case deploymentFailed : return "deploymentFailed";
- case installationFailed : return "installationFailed";
- case testFailure : return "testFailure";
- case noTests : return "noTests";
- case error : return "error";
- case success : return "success";
- case aborted : return "aborted";
- case reset : return "reset";
-
- default: throw new AssertionError("No value defined for '" + status + "'!");
- }
+ return switch (status) {
+ case running -> "running";
+ case nodeAllocationFailure -> "nodeAllocationFailure";
+ case endpointCertificateTimeout -> "endpointCertificateTimeout";
+ case deploymentFailed -> "deploymentFailed";
+ case invalidApplication -> "invalidApplication";
+ case installationFailed -> "installationFailed";
+ case testFailure -> "testFailure";
+ case noTests -> "noTests";
+ case error -> "error";
+ case success -> "success";
+ case aborted -> "aborted";
+ case reset -> "reset";
+ };
}
static RunStatus runStatusOf(String status) {
- switch (status) {
- case "running" : return running;
- case "nodeAllocationFailure" : return nodeAllocationFailure;
- case "endpointCertificateTimeout" : return endpointCertificateTimeout;
- case "deploymentFailed" : return deploymentFailed;
- case "installationFailed" : return installationFailed;
- case "noTests" : return noTests;
- case "testFailure" : return testFailure;
- case "error" : return error;
- case "success" : return success;
- case "aborted" : return aborted;
- case "reset" : return reset;
-
- default: throw new IllegalArgumentException("No run status defined by '" + status + "'!");
- }
+ return switch (status) {
+ case "running" -> running;
+ case "nodeAllocationFailure" -> nodeAllocationFailure;
+ case "endpointCertificateTimeout" -> endpointCertificateTimeout;
+ case "deploymentFailed" -> deploymentFailed;
+ case "invalidApplication" -> invalidApplication;
+ case "installationFailed" -> installationFailed;
+ case "noTests" -> noTests;
+ case "testFailure" -> testFailure;
+ case "error" -> error;
+ case "success" -> success;
+ case "aborted" -> aborted;
+ case "reset" -> reset;
+ default -> throw new IllegalArgumentException("No run status defined by '" + status + "'!");
+ };
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
index 8c601f8c678..592fbd0e856 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
@@ -227,20 +227,18 @@ class JobControllerApiHandlerHelper {
}
private static String nameOf(RunStatus status) {
- switch (status) {
- case reset: // This means the run will reset and keep running.
- case running: return "running";
- case aborted: return "aborted";
- case error: return "error";
- case testFailure: return "testFailure";
- case noTests: return "noTests";
- case endpointCertificateTimeout: return "endpointCertificateTimeout";
- case nodeAllocationFailure: return "nodeAllocationFailure";
- case installationFailed: return "installationFailed";
- case deploymentFailed: return "deploymentFailed";
- case success: return "success";
- default: throw new IllegalArgumentException("Unexpected status '" + status + "'");
- }
+ return switch (status) {
+ case reset, running -> "running";
+ case aborted -> "aborted";
+ case error -> "error";
+ case testFailure -> "testFailure";
+ case noTests -> "noTests";
+ case endpointCertificateTimeout -> "endpointCertificateTimeout";
+ case nodeAllocationFailure -> "nodeAllocationFailure";
+ case installationFailed -> "installationFailed";
+ case invalidApplication, deploymentFailed -> "deploymentFailed";
+ case success -> "success";
+ };
}
/**
@@ -440,7 +438,7 @@ class JobControllerApiHandlerHelper {
runObject.setString("url", baseUriForJob.resolve(baseUriForJob.getPath() + "/run/" + run.id().number()).toString());
runObject.setLong("start", run.start().toEpochMilli());
run.end().ifPresent(end -> runObject.setLong("end", end.toEpochMilli()));
- runObject.setString("status", run.status().name());
+ runObject.setString("status", nameOf(run.status()));
run.reason().ifPresent(reason -> runObject.setString("reason", reason));
toSlime(runObject.setObject("versions"), run.versions(), application);
Cursor runStepsArray = runObject.setArray("steps");
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index 537090c6d68..6bc99b865e4 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -14,6 +14,8 @@ import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.controller.ControllerTester;
import com.yahoo.vespa.hosted.controller.Instance;
+import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException;
+import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException.ErrorCode;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId;
@@ -118,9 +120,17 @@ public class DeploymentTriggerTest {
tester.triggerJobs();
app.assertRunning(productionUsWest1);
+ tester.configServer().throwOnNextPrepare(new ConfigServerException(ErrorCode.INVALID_APPLICATION_PACKAGE, "nope", "bah"));
+ tester.runner().run();
+ assertEquals(RunStatus.invalidApplication, tester.jobs().last(app.instanceId(), productionUsWest1).get().status());
+ tester.triggerJobs();
+ app.assertNotRunning(productionUsWest1);
+
// production-us-west-1 fails, but the app loses its projectId, and the job isn't retried.
+ app.submit(applicationPackage).runJob(systemTest).runJob(stagingTest).triggerJobs();
tester.applications().lockApplicationOrThrow(app.application().id(), locked ->
tester.applications().store(locked.withProjectId(OptionalLong.empty())));
+
app.timeOutConvergence(productionUsWest1);
tester.triggerJobs();
assertEquals(0, tester.jobs().active().size(), "Job is not triggered when no projectId is present");
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelperTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelperTest.java
index 71e3607983c..6555277b06b 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelperTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelperTest.java
@@ -39,6 +39,7 @@ import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.tes
import static com.yahoo.vespa.hosted.controller.deployment.DeploymentContext.applicationPackage;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
+import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.invalidApplication;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -87,7 +88,7 @@ public class JobControllerApiHandlerHelperTest {
// us-east-3 eats the deployment failure and fails before deployment, while us-west-1 fails after.
tester.configServer().throwOnNextPrepare(new ConfigServerException(INVALID_APPLICATION_PACKAGE, "ERROR!", "Failed to deploy application"));
tester.runner().run();
- assertEquals(deploymentFailed, tester.jobs().last(app.instanceId(), productionUsEast3).get().status());
+ assertEquals(invalidApplication, tester.jobs().last(app.instanceId(), productionUsEast3).get().status());
tester.runner().run();
tester.clock().advance(Duration.ofHours(4).plusSeconds(1));