diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2022-10-11 15:38:52 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-11 15:38:52 +0200 |
commit | d06fa108aa31cbffb644c1af72fe3074938a0c1d (patch) | |
tree | 231b6341d6fdb238b4c8bbe6116290f6fd004a69 | |
parent | e9a3193205ddc97927b25731b09fa10afcdac659 (diff) | |
parent | a6a3f5574c0355b8f24724c3c85a0d2bf82350ef (diff) |
Merge pull request #24388 from vespa-engine/jonmv/abandon-always-failing-apps
Jonmv/abandon always failing apps
4 files changed, 74 insertions, 5 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index cf09afa7181..d5a31a07408 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -91,10 +91,31 @@ public class DeploymentTrigger { status, false)); } + + // If app has been broken since it was first submitted, and not fixed for a long time, we stop managing it until a new submission comes in. + if (applicationWasAlwaysBroken(status)) + application = application.withProjectId(OptionalLong.empty()); + applications().store(application); }); } + private boolean applicationWasAlwaysBroken(DeploymentStatus status) { + // If application has a production deployment, we cannot forget it. + if (status.application().instances().values().stream().anyMatch(instance -> ! instance.productionDeployments().isEmpty())) + return false; + + // Then, we need a job that always failed, and failed on the last revision for at least 30 days. + RevisionId last = status.application().revisions().last().get().id(); + Instant threshold = clock.instant().minus(Duration.ofDays(30)); + for (JobStatus job : status.jobs().asList()) + for (Run run : job.runs().descendingMap().values()) + if (run.hasEnded() && ! run.hasFailed() || ! run.versions().targetRevision().equals(last)) break; + else if (run.start().isBefore(threshold)) return true; + + return false; + } + /** * Records information when a job completes (successfully or not). This information is used when deciding what to * trigger next. @@ -339,8 +360,8 @@ public class DeploymentTrigger { /** Returns the set of all jobs which have changes to propagate from the upstream steps. */ private List<Job> computeReadyJobs() { return jobs.deploymentStatuses(ApplicationList.from(applications().readable()) - .withProjectId() // Need to keep this, as we have applications with deployment spec that shouldn't be orchestrated. // Maybe not any longer? - .withDeploymentSpec()) + .withProjectId() // Need to keep this, as we have applications with deployment spec that shouldn't be orchestrated. + .withJobs()) .withChanges() .asList().stream() .filter(status -> ! hasExceededQuota(status.application().id().tenant())) diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java index b33a43a2031..37b06fea066 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java @@ -26,9 +26,8 @@ public class OutstandingChangeDeployer extends ControllerMaintainer { protected double maintain() { double ok = 0, total = 0; for (Application application : ApplicationList.from(controller().applications().readable()) - .withProductionDeployment() .withProjectId() - .withDeploymentSpec() + .withJobs() .asList()) try { ++total; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java index 037dacfcac9..d49cb244e47 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java @@ -70,7 +70,8 @@ public class Upgrader extends ControllerMaintainer { private DeploymentStatusList deploymentStatuses(VersionStatus versionStatus) { return controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable()) - .withProjectId(), + .withProjectId() + .withJobs(), versionStatus); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index d8cef45f124..537090c6d68 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -2700,6 +2700,54 @@ public class DeploymentTriggerTest { } @Test + void testBrokenApplication() { + DeploymentContext app = tester.newDeploymentContext(); + app.submit().runJob(systemTest).failDeployment(stagingTest).failDeployment(stagingTest); + tester.clock().advance(Duration.ofDays(31)); + tester.outstandingChangeDeployer().run(); + assertEquals(OptionalLong.empty(), app.application().projectId()); + + app.assertNotRunning(stagingTest); + tester.triggerJobs(); + app.assertNotRunning(stagingTest); + assertEquals(4, app.deploymentStatus().jobsToRun().size()); + + app.submit().runJob(systemTest).failDeployment(stagingTest); + tester.clock().advance(Duration.ofDays(20)); + app.submit().runJob(systemTest).failDeployment(stagingTest); + tester.clock().advance(Duration.ofDays(20)); + tester.outstandingChangeDeployer().run(); + assertEquals(OptionalLong.of(1000), app.application().projectId()); + tester.clock().advance(Duration.ofDays(20)); + tester.outstandingChangeDeployer().run(); + assertEquals(OptionalLong.empty(), app.application().projectId()); + + app.assertNotRunning(stagingTest); + tester.triggerJobs(); + app.assertNotRunning(stagingTest); + assertEquals(4, app.deploymentStatus().jobsToRun().size()); + + app.submit().runJob(systemTest).runJob(stagingTest).failDeployment(productionUsCentral1); + tester.clock().advance(Duration.ofDays(31)); + tester.outstandingChangeDeployer().run(); + assertEquals(OptionalLong.empty(), app.application().projectId()); + + app.assertNotRunning(productionUsCentral1); + tester.triggerJobs(); + app.assertNotRunning(productionUsCentral1); + assertEquals(3, app.deploymentStatus().jobsToRun().size()); + + app.submit().runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1); + tester.clock().advance(Duration.ofDays(31)); + tester.outstandingChangeDeployer().run(); + assertEquals(OptionalLong.of(1000), app.application().projectId()); + + app.assertNotRunning(productionUsCentral1); + tester.triggerJobs(); + app.assertRunning(productionUsCentral1); + } + + @Test void testJobNames() { ZoneRegistryMock zones = new ZoneRegistryMock(SystemName.main); List<ZoneApi> existing = new ArrayList<>(zones.zones().all().zones()); |