summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Valerij Fredriksen <freva@users.noreply.github.com> 2022-10-11 15:38:52 +0200
committer GitHub <noreply@github.com> 2022-10-11 15:38:52 +0200
commit d06fa108aa31cbffb644c1af72fe3074938a0c1d (patch)
tree 231b6341d6fdb238b4c8bbe6116290f6fd004a69
parent e9a3193205ddc97927b25731b09fa10afcdac659 (diff)
parent a6a3f5574c0355b8f24724c3c85a0d2bf82350ef (diff)
Merge pull request #24388 from vespa-engine/jonmv/abandon-always-failing-apps
Jonmv/abandon always failing apps
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java | 25
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java | 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java | 3
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java | 48
4 files changed, 74 insertions, 5 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index cf09afa7181..d5a31a07408 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -91,10 +91,31 @@ public class DeploymentTrigger {
status,
false));
}
+
+ // If app has been broken since it was first submitted, and not fixed for a long time, we stop managing it until a new submission comes in.
+ if (applicationWasAlwaysBroken(status))
+ application = application.withProjectId(OptionalLong.empty());
+
applications().store(application);
});
}
+ private boolean applicationWasAlwaysBroken(DeploymentStatus status) {
+ // If application has a production deployment, we cannot forget it.
+ if (status.application().instances().values().stream().anyMatch(instance -> ! instance.productionDeployments().isEmpty()))
+ return false;
+
+ // Then, we need a job that always failed, and failed on the last revision for at least 30 days.
+ RevisionId last = status.application().revisions().last().get().id();
+ Instant threshold = clock.instant().minus(Duration.ofDays(30));
+ for (JobStatus job : status.jobs().asList())
+ for (Run run : job.runs().descendingMap().values())
+ if (run.hasEnded() && ! run.hasFailed() || ! run.versions().targetRevision().equals(last)) break;
+ else if (run.start().isBefore(threshold)) return true;
+
+ return false;
+ }
+
/**
* Records information when a job completes (successfully or not). This information is used when deciding what to
* trigger next.
@@ -339,8 +360,8 @@ public class DeploymentTrigger {
/** Returns the set of all jobs which have changes to propagate from the upstream steps. */
private List<Job> computeReadyJobs() {
return jobs.deploymentStatuses(ApplicationList.from(applications().readable())
- .withProjectId() // Need to keep this, as we have applications with deployment spec that shouldn't be orchestrated. // Maybe not any longer?
- .withDeploymentSpec())
+ .withProjectId() // Need to keep this, as we have applications with deployment spec that shouldn't be orchestrated.
+ .withJobs())
.withChanges()
.asList().stream()
.filter(status -> ! hasExceededQuota(status.application().id().tenant()))
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
index b33a43a2031..37b06fea066 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployer.java
@@ -26,9 +26,8 @@ public class OutstandingChangeDeployer extends ControllerMaintainer {
protected double maintain() {
double ok = 0, total = 0;
for (Application application : ApplicationList.from(controller().applications().readable())
- .withProductionDeployment()
.withProjectId()
- .withDeploymentSpec()
+ .withJobs()
.asList())
try {
++total;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index 037dacfcac9..d49cb244e47 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -70,7 +70,8 @@ public class Upgrader extends ControllerMaintainer {
private DeploymentStatusList deploymentStatuses(VersionStatus versionStatus) {
return controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable())
- .withProjectId(),
+ .withProjectId()
+ .withJobs(),
versionStatus);
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
index d8cef45f124..537090c6d68 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java
@@ -2700,6 +2700,54 @@ public class DeploymentTriggerTest {
}
@Test
+ void testBrokenApplication() {
+ DeploymentContext app = tester.newDeploymentContext();
+ app.submit().runJob(systemTest).failDeployment(stagingTest).failDeployment(stagingTest);
+ tester.clock().advance(Duration.ofDays(31));
+ tester.outstandingChangeDeployer().run();
+ assertEquals(OptionalLong.empty(), app.application().projectId());
+
+ app.assertNotRunning(stagingTest);
+ tester.triggerJobs();
+ app.assertNotRunning(stagingTest);
+ assertEquals(4, app.deploymentStatus().jobsToRun().size());
+
+ app.submit().runJob(systemTest).failDeployment(stagingTest);
+ tester.clock().advance(Duration.ofDays(20));
+ app.submit().runJob(systemTest).failDeployment(stagingTest);
+ tester.clock().advance(Duration.ofDays(20));
+ tester.outstandingChangeDeployer().run();
+ assertEquals(OptionalLong.of(1000), app.application().projectId());
+ tester.clock().advance(Duration.ofDays(20));
+ tester.outstandingChangeDeployer().run();
+ assertEquals(OptionalLong.empty(), app.application().projectId());
+
+ app.assertNotRunning(stagingTest);
+ tester.triggerJobs();
+ app.assertNotRunning(stagingTest);
+ assertEquals(4, app.deploymentStatus().jobsToRun().size());
+
+ app.submit().runJob(systemTest).runJob(stagingTest).failDeployment(productionUsCentral1);
+ tester.clock().advance(Duration.ofDays(31));
+ tester.outstandingChangeDeployer().run();
+ assertEquals(OptionalLong.empty(), app.application().projectId());
+
+ app.assertNotRunning(productionUsCentral1);
+ tester.triggerJobs();
+ app.assertNotRunning(productionUsCentral1);
+ assertEquals(3, app.deploymentStatus().jobsToRun().size());
+
+ app.submit().runJob(systemTest).runJob(stagingTest).timeOutConvergence(productionUsCentral1);
+ tester.clock().advance(Duration.ofDays(31));
+ tester.outstandingChangeDeployer().run();
+ assertEquals(OptionalLong.of(1000), app.application().projectId());
+
+ app.assertNotRunning(productionUsCentral1);
+ tester.triggerJobs();
+ app.assertRunning(productionUsCentral1);
+ }
+
+ @Test
void testJobNames() {
ZoneRegistryMock zones = new ZoneRegistryMock(SystemName.main);
List<ZoneApi> existing = new ArrayList<>(zones.zones().all().zones());