From 0d41f8dff5c3d1343a39653cec19a24eb6c63a11 Mon Sep 17 00:00:00 2001 From: Jon Marius Venstad Date: Tue, 1 Mar 2022 12:06:52 +0100 Subject: Detect test failure when considering change splitting for prod deployments --- .../hosted/controller/deployment/DeploymentStatus.java | 9 ++++++--- .../yahoo/vespa/hosted/controller/deployment/JobList.java | 10 ++++++++++ .../controller/deployment/DeploymentTriggerTest.java | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'controller-server') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index 989f305b0cc..8c933f98277 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -351,8 +351,8 @@ public class DeploymentStatus { || step.completedAt(change.withoutPlatform(), Optional.of(job)).isPresent()) return List.of(change); - // For a dual change, where both target remain, we determine what to run by looking at when the two parts became ready: - // for deployments, we look at dependencies; for tests, this may be overridden by what is already deployed. + // For a dual change, where both targets remain, we determine what to run by looking at when the two parts became ready: + // for deployments, we look at dependencies; for production tests, this may be overridden by what is already deployed. JobId deployment = new JobId(job.application(), JobType.from(system, job.type().zone(system)).get()); UpgradeRollout rollout = application.deploymentSpec().requireInstance(job.application().instance()).upgradeRollout(); if (job.type().isTest()) { @@ -409,10 +409,13 @@ public class DeploymentStatus { // Both changes are ready for this step, and we look to the specified rollout to decide. boolean platformReadyFirst = platformReadyAt.get().isBefore(revisionReadyAt.get()); boolean revisionReadyFirst = revisionReadyAt.get().isBefore(platformReadyAt.get()); + boolean failingUpgradeOnlyTests = ! jobs().type(systemTest, stagingTest) + .failingHardOn(Versions.from(change.withoutApplication(), application, deploymentFor(job), systemVersion)) + .isEmpty(); switch (rollout) { case separate: // Let whichever change rolled out first, keep rolling first, unless upgrade alone is failing. return (platformReadyFirst || platformReadyAt.get().equals(Instant.EPOCH)) // Assume platform was first if no jobs have run yet. - ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() + ? step.job().flatMap(jobs()::get).flatMap(JobStatus::firstFailing).isPresent() || failingUpgradeOnlyTests ? List.of(change) // Platform was first, but is failing. : List.of(change.withoutApplication(), change) // Platform was first, and is OK. : revisionReadyFirst diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java index 5de07bad859..d06bdc45583 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java @@ -17,6 +17,7 @@ import java.util.function.Function; import java.util.function.Predicate; import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.nodeAllocationFailure; /** * A list of deployment jobs that can be filtered in various ways. @@ -102,6 +103,15 @@ public class JobList extends AbstractFilteringList { return matching(job -> job.id().type().isProduction()); } + /** Returns the jobs with any runs failing with non-out-of-test-capacity on the given versions — targets only for system test, everything present otherwise. */ + public JobList failingHardOn(Versions versions) { + return matching(job -> ! RunList.from(job) + .on(versions) + .matching(Run::hasFailed) + .not().matching(run -> run.status() == nodeAllocationFailure && run.id().type().environment().isTest()) + .isEmpty()); + } + /** Returns the jobs with any runs matching the given versions — targets only for system test, everything present otherwise. */ public JobList triggeredOn(Versions versions) { return matching(job -> ! RunList.from(job).on(versions).isEmpty()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index c01827b6eb0..b2847a29654 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -1434,6 +1434,21 @@ public class DeploymentTriggerTest { tester.jobs().last(app.instanceId(), productionUsWest1).get().versions()); app.runJob(productionUsWest1); assertEquals(Change.empty(), app.instance().change()); + + // New upgrade fails in staging-test, and revision to fix it is submitted. + var version2 = new Version("7.2"); + tester.controllerTester().upgradeSystem(version2); + tester.upgrader().maintain(); + app.runJob(systemTest).failDeployment(stagingTest); + tester.clock().advance(Duration.ofMinutes(30)); + app.failDeployment(stagingTest); + app.submit(appPackage); + + app.runJob(systemTest).runJob(stagingTest) // Tests run with combined upgrade. + .runJob(productionUsCentral1) // Combined upgrade stays together. + .runJob(productionUsEast3).runJob(productionUsWest1); + assertEquals(Map.of(), app.deploymentStatus().jobsToRun()); + assertEquals(Change.empty(), app.instance().change()); } @Test -- cgit v1.2.3