diff options
author | jonmv <venstad@gmail.com> | 2022-11-10 11:12:21 +0100 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2022-11-10 11:12:21 +0100 |
commit | 463ad9db000b6383215b060567c68122ac09d88a (patch) | |
tree | d3ce4343b52c9324362b220b9b5bb95a62b9f8af /controller-server | |
parent | b26cbb29aec75b76767e6e1631eae2a11f035823 (diff) |
Avoid rare deadlock with production test/deployment triggering
Diffstat (limited to 'controller-server')
2 files changed, 76 insertions, 13 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java index e16b1378377..4c314c64f7f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatus.java @@ -40,6 +40,7 @@ import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -603,21 +604,24 @@ public class DeploymentStatus { // the revision is now blocked by waiting for the production test to verify the upgrade. // In this case we must abandon the production test on the pure upgrade, so the revision can be deployed. if (platformDeployedAt.isPresent() && revisionDeployedAt.isEmpty()) { - if (jobSteps.get(deployment).readyAt(change, Optional.of(deployment)) - .map(ready -> ! now.isBefore(ready)).orElse(false)) { - switch (rollout) { - // If separate rollout, this test should keep blocking the revision, unless there are failures. - case separate: return hasFailures(jobSteps.get(deployment), jobSteps.get(job)) ? List.of(change) : List.of(change.withoutApplication(), change); - // If leading rollout, this test should now expect the two changes to fuse and roll together. - case leading: return List.of(change); - // If simultaneous rollout, this test should now expect the revision to run ahead. - case simultaneous: return List.of(change.withoutPlatform(), change); - } - } + if (jobSteps.get(deployment).readyAt(change, Optional.of(deployment)) + .map(ready -> ! now.isBefore(ready)).orElse(false)) { + return switch (rollout) { + // If separate rollout, this test should keep blocking the revision, unless there are failures. + case separate -> hasFailures(jobSteps.get(deployment), jobSteps.get(job)) ? List.of(change) : List.of(change.withoutApplication(), change); + // If leading rollout, this test should now expect the two changes to fuse and roll together. + case leading -> List.of(change); + // If simultaneous rollout, this test should now expect the revision to run ahead. + case simultaneous -> List.of(change.withoutPlatform(), change); + }; + } return List.of(change.withoutApplication(), change); } - // If neither is deployed, then neither is ready, and we guess like for deployments. - // If both are deployed, then we need to follow normal logic for whatever is ready. + // If neither is deployed, then neither is ready, and we assume the same order of changes as for the deployment job. + if (platformDeployedAt.isEmpty()) + return changes(deployment, jobSteps.get(deployment), change); + + // If both are deployed, then we need to follow normal logic for what is ready. } Optional<Instant> platformReadyAt = step.dependenciesCompletedAt(change.withoutApplication(), Optional.of(job)); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index ea9cf0385d9..9fdecfa625e 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -2787,4 +2787,63 @@ public class DeploymentTriggerTest { assertThrows(IllegalStateException.class, JobType.fromJobName("staging-test", zones)::zone); } + @Test + void test() { + String deploymentXml = """ + <deployment version="1.0"> + <test/> + <staging/> + <block-change days="fri" hours="0-23" time-zone="UTC" /> + <prod> + <region>us-east-3</region> + <delay hours="1"/> + <test>us-east-3</test> + <region>us-west-1</region> + </prod> + </deployment>"""; + + // TODO jonmv: recreate problem where revision starts, then upgrade, while prod is blocked, + // then both are tested as separate upgrades, but prod-test has them reversed. + + Version version1 = new Version("7.1"); + tester.controllerTester().upgradeSystem(version1); + ApplicationPackage applicationPackage = ApplicationPackageBuilder.fromDeploymentXml(deploymentXml); + tester.clock().setInstant(Instant.EPOCH.plusSeconds(8 * 60 * 60)); // Thursday morning. + DeploymentContext app = tester.newDeploymentContext().submit(applicationPackage); + RevisionId revision1 = app.lastSubmission().get(); + app.runJob(systemTest).runJob(stagingTest).runJob(productionUsEast3); + tester.clock().advance(Duration.ofHours(1)); + app.runJob(testUsEast3).runJob(productionUsWest1); + assertEquals(Change.empty(), app.instance().change()); + + tester.clock().advance(Duration.ofDays(1)); // Enter block window. + app.submit(applicationPackage); + assertEquals(Change.empty(), app.instance().change()); + + Version version2 = new Version("7.2"); + RevisionId revision2 = app.lastSubmission().get(); + + app.runJob(systemTest).runJob(stagingTest); + app.triggerJobs().assertNotRunning(productionUsEast3); + tester.controllerTester().upgradeSystem(version2); + tester.clock().advance(Duration.ofDays(1)); // Leave block window. + tester.upgrader().run(); + tester.outstandingChangeDeployer().run(); + assertEquals(Change.of(revision2).with(version2), app.instance().change()); + app.runJob(systemTest).runJob(stagingTest); + app.runJob(productionUsEast3); + app.triggerJobs(); + app.assertNotRunning(productionUsEast3); // Platform upgrade should not start before test is done with revision. + tester.clock().advance(Duration.ofHours(1)); + app.triggerJobs(); + app.assertNotRunning(productionUsEast3); // Platform upgrade should not start before test is done with revision. + app.runJob(testUsEast3); + app.runJob(productionUsEast3) + .runJob(productionUsWest1); + tester.clock().advance(Duration.ofHours(1)); + app.runJob(testUsEast3); + app.runJob(productionUsWest1); + assertEquals(Change.empty(), app.instance().change()); + } + } |