diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2020-08-17 11:03:05 +0200 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2020-08-17 11:03:05 +0200 |
commit | 57c235368d9c450918e76445b8d0782564e8380e (patch) | |
tree | d18ac3543d59c03b9321d42e9cfdf7e5ce480b3b /controller-server/src | |
parent | 0f03e049eaa576fadd4c3e459148099a7fc6b126 (diff) |
Keep only sequences of failing production jobs in version history
Diffstat (limited to 'controller-server/src')
3 files changed, 99 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/DeploymentStatistics.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/DeploymentStatistics.java index 99cf7542d53..df018d64748 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/DeploymentStatistics.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/versions/DeploymentStatistics.java @@ -84,13 +84,14 @@ public class DeploymentStatistics { for (Deployment deployment : instance.productionDeployments().values()) allVersions.add(deployment.version()); - JobList failing = status.jobs().failing(); + JobList failing = status.jobs().failing() + .not().withStatus(RunStatus.outOfCapacity) + .not().withStatus(RunStatus.aborted); - // Add all unsuccessful runs for failing jobs as any run may have resulted in an incomplete deployment + // Add all unsuccessful runs for failing production jobs as any run may have resulted in an incomplete deployment // where a subset of nodes have upgraded. failing.not().failingApplicationChange() - .not().withStatus(RunStatus.outOfCapacity) - .not().withStatus(RunStatus.aborted) + .production() .mapToList(JobStatus::runs) .forEach(runs -> runs.descendingMap().values().stream() .dropWhile(run -> ! run.hasEnded()) @@ -101,9 +102,17 @@ public class DeploymentStatistics { failingUpgrade.get(run.versions().targetPlatform()).add(run); })); + // Add only the last failing run for test jobs. + failing.not().failingApplicationChange() + .not().production() + .lastCompleted().asList() + .forEach(run -> { + failingUpgrade.putIfAbsent(run.versions().targetPlatform(), new ArrayList<>()); + failingUpgrade.get(run.versions().targetPlatform()).add(run); + }); + + // Add only the last failing for instances failing only an application change, i.e., no upgrade. failing.failingApplicationChange() - .concat(failing.withStatus(RunStatus.outOfCapacity)) - .concat(failing.withStatus(RunStatus.aborted)) .lastCompleted().asList() .forEach(run -> { otherFailing.putIfAbsent(run.versions().targetPlatform(), new ArrayList<>()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java index 157941bd092..dcb7a6dd42b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentIssueReporterTest.java @@ -162,6 +162,10 @@ public class DeploymentIssueReporterTest { reporter.maintain(); assertTrue("We get a platform issue when confidence is broken", issues.platformIssue()); assertFalse("No deployment issue is filed for app2, which has a version upgrade failure.", issues.isOpenFor(app2.application().id())); + + app2.runJob(systemTest); + tester.controllerTester().upgradeSystem(version); + assertEquals(VespaVersion.Confidence.low, tester.controller().versionStatus().systemVersion().get().confidence()); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java index 44e172d9f93..3ea9d038d99 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/versions/VersionStatusTest.java @@ -385,6 +385,86 @@ public class VersionStatusTest { } @Test + public void testConfidenceWithLingeringVersions() { + DeploymentTester tester = new DeploymentTester().atMondayMorning(); + Version version0 = new Version("6.2"); + tester.controllerTester().upgradeSystem(version0); + tester.upgrader().maintain(); + var appPackage = new ApplicationPackageBuilder().region("us-west-1").region("us-east-3").upgradePolicy("canary"); + + var canary0 = tester.newDeploymentContext("tenant1", "canary0", "default") + .submit(appPackage.build()) + .deploy(); + + assertEquals("All applications running on this version: High", + Confidence.high, confidence(tester.controller(), version0)); + + // New version is released + Version version1 = new Version("6.3"); + tester.controllerTester().upgradeSystem(version1); + tester.upgrader().maintain(); + tester.triggerJobs(); + + // App upgrades to the new version and fails + canary0.failDeployment(systemTest); + canary0.abortJob(stagingTest); + tester.controllerTester().computeVersionStatus(); + assertEquals("One canary failed: Broken", + Confidence.broken, confidence(tester.controller(), version1)); + + // New version is released + Version version2 = new Version("6.4"); + tester.controllerTester().upgradeSystem(version2); + tester.upgrader().maintain(); + assertEquals("Confidence remains unchanged for version1 until app overrides old tests: Broken", + Confidence.broken, confidence(tester.controller(), version1)); + assertEquals("Confidence defaults to low for version with no applications", + Confidence.low, confidence(tester.controller(), version2)); + assertEquals(version2, canary0.instance().change().platform().orElseThrow()); + + canary0.failDeployment(systemTest); + canary0.abortJob(stagingTest); + tester.controllerTester().computeVersionStatus(); + assertFalse("Previous version should be forgotten, as canary only had test jobs run on it", + tester.controller().versionStatus().versions().stream().anyMatch(version -> version.versionNumber().equals(version1))); + + // App succeeds with tests, but fails production deployment + canary0.runJob(systemTest) + .runJob(stagingTest) + .failDeployment(productionUsWest1); + + assertEquals("One canary failed: Broken", + Confidence.broken, confidence(tester.controller(), version2)); + + // A new version is released, and the app again fails production deployment. + Version version3 = new Version("6.5"); + tester.controllerTester().upgradeSystem(version3); + tester.upgrader().maintain(); + assertEquals("Confidence remains unchanged for version2: Broken", + Confidence.broken, confidence(tester.controller(), version2)); + assertEquals("Confidence defaults to low for version with no applications", + Confidence.low, confidence(tester.controller(), version3)); + assertEquals(version3, canary0.instance().change().platform().orElseThrow()); + + canary0.runJob(systemTest) + .runJob(stagingTest) + .failDeployment(productionUsWest1); + tester.controllerTester().computeVersionStatus(); + assertEquals("Confidence remains unchanged for version2: Broken", + Confidence.broken, confidence(tester.controller(), version2)); + assertEquals("Canary broken, so confidence for version3: Broken", + Confidence.broken, confidence(tester.controller(), version3)); + + // App succeeds production deployment, clearing failure on version2 + canary0.runJob(productionUsWest1); + tester.controllerTester().computeVersionStatus(); + assertFalse("Previous version should be forgotten, as canary only had test jobs run on it", + tester.controller().versionStatus().versions().stream().anyMatch(version -> version.versionNumber().equals(version2))); + assertEquals("Canary OK, but not done upgrading, so confidence for version3: Low", + Confidence.low, confidence(tester.controller(), version3)); + } + + @Test public void testConfidenceOverride() { DeploymentTester tester = new DeploymentTester(); Version version0 = new Version("6.2"); |