diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-10-28 13:51:23 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-10-28 13:54:32 +0100 |
commit | cd34f7befd9685a032ecc4d54328e44176a4ed0b (patch) | |
tree | 99b510075b89c940f886fba22d1ed81d7eb171c4 /controller-server | |
parent | f29ea86738895c416b197868c919574cae556950 (diff) |
Report metric for broken system version
Diffstat (limited to 'controller-server')
4 files changed, 53 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 0c5ef123eef..780eec47e81 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.controller.deployment.JobList; import com.yahoo.vespa.hosted.controller.rotation.RotationLock; import com.yahoo.vespa.hosted.controller.versions.NodeVersion; import com.yahoo.vespa.hosted.controller.versions.NodeVersions; +import com.yahoo.vespa.hosted.controller.versions.VersionStatus; import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Clock; @@ -51,6 +52,7 @@ public class MetricsReporter extends ControllerMaintainer { public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion"; public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion"; + public static final String BROKEN_SYSTEM_VERSION = "deployment.brokenSystemVersion"; public static final String REMAINING_ROTATIONS = "remaining_rotations"; public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests"; public static final String OPERATION_PREFIX = "operation."; @@ -72,11 +74,20 @@ public class MetricsReporter extends ControllerMaintainer { reportDeploymentMetrics(); reportRemainingRotations(); reportQueuedNameServiceRequests(); - reportInfrastructureUpgradeMetrics(); + VersionStatus versionStatus = controller().readVersionStatus(); + reportInfrastructureUpgradeMetrics(versionStatus); reportAuditLog(); + reportBrokenSystemVersion(versionStatus); return true; } + private void reportBrokenSystemVersion(VersionStatus versionStatus) { + Version systemVersion = controller().systemVersion(versionStatus); + VespaVersion.Confidence confidence = versionStatus.version(systemVersion).confidence(); + int isBroken = confidence == VespaVersion.Confidence.broken ? 1 : 0; + metric.set(BROKEN_SYSTEM_VERSION, isBroken, metric.createContext(Map.of())); + } + private void reportAuditLog() { AuditLog log = controller().auditLogger().readLog(); HashMap<String, HashMap<String, Integer>> metricCounts = new HashMap<>(); @@ -109,9 +120,9 @@ public class MetricsReporter extends ControllerMaintainer { } } - private void reportInfrastructureUpgradeMetrics() { + private void reportInfrastructureUpgradeMetrics(VersionStatus versionStatus) { Map<NodeVersion, Duration> osChangeDurations = osChangeDurations(); - Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(); + Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(versionStatus); reportChangeDurations(osChangeDurations, OS_CHANGE_DURATION); reportChangeDurations(platformChangeDurations, PLATFORM_CHANGE_DURATION); reportNodeCount(osChangeDurations.keySet(), OS_NODE_COUNT); @@ -182,8 +193,8 @@ public class MetricsReporter extends ControllerMaintainer { }); } - private Map<NodeVersion, Duration> platformChangeDurations() { - return changeDurations(controller().versionStatus().versions(), VespaVersion::nodeVersions); + private Map<NodeVersion, Duration> platformChangeDurations(VersionStatus versionStatus) { + return changeDurations(versionStatus.versions(), VespaVersion::nodeVersions); } private Map<NodeVersion, Duration> osChangeDurations() { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java index d90eb715499..a42dbe7fbde 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java @@ -337,7 +337,7 @@ public class DeploymentContext { return this; } - /** Abort the running job of the given type and. */ + /** Abort the running job of the given type. */ public DeploymentContext abortJob(JobType type) { var job = jobId(type); assertNotSame(RunStatus.aborted, currentRun(job).status()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index 41d10015411..7fd02a8e780 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -101,6 +101,7 @@ public class DeploymentTester { public OutstandingChangeDeployer outstandingChangeDeployer() { return outstandingChangeDeployer; } + /** A tester with clock configured to a time when confidence can freely change */ public DeploymentTester atMondayMorning() { return at(tester.clock().instant().atZone(ZoneOffset.UTC) .with(TemporalAdjusters.previousOrSame(DayOfWeek.MONDAY)) diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 609ec128ac8..062bd97f901 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -13,11 +13,13 @@ import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; +import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.SystemApplication; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.integration.MetricsMock; import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock; +import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.time.Duration; @@ -445,6 +447,39 @@ public class MetricsReporterTest { } } + @Test + public void broken_system_version() { + var tester = new DeploymentTester().atMondayMorning(); + var ctx = tester.newDeploymentContext(); + var applicationPackage = new ApplicationPackageBuilder().upgradePolicy("canary").region("us-west-1").build(); + + // Application deploys successfully on current system version + ctx.submit(applicationPackage).deploy(); + tester.controllerTester().computeVersionStatus(); + var reporter = createReporter(tester.controller()); + reporter.maintain(); + assertEquals(VespaVersion.Confidence.high, tester.controller().readVersionStatus().systemVersion().get().confidence()); + assertEquals(0, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION)); + + // System upgrades. Canary upgrade fails + Version version0 = Version.fromString("6.2"); + tester.controllerTester().upgradeSystem(version0); + tester.upgrader().maintain(); + assertEquals(Change.of(version0), ctx.instance().change()); + ctx.failDeployment(stagingTest); + tester.controllerTester().computeVersionStatus(); + assertEquals(VespaVersion.Confidence.broken, tester.controller().readVersionStatus().systemVersion().get().confidence()); + reporter.maintain(); + assertEquals(1, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION)); + + // Canary is healed and confidence is raised + ctx.deployPlatform(version0); + tester.controllerTester().computeVersionStatus(); + assertEquals(VespaVersion.Confidence.high, tester.controller().readVersionStatus().systemVersion().get().confidence()); + reporter.maintain(); + assertEquals(0, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION)); + } + private void assertNodeCount(String metric, int n, Version version) { long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), metric) .stream() |