diff options
author | Martin Polden <mpolden@mpolden.no> | 2018-08-23 13:02:49 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2018-08-23 13:06:46 +0200 |
commit | eea64065947b894284ecc6ce1e0a2ce39df45ac3 (patch) | |
tree | 28d5d9a579c9cdaa170b76ec958d04638d821fa3 /controller-server | |
parent | 7854ede299fa3a10b1b34154bc06ec685a960130 (diff) |
Emit metric for deployments failing on upgrade
Diffstat (limited to 'controller-server')
2 files changed, 109 insertions, 31 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index feec83d226e..258a72c8d01 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.google.common.collect.ImmutableMap; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.SystemName; import com.yahoo.jdisc.Metric; @@ -11,6 +12,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.chef.Chef; import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNode; import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult; import com.yahoo.vespa.hosted.controller.application.ApplicationList; +import com.yahoo.vespa.hosted.controller.application.JobList; import com.yahoo.vespa.hosted.controller.application.JobStatus; import com.yahoo.vespa.hosted.controller.rotation.RotationLock; @@ -34,6 +36,7 @@ public class MetricsReporter extends Maintainer { public static final String convergeMetric = "seconds.since.last.chef.convergence"; public static final String deploymentFailMetric = "deployment.failurePercentage"; public static final String deploymentAverageDuration = "deployment.averageDuration"; + public static final String deploymentFailingUpgrades = "deployment.failingUpgrades"; public static final String remainingRotations = "remaining_rotations"; private final Metric metric; @@ -113,38 +116,46 @@ public class MetricsReporter extends Maintainer { } private void reportDeploymentMetrics() { - metric.set(deploymentFailMetric, 
deploymentFailRatio() * 100, metric.createContext(Collections.emptyMap())); - for (Map.Entry<ApplicationId, Duration> entry : averageDeploymentDurations().entrySet()) { - Map<String, String> dimensions = new HashMap<>(); - dimensions.put("tenant", entry.getKey().tenant().value()); - dimensions.put("app", entry.getKey().application().value() + "." + entry.getKey().instance().value()); - metric.set(deploymentAverageDuration, entry.getValue().getSeconds(), metric.createContext(dimensions)); - } + ApplicationList applications = ApplicationList.from(controller().applications().asList()) + .notPullRequest() + .hasProductionDeployment(); + + metric.set(deploymentFailMetric, deploymentFailRatio(applications) * 100, metric.createContext(Collections.emptyMap())); + + averageDeploymentDurations(applications, clock.instant()).forEach((application, duration) -> { + metric.set(deploymentAverageDuration, duration.getSeconds(), metric.createContext(dimensions(application))); + }); + + deploymentsFailingUpgrade(applications).forEach((application, failingJobs) -> { + metric.set(deploymentFailingUpgrades, failingJobs, metric.createContext(dimensions(application))); + }); } - private double deploymentFailRatio() { - List<Application> applications = ApplicationList.from(controller().applications().asList()) - .notPullRequest() - .hasProductionDeployment() - .asList(); + private static double deploymentFailRatio(ApplicationList applicationList) { + List<Application> applications = applicationList.asList(); if (applications.isEmpty()) return 0; return (double) applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() / (double) applications.size(); } - private Map<ApplicationId, Duration> averageDeploymentDurations() { - Instant now = clock.instant(); - return ApplicationList.from(controller().applications().asList()) - .notPullRequest() - .hasProductionDeployment() - .asList() - .stream() - .collect(Collectors.toMap(Application::id, - application -> 
averageDeploymentDuration(application, now))); + private static Map<ApplicationId, Duration> averageDeploymentDurations(ApplicationList applications, Instant now) { + return applications.asList().stream() + .collect(Collectors.toMap(Application::id, + application -> averageDeploymentDuration(application, now))); } - private Duration averageDeploymentDuration(Application application, Instant now) { + private static Map<ApplicationId, Integer> deploymentsFailingUpgrade(ApplicationList applications) { + return applications.asList() + .stream() + .collect(Collectors.toMap(Application::id, MetricsReporter::deploymentsFailingUpgrade)); + } + + private static int deploymentsFailingUpgrade(Application application) { + return JobList.from(application).upgrading().failing().size(); + } + + private static Duration averageDeploymentDuration(Application application, Instant now) { List<Duration> jobDurations = application.deploymentJobs().jobStatus().values().stream() .filter(status -> status.lastTriggered().isPresent()) .map(status -> { @@ -162,10 +173,17 @@ public class MetricsReporter extends Maintainer { .orElse(Duration.ZERO); } - private void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) { + private static void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) { nodeResult.rows.removeIf(node -> !system.name().equals(node.getValue("system").orElse("main"))); } + private static Map<String, String> dimensions(ApplicationId application) { + return ImmutableMap.of( + "tenant", application.tenant().value(), + "app",application.application().value() + "." 
+ application.instance().value() + ); + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index fa6edd939c4..7ab858f3081 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -3,18 +3,19 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import com.yahoo.component.Version; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; -import com.yahoo.vespa.hosted.controller.integration.MetricsMock; -import com.yahoo.vespa.hosted.controller.integration.MetricsMock.MapContext; import com.yahoo.vespa.hosted.controller.api.integration.chef.ChefMock; import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; +import com.yahoo.vespa.hosted.controller.integration.MetricsMock; +import com.yahoo.vespa.hosted.controller.integration.MetricsMock.MapContext; import com.yahoo.vespa.hosted.controller.persistence.MockCuratorDb; import org.junit.Before; import org.junit.Test; @@ -43,6 +44,7 @@ import static org.junit.Assert.assertNull; public class MetricsReporterTest { private static final Path testData = Paths.get("src/test/resources/"); + private MetricsMock metrics; @Before 
@@ -75,7 +77,7 @@ public class MetricsReporterTest { .environment(Environment.prod) .region("us-west-1") .build(); - MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.cd); + MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.main); metricsReporter.maintain(); assertEquals(0.0, metrics.getMetric(MetricsReporter.deploymentFailMetric)); @@ -102,7 +104,7 @@ public class MetricsReporterTest { } @Test - public void it_omits_zone_when_unknown() { + public void test_chef_metrics_omit_zone_when_unknown() { ControllerTester tester = new ControllerTester(); String hostname = "fake-node2.test"; MapContext metricContext = getMetricContextByHost(tester.controller(), hostname); @@ -117,7 +119,7 @@ public class MetricsReporterTest { .region("us-west-1") .build(); - MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.cd); + MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.main); Application app = tester.createApplication("app1", "tenant1", 1, 11L); tester.deployCompletely(app, applicationPackage); @@ -136,7 +138,7 @@ public class MetricsReporterTest { tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); reporter.maintain(); - // Average time is 1 hour + // Average time is 1 hour (system-test) + 90 minutes (staging-test runs in parallel with system-test) + 90 minutes (production) / 3 jobs assertEquals(Duration.ofMinutes(80), getAverageDeploymentDuration(app)); // Another deployment starts and stalls for 12 hours @@ -151,11 +153,69 @@ public class MetricsReporterTest { getAverageDeploymentDuration(app)); } + @Test + public void test_deployments_failing_upgrade() { + DeploymentTester tester = new DeploymentTester(); + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + MetricsReporter reporter = createReporter(tester.controller(), 
metrics, SystemName.main); + Application app = tester.createApplication("app1", "tenant1", 1, 11L); + + // Initial deployment without failures + tester.deployCompletely(app, applicationPackage); + reporter.maintain(); + assertEquals(0, getDeploymentsFailingUpgrade(app)); + + // Failing application change is not counted + tester.jobCompletion(component).application(app).nextBuildNumber().uploadArtifact(applicationPackage).submit(); + tester.deployAndNotify(app, applicationPackage, false, systemTest); + reporter.maintain(); + assertEquals(0, getDeploymentsFailingUpgrade(app)); + + // Application change completes + tester.deployAndNotify(app, applicationPackage, true, systemTest); + tester.deployAndNotify(app, applicationPackage, true, stagingTest); + tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); + assertFalse("Change deployed", tester.controller().applications().require(app.id()).change().isPresent()); + + // New version is released and upgrade fails in test environments + Version version = Version.fromString("7.1"); + tester.upgradeSystem(version); + tester.upgrader().maintain(); + tester.deployAndNotify(app, applicationPackage, false, systemTest); + tester.deployAndNotify(app, applicationPackage, false, stagingTest); + reporter.maintain(); + assertEquals(2, getDeploymentsFailingUpgrade(app)); + + // Test and staging pass and upgrade fails in production + tester.deployAndNotify(app, applicationPackage, true, systemTest); + tester.deployAndNotify(app, applicationPackage, true, stagingTest); + tester.deployAndNotify(app, applicationPackage, false, productionUsWest1); + reporter.maintain(); + assertEquals(1, getDeploymentsFailingUpgrade(app)); + + // Upgrade eventually succeeds + tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); + assertFalse("Upgrade deployed", tester.controller().applications().require(app.id()).change().isPresent()); + reporter.maintain(); + assertEquals(0, getDeploymentsFailingUpgrade(app)); + } 
+ private Duration getAverageDeploymentDuration(Application application) { + return Duration.ofSeconds(getMetric(MetricsReporter.deploymentAverageDuration, application).longValue()); + } + + private int getDeploymentsFailingUpgrade(Application application) { + return getMetric(MetricsReporter.deploymentFailingUpgrades, application).intValue(); + } + + private Number getMetric(String name, Application application) { return metrics.getMetric((dimensions) -> application.id().tenant().value().equals(dimensions.get("tenant")) && appDimension(application).equals(dimensions.get("app")), - MetricsReporter.deploymentAverageDuration) - .map(seconds -> Duration.ofSeconds(seconds.longValue())) + name) .orElseThrow(() -> new RuntimeException("Expected metric to exist for " + application.id())); } |