summaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2018-08-23 13:02:49 +0200
committer Martin Polden <mpolden@mpolden.no> 2018-08-23 13:06:46 +0200
commit eea64065947b894284ecc6ce1e0a2ce39df45ac3 (patch)
tree 28d5d9a579c9cdaa170b76ec958d04638d821fa3 /controller-server
parent 7854ede299fa3a10b1b34154bc06ec685a960130 (diff)
Emit metric for deployments failing on upgrade
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java | 64
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java | 76
2 files changed, 109 insertions, 31 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index feec83d226e..258a72c8d01 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.google.common.collect.ImmutableMap;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.SystemName;
import com.yahoo.jdisc.Metric;
@@ -11,6 +12,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.chef.Chef;
import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNode;
import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
+import com.yahoo.vespa.hosted.controller.application.JobList;
import com.yahoo.vespa.hosted.controller.application.JobStatus;
import com.yahoo.vespa.hosted.controller.rotation.RotationLock;
@@ -34,6 +36,7 @@ public class MetricsReporter extends Maintainer {
public static final String convergeMetric = "seconds.since.last.chef.convergence";
public static final String deploymentFailMetric = "deployment.failurePercentage";
public static final String deploymentAverageDuration = "deployment.averageDuration";
+ public static final String deploymentFailingUpgrades = "deployment.failingUpgrades";
public static final String remainingRotations = "remaining_rotations";
private final Metric metric;
@@ -113,38 +116,46 @@ public class MetricsReporter extends Maintainer {
}
private void reportDeploymentMetrics() {
- metric.set(deploymentFailMetric, deploymentFailRatio() * 100, metric.createContext(Collections.emptyMap()));
- for (Map.Entry<ApplicationId, Duration> entry : averageDeploymentDurations().entrySet()) {
- Map<String, String> dimensions = new HashMap<>();
- dimensions.put("tenant", entry.getKey().tenant().value());
- dimensions.put("app", entry.getKey().application().value() + "." + entry.getKey().instance().value());
- metric.set(deploymentAverageDuration, entry.getValue().getSeconds(), metric.createContext(dimensions));
- }
+ ApplicationList applications = ApplicationList.from(controller().applications().asList())
+ .notPullRequest()
+ .hasProductionDeployment();
+
+ metric.set(deploymentFailMetric, deploymentFailRatio(applications) * 100, metric.createContext(Collections.emptyMap()));
+
+ averageDeploymentDurations(applications, clock.instant()).forEach((application, duration) -> {
+ metric.set(deploymentAverageDuration, duration.getSeconds(), metric.createContext(dimensions(application)));
+ });
+
+ deploymentsFailingUpgrade(applications).forEach((application, failingJobs) -> {
+ metric.set(deploymentFailingUpgrades, failingJobs, metric.createContext(dimensions(application)));
+ });
}
- private double deploymentFailRatio() {
- List<Application> applications = ApplicationList.from(controller().applications().asList())
- .notPullRequest()
- .hasProductionDeployment()
- .asList();
+ private static double deploymentFailRatio(ApplicationList applicationList) {
+ List<Application> applications = applicationList.asList();
if (applications.isEmpty()) return 0;
return (double) applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() /
(double) applications.size();
}
- private Map<ApplicationId, Duration> averageDeploymentDurations() {
- Instant now = clock.instant();
- return ApplicationList.from(controller().applications().asList())
- .notPullRequest()
- .hasProductionDeployment()
- .asList()
- .stream()
- .collect(Collectors.toMap(Application::id,
- application -> averageDeploymentDuration(application, now)));
+ private static Map<ApplicationId, Duration> averageDeploymentDurations(ApplicationList applications, Instant now) {
+ return applications.asList().stream()
+ .collect(Collectors.toMap(Application::id,
+ application -> averageDeploymentDuration(application, now)));
}
- private Duration averageDeploymentDuration(Application application, Instant now) {
+ private static Map<ApplicationId, Integer> deploymentsFailingUpgrade(ApplicationList applications) {
+ return applications.asList()
+ .stream()
+ .collect(Collectors.toMap(Application::id, MetricsReporter::deploymentsFailingUpgrade));
+ }
+
+ private static int deploymentsFailingUpgrade(Application application) {
+ return JobList.from(application).upgrading().failing().size();
+ }
+
+ private static Duration averageDeploymentDuration(Application application, Instant now) {
List<Duration> jobDurations = application.deploymentJobs().jobStatus().values().stream()
.filter(status -> status.lastTriggered().isPresent())
.map(status -> {
@@ -162,10 +173,17 @@ public class MetricsReporter extends Maintainer {
.orElse(Duration.ZERO);
}
- private void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) {
+ private static void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) {
nodeResult.rows.removeIf(node -> !system.name().equals(node.getValue("system").orElse("main")));
}
+ private static Map<String, String> dimensions(ApplicationId application) {
+ return ImmutableMap.of(
+ "tenant", application.tenant().value(),
+ "app",application.application().value() + "." + application.instance().value()
+ );
+ }
+
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index fa6edd939c4..7ab858f3081 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -3,18 +3,19 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.component.Version;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.SystemName;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.ControllerTester;
-import com.yahoo.vespa.hosted.controller.integration.MetricsMock;
-import com.yahoo.vespa.hosted.controller.integration.MetricsMock.MapContext;
import com.yahoo.vespa.hosted.controller.api.integration.chef.ChefMock;
import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult;
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
+import com.yahoo.vespa.hosted.controller.integration.MetricsMock;
+import com.yahoo.vespa.hosted.controller.integration.MetricsMock.MapContext;
import com.yahoo.vespa.hosted.controller.persistence.MockCuratorDb;
import org.junit.Before;
import org.junit.Test;
@@ -43,6 +44,7 @@ import static org.junit.Assert.assertNull;
public class MetricsReporterTest {
private static final Path testData = Paths.get("src/test/resources/");
+
private MetricsMock metrics;
@Before
@@ -75,7 +77,7 @@ public class MetricsReporterTest {
.environment(Environment.prod)
.region("us-west-1")
.build();
- MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.cd);
+ MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.main);
metricsReporter.maintain();
assertEquals(0.0, metrics.getMetric(MetricsReporter.deploymentFailMetric));
@@ -102,7 +104,7 @@ public class MetricsReporterTest {
}
@Test
- public void it_omits_zone_when_unknown() {
+ public void test_chef_metrics_omit_zone_when_unknown() {
ControllerTester tester = new ControllerTester();
String hostname = "fake-node2.test";
MapContext metricContext = getMetricContextByHost(tester.controller(), hostname);
@@ -117,7 +119,7 @@ public class MetricsReporterTest {
.region("us-west-1")
.build();
- MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.cd);
+ MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.main);
Application app = tester.createApplication("app1", "tenant1", 1, 11L);
tester.deployCompletely(app, applicationPackage);
@@ -136,7 +138,7 @@ public class MetricsReporterTest {
tester.deployAndNotify(app, applicationPackage, true, productionUsWest1);
reporter.maintain();
- // Average time is 1 hour
+ // Average time is (1 hour (system-test) + 90 minutes (staging-test, which runs in parallel with system-test) + 90 minutes (production)) / 3 jobs = 80 minutes
assertEquals(Duration.ofMinutes(80), getAverageDeploymentDuration(app));
// Another deployment starts and stalls for 12 hours
@@ -151,11 +153,69 @@ public class MetricsReporterTest {
getAverageDeploymentDuration(app));
}
+ @Test
+ public void test_deployments_failing_upgrade() {
+ DeploymentTester tester = new DeploymentTester();
+ ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+ .environment(Environment.prod)
+ .region("us-west-1")
+ .build();
+
+ MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.main);
+ Application app = tester.createApplication("app1", "tenant1", 1, 11L);
+
+ // Initial deployment without failures
+ tester.deployCompletely(app, applicationPackage);
+ reporter.maintain();
+ assertEquals(0, getDeploymentsFailingUpgrade(app));
+
+ // Failing application change is not counted
+ tester.jobCompletion(component).application(app).nextBuildNumber().uploadArtifact(applicationPackage).submit();
+ tester.deployAndNotify(app, applicationPackage, false, systemTest);
+ reporter.maintain();
+ assertEquals(0, getDeploymentsFailingUpgrade(app));
+
+ // Application change completes
+ tester.deployAndNotify(app, applicationPackage, true, systemTest);
+ tester.deployAndNotify(app, applicationPackage, true, stagingTest);
+ tester.deployAndNotify(app, applicationPackage, true, productionUsWest1);
+ assertFalse("Change deployed", tester.controller().applications().require(app.id()).change().isPresent());
+
+ // New version is released and upgrade fails in test environments
+ Version version = Version.fromString("7.1");
+ tester.upgradeSystem(version);
+ tester.upgrader().maintain();
+ tester.deployAndNotify(app, applicationPackage, false, systemTest);
+ tester.deployAndNotify(app, applicationPackage, false, stagingTest);
+ reporter.maintain();
+ assertEquals(2, getDeploymentsFailingUpgrade(app));
+
+ // Test and staging pass and upgrade fails in production
+ tester.deployAndNotify(app, applicationPackage, true, systemTest);
+ tester.deployAndNotify(app, applicationPackage, true, stagingTest);
+ tester.deployAndNotify(app, applicationPackage, false, productionUsWest1);
+ reporter.maintain();
+ assertEquals(1, getDeploymentsFailingUpgrade(app));
+
+ // Upgrade eventually succeeds
+ tester.deployAndNotify(app, applicationPackage, true, productionUsWest1);
+ assertFalse("Upgrade deployed", tester.controller().applications().require(app.id()).change().isPresent());
+ reporter.maintain();
+ assertEquals(0, getDeploymentsFailingUpgrade(app));
+ }
+
private Duration getAverageDeploymentDuration(Application application) {
+ return Duration.ofSeconds(getMetric(MetricsReporter.deploymentAverageDuration, application).longValue());
+ }
+
+ private int getDeploymentsFailingUpgrade(Application application) {
+ return getMetric(MetricsReporter.deploymentFailingUpgrades, application).intValue();
+ }
+
+ private Number getMetric(String name, Application application) {
return metrics.getMetric((dimensions) -> application.id().tenant().value().equals(dimensions.get("tenant")) &&
appDimension(application).equals(dimensions.get("app")),
- MetricsReporter.deploymentAverageDuration)
- .map(seconds -> Duration.ofSeconds(seconds.longValue()))
+ name)
.orElseThrow(() -> new RuntimeException("Expected metric to exist for " + application.id()));
}