summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Polden <mpolden@mpolden.no>, 2018-03-16 15:13:04 +0100
committer: Martin Polden <mpolden@mpolden.no>, 2018-03-19 12:11:39 +0100
commit: eefcb1a43709b8e1657499012d72e7d7f8a75df2 (patch)
tree: 2b041b0051085669d74f58e3f252eee200e35447
parent: e0479e09d6651bfebb571492f9fee6838502b265 (diff)
Average deployment duration metric
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java | 41
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java | 65
2 files changed, 100 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index ab388ca9a9f..967db0dff99 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
+import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.SystemName;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.controller.Application;
@@ -10,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.chef.Chef;
import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNode;
import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
+import com.yahoo.vespa.hosted.controller.application.JobStatus;
import com.yahoo.vespa.hosted.controller.rotation.RotationLock;
import java.time.Clock;
@@ -21,6 +23,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.stream.Collectors;
/**
* @author mortent
@@ -30,6 +33,7 @@ public class MetricsReporter extends Maintainer {
public static final String convergeMetric = "seconds.since.last.chef.convergence";
public static final String deploymentFailMetric = "deployment.failurePercentage";
+ public static final String deploymentAverageDuration = "deployment.averageDuration";
public static final String remainingRotations = "remaining_rotations";
private final Metric metric;
@@ -112,6 +116,10 @@ public class MetricsReporter extends Maintainer {
private void reportDeploymentMetrics() {
metric.set(deploymentFailMetric, deploymentFailRatio() * 100, metric.createContext(Collections.emptyMap()));
+ for (Map.Entry<ApplicationId, Duration> entry : averageDeploymentDurations().entrySet()) {
+ metric.set(deploymentAverageDuration, entry.getValue().getSeconds(),
+ metric.createContext(Collections.singletonMap("application", entry.getKey().toString())));
+ }
}
private double deploymentFailRatio() {
@@ -121,8 +129,37 @@ public class MetricsReporter extends Maintainer {
.asList();
if (applications.isEmpty()) return 0;
- return (double)applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() /
- (double)applications.size();
+ return (double) applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() /
+ (double) applications.size();
+ }
+
+ private Map<ApplicationId, Duration> averageDeploymentDurations() {
+ Instant now = clock.instant();
+ return ApplicationList.from(controller().applications().asList())
+ .notPullRequest()
+ .hasProductionDeployment()
+ .asList()
+ .stream()
+ .collect(Collectors.toMap(Application::id,
+ application -> averageDeploymentDuration(application, now)));
+ }
+
+ private Duration averageDeploymentDuration(Application application, Instant now) {
+ List<Duration> jobDurations = application.deploymentJobs().jobStatus().values().stream()
+ .filter(status -> status.lastTriggered().isPresent())
+ .map(status -> {
+ Instant triggeredAt = status.lastTriggered().get().at();
+ Instant runningUntil = status.lastCompleted()
+ .map(JobStatus.JobRun::at)
+ .filter(at -> at.isAfter(triggeredAt))
+ .orElse(now);
+ return Duration.between(triggeredAt, runningUntil);
+ })
+ .collect(Collectors.toList());
+ return jobDurations.stream()
+ .reduce(Duration::plus)
+ .map(totalDuration -> totalDuration.dividedBy(jobDurations.size()))
+ .orElse(Duration.ZERO);
}
private void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index d57f01bd132..93fdf3618a7 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -26,11 +26,14 @@ import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Clock;
+import java.time.Duration;
import java.time.Instant;
import java.time.ZoneId;
import java.util.Map;
import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component;
+import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionUsWest1;
+import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.stagingTest;
import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -54,8 +57,9 @@ public class MetricsReporterTest {
@Test
public void test_chef_metrics() {
+ Clock clock = Clock.fixed(Instant.ofEpochSecond(1475497913), ZoneId.systemDefault());;
ControllerTester tester = new ControllerTester();
- MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.cd);
+ MetricsReporter metricsReporter = createReporter(clock, tester.controller(), metrics, SystemName.cd);
metricsReporter.maintain();
assertEquals(2, metrics.getMetrics().size());
@@ -95,7 +99,7 @@ public class MetricsReporterTest {
assertEquals(0.0, metrics.getMetric(MetricsReporter.deploymentFailMetric));
// 1 app fails system-test
- tester.jobCompletion(component).application(app4).submit();
+ tester.jobCompletion(component).application(app4).nextBuildNumber().uploadArtifact(applicationPackage).submit();
tester.deployAndNotify(app4, applicationPackage, false, systemTest);
metricsReporter.maintain();
@@ -110,11 +114,66 @@ public class MetricsReporterTest {
assertNull(metricContext.getDimensions().get("zone"));
}
+ @Test
+ public void test_deployment_average_duration() {
+ DeploymentTester tester = new DeploymentTester();
+ ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+ .environment(Environment.prod)
+ .region("us-west-1")
+ .build();
+
+ MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.cd);
+
+ Application app = tester.createApplication("app1", "tenant1", 1, 11L);
+ tester.deployCompletely(app, applicationPackage);
+ reporter.maintain();
+ assertEquals(Duration.ZERO, getAverageDeploymentDuration(app)); // An exceptionally fast deployment :-)
+
+ // App spends 3 hours deploying
+ tester.jobCompletion(component).application(app).nextBuildNumber().uploadArtifact(applicationPackage).submit();
+ tester.clock().advance(Duration.ofHours(1));
+ tester.deployAndNotify(app, applicationPackage, true, systemTest);
+
+ tester.clock().advance(Duration.ofMinutes(30));
+ tester.deployAndNotify(app, applicationPackage, true, stagingTest);
+
+ tester.clock().advance(Duration.ofMinutes(90));
+ tester.deployAndNotify(app, applicationPackage, true, productionUsWest1);
+ reporter.maintain();
+
+ // Average time is 1 hour
+ assertEquals(Duration.ofHours(1), getAverageDeploymentDuration(app));
+
+ // Another deployment starts and stalls for 12 hours
+ tester.jobCompletion(component).application(app).nextBuildNumber(2).uploadArtifact(applicationPackage).submit();
+ tester.clock().advance(Duration.ofHours(12));
+ reporter.maintain();
+
+ assertEquals(Duration.ofHours(12) // hanging system-test
+ .plus(Duration.ofMinutes(30)) // previous staging-test
+ .plus(Duration.ofMinutes(90)) // previous production job
+ .dividedBy(3), // Total number of orchestrated jobs
+ getAverageDeploymentDuration(app));
+ }
+
+ private Duration getAverageDeploymentDuration(Application application) {
+ return metrics.getMetric((dimension, value) -> dimension.equals("application") &&
+ value.equals(application.id().toString()),
+ MetricsReporter.deploymentAverageDuration)
+ .map(seconds -> Duration.ofSeconds(seconds.longValue()))
+ .orElseThrow(() -> new RuntimeException("Expected metric to exist for " + application.id()));
+ }
+
private void assertDimension(MapContext metricContext, String dimensionName, String expectedValue) {
assertEquals(expectedValue, metricContext.getDimensions().get(dimensionName));
}
private MetricsReporter createReporter(Controller controller, MetricsMock metricsMock, SystemName system) {
+ return createReporter(controller.clock(), controller, metricsMock, system);
+ }
+
+ private MetricsReporter createReporter(Clock clock, Controller controller, MetricsMock metricsMock,
+ SystemName system) {
Chef client = Mockito.mock(Chef.class);
PartialNodeResult result;
try {
@@ -126,8 +185,6 @@ public class MetricsReporterTest {
}
when(client.partialSearchNodes(anyString(), anyListOf(AttributeMapping.class))).thenReturn(result);
- Clock clock = Clock.fixed(Instant.ofEpochSecond(1475497913), ZoneId.systemDefault());
-
return new MetricsReporter(controller, metricsMock, client, clock, new JobControl(new MockCuratorDb()), system);
}