diff options
author | Martin Polden <mpolden@mpolden.no> | 2018-03-16 15:13:04 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2018-03-19 12:11:39 +0100 |
commit | eefcb1a43709b8e1657499012d72e7d7f8a75df2 (patch) | |
tree | 2b041b0051085669d74f58e3f252eee200e35447 | |
parent | e0479e09d6651bfebb571492f9fee6838502b265 (diff) |
Average deployment duration metric
2 files changed, 100 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index ab388ca9a9f..967db0dff99 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.SystemName; import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.Application; @@ -10,6 +11,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.chef.Chef; import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNode; import com.yahoo.vespa.hosted.controller.api.integration.chef.rest.PartialNodeResult; import com.yahoo.vespa.hosted.controller.application.ApplicationList; +import com.yahoo.vespa.hosted.controller.application.JobStatus; import com.yahoo.vespa.hosted.controller.rotation.RotationLock; import java.time.Clock; @@ -21,6 +23,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; /** * @author mortent @@ -30,6 +33,7 @@ public class MetricsReporter extends Maintainer { public static final String convergeMetric = "seconds.since.last.chef.convergence"; public static final String deploymentFailMetric = "deployment.failurePercentage"; + public static final String deploymentAverageDuration = "deployment.averageDuration"; public static final String remainingRotations = "remaining_rotations"; private final Metric metric; @@ -112,6 +116,10 @@ public class MetricsReporter extends Maintainer { private void reportDeploymentMetrics() { metric.set(deploymentFailMetric, deploymentFailRatio() * 100, metric.createContext(Collections.emptyMap())); + for (Map.Entry<ApplicationId, Duration> entry : averageDeploymentDurations().entrySet()) { + metric.set(deploymentAverageDuration, entry.getValue().getSeconds(), + metric.createContext(Collections.singletonMap("application", entry.getKey().toString()))); + } } private double deploymentFailRatio() { @@ -121,8 +129,37 @@ public class MetricsReporter extends Maintainer { .asList(); if (applications.isEmpty()) return 0; - return (double)applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() / - (double)applications.size(); + return (double) applications.stream().filter(a -> a.deploymentJobs().hasFailures()).count() / + (double) applications.size(); + } + + private Map<ApplicationId, Duration> averageDeploymentDurations() { + Instant now = clock.instant(); + return ApplicationList.from(controller().applications().asList()) + .notPullRequest() + .hasProductionDeployment() + .asList() + .stream() + .collect(Collectors.toMap(Application::id, + application -> averageDeploymentDuration(application, now))); + } + + private Duration averageDeploymentDuration(Application application, Instant now) { + List<Duration> jobDurations = application.deploymentJobs().jobStatus().values().stream() + .filter(status -> status.lastTriggered().isPresent()) + .map(status -> { + Instant triggeredAt = status.lastTriggered().get().at(); + Instant runningUntil = status.lastCompleted() + .map(JobStatus.JobRun::at) + .filter(at -> at.isAfter(triggeredAt)) + .orElse(now); + return Duration.between(triggeredAt, runningUntil); + }) + .collect(Collectors.toList()); + return jobDurations.stream() + .reduce(Duration::plus) + .map(totalDuration -> totalDuration.dividedBy(jobDurations.size())) + .orElse(Duration.ZERO); } private void keepNodesWithSystem(PartialNodeResult nodeResult, SystemName system) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index d57f01bd132..93fdf3618a7 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -26,11 +26,14 @@ import java.io.UncheckedIOException; import java.nio.file.Path; import java.nio.file.Paths; import java.time.Clock; +import java.time.Duration; import java.time.Instant; import java.time.ZoneId; import java.util.Map; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component; +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.productionUsWest1; +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.stagingTest; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -54,8 +57,9 @@ public class MetricsReporterTest { @Test public void test_chef_metrics() { + Clock clock = Clock.fixed(Instant.ofEpochSecond(1475497913), ZoneId.systemDefault());; ControllerTester tester = new ControllerTester(); - MetricsReporter metricsReporter = createReporter(tester.controller(), metrics, SystemName.cd); + MetricsReporter metricsReporter = createReporter(clock, tester.controller(), metrics, SystemName.cd); metricsReporter.maintain(); assertEquals(2, metrics.getMetrics().size()); @@ -95,7 +99,7 @@ public class MetricsReporterTest { assertEquals(0.0, metrics.getMetric(MetricsReporter.deploymentFailMetric)); // 1 app fails system-test - tester.jobCompletion(component).application(app4).submit(); + tester.jobCompletion(component).application(app4).nextBuildNumber().uploadArtifact(applicationPackage).submit(); tester.deployAndNotify(app4, applicationPackage, false, systemTest); metricsReporter.maintain(); @@ -110,11 +114,66 @@ public class MetricsReporterTest { assertNull(metricContext.getDimensions().get("zone")); } + @Test + public void test_deployment_average_duration() { + DeploymentTester tester = new DeploymentTester(); + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-west-1") + .build(); + + MetricsReporter reporter = createReporter(tester.controller(), metrics, SystemName.cd); + + Application app = tester.createApplication("app1", "tenant1", 1, 11L); + tester.deployCompletely(app, applicationPackage); + reporter.maintain(); + assertEquals(Duration.ZERO, getAverageDeploymentDuration(app)); // An exceptionally fast deployment :-) + + // App spends 3 hours deploying + tester.jobCompletion(component).application(app).nextBuildNumber().uploadArtifact(applicationPackage).submit(); + tester.clock().advance(Duration.ofHours(1)); + tester.deployAndNotify(app, applicationPackage, true, systemTest); + + tester.clock().advance(Duration.ofMinutes(30)); + tester.deployAndNotify(app, applicationPackage, true, stagingTest); + + tester.clock().advance(Duration.ofMinutes(90)); + tester.deployAndNotify(app, applicationPackage, true, productionUsWest1); + reporter.maintain(); + + // Average time is 1 hour + assertEquals(Duration.ofHours(1), getAverageDeploymentDuration(app)); + + // Another deployment starts and stalls for 12 hours + tester.jobCompletion(component).application(app).nextBuildNumber(2).uploadArtifact(applicationPackage).submit(); + tester.clock().advance(Duration.ofHours(12)); + reporter.maintain(); + + assertEquals(Duration.ofHours(12) // hanging system-test + .plus(Duration.ofMinutes(30)) // previous staging-test + .plus(Duration.ofMinutes(90)) // previous production job + .dividedBy(3), // Total number of orchestrated jobs + getAverageDeploymentDuration(app)); + } + + private Duration getAverageDeploymentDuration(Application application) { + return metrics.getMetric((dimension, value) -> dimension.equals("application") && + value.equals(application.id().toString()), + MetricsReporter.deploymentAverageDuration) + .map(seconds -> Duration.ofSeconds(seconds.longValue())) + .orElseThrow(() -> new RuntimeException("Expected metric to exist for " + application.id())); + } + private void assertDimension(MapContext metricContext, String dimensionName, String expectedValue) { assertEquals(expectedValue, metricContext.getDimensions().get(dimensionName)); } private MetricsReporter createReporter(Controller controller, MetricsMock metricsMock, SystemName system) { + return createReporter(controller.clock(), controller, metricsMock, system); + } + + private MetricsReporter createReporter(Clock clock, Controller controller, MetricsMock metricsMock, + SystemName system) { Chef client = Mockito.mock(Chef.class); PartialNodeResult result; try { @@ -126,8 +185,6 @@ public class MetricsReporterTest { } when(client.partialSearchNodes(anyString(), anyListOf(AttributeMapping.class))).thenReturn(result); - Clock clock = Clock.fixed(Instant.ofEpochSecond(1475497913), ZoneId.systemDefault()); - return new MetricsReporter(controller, metricsMock, client, clock, new JobControl(new MockCuratorDb()), system); } |