diff options
Diffstat (limited to 'controller-server')
3 files changed, 115 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 2939d10f99e..a1c25c1fb53 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; +import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.zone.ZoneId; @@ -24,11 +25,14 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; @@ -49,6 +53,7 @@ public class MetricsReporter extends ControllerMaintainer { public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades"; public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds"; public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; + public static final String DEPLOYMENT_OVERDUE_UPGRADE = "deployment.overdueUpgradeSeconds"; public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration"; public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion"; @@ -146,15 +151,19 @@ public class MetricsReporter extends ControllerMaintainer { metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(deployments) * 100, metric.createContext(Map.of())); averageDeploymentDurations(deployments, clock.instant()).forEach((instance, duration) -> { - metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.getSeconds(), metric.createContext(dimensions(instance))); + metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(instance))); }); deploymentsFailingUpgrade(deployments).forEach((instance, failingJobs) -> { metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance))); }); - deploymentWarnings(deployments).forEach((application, warnings) -> { - metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(application))); + deploymentWarnings(deployments).forEach((instance, warnings) -> { + metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(instance))); + }); + + overdueUpgradeDurationByInstance(deployments).forEach((instance, overduePeriod) -> { + metric.set(DEPLOYMENT_OVERDUE_UPGRADE, overduePeriod.toSeconds(), metric.createContext(dimensions(instance))); }); for (Application application : applications.asList()) @@ -165,6 +174,38 @@ public class MetricsReporter extends ControllerMaintainer { metric.createContext(dimensions(application.id().defaultInstance())))); } + private Map<ApplicationId, Duration> overdueUpgradeDurationByInstance(DeploymentStatusList deployments) { + Instant now = clock.instant(); + Map<ApplicationId, Duration> overdueUpgrades = new HashMap<>(); + for (var deploymentStatus : deployments) { + for (var kv : deploymentStatus.instanceJobs().entrySet()) { + ApplicationId instance = kv.getKey(); + JobList jobs = kv.getValue(); + boolean upgradeRunning = !jobs.production().upgrading().isEmpty(); + DeploymentInstanceSpec instanceSpec = deploymentStatus.application().deploymentSpec().requireInstance(instance.instance()); + Duration overdueDuration = upgradeRunning ? overdueUpgradeDuration(now, instanceSpec) : Duration.ZERO; + overdueUpgrades.put(instance, overdueDuration); + } + } + return Collections.unmodifiableMap(overdueUpgrades); + } + + /** Returns how long an upgrade has been running inside a block window */ + static Duration overdueUpgradeDuration(Instant upgradingAt, DeploymentInstanceSpec instanceSpec) { + Optional<Instant> lastOpened = Optional.empty(); // When the upgrade window most recently opened + Instant oneWeekAgo = upgradingAt.minus(Duration.ofDays(7)); + Duration step = Duration.ofHours(1); + for (Instant instant = upgradingAt; !instanceSpec.canUpgradeAt(instant); instant = instant.minus(step).truncatedTo(ChronoUnit.HOURS)) { + if (!instant.isAfter(oneWeekAgo)) { // Wrapped around, the entire week is being blocked + lastOpened = Optional.empty(); + break; + } + lastOpened = Optional.of(instant); + } + if (lastOpened.isEmpty()) return Duration.ZERO; + return Duration.between(lastOpened.get(), upgradingAt); + } + private void reportQueuedNameServiceRequests() { metric.set(NAME_SERVICE_REQUESTS_QUEUED, controller().curator().readNameServiceQueue().requests().size(), metric.createContext(Map.of())); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java index dcea323a8e8..36de515ab58 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.controller.integration; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.jdisc.Metric; import java.util.Collections; @@ -74,6 +75,11 @@ public class MetricsMock implements Metric { return Optional.empty(); } + /** Returns the most recently added metric for given instance */ + public Optional<Number> getMetric(ApplicationId instance, String name) { + return getMetric(d -> instance.toFullString().equals(d.get("applicationId")), name); + } + public static class MapContext implements Context { private final Map<String, String> dimensions; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 7619cf71f1a..3c91fb66894 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -15,10 +15,11 @@ import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanId; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeFilter; -import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.SystemApplication; +import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.integration.MetricsMock; import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock; @@ -27,9 +28,11 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.time.Duration; +import java.time.Instant; import java.util.Comparator; import java.util.List; import java.util.Set; +import java.util.function.Supplier; import java.util.function.UnaryOperator; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -494,6 +497,66 @@ public class MetricsReporterTest { assertEquals(1, metrics.getMetric(d -> "trial".equals(d.get("plan")), MetricsReporter.TENANT_METRIC).get()); } + @Test + public void overdue_upgrade_metric() { + ApplicationPackage pkg = new ApplicationPackageBuilder().region("us-west-1") + // window 1 + .blockChange(false, true, "mon-tue", "2-9", "CET") + // window 2 + .blockChange(false, true, "mon-tue", "1-8,11-12", "CET") + // window 3 + .blockChange(false, true, "wed-thu", "0-23", "CET") + // window 4 (does not apply to upgrade) + .blockChange(true, false, "mon-sun", "0-7", "CET") + .build(); + + Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z"); + DeploymentTester tester = new DeploymentTester().at(mondayNight); + MetricsReporter reporter = createReporter(tester.controller()); + DeploymentContext context = tester.newDeploymentContext(); + Supplier<Duration> metric = () -> { + reporter.maintain(); + return Duration.ofSeconds(metrics.getMetric(context.instanceId(),MetricsReporter.DEPLOYMENT_OVERDUE_UPGRADE) + .get().longValue()); + }; + + // Deploy completely once + context.submit(pkg).completeRollout(); + + // System is upgraded, triggering upgrade of application + tester.controllerTester().upgradeSystem(Version.fromString("7.0")); + tester.upgrader().maintain(); + + // Start production job for upgrade, without completing it + context.runJob(systemTest) + .runJob(stagingTest) + .triggerJobs() + .assertRunning(productionUsWest1); + assertEquals("Upgrade is not overdue yet", Duration.ZERO, metric.get()); + + // Upgrade continues into block window + tester.clock().advance(Duration.ofHours(3)); // Tuesday at 02:00 (03:00 CET) + assertEquals("Upgrade is overdue measured relative to window 2", Duration.ofHours(2), metric.get()); + + tester.clock().advance(Duration.ofHours(6)); // Tuesday at 08:00 (09:00 CET) + assertEquals("Upgrade is overdue measured relative to window 1", Duration.ofHours(8), metric.get()); + + tester.clock().advance(Duration.ofHours(1)); // Tuesday at 09:00 (10:00 CET) + assertEquals("Upgrade is no longer overdue", Duration.ZERO, metric.get()); + + tester.clock().advance(Duration.ofDays(2)); // Thursday at 10:00 (11:00 CET) + assertEquals("Upgrade is overdue measure relative to window 3", Duration.ofHours(34), metric.get()); + } + + @Test + public void overdue_upgrade_completely_blocked() { + ApplicationPackage pkg = new ApplicationPackageBuilder().region("us-west-1") + .blockChange(false, true, "mon-sun", "0-23", "CET") + .build(); + Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z"); + assertEquals(Duration.ZERO, MetricsReporter.overdueUpgradeDuration(mondayNight, pkg.deploymentSpec().requireInstance("default"))); + } + private void assertNodeCount(String metric, int n, Version version) { long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), metric) .stream() @@ -606,3 +669,4 @@ public class MetricsReporterTest { } + |