aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2021-12-15 10:28:14 +0100
committer Martin Polden <mpolden@mpolden.no> 2021-12-15 15:10:24 +0100
commit 04014910000f2aadabb3f8fa129e2b3fb88bb76e (patch)
tree df065ee570cb8a4841c90d2e06d9257ce717f830 /controller-server
parent 00c8107b04121999d27a4117ca0b8eb4bc4aa8ef (diff)
Add metric for overdue upgrades
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java 47
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java 6
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java 66
3 files changed, 115 insertions, 4 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 2939d10f99e..a1c25c1fb53 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.component.Version;
+import com.yahoo.config.application.api.DeploymentInstanceSpec;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.zone.ZoneId;
@@ -24,11 +25,14 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
+import java.time.temporal.ChronoUnit;
import java.util.Collection;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
+import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -49,6 +53,7 @@ public class MetricsReporter extends ControllerMaintainer {
public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades";
public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds";
public static final String DEPLOYMENT_WARNINGS = "deployment.warnings";
+ public static final String DEPLOYMENT_OVERDUE_UPGRADE = "deployment.overdueUpgradeSeconds";
public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration";
public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration";
public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion";
@@ -146,15 +151,19 @@ public class MetricsReporter extends ControllerMaintainer {
metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(deployments) * 100, metric.createContext(Map.of()));
averageDeploymentDurations(deployments, clock.instant()).forEach((instance, duration) -> {
- metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.getSeconds(), metric.createContext(dimensions(instance)));
+ metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(instance)));
});
deploymentsFailingUpgrade(deployments).forEach((instance, failingJobs) -> {
metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance)));
});
- deploymentWarnings(deployments).forEach((application, warnings) -> {
- metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(application)));
+ deploymentWarnings(deployments).forEach((instance, warnings) -> {
+ metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(instance)));
+ });
+
+ overdueUpgradeDurationByInstance(deployments).forEach((instance, overduePeriod) -> {
+ metric.set(DEPLOYMENT_OVERDUE_UPGRADE, overduePeriod.toSeconds(), metric.createContext(dimensions(instance)));
});
for (Application application : applications.asList())
@@ -165,6 +174,38 @@ public class MetricsReporter extends ControllerMaintainer {
metric.createContext(dimensions(application.id().defaultInstance()))));
}
+ /**
+  * Computes the overdue upgrade duration of every instance found in the given deployments.
+  * Instances without an upgrading production job are mapped to {@link Duration#ZERO}.
+  *
+  * @param deployments the deployment statuses to inspect
+  * @return an unmodifiable map from instance ID to its overdue upgrade duration
+  */
+ private Map<ApplicationId, Duration> overdueUpgradeDurationByInstance(DeploymentStatusList deployments) {
+     Instant now = clock.instant();
+     Map<ApplicationId, Duration> overdueByInstance = new HashMap<>();
+     for (var status : deployments) {
+         for (var entry : status.instanceJobs().entrySet()) {
+             ApplicationId id = entry.getKey();
+             JobList instanceJobs = entry.getValue();
+             boolean upgrading = !instanceJobs.production().upgrading().isEmpty();
+             // The spec is resolved for every instance, also those that are not currently upgrading
+             DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(id.instance());
+             if (upgrading) {
+                 overdueByInstance.put(id, overdueUpgradeDuration(now, spec));
+             } else {
+                 overdueByInstance.put(id, Duration.ZERO);
+             }
+         }
+     }
+     return Collections.unmodifiableMap(overdueByInstance);
+ }
+
+ /**
+  * Returns how long an upgrade has been running inside a block window.
+  *
+  * Walks backwards in time from {@code upgradingAt} for as long as {@code instanceSpec.canUpgradeAt}
+  * rejects the visited instant, and measures the time from the earliest rejected instant to
+  * {@code upgradingAt}. The instants visited are {@code upgradingAt} itself, then the start of its hour
+  * (when it is not already on the hour), then every whole hour before that.
+  *
+  * @param upgradingAt  the instant at which the upgrade is observed to be running
+  * @param instanceSpec the instance spec asked, through {@code canUpgradeAt}, whether upgrades are allowed
+  * @return the time spent inside the current block window, or {@link Duration#ZERO} if upgrades are
+  *         allowed at {@code upgradingAt} or are blocked for the entire preceding week
+  */
+ static Duration overdueUpgradeDuration(Instant upgradingAt, DeploymentInstanceSpec instanceSpec) {
+     Instant oneWeekAgo = upgradingAt.minus(Duration.ofDays(7));
+     Duration step = Duration.ofHours(1);
+     Instant blockedSince = upgradingAt; // Earliest instant found so far where the upgrade was blocked
+     Instant instant = upgradingAt;
+     while (!instanceSpec.canUpgradeAt(instant)) {
+         if (!instant.isAfter(oneWeekAgo)) return Duration.ZERO; // Wrapped around, the entire week is being blocked
+         blockedSince = instant;
+         // Visit the start of the current hour before moving on to earlier hours. Subtracting a whole step
+         // first and truncating afterwards would skip that hour start whenever upgradingAt is not on the
+         // hour, and a window that opened within the last hour would be reported as not overdue.
+         // NOTE(review): presumably block windows start on whole UTC hours; confirm for zones with a
+         // non-hour UTC offset.
+         Instant hourStart = instant.truncatedTo(ChronoUnit.HOURS);
+         instant = hourStart.isBefore(instant) ? hourStart : instant.minus(step);
+     }
+     // When the loop body never ran, blockedSince equals upgradingAt and the result is zero
+     return Duration.between(blockedSince, upgradingAt);
+ }
+
private void reportQueuedNameServiceRequests() {
metric.set(NAME_SERVICE_REQUESTS_QUEUED, controller().curator().readNameServiceQueue().requests().size(),
metric.createContext(Map.of()));
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java
index dcea323a8e8..36de515ab58 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java
@@ -2,6 +2,7 @@
package com.yahoo.vespa.hosted.controller.integration;
+import com.yahoo.config.provision.ApplicationId;
import com.yahoo.jdisc.Metric;
import java.util.Collections;
@@ -74,6 +75,11 @@ public class MetricsMock implements Metric {
return Optional.empty();
}
+ /**
+  * Returns the most recently added metric with the given name for the given instance.
+  * A metric is attributed to the instance when its "applicationId" dimension equals
+  * {@code instance.toFullString()}.
+  */
+ public Optional<Number> getMetric(ApplicationId instance, String name) {
+     return getMetric(dimensions -> instance.toFullString().equals(dimensions.get("applicationId")),
+                      name);
+ }
+
public static class MapContext implements Context {
private final Map<String, String> dimensions;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 7619cf71f1a..3c91fb66894 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -15,10 +15,11 @@ import com.yahoo.vespa.hosted.controller.ControllerTester;
import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanId;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeFilter;
-import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.application.Change;
import com.yahoo.vespa.hosted.controller.application.SystemApplication;
+import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
+import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
import com.yahoo.vespa.hosted.controller.integration.MetricsMock;
import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock;
@@ -27,9 +28,11 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import org.junit.Test;
import java.time.Duration;
+import java.time.Instant;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
+import java.util.function.Supplier;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -494,6 +497,66 @@ public class MetricsReporterTest {
assertEquals(1, metrics.getMetric(d -> "trial".equals(d.get("plan")), MetricsReporter.TENANT_METRIC).get());
}
+ @Test
+ public void overdue_upgrade_metric() {
+     ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+             .region("us-west-1")
+             .blockChange(false, true, "mon-tue", "2-9", "CET")        // window 1
+             .blockChange(false, true, "mon-tue", "1-8,11-12", "CET")  // window 2
+             .blockChange(false, true, "wed-thu", "0-23", "CET")       // window 3
+             .blockChange(true, false, "mon-sun", "0-7", "CET")        // window 4 (does not apply to upgrade)
+             .build();
+
+     // The clock starts Monday at 23:00 UTC (Tuesday 00:00 CET)
+     Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z");
+     DeploymentTester tester = new DeploymentTester().at(mondayNight);
+     MetricsReporter reporter = createReporter(tester.controller());
+     DeploymentContext app = tester.newDeploymentContext();
+
+     // Runs the reporter and reads back the overdue metric reported for the instance
+     Supplier<Duration> overdue = () -> {
+         reporter.maintain();
+         Number seconds = metrics.getMetric(app.instanceId(), MetricsReporter.DEPLOYMENT_OVERDUE_UPGRADE).get();
+         return Duration.ofSeconds(seconds.longValue());
+     };
+
+     // Initial deployment rolls out completely
+     app.submit(applicationPackage).completeRollout();
+
+     // Upgrading the system triggers an upgrade of the application
+     tester.controllerTester().upgradeSystem(Version.fromString("7.0"));
+     tester.upgrader().maintain();
+
+     // The production job for the upgrade is started, but left running
+     app.runJob(systemTest)
+        .runJob(stagingTest)
+        .triggerJobs()
+        .assertRunning(productionUsWest1);
+     assertEquals("Upgrade is not overdue yet", Duration.ZERO, overdue.get());
+
+     // Tuesday at 02:00 (03:00 CET): the upgrade has continued into a block window
+     tester.clock().advance(Duration.ofHours(3));
+     assertEquals("Upgrade is overdue measured relative to window 2", Duration.ofHours(2), overdue.get());
+
+     // Tuesday at 08:00 (09:00 CET)
+     tester.clock().advance(Duration.ofHours(6));
+     assertEquals("Upgrade is overdue measured relative to window 1", Duration.ofHours(8), overdue.get());
+
+     // Tuesday at 09:00 (10:00 CET)
+     tester.clock().advance(Duration.ofHours(1));
+     assertEquals("Upgrade is no longer overdue", Duration.ZERO, overdue.get());
+
+     // Thursday at 09:00 (10:00 CET), which is 34 hours after window 3 opened Wednesday at 00:00 CET
+     tester.clock().advance(Duration.ofDays(2));
+     assertEquals("Upgrade is overdue measure relative to window 3", Duration.ofHours(34), overdue.get());
+ }
+
+ @Test
+ public void overdue_upgrade_completely_blocked() {
+     // A single window blocking upgrades at every hour of every day
+     ApplicationPackage applicationPackage = new ApplicationPackageBuilder()
+             .region("us-west-1")
+             .blockChange(false, true, "mon-sun", "0-23", "CET")
+             .build();
+     Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z");
+     var instanceSpec = applicationPackage.deploymentSpec().requireInstance("default");
+     // When the whole week is blocked, zero is reported
+     assertEquals(Duration.ZERO, MetricsReporter.overdueUpgradeDuration(mondayNight, instanceSpec));
+ }
+
private void assertNodeCount(String metric, int n, Version version) {
long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), metric)
.stream()
@@ -606,3 +669,4 @@ public class MetricsReporterTest {
}
+