diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2021-12-15 16:24:44 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-15 16:24:44 +0100 |
commit | 928c99639681b92a8990621eb8403a8579c82797 (patch) | |
tree | 0a7ae76bd250c32b751923be1ba996150732949d | |
parent | 7ed22d8580082eaff536cd0b59d0498edec3f718 (diff) | |
parent | 04014910000f2aadabb3f8fa129e2b3fb88bb76e (diff) |
Merge pull request #20521 from vespa-engine/mpolden/upgrade-in-block-window-metric
Add metric for overdue upgrades
5 files changed, 121 insertions, 12 deletions
diff --git a/config-model-api/src/main/java/com/yahoo/config/application/api/TimeWindow.java b/config-model-api/src/main/java/com/yahoo/config/application/api/TimeWindow.java index e17dea63769..5a2b3a10fe1 100644 --- a/config-model-api/src/main/java/com/yahoo/config/application/api/TimeWindow.java +++ b/config-model-api/src/main/java/com/yahoo/config/application/api/TimeWindow.java @@ -11,7 +11,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.TreeSet; +import java.util.Objects; import java.util.function.BiFunction; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -28,9 +28,9 @@ public class TimeWindow { private final ZoneId zone; private TimeWindow(List<DayOfWeek> days, List<Integer> hours, ZoneId zone) { - this.days = Collections.unmodifiableList(new ArrayList<>(new TreeSet<>(days))); - this.hours = Collections.unmodifiableList(new ArrayList<>(new TreeSet<>(hours))); - this.zone = zone; + this.days = Objects.requireNonNull(days).stream().distinct().sorted().collect(Collectors.toUnmodifiableList()); + this.hours = Objects.requireNonNull(hours).stream().distinct().sorted().collect(Collectors.toUnmodifiableList()); + this.zone = Objects.requireNonNull(zone); } /** Returns days in this time window */ @@ -57,8 +57,8 @@ public class TimeWindow { return "time window for hour(s) " + hours.toString() + " on " + days.stream().map(DayOfWeek::name) - .map(String::toLowerCase) - .collect(Collectors.toList()).toString() + + .map(String::toLowerCase) + .collect(Collectors.toList()) + " in " + zone; } diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/http/v1/RoutingStatusApiHandler.java b/configserver/src/main/java/com/yahoo/vespa/config/server/http/v1/RoutingStatusApiHandler.java index 43ed16ab21c..d9cbc7bc533 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/http/v1/RoutingStatusApiHandler.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/http/v1/RoutingStatusApiHandler.java @@ -112,8 +112,6 @@ public class RoutingStatusApiHandler extends RestApiRequestHandler<RoutingStatus .map(this::deploymentStatus) .collect(Collectors.toList()); DeploymentRoutingStatus currentStatus = currentStatuses.get(0); - // Redeploy application so that a new LbServicesConfig containing the updated status is generated and consumed - // by routing layer. This is required to update status of upstreams in application endpoints log.log(Level.INFO, "Changing routing status of " + instance + " from " + currentStatus.status() + " to " + wantedStatus.status()); boolean needsChange = currentStatuses.stream().anyMatch(status -> status.status() != wantedStatus.status()); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java index 2939d10f99e..a1c25c1fb53 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; +import com.yahoo.config.application.api.DeploymentInstanceSpec; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.HostName; import com.yahoo.config.provision.zone.ZoneId; @@ -24,11 +25,14 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Optional; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; @@ -49,6 +53,7 @@ public class MetricsReporter extends ControllerMaintainer { public static final String DEPLOYMENT_FAILING_UPGRADES = "deployment.failingUpgrades"; public static final String DEPLOYMENT_BUILD_AGE_SECONDS = "deployment.buildAgeSeconds"; public static final String DEPLOYMENT_WARNINGS = "deployment.warnings"; + public static final String DEPLOYMENT_OVERDUE_UPGRADE = "deployment.overdueUpgradeSeconds"; public static final String OS_CHANGE_DURATION = "deployment.osChangeDuration"; public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration"; public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion"; @@ -146,15 +151,19 @@ public class MetricsReporter extends ControllerMaintainer { metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(deployments) * 100, metric.createContext(Map.of())); averageDeploymentDurations(deployments, clock.instant()).forEach((instance, duration) -> { - metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.getSeconds(), metric.createContext(dimensions(instance))); + metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(instance))); }); deploymentsFailingUpgrade(deployments).forEach((instance, failingJobs) -> { metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance))); }); - deploymentWarnings(deployments).forEach((application, warnings) -> { - metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(application))); + deploymentWarnings(deployments).forEach((instance, warnings) -> { + metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(instance))); + }); + + overdueUpgradeDurationByInstance(deployments).forEach((instance, overduePeriod) -> { + metric.set(DEPLOYMENT_OVERDUE_UPGRADE, overduePeriod.toSeconds(), metric.createContext(dimensions(instance))); }); for (Application application : applications.asList()) @@ -165,6 +174,38 @@ public class MetricsReporter extends ControllerMaintainer { metric.createContext(dimensions(application.id().defaultInstance())))); } + private Map<ApplicationId, Duration> overdueUpgradeDurationByInstance(DeploymentStatusList deployments) { + Instant now = clock.instant(); + Map<ApplicationId, Duration> overdueUpgrades = new HashMap<>(); + for (var deploymentStatus : deployments) { + for (var kv : deploymentStatus.instanceJobs().entrySet()) { + ApplicationId instance = kv.getKey(); + JobList jobs = kv.getValue(); + boolean upgradeRunning = !jobs.production().upgrading().isEmpty(); + DeploymentInstanceSpec instanceSpec = deploymentStatus.application().deploymentSpec().requireInstance(instance.instance()); + Duration overdueDuration = upgradeRunning ? overdueUpgradeDuration(now, instanceSpec) : Duration.ZERO; + overdueUpgrades.put(instance, overdueDuration); + } + } + return Collections.unmodifiableMap(overdueUpgrades); + } + + /** Returns how long an upgrade has been running inside a block window */ + static Duration overdueUpgradeDuration(Instant upgradingAt, DeploymentInstanceSpec instanceSpec) { + Optional<Instant> lastOpened = Optional.empty(); // When the upgrade window most recently opened + Instant oneWeekAgo = upgradingAt.minus(Duration.ofDays(7)); + Duration step = Duration.ofHours(1); + for (Instant instant = upgradingAt; !instanceSpec.canUpgradeAt(instant); instant = instant.minus(step).truncatedTo(ChronoUnit.HOURS)) { + if (!instant.isAfter(oneWeekAgo)) { // Wrapped around, the entire week is being blocked + lastOpened = Optional.empty(); + break; + } + lastOpened = Optional.of(instant); + } + if (lastOpened.isEmpty()) return Duration.ZERO; + return Duration.between(lastOpened.get(), upgradingAt); + } + private void reportQueuedNameServiceRequests() { metric.set(NAME_SERVICE_REQUESTS_QUEUED, controller().curator().readNameServiceQueue().requests().size(), metric.createContext(Map.of())); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java index dcea323a8e8..36de515ab58 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/MetricsMock.java @@ -2,6 +2,7 @@ package com.yahoo.vespa.hosted.controller.integration; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.jdisc.Metric; import java.util.Collections; @@ -74,6 +75,11 @@ public class MetricsMock implements Metric { return Optional.empty(); } + /** Returns the most recently added metric for given instance */ + public Optional<Number> getMetric(ApplicationId instance, String name) { + return getMetric(d -> instance.toFullString().equals(d.get("applicationId")), name); + } + public static class MapContext implements Context { private final Map<String, String> dimensions; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java index 7619cf71f1a..3c91fb66894 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java @@ -15,10 +15,11 @@ import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.api.integration.billing.PlanId; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeFilter; -import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.SystemApplication; +import com.yahoo.vespa.hosted.controller.application.pkg.ApplicationPackage; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; +import com.yahoo.vespa.hosted.controller.deployment.DeploymentContext; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.integration.MetricsMock; import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock; @@ -27,9 +28,11 @@ import com.yahoo.vespa.hosted.controller.versions.VespaVersion; import org.junit.Test; import java.time.Duration; +import java.time.Instant; import java.util.Comparator; import java.util.List; import java.util.Set; +import java.util.function.Supplier; import java.util.function.UnaryOperator; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -494,6 +497,66 @@ public class MetricsReporterTest { assertEquals(1, metrics.getMetric(d -> "trial".equals(d.get("plan")), MetricsReporter.TENANT_METRIC).get()); } + @Test + public void overdue_upgrade_metric() { + ApplicationPackage pkg = new ApplicationPackageBuilder().region("us-west-1") + // window 1 + .blockChange(false, true, "mon-tue", "2-9", "CET") + // window 2 + .blockChange(false, true, "mon-tue", "1-8,11-12", "CET") + // window 3 + .blockChange(false, true, "wed-thu", "0-23", "CET") + // window 4 (does not apply to upgrade) + .blockChange(true, false, "mon-sun", "0-7", "CET") + .build(); + + Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z"); + DeploymentTester tester = new DeploymentTester().at(mondayNight); + MetricsReporter reporter = createReporter(tester.controller()); + DeploymentContext context = tester.newDeploymentContext(); + Supplier<Duration> metric = () -> { + reporter.maintain(); + return Duration.ofSeconds(metrics.getMetric(context.instanceId(),MetricsReporter.DEPLOYMENT_OVERDUE_UPGRADE) + .get().longValue()); + }; + + // Deploy completely once + context.submit(pkg).completeRollout(); + + // System is upgraded, triggering upgrade of application + tester.controllerTester().upgradeSystem(Version.fromString("7.0")); + tester.upgrader().maintain(); + + // Start production job for upgrade, without completing it + context.runJob(systemTest) + .runJob(stagingTest) + .triggerJobs() + .assertRunning(productionUsWest1); + assertEquals("Upgrade is not overdue yet", Duration.ZERO, metric.get()); + + // Upgrade continues into block window + tester.clock().advance(Duration.ofHours(3)); // Tuesday at 02:00 (03:00 CET) + assertEquals("Upgrade is overdue measured relative to window 2", Duration.ofHours(2), metric.get()); + + tester.clock().advance(Duration.ofHours(6)); // Tuesday at 08:00 (09:00 CET) + assertEquals("Upgrade is overdue measured relative to window 1", Duration.ofHours(8), metric.get()); + + tester.clock().advance(Duration.ofHours(1)); // Tuesday at 09:00 (10:00 CET) + assertEquals("Upgrade is no longer overdue", Duration.ZERO, metric.get()); + + tester.clock().advance(Duration.ofDays(2)); // Thursday at 10:00 (11:00 CET) + assertEquals("Upgrade is overdue measure relative to window 3", Duration.ofHours(34), metric.get()); + } + + @Test + public void overdue_upgrade_completely_blocked() { + ApplicationPackage pkg = new ApplicationPackageBuilder().region("us-west-1") + .blockChange(false, true, "mon-sun", "0-23", "CET") + .build(); + Instant mondayNight = Instant.parse("2021-12-13T23:00:00.00Z"); + assertEquals(Duration.ZERO, MetricsReporter.overdueUpgradeDuration(mondayNight, pkg.deploymentSpec().requireInstance("default"))); + } + private void assertNodeCount(String metric, int n, Version version) { long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), metric) .stream() @@ -606,3 +669,4 @@ public class MetricsReporterTest { } + |