aboutsummaryrefslogtreecommitdiffstats
path: root/controller-server
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2020-10-28 13:51:23 +0100
committer Martin Polden <mpolden@mpolden.no> 2020-10-28 13:54:32 +0100
commit cd34f7befd9685a032ecc4d54328e44176a4ed0b (patch)
tree 99b510075b89c940f886fba22d1ed81d7eb171c4 /controller-server
parent f29ea86738895c416b197868c919574cae556950 (diff)
Report metric for broken system version
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java 21
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java 2
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java 1
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java 35
4 files changed, 53 insertions, 6 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
index 0c5ef123eef..780eec47e81 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporter.java
@@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.controller.deployment.JobList;
import com.yahoo.vespa.hosted.controller.rotation.RotationLock;
import com.yahoo.vespa.hosted.controller.versions.NodeVersion;
import com.yahoo.vespa.hosted.controller.versions.NodeVersions;
+import com.yahoo.vespa.hosted.controller.versions.VersionStatus;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Clock;
@@ -51,6 +52,7 @@ public class MetricsReporter extends ControllerMaintainer {
public static final String PLATFORM_CHANGE_DURATION = "deployment.platformChangeDuration";
public static final String OS_NODE_COUNT = "deployment.nodeCountByOsVersion";
public static final String PLATFORM_NODE_COUNT = "deployment.nodeCountByPlatformVersion";
+ public static final String BROKEN_SYSTEM_VERSION = "deployment.brokenSystemVersion";
public static final String REMAINING_ROTATIONS = "remaining_rotations";
public static final String NAME_SERVICE_REQUESTS_QUEUED = "dns.queuedRequests";
public static final String OPERATION_PREFIX = "operation.";
@@ -72,11 +74,20 @@ public class MetricsReporter extends ControllerMaintainer {
reportDeploymentMetrics();
reportRemainingRotations();
reportQueuedNameServiceRequests();
- reportInfrastructureUpgradeMetrics();
+ VersionStatus versionStatus = controller().readVersionStatus();
+ reportInfrastructureUpgradeMetrics(versionStatus);
reportAuditLog();
+ reportBrokenSystemVersion(versionStatus);
return true;
}
+    /**
+     * Reports whether the current system version is considered broken: sets {@link #BROKEN_SYSTEM_VERSION}
+     * to 1 when the confidence of the system version is {@code broken}, and to 0 otherwise.
+     * The metric is set with an empty dimension map.
+     *
+     * NOTE(review): the result of versionStatus.version(...) is dereferenced directly; presumably the
+     * system version is always present in the given version status - confirm against VersionStatus.
+     *
+     * @param versionStatus the version status to resolve the system version and its confidence from
+     */
+    private void reportBrokenSystemVersion(VersionStatus versionStatus) {
+        VespaVersion.Confidence systemConfidence = versionStatus.version(controller().systemVersion(versionStatus))
+                                                                .confidence();
+        boolean broken = systemConfidence == VespaVersion.Confidence.broken;
+        metric.set(BROKEN_SYSTEM_VERSION, broken ? 1 : 0, metric.createContext(Map.of()));
+    }
+
private void reportAuditLog() {
AuditLog log = controller().auditLogger().readLog();
HashMap<String, HashMap<String, Integer>> metricCounts = new HashMap<>();
@@ -109,9 +120,9 @@ public class MetricsReporter extends ControllerMaintainer {
}
}
- private void reportInfrastructureUpgradeMetrics() {
+ private void reportInfrastructureUpgradeMetrics(VersionStatus versionStatus) {
Map<NodeVersion, Duration> osChangeDurations = osChangeDurations();
- Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations();
+ Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(versionStatus);
reportChangeDurations(osChangeDurations, OS_CHANGE_DURATION);
reportChangeDurations(platformChangeDurations, PLATFORM_CHANGE_DURATION);
reportNodeCount(osChangeDurations.keySet(), OS_NODE_COUNT);
@@ -182,8 +193,8 @@ public class MetricsReporter extends ControllerMaintainer {
});
}
- private Map<NodeVersion, Duration> platformChangeDurations() {
- return changeDurations(controller().versionStatus().versions(), VespaVersion::nodeVersions);
+ private Map<NodeVersion, Duration> platformChangeDurations(VersionStatus versionStatus) {
+ return changeDurations(versionStatus.versions(), VespaVersion::nodeVersions);
}
private Map<NodeVersion, Duration> osChangeDurations() {
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
index d90eb715499..a42dbe7fbde 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentContext.java
@@ -337,7 +337,7 @@ public class DeploymentContext {
return this;
}
- /** Abort the running job of the given type and. */
+ /** Abort the running job of the given type. */
public DeploymentContext abortJob(JobType type) {
var job = jobId(type);
assertNotSame(RunStatus.aborted, currentRun(job).status());
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java
index 41d10015411..7fd02a8e780 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java
@@ -101,6 +101,7 @@ public class DeploymentTester {
public OutstandingChangeDeployer outstandingChangeDeployer() { return outstandingChangeDeployer; }
+ /** A tester with clock configured to a time when confidence can freely change */
public DeploymentTester atMondayMorning() {
return at(tester.clock().instant().atZone(ZoneOffset.UTC)
.with(TemporalAdjusters.previousOrSame(DayOfWeek.MONDAY))
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
index 609ec128ac8..062bd97f901 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/MetricsReporterTest.java
@@ -13,11 +13,13 @@ import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.ControllerTester;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
+import com.yahoo.vespa.hosted.controller.application.Change;
import com.yahoo.vespa.hosted.controller.application.SystemApplication;
import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester;
import com.yahoo.vespa.hosted.controller.integration.MetricsMock;
import com.yahoo.vespa.hosted.controller.integration.ZoneApiMock;
+import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import org.junit.Test;
import java.time.Duration;
@@ -445,6 +447,39 @@ public class MetricsReporterTest {
}
}
+ @Test
+ public void broken_system_version() { // Checks that BROKEN_SYSTEM_VERSION is 1 only while the system version has confidence broken
+ var tester = new DeploymentTester().atMondayMorning();
+ var ctx = tester.newDeploymentContext();
+ var applicationPackage = new ApplicationPackageBuilder().upgradePolicy("canary").region("us-west-1").build(); // Application uses the canary upgrade policy
+
+ // Application deploys successfully on current system version: confidence is high and the metric reports 0
+ ctx.submit(applicationPackage).deploy();
+ tester.controllerTester().computeVersionStatus(); // Version status is recomputed before each confidence assertion
+ var reporter = createReporter(tester.controller());
+ reporter.maintain();
+ assertEquals(VespaVersion.Confidence.high, tester.controller().readVersionStatus().systemVersion().get().confidence());
+ assertEquals(0, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION));
+
+ // System upgrades. Canary upgrade fails: system version confidence becomes broken and the metric reports 1
+ Version version0 = Version.fromString("6.2");
+ tester.controllerTester().upgradeSystem(version0);
+ tester.upgrader().maintain();
+ assertEquals(Change.of(version0), ctx.instance().change()); // The upgrader has scheduled the platform change for the application
+ ctx.failDeployment(stagingTest);
+ tester.controllerTester().computeVersionStatus();
+ assertEquals(VespaVersion.Confidence.broken, tester.controller().readVersionStatus().systemVersion().get().confidence());
+ reporter.maintain();
+ assertEquals(1, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION));
+
+ // Canary is healed and confidence is raised: the metric goes back to 0
+ ctx.deployPlatform(version0);
+ tester.controllerTester().computeVersionStatus();
+ assertEquals(VespaVersion.Confidence.high, tester.controller().readVersionStatus().systemVersion().get().confidence());
+ reporter.maintain();
+ assertEquals(0, metrics.getMetric(MetricsReporter.BROKEN_SYSTEM_VERSION));
+ }
+
private void assertNodeCount(String metric, int n, Version version) {
long nodeCount = metrics.getMetric((dimensions) -> version.toFullString().equals(dimensions.get("currentVersion")), metric)
.stream()