From 17d7c346f7a954007d3018a956d9c9c12ba4e0c8 Mon Sep 17 00:00:00 2001 From: Jon Marius Venstad Date: Thu, 11 Jan 2018 15:51:36 +0100 Subject: Set metric whenever throttling is evaluated --- .../com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 6 +++++- .../hosted/provision/maintenance/NodeRepositoryMaintenance.java | 2 +- .../yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java | 9 ++++++++- .../yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 9 +++++++++ .../vespa/hosted/provision/monitoring/MetricsReporterTest.java | 2 +- 5 files changed, 24 insertions(+), 4 deletions(-) (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index fd813ca291c..6089cfe64c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -5,6 +5,7 @@ import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; +import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; @@ -55,11 +56,12 @@ public class NodeFailer extends Maintainer { private final Orchestrator orchestrator; private final Instant constructionTime; private final ThrottlePolicy throttlePolicy; + private final Metric metric; public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, NodeRepository nodeRepository, Duration downTimeLimit, Clock clock, Orchestrator orchestrator, - ThrottlePolicy throttlePolicy, + ThrottlePolicy throttlePolicy, Metric metric, JobControl jobControl) { // check ping status every five minutes, but at least twice as often as the down time limit super(nodeRepository, min(downTimeLimit.dividedBy(2), Duration.ofMinutes(5)), jobControl); @@ -71,6 +73,7 @@ public class NodeFailer extends Maintainer { this.orchestrator = orchestrator; this.constructionTime = clock.instant(); this.throttlePolicy = throttlePolicy; + this.metric = metric; } @Override @@ -290,6 +293,7 @@ public class NodeFailer extends Maintainer { log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString())); } + metric.set("nodeFailThrottling", throttle ? 1 : 0, null); return throttle; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 9e826bfcb9a..12ba67eba6d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -64,7 +64,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { DefaultTimes defaults = new DefaultTimes(zone.environment()); jobControl = new JobControl(nodeRepository.database()); - nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), jobControl); + nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), metric, jobControl); periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval), jobControl); operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, clock, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval), jobControl); zooKeeperAccessMaintainer = new ZooKeeperAccessMaintainer(nodeRepository, curator, durationFromEnv("zookeeper_access_maintenance_interval").orElse(defaults.zooKeeperAccessMaintenanceInterval), jobControl); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java index 0e0195a5bed..05f49ce1f32 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java @@ -23,6 +23,7 @@ import com.yahoo.vespa.curator.mock.MockCurator; import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.monitoring.MetricsReporterTest; import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; @@ -63,6 +64,7 @@ public class NodeFailTester { public NodeFailer failer; public ServiceMonitorStub serviceMonitor; public MockDeployer deployer; + public MetricsReporterTest.TestMetric metric; private final TestHostLivenessTracker hostLivenessTracker; private final Orchestrator orchestrator; private final NodeRepositoryProvisioner provisioner; @@ -99,6 +101,7 @@ public class NodeFailTester { apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, Capacity.fromNodeCount(wantedNodesApp2, Optional.of("default")), 1)); tester.deployer = new MockDeployer(tester.provisioner, apps); tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository); + tester.metric = new MetricsReporterTest.TestMetric(); tester.failer = tester.createFailer(); return tester; } @@ -134,6 +137,7 @@ public class NodeFailTester { apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2, 1)); tester.deployer = new MockDeployer(tester.provisioner, apps); tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository); + tester.metric = new MetricsReporterTest.TestMetric(); tester.failer = tester.createFailer(); return tester; } @@ -155,6 +159,7 @@ public class NodeFailTester { apps.put(app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allProxies, 1)); tester.deployer = new MockDeployer(tester.provisioner, apps); tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository); + tester.metric = new MetricsReporterTest.TestMetric(); tester.failer = tester.createFailer(); return tester; } @@ -163,6 +168,7 @@ public class NodeFailTester { NodeFailTester tester = new NodeFailTester(); tester.deployer = new MockDeployer(tester.provisioner, Collections.emptyMap()); tester.serviceMonitor = new ServiceMonitorStub(Collections.emptyMap(), tester.nodeRepository); + tester.metric = new MetricsReporterTest.TestMetric(); tester.failer = tester.createFailer(); return tester; } @@ -177,7 +183,8 @@ public class NodeFailTester { } public NodeFailer createFailer() { - return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, new JobControl(nodeRepository.database())); + metric.values = new HashMap<>(); + return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric, new JobControl(nodeRepository.database())); } public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java index f95cfc1b0f1..6d41cfa08e5 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java @@ -381,6 +381,7 @@ public class NodeFailerTest { // 2 nodes are failed (the minimum amount that are always allowed to fail) tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -389,6 +390,7 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // 2 docker hosts now fail, 1 of them (with all its children is allowed to fail) hosts.subList(0, 2).forEach(host -> { @@ -401,6 +403,7 @@ public class NodeFailerTest { tester.failer.run(); assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // 24 more hours pass without any other nodes being failed out for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) { @@ -409,18 +412,21 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // Next, the 2 ready nodes that were dead from the start are failed out, and finally // the second host and all its children are failed tester.clock.advance(Duration.ofMinutes(30)); tester.failer.run(); assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling")); // Nothing else to fail tester.clock.advance(Duration.ofHours(25)); tester.allNodesMakeAConfigRequestExcept(deadNodes); tester.failer.run(); assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric.", 0, tester.metric.values.get("nodeFailThrottling")); } // Throttles based on percentage in large zone @@ -437,6 +443,7 @@ public class NodeFailerTest { tester.failer.run(); // 1% are allowed to fail assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // 6 more hours pass, no more nodes are failed for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) { @@ -445,6 +452,7 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling")); // 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) { @@ -453,6 +461,7 @@ public class NodeFailerTest { } tester.failer.run(); assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size()); + assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling")); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java index 6c0c344a72b..d1a14d3e489 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java @@ -193,7 +193,7 @@ public class MetricsReporterTest { return Optional.empty(); } - private static class TestMetric implements Metric { + public static class TestMetric implements Metric { public Map values = new HashMap<>(); public Map> context = new HashMap<>(); -- cgit v1.2.3