summary | refs | log | tree | commit | diff | stats
path: root/node-repository
diff options
context:
space:
mode:
author Jon Marius Venstad <jvenstad@yahoo-inc.com> 2018-01-11 15:51:36 +0100
committer Jon Marius Venstad <jvenstad@yahoo-inc.com> 2018-01-11 15:51:36 +0100
commit 17d7c346f7a954007d3018a956d9c9c12ba4e0c8 (patch)
tree 5264a77ab107160570973f83215d4ba75dedc83f /node-repository
parent bc50cf5e58d01cc547926173c480012da2a043fa (diff)
Set metric whenever throttling is evaluated
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 6
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java | 2
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java | 9
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java | 9
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java | 2
5 files changed, 24 insertions, 4 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index fd813ca291c..6089cfe64c9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -5,6 +5,7 @@ import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
@@ -55,11 +56,12 @@ public class NodeFailer extends Maintainer {
private final Orchestrator orchestrator;
private final Instant constructionTime;
private final ThrottlePolicy throttlePolicy;
+ private final Metric metric;
public NodeFailer(Deployer deployer, HostLivenessTracker hostLivenessTracker,
ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
Duration downTimeLimit, Clock clock, Orchestrator orchestrator,
- ThrottlePolicy throttlePolicy,
+ ThrottlePolicy throttlePolicy, Metric metric,
JobControl jobControl) {
// check ping status every five minutes, but at least twice as often as the down time limit
super(nodeRepository, min(downTimeLimit.dividedBy(2), Duration.ofMinutes(5)), jobControl);
@@ -71,6 +73,7 @@ public class NodeFailer extends Maintainer {
this.orchestrator = orchestrator;
this.constructionTime = clock.instant();
this.throttlePolicy = throttlePolicy;
+ this.metric = metric;
}
@Override
@@ -290,6 +293,7 @@ public class NodeFailer extends Maintainer {
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
throttlePolicy.toHumanReadableString()));
}
+ metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
return throttle;
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 9e826bfcb9a..12ba67eba6d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -64,7 +64,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
DefaultTimes defaults = new DefaultTimes(zone.environment());
jobControl = new JobControl(nodeRepository.database());
- nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), jobControl);
+ nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv("throttle_policy").orElse(defaults.throttlePolicy), metric, jobControl);
periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval), jobControl);
operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, clock, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval), jobControl);
zooKeeperAccessMaintainer = new ZooKeeperAccessMaintainer(nodeRepository, curator, durationFromEnv("zookeeper_access_maintenance_interval").orElse(defaults.zooKeeperAccessMaintenanceInterval), jobControl);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 0e0195a5bed..05f49ce1f32 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -23,6 +23,7 @@ import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.curator.transaction.CuratorTransaction;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.monitoring.MetricsReporterTest;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder;
import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
import com.yahoo.vespa.hosted.provision.testutils.MockDeployer;
@@ -63,6 +64,7 @@ public class NodeFailTester {
public NodeFailer failer;
public ServiceMonitorStub serviceMonitor;
public MockDeployer deployer;
+ public MetricsReporterTest.TestMetric metric;
private final TestHostLivenessTracker hostLivenessTracker;
private final Orchestrator orchestrator;
private final NodeRepositoryProvisioner provisioner;
@@ -99,6 +101,7 @@ public class NodeFailTester {
apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, Capacity.fromNodeCount(wantedNodesApp2, Optional.of("default")), 1));
tester.deployer = new MockDeployer(tester.provisioner, apps);
tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
+ tester.metric = new MetricsReporterTest.TestMetric();
tester.failer = tester.createFailer();
return tester;
}
@@ -134,6 +137,7 @@ public class NodeFailTester {
apps.put(app2, new MockDeployer.ApplicationContext(app2, clusterApp2, capacity2, 1));
tester.deployer = new MockDeployer(tester.provisioner, apps);
tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
+ tester.metric = new MetricsReporterTest.TestMetric();
tester.failer = tester.createFailer();
return tester;
}
@@ -155,6 +159,7 @@ public class NodeFailTester {
apps.put(app1, new MockDeployer.ApplicationContext(app1, clusterApp1, allProxies, 1));
tester.deployer = new MockDeployer(tester.provisioner, apps);
tester.serviceMonitor = new ServiceMonitorStub(apps, tester.nodeRepository);
+ tester.metric = new MetricsReporterTest.TestMetric();
tester.failer = tester.createFailer();
return tester;
}
@@ -163,6 +168,7 @@ public class NodeFailTester {
NodeFailTester tester = new NodeFailTester();
tester.deployer = new MockDeployer(tester.provisioner, Collections.emptyMap());
tester.serviceMonitor = new ServiceMonitorStub(Collections.emptyMap(), tester.nodeRepository);
+ tester.metric = new MetricsReporterTest.TestMetric();
tester.failer = tester.createFailer();
return tester;
}
@@ -177,7 +183,8 @@ public class NodeFailTester {
}
public NodeFailer createFailer() {
- return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, new JobControl(nodeRepository.database()));
+ metric.values = new HashMap<>();
+ return new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, downtimeLimitOneHour, clock, orchestrator, NodeFailer.ThrottlePolicy.hosted, metric, new JobControl(nodeRepository.database()));
}
public void allNodesMakeAConfigRequestExcept(Node ... deadNodeArray) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index f95cfc1b0f1..6d41cfa08e5 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -381,6 +381,7 @@ public class NodeFailerTest {
// 2 nodes are failed (the minimum amount that are always allowed to fail)
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -389,6 +390,7 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(2, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// 2 docker hosts now fail, 1 of them (with all its children is allowed to fail)
hosts.subList(0, 2).forEach(host -> {
@@ -401,6 +403,7 @@ public class NodeFailerTest {
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// 24 more hours pass without any other nodes being failed out
for (int minutes = 0, interval = 30; minutes <= 23 * 60; minutes += interval) {
@@ -409,18 +412,21 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(6, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// Next, the 2 ready nodes that were dead from the start are failed out, and finally
// the second host and all its children are failed
tester.clock.advance(Duration.ofMinutes(30));
tester.failer.run();
assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling"));
// Nothing else to fail
tester.clock.advance(Duration.ofHours(25));
tester.allNodesMakeAConfigRequestExcept(deadNodes);
tester.failer.run();
assertEquals(12, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric.", 0, tester.metric.values.get("nodeFailThrottling"));
}
// Throttles based on percentage in large zone
@@ -437,6 +443,7 @@ public class NodeFailerTest {
tester.failer.run();
// 1% are allowed to fail
assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// 6 more hours pass, no more nodes are failed
for (int minutes = 0, interval = 30; minutes < 6 * 60; minutes += interval) {
@@ -445,6 +452,7 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(5, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is indicated by the metric.", 1, tester.metric.values.get("nodeFailThrottling"));
// 18 more hours pass, 24 hours since the first 5 nodes were failed. The remaining 5 are failed
for (int minutes = 0, interval = 30; minutes < 18 * 60; minutes += interval) {
@@ -453,6 +461,7 @@ public class NodeFailerTest {
}
tester.failer.run();
assertEquals(10, tester.nodeRepository.getNodes(Node.State.failed).size());
+ assertEquals("Throttling is not indicated by the metric, as no throttled attempt is made.", 0, tester.metric.values.get("nodeFailThrottling"));
}
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
index 6c0c344a72b..d1a14d3e489 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/monitoring/MetricsReporterTest.java
@@ -193,7 +193,7 @@ public class MetricsReporterTest {
return Optional.empty();
}
- private static class TestMetric implements Metric {
+ public static class TestMetric implements Metric {
public Map<String, Number> values = new HashMap<>();
public Map<String, List<Context>> context = new HashMap<>();