about summary refs log tree commit diff stats
path: root/node-repository/src
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@verizonmedia.com> 2019-10-22 11:03:12 +0200
committer Jon Bratseth <bratseth@verizonmedia.com> 2019-10-22 11:03:12 +0200
commit 534e7046f4fdde6e8756237e132a22d1a259eee0 (patch)
tree 48030b7739c7fc7fd118c5dc7d8e891edc72e0da /node-repository/src
parent 6566f3ff831fff8419ca32c692289b86448ac31c (diff)
Add metric hostedVespa.docker.skew to measure average host skew
Diffstat (limited to 'node-repository/src')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java 23
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java 1
-rw-r--r-- node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java 7
4 files changed, 31 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index c644626bf01..46cf3f24806 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -84,7 +84,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
new DynamicProvisioningMaintainer(nodeRepository, durationFromEnv("host_provisioner_interval").orElse(defaults.dynamicProvisionerInterval), hostProvisioner, flagSource));
capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, durationFromEnv("capacity_report_interval").orElse(defaults.capacityReportInterval));
osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval);
- rebalancer = new Rebalancer(nodeRepository, provisionServiceProvider.getHostResourcesCalculator(), clock, defaults.rebalancerInterval);
+ rebalancer = new Rebalancer(nodeRepository, provisionServiceProvider.getHostResourcesCalculator(), metric, clock, defaults.rebalancerInterval);
// The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now
infrastructureProvisioner.maintain();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 5698d13e33c..b94e9974313 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
@@ -21,11 +22,17 @@ import java.util.stream.Stream;
public class Rebalancer extends Maintainer {
private final HostResourcesCalculator hostResourcesCalculator;
+ private final Metric metric;
private final Clock clock;
- public Rebalancer(NodeRepository nodeRepository, HostResourcesCalculator hostResourcesCalculator, Clock clock, Duration interval) {
+ public Rebalancer(NodeRepository nodeRepository,
+ HostResourcesCalculator hostResourcesCalculator,
+ Metric metric, // sink for the "hostedVespa.docker.skew" value set in updateSkewMetric
+ Clock clock,
+ Duration interval) {
super(nodeRepository, interval); // the repository and run interval are handed to the Maintainer superclass
this.hostResourcesCalculator = hostResourcesCalculator;
+ this.metric = metric;
this.clock = clock;
}
@@ -34,6 +41,8 @@ public class Rebalancer extends Maintainer {
// Work with an unlocked snapshot as this can take a long time and full consistency is not needed
NodeList allNodes = nodeRepository().list();
+ updateSkewMetric(allNodes);
+
if ( ! zoneIsStable(allNodes)) return;
Move bestMove = findBestMove(allNodes);
@@ -41,6 +50,18 @@ public class Rebalancer extends Maintainer {
markWantToRetire(bestMove.node);
}
+ /** Sets "hostedVespa.docker.skew" to the average Node.skew over active hosts (0 if there are none). We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */
+ private void updateSkewMetric(NodeList allNodes) {
+ DockerHostCapacity capacity = new DockerHostCapacity(allNodes, hostResourcesCalculator);
+ double totalSkew = 0;
+ int hostCount = 0;
+ for (Node host : allNodes.nodeType(NodeType.host).state(Node.State.active)) {
+ hostCount++;
+ totalSkew += Node.skew(host.flavor().resources(), capacity.freeCapacityOf(host));
+ }
+ metric.set("hostedVespa.docker.skew", hostCount == 0 ? 0 : totalSkew / hostCount, null); // with no active hosts 0.0/0 would report NaN
+ }
+
private boolean zoneIsStable(NodeList allNodes) {
NodeList active = allNodes.state(Node.State.active);
if (active.stream().anyMatch(node -> node.allocation().get().membership().retired())) return false;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index c5af068c30b..db4ba661b64 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -236,4 +236,5 @@ public class MetricsReporterTest {
}
}
}
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
index 3e9744a6518..ad491b41a31 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
@@ -35,8 +35,10 @@ public class RebalancerTest {
@Test
public void testRebalancing() {
ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build();
+ MetricsReporterTest.TestMetric metric = new MetricsReporterTest.TestMetric();
Rebalancer rebalancer = new Rebalancer(tester.nodeRepository(),
new IdentityHostResourcesCalculator(),
+ metric,
tester.clock(),
Duration.ofMinutes(1));
@@ -54,12 +56,15 @@ public class RebalancerTest {
rebalancer.maintain();
assertFalse("No better place to move the skewed node, so no action is taken",
tester.nodeRepository().getNode(cpuSkewedNodeHostname).get().status().wantToRetire());
+ assertEquals(0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
tester.makeReadyNodes(1, "cpu", NodeType.host, 8);
rebalancer.maintain();
assertTrue("We can now move the node to the cpu skewed host to reduce skew",
tester.nodeRepository().getNode(cpuSkewedNodeHostname).get().status().wantToRetire());
+ assertEquals("We're not actually moving the node here so skew remains steady",
+ 0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
ApplicationId memApp = makeApplicationId("t2", "a2");
deployApp(memApp, clusterSpec("c"), memResources, tester, 1);
@@ -69,6 +74,8 @@ public class RebalancerTest {
tester.makeReadyNodes(1, "mem", NodeType.host, 8);
rebalancer.maintain();
+ assertEquals("Deploying the mem skewed app increased skew",
+ 0.00752, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
assertFalse("The mem skewed node is not set want to retire as the cpu skewed node still is",
tester.nodeRepository().getNode(memSkewedNodeHostname).get().status().wantToRetire());