diff options
author | Jon Bratseth <bratseth@verizonmedia.com> | 2019-10-22 11:03:12 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@verizonmedia.com> | 2019-10-22 11:03:12 +0200 |
commit | 534e7046f4fdde6e8756237e132a22d1a259eee0 (patch) | |
tree | 48030b7739c7fc7fd118c5dc7d8e891edc72e0da /node-repository | |
parent | 6566f3ff831fff8419ca32c692289b86448ac31c (diff) |
Add metric hostedVespa.docker.skew to measure average host skew
Diffstat (limited to 'node-repository')
4 files changed, 31 insertions, 2 deletions
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index c644626bf01..46cf3f24806 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -84,7 +84,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
                 new DynamicProvisioningMaintainer(nodeRepository, durationFromEnv("host_provisioner_interval").orElse(defaults.dynamicProvisionerInterval), hostProvisioner, flagSource));
         capacityReportMaintainer = new CapacityReportMaintainer(nodeRepository, metric, durationFromEnv("capacity_report_interval").orElse(defaults.capacityReportInterval));
         osUpgradeActivator = new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval);
-        rebalancer = new Rebalancer(nodeRepository, provisionServiceProvider.getHostResourcesCalculator(), clock, defaults.rebalancerInterval);
+        rebalancer = new Rebalancer(nodeRepository, provisionServiceProvider.getHostResourcesCalculator(), metric, clock, defaults.rebalancerInterval);
         // The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now
         infrastructureProvisioner.maintain();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
index 5698d13e33c..b94e9974313 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance;
 import com.yahoo.config.provision.NodeResources;
 import com.yahoo.config.provision.NodeType;
+import com.yahoo.jdisc.Metric;
 import com.yahoo.transaction.Mutex;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
@@ -21,11 +22,17 @@ import java.util.stream.Stream;
 public class Rebalancer extends Maintainer {
     private final HostResourcesCalculator hostResourcesCalculator;
+    private final Metric metric;
     private final Clock clock;
-    public Rebalancer(NodeRepository nodeRepository, HostResourcesCalculator hostResourcesCalculator, Clock clock, Duration interval) {
+    public Rebalancer(NodeRepository nodeRepository,
+                      HostResourcesCalculator hostResourcesCalculator,
+                      Metric metric,
+                      Clock clock,
+                      Duration interval) {
         super(nodeRepository, interval);
         this.hostResourcesCalculator = hostResourcesCalculator;
+        this.metric = metric;
         this.clock = clock;
     }
@@ -34,6 +41,8 @@ public class Rebalancer extends Maintainer {
         // Work with an unlocked snapshot as this can take a long time and full consistency is not needed
         NodeList allNodes = nodeRepository().list();
+        updateSkewMetric(allNodes);
+
         if ( ! zoneIsStable(allNodes)) return;
         Move bestMove = findBestMove(allNodes);
@@ -41,6 +50,18 @@ public class Rebalancer extends Maintainer {
         markWantToRetire(bestMove.node);
     }
+    /** We do this here rather than in MetricsReporter because it is expensive and frequent updates are unnecessary */
+    private void updateSkewMetric(NodeList allNodes) {
+        DockerHostCapacity capacity = new DockerHostCapacity(allNodes, hostResourcesCalculator);
+        double totalSkew = 0;
+        int hostCount = 0;
+        for (Node host : allNodes.nodeType((NodeType.host)).state(Node.State.active)) {
+            hostCount++;
+            totalSkew += Node.skew(host.flavor().resources(), capacity.freeCapacityOf(host));
+        }
+        metric.set("hostedVespa.docker.skew", totalSkew/hostCount, null);
+    }
+
     private boolean zoneIsStable(NodeList allNodes) {
         NodeList active = allNodes.state(Node.State.active);
         if (active.stream().anyMatch(node -> node.allocation().get().membership().retired())) return false;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
index c5af068c30b..db4ba661b64 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java
@@ -236,4 +236,5 @@ public class MetricsReporterTest {
             }
         }
     }
+
 }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
index 3e9744a6518..ad491b41a31 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/RebalancerTest.java
@@ -35,8 +35,10 @@ public class RebalancerTest {
     @Test
     public void testRebalancing() {
         ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build();
+        MetricsReporterTest.TestMetric metric = new MetricsReporterTest.TestMetric();
         Rebalancer rebalancer = new Rebalancer(tester.nodeRepository(),
                                                new IdentityHostResourcesCalculator(),
+                                               metric,
                                                tester.clock(),
                                                Duration.ofMinutes(1));
@@ -54,12 +56,15 @@ public class RebalancerTest {
         rebalancer.maintain();
         assertFalse("No better place to move the skewed node, so no action is taken",
                     tester.nodeRepository().getNode(cpuSkewedNodeHostname).get().status().wantToRetire());
+        assertEquals(0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
         tester.makeReadyNodes(1, "cpu", NodeType.host, 8);
         rebalancer.maintain();
         assertTrue("We can now move the node to the cpu skewed host to reduce skew",
                    tester.nodeRepository().getNode(cpuSkewedNodeHostname).get().status().wantToRetire());
+        assertEquals("We're not actually moving the node here so skew remains steady",
+                     0.00325, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
         ApplicationId memApp = makeApplicationId("t2", "a2");
         deployApp(memApp, clusterSpec("c"), memResources, tester, 1);
@@ -69,6 +74,8 @@ public class RebalancerTest {
         tester.makeReadyNodes(1, "mem", NodeType.host, 8);
         rebalancer.maintain();
+        assertEquals("Deploying the mem skewed app increased skew",
+                     0.00752, metric.values.get("hostedVespa.docker.skew").doubleValue(), 0.00001);
         assertFalse("The mem skewed node is not set want to retire as the cpu skewed node still is",
                     tester.nodeRepository().getNode(memSkewedNodeHostname).get().status().wantToRetire());
```