diff options
author | Martin Polden <mpolden@mpolden.no> | 2021-02-18 14:28:26 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2021-02-18 15:03:58 +0100 |
commit | 2d68f8be7aee47a60d2ae6d8a19d36986da4efff (patch) | |
tree | db4a21bc92b3d255d23fcc0334f9f8202cddc94b /node-repository | |
parent | 49c817f7e8272685c73f49802a84bd613cb77d9a (diff) |
Avoid suggesting node moves to spares
Diffstat (limited to 'node-repository')
3 files changed, 31 insertions, 24 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java index 609d1f74526..9fe9b7db478 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.HostCapacity; import java.time.Duration; import java.time.Instant; +import java.util.Set; /** * Base class for maintainers that move nodes. @@ -39,6 +40,7 @@ public abstract class NodeMover<MOVE> extends NodeRepositoryMaintainer { HostCapacity capacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator()); MOVE bestMove = emptyMove; NodeList activeNodes = allNodes.nodeType(NodeType.tenant).state(Node.State.active); + Set<Node> spares = capacity.findSpareHosts(allNodes.asList(), nodeRepository().spareCount()); for (Node node : activeNodes) { if (node.parentHostname().isEmpty()) continue; ApplicationId applicationId = node.allocation().get().owner(); @@ -46,6 +48,7 @@ public abstract class NodeMover<MOVE> extends NodeRepositoryMaintainer { if (deployedRecently(applicationId)) continue; for (Node toHost : allNodes.matching(nodeRepository().nodes()::canAllocateTenantNodeTo)) { if (toHost.hostname().equals(node.parentHostname().get())) continue; + if (spares.contains(toHost)) continue; // Do not offer spares as a valid move as they are reserved for replacement of failed nodes if ( ! capacity.freeCapacityOf(toHost).satisfies(node.resources())) continue; MOVE suggestedMove = suggestedMove(node, allNodes.parentOf(node).get(), toHost, allNodes); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index 50e0116d98d..cfab980570d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -54,7 +54,7 @@ public class SwitchRebalancer extends NodeMover<Move> { @Override protected Move bestMoveOf(Move a, Move b) { - if (b.isEmpty()) return a; + if (!a.isEmpty()) return a; return b; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java index 9109d992f4a..0161d75c83d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancerTest.java @@ -21,7 +21,6 @@ import com.yahoo.vespa.hosted.provision.testutils.MockDeployer.ClusterContext; import org.junit.Test; import java.time.Duration; -import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.Set; @@ -42,16 +41,16 @@ public class SwitchRebalancerTest { public void rebalance() { ClusterSpec.Id cluster1 = ClusterSpec.Id.from("c1"); ClusterSpec.Id cluster2 = ClusterSpec.Id.from("c2"); - ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))) + .spareCount(1) + .build(); MockDeployer deployer = deployer(tester, cluster1, cluster2); SwitchRebalancer rebalancer = new SwitchRebalancer(tester.nodeRepository(), Duration.ofDays(1), new TestMetric(), deployer); // Provision initial hosts on same switch NodeResources hostResources = new NodeResources(48, 128, 500, 10); - List<Node> hosts0 = tester.makeReadyNodes(3, hostResources, NodeType.host, 5); - tester.activateTenantHosts(); String switch0 = "switch0"; - tester.patchNodes(hosts0, (host) -> host.withSwitchHostname(switch0)); + provisionHosts(3, switch0, hostResources, tester); // Deploy application deployer.deployFromLocalActive(app).get().activate(); @@ -62,21 +61,21 @@ public class SwitchRebalancerTest { tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); assertNoMoves(rebalancer, tester); - // Provision hosts on distinct switches - List<Node> hosts1 = tester.makeReadyNodes(3, hostResources, NodeType.host, 5); - tester.activateTenantHosts(); - for (int i = 0; i < hosts1.size(); i++) { - String switchHostname = "switch" + (i + 1); - tester.patchNode(hosts1.get(i), (host) -> host.withSwitchHostname(switchHostname)); - } + // Provision a single host on a new switch + provisionHost("switch1", hostResources, tester); - // Application is redeployed + // Application is redeployed and rebalancer does nothing as not enough time has passed since deployment deployer.deployFromLocalActive(app).get().activate(); + assertNoMoves(rebalancer, tester); - // Rebalancer does nothing as not enough time has passed since previous deployment + // No rebalancing because the additional host is counted as a spare + tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); assertNoMoves(rebalancer, tester); - // Rebalancer retires one node from non-exclusive switch in each cluster, and allocates a new one + // More hosts are provisioned. Rebalancer now retires one node from non-exclusive switch in each cluster, and + // allocates a new one + provisionHost("switch2", hostResources, tester); + provisionHost("switch3", hostResources, tester); for (var cluster : List.of(cluster1, cluster2)) { tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); rebalancer.maintain(); @@ -110,13 +109,10 @@ public class SwitchRebalancerTest { // Provision initial hosts on two switches NodeResources hostResources = new NodeResources(8, 16, 500, 10); - List<Node> hosts0 = tester.makeReadyNodes(4, hostResources, NodeType.host, 5); - hosts0.sort(Comparator.comparing(Node::hostname)); - tester.activateTenantHosts(); String switch0 = "switch0"; String switch1 = "switch1"; - tester.patchNode(hosts0.get(0), (host) -> host.withSwitchHostname(switch0)); - tester.patchNodes(hosts0.subList(1, hosts0.size()), (host) -> host.withSwitchHostname(switch1)); + provisionHost(switch0, hostResources, tester); + provisionHosts(3, switch1, hostResources, tester); // Deploy application deployer.deployFromLocalActive(app).get().activate(); @@ -126,10 +122,8 @@ public class SwitchRebalancerTest { assertEquals(3, tester.activeNodesOn(switch1, app, spec.id()).size()); // Another host becomes available on a new host - List<Node> hosts2 = tester.makeReadyNodes(1, hostResources, NodeType.host, 5); - tester.activateTenantHosts(); String switch2 = "switch2"; - tester.patchNodes(hosts2, (host) -> host.withSwitchHostname(switch2)); + provisionHost(switch2, hostResources, tester); // Rebalance tester.clock().advance(SwitchRebalancer.waitTimeAfterPreviousDeployment); @@ -151,6 +145,16 @@ public class SwitchRebalancerTest { assertNoMoves(rebalancer, tester); } + private void provisionHost(String switchHostname, NodeResources hostResources, ProvisioningTester tester) { + provisionHosts(1, switchHostname, hostResources, tester); + } + + private void provisionHosts(int count, String switchHostname, NodeResources hostResources, ProvisioningTester tester) { + List<Node> hosts = tester.makeReadyNodes(count, hostResources, NodeType.host, 5); + tester.patchNodes(hosts, (host) -> host.withSwitchHostname(switchHostname)); + tester.activateTenantHosts(); + } + private void assertNoMoves(SwitchRebalancer rebalancer, ProvisioningTester tester) { NodeList nodes0 = tester.nodeRepository().nodes().list(Node.State.active).owner(app); rebalancer.maintain(); |