From 6af8b75f47a7ef458d9917ee10b020b604490db7 Mon Sep 17 00:00:00 2001
From: Martin Polden
Date: Thu, 15 Jul 2021 13:55:10 +0200
Subject: Account for groups when deciding if node is replacement

---
 .../provision/provisioning/NodePrioritizer.java | 13 ++++++-----
 .../provision/maintenance/NodeFailTester.java   |  9 ++++----
 .../provision/maintenance/NodeFailerTest.java   | 26 ++++++++++++++++++++++
 3 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 695f0dd8659..5fe10f09f8a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -76,7 +76,7 @@ public class NodePrioritizer {
         // In dynamically provisioned zones, we can always take spare hosts since we can provision new on-demand,
         // NodeCandidate::compareTo will ensure that they will not be used until there is no room elsewhere.
         // In non-dynamically provisioned zones, we only allow allocating to spare hosts to replace failed nodes.
-        this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster);
+        this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster, clusterSpec.group());
         // Do not allocate new nodes for exclusive deployments in dynamically provisioned zones: provision new host instead.
         this.canAllocateNew = requestedNodes instanceof NodeSpec.CountNodeSpec
                               && (!dynamicProvisioning || !requestedNodes.isExclusive());
@@ -195,10 +195,13 @@ public class NodePrioritizer {
     }
 
     /** Returns whether we are allocating to replace a failed node */
-    private boolean isReplacement(NodeList nodesInCluster) {
-        int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size();
-        if (failedNodesInCluster == 0) return false;
-        return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
+    private boolean isReplacement(NodeList nodesInCluster, Optional<ClusterSpec.Group> group) {
+        NodeList nodesInGroup = group.map(ClusterSpec.Group::index)
+                                     .map(nodesInCluster::group)
+                                     .orElse(nodesInCluster);
+        int failedNodesInGroup = nodesInGroup.failing().size() + nodesInGroup.state(Node.State.failed).size();
+        if (failedNodesInGroup == 0) return false;
+        return ! requestedNodes.fulfilledBy(nodesInGroup.size() - failedNodesInGroup);
     }
 
     /**
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 9ded28094d2..774350084ac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -148,11 +148,12 @@ public class NodeFailTester {
     }
 
     public static NodeFailTester withOneUndeployedApplication(Capacity capacity) {
-        NodeFailTester tester = new NodeFailTester();
+        return withOneUndeployedApplication(capacity, ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build());
+    }
 
-        // Create applications
-        ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
-        Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
+    public static NodeFailTester withOneUndeployedApplication(Capacity capacity, ClusterSpec spec) {
+        NodeFailTester tester = new NodeFailTester();
+        Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, spec, capacity));
         tester.initializeMaintainers(apps);
         return tester;
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 7eff8af8b8d..dfeb82281e6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -376,6 +376,32 @@ public class NodeFailerTest {
         assertEquals(downHost, tester.nodeRepository.nodes().list(Node.State.failed).nodeType(NodeType.tenant).asList().get(0).hostname());
     }
 
+    @Test
+    public void node_failing_can_allocate_spare_to_replace_failed_node_in_group() {
+        NodeResources resources = new NodeResources(1, 20, 15, 1);
+        Capacity capacity = Capacity.from(new ClusterResources(4, 2, resources), false, true);
+        ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+        NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity, spec);
+        assertEquals("Test depends on this setting in NodeFailTester", 1, tester.nodeRepository.spareCount());
+        tester.createAndActivateHosts(5, resources); // One extra - becomes designated spare
+        tester.activate(NodeFailTester.app1, spec, capacity);
+
+        // Hardware failure is reported for host
+        NodeList activeNodes = tester.nodeRepository.nodes().list(Node.State.active);
+        Node downNode = activeNodes.owner(NodeFailTester.app1).first().get();
+        Node downHost = activeNodes.parentOf(downNode).get();
+        tester.tester.patchNode(downHost, (node) -> node.with(node.reports().withReport(badTotalMemorySizeReport)));
+        tester.suspend(downHost.hostname());
+        tester.suspend(downNode.hostname());
+
+        // Node is failed and replaced
+        tester.runMaintainers();
+        assertEquals(1, tester.deployer.redeployments);
+        NodeList failedOrActive = tester.nodeRepository.nodes().list(Node.State.active, Node.State.failed);
+        assertEquals(4, failedOrActive.state(Node.State.active).nodeType(NodeType.tenant).size());
+        assertEquals(Set.of(downNode.hostname()), failedOrActive.state(Node.State.failed).nodeType(NodeType.tenant).hostnames());
+    }
+
     @Test
     public void failing_ready_nodes() {
         NodeFailTester tester = NodeFailTester.withTwoApplications();
-- 
cgit v1.2.3
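
Not part of the patch: a minimal standalone sketch of the group-aware replacement check above, for running the logic in isolation. All names here (Node, groupIndex, requestedNodesPerGroup, IsReplacementSketch) are simplified stand-ins, not the node-repository API; the real code operates on NodeList and NodeSpec, and this sketch assumes the requested node count applies to the group being allocated.

import java.util.List;
import java.util.Optional;

// Simplified stand-in types; the real implementation lives in
// NodePrioritizer and operates on NodeList and NodeSpec.
public class IsReplacementSketch {

    record Node(int groupIndex, boolean failed) {}

    // After the patch, failed nodes are counted within the requested group
    // (when one is given) instead of across the whole cluster, so a failure
    // in one group is no longer masked by healthy nodes in the others.
    static boolean isReplacement(List<Node> nodesInCluster,
                                 Optional<Integer> group,
                                 int requestedNodesPerGroup) {
        List<Node> nodesInGroup = group.map(g -> nodesInCluster.stream()
                                                               .filter(n -> n.groupIndex() == g)
                                                               .toList())
                                       .orElse(nodesInCluster);
        long failedNodesInGroup = nodesInGroup.stream().filter(Node::failed).count();
        if (failedNodesInGroup == 0) return false;
        // A replacement: the group's healthy nodes no longer fulfil the request
        return nodesInGroup.size() - failedNodesInGroup < requestedNodesPerGroup;
    }

    public static void main(String[] args) {
        // Two groups of two nodes (as in the new test); one node in group 0 failed
        List<Node> cluster = List.of(new Node(0, true), new Node(0, false),
                                     new Node(1, false), new Node(1, false));
        System.out.println(isReplacement(cluster, Optional.of(0), 2));   // true: 1 of 2 healthy
        System.out.println(isReplacement(cluster, Optional.of(1), 2));   // false: 2 of 2 healthy
        // Pre-patch behavior, counted cluster-wide: 3 healthy nodes still
        // satisfy a request for 2, so the failure in group 0 was masked
        System.out.println(isReplacement(cluster, Optional.empty(), 2)); // false
    }
}

This mirrors what the new test exercises: with four nodes in two groups and one spare host, a failed node must register as a replacement for its own group; counted cluster-wide, the three remaining healthy nodes would still satisfy the per-group request of two, the allocation would not be treated as a replacement, and the spare host could not be used.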