author     Martin Polden <mpolden@mpolden.no>  2021-07-15 14:08:50 +0200
committer  GitHub <noreply@github.com>         2021-07-15 14:08:50 +0200
commit     757a195f84dd60f3a82ea927d166e405ad135aff (patch)
tree       82690404ccc333303881044ba558f27161f2745a
parent     cd6bcb0a881cf7470ab84433e630870c5633fa12 (diff)
parent     6af8b75f47a7ef458d9917ee10b020b604490db7 (diff)
Merge pull request #18614 from vespa-engine/mpolden/replacement-node-in-group
Account for groups when deciding if node is replacement
4 files changed, 55 insertions, 14 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 695f0dd8659..5fe10f09f8a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -76,7 +76,7 @@ public class NodePrioritizer {
         // In dynamically provisioned zones, we can always take spare hosts since we can provision new on-demand,
         // NodeCandidate::compareTo will ensure that they will not be used until there is no room elsewhere.
         // In non-dynamically provisioned zones, we only allow allocating to spare hosts to replace failed nodes.
-        this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster);
+        this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster, clusterSpec.group());
         // Do not allocate new nodes for exclusive deployments in dynamically provisioned zones: provision new host instead.
         this.canAllocateNew = requestedNodes instanceof NodeSpec.CountNodeSpec
                               && (!dynamicProvisioning || !requestedNodes.isExclusive());
@@ -195,10 +195,13 @@ public class NodePrioritizer {
     }
 
     /** Returns whether we are allocating to replace a failed node */
-    private boolean isReplacement(NodeList nodesInCluster) {
-        int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size();
-        if (failedNodesInCluster == 0) return false;
-        return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
+    private boolean isReplacement(NodeList nodesInCluster, Optional<ClusterSpec.Group> group) {
+        NodeList nodesInGroup = group.map(ClusterSpec.Group::index)
+                                     .map(nodesInCluster::group)
+                                     .orElse(nodesInCluster);
+        int failedNodesInGroup = nodesInGroup.failing().size() + nodesInGroup.state(Node.State.failed).size();
+        if (failedNodesInGroup == 0) return false;
+        return ! requestedNodes.fulfilledBy(nodesInGroup.size() - failedNodesInGroup);
     }
 
     /**
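The behavioral change above: when a group is given, failed nodes are counted only within that group. With 4 nodes in 2 groups and one failed node, the old cluster-wide count saw 3 healthy nodes, which can still fulfill a 2-node group request, so allocation was not treated as a replacement and the spare host stayed off-limits; scoped to the failed node's group, 1 healthy node out of 2 requested makes it a replacement. A minimal, self-contained sketch of the scoped check (simplified stand-in types, not the real node-repository classes):

    import java.util.List;
    import java.util.Optional;

    // Stand-in for a node with a group index and a failed flag (illustration only).
    record FakeNode(int groupIndex, boolean failed) {}

    public class ReplacementCheckSketch {

        // Group-scoped variant: count failures only in the group we are allocating for,
        // falling back to the whole cluster when no group is given.
        static boolean isReplacement(List<FakeNode> nodesInCluster, Optional<Integer> group, int requestedCount) {
            List<FakeNode> nodesInGroup = group
                    .map(g -> nodesInCluster.stream().filter(n -> n.groupIndex() == g).toList())
                    .orElse(nodesInCluster);
            long failed = nodesInGroup.stream().filter(FakeNode::failed).count();
            if (failed == 0) return false;
            return nodesInGroup.size() - failed < requestedCount; // request no longer fulfilled
        }

        public static void main(String[] args) {
            // 2 groups of 2 nodes; one node in group 0 has failed; each group requests 2 nodes.
            List<FakeNode> cluster = List.of(new FakeNode(0, true), new FakeNode(0, false),
                                             new FakeNode(1, false), new FakeNode(1, false));
            System.out.println(isReplacement(cluster, Optional.empty(), 2)); // false: 3 healthy >= 2
            System.out.println(isReplacement(cluster, Optional.of(0), 2));   // true: 1 healthy < 2
        }
    }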
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
index 2b180853d83..bdb93294cd1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
@@ -35,6 +35,7 @@ import java.io.UncheckedIOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.Collection;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Consumer;
@@ -49,13 +50,21 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.fast;
 import static com.yahoo.config.provision.NodeResources.StorageType.local;
 import static com.yahoo.config.provision.NodeResources.StorageType.remote;
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
 
 /**
  * Scenario tester with real node-repository data loaded from ZK snapshot file
  *
+ * How to use:
+ *
+ * 1. Copy /opt/vespa/conf/configserver-app/node-flavors.xml from config server to /tmp/node-flavors.xml
+ * 2. Copy /opt/vespa/var/zookeeper/version-2/snapshot.XXX from config server to /tmp/snapshot
+ * 3. Set capacities and specs according to the wanted scenario
+ *
  * @author valerijf
 */
 public class RealDataScenarioTest {
+    private static final Logger log = Logger.getLogger(RealDataScenarioTest.class.getSimpleName());
 
     @Ignore
@@ -63,11 +72,11 @@ public class RealDataScenarioTest {
     public void test() {
         ProvisioningTester tester = new ProvisioningTester.Builder()
                 .zone(new Zone(Cloud.builder().dynamicProvisioning(true).build(), SystemName.defaultSystem(), Environment.prod, RegionName.defaultName()))
-                .flavorsConfig(parseFlavors(Paths.get(System.getProperty("user.home"), ".flavors.xml")))
+                .flavorsConfig(parseFlavors(Paths.get("/tmp/node-flavors.xml")))
                 .nameResolver(new DnsNameResolver())
                 .spareCount(1)
                 .build();
-        initFromZk(tester.nodeRepository(), Paths.get(System.getProperty("user.home"), "snapshot"));
+        initFromZk(tester.nodeRepository(), Paths.get("/tmp/snapshot"));
 
         ApplicationId app = ApplicationId.from("tenant", "app", "default");
         Version version = Version.fromString("7.123.4");
@@ -91,10 +100,11 @@ public class RealDataScenarioTest {
     }
 
     private void deploy(ProvisioningTester tester, ApplicationId app, ClusterSpec[] specs, Capacity[] capacities) {
+        assertEquals("Equal capacity and spec count", capacities.length, specs.length);
         List<HostSpec> hostSpecs = IntStream.range(0, capacities.length)
-                .mapToObj(i -> tester.provisioner().prepare(app, specs[i], capacities[i], log::log).stream())
-                .flatMap(s -> s)
-                .collect(Collectors.toList());
+                .mapToObj(i -> tester.provisioner().prepare(app, specs[i], capacities[i], log::log))
+                .flatMap(Collection::stream)
+                .collect(Collectors.toList());
         NestedTransaction transaction = new NestedTransaction();
         tester.provisioner().activate(hostSpecs, new ActivationContext(0), new ApplicationTransaction(new ProvisionLock(app, () -> {}), transaction));
         transaction.commit();
@@ -144,4 +154,5 @@ public class RealDataScenarioTest {
             throw new UncheckedIOException(e);
         }
     }
+
 }
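The deploy() change above is a behavior-preserving cleanup: instead of mapping each prepare() result to a Stream and flattening with the identity lambda, it maps to the returned collection and flattens with Collection::stream, and a new assertion guards against mismatched specs and capacities arrays. A standalone illustration of the stream idiom:

    import java.util.Collection;
    import java.util.List;
    import java.util.stream.Collectors;
    import java.util.stream.IntStream;

    public class FlatMapIdiom {
        public static void main(String[] args) {
            List<List<String>> prepared = List.of(List.of("host1", "host2"), List.of("host3"));
            // Old style: map each element to a Stream, then flatten with s -> s.
            List<String> viaIdentity = prepared.stream()
                    .map(List::stream)
                    .flatMap(s -> s)
                    .collect(Collectors.toList());
            // New style: map to the Collection itself and flatten with Collection::stream.
            List<String> viaMethodRef = IntStream.range(0, prepared.size())
                    .mapToObj(prepared::get)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toList());
            System.out.println(viaIdentity.equals(viaMethodRef)); // true
        }
    }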
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 9ded28094d2..774350084ac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -148,11 +148,12 @@ public class NodeFailTester {
     }
 
     public static NodeFailTester withOneUndeployedApplication(Capacity capacity) {
-        NodeFailTester tester = new NodeFailTester();
+        return withOneUndeployedApplication(capacity, ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build());
+    }
 
-        // Create applications
-        ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
-        Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
+    public static NodeFailTester withOneUndeployedApplication(Capacity capacity, ClusterSpec spec) {
+        NodeFailTester tester = new NodeFailTester();
+        Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, spec, capacity));
         tester.initializeMaintainers(apps);
         return tester;
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 7eff8af8b8d..dfeb82281e6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -377,6 +377,32 @@ public class NodeFailerTest {
     }
 
     @Test
+    public void node_failing_can_allocate_spare_to_replace_failed_node_in_group() {
+        NodeResources resources = new NodeResources(1, 20, 15, 1);
+        Capacity capacity = Capacity.from(new ClusterResources(4, 2, resources), false, true);
+        ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+        NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity, spec);
+        assertEquals("Test depends on this setting in NodeFailTester", 1, tester.nodeRepository.spareCount());
+        tester.createAndActivateHosts(5, resources); // One extra - becomes designated spare
+        tester.activate(NodeFailTester.app1, spec, capacity);
+
+        // Hardware failure is reported for host
+        NodeList activeNodes = tester.nodeRepository.nodes().list(Node.State.active);
+        Node downNode = activeNodes.owner(NodeFailTester.app1).first().get();
+        Node downHost = activeNodes.parentOf(downNode).get();
+        tester.tester.patchNode(downHost, (node) -> node.with(node.reports().withReport(badTotalMemorySizeReport)));
+        tester.suspend(downHost.hostname());
+        tester.suspend(downNode.hostname());
+
+        // Node is failed and replaced
+        tester.runMaintainers();
+        assertEquals(1, tester.deployer.redeployments);
+        NodeList failedOrActive = tester.nodeRepository.nodes().list(Node.State.active, Node.State.failed);
+        assertEquals(4, failedOrActive.state(Node.State.active).nodeType(NodeType.tenant).size());
+        assertEquals(Set.of(downNode.hostname()), failedOrActive.state(Node.State.failed).nodeType(NodeType.tenant).hostnames());
+    }
+
+    @Test
     public void failing_ready_nodes() {
         NodeFailTester tester = NodeFailTester.withTwoApplications();
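The NodeFailTester refactoring above is backward compatible: the one-argument factory now delegates to the new two-argument overload with the previous default container spec, so existing callers compile unchanged while tests like the new one in NodeFailerTest can request a grouped content cluster. A usage sketch, using only identifiers that appear in the diff (assumes the node-repository test classpath; capacity and resources as in the new test):

    // Existing callers are unaffected: the default container cluster spec is used.
    NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity);

    // New in this change: supply a spec explicitly, e.g. a content cluster whose
    // capacity asks for 4 nodes split across 2 groups.
    ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"))
                                  .vespaVersion("6.42")
                                  .build();
    NodeFailTester grouped = NodeFailTester.withOneUndeployedApplication(capacity, spec);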