about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2021-07-15 14:08:50 +0200
committerGitHub <noreply@github.com>2021-07-15 14:08:50 +0200
commit757a195f84dd60f3a82ea927d166e405ad135aff (patch)
tree82690404ccc333303881044ba558f27161f2745a
parentcd6bcb0a881cf7470ab84433e630870c5633fa12 (diff)
parent6af8b75f47a7ef458d9917ee10b020b604490db7 (diff)
Merge pull request #18614 from vespa-engine/mpolden/replacement-node-in-group
Account for groups when deciding if node is replacement
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java13
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java21
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java9
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java26
4 files changed, 55 insertions, 14 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 695f0dd8659..5fe10f09f8a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -76,7 +76,7 @@ public class NodePrioritizer {
// In dynamically provisioned zones, we can always take spare hosts since we can provision new on-demand,
// NodeCandidate::compareTo will ensure that they will not be used until there is no room elsewhere.
// In non-dynamically provisioned zones, we only allow allocating to spare hosts to replace failed nodes.
- this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster);
+ this.canAllocateToSpareHosts = dynamicProvisioning || isReplacement(nodesInCluster, clusterSpec.group());
// Do not allocate new nodes for exclusive deployments in dynamically provisioned zones: provision new host instead.
this.canAllocateNew = requestedNodes instanceof NodeSpec.CountNodeSpec
&& (!dynamicProvisioning || !requestedNodes.isExclusive());
@@ -195,10 +195,13 @@ public class NodePrioritizer {
}
/** Returns whether we are allocating to replace a failed node */
- private boolean isReplacement(NodeList nodesInCluster) {
- int failedNodesInCluster = nodesInCluster.failing().size() + nodesInCluster.state(Node.State.failed).size();
- if (failedNodesInCluster == 0) return false;
- return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
+ private boolean isReplacement(NodeList nodesInCluster, Optional<ClusterSpec.Group> group) {
+ NodeList nodesInGroup = group.map(ClusterSpec.Group::index)
+ .map(nodesInCluster::group)
+ .orElse(nodesInCluster);
+ int failedNodesInGroup = nodesInGroup.failing().size() + nodesInGroup.state(Node.State.failed).size();
+ if (failedNodesInGroup == 0) return false;
+ return ! requestedNodes.fulfilledBy(nodesInGroup.size() - failedNodesInGroup);
}
/**
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
index 2b180853d83..bdb93294cd1 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/RealDataScenarioTest.java
@@ -35,6 +35,7 @@ import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
@@ -49,13 +50,21 @@ import static com.yahoo.config.provision.NodeResources.DiskSpeed.fast;
import static com.yahoo.config.provision.NodeResources.StorageType.local;
import static com.yahoo.config.provision.NodeResources.StorageType.remote;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
/**
* Scenario tester with real node-repository data loaded from ZK snapshot file
*
+ * How to use:
+ *
+ * 1. Copy /opt/vespa/conf/configserver-app/node-flavors.xml from config server to /tmp/node-flavors.xml
+ * 2. Copy /opt/vespa/var/zookeeper/version-2/snapshot.XXX from config server to /tmp/snapshot
+ * 3. Set capacities and specs according to the wanted scenario
+ *
* @author valerijf
*/
public class RealDataScenarioTest {
+
private static final Logger log = Logger.getLogger(RealDataScenarioTest.class.getSimpleName());
@Ignore
@@ -63,11 +72,11 @@ public class RealDataScenarioTest {
public void test() {
ProvisioningTester tester = new ProvisioningTester.Builder()
.zone(new Zone(Cloud.builder().dynamicProvisioning(true).build(), SystemName.defaultSystem(), Environment.prod, RegionName.defaultName()))
- .flavorsConfig(parseFlavors(Paths.get(System.getProperty("user.home"), ".flavors.xml")))
+ .flavorsConfig(parseFlavors(Paths.get("/tmp/node-flavors.xml")))
.nameResolver(new DnsNameResolver())
.spareCount(1)
.build();
- initFromZk(tester.nodeRepository(), Paths.get(System.getProperty("user.home"), "snapshot"));
+ initFromZk(tester.nodeRepository(), Paths.get("/tmp/snapshot"));
ApplicationId app = ApplicationId.from("tenant", "app", "default");
Version version = Version.fromString("7.123.4");
@@ -91,10 +100,11 @@ public class RealDataScenarioTest {
}
private void deploy(ProvisioningTester tester, ApplicationId app, ClusterSpec[] specs, Capacity[] capacities) {
+ assertEquals("Equal capacity and spec count", capacities.length, specs.length);
List<HostSpec> hostSpecs = IntStream.range(0, capacities.length)
- .mapToObj(i -> tester.provisioner().prepare(app, specs[i], capacities[i], log::log).stream())
- .flatMap(s -> s)
- .collect(Collectors.toList());
+ .mapToObj(i -> tester.provisioner().prepare(app, specs[i], capacities[i], log::log))
+ .flatMap(Collection::stream)
+ .collect(Collectors.toList());
NestedTransaction transaction = new NestedTransaction();
tester.provisioner().activate(hostSpecs, new ActivationContext(0), new ApplicationTransaction(new ProvisionLock(app, () -> {}), transaction));
transaction.commit();
@@ -144,4 +154,5 @@ public class RealDataScenarioTest {
throw new UncheckedIOException(e);
}
}
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 9ded28094d2..774350084ac 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -148,11 +148,12 @@ public class NodeFailTester {
}
public static NodeFailTester withOneUndeployedApplication(Capacity capacity) {
- NodeFailTester tester = new NodeFailTester();
+ return withOneUndeployedApplication(capacity, ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build());
+ }
- // Create applications
- ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
- Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
+ public static NodeFailTester withOneUndeployedApplication(Capacity capacity, ClusterSpec spec) {
+ NodeFailTester tester = new NodeFailTester();
+ Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, spec, capacity));
tester.initializeMaintainers(apps);
return tester;
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index 7eff8af8b8d..dfeb82281e6 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -377,6 +377,32 @@ public class NodeFailerTest {
}
@Test
+ public void node_failing_can_allocate_spare_to_replace_failed_node_in_group() {
+ NodeResources resources = new NodeResources(1, 20, 15, 1);
+ Capacity capacity = Capacity.from(new ClusterResources(4, 2, resources), false, true);
+ ClusterSpec spec = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+ NodeFailTester tester = NodeFailTester.withOneUndeployedApplication(capacity, spec);
+ assertEquals("Test depends on this setting in NodeFailTester", 1, tester.nodeRepository.spareCount());
+ tester.createAndActivateHosts(5, resources); // One extra - becomes designated spare
+ tester.activate(NodeFailTester.app1, spec, capacity);
+
+ // Hardware failure is reported for host
+ NodeList activeNodes = tester.nodeRepository.nodes().list(Node.State.active);
+ Node downNode = activeNodes.owner(NodeFailTester.app1).first().get();
+ Node downHost = activeNodes.parentOf(downNode).get();
+ tester.tester.patchNode(downHost, (node) -> node.with(node.reports().withReport(badTotalMemorySizeReport)));
+ tester.suspend(downHost.hostname());
+ tester.suspend(downNode.hostname());
+
+ // Node is failed and replaced
+ tester.runMaintainers();
+ assertEquals(1, tester.deployer.redeployments);
+ NodeList failedOrActive = tester.nodeRepository.nodes().list(Node.State.active, Node.State.failed);
+ assertEquals(4, failedOrActive.state(Node.State.active).nodeType(NodeType.tenant).size());
+ assertEquals(Set.of(downNode.hostname()), failedOrActive.state(Node.State.failed).nodeType(NodeType.tenant).hostnames());
+ }
+
+ @Test
public void failing_ready_nodes() {
NodeFailTester tester = NodeFailTester.withTwoApplications();