diff options
author | Valerij Fredriksen <valerijf@oath.com> | 2018-02-09 16:44:07 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@oath.com> | 2018-02-09 16:57:03 +0100 |
commit | ecfb9e45b593841c996f70e67ad480d8a09fc0b2 (patch) | |
tree | 45ee2a2ebd2f9e1063cbd9bcb0807d72ad304b6f /node-repository | |
parent | 3e349b470c968586116245ef9a8719002cdf47b7 (diff) |
Allow retirement of nodes where node-spec is specified by type
Diffstat (limited to 'node-repository')
3 files changed, 208 insertions, 27 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java index d9d7b4a5d12..2a140945f43 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java @@ -213,23 +213,23 @@ class NodeAllocation { * @return the final list of nodes */ List<Node> finalNodes(List<Node> surplusNodes) { - long currentRetired = nodes.stream().filter(node -> node.node.allocation().get().membership().retired()).count(); - long surplus = requestedNodes.surplusGiven(nodes.size()) - currentRetired; + int currentRetiredCount = (int) nodes.stream().filter(node -> node.node.allocation().get().membership().retired()).count(); + int deltaRetiredCount = requestedNodes.idealRetiredCount(nodes.size(), currentRetiredCount) - currentRetiredCount; - if (surplus > 0) { // retire until surplus is 0, prefer to retire higher indexes to minimize redistribution + if (deltaRetiredCount > 0) { // retire until deltaRetiredCount is 0, prefer to retire higher indexes to minimize redistribution for (PrioritizableNode node : byDecreasingIndex(nodes)) { if ( ! node.node.allocation().get().membership().retired() && node.node.state().equals(Node.State.active)) { node.node = node.node.retire(Agent.application, clock.instant()); surplusNodes.add(node.node); // offer this node to other groups - if (--surplus == 0) break; + if (--deltaRetiredCount == 0) break; } } } - else if (surplus < 0) { // unretire until surplus is 0 + else if (deltaRetiredCount < 0) { // unretire until deltaRetiredCount is 0 for (PrioritizableNode node : byIncreasingIndex(nodes)) { if ( node.node.allocation().get().membership().retired() && hasCompatibleFlavor(node.node)) { node.node = node.node.unretire(); - if (++surplus == 0) break; + if (++deltaRetiredCount == 0) break; } } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index 23a6e3a8b9a..dc3f4a64421 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -34,8 +34,8 @@ public interface NodeSpec { /** Returns whether the given node count is sufficient to fulfill this spec */ boolean fulfilledBy(int count); - /** Returns the amount the given count is above the minimum amount needed to fulfill this request */ - int surplusGiven(int count); + /** Returns the ideal number of nodes that should be retired to fulfill this spec */ + int idealRetiredCount(int acceptedCount, int currentRetiredCount); /** Returns a specification of a fraction of all the nodes of this. It is assumed the argument is a valid divisor. */ NodeSpec fraction(int divisor); @@ -97,7 +97,7 @@ public interface NodeSpec { public boolean saturatedBy(int count) { return fulfilledBy(count); } // min=max for count specs @Override - public int surplusGiven(int count) { return count - this.count; } + public int idealRetiredCount(int acceptedCount, int currentRetiredCount) { return acceptedCount - this.count; } @Override public NodeSpec fraction(int divisor) { return new CountNodeSpec(count/divisor, requestedFlavor); } @@ -152,7 +152,14 @@ public interface NodeSpec { public boolean saturatedBy(int count) { return false; } @Override - public int surplusGiven(int count) { return 0; } + public int idealRetiredCount(int acceptedCount, int currentRetiredCount) { + /* + * All nodes marked with wantToRetire get marked as retired just before this function is called, + * the job of this function is to throttle the retired count. If no nodes are marked as retired + * then continue this way, otherwise allow only 1 node to be retired + */ + return Math.min(1, currentRetiredCount); + } @Override public NodeSpec fraction(int divisor) { return this; } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java index 873193ac3b8..97fde2274f2 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/NodeTypeProvisioningTest.java @@ -11,15 +11,22 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.RegionName; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.maintenance.JobControl; +import com.yahoo.vespa.hosted.provision.maintenance.RetiredExpirer; import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.testutils.MockDeployer; +import org.junit.Before; import org.junit.Test; +import java.time.Duration; +import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Optional; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; /** * Tests provisioning by node type instead of by count and flavor @@ -28,28 +35,31 @@ import static org.junit.Assert.assertFalse; */ public class NodeTypeProvisioningTest { - @Test - public void proxy_deployment() { - ProvisioningTester tester = new ProvisioningTester(new Zone(Environment.prod, RegionName.from("us-east"))); + private final ProvisioningTester tester = new ProvisioningTester(new Zone(Environment.prod, RegionName.from("us-east"))); + + private final ApplicationId application = tester.makeApplicationId(); // application using proxy nodes + private final Capacity capacity = Capacity.fromRequiredNodeType(NodeType.proxy); + private final ClusterSpec clusterSpec = ClusterSpec.request( + ClusterSpec.Type.container, ClusterSpec.Id.from("test"), Version.fromString("6.42")); + @Before + public void setup() { tester.makeReadyNodes( 1, "small", NodeType.proxy); tester.makeReadyNodes( 3, "small", NodeType.host); tester.makeReadyNodes( 5, "small", NodeType.tenant); tester.makeReadyNodes(10, "large", NodeType.proxy); tester.makeReadyNodes(20, "large", NodeType.host); tester.makeReadyNodes(40, "large", NodeType.tenant); + } - ApplicationId application = tester.makeApplicationId(); // application using proxy nodes - - + @Test + public void proxy_deployment() { { // Deploy List<HostSpec> hosts = deployProxies(application, tester); assertEquals("Reserved all proxies", 11, hosts.size()); tester.activate(application, new HashSet<>(hosts)); List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); assertEquals("Activated all proxies", 11, nodes.size()); - for (Node node : nodes) - assertEquals(NodeType.proxy, node.type()); } { // Redeploy with no changes @@ -83,14 +93,178 @@ public class NodeTypeProvisioningTest { } } + @Test + public void retire_proxy() { + MockDeployer deployer = new MockDeployer( + tester.provisioner(), + Collections.singletonMap( + application, new MockDeployer.ApplicationContext(application, clusterSpec, capacity, 1))); + RetiredExpirer retiredExpirer = new RetiredExpirer(tester.nodeRepository(), tester.orchestrator(), deployer, + tester.clock(), Duration.ofDays(30), Duration.ofMinutes(10), new JobControl(tester.nodeRepository().database())); + + { // Deploy + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals("Reserved all proxies", 11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals("Activated all proxies", 11, nodes.size()); + } + + Node nodeToRetire = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active).get(5); + { // Pick out a node and retire it + tester.nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToRetire(true))); + + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(11, nodes.size()); + + // Verify that wantToRetire has been propagated + assertTrue(tester.nodeRepository().getNode(nodeToRetire.hostname()) + .flatMap(Node::allocation) + .map(allocation -> allocation.membership().retired()) + .orElseThrow(RuntimeException::new)); + } + + { // Redeploying while the node is still retiring has no effect + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(11, nodes.size()); + + // Verify that the node is still marked as retired + assertTrue(tester.nodeRepository().getNode(nodeToRetire.hostname()) + .flatMap(Node::allocation) + .map(allocation -> allocation.membership().retired()) + .orElseThrow(RuntimeException::new)); + } + + { + tester.advanceTime(Duration.ofMinutes(11)); + retiredExpirer.run(); + + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(10, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(10, nodes.size()); + + // Verify that the node is now inactive + assertEquals(Node.State.inactive, tester.nodeRepository().getNode(nodeToRetire.hostname()) + .orElseThrow(RuntimeException::new).state()); + } + } + + @Test + public void retire_multiple_proxy_simultaneously() { + MockDeployer deployer = new MockDeployer( + tester.provisioner(), + Collections.singletonMap( + application, new MockDeployer.ApplicationContext(application, clusterSpec, capacity, 1))); + RetiredExpirer retiredExpirer = new RetiredExpirer(tester.nodeRepository(), tester.orchestrator(), deployer, + tester.clock(), Duration.ofDays(30), Duration.ofMinutes(10), new JobControl(tester.nodeRepository().database())); + final int numNodesToRetire = 5; + + { // Deploy + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals("Reserved all proxies", 11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals("Activated all proxies", 11, nodes.size()); + } + + List<Node> nodesToRetire = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active) + .subList(3, 3 + numNodesToRetire); + String currentyRetiringHostname; + { + nodesToRetire.forEach(nodeToRetire -> + tester.nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToRetire(true)))); + + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(11, nodes.size()); + + // Verify that wantToRetire has been propagated + List<Node> nodesCurrentlyRetiring = nodes.stream() + .filter(node -> node.allocation().get().membership().retired()) + .collect(Collectors.toList()); + assertEquals(1, nodesCurrentlyRetiring.size()); + + // The retiring node should be one of the nodes we marked for retirement + currentyRetiringHostname = nodesCurrentlyRetiring.get(0).hostname(); + assertTrue(nodesToRetire.stream().map(Node::hostname).filter(hostname -> hostname.equals(currentyRetiringHostname)).count() == 1); + } + + { // Redeploying while the node is still retiring has no effect + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(11, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(11, nodes.size()); + + // Verify that wantToRetire has been propagated + List<Node> nodesCurrentlyRetiring = nodes.stream() + .filter(node -> node.allocation().get().membership().retired()) + .collect(Collectors.toList()); + assertEquals(1, nodesCurrentlyRetiring.size()); + + // The node that started retiring is still the only one retiring + assertEquals(currentyRetiringHostname, nodesCurrentlyRetiring.get(0).hostname()); + } + + { + tester.advanceTime(Duration.ofMinutes(11)); + retiredExpirer.run(); + + List<HostSpec> hosts = deployProxies(application, tester); + assertEquals(10, hosts.size()); + tester.activate(application, new HashSet<>(hosts)); + List<Node> nodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active); + assertEquals(10, nodes.size()); + + // Verify the node we previously set to retire has finished retiring + assertEquals(Node.State.inactive, tester.nodeRepository().getNode(currentyRetiringHostname) + .orElseThrow(RuntimeException::new).state()); + + // Verify that a node is currently retiring + List<Node> nodesCurrentlyRetiring = nodes.stream() + .filter(node -> node.allocation().get().membership().retired()) + .collect(Collectors.toList()); + assertEquals(1, nodesCurrentlyRetiring.size()); + + // This node is different from the one that was retiring previously + String newRetiringHostname = nodesCurrentlyRetiring.get(0).hostname(); + assertNotEquals(currentyRetiringHostname, newRetiringHostname); + // ... but is one of the nodes that were put to wantToRetire earlier + assertTrue(nodesToRetire.stream().map(Node::hostname).filter(hostname -> hostname.equals(newRetiringHostname)).count() == 1); + } + + + for (int i = 0; i < 10; i++){ + tester.advanceTime(Duration.ofMinutes(11)); + retiredExpirer.run(); + List<HostSpec> hosts = deployProxies(application, tester); + tester.activate(application, new HashSet<>(hosts)); + } + + // After a long time, all currently active proxy nodes are not marked with wantToRetire or as retired + long numRetiredActiveProxyNodes = tester.nodeRepository().getNodes(NodeType.proxy, Node.State.active).stream() + .filter(node -> !node.status().wantToRetire()) + .filter(node -> !node.allocation().get().membership().retired()) + .count(); + assertEquals(11 - numNodesToRetire, numRetiredActiveProxyNodes); + + // All the nodes that were marked with wantToRetire earlier are now inactive + assertEquals(nodesToRetire.stream().map(Node::hostname).collect(Collectors.toSet()), + tester.nodeRepository().getNodes(Node.State.inactive).stream().map(Node::hostname).collect(Collectors.toSet())); + } + private List<HostSpec> deployProxies(ApplicationId application, ProvisioningTester tester) { - return tester.prepare(application, - ClusterSpec.request(ClusterSpec.Type.container, - ClusterSpec.Id.from("test"), - Version.fromString("6.42")), - Capacity.fromRequiredNodeType(NodeType.proxy), - 1); - + return tester.prepare(application, clusterSpec, capacity, 1); } } |