diff options
author | Jon Bratseth <bratseth@oath.com> | 2020-04-26 13:45:26 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-26 13:45:26 +0200 |
commit | 03e96a594ad449ed21441046506c96dd02a859a1 (patch) | |
tree | 68e2526496bf7ff38211755cfdcff1b9bca19d22 | |
parent | e426febfaeb0840dbc148026f394e4cfc1f0c71a (diff) | |
parent | a7ca18c57e6fdfce0d06db7efb41f87a2049206e (diff) |
Merge pull request #13068 from vespa-engine/bratseth/smarter-window-change-handling
Smarter window change handling
6 files changed, 131 insertions, 15 deletions
diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/Capacity.java b/config-provisioning/src/main/java/com/yahoo/config/provision/Capacity.java index 3c7a14b9496..f723575c342 100644 --- a/config-provisioning/src/main/java/com/yahoo/config/provision/Capacity.java +++ b/config-provisioning/src/main/java/com/yahoo/config/provision/Capacity.java @@ -72,7 +72,12 @@ public final class Capacity { /** Create a non-required, failable capacity request */ public static Capacity from(ClusterResources resources) { - return from(resources, false, true); + return from(resources, resources); + } + + /** Create a non-required, failable capacity request */ + public static Capacity from(ClusterResources min, ClusterResources max) { + return from(min, max, false, true); } public static Capacity from(ClusterResources resources, boolean required, boolean canFail) { diff --git a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterSpec.java b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterSpec.java index 97549e851ad..3a230c89732 100644 --- a/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterSpec.java +++ b/config-provisioning/src/main/java/com/yahoo/config/provision/ClusterSpec.java @@ -29,7 +29,7 @@ public final class ClusterSpec { this.type = type; this.id = id; this.groupId = groupId; - this.vespaVersion = vespaVersion; + this.vespaVersion = Objects.requireNonNull(vespaVersion); this.exclusive = exclusive; // TODO(mpolden): Require combinedId to always be present for type combined after April 2020 if (type != Type.combined && combinedId.isPresent()) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java index a3340e4cedb..434860ccb30 100644 --- 
a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java @@ -137,32 +137,62 @@ public class NodeRepositoryProvisioner implements Provisioner { application = application.withClusterLimits(clusterId, requested.minResources(), requested.maxResources()); nodeRepository.applications().put(application, lock); return application.clusters().get(clusterId).targetResources() - .orElseGet(() -> currentResources(applicationId, clusterId, requested) - .orElse(requested.minResources())); + .orElseGet(() -> currentResources(applicationId, clusterId, requested)); } } - /** Returns the current resources of this cluster, if it's already deployed and inside the requested limits */ - private Optional<ClusterResources> currentResources(ApplicationId applicationId, - ClusterSpec.Id clusterId, - Capacity requested) { + /** Returns the current resources of this cluster, or the closest resources within the requested limits */ + private ClusterResources currentResources(ApplicationId applicationId, + ClusterSpec.Id clusterId, + Capacity requested) { List<Node> nodes = NodeList.copyOf(nodeRepository.getNodes(applicationId, Node.State.active)) .cluster(clusterId) .not().retired() .not().removable() .asList(); - if (nodes.isEmpty()) return Optional.empty(); - long groups = nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count(); + if (nodes.isEmpty()) return requested.minResources(); // New deployment: Start at min - // To allow non-numeric settings to be updated without resetting to the min target, we need to use - // the non-numeric settings of the current min limit with the current numeric settings + long groups = nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count(); + // Settings which are not autoscaled should always be taken from the currently requested capacity + // and 
for those min and max are always the same NodeResources nodeResources = nodes.get(0).allocation().get().requestedResources() + .withBandwidthGbps(requested.minResources().nodeResources().bandwidthGbps()) .with(requested.minResources().nodeResources().diskSpeed()) .with(requested.minResources().nodeResources().storageType()); var currentResources = new ClusterResources(nodes.size(), (int)groups, nodeResources); - if ( ! currentResources.isWithin(requested.minResources(), requested.maxResources())) return Optional.empty(); + return ensureWithin(requested.minResources(), requested.maxResources(), currentResources); + } + + /** Make the minimal adjustments needed to the current resources to stay within the limits */ + private ClusterResources ensureWithin(ClusterResources min, ClusterResources max, ClusterResources current) { + int nodes = between(min.nodes(), max.nodes(), current.nodes()); + int groups = between(min.groups(), max.groups(), current.groups()); + if (nodes % groups != 0) { + // That didn't work - try to preserve current group size instead. + // Rounding here is needed because a node may be missing due to node failing. 
+ int currentGroupsSize = Math.round((float)current.nodes() / current.groups()); + nodes = currentGroupsSize * groups; + if (nodes != between(min.nodes(), max.nodes(), nodes)) { + // Give up: Use max + nodes = max.nodes(); + groups = max.groups(); + } + } + double vcpu = between(min.nodeResources().vcpu(), max.nodeResources().vcpu(), current.nodeResources().vcpu()); + double memoryGb = between(min.nodeResources().memoryGb(), max.nodeResources().memoryGb(), current.nodeResources().memoryGb()); + double diskGb = between(min.nodeResources().diskGb(), max.nodeResources().diskGb(), current.nodeResources().diskGb()); + NodeResources nodeResources = current.nodeResources().withVcpu(vcpu) + .withMemoryGb(memoryGb) + .withDiskGb(diskGb); + return new ClusterResources(nodes, groups, nodeResources); + } + + private int between(int min, int max, int n) { + return Math.min(max, Math.max(min, n)); + } - return Optional.of(currentResources); + private double between(double min, double max, double n) { + return Math.min(max, Math.max(min, n)); } private void logIfDownscaled(int targetNodes, int actualNodes, ClusterSpec cluster, ProvisionLogger logger) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/MultigroupProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/MultigroupProvisioningTest.java index 1265961e351..050f3b7e865 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/MultigroupProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/MultigroupProvisioningTest.java @@ -109,7 +109,7 @@ public class MultigroupProvisioningTest { } @Test - public void test_one_node_and_group_to_two_with_flavor_migration() { + public void test_one_node_and_group_to_two_with_resource_change() { ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).build(); 
ApplicationId application1 = tester.makeApplicationId(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index 8fc73d68785..1b7c75a03ac 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -405,6 +405,58 @@ public class ProvisioningTest { tester.activate(application, state.allHosts); } + @Test + public void test_changing_limits() { + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); + tester.makeReadyHosts(30, new NodeResources(20, 40, 100, 4)); + tester.deployZoneApp(); + + ApplicationId app1 = tester.makeApplicationId("app1"); + ClusterSpec cluster1 = ClusterSpec.request(ClusterSpec.Type.content, new ClusterSpec.Id("cluster1")).vespaVersion("7").build(); + + // Initial deployment + tester.activate(app1, cluster1, Capacity.from(resources(4, 2, 2, 10, 20), + resources(8, 4, 4, 20, 40))); + tester.assertNodes("Initial allocation at min", + 4, 2, 2, 10, 20, + app1, cluster1); + + // Move window above current allocation + tester.activate(app1, cluster1, Capacity.from(resources(8, 4, 4, 20, 40), + resources(10, 5, 5, 25, 50))); + tester.assertNodes("New allocation at new min", + 8, 4, 4, 20, 40, + app1, cluster1); + + // Move window below current allocation + tester.activate(app1, cluster1, Capacity.from(resources(4, 2, 2, 10, 20), + resources(6, 3, 3, 15, 25))); + tester.assertNodes("New allocation at new max", + 6, 3, 3, 15, 25, + app1, cluster1); + + // Widening window does not change allocation + tester.activate(app1, cluster1, Capacity.from(resources(2, 1, 1, 5, 15), + resources(8, 4, 4, 20, 30))); + tester.assertNodes("Same allocation", + 6, 3, 3, 15, 25, + 
app1, cluster1); + + // Changing limits in opposite directions cause a mixture of min and max + tester.activate(app1, cluster1, Capacity.from(resources(2, 1, 10, 30, 5), + resources(4, 2, 14, 40, 10))); + tester.assertNodes("A mix of min and max", + 4, 2, 10, 30, 10, + app1, cluster1); + + // Changing group size + tester.activate(app1, cluster1, Capacity.from(resources(6, 3, 8, 25, 5), + resources(9, 3, 12, 35, 15))); + tester.assertNodes("Groups changed", + 6, 3, 10, 30, 10, + app1, cluster1); + } + @Test(expected = IllegalArgumentException.class) public void prod_deployment_requires_redundancy() { ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).build(); @@ -920,4 +972,8 @@ public class ProvisioningTest { .isPresent(); } + private ClusterResources resources(int nodes, int groups, double vcpu, double memory, double disk) { + return new ClusterResources(nodes, groups, new NodeResources(vcpu, memory, disk, 0.1)); + } + } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java index a8df47aab1a..cb4509b3de7 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java @@ -149,6 +149,10 @@ public class ProvisioningTester { return hosts1; } + public Collection<HostSpec> activate(ApplicationId application, ClusterSpec cluster, Capacity capacity) { + return activate(application, prepare(application, cluster, capacity, true)); + } + public Collection<HostSpec> activate(ApplicationId application, Collection<HostSpec> hosts) { NestedTransaction transaction = new NestedTransaction(); transaction.add(new CuratorTransaction(curator)); @@ -197,6 +201,22 @@ public class ProvisioningTester { } } + 
/** Assert on the current *non retired* nodes */ + public void assertNodes(String explanation, int nodes, int groups, double vcpu, double memory, double disk, + ApplicationId app, ClusterSpec cluster) { + List<Node> nodeList = nodeRepository.list().owner(app).cluster(cluster.id()).not().retired().asList(); + assertEquals(explanation + ": Node count", + nodes, + nodeList.size()); + assertEquals(explanation + ": Group count", + groups, + nodeList.stream().map(n -> n.allocation().get().membership().cluster().group().get()).distinct().count()); + for (Node node : nodeList) + assertEquals(explanation + ": Resources", + new NodeResources(vcpu, memory, disk, 0.1), + node.flavor().resources()); + } + public void fail(HostSpec host) { int beforeFailCount = nodeRepository.getNode(host.hostname(), Node.State.active).get().status().failCount(); Node failedNode = nodeRepository.fail(host.hostname(), Agent.system, "Failing to unit test"); @@ -241,6 +261,11 @@ public class ProvisioningTester { return makeReadyNodes(n, flavor, NodeType.tenant); } + /** Call deployZoneApp() after this before deploying applications */ + public List<Node> makeReadyHosts(int n, NodeResources resources) { + return makeReadyNodes(n, resources, NodeType.host, 5); + } + public List<Node> makeReadyNodes(int n, NodeResources resources) { return makeReadyNodes(n, resources, NodeType.tenant, 0); } |