diff options
Diffstat (limited to 'node-repository')
2 files changed, 107 insertions, 16 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 22d1e0333fb..392bcc5aa08 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -15,13 +15,16 @@ import com.yahoo.transaction.Mutex; import com.yahoo.vespa.flags.BooleanFlag; import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; +import com.yahoo.vespa.flags.JacksonFlag; import com.yahoo.vespa.flags.ListFlag; import com.yahoo.vespa.flags.custom.ClusterCapacity; +import com.yahoo.vespa.flags.custom.SharedHost; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.node.IP; import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; @@ -35,6 +38,7 @@ import com.yahoo.yolean.Exceptions; import javax.naming.NameNotFoundException; import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; @@ -60,6 +64,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { private final HostProvisioner hostProvisioner; private final ListFlag<ClusterCapacity> preprovisionCapacityFlag; private final BooleanFlag compactPreprovisionCapacityFlag; + private final JacksonFlag<SharedHost> sharedHostFlag; DynamicProvisioningMaintainer(NodeRepository nodeRepository, Duration interval, @@ -70,6 +75,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { this.hostProvisioner = hostProvisioner; this.preprovisionCapacityFlag = Flags.PREPROVISION_CAPACITY.bindTo(flagSource); this.compactPreprovisionCapacityFlag = Flags.COMPACT_PREPROVISION_CAPACITY.bindTo(flagSource); + this.sharedHostFlag = Flags.SHARED_HOST.bindTo(flagSource); } @Override @@ -204,8 +210,36 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } private List<Node> findExcessHosts(NodeList nodeList) { - final List<Node> nodes = provisionUntilNoDeficit(nodeList); + final List<Node> nodes = new ArrayList<>(provisionUntilNoDeficit(nodeList)); + + Map<String, Node> sharedHosts = new HashMap<>(findSharedHosts(nodeList)); + + int minCount = sharedHostFlag.value().getMinCount(); + int deficit = minCount - sharedHosts.size(); + if (deficit > 0) { + provisionHosts(deficit, NodeResources.unspecified()) + .forEach(host -> { + sharedHosts.put(host.hostname(), host); + nodes.add(host); + }); + } + + return candidatesForRemoval(nodes).stream() + .sorted(Comparator.comparing(node -> node.history().events().stream() + .map(History.Event::at).min(Comparator.naturalOrder()).orElseGet(() -> Instant.MIN))) + .filter(node -> { + if (!sharedHosts.containsKey(node.hostname()) || sharedHosts.size() > minCount) { + sharedHosts.remove(node.hostname()); + return true; + } else { + return false; + } + }) + .collect(Collectors.toList()); + } + + private List<Node> candidatesForRemoval(List<Node> nodes) { Map<String, Node> hostsByHostname = new HashMap<>(nodes.stream() .filter(node -> node.type() == NodeType.host) .filter(host -> host.state() != Node.State.parked || host.status().wantToDeprovision()) @@ -220,6 +254,14 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { return List.copyOf(hostsByHostname.values()); } + private Map<String, Node> findSharedHosts(NodeList nodeList) { + return nodeList.stream() + .filter(node -> NodeRepository.canAllocateTenantNodeTo(node, true)) + .filter(node -> node.reservedTo().isEmpty()) + .filter(node -> node.exclusiveTo().isEmpty()) + .collect(Collectors.toMap(Node::hostname, Function.identity())); + } + /** * @return the nodes in {@code nodeList} plus all hosts provisioned, plus all preprovision capacity * nodes that were allocated. @@ -246,21 +288,25 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { throw new IllegalStateException("Have provisioned " + numProvisions + " times but there's still deficit: aborting"); } - try { - Version osVersion = nodeRepository().osVersions().targetFor(NodeType.host).orElse(Version.emptyVersion); - List<Integer> provisionIndexes = nodeRepository().database().getProvisionIndexes(deficit.get().count()); - List<Node> hosts = hostProvisioner.provisionHosts(provisionIndexes, toNodeResources(deficit.get()), - ApplicationId.defaultId(), osVersion, HostSharing.shared) - .stream() - .map(ProvisionedHost::generateHost) - .collect(Collectors.toList()); - nodeRepository().addNodes(hosts, Agent.DynamicProvisioningMaintainer); - nodesPlusProvisioned.addAll(hosts); - } catch (OutOfCapacityException | IllegalArgumentException | IllegalStateException e) { - throw new OutOfCapacityException("Failed to pre-provision " + deficit.get() + ": " + e.getMessage()); - } catch (RuntimeException e) { - throw new RuntimeException("Failed to pre-provision " + deficit.get() + ", will retry in " + interval(), e); - } + nodesPlusProvisioned.addAll(provisionHosts(deficit.get().count(), toNodeResources(deficit.get()))); + } + } + + private List<Node> provisionHosts(int count, NodeResources nodeResources) { + try { + Version osVersion = nodeRepository().osVersions().targetFor(NodeType.host).orElse(Version.emptyVersion); + List<Integer> provisionIndexes = nodeRepository().database().getProvisionIndexes(count); + List<Node> hosts = hostProvisioner.provisionHosts(provisionIndexes, nodeResources, + ApplicationId.defaultId(), osVersion, HostSharing.shared) + .stream() + .map(ProvisionedHost::generateHost) + .collect(Collectors.toList()); + nodeRepository().addNodes(hosts, Agent.DynamicProvisioningMaintainer); + return hosts; + } catch (OutOfCapacityException | IllegalArgumentException | IllegalStateException e) { + throw new OutOfCapacityException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage()); + } catch (RuntimeException e) { + throw new RuntimeException("Failed to provision " + count + " " + nodeResources + ", will retry in " + interval(), e); } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index 292a6872bb0..c747d95d41f 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -18,6 +18,7 @@ import com.yahoo.config.provision.Zone; import com.yahoo.vespa.flags.Flags; import com.yahoo.vespa.flags.InMemoryFlagSource; import com.yahoo.vespa.flags.custom.ClusterCapacity; +import com.yahoo.vespa.flags.custom.SharedHost; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Address; @@ -267,6 +268,50 @@ public class DynamicProvisioningMaintainerTest { } @Test + public void verify_min_count_of_shared_hosts() { + // What's going on here? We are trying to verify the impact of varying the minimum number of + // shared hosts (SharedHost.minCount()). + // + // addInitialNodes() adds 4 tenant hosts: + // host1 shared !removable # not removable because it got child nodes w/allocation + // host2 !shared removable # not counted as a shared host because it is failed + // host3 shared removable + // host4 shared !removable # not removable because it got child nodes w/allocation + // + // Hosts 1, 3, and 4 count as "shared hosts" with respect to the minCount lower boundary. + // Hosts 3 and 4 are removable, that is they will be deprovisioned as excess hosts unless + // prevented by minCount. + + // minCount=0: All (2) removable hosts are deprovisioned + assertWithMinCount(0, 0, 2); + // minCount=1: The same thing happens, because there are 2 shared hosts left + assertWithMinCount(1, 0, 2); + assertWithMinCount(2, 0, 2); + // minCount=3: since we require 3 shared hosts, host3 is not deprovisioned. + assertWithMinCount(3, 0, 1); + // 4 shared hosts require we provision 1 shared host + assertWithMinCount(4, 1, 1); + // 5 shared hosts require we provision 2 shared hosts + assertWithMinCount(5, 2, 1); + assertWithMinCount(6, 3, 1); + } + + private void assertWithMinCount(int minCount, int provisionCount, int deprovisionCount) { + var tester = new DynamicProvisioningTester().addInitialNodes(); + tester.hostProvisioner.provisionSharedHost("host4"); + + tester.flagSource.withJacksonFlag(Flags.SHARED_HOST.id(), new SharedHost(null, minCount), SharedHost.class); + tester.maintainer.maintain(); + assertEquals(provisionCount, tester.hostProvisioner.provisionedHosts.size()); + assertEquals(deprovisionCount, tester.hostProvisioner.deprovisionedHosts); + + // Verify next maintain is a no-op + tester.maintainer.maintain(); + assertEquals(provisionCount, tester.hostProvisioner.provisionedHosts.size()); + assertEquals(deprovisionCount, tester.hostProvisioner.deprovisionedHosts); + } + + @Test public void does_not_remove_if_host_provisioner_failed() { var tester = new DynamicProvisioningTester(); Node host2 = tester.addNode("host2", Optional.empty(), NodeType.host, Node.State.failed, DynamicProvisioningTester.tenantApp); |