diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-02 14:24:49 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-02 15:58:06 +0100 |
commit | 2a348d61213778f11c762de5f3570d8174f9f294 (patch) | |
tree | a4c6927aece141e2d7d5f86596b32b795c59cf97 | |
parent | 6688797036b4239ba58c8774f4c0893ed660bbc9 (diff) |
Move resume provisioning to separate maintainer
6 files changed, 203 insertions, 105 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 6470e4fdb23..245bce1b9e8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -26,17 +26,13 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.NodesAndHosts; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; -import com.yahoo.vespa.hosted.provision.node.IP; -import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; import com.yahoo.vespa.hosted.provision.provisioning.NodeCandidate; import com.yahoo.vespa.hosted.provision.provisioning.NodePrioritizer; import com.yahoo.vespa.hosted.provision.provisioning.NodeSpec; import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; -import com.yahoo.yolean.Exceptions; -import javax.naming.NamingException; import java.time.Duration; import java.time.Instant; import java.util.ArrayList; @@ -45,7 +41,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.function.Function; import java.util.logging.Level; import java.util.logging.Logger; @@ -77,46 +72,10 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { @Override protected double maintain() { NodeList nodes = nodeRepository().nodes().list(); - resumeProvisioning(nodes); convergeToCapacity(nodes); return 1.0; } - /** Resume provisioning of already provisioned hosts and their children */ - private void resumeProvisioning(NodeList nodes) { - Map<String, Set<Node>> nodesByProvisionedParentHostname = - nodes.nodeType(NodeType.tenant, NodeType.config, NodeType.controller) - .asList() - .stream() - .filter(node -> node.parentHostname().isPresent()) - .collect(Collectors.groupingBy(node -> node.parentHostname().get(), Collectors.toSet())); - - nodes.state(Node.State.provisioned).nodeType(NodeType.host, NodeType.confighost, NodeType.controllerhost).forEach(host -> { - Set<Node> children = nodesByProvisionedParentHostname.getOrDefault(host.hostname(), Set.of()); - try { - try (var lock = nodeRepository().nodes().lockUnallocated()) { - List<Node> updatedNodes = hostProvisioner.provision(host, children); - verifyDns(updatedNodes); - nodeRepository().nodes().write(updatedNodes, lock); - } - } catch (IllegalArgumentException | IllegalStateException e) { - log.log(Level.INFO, "Could not provision " + host.hostname() + " with " + children.size() + " children, will retry in " + - interval() + ": " + Exceptions.toMessageString(e)); - } catch (FatalProvisioningException e) { - log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() + - " children, failing out the host recursively", e); - // Fail out as operator to force a quick redeployment - nodeRepository().nodes().failOrMarkRecursively( - host.hostname(), Agent.DynamicProvisioningMaintainer, "Failed by HostProvisioner due to provisioning failure"); - } catch (RuntimeException e) { - if (e.getCause() instanceof NamingException) - log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); - else - log.log(Level.WARNING, "Failed to provision " + host.hostname() + ", will retry in " + interval(), e); - } - }); - } - /** Converge zone to wanted capacity */ private void convergeToCapacity(NodeList nodes) { List<Node> excessHosts; @@ -337,13 +296,4 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { return new NodeResources(clusterCapacity.vcpu(), clusterCapacity.memoryGb(), clusterCapacity.diskGb(), clusterCapacity.bandwidthGbps()); } - - /** Verify DNS configuration of given nodes */ - private void verifyDns(List<Node> nodes) { - for (var node : nodes) { - for (var ipAddress : node.ipConfig().primary()) { - IP.verifyDns(node.hostname(), ipAddress, nodeRepository().nameResolver()); - } - } - } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java new file mode 100644 index 00000000000..5810e0a508f --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisioner.java @@ -0,0 +1,89 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.config.provision.NodeType; +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.IP; +import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; +import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; +import com.yahoo.yolean.Exceptions; + +import javax.naming.NamingException; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.stream.Collectors; + +/** + * @author freva + * @author mpolden + */ +public class HostResumeProvisioner extends NodeRepositoryMaintainer { + + private static final Logger log = Logger.getLogger(HostResumeProvisioner.class.getName()); + + private final HostProvisioner hostProvisioner; + + HostResumeProvisioner(NodeRepository nodeRepository, Duration interval, Metric metric, HostProvisioner hostProvisioner) { + super(nodeRepository, interval, metric); + this.hostProvisioner = hostProvisioner; + } + + @Override + protected double maintain() { + NodeList allNodes = nodeRepository().nodes().list(); + Map<String, Set<Node>> nodesByProvisionedParentHostname = + allNodes.nodeType(NodeType.tenant, NodeType.config, NodeType.controller) + .asList() + .stream() + .filter(node -> node.parentHostname().isPresent()) + .collect(Collectors.groupingBy(node -> node.parentHostname().get(), Collectors.toSet())); + + NodeList hosts = allNodes.state(Node.State.provisioned).nodeType(NodeType.host, NodeType.confighost, NodeType.controllerhost); + int failures = 0; + for (Node host : hosts) { + Set<Node> children = nodesByProvisionedParentHostname.getOrDefault(host.hostname(), Set.of()); + try { + try (var lock = nodeRepository().nodes().lockUnallocated()) { + List<Node> updatedNodes = hostProvisioner.provision(host, children); + verifyDns(updatedNodes); + nodeRepository().nodes().write(updatedNodes, lock); + } + } catch (IllegalArgumentException | IllegalStateException e) { + log.log(Level.INFO, "Could not provision " + host.hostname() + " with " + children.size() + " children, will retry in " + + interval() + ": " + Exceptions.toMessageString(e)); + } catch (FatalProvisioningException e) { + failures++; + log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() + + " children, failing out the host recursively", e); + // Fail out as operator to force a quick redeployment + nodeRepository().nodes().failOrMarkRecursively( + host.hostname(), Agent.DynamicProvisioningMaintainer, "Failed by HostProvisioner due to provisioning failure"); + } catch (RuntimeException e) { + if (e.getCause() instanceof NamingException) + log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); + else { + failures++; + log.log(Level.WARNING, "Failed to provision " + host.hostname() + ", will retry in " + interval(), e); + } + } + } + return asSuccessFactor(hosts.size(), failures); + } + + /** Verify DNS configuration of given nodes */ + private void verifyDns(List<Node> nodes) { + for (var node : nodes) { + for (var ipAddress : node.ipConfig().primary()) { + IP.verifyDns(node.hostname(), ipAddress, nodeRepository().nameResolver()); + } + } + } +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 6175531fc65..acc5dd66f7a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -71,6 +71,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { provisionServiceProvider.getHostProvisioner() .map(hostProvisioner -> List.of( new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric), + new HostResumeProvisioner(nodeRepository, defaults.hostResumeProvisionerInterval, metric, hostProvisioner), new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner), new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner))) .ifPresent(maintainers::addAll); @@ -112,6 +113,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration infrastructureProvisionInterval; private final Duration loadBalancerExpirerInterval; private final Duration dynamicProvisionerInterval; + private final Duration hostResumeProvisionerInterval; private final Duration diskReplacerInterval; private final Duration osUpgradeActivatorInterval; private final Duration rebalancerInterval; @@ -126,6 +128,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { DefaultTimes(Zone zone, Deployer deployer) { autoscalingInterval = Duration.ofMinutes(5); dynamicProvisionerInterval = Duration.ofMinutes(3); + hostResumeProvisionerInterval = Duration.ofMinutes(3); diskReplacerInterval = Duration.ofMinutes(3); failedExpirerInterval = Duration.ofMinutes(10); failGrace = Duration.ofMinutes(20); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index 3aa841ecacf..76e16e9fbaf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -70,7 +70,7 @@ public class MockHostProvisioner implements HostProvisioner { .orElseThrow(() -> new NodeAllocationException("No host flavor matches " + resources, true))); List<ProvisionedHost> hosts = new ArrayList<>(); for (int index : provisionIndices) { - String hostHostname = hostType == NodeType.host ? "hostname" + index : hostType.name() + index; + String hostHostname = hostType == NodeType.host ? "host" + index : hostType.name() + index; hosts.add(new ProvisionedHost("id-of-" + hostType.name() + index, hostHostname, hostFlavor, @@ -173,11 +173,11 @@ public class MockHostProvisioner implements HostProvisioner { } private List<Address> createAddressesForHost(NodeType hostType, Flavor flavor, int hostIndex) { - long numAddresses = Math.max(1, Math.round(flavor.resources().bandwidthGbps())); - return IntStream.range(0, (int) numAddresses) + long numAddresses = Math.max(2, Math.round(flavor.resources().bandwidthGbps())); + return IntStream.range(1, (int) numAddresses) .mapToObj(i -> { String hostname = hostType == NodeType.host - ? "nodename" + hostIndex + "_" + i + ? "host" + hostIndex + "-" + i : hostType.childNodeType().name() + i; return new Address(hostname); }) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index f9c7d7cd88d..e882de3aa62 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -66,39 +66,6 @@ import static org.junit.Assert.fail; public class DynamicProvisioningMaintainerTest { @Test - public void delegates_to_host_provisioner_and_writes_back_result() { - var tester = new DynamicProvisioningTester().addInitialNodes(); - tester.hostProvisioner.with(Behaviour.failDeprovisioning); // To avoid deleting excess nodes - - Node host3 = tester.nodeRepository.nodes().node("host3").orElseThrow(); - Node host4 = tester.nodeRepository.nodes().node("host4").orElseThrow(); - Node host41 = tester.nodeRepository.nodes().node("host4-1").orElseThrow(); - assertTrue("No IP addresses assigned", - Stream.of(host3, host4, host41).map(node -> node.ipConfig().primary()).allMatch(Set::isEmpty)); - - Node host3new = host3.with(host3.ipConfig().withPrimary(Set.of("::3:0"))); - Node host4new = host4.with(host4.ipConfig().withPrimary(Set.of("::4:0"))); - Node host41new = host41.with(host41.ipConfig().withPrimary(Set.of("::4:1", "::4:2"))); - - tester.maintainer.maintain(); - assertEquals(host3new, tester.nodeRepository.nodes().node("host3").get()); - assertEquals(host4new, tester.nodeRepository.nodes().node("host4").get()); - assertEquals(host41new, tester.nodeRepository.nodes().node("host4-1").get()); - } - - @Test - public void correctly_fails_if_irrecoverable_failure() { - var tester = new DynamicProvisioningTester(); - tester.hostProvisioner.with(Behaviour.failProvisioning); - Node host4 = tester.addNode("host4", Optional.empty(), NodeType.host, Node.State.provisioned); - Node host41 = tester.addNode("host4-1", Optional.of("host4"), NodeType.tenant, Node.State.reserved, DynamicProvisioningTester.tenantApp); - assertTrue("No IP addresses assigned", Stream.of(host4, host41).map(node -> node.ipConfig().primary()).allMatch(Set::isEmpty)); - - tester.maintainer.maintain(); - assertEquals(Set.of("host4", "host4-1"), tester.nodeRepository.nodes().list(Node.State.failed).hostnames()); - } - - @Test public void finds_nodes_that_need_deprovisioning_without_pre_provisioning() { var tester = new DynamicProvisioningTester().addInitialNodes(); assertTrue(tester.nodeRepository.nodes().node("host2").isPresent()); @@ -384,24 +351,6 @@ public class DynamicProvisioningMaintainerTest { } @Test - public void defer_writing_ip_addresses_until_dns_resolves() { - var tester = new DynamicProvisioningTester().addInitialNodes(); - tester.hostProvisioner.with(Behaviour.failDnsUpdate); - - Supplier<NodeList> provisioning = () -> tester.nodeRepository.nodes().list(Node.State.provisioned).nodeType(NodeType.host); - assertEquals(2, provisioning.get().size()); - tester.maintainer.maintain(); - - assertTrue("No IP addresses written as DNS updates are failing", - provisioning.get().stream().allMatch(host -> host.ipConfig().pool().ipSet().isEmpty())); - - tester.hostProvisioner.without(Behaviour.failDnsUpdate); - tester.maintainer.maintain(); - assertTrue("IP addresses written as DNS updates are succeeding", - provisioning.get().stream().noneMatch(host -> host.ipConfig().pool().ipSet().isEmpty())); - } - - @Test public void deprovision_empty_confighost() { // cfghost1, cfg1, cfghost2, cfg2, cfghost3, and NOT cfg3. var tester = new DynamicProvisioningTester(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java new file mode 100644 index 00000000000..715aa82afb0 --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/HostResumeProvisionerTest.java @@ -0,0 +1,107 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.component.Version; +import com.yahoo.config.provision.Capacity; +import com.yahoo.config.provision.Cloud; +import com.yahoo.config.provision.ClusterResources; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.RegionName; +import com.yahoo.config.provision.SystemName; +import com.yahoo.config.provision.Zone; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.node.IP; +import com.yahoo.vespa.hosted.provision.provisioning.FlavorConfigBuilder; +import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester; +import com.yahoo.vespa.hosted.provision.testutils.MockHostProvisioner; +import com.yahoo.vespa.hosted.provision.testutils.MockNameResolver; +import org.junit.Test; + +import java.time.Duration; +import java.util.List; +import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * @author freva + */ +public class HostResumeProvisionerTest { + + private final List<Flavor> flavors = FlavorConfigBuilder.createDummies("default").getFlavors(); + private final MockNameResolver nameResolver = new MockNameResolver(); + private final MockHostProvisioner hostProvisioner = new MockHostProvisioner(flavors, nameResolver, 0); + private final ProvisioningTester tester = new ProvisioningTester.Builder() + .zone(new Zone(Cloud.builder().dynamicProvisioning(true).build(), SystemName.defaultSystem(), Environment.dev, RegionName.defaultName())) + .hostProvisioner(hostProvisioner) + .nameResolver(nameResolver) + .flavors(flavors) + .build(); + private final HostResumeProvisioner hostResumeProvisioner = new HostResumeProvisioner(tester.nodeRepository(), Duration.ofDays(1), new TestMetric(), hostProvisioner); + + @Test + public void delegates_to_host_provisioner_and_writes_back_result() { + deployApplication(); + + Node host = tester.nodeRepository().nodes().node("host100").orElseThrow(); + Node node = tester.nodeRepository().nodes().node("host100-1").orElseThrow(); + assertTrue("No IP addresses assigned", + Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(Set::isEmpty)); + + Node hostNew = host.with(host.ipConfig().withPrimary(Set.of("::100:0")).withPool(host.ipConfig().pool().withIpAddresses(Set.of("::100:1", "::100:2")))); + Node nodeNew = node.with(IP.Config.ofEmptyPool(Set.of("::100:1"))); + + hostResumeProvisioner.maintain(); + assertEquals(hostNew.ipConfig(), tester.nodeRepository().nodes().node("host100").get().ipConfig()); + assertEquals(nodeNew.ipConfig(), tester.nodeRepository().nodes().node("host100-1").get().ipConfig()); + } + + @Test + public void defer_writing_ip_addresses_until_dns_resolves() { + deployApplication(); + hostProvisioner.with(MockHostProvisioner.Behaviour.failDnsUpdate); + + Supplier<NodeList> provisioning = () -> tester.nodeRepository().nodes().list(Node.State.provisioned).nodeType(NodeType.host); + assertEquals(1, provisioning.get().size()); + provisioning.get().forEach(h -> System.out.println(h.hostname() + " " + h.ipConfig())); + hostResumeProvisioner.maintain(); + + assertTrue("No IP addresses written as DNS updates are failing", + provisioning.get().stream().allMatch(host -> host.ipConfig().pool().ipSet().isEmpty())); + + hostProvisioner.without(MockHostProvisioner.Behaviour.failDnsUpdate); + hostResumeProvisioner.maintain(); + provisioning.get().forEach(h -> System.out.println(h.hostname() + " " + h.ipConfig())); + assertTrue("IP addresses written as DNS updates are succeeding", + provisioning.get().stream().noneMatch(host -> host.ipConfig().pool().ipSet().isEmpty())); + } + + @Test + public void correctly_fails_if_irrecoverable_failure() { + deployApplication(); + hostProvisioner.with(MockHostProvisioner.Behaviour.failProvisioning); + + Node host = tester.nodeRepository().nodes().node("host100").orElseThrow(); + Node node = tester.nodeRepository().nodes().node("host100-1").orElseThrow(); + assertTrue("No IP addresses assigned", + Stream.of(host, node).map(n -> n.ipConfig().primary()).allMatch(Set::isEmpty)); + + hostResumeProvisioner.maintain(); + assertEquals(Set.of("host100", "host100-1"), tester.nodeRepository().nodes().list(Node.State.failed).hostnames()); + } + + private void deployApplication() { + ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("cluster1")).vespaVersion(Version.fromString("7")).build(); + Capacity capacity = Capacity.from(new ClusterResources(1, 1, new NodeResources(1, 30, 20, 3))); + tester.prepare(ProvisioningTester.applicationId(), cluster, capacity); + assertEquals(2, tester.nodeRepository().nodes().list().size()); + } +} |