diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-02-03 22:32:17 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-02-03 22:47:57 +0100 |
commit | f2f9542b2c39a5d4333d2e78de1e0a898f1b5f0a (patch) | |
tree | c398a6b551c12c53139a2f29d7bfcdb7d810df6d /node-repository | |
parent | aa6c641255ee3a9742dc50f5076d321145533267 (diff) |
Do not allocate nodes to suspended hosts
Diffstat (limited to 'node-repository')
4 files changed, 51 insertions, 17 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 89db8a3d94d..a387bc28aa4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -118,7 +118,7 @@ public class NodeRepository extends AbstractComponent { this.db = new CuratorDatabaseClient(flavors, curator, clock, useCuratorClientCache, nodeCacheSize); this.zone = zone; this.clock = clock; - this.nodes = new Nodes(db, zone, clock); + this.nodes = new Nodes(db, zone, clock, orchestrator); this.flavors = flavors; this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator(); this.nameResolver = nameResolver; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index e914c3b16e6..a1916d7dc20 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TransientException; import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; @@ -15,7 +14,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; -import com.yahoo.vespa.orchestrator.HostNameNotFoundException; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.yolean.Exceptions; @@ -156,7 +154,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); for (Node node : activeNodes) { - Instant graceTimeStart = clock().instant().minus(suspended(node) ? suspendedDownTimeLimit : downTimeLimit); + Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { // Allow a grace period after node re-activation if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) @@ -205,23 +203,14 @@ public class NodeFailer extends NodeRepositoryMaintainer { } } - private boolean suspended(Node node) { - try { - return nodeRepository().orchestrator().getNodeStatus(new HostName(node.hostname())).isSuspended(); - } catch (HostNameNotFoundException e) { - // Treat it as not suspended - return false; - } - } - /** Is the node and all active children suspended? */ private boolean allSuspended(Node node, NodeList activeNodes) { - if (!suspended(node)) return false; + if (!nodeRepository().nodes().suspended(node)) return false; if (node.parentHostname().isPresent()) return true; // optimization return activeNodes.stream() .filter(childNode -> childNode.parentHostname().isPresent() && childNode.parentHostname().get().equals(node.hostname())) - .allMatch(this::suspended); + .allMatch(nodeRepository().nodes()::suspended); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 39857034b09..57a3b436e37 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -10,6 +10,7 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; import com.yahoo.transaction.Mutex; import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.NoSuchNodeException; import com.yahoo.vespa.hosted.provision.Node; @@ -18,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer; import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; +import com.yahoo.vespa.orchestrator.HostNameNotFoundException; +import com.yahoo.vespa.orchestrator.Orchestrator; import java.time.Clock; import java.time.Duration; @@ -53,14 +56,16 @@ public class Nodes { private static final Logger log = Logger.getLogger(Nodes.class.getName()); + private final CuratorDatabaseClient db; private final Zone zone; private final Clock clock; - private final CuratorDatabaseClient db; + private final Orchestrator orchestrator; - public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock) { + public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator) { this.zone = zone; this.clock = clock; this.db = db; + this.orchestrator = orchestrator; } /** Read and write all nodes to make sure they are stored in the latest version of the serialized format */ @@ -732,6 +737,7 @@ public class Nodes { if ( ! host.type().canRun(NodeType.tenant)) return false; if (host.status().wantToRetire()) return false; if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false; + if (suspended(host)) return false; if (dynamicProvisioning) return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state()); @@ -739,6 +745,15 @@ public class Nodes { return host.state() == Node.State.active; } + public boolean suspended(Node node) { + try { + return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended(); + } catch (HostNameNotFoundException e) { + // Treat it as not suspended + return false; + } + } + /** Create a lock which provides exclusive rights to making changes to the given application */ // TODO: Move to Applications public Mutex lock(ApplicationId application) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java index df9e8efde93..583ccdda656 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java @@ -17,6 +17,7 @@ import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.config.provisioning.FlavorsConfig; import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.curator.transaction.CuratorTransaction; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.Node.State; @@ -267,6 +268,35 @@ public class DynamicAllocationTest { } @Test + public void does_not_allocate_to_suspended_hosts() { + ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build(); + tester.makeReadyNodes(4, "host-small", NodeType.host, 32); + tester.activateTenantHosts(); + + HostName randomHost = new HostName(tester.nodeRepository().nodes().list(State.active).first().get().hostname()); + tester.orchestrator().suspend(randomHost); + + ApplicationId application1 = ProvisioningTester.applicationId(); + ClusterSpec clusterSpec = clusterSpec("myContent.t1.a1"); + NodeResources flavor = new NodeResources(1, 4, 100, 1); + + try { + tester.prepare(application1, clusterSpec, 4, 1, flavor); + fail("Should not be able to deploy 4 nodes on 4 hosts because 1 is suspended"); + } catch (OutOfCapacityException ignored) { } + + // Resume the host, the deployment goes through + tester.orchestrator().resume(randomHost); + tester.activate(application1, tester.prepare(application1, clusterSpec, 4, 1, flavor)); + Set<String> hostnames = tester.getNodes(application1, State.active).hostnames(); + + // Verify that previously allocated nodes are not affected by host suspension + tester.orchestrator().suspend(randomHost); + tester.activate(application1, tester.prepare(application1, clusterSpec, 4, 1, flavor)); + assertEquals(hostnames, tester.getNodes(application1, State.active).hostnames()); + } + + @Test public void non_prod_zones_do_not_have_spares() { ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build(); tester.makeReadyNodes(3, "host-small", NodeType.host, 32); |