summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorValerij Fredriksen <valerijf@yahooinc.com>2022-02-03 22:32:17 +0100
committerValerij Fredriksen <valerijf@yahooinc.com>2022-02-03 22:47:57 +0100
commitf2f9542b2c39a5d4333d2e78de1e0a898f1b5f0a (patch)
treec398a6b551c12c53139a2f29d7bfcdb7d810df6d /node-repository
parentaa6c641255ee3a9742dc50f5076d321145533267 (diff)
Do not allocate nodes to suspended hosts
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java17
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java19
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java30
4 files changed, 51 insertions, 17 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 89db8a3d94d..a387bc28aa4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -118,7 +118,7 @@ public class NodeRepository extends AbstractComponent {
this.db = new CuratorDatabaseClient(flavors, curator, clock, useCuratorClientCache, nodeCacheSize);
this.zone = zone;
this.clock = clock;
- this.nodes = new Nodes(db, zone, clock);
+ this.nodes = new Nodes(db, zone, clock, orchestrator);
this.flavors = flavors;
this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator();
this.nameResolver = nameResolver;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index e914c3b16e6..a1916d7dc20 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
-import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
@@ -15,7 +14,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
-import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.yolean.Exceptions;
@@ -156,7 +154,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
for (Node node : activeNodes) {
- Instant graceTimeStart = clock().instant().minus(suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
+ Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
// Allow a grace period after node re-activation
if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
@@ -205,23 +203,14 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
}
- private boolean suspended(Node node) {
- try {
- return nodeRepository().orchestrator().getNodeStatus(new HostName(node.hostname())).isSuspended();
- } catch (HostNameNotFoundException e) {
- // Treat it as not suspended
- return false;
- }
- }
-
/** Is the node and all active children suspended? */
private boolean allSuspended(Node node, NodeList activeNodes) {
- if (!suspended(node)) return false;
+ if (!nodeRepository().nodes().suspended(node)) return false;
if (node.parentHostname().isPresent()) return true; // optimization
return activeNodes.stream()
.filter(childNode -> childNode.parentHostname().isPresent() &&
childNode.parentHostname().get().equals(node.hostname()))
- .allMatch(this::suspended);
+ .allMatch(nodeRepository().nodes()::suspended);
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 39857034b09..57a3b436e37 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -10,6 +10,7 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.Zone;
import com.yahoo.transaction.Mutex;
import com.yahoo.transaction.NestedTransaction;
+import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.NoSuchNodeException;
import com.yahoo.vespa.hosted.provision.Node;
@@ -18,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer;
import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter;
import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
+import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
+import com.yahoo.vespa.orchestrator.Orchestrator;
import java.time.Clock;
import java.time.Duration;
@@ -53,14 +56,16 @@ public class Nodes {
private static final Logger log = Logger.getLogger(Nodes.class.getName());
+ private final CuratorDatabaseClient db;
private final Zone zone;
private final Clock clock;
- private final CuratorDatabaseClient db;
+ private final Orchestrator orchestrator;
- public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock) {
+ public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator) {
this.zone = zone;
this.clock = clock;
this.db = db;
+ this.orchestrator = orchestrator;
}
/** Read and write all nodes to make sure they are stored in the latest version of the serialized format */
@@ -732,6 +737,7 @@ public class Nodes {
if ( ! host.type().canRun(NodeType.tenant)) return false;
if (host.status().wantToRetire()) return false;
if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
+ if (suspended(host)) return false;
if (dynamicProvisioning)
return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state());
@@ -739,6 +745,15 @@ public class Nodes {
return host.state() == Node.State.active;
}
+ public boolean suspended(Node node) {
+ try {
+ return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended();
+ } catch (HostNameNotFoundException e) {
+ // Treat it as not suspended
+ return false;
+ }
+ }
+
/** Create a lock which provides exclusive rights to making changes to the given application */
// TODO: Move to Applications
public Mutex lock(ApplicationId application) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
index df9e8efde93..583ccdda656 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicAllocationTest.java
@@ -17,6 +17,7 @@ import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
import com.yahoo.config.provisioning.FlavorsConfig;
import com.yahoo.transaction.NestedTransaction;
+import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.curator.transaction.CuratorTransaction;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.Node.State;
@@ -267,6 +268,35 @@ public class DynamicAllocationTest {
}
@Test
+ public void does_not_allocate_to_suspended_hosts() {
+ ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.prod, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build();
+ tester.makeReadyNodes(4, "host-small", NodeType.host, 32);
+ tester.activateTenantHosts();
+
+ HostName randomHost = new HostName(tester.nodeRepository().nodes().list(State.active).first().get().hostname());
+ tester.orchestrator().suspend(randomHost);
+
+ ApplicationId application1 = ProvisioningTester.applicationId();
+ ClusterSpec clusterSpec = clusterSpec("myContent.t1.a1");
+ NodeResources flavor = new NodeResources(1, 4, 100, 1);
+
+ try {
+ tester.prepare(application1, clusterSpec, 4, 1, flavor);
+ fail("Should not be able to deploy 4 nodes on 4 hosts because 1 is suspended");
+ } catch (OutOfCapacityException ignored) { }
+
+ // Resume the host, the deployment goes through
+ tester.orchestrator().resume(randomHost);
+ tester.activate(application1, tester.prepare(application1, clusterSpec, 4, 1, flavor));
+ Set<String> hostnames = tester.getNodes(application1, State.active).hostnames();
+
+ // Verify that previously allocated nodes are not affected by host suspension
+ tester.orchestrator().suspend(randomHost);
+ tester.activate(application1, tester.prepare(application1, clusterSpec, 4, 1, flavor));
+ assertEquals(hostnames, tester.getNodes(application1, State.active).hostnames());
+ }
+
+ @Test
public void non_prod_zones_do_not_have_spares() {
ProvisioningTester tester = new ProvisioningTester.Builder().zone(new Zone(Environment.perf, RegionName.from("us-east"))).flavorsConfig(flavorsConfig()).build();
tester.makeReadyNodes(3, "host-small", NodeType.host, 32);