Diffstat (limited to 'node-repository')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 49
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java | 2
-rw-r--r--  node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java | 2
3 files changed, 23 insertions, 30 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 17f25689a76..e918c1a815a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
-import java.util.logging.Level;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
@@ -31,6 +30,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -138,7 +138,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
// We do this here ("lazily") to avoid writing to zk for each config request.
for (Node node : nodeRepository().getNodes(Node.State.ready)) {
Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
- if ( ! lastLocalRequest.isPresent()) continue;
+ if (lastLocalRequest.isEmpty()) continue;
if (! node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
History updatedHistory = node.history()
@@ -189,11 +189,15 @@ public class NodeFailer extends NodeRepositoryMaintainer {
.forEach((hostName, serviceInstances) -> {
Node node = activeNodesByHostname.get(hostName.s());
if (node == null) return;
-
- if (badNode(serviceInstances)) {
- recordAsDown(node);
- } else {
- clearDownRecord(node);
+ try (var lock = nodeRepository().lock(node.allocation().get().owner())) {
+ Optional<Node> currentNode = nodeRepository().getNode(node.hostname(), Node.State.active); // re-get inside lock
+ if (currentNode.isEmpty()) return; // Node disappeared since acquiring lock
+ node = currentNode.get();
+ if (badNode(serviceInstances)) {
+ recordAsDown(node, lock);
+ } else {
+ clearDownRecord(node, lock);
+ }
}
});
}
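A minimal sketch of the pattern this hunk introduces: acquire the application lock first, re-read the node inside the lock so the unlocked scan's stale copy is never written back, and bail out if the node has meanwhile left the active state. The Repository interface below is a simplified stand-in, not the actual node-repository API.

    import java.util.Optional;

    class LockedUpdateSketch {

        /** Simplified stand-in for the node repository used by this maintainer. */
        interface Repository {
            AutoCloseable lock(String application);           // per-application lock
            Optional<String> getActiveNode(String hostname);  // current state, or empty if the node is gone
            void write(String node);                          // persist a mutation
        }

        void updateIfStillActive(Repository repo, String application, String hostname) throws Exception {
            try (AutoCloseable lock = repo.lock(application)) {
                Optional<String> current = repo.getActiveNode(hostname); // re-get inside lock
                if (current.isEmpty()) return;                           // node disappeared since the scan
                repo.write(current.get());                               // mutate only the fresh copy
            }
        }
    }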
@@ -311,27 +315,16 @@ public class NodeFailer extends NodeRepositoryMaintainer {
countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
}
- /**
- * Record a node as down if not already recorded and returns the node in the new state.
- * This assumes the node is found in the node
- * repo and that the node is allocated. If we get here otherwise something is truly odd.
- */
- private Node recordAsDown(Node node) {
- if (node.history().event(History.Event.Type.down).isPresent()) return node; // already down: Don't change down timestamp
-
- try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
- node = nodeRepository().getNode(node.hostname(), Node.State.active).get(); // re-get inside lock
- return nodeRepository().write(node.downAt(clock.instant(), Agent.NodeFailer), lock);
- }
+ /** Record a node as down if not already recorded */
+ private void recordAsDown(Node node, Mutex lock) {
+ if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp
+ nodeRepository().write(node.downAt(clock.instant(), Agent.NodeFailer), lock);
}
- private void clearDownRecord(Node node) {
- if ( ! node.history().event(History.Event.Type.down).isPresent()) return;
-
- try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
- node = nodeRepository().getNode(node.hostname(), Node.State.active).get(); // re-get inside lock
- nodeRepository().write(node.up(), lock);
- }
+ /** Clear down record for node, if any */
+ private void clearDownRecord(Node node, Mutex lock) {
+ if (node.history().event(History.Event.Type.down).isEmpty()) return;
+ nodeRepository().write(node.up(), lock);
}
/**
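The two helpers above now receive the Mutex instead of locking internally, so locking happens exactly once, in the caller. A minimal sketch of that lock-witness idiom, with illustrative types rather than the real Vespa classes:

    class LockWitnessSketch {

        /** Stand-in for com.yahoo.transaction.Mutex, closed via try-with-resources. */
        interface Mutex extends AutoCloseable {
            @Override void close();
        }

        interface Repository {
            Mutex lock(String application);
            void write(String node, Mutex heldLock); // callers must prove they hold the lock
        }

        void markDown(Repository repo, String application, String node) {
            try (Mutex lock = repo.lock(application)) { // acquired once, in the caller
                repo.write(node, lock);                 // helper never takes its own lock
            }
        }
    }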
@@ -344,7 +337,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private boolean failActive(Node node, String reason) {
Optional<Deployment> deployment =
deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30));
- if ( ! deployment.isPresent()) return false; // this will be done at another config server
+ if (deployment.isEmpty()) return false; // this will be done at another config server
try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
// If the active node that we are trying to fail is of type host, we need to successfully fail all
@@ -394,7 +387,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(nodes.size())) return false;
// Always allow failing physical nodes up to minimum limit
- if (!node.parentHostname().isPresent() &&
+ if (node.parentHostname().isEmpty() &&
recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) return false;
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index 09f7ad8f9c3..433a69a063e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -95,7 +95,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
if ( ! hasQuota(application, requested.maxResources().nodes()))
throw new IllegalArgumentException(requested + " requested for " + cluster +
- ". Max value exceeds your quota. Resolve this at https://cloud.vespa.ai/quota");
+ ". Max value exceeds your quota. Resolve this at https://cloud.vespa.ai/pricing");
nodeResourceLimits.ensureWithinAdvertisedLimits("Min", requested.minResources().nodeResources(), cluster);
nodeResourceLimits.ensureWithinAdvertisedLimits("Max", requested.maxResources().nodeResources(), cluster);
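A hedged sketch of the quota guard this hunk edits; hasQuota and the surrounding class stand in for the provisioner's own code, and only the message text and URL come from the patch:

    class QuotaCheckSketch {
        void ensureWithinQuota(boolean hasQuota, String requested, String cluster) {
            if ( ! hasQuota)
                throw new IllegalArgumentException(requested + " requested for " + cluster +
                        ". Max value exceeds your quota. Resolve this at https://cloud.vespa.ai/pricing");
        }
    }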
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
index 9e211fd497c..b35f9dc88ae 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
@@ -600,7 +600,7 @@ public class ProvisioningTest {
fail("Expected exception");
}
catch (IllegalArgumentException e) {
- assertEquals("6 nodes with [vcpu: 1.0, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 4.0 Gbps] requested for content cluster 'content0' 6.42. Max value exceeds your quota. Resolve this at https://cloud.vespa.ai/quota",
+ assertEquals("6 nodes with [vcpu: 1.0, memory: 4.0 Gb, disk 10.0 Gb, bandwidth: 4.0 Gbps] requested for content cluster 'content0' 6.42. Max value exceeds your quota. Resolve this at https://cloud.vespa.ai/pricing",
e.getMessage());
}
}