Diffstat (limited to 'node-repository/src/main/java/com')
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java | 12
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java | 15
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java | 12
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java | 11
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java | 5
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java | 14
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java | 1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java | 8
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java | 173
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java | 2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java | 9
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java | 8
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java | 3
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java | 24
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java | 2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java | 2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java | 12
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java | 5
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java | 1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java | 11
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java | 22
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java | 1
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java | 4
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java | 2
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java | 6
-rw-r--r--  node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java | 1
26 files changed, 199 insertions, 167 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 7f54fff5c70..a387bc28aa4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -29,6 +29,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.ContainerImages;
import com.yahoo.vespa.hosted.provision.provisioning.FirmwareChecks;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider;
+import com.yahoo.vespa.orchestrator.Orchestrator;
import java.time.Clock;
import java.util.List;
@@ -59,6 +60,7 @@ public class NodeRepository extends AbstractComponent {
private final LoadBalancers loadBalancers;
private final FlagSource flagSource;
private final MetricsDb metricsDb;
+ private final Orchestrator orchestrator;
private final int spareCount;
/**
@@ -72,7 +74,8 @@ public class NodeRepository extends AbstractComponent {
Curator curator,
Zone zone,
FlagSource flagSource,
- MetricsDb metricsDb) {
+ MetricsDb metricsDb,
+ Orchestrator orchestrator) {
this(flavors,
provisionServiceProvider,
curator,
@@ -83,6 +86,7 @@ public class NodeRepository extends AbstractComponent {
Optional.of(config.tenantContainerImage()).filter(s -> !s.isEmpty()).map(DockerImage::fromString),
flagSource,
metricsDb,
+ orchestrator,
config.useCuratorClientCache(),
zone.environment().isProduction() && !zone.getCloud().dynamicProvisioning() && !zone.system().isCd() ? 1 : 0,
config.nodeCacheSize());
@@ -102,6 +106,7 @@ public class NodeRepository extends AbstractComponent {
Optional<DockerImage> tenantContainerImage,
FlagSource flagSource,
MetricsDb metricsDb,
+ Orchestrator orchestrator,
boolean useCuratorClientCache,
int spareCount,
long nodeCacheSize) {
@@ -113,7 +118,7 @@ public class NodeRepository extends AbstractComponent {
this.db = new CuratorDatabaseClient(flavors, curator, clock, useCuratorClientCache, nodeCacheSize);
this.zone = zone;
this.clock = clock;
- this.nodes = new Nodes(db, zone, clock);
+ this.nodes = new Nodes(db, zone, clock, orchestrator);
this.flavors = flavors;
this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator();
this.nameResolver = nameResolver;
@@ -127,6 +132,7 @@ public class NodeRepository extends AbstractComponent {
this.loadBalancers = new LoadBalancers(db);
this.flagSource = flagSource;
this.metricsDb = metricsDb;
+ this.orchestrator = orchestrator;
this.spareCount = spareCount;
nodes.rewrite();
}
@@ -172,6 +178,8 @@ public class NodeRepository extends AbstractComponent {
public MetricsDb metricsDb() { return metricsDb; }
+ public Orchestrator orchestrator() { return orchestrator; }
+
public NodeRepoStats computeStats() { return NodeRepoStats.computeOver(this); }
/** Returns the time keeper of this system */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
index 08a9e373085..a73b6896c2c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java
@@ -18,6 +18,7 @@ import io.questdb.cairo.sql.Record;
import io.questdb.cairo.sql.RecordCursor;
import io.questdb.cairo.sql.RecordCursorFactory;
import io.questdb.griffin.CompiledQuery;
+import io.questdb.griffin.QueryFuture;
import io.questdb.griffin.SqlCompiler;
import io.questdb.griffin.SqlException;
import io.questdb.griffin.SqlExecutionContext;
@@ -341,6 +342,16 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
}
}
+ /**
+ * Issues an SQL statement and waits for it to be executed against the QuestDb engine.
+ * This is needed for some queries, e.g. 'alter table' queries, see https://github.com/questdb/questdb/issues/1846
+ */
+ private void issueAsync(String sql, SqlExecutionContext context) throws SqlException {
+ try (QueryFuture future = issue(sql, context).execute(null)) {
+ future.await();
+ }
+ }
+
private SqlExecutionContext newContext() {
return new SqlExecutionContextImpl(engine(), 1);
}
@@ -374,7 +385,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
void gc() {
synchronized (writeLock) {
try {
- issue("alter table " + name + " drop partition where at < dateadd('d', -4, now());", newContext());
+ issueAsync("alter table " + name + " drop partition where at < dateadd('d', -4, now());", newContext());
}
catch (SqlException e) {
log.log(Level.WARNING, "Failed to gc old metrics data in " + dir + " table " + name, e);
@@ -396,7 +407,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb {
void ensureColumnExists(String column, String columnType) throws SqlException {
if (columnNames().contains(column)) return;
- issue("alter table " + name + " add column " + column + " " + columnType, newContext());
+ issueAsync("alter table " + name + " add column " + column + " " + columnType, newContext());
}
private Optional<Long> adjustOrDiscard(Instant at) {
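
For context, a minimal sketch of the execute-and-await pattern that the new issueAsync helper wraps, assuming an embedded CairoEngine named engine and a table named metrics (both hypothetical names here); the compile/execute/await calls mirror the ones visible in the diff above:

    import io.questdb.cairo.CairoEngine;
    import io.questdb.griffin.CompiledQuery;
    import io.questdb.griffin.QueryFuture;
    import io.questdb.griffin.SqlCompiler;
    import io.questdb.griffin.SqlException;
    import io.questdb.griffin.SqlExecutionContext;
    import io.questdb.griffin.SqlExecutionContextImpl;

    static void dropOldPartitions(CairoEngine engine) throws SqlException {
        try (SqlCompiler compiler = new SqlCompiler(engine)) {
            SqlExecutionContext context = new SqlExecutionContextImpl(engine, 1);
            CompiledQuery query = compiler.compile(
                    "alter table metrics drop partition where at < dateadd('d', -4, now())", context);
            // 'alter table' runs asynchronously: the QueryFuture must be awaited
            // (and closed) before the change is guaranteed to have been applied
            try (QueryFuture future = query.execute(null)) {
                future.await();
            }
        }
    }
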
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java
index ad5bf1a2962..3e7da831bc4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java
@@ -2,6 +2,8 @@
package com.yahoo.vespa.hosted.provision.lb;
import com.yahoo.collections.AbstractFilteringList;
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.ClusterSpec;
import java.util.Collection;
@@ -21,6 +23,16 @@ public class LoadBalancerList extends AbstractFilteringList<LoadBalancer, LoadBa
return matching(lb -> lb.state() == state);
}
+ /** Returns the subset of load balancers in given application */
+ public LoadBalancerList application(ApplicationId application) {
+ return matching(lb -> lb.id().application().equals(application));
+ }
+
+ /** Returns the subset of load balancers in given cluster */
+ public LoadBalancerList cluster(ClusterSpec.Id cluster) {
+ return matching(lb -> lb.id().cluster().equals(cluster));
+ }
+
public static LoadBalancerList copyOf(Collection<LoadBalancer> loadBalancers) {
return new LoadBalancerList(loadBalancers, false);
}
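
Hypothetical usage of the two new filters, with loadBalancers, application and cluster standing in for values from the caller's scope; like other AbstractFilteringList views, each filter returns a new narrowed list, so calls chain:

    // Narrow to one application's load balancers, then to one of its clusters.
    LoadBalancerList forCluster = loadBalancers.application(application)
                                               .cluster(cluster);
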
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java
index d2c1aab72e2..7cbb8ef2764 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java
@@ -2,20 +2,9 @@
package com.yahoo.vespa.hosted.provision.lb;
import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.config.provision.NodeType;
-import com.yahoo.vespa.hosted.provision.Node;
-import com.yahoo.vespa.hosted.provision.NodeList;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
-import com.yahoo.vespa.hosted.provision.node.NodeAcl;
import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
-import java.util.Comparator;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
import java.util.function.Predicate;
-import java.util.stream.Collectors;
/**
* The load balancers of this node repo.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index 341ab1f785c..e6476cd7373 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -26,7 +26,6 @@ import com.yahoo.vespa.hosted.provision.NodesAndHosts;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.node.IP;
-import com.yahoo.vespa.hosted.provision.node.Nodes;
import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing;
@@ -205,7 +204,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
private Map<String, Node> findSharedHosts(NodeList nodeList) {
return nodeList.stream()
- .filter(node -> Nodes.canAllocateTenantNodeTo(node, true))
+ .filter(node -> nodeRepository().nodes().canAllocateTenantNodeTo(node, true))
.filter(node -> node.reservedTo().isEmpty())
.filter(node -> node.exclusiveToApplicationId().isEmpty())
.collect(Collectors.toMap(Node::hostname, Function.identity()));
@@ -298,7 +297,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
int wantedGroups = 1;
NodePrioritizer prioritizer = new NodePrioritizer(nodesAndHosts, applicationId, clusterSpec, nodeSpec, wantedGroups,
- true, nodeRepository().nameResolver(), nodeRepository().resourcesCalculator(),
+ true, nodeRepository().nameResolver(), nodeRepository().nodes(), nodeRepository().resourcesCalculator(),
nodeRepository().spareCount());
List<NodeCandidate> nodeCandidates = prioritizer.collect(List.of());
MutableInteger index = new MutableInteger(0);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index 6133705ed59..3274f12dbc6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -42,6 +42,8 @@ import java.util.stream.Collectors;
public class FailedExpirer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(FailedExpirer.class.getName());
+ // Stop recycling a host once it has failed this many times; park it instead
+ private static final int maxAllowedFailures = 50;
private final NodeRepository nodeRepository;
private final Duration statefulExpiry; // Stateful nodes: Grace period to allow recovery of data
@@ -85,11 +87,11 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
recycle(nodesToRecycle);
}
- /** Move eligible nodes to dirty. This may be a subset of the given nodes */
+ /** Move eligible nodes to dirty or parked. This may be a subset of the given nodes */
private void recycle(List<Node> nodes) {
List<Node> nodesToRecycle = new ArrayList<>();
for (Node candidate : nodes) {
- if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) {
+ if (broken(candidate)) {
List<String> unparkedChildren = !candidate.type().isHost() ? List.of() :
nodeRepository.nodes().list()
.childrenOf(candidate)
@@ -98,7 +100,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
if (unparkedChildren.isEmpty()) {
nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer,
- "Parked by FailedExpirer due to hardware issue");
+ "Parked by FailedExpirer due to hardware issue or high fail count");
} else {
log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
"unparked children: %s", candidate.hostname(),
@@ -111,4 +113,10 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer");
}
+ /** Returns whether node is broken and cannot be recycled */
+ private boolean broken(Node node) {
+ return NodeFailer.hasHardwareIssue(node, nodeRepository) ||
+ (node.type().isHost() && node.status().failCount() >= maxAllowedFailures);
+ }
+
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java
index 4f913bb55dd..a1f36a4f1a5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java
@@ -88,7 +88,6 @@ public class InfrastructureVersions {
case controllerhost:
case proxyhost:
case host:
- case devhost:
break;
default:
throw new IllegalArgumentException("Target version for type " + nodeType + " is not allowed");
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index ca14a1be4c4..636884cef0a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -22,7 +22,6 @@ import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.ClusterId;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.hosted.provision.persistence.CacheStats;
-import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.service.monitor.ServiceModel;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
@@ -47,20 +46,17 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>();
private final Metric metric;
- private final Orchestrator orchestrator;
private final ServiceMonitor serviceMonitor;
private final Map<Map<String, String>, Metric.Context> contextMap = new HashMap<>();
private final Supplier<Integer> pendingRedeploymentsSupplier;
MetricsReporter(NodeRepository nodeRepository,
Metric metric,
- Orchestrator orchestrator,
ServiceMonitor serviceMonitor,
Supplier<Integer> pendingRedeploymentsSupplier,
Duration interval) {
super(nodeRepository, interval, metric);
this.metric = metric;
- this.orchestrator = orchestrator;
this.serviceMonitor = serviceMonitor;
this.pendingRedeploymentsSupplier = pendingRedeploymentsSupplier;
}
@@ -201,7 +197,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
- metric.set("failReport", NodeFailer.reasonsToFailParentHost(node).isEmpty() ? 0 : 1, context);
+ metric.set("failReport", NodeFailer.reasonsToFailHost(node).isEmpty() ? 0 : 1, context);
if (node.type().isHost()) {
metric.set("wantToEncrypt", node.reports().getReport("wantToEncrypt").isPresent() ? 1 : 0, context);
@@ -212,7 +208,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
serviceModel.getApplication(hostname)
.map(ApplicationInstance::reference)
- .map(reference -> orchestrator.getHostInfo(reference, hostname))
+ .map(reference -> nodeRepository().orchestrator().getHostInfo(reference, hostname))
.ifPresent(info -> {
int suspended = info.status().isSuspended() ? 1 : 0;
metric.set("suspended", suspended, context);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index 00881f5e2a8..a1916d7dc20 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
-import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
@@ -15,17 +14,16 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
-import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
-import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.time.Instant;
-import java.util.HashMap;
+import java.util.Collection;
+import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Optional;
+import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
@@ -40,7 +38,6 @@ import java.util.stream.Collectors;
public class NodeFailer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
- private static final Duration nodeRequestInterval = Duration.ofMinutes(10);
/** Metric for number of hosts that we want to fail, but cannot due to throttling */
static final String throttledHostFailuresMetric = "throttledHostFailures";
@@ -53,20 +50,17 @@ public class NodeFailer extends NodeRepositoryMaintainer {
private final Deployer deployer;
private final Duration downTimeLimit;
- private final Orchestrator orchestrator;
- private final Instant constructionTime;
+ private final Duration suspendedDownTimeLimit;
private final ThrottlePolicy throttlePolicy;
private final Metric metric;
public NodeFailer(Deployer deployer, NodeRepository nodeRepository,
- Duration downTimeLimit, Duration interval, Orchestrator orchestrator,
- ThrottlePolicy throttlePolicy, Metric metric) {
+ Duration downTimeLimit, Duration interval, ThrottlePolicy throttlePolicy, Metric metric) {
// check ping status every interval, but at least twice as often as the down time limit
super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
this.deployer = deployer;
this.downTimeLimit = downTimeLimit;
- this.orchestrator = orchestrator;
- this.constructionTime = nodeRepository.clock().instant();
+ this.suspendedDownTimeLimit = downTimeLimit.multipliedBy(4); // Allow more downtime when a node is suspended
this.throttlePolicy = throttlePolicy;
this.metric = metric;
}
@@ -82,38 +76,34 @@ public class NodeFailer extends NodeRepositoryMaintainer {
// Ready nodes
try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
- for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) {
+ for (FailingNode failing : findReadyFailingNodes()) {
attempts++;
- Node node = entry.getKey();
- if (throttle(node)) {
+ if (throttle(failing.node())) {
failures++;
- if (node.type().isHost())
+ if (failing.node().type().isHost())
throttledHostFailures++;
else
throttledNodeFailures++;
continue;
}
- String reason = entry.getValue();
- nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason);
+ nodeRepository().nodes().fail(failing.node().hostname(), Agent.NodeFailer, failing.reason());
}
}
// Active nodes
- for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) {
+ for (FailingNode failing : findActiveFailingNodes()) {
attempts++;
- Node node = entry.getKey();
- if (!failAllowedFor(node.type())) continue;
+ if (!failAllowedFor(failing.node().type())) continue;
- if (throttle(node)) {
+ if (throttle(failing.node())) {
failures++;
- if (node.type().isHost())
+ if (failing.node().type().isHost())
throttledHostFailures++;
else
throttledNodeFailures++;
continue;
}
- String reason = entry.getValue();
- failActive(node, reason);
+ failActive(failing);
}
// Active hosts
@@ -143,60 +133,54 @@ public class NodeFailer extends NodeRepositoryMaintainer {
return asSuccessFactor(attempts, failures);
}
- private Map<Node, String> getReadyNodesByFailureReason() {
- Instant oldestAcceptableRequestTime =
- // Allow requests some time to be registered in case all config servers have been down
- constructionTime.isAfter(clock().instant().minus(nodeRequestInterval.multipliedBy(2))) ?
- Instant.EPOCH :
-
- // Nodes are taken as dead if they have not made a config request since this instant.
- // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently.
- clock().instant().minus(downTimeLimit).minus(nodeRequestInterval);
-
- Map<Node, String> nodesByFailureReason = new HashMap<>();
+ private Collection<FailingNode> findReadyFailingNodes() {
+ Set<FailingNode> failingNodes = new HashSet<>();
for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
- List<String> failureReports = reasonsToFailParentHost(hostNode);
+ List<String> failureReports = reasonsToFailHost(hostNode);
if (failureReports.size() > 0) {
if (hostNode.equals(node)) {
- nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
+ failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports));
} else {
- nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
+ failingNodes.add(new FailingNode(node, "Parent (" + hostNode + ") has failure reports: " + failureReports));
}
}
}
- return nodesByFailureReason;
+ return failingNodes;
}
- private Map<Node, String> getActiveNodesByFailureReason() {
+ private Collection<FailingNode> findActiveFailingNodes() {
+ Set<FailingNode> failingNodes = new HashSet<>();
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
- Instant graceTimeEnd = clock().instant().minus(downTimeLimit);
- Map<Node, String> nodesByFailureReason = new HashMap<>();
+
for (Node node : activeNodes) {
- if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) {
+ Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit);
+ if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) {
// Allow a grace period after node re-activation
- if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd))
- nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit);
+ if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart))
+ failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit));
}
- else if (hostSuspended(node, activeNodes)) {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
- if (hostNode.type().isHost()) {
- List<String> failureReports = reasonsToFailParentHost(hostNode);
- if (failureReports.size() > 0) {
- if (hostNode.equals(node)) {
- nodesByFailureReason.put(node, "Host has failure reports: " + failureReports);
- } else {
- nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports);
- }
+ }
+
+ for (Node node : activeNodes) {
+ if (allSuspended(node, activeNodes)) {
+ Node host = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node);
+ if (host.type().isHost()) {
+ List<String> failureReports = reasonsToFailHost(host);
+ if ( ! failureReports.isEmpty()) {
+ failingNodes.add(new FailingNode(node, host.equals(node) ?
+ "Host has failure reports: " + failureReports :
+ "Parent " + host + " has failure reports: " + failureReports));
}
}
}
}
- return nodesByFailureReason;
+
+ return failingNodes;
}
- public static List<String> reasonsToFailParentHost(Node hostNode) {
- return hostNode.reports().getReports().stream()
+ public static List<String> reasonsToFailHost(Node host) {
+ return host.reports().getReports().stream()
.filter(report -> report.getType().hostShouldBeFailed())
// The generated string is built from the report's ID, created time, and description only.
.map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
@@ -205,37 +189,28 @@ public class NodeFailer extends NodeRepositoryMaintainer {
/** Returns whether node has any kind of hardware issue */
static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) {
- Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node);
- return reasonsToFailParentHost(hostNode).size() > 0;
+ Node host = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node);
+ return reasonsToFailHost(host).size() > 0;
}
private boolean applicationSuspended(Node node) {
try {
- return orchestrator.getApplicationInstanceStatus(node.allocation().get().owner())
+ return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner())
== ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN;
} catch (ApplicationIdNotFoundException e) {
- //Treat it as not suspended and allow to fail the node anyway
- return false;
- }
- }
-
- private boolean nodeSuspended(Node node) {
- try {
- return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended();
- } catch (HostNameNotFoundException e) {
- // Treat it as not suspended
+ // Treat it as not suspended and allow failing the node anyway
return false;
}
}
/** Is the node and all active children suspended? */
- private boolean hostSuspended(Node node, NodeList activeNodes) {
- if (!nodeSuspended(node)) return false;
+ private boolean allSuspended(Node node, NodeList activeNodes) {
+ if (!nodeRepository().nodes().suspended(node)) return false;
if (node.parentHostname().isPresent()) return true; // optimization
return activeNodes.stream()
.filter(childNode -> childNode.parentHostname().isPresent() &&
childNode.parentHostname().get().equals(node.hostname()))
- .allMatch(this::nodeSuspended);
+ .allMatch(nodeRepository().nodes()::suspended);
}
/**
@@ -264,40 +239,40 @@ public class NodeFailer extends NodeRepositoryMaintainer {
*
* @return whether node was successfully failed
*/
- private boolean failActive(Node node, String reason) {
+ private boolean failActive(FailingNode failing) {
Optional<Deployment> deployment =
- deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30));
+ deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30));
if (deployment.isEmpty()) return false;
- try (Mutex lock = nodeRepository().nodes().lock(node.allocation().get().owner())) {
+ try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) {
// If the active node that we are trying to fail is of type host, we need to successfully fail all
// the children nodes running on it before we fail the host
boolean allTenantNodesFailedOutSuccessfully = true;
- String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + reason;
- for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(node)) {
+ String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
+ for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) {
if (failingTenantNode.state() == Node.State.active) {
- allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure);
+ allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure));
} else {
nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure);
}
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- wantToFail(node, true, lock);
+ wantToFail(failing.node(), true, lock);
try {
deployment.get().activate();
return true;
} catch (TransientException e) {
- log.log(Level.INFO, "Failed to redeploy " + node.allocation().get().owner() +
+ log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
" with a transient error, will be retried by application maintainer: " +
Exceptions.toMessageString(e));
return true;
} catch (RuntimeException e) {
// Reset want to fail: We'll retry failing unless it heals in the meantime
- nodeRepository().nodes().node(node.hostname())
+ nodeRepository().nodes().node(failing.node().hostname())
.ifPresent(n -> wantToFail(n, false, lock));
- log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() +
- " for " + reason + ": " + Exceptions.toMessageString(e));
+ log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
+ " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
return false;
}
}
@@ -359,4 +334,30 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
+ private static class FailingNode {
+
+ private final Node node;
+ private final String reason;
+
+ public FailingNode(Node node, String reason) {
+ this.node = node;
+ this.reason = reason;
+ }
+
+ public Node node() { return node; }
+ public String reason() { return reason; }
+
+ @Override
+ public boolean equals(Object other) {
+ if ( ! (other instanceof FailingNode)) return false;
+ return ((FailingNode)other).node().equals(this.node());
+ }
+
+ @Override
+ public int hashCode() {
+ return node.hashCode();
+ }
+
+ }
+
}
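
Since FailingNode defines equality on the node alone, the HashSet built by findReadyFailingNodes/findActiveFailingNodes keeps at most one entry per node even when a node qualifies for several reasons. A small illustration, as if written inside NodeFailer, with node standing in for any Node value:

    Set<FailingNode> failing = new HashSet<>();
    failing.add(new FailingNode(node, "Node has been down longer than PT1H"));
    failing.add(new FailingNode(node, "Parent host has failure reports"));
    // The second add is a no-op: equals/hashCode ignore the reason, so the
    // reason recorded first wins and each node is failed at most once per run.
    assert failing.size() == 1;
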
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java
index 57db874fb84..552db84748d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java
@@ -74,6 +74,8 @@ public abstract class NodeMover<MOVE> extends NodeRepositoryMaintainer {
if (deployedRecently(applicationId)) continue;
for (HostWithResources toHost : hostResources) {
if (toHost.node.hostname().equals(node.parentHostname().get())) continue;
+ if (toHost.node.reservedTo().isPresent() &&
+ !toHost.node.reservedTo().get().equals(applicationId.tenant())) continue; // Reserved to a different tenant
if (spares.contains(toHost.node)) continue; // Do not offer spares as a valid move as they are reserved for replacement of failed nodes
if ( ! toHost.hasCapacity(node.resources())) continue;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 2f200032492..15decde0d7c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -14,7 +14,6 @@ import com.yahoo.vespa.flags.FlagSource;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsFetcher;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider;
-import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import java.time.Duration;
@@ -35,7 +34,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
@Inject
public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, InfraDeployer infraDeployer,
HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor,
- Zone zone, Orchestrator orchestrator, Metric metric,
+ Zone zone, Metric metric,
ProvisionServiceProvider provisionServiceProvider, FlagSource flagSource,
MetricsFetcher metricsFetcher) {
DefaultTimes defaults = new DefaultTimes(zone, deployer);
@@ -46,11 +45,11 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
maintainers.add(periodicApplicationMaintainer);
maintainers.add(infrastructureProvisioner);
- maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, orchestrator, defaults.throttlePolicy, metric));
+ maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, defaults.throttlePolicy, metric));
maintainers.add(new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric));
maintainers.add(new ExpeditedChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.expeditedChangeRedeployInterval));
maintainers.add(new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric));
- maintainers.add(new RetiredExpirer(nodeRepository, orchestrator, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry));
+ maintainers.add(new RetiredExpirer(nodeRepository, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry));
maintainers.add(new InactiveExpirer(nodeRepository, defaults.inactiveExpiry, Map.of(NodeType.config, defaults.inactiveConfigServerExpiry,
NodeType.controller, defaults.inactiveControllerExpiry),
metric));
@@ -58,7 +57,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
maintainers.add(new DirtyExpirer(nodeRepository, defaults.dirtyExpiry, metric));
maintainers.add(new ProvisionedExpirer(nodeRepository, defaults.provisionedExpiry, metric));
maintainers.add(new NodeRebooter(nodeRepository, flagSource, metric));
- maintainers.add(new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval));
+ maintainers.add(new MetricsReporter(nodeRepository, metric, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval));
maintainers.add(new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval));
maintainers.add(new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval, metric));
maintainers.add(new Rebalancer(deployer, nodeRepository, metric, defaults.rebalancerInterval));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index a7ba8b27851..73c9a1ab55a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -1,7 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
-import com.google.common.util.concurrent.UncheckedTimeoutException;
+import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.jdisc.Metric;
@@ -11,7 +11,6 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.OrchestrationException;
-import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
@@ -31,11 +30,9 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
private final Deployer deployer;
private final Metric metric;
- private final Orchestrator orchestrator;
private final Duration retiredExpiry;
public RetiredExpirer(NodeRepository nodeRepository,
- Orchestrator orchestrator,
Deployer deployer,
Metric metric,
Duration maintenanceInterval,
@@ -43,7 +40,6 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
super(nodeRepository, maintenanceInterval, metric);
this.deployer = deployer;
this.metric = metric;
- this.orchestrator = orchestrator;
this.retiredExpiry = retiredExpiry;
}
@@ -126,7 +122,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
}
try {
- orchestrator.acquirePermissionToRemove(new HostName(node.hostname()));
+ nodeRepository().orchestrator().acquirePermissionToRemove(new HostName(node.hostname()));
log.info("Node " + node + " has been granted permission to be removed");
return true;
} catch (UncheckedTimeoutException e) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java
index 88a62c94f43..ac24c83e129 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java
@@ -109,10 +109,9 @@ public class NodeAcl {
case proxy:
// Proxy nodes trust:
// - config servers
- // - all connections from the world on 4080 (insecure tb removed), and 4443
+ // - all connections from the world on 443 (production traffic) and 4443 (health checks)
trustedNodes.addAll(allNodes.nodeType(NodeType.config).asList());
trustedPorts.add(443);
- trustedPorts.add(4080);
trustedPorts.add(4443);
break;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 7f57ec219ae..57a3b436e37 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -10,6 +10,7 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.Zone;
import com.yahoo.transaction.Mutex;
import com.yahoo.transaction.NestedTransaction;
+import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.LockedNodeList;
import com.yahoo.vespa.hosted.provision.NoSuchNodeException;
import com.yahoo.vespa.hosted.provision.Node;
@@ -18,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer;
import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter;
import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
+import com.yahoo.vespa.orchestrator.HostNameNotFoundException;
+import com.yahoo.vespa.orchestrator.Orchestrator;
import java.time.Clock;
import java.time.Duration;
@@ -53,14 +56,16 @@ public class Nodes {
private static final Logger log = Logger.getLogger(Nodes.class.getName());
+ private final CuratorDatabaseClient db;
private final Zone zone;
private final Clock clock;
- private final CuratorDatabaseClient db;
+ private final Orchestrator orchestrator;
- public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock) {
+ public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator) {
this.zone = zone;
this.clock = clock;
this.db = db;
+ this.orchestrator = orchestrator;
}
/** Read and write all nodes to make sure they are stored in the latest version of the serialized format */
@@ -474,7 +479,7 @@ public class Nodes {
if (node.state() == Node.State.ready) return node;
Node parentHost = node.parentHostname().flatMap(this::node).orElse(node);
- List<String> failureReasons = NodeFailer.reasonsToFailParentHost(parentHost);
+ List<String> failureReasons = NodeFailer.reasonsToFailHost(parentHost);
if ( ! failureReasons.isEmpty())
illegal(node + " cannot be readied because it has hard failures: " + failureReasons);
@@ -728,10 +733,11 @@ public class Nodes {
return canAllocateTenantNodeTo(host, zone.getCloud().dynamicProvisioning());
}
- public static boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) {
+ public boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) {
if ( ! host.type().canRun(NodeType.tenant)) return false;
if (host.status().wantToRetire()) return false;
if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false;
+ if (suspended(host)) return false;
if (dynamicProvisioning)
return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state());
@@ -739,6 +745,15 @@ public class Nodes {
return host.state() == Node.State.active;
}
+ public boolean suspended(Node node) {
+ try {
+ return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended();
+ } catch (HostNameNotFoundException e) {
+ // Treat it as not suspended
+ return false;
+ }
+ }
+
/** Create a lock which provides exclusive rights to making changes to the given application */
// TODO: Move to Applications
public Mutex lock(ApplicationId application) {
@@ -819,6 +834,7 @@ public class Nodes {
private static boolean parkOnDeallocationOf(Node node, Agent agent) {
if (node.state() == Node.State.parked) return false;
if (agent == Agent.operator) return false;
+ if (!node.type().isHost() && node.status().wantToDeprovision()) return false;
boolean retirementRequestedByOperator = node.status().wantToRetire() &&
node.history().event(History.Event.Type.wantToRetire)
.map(History.Event::agent)
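
With suspension now exposed through Nodes, callers no longer need their own Orchestrator reference. A sketch of how a caller might skip suspended hosts (nodeRepository assumed in scope), matching the new check inside canAllocateTenantNodeTo:

    Nodes nodes = nodeRepository.nodes();
    for (Node host : nodes.list(Node.State.active).nodeType(NodeType.host)) {
        // suspended(...) asks the orchestrator and treats an unknown hostname
        // (HostNameNotFoundException) as not suspended
        if (nodes.suspended(host)) continue;
        // ... host remains an allocation candidate, per canAllocateTenantNodeTo
    }
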
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
index ba28d8e6b9a..379bb2566df 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java
@@ -1,8 +1,8 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.persistence;
-import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.yahoo.component.Version;
+import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.ApplicationTransaction;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 543972a9cb3..cd1b786afd1 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -554,7 +554,6 @@ public class NodeSerializer {
case "confighost": return NodeType.confighost;
case "controller": return NodeType.controller;
case "controllerhost": return NodeType.controllerhost;
- case "devhost": return NodeType.devhost;
default : throw new IllegalArgumentException("Unknown node type '" + typeString + "'");
}
}
@@ -569,7 +568,6 @@ public class NodeSerializer {
case confighost: return "confighost";
case controller: return "controller";
case controllerhost: return "controllerhost";
- case devhost: return "devhost";
}
throw new IllegalArgumentException("Serialized form of '" + type + "' not defined");
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index 8c358301b85..3da0506f2e1 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -45,8 +45,8 @@ class Activator {
/** Activate required resources for application guarded by given lock */
public void activate(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) {
- activateNodes(hosts, generation, transaction);
- activateLoadBalancers(hosts, transaction);
+ NodeList newActive = activateNodes(hosts, generation, transaction);
+ activateLoadBalancers(hosts, newActive, transaction);
}
/**
@@ -62,8 +62,9 @@ class Activator {
* @param generation the application config generation that is activated
* @param transaction transaction with operations to commit together with any operations done within the repository,
* while holding the node repository lock on this application
+ * @return the nodes that will be active when transaction is committed
*/
- private void activateNodes(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) {
+ private NodeList activateNodes(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) {
Instant activationTime = nodeRepository.clock().instant(); // Use one timestamp for all activation changes
ApplicationId application = transaction.application();
Set<String> hostnames = hosts.stream().map(HostSpec::hostname).collect(Collectors.toSet());
@@ -95,6 +96,7 @@ class Activator {
oldActive.not().retired(),
newActive.not().retired());
unreserveParentsOf(reserved);
+ return newActive;
}
private void deactivate(NodeList toDeactivate, ApplicationTransaction transaction) {
@@ -149,8 +151,8 @@ class Activator {
}
/** Activate load balancers */
- private void activateLoadBalancers(Collection<HostSpec> hosts, ApplicationTransaction transaction) {
- loadBalancerProvisioner.ifPresent(provisioner -> provisioner.activate(allClustersOf(hosts), transaction));
+ private void activateLoadBalancers(Collection<HostSpec> hosts, NodeList newActive, ApplicationTransaction transaction) {
+ loadBalancerProvisioner.ifPresent(provisioner -> provisioner.activate(allClustersOf(hosts), newActive, transaction));
}
private static Set<ClusterSpec> allClustersOf(Collection<HostSpec> hosts) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java
index 4088d717a67..290a3f8f947 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java
@@ -7,7 +7,6 @@ import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.NodeResources;
-import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -72,10 +71,6 @@ public class CapacityPolicies {
public NodeResources defaultNodeResources(ClusterSpec.Type clusterType) {
if (clusterType == ClusterSpec.Type.admin) {
- if (zone.system() == SystemName.dev) {
- // Use small logserver in dev system
- return new NodeResources(0.1, 1, 10, 0.3);
- }
return zone.getCloud().dynamicProvisioning() && ! sharedHosts.apply(clusterType) ?
new NodeResources(0.5, 4, 50, 0.3) :
new NodeResources(0.5, 2, 50, 0.3);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index 2d93763c631..ae65f367684 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -149,6 +149,7 @@ public class GroupPreparer {
wantedGroups,
nodeRepository.zone().getCloud().dynamicProvisioning(),
nodeRepository.nameResolver(),
+ nodeRepository.nodes(),
nodeRepository.resourcesCalculator(),
nodeRepository.spareCount());
allocation.offer(prioritizer.collect(surplusActiveNodes));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
index 5ff78c53f8a..04f084dd079 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
@@ -100,11 +100,11 @@ public class LoadBalancerProvisioner {
*
* Calling this when no load balancer has been prepared for given cluster is a no-op.
*/
- public void activate(Set<ClusterSpec> clusters, ApplicationTransaction transaction) {
+ public void activate(Set<ClusterSpec> clusters, NodeList newActive, ApplicationTransaction transaction) {
Set<ClusterSpec.Id> activatingClusters = clusters.stream()
.map(LoadBalancerProvisioner::effectiveId)
.collect(Collectors.toSet());
- for (var cluster : loadBalancedClustersOf(transaction.application()).entrySet()) {
+ for (var cluster : loadBalancedClustersOf(newActive).entrySet()) {
if (!activatingClusters.contains(cluster.getKey())) continue;
Node clusterNode = cluster.getValue().first().get();
@@ -232,12 +232,13 @@ public class LoadBalancerProvisioner {
/** Returns the nodes allocated to the given load balanced cluster */
private NodeList nodesOf(ClusterSpec.Id loadBalancedCluster, ApplicationId application) {
- return loadBalancedClustersOf(application).getOrDefault(loadBalancedCluster, NodeList.copyOf(List.of()));
+ NodeList nodes = nodeRepository.nodes().list(Node.State.reserved, Node.State.active)
+ .owner(application);
+ return loadBalancedClustersOf(nodes).getOrDefault(loadBalancedCluster, NodeList.of());
}
/** Returns the load balanced clusters of given application and their nodes */
- private Map<ClusterSpec.Id, NodeList> loadBalancedClustersOf(ApplicationId application) {
- NodeList nodes = nodeRepository.nodes().list(Node.State.reserved, Node.State.active).owner(application);
+ private Map<ClusterSpec.Id, NodeList> loadBalancedClustersOf(NodeList nodes) {
if (nodes.stream().anyMatch(node -> node.type() == NodeType.config)) {
nodes = nodes.nodeType(NodeType.config).type(ClusterSpec.Type.admin);
} else if (nodes.stream().anyMatch(node -> node.type() == NodeType.controller)) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index 85a43e38e07..fe4eb5d68c9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -30,13 +30,14 @@ import java.util.stream.Collectors;
*/
public class NodePrioritizer {
- private final List<NodeCandidate> nodes = new ArrayList<>();
+ private final List<NodeCandidate> candidates = new ArrayList<>();
private final NodesAndHosts<LockedNodeList> allNodesAndHosts;
private final HostCapacity capacity;
private final NodeSpec requestedNodes;
private final ApplicationId application;
private final ClusterSpec clusterSpec;
private final NameResolver nameResolver;
+ private final Nodes nodes;
private final boolean dynamicProvisioning;
/** Whether node specification allows new nodes to be allocated. */
private final boolean canAllocateNew;
@@ -46,7 +47,7 @@ public class NodePrioritizer {
private final Set<Node> spareHosts;
public NodePrioritizer(NodesAndHosts<LockedNodeList> allNodesAndHosts, ApplicationId application, ClusterSpec clusterSpec, NodeSpec nodeSpec,
- int wantedGroups, boolean dynamicProvisioning, NameResolver nameResolver,
+ int wantedGroups, boolean dynamicProvisioning, NameResolver nameResolver, Nodes nodes,
HostResourcesCalculator hostResourcesCalculator, int spareCount) {
this.allNodesAndHosts = allNodesAndHosts;
this.capacity = new HostCapacity(this.allNodesAndHosts, hostResourcesCalculator);
@@ -58,6 +59,7 @@ public class NodePrioritizer {
capacity.findSpareHostsInDynamicallyProvisionedZones(this.allNodesAndHosts.nodes().asList()) :
capacity.findSpareHosts(this.allNodesAndHosts.nodes().asList(), spareCount);
this.nameResolver = nameResolver;
+ this.nodes = nodes;
NodeList nodesInCluster = this.allNodesAndHosts.nodes().owner(application).type(clusterSpec.type()).cluster(clusterSpec.id());
NodeList nonRetiredNodesInCluster = nodesInCluster.not().retired();
@@ -95,12 +97,12 @@ public class NodePrioritizer {
/** Returns the list of nodes sorted by {@link NodeCandidate#compareTo(NodeCandidate)} */
private List<NodeCandidate> prioritize() {
// Group candidates by their switch hostname
- Map<String, List<NodeCandidate>> candidatesBySwitch = this.nodes.stream()
+ Map<String, List<NodeCandidate>> candidatesBySwitch = this.candidates.stream()
.collect(Collectors.groupingBy(candidate -> candidate.parent.orElseGet(candidate::toNode)
.switchHostname()
.orElse("")));
// Mark lower priority nodes on shared switch as non-exclusive
- List<NodeCandidate> nodes = new ArrayList<>(this.nodes.size());
+ List<NodeCandidate> nodes = new ArrayList<>(this.candidates.size());
for (var clusterSwitch : candidatesBySwitch.keySet()) {
List<NodeCandidate> switchCandidates = candidatesBySwitch.get(clusterSwitch);
if (clusterSwitch.isEmpty()) {
@@ -126,7 +128,7 @@ public class NodePrioritizer {
for (Node node : surplusNodes) {
NodeCandidate candidate = candidateFrom(node, true);
if (!candidate.violatesSpares || canAllocateToSpareHosts) {
- nodes.add(candidate);
+ candidates.add(candidate);
}
}
}
@@ -136,7 +138,7 @@ public class NodePrioritizer {
if ( !canAllocateNew) return;
for (Node host : allNodesAndHosts.nodes()) {
- if ( ! Nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
+ if ( ! nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
if (host.reservedTo().isPresent() && !host.reservedTo().get().equals(application.tenant())) continue;
if (host.reservedTo().isPresent() && application.instance().isTester()) continue;
if (host.exclusiveToApplicationId().isPresent()) continue; // Never allocate new nodes to exclusive hosts
@@ -144,7 +146,7 @@ public class NodePrioritizer {
if (spareHosts.contains(host) && !canAllocateToSpareHosts) continue;
if ( ! capacity.hasCapacity(host, requestedNodes.resources().get())) continue;
if ( ! allNodesAndHosts.childrenOf(host).owner(application).cluster(clusterSpec.id()).isEmpty()) continue;
- nodes.add(NodeCandidate.createNewChild(requestedNodes.resources().get(),
+ candidates.add(NodeCandidate.createNewChild(requestedNodes.resources().get(),
capacity.availableCapacityOf(host),
host,
spareHosts.contains(host),
@@ -164,7 +166,7 @@ public class NodePrioritizer {
.filter(node -> node.allocation().get().membership().cluster().id().equals(clusterSpec.id()))
.filter(node -> node.state() == Node.State.active || canStillAllocate(node))
.map(node -> candidateFrom(node, false))
- .forEach(nodes::add);
+ .forEach(candidates::add);
}
/** Add nodes already provisioned, but not allocated to any application */
@@ -174,7 +176,7 @@ public class NodePrioritizer {
.filter(node -> node.state() == Node.State.ready)
.map(node -> candidateFrom(node, false))
.filter(n -> !n.violatesSpares || canAllocateToSpareHosts)
- .forEach(nodes::add);
+ .forEach(candidates::add);
}
/** Create a candidate from given pre-existing node */
@@ -218,7 +220,7 @@ public class NodePrioritizer {
private boolean canStillAllocate(Node node) {
if (node.type() != NodeType.tenant || node.parentHostname().isEmpty()) return true;
Optional<Node> parent = allNodesAndHosts.parentOf(node);
- return parent.isPresent() ? Nodes.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning) : null;
+ return parent.isPresent() && nodes.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning);
}
}
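
Two things happen in the NodePrioritizer diff above: the candidate list field is renamed from nodes to candidates, which frees the nodes name for the newly injected Nodes service (turning the static Nodes.canAllocateTenantNodeTo calls into instance calls), and canStillAllocate stops returning null from a boolean-returning method. The stand-alone sketch below uses hypothetical stand-in types rather than the real Node classes; it shows why the old ternary form could fail at runtime and why the rewritten && form cannot.

    import java.util.Optional;

    // Sketch of the canStillAllocate fix. canAllocateTo stands in for
    // Nodes.canAllocateTenantNodeTo, and Optional<String> stands in for the
    // Optional<Node> parent looked up via allNodesAndHosts.parentOf(node).
    class TernaryUnboxingDemo {

        static boolean canAllocateTo(String host) { return host.startsWith("ok"); }

        // Old form: with an empty parent, the ternary evaluates to (Boolean) null,
        // which throws NullPointerException when unboxed to the boolean return type.
        static boolean oldForm(Optional<String> parent) {
            return parent.isPresent() ? canAllocateTo(parent.get()) : null;
        }

        // New form: short-circuits to false when the parent host is absent.
        static boolean newForm(Optional<String> parent) {
            return parent.isPresent() && canAllocateTo(parent.get());
        }

        public static void main(String[] args) {
            System.out.println(newForm(Optional.of("ok-host"))); // true
            System.out.println(newForm(Optional.empty()));       // false
            try {
                oldForm(Optional.empty());
            } catch (NullPointerException expected) {
                System.out.println("old form throws on an empty parent");
            }
        }
    }
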
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java
index d5dbe08dca9..310f921367e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java
@@ -71,7 +71,6 @@ public class NodeResourceLimits {
}
private double minAdvertisedMemoryGb(ClusterSpec.Type clusterType) {
- if (zone().system() == SystemName.dev) return 1; // Allow small containers in dev system
if (clusterType == ClusterSpec.Type.admin) return 1;
return 4;
}
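
Removing the SystemName.dev branch makes the minimum advertised memory uniform across systems: 1 GB for admin clusters and 4 GB for everything else, so small non-admin containers are no longer permitted in dev. A minimal sketch of the resulting check, with ClusterSpec.Type stubbed as a local enum:

    // Mirrors minAdvertisedMemoryGb after the change; ClusterType is a stand-in
    // for com.yahoo.config.provision.ClusterSpec.Type.
    class MinMemoryDemo {

        enum ClusterType { admin, container, content }

        static double minAdvertisedMemoryGb(ClusterType clusterType) {
            if (clusterType == ClusterType.admin) return 1;
            return 4;
        }

        public static void main(String[] args) {
            System.out.println(minAdvertisedMemoryGb(ClusterType.admin));     // 1.0
            System.out.println(minAdvertisedMemoryGb(ClusterType.container)); // 4.0, now also in dev
        }
    }
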
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java
index aa6209ae80d..1ba686772c7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java
@@ -2,7 +2,7 @@
package com.yahoo.vespa.hosted.provision.restapi;
import com.yahoo.container.jdisc.HttpResponse;
-import com.yahoo.container.jdisc.LoggingRequestHandler;
+import com.yahoo.container.jdisc.ThreadedHttpRequestHandler;
import com.yahoo.restapi.RestApi;
import com.yahoo.restapi.RestApiRequestHandler;
import com.yahoo.vespa.hosted.provision.NodeRepository;
@@ -18,7 +18,7 @@ public class LoadBalancersV1ApiHandler extends RestApiRequestHandler<LoadBalance
private final NodeRepository nodeRepository;
@Inject
- public LoadBalancersV1ApiHandler(LoggingRequestHandler.Context parentCtx, NodeRepository nodeRepository) {
+ public LoadBalancersV1ApiHandler(ThreadedHttpRequestHandler.Context parentCtx, NodeRepository nodeRepository) {
super(parentCtx, LoadBalancersV1ApiHandler::createRestApiDefinition);
this.nodeRepository = nodeRepository;
}
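
Both REST handlers in this change move from the deprecated LoggingRequestHandler base class to ThreadedHttpRequestHandler, including the nested Context type taken by their injected constructors; the same substitution appears in NodesV2ApiHandler below. A minimal sketch of a handler on the new base class, assuming only the jdisc and restapi types already imported in these files (ExampleV1ApiHandler itself is hypothetical):

    import com.yahoo.container.jdisc.HttpRequest;
    import com.yahoo.container.jdisc.HttpResponse;
    import com.yahoo.container.jdisc.ThreadedHttpRequestHandler;
    import com.yahoo.restapi.MessageResponse;

    // Hypothetical handler showing the new base class and Context wiring.
    public class ExampleV1ApiHandler extends ThreadedHttpRequestHandler {

        public ExampleV1ApiHandler(ThreadedHttpRequestHandler.Context parentCtx) {
            super(parentCtx);
        }

        @Override
        public HttpResponse handle(HttpRequest request) {
            return new MessageResponse("ok"); // MessageResponse is an HttpResponse
        }

    }
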
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java
index be011c886a5..6282c072001 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java
@@ -53,7 +53,6 @@ public class NodeSerializer {
case "confighost": return NodeType.confighost;
case "controller": return NodeType.controller;
case "controllerhost": return NodeType.controllerhost;
- case "devhost": return NodeType.devhost;
default: throw new IllegalArgumentException("Unknown node type '" + nodeType + "'");
}
}
@@ -68,7 +67,6 @@ public class NodeSerializer {
case confighost: return "confighost";
case controller: return "controller";
case controllerhost: return "controllerhost";
- case devhost: return "devhost";
default: throw new IllegalArgumentException("Unknown node type '" + type.name() + "'");
}
}
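
With devhost dropped from both switch statements, the serializer round-trips only the remaining node types and rejects anything else with IllegalArgumentException. An illustrative stand-alone version of the string-to-type direction (the method name and enum are local stand-ins, not the real NodeSerializer API):

    class NodeTypeParseDemo {

        enum NodeType { confighost, controller, controllerhost }

        // Stand-in for the String -> NodeType switch above, after removing devhost.
        static NodeType typeFrom(String nodeType) {
            switch (nodeType) {
                case "confighost": return NodeType.confighost;
                case "controller": return NodeType.controller;
                case "controllerhost": return NodeType.controllerhost;
                default: throw new IllegalArgumentException("Unknown node type '" + nodeType + "'");
            }
        }

        public static void main(String[] args) {
            System.out.println(typeFrom("controllerhost")); // controllerhost
            System.out.println(typeFrom("devhost"));        // throws: Unknown node type 'devhost'
        }
    }
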
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 15e1061f5e1..1304b85be6b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -11,7 +11,7 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TenantName;
import com.yahoo.container.jdisc.HttpRequest;
import com.yahoo.container.jdisc.HttpResponse;
-import com.yahoo.container.jdisc.LoggingRequestHandler;
+import com.yahoo.container.jdisc.ThreadedHttpRequestHandler;
import com.yahoo.io.IOUtils;
import com.yahoo.restapi.ErrorResponse;
import com.yahoo.restapi.MessageResponse;
@@ -67,7 +67,7 @@ import static com.yahoo.slime.SlimeUtils.optionalString;
*
* @author bratseth
*/
-public class NodesV2ApiHandler extends LoggingRequestHandler {
+public class NodesV2ApiHandler extends ThreadedHttpRequestHandler {
private final Orchestrator orchestrator;
private final NodeRepository nodeRepository;
@@ -75,7 +75,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
private final NodeFlavors nodeFlavors;
@Inject
- public NodesV2ApiHandler(LoggingRequestHandler.Context parentCtx, Orchestrator orchestrator,
+ public NodesV2ApiHandler(ThreadedHttpRequestHandler.Context parentCtx, Orchestrator orchestrator,
NodeRepository nodeRepository, MetricsDb metricsDb, NodeFlavors flavors) {
super(parentCtx);
this.orchestrator = orchestrator;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
index 1a2d5294aa5..ff406efdc39 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
@@ -70,6 +70,7 @@ public class MockNodeRepository extends NodeRepository {
Optional.empty(),
new InMemoryFlagSource(),
new MemoryMetricsDb(Clock.fixed(Instant.ofEpochMilli(123), ZoneId.of("Z"))),
+ new OrchestratorMock(),
true,
0, 1000);
this.flavors = flavors;
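
The MockNodeRepository hunk shows the NodeRepository constructor gaining an Orchestrator dependency, satisfied in tests by passing an OrchestratorMock. A minimal sketch of that wiring pattern with stand-in types (the real Orchestrator interface has a richer API than shown here):

    // Stand-ins for NodeRepository, Orchestrator and OrchestratorMock; only the
    // constructor-injection pattern is shown, not the real signatures.
    interface Orchestrator { boolean isSuspended(String hostname); }

    class OrchestratorMock implements Orchestrator {
        @Override public boolean isSuspended(String hostname) { return false; }
    }

    class Repository {
        private final Orchestrator orchestrator;
        Repository(Orchestrator orchestrator) { this.orchestrator = orchestrator; }
        boolean suspended(String hostname) { return orchestrator.isSuspended(hostname); }
    }

    class WiringDemo {
        public static void main(String[] args) {
            Repository repo = new Repository(new OrchestratorMock()); // as in MockNodeRepository
            System.out.println(repo.suspended("host1")); // false
        }
    }
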