diff options
Diffstat (limited to 'node-repository/src/main/java/com')
26 files changed, 199 insertions, 167 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 7f54fff5c70..a387bc28aa4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -29,6 +29,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.ContainerImages; import com.yahoo.vespa.hosted.provision.provisioning.FirmwareChecks; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider; +import com.yahoo.vespa.orchestrator.Orchestrator; import java.time.Clock; import java.util.List; @@ -59,6 +60,7 @@ public class NodeRepository extends AbstractComponent { private final LoadBalancers loadBalancers; private final FlagSource flagSource; private final MetricsDb metricsDb; + private final Orchestrator orchestrator; private final int spareCount; /** @@ -72,7 +74,8 @@ public class NodeRepository extends AbstractComponent { Curator curator, Zone zone, FlagSource flagSource, - MetricsDb metricsDb) { + MetricsDb metricsDb, + Orchestrator orchestrator) { this(flavors, provisionServiceProvider, curator, @@ -83,6 +86,7 @@ public class NodeRepository extends AbstractComponent { Optional.of(config.tenantContainerImage()).filter(s -> !s.isEmpty()).map(DockerImage::fromString), flagSource, metricsDb, + orchestrator, config.useCuratorClientCache(), zone.environment().isProduction() && !zone.getCloud().dynamicProvisioning() && !zone.system().isCd() ? 1 : 0, config.nodeCacheSize()); @@ -102,6 +106,7 @@ public class NodeRepository extends AbstractComponent { Optional<DockerImage> tenantContainerImage, FlagSource flagSource, MetricsDb metricsDb, + Orchestrator orchestrator, boolean useCuratorClientCache, int spareCount, long nodeCacheSize) { @@ -113,7 +118,7 @@ public class NodeRepository extends AbstractComponent { this.db = new CuratorDatabaseClient(flavors, curator, clock, useCuratorClientCache, nodeCacheSize); this.zone = zone; this.clock = clock; - this.nodes = new Nodes(db, zone, clock); + this.nodes = new Nodes(db, zone, clock, orchestrator); this.flavors = flavors; this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator(); this.nameResolver = nameResolver; @@ -127,6 +132,7 @@ public class NodeRepository extends AbstractComponent { this.loadBalancers = new LoadBalancers(db); this.flagSource = flagSource; this.metricsDb = metricsDb; + this.orchestrator = orchestrator; this.spareCount = spareCount; nodes.rewrite(); } @@ -172,6 +178,8 @@ public class NodeRepository extends AbstractComponent { public MetricsDb metricsDb() { return metricsDb; } + public Orchestrator orchestrator() { return orchestrator; } + public NodeRepoStats computeStats() { return NodeRepoStats.computeOver(this); } /** Returns the time keeper of this system */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java index 08a9e373085..a73b6896c2c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/QuestMetricsDb.java @@ -18,6 +18,7 @@ import io.questdb.cairo.sql.Record; import io.questdb.cairo.sql.RecordCursor; import io.questdb.cairo.sql.RecordCursorFactory; import io.questdb.griffin.CompiledQuery; +import io.questdb.griffin.QueryFuture; import io.questdb.griffin.SqlCompiler; import io.questdb.griffin.SqlException; import io.questdb.griffin.SqlExecutionContext; @@ -341,6 +342,16 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { } } + /** + * Issues and wait for an SQL statement to be executed against the QuestDb engine. + * Needs to be done for some queries, e.g. 'alter table' queries, see https://github.com/questdb/questdb/issues/1846 + */ + private void issueAsync(String sql, SqlExecutionContext context) throws SqlException { + try (QueryFuture future = issue(sql, context).execute(null)) { + future.await(); + } + } + private SqlExecutionContext newContext() { return new SqlExecutionContextImpl(engine(), 1); } @@ -374,7 +385,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { void gc() { synchronized (writeLock) { try { - issue("alter table " + name + " drop partition where at < dateadd('d', -4, now());", newContext()); + issueAsync("alter table " + name + " drop partition where at < dateadd('d', -4, now());", newContext()); } catch (SqlException e) { log.log(Level.WARNING, "Failed to gc old metrics data in " + dir + " table " + name, e); @@ -396,7 +407,7 @@ public class QuestMetricsDb extends AbstractComponent implements MetricsDb { void ensureColumnExists(String column, String columnType) throws SqlException { if (columnNames().contains(column)) return; - issue("alter table " + name + " add column " + column + " " + columnType, newContext()); + issueAsync("alter table " + name + " add column " + column + " " + columnType, newContext()); } private Optional<Long> adjustOrDiscard(Instant at) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java index ad5bf1a2962..3e7da831bc4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancerList.java @@ -2,6 +2,8 @@ package com.yahoo.vespa.hosted.provision.lb; import com.yahoo.collections.AbstractFilteringList; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ClusterSpec; import java.util.Collection; @@ -21,6 +23,16 @@ public class LoadBalancerList extends AbstractFilteringList<LoadBalancer, LoadBa return matching(lb -> lb.state() == state); } + /** Returns the subset of load balancers in given cluster */ + public LoadBalancerList application(ApplicationId application) { + return matching(lb -> lb.id().application().equals(application)); + } + + /** Returns the subset of load balancers in given cluster */ + public LoadBalancerList cluster(ClusterSpec.Id cluster) { + return matching(lb -> lb.id().cluster().equals(cluster)); + } + public static LoadBalancerList copyOf(Collection<LoadBalancer> loadBalancers) { return new LoadBalancerList(loadBalancers, false); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java index d2c1aab72e2..7cbb8ef2764 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/LoadBalancers.java @@ -2,20 +2,9 @@ package com.yahoo.vespa.hosted.provision.lb; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.NodeType; -import com.yahoo.vespa.hosted.provision.Node; -import com.yahoo.vespa.hosted.provision.NodeList; -import com.yahoo.vespa.hosted.provision.NodeRepository; -import com.yahoo.vespa.hosted.provision.node.NodeAcl; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; -import java.util.Comparator; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; import java.util.function.Predicate; -import java.util.stream.Collectors; /** * The load balancers of this node repo. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 341ab1f785c..e6476cd7373 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -26,7 +26,6 @@ import com.yahoo.vespa.hosted.provision.NodesAndHosts; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.node.IP; -import com.yahoo.vespa.hosted.provision.node.Nodes; import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; @@ -205,7 +204,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { private Map<String, Node> findSharedHosts(NodeList nodeList) { return nodeList.stream() - .filter(node -> Nodes.canAllocateTenantNodeTo(node, true)) + .filter(node -> nodeRepository().nodes().canAllocateTenantNodeTo(node, true)) .filter(node -> node.reservedTo().isEmpty()) .filter(node -> node.exclusiveToApplicationId().isEmpty()) .collect(Collectors.toMap(Node::hostname, Function.identity())); @@ -298,7 +297,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { int wantedGroups = 1; NodePrioritizer prioritizer = new NodePrioritizer(nodesAndHosts, applicationId, clusterSpec, nodeSpec, wantedGroups, - true, nodeRepository().nameResolver(), nodeRepository().resourcesCalculator(), + true, nodeRepository().nameResolver(), nodeRepository().nodes(), nodeRepository().resourcesCalculator(), nodeRepository().spareCount()); List<NodeCandidate> nodeCandidates = prioritizer.collect(List.of()); MutableInteger index = new MutableInteger(0); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 6133705ed59..3274f12dbc6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -42,6 +42,8 @@ import java.util.stream.Collectors; public class FailedExpirer extends NodeRepositoryMaintainer { private static final Logger log = Logger.getLogger(FailedExpirer.class.getName()); + // Try recycling nodes until reaching this many failures + private static final int maxAllowedFailures = 50; private final NodeRepository nodeRepository; private final Duration statefulExpiry; // Stateful nodes: Grace period to allow recovery of data @@ -85,11 +87,11 @@ public class FailedExpirer extends NodeRepositoryMaintainer { recycle(nodesToRecycle); } - /** Move eligible nodes to dirty. This may be a subset of the given nodes */ + /** Move eligible nodes to dirty or parked. This may be a subset of the given nodes */ private void recycle(List<Node> nodes) { List<Node> nodesToRecycle = new ArrayList<>(); for (Node candidate : nodes) { - if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) { + if (broken(candidate)) { List<String> unparkedChildren = !candidate.type().isHost() ? List.of() : nodeRepository.nodes().list() .childrenOf(candidate) @@ -98,7 +100,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { if (unparkedChildren.isEmpty()) { nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer, - "Parked by FailedExpirer due to hardware issue"); + "Parked by FailedExpirer due to hardware issue or high fail count"); } else { log.info(String.format("Expired failed node %s with hardware issue was not parked because of " + "unparked children: %s", candidate.hostname(), @@ -111,4 +113,10 @@ public class FailedExpirer extends NodeRepositoryMaintainer { nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer"); } + /** Returns whether node is broken and cannot be recycled */ + private boolean broken(Node node) { + return NodeFailer.hasHardwareIssue(node, nodeRepository) || + (node.type().isHost() && node.status().failCount() >= maxAllowedFailures); + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java index 4f913bb55dd..a1f36a4f1a5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureVersions.java @@ -88,7 +88,6 @@ public class InfrastructureVersions { case controllerhost: case proxyhost: case host: - case devhost: break; default: throw new IllegalArgumentException("Target version for type " + nodeType + " is not allowed"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index ca14a1be4c4..636884cef0a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -22,7 +22,6 @@ import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.ClusterId; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.persistence.CacheStats; -import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.service.monitor.ServiceModel; import com.yahoo.vespa.service.monitor.ServiceMonitor; @@ -47,20 +46,17 @@ public class MetricsReporter extends NodeRepositoryMaintainer { private final Set<Pair<Metric.Context, String>> nonZeroMetrics = new HashSet<>(); private final Metric metric; - private final Orchestrator orchestrator; private final ServiceMonitor serviceMonitor; private final Map<Map<String, String>, Metric.Context> contextMap = new HashMap<>(); private final Supplier<Integer> pendingRedeploymentsSupplier; MetricsReporter(NodeRepository nodeRepository, Metric metric, - Orchestrator orchestrator, ServiceMonitor serviceMonitor, Supplier<Integer> pendingRedeploymentsSupplier, Duration interval) { super(nodeRepository, interval, metric); this.metric = metric; - this.orchestrator = orchestrator; this.serviceMonitor = serviceMonitor; this.pendingRedeploymentsSupplier = pendingRedeploymentsSupplier; } @@ -201,7 +197,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context); metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context); - metric.set("failReport", NodeFailer.reasonsToFailParentHost(node).isEmpty() ? 0 : 1, context); + metric.set("failReport", NodeFailer.reasonsToFailHost(node).isEmpty() ? 0 : 1, context); if (node.type().isHost()) { metric.set("wantToEncrypt", node.reports().getReport("wantToEncrypt").isPresent() ? 1 : 0, context); @@ -212,7 +208,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { serviceModel.getApplication(hostname) .map(ApplicationInstance::reference) - .map(reference -> orchestrator.getHostInfo(reference, hostname)) + .map(reference -> nodeRepository().orchestrator().getHostInfo(reference, hostname)) .ifPresent(info -> { int suspended = info.status().isSuspended() ? 1 : 0; metric.set("suspended", suspended, context); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 00881f5e2a8..a1916d7dc20 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TransientException; import com.yahoo.jdisc.Metric; import com.yahoo.transaction.Mutex; -import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; @@ -15,17 +14,16 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; -import com.yahoo.vespa.orchestrator.HostNameNotFoundException; -import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.time.Instant; -import java.util.HashMap; +import java.util.Collection; +import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; @@ -40,7 +38,6 @@ import java.util.stream.Collectors; public class NodeFailer extends NodeRepositoryMaintainer { private static final Logger log = Logger.getLogger(NodeFailer.class.getName()); - private static final Duration nodeRequestInterval = Duration.ofMinutes(10); /** Metric for number of hosts that we want to fail, but cannot due to throttling */ static final String throttledHostFailuresMetric = "throttledHostFailures"; @@ -53,20 +50,17 @@ public class NodeFailer extends NodeRepositoryMaintainer { private final Deployer deployer; private final Duration downTimeLimit; - private final Orchestrator orchestrator; - private final Instant constructionTime; + private final Duration suspendedDownTimeLimit; private final ThrottlePolicy throttlePolicy; private final Metric metric; public NodeFailer(Deployer deployer, NodeRepository nodeRepository, - Duration downTimeLimit, Duration interval, Orchestrator orchestrator, - ThrottlePolicy throttlePolicy, Metric metric) { + Duration downTimeLimit, Duration interval, ThrottlePolicy throttlePolicy, Metric metric) { // check ping status every interval, but at least twice as often as the down time limit super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric); this.deployer = deployer; this.downTimeLimit = downTimeLimit; - this.orchestrator = orchestrator; - this.constructionTime = nodeRepository.clock().instant(); + this.suspendedDownTimeLimit = downTimeLimit.multipliedBy(4); // Allow more downtime when a node is suspended this.throttlePolicy = throttlePolicy; this.metric = metric; } @@ -82,38 +76,34 @@ public class NodeFailer extends NodeRepositoryMaintainer { // Ready nodes try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { - for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) { + for (FailingNode failing : findReadyFailingNodes()) { attempts++; - Node node = entry.getKey(); - if (throttle(node)) { + if (throttle(failing.node())) { failures++; - if (node.type().isHost()) + if (failing.node().type().isHost()) throttledHostFailures++; else throttledNodeFailures++; continue; } - String reason = entry.getValue(); - nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason); + nodeRepository().nodes().fail(failing.node().hostname(), Agent.NodeFailer, failing.reason()); } } // Active nodes - for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) { + for (FailingNode failing : findActiveFailingNodes()) { attempts++; - Node node = entry.getKey(); - if (!failAllowedFor(node.type())) continue; + if (!failAllowedFor(failing.node().type())) continue; - if (throttle(node)) { + if (throttle(failing.node())) { failures++; - if (node.type().isHost()) + if (failing.node().type().isHost()) throttledHostFailures++; else throttledNodeFailures++; continue; } - String reason = entry.getValue(); - failActive(node, reason); + failActive(failing); } // Active hosts @@ -143,60 +133,54 @@ public class NodeFailer extends NodeRepositoryMaintainer { return asSuccessFactor(attempts, failures); } - private Map<Node, String> getReadyNodesByFailureReason() { - Instant oldestAcceptableRequestTime = - // Allow requests some time to be registered in case all config servers have been down - constructionTime.isAfter(clock().instant().minus(nodeRequestInterval.multipliedBy(2))) ? - Instant.EPOCH : - - // Nodes are taken as dead if they have not made a config request since this instant. - // Add 10 minutes to the down time limit to allow nodes to make a request that infrequently. - clock().instant().minus(downTimeLimit).minus(nodeRequestInterval); - - Map<Node, String> nodesByFailureReason = new HashMap<>(); + private Collection<FailingNode> findReadyFailingNodes() { + Set<FailingNode> failingNodes = new HashSet<>(); for (Node node : nodeRepository().nodes().list(Node.State.ready)) { Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); - List<String> failureReports = reasonsToFailParentHost(hostNode); + List<String> failureReports = reasonsToFailHost(hostNode); if (failureReports.size() > 0) { if (hostNode.equals(node)) { - nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); + failingNodes.add(new FailingNode(node, "Host has failure reports: " + failureReports)); } else { - nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); + failingNodes.add(new FailingNode(node, "Parent (" + hostNode + ") has failure reports: " + failureReports)); } } } - return nodesByFailureReason; + return failingNodes; } - private Map<Node, String> getActiveNodesByFailureReason() { + private Collection<FailingNode> findActiveFailingNodes() { + Set<FailingNode> failingNodes = new HashSet<>(); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); - Instant graceTimeEnd = clock().instant().minus(downTimeLimit); - Map<Node, String> nodesByFailureReason = new HashMap<>(); + for (Node node : activeNodes) { - if (node.history().hasEventBefore(History.Event.Type.down, graceTimeEnd) && ! applicationSuspended(node)) { + Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); + if (node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { // Allow a grace period after node re-activation - if ( ! node.history().hasEventAfter(History.Event.Type.activated, graceTimeEnd)) - nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); + if (!node.history().hasEventAfter(History.Event.Type.activated, graceTimeStart)) + failingNodes.add(new FailingNode(node, "Node has been down longer than " + downTimeLimit)); } - else if (hostSuspended(node, activeNodes)) { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); - if (hostNode.type().isHost()) { - List<String> failureReports = reasonsToFailParentHost(hostNode); - if (failureReports.size() > 0) { - if (hostNode.equals(node)) { - nodesByFailureReason.put(node, "Host has failure reports: " + failureReports); - } else { - nodesByFailureReason.put(node, "Parent (" + hostNode + ") has failure reports: " + failureReports); - } + } + + for (Node node : activeNodes) { + if (allSuspended(node, activeNodes)) { + Node host = node.parentHostname().flatMap(parent -> nodeRepository().nodes().node(parent)).orElse(node); + if (host.type().isHost()) { + List<String> failureReports = reasonsToFailHost(host); + if ( ! failureReports.isEmpty()) { + failingNodes.add(new FailingNode(node, host.equals(node) ? + "Host has failure reports: " + failureReports : + "Parent " + host + " has failure reports: " + failureReports)); } } } } - return nodesByFailureReason; + + return failingNodes; } - public static List<String> reasonsToFailParentHost(Node hostNode) { - return hostNode.reports().getReports().stream() + public static List<String> reasonsToFailHost(Node host) { + return host.reports().getReports().stream() .filter(report -> report.getType().hostShouldBeFailed()) // The generated string is built from the report's ID, created time, and description only. .map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription()) @@ -205,37 +189,28 @@ public class NodeFailer extends NodeRepositoryMaintainer { /** Returns whether node has any kind of hardware issue */ static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node); - return reasonsToFailParentHost(hostNode).size() > 0; + Node host = node.parentHostname().flatMap(parent -> nodeRepository.nodes().node(parent)).orElse(node); + return reasonsToFailHost(host).size() > 0; } private boolean applicationSuspended(Node node) { try { - return orchestrator.getApplicationInstanceStatus(node.allocation().get().owner()) + return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner()) == ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN; } catch (ApplicationIdNotFoundException e) { - //Treat it as not suspended and allow to fail the node anyway - return false; - } - } - - private boolean nodeSuspended(Node node) { - try { - return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended(); - } catch (HostNameNotFoundException e) { - // Treat it as not suspended + // Treat it as not suspended and allow to fail the node anyway return false; } } /** Is the node and all active children suspended? */ - private boolean hostSuspended(Node node, NodeList activeNodes) { - if (!nodeSuspended(node)) return false; + private boolean allSuspended(Node node, NodeList activeNodes) { + if (!nodeRepository().nodes().suspended(node)) return false; if (node.parentHostname().isPresent()) return true; // optimization return activeNodes.stream() .filter(childNode -> childNode.parentHostname().isPresent() && childNode.parentHostname().get().equals(node.hostname())) - .allMatch(this::nodeSuspended); + .allMatch(nodeRepository().nodes()::suspended); } /** @@ -264,40 +239,40 @@ public class NodeFailer extends NodeRepositoryMaintainer { * * @return whether node was successfully failed */ - private boolean failActive(Node node, String reason) { + private boolean failActive(FailingNode failing) { Optional<Deployment> deployment = - deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30)); + deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30)); if (deployment.isEmpty()) return false; - try (Mutex lock = nodeRepository().nodes().lock(node.allocation().get().owner())) { + try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) { // If the active node that we are trying to fail is of type host, we need to successfully fail all // the children nodes running on it before we fail the host boolean allTenantNodesFailedOutSuccessfully = true; - String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + reason; - for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(node)) { + String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason(); + for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) { if (failingTenantNode.state() == Node.State.active) { - allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure); + allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure)); } else { nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } if (! allTenantNodesFailedOutSuccessfully) return false; - wantToFail(node, true, lock); + wantToFail(failing.node(), true, lock); try { deployment.get().activate(); return true; } catch (TransientException e) { - log.log(Level.INFO, "Failed to redeploy " + node.allocation().get().owner() + + log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e)); return true; } catch (RuntimeException e) { // Reset want to fail: We'll retry failing unless it heals in the meantime - nodeRepository().nodes().node(node.hostname()) + nodeRepository().nodes().node(failing.node().hostname()) .ifPresent(n -> wantToFail(n, false, lock)); - log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() + - " for " + reason + ": " + Exceptions.toMessageString(e)); + log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); return false; } } @@ -359,4 +334,30 @@ public class NodeFailer extends NodeRepositoryMaintainer { } + private static class FailingNode { + + private final Node node; + private final String reason; + + public FailingNode(Node node, String reason) { + this.node = node; + this.reason = reason; + } + + public Node node() { return node; } + public String reason() { return reason; } + + @Override + public boolean equals(Object other) { + if ( ! (other instanceof FailingNode)) return false; + return ((FailingNode)other).node().equals(this.node()); + } + + @Override + public int hashCode() { + return node.hashCode(); + } + + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java index 57db874fb84..552db84748d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java @@ -74,6 +74,8 @@ public abstract class NodeMover<MOVE> extends NodeRepositoryMaintainer { if (deployedRecently(applicationId)) continue; for (HostWithResources toHost : hostResources) { if (toHost.node.hostname().equals(node.parentHostname().get())) continue; + if (toHost.node.reservedTo().isPresent() && + !toHost.node.reservedTo().get().equals(applicationId.tenant())) continue; // Reserved to a different tenant if (spares.contains(toHost.node)) continue; // Do not offer spares as a valid move as they are reserved for replacement of failed nodes if ( ! toHost.hasCapacity(node.resources())) continue; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 2f200032492..15decde0d7c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -14,7 +14,6 @@ import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.autoscale.MetricsFetcher; import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider; -import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.service.monitor.ServiceMonitor; import java.time.Duration; @@ -35,7 +34,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { @Inject public NodeRepositoryMaintenance(NodeRepository nodeRepository, Deployer deployer, InfraDeployer infraDeployer, HostLivenessTracker hostLivenessTracker, ServiceMonitor serviceMonitor, - Zone zone, Orchestrator orchestrator, Metric metric, + Zone zone, Metric metric, ProvisionServiceProvider provisionServiceProvider, FlagSource flagSource, MetricsFetcher metricsFetcher) { DefaultTimes defaults = new DefaultTimes(zone, deployer); @@ -46,11 +45,11 @@ public class NodeRepositoryMaintenance extends AbstractComponent { maintainers.add(periodicApplicationMaintainer); maintainers.add(infrastructureProvisioner); - maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, orchestrator, defaults.throttlePolicy, metric)); + maintainers.add(new NodeFailer(deployer, nodeRepository, defaults.failGrace, defaults.nodeFailerInterval, defaults.throttlePolicy, metric)); maintainers.add(new NodeHealthTracker(hostLivenessTracker, serviceMonitor, nodeRepository, defaults.nodeFailureStatusUpdateInterval, metric)); maintainers.add(new ExpeditedChangeApplicationMaintainer(deployer, metric, nodeRepository, defaults.expeditedChangeRedeployInterval)); maintainers.add(new ReservationExpirer(nodeRepository, defaults.reservationExpiry, metric)); - maintainers.add(new RetiredExpirer(nodeRepository, orchestrator, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry)); + maintainers.add(new RetiredExpirer(nodeRepository, deployer, metric, defaults.retiredInterval, defaults.retiredExpiry)); maintainers.add(new InactiveExpirer(nodeRepository, defaults.inactiveExpiry, Map.of(NodeType.config, defaults.inactiveConfigServerExpiry, NodeType.controller, defaults.inactiveControllerExpiry), metric)); @@ -58,7 +57,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { maintainers.add(new DirtyExpirer(nodeRepository, defaults.dirtyExpiry, metric)); maintainers.add(new ProvisionedExpirer(nodeRepository, defaults.provisionedExpiry, metric)); maintainers.add(new NodeRebooter(nodeRepository, flagSource, metric)); - maintainers.add(new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval)); + maintainers.add(new MetricsReporter(nodeRepository, metric, serviceMonitor, periodicApplicationMaintainer::pendingDeployments, defaults.metricsInterval)); maintainers.add(new SpareCapacityMaintainer(deployer, nodeRepository, metric, defaults.spareCapacityMaintenanceInterval)); maintainers.add(new OsUpgradeActivator(nodeRepository, defaults.osUpgradeActivatorInterval, metric)); maintainers.add(new Rebalancer(deployer, nodeRepository, metric, defaults.rebalancerInterval)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index a7ba8b27851..73c9a1ab55a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -1,7 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.maintenance; -import com.google.common.util.concurrent.UncheckedTimeoutException; +import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.jdisc.Metric; @@ -11,7 +11,6 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.OrchestrationException; -import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.yolean.Exceptions; import java.time.Duration; @@ -31,11 +30,9 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { private final Deployer deployer; private final Metric metric; - private final Orchestrator orchestrator; private final Duration retiredExpiry; public RetiredExpirer(NodeRepository nodeRepository, - Orchestrator orchestrator, Deployer deployer, Metric metric, Duration maintenanceInterval, @@ -43,7 +40,6 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { super(nodeRepository, maintenanceInterval, metric); this.deployer = deployer; this.metric = metric; - this.orchestrator = orchestrator; this.retiredExpiry = retiredExpiry; } @@ -126,7 +122,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { } try { - orchestrator.acquirePermissionToRemove(new HostName(node.hostname())); + nodeRepository().orchestrator().acquirePermissionToRemove(new HostName(node.hostname())); log.info("Node " + node + " has been granted permission to be removed"); return true; } catch (UncheckedTimeoutException e) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java index 88a62c94f43..ac24c83e129 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/NodeAcl.java @@ -109,10 +109,9 @@ public class NodeAcl { case proxy: // Proxy nodes trust: // - config servers - // - all connections from the world on 4080 (insecure tb removed), and 4443 + // - all connections from the world on 443 (production traffic) and 4443 (health checks) trustedNodes.addAll(allNodes.nodeType(NodeType.config).asList()); trustedPorts.add(443); - trustedPorts.add(4080); trustedPorts.add(4443); break; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 7f57ec219ae..57a3b436e37 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -10,6 +10,7 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.Zone; import com.yahoo.transaction.Mutex; import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.LockedNodeList; import com.yahoo.vespa.hosted.provision.NoSuchNodeException; import com.yahoo.vespa.hosted.provision.Node; @@ -18,6 +19,8 @@ import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer; import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; +import com.yahoo.vespa.orchestrator.HostNameNotFoundException; +import com.yahoo.vespa.orchestrator.Orchestrator; import java.time.Clock; import java.time.Duration; @@ -53,14 +56,16 @@ public class Nodes { private static final Logger log = Logger.getLogger(Nodes.class.getName()); + private final CuratorDatabaseClient db; private final Zone zone; private final Clock clock; - private final CuratorDatabaseClient db; + private final Orchestrator orchestrator; - public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock) { + public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator) { this.zone = zone; this.clock = clock; this.db = db; + this.orchestrator = orchestrator; } /** Read and write all nodes to make sure they are stored in the latest version of the serialized format */ @@ -474,7 +479,7 @@ public class Nodes { if (node.state() == Node.State.ready) return node; Node parentHost = node.parentHostname().flatMap(this::node).orElse(node); - List<String> failureReasons = NodeFailer.reasonsToFailParentHost(parentHost); + List<String> failureReasons = NodeFailer.reasonsToFailHost(parentHost); if ( ! failureReasons.isEmpty()) illegal(node + " cannot be readied because it has hard failures: " + failureReasons); @@ -728,10 +733,11 @@ public class Nodes { return canAllocateTenantNodeTo(host, zone.getCloud().dynamicProvisioning()); } - public static boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) { + public boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) { if ( ! host.type().canRun(NodeType.tenant)) return false; if (host.status().wantToRetire()) return false; if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false; + if (suspended(host)) return false; if (dynamicProvisioning) return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state()); @@ -739,6 +745,15 @@ public class Nodes { return host.state() == Node.State.active; } + public boolean suspended(Node node) { + try { + return orchestrator.getNodeStatus(new HostName(node.hostname())).isSuspended(); + } catch (HostNameNotFoundException e) { + // Treat it as not suspended + return false; + } + } + /** Create a lock which provides exclusive rights to making changes to the given application */ // TODO: Move to Applications public Mutex lock(ApplicationId application) { @@ -819,6 +834,7 @@ public class Nodes { private static boolean parkOnDeallocationOf(Node node, Agent agent) { if (node.state() == Node.State.parked) return false; if (agent == Agent.operator) return false; + if (!node.type().isHost() && node.status().wantToDeprovision()) return false; boolean retirementRequestedByOperator = node.status().wantToRetire() && node.history().event(History.Event.Type.wantToRetire) .map(History.Event::agent) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index ba28d8e6b9a..379bb2566df 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -1,8 +1,8 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.provision.persistence; -import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.component.Version; +import com.yahoo.concurrent.UncheckedTimeoutException; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationLockException; import com.yahoo.config.provision.ApplicationTransaction; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index 543972a9cb3..cd1b786afd1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -554,7 +554,6 @@ public class NodeSerializer { case "confighost": return NodeType.confighost; case "controller": return NodeType.controller; case "controllerhost": return NodeType.controllerhost; - case "devhost": return NodeType.devhost; default : throw new IllegalArgumentException("Unknown node type '" + typeString + "'"); } } @@ -569,7 +568,6 @@ public class NodeSerializer { case confighost: return "confighost"; case controller: return "controller"; case controllerhost: return "controllerhost"; - case devhost: return "devhost"; } throw new IllegalArgumentException("Serialized form of '" + type + "' not defined"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java index 8c358301b85..3da0506f2e1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java @@ -45,8 +45,8 @@ class Activator { /** Activate required resources for application guarded by given lock */ public void activate(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) { - activateNodes(hosts, generation, transaction); - activateLoadBalancers(hosts, transaction); + NodeList newActive = activateNodes(hosts, generation, transaction); + activateLoadBalancers(hosts, newActive, transaction); } /** @@ -62,8 +62,9 @@ class Activator { * @param generation the application config generation that is activated * @param transaction transaction with operations to commit together with any operations done within the repository, * while holding the node repository lock on this application + * @return the nodes that will be active when transaction is committed */ - private void activateNodes(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) { + private NodeList activateNodes(Collection<HostSpec> hosts, long generation, ApplicationTransaction transaction) { Instant activationTime = nodeRepository.clock().instant(); // Use one timestamp for all activation changes ApplicationId application = transaction.application(); Set<String> hostnames = hosts.stream().map(HostSpec::hostname).collect(Collectors.toSet()); @@ -95,6 +96,7 @@ class Activator { oldActive.not().retired(), newActive.not().retired()); unreserveParentsOf(reserved); + return newActive; } private void deactivate(NodeList toDeactivate, ApplicationTransaction transaction) { @@ -149,8 +151,8 @@ class Activator { } /** Activate load balancers */ - private void activateLoadBalancers(Collection<HostSpec> hosts, ApplicationTransaction transaction) { - loadBalancerProvisioner.ifPresent(provisioner -> provisioner.activate(allClustersOf(hosts), transaction)); + private void activateLoadBalancers(Collection<HostSpec> hosts, NodeList newActive, ApplicationTransaction transaction) { + loadBalancerProvisioner.ifPresent(provisioner -> provisioner.activate(allClustersOf(hosts), newActive, transaction)); } private static Set<ClusterSpec> allClustersOf(Collection<HostSpec> hosts) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java index 4088d717a67..290a3f8f947 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/CapacityPolicies.java @@ -7,7 +7,6 @@ import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.NodeResources; -import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.flags.PermanentFlags; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -72,10 +71,6 @@ public class CapacityPolicies { public NodeResources defaultNodeResources(ClusterSpec.Type clusterType) { if (clusterType == ClusterSpec.Type.admin) { - if (zone.system() == SystemName.dev) { - // Use small logserver in dev system - return new NodeResources(0.1, 1, 10, 0.3); - } return zone.getCloud().dynamicProvisioning() && ! sharedHosts.apply(clusterType) ? new NodeResources(0.5, 4, 50, 0.3) : new NodeResources(0.5, 2, 50, 0.3); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java index 2d93763c631..ae65f367684 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java @@ -149,6 +149,7 @@ public class GroupPreparer { wantedGroups, nodeRepository.zone().getCloud().dynamicProvisioning(), nodeRepository.nameResolver(), + nodeRepository.nodes(), nodeRepository.resourcesCalculator(), nodeRepository.spareCount()); allocation.offer(prioritizer.collect(surplusActiveNodes)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java index 5ff78c53f8a..04f084dd079 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java @@ -100,11 +100,11 @@ public class LoadBalancerProvisioner { * * Calling this when no load balancer has been prepared for given cluster is a no-op. */ - public void activate(Set<ClusterSpec> clusters, ApplicationTransaction transaction) { + public void activate(Set<ClusterSpec> clusters, NodeList newActive, ApplicationTransaction transaction) { Set<ClusterSpec.Id> activatingClusters = clusters.stream() .map(LoadBalancerProvisioner::effectiveId) .collect(Collectors.toSet()); - for (var cluster : loadBalancedClustersOf(transaction.application()).entrySet()) { + for (var cluster : loadBalancedClustersOf(newActive).entrySet()) { if (!activatingClusters.contains(cluster.getKey())) continue; Node clusterNode = cluster.getValue().first().get(); @@ -232,12 +232,13 @@ public class LoadBalancerProvisioner { /** Returns the nodes allocated to the given load balanced cluster */ private NodeList nodesOf(ClusterSpec.Id loadBalancedCluster, ApplicationId application) { - return loadBalancedClustersOf(application).getOrDefault(loadBalancedCluster, NodeList.copyOf(List.of())); + NodeList nodes = nodeRepository.nodes().list(Node.State.reserved, Node.State.active) + .owner(application); + return loadBalancedClustersOf(nodes).getOrDefault(loadBalancedCluster, NodeList.of()); } /** Returns the load balanced clusters of given application and their nodes */ - private Map<ClusterSpec.Id, NodeList> loadBalancedClustersOf(ApplicationId application) { - NodeList nodes = nodeRepository.nodes().list(Node.State.reserved, Node.State.active).owner(application); + private Map<ClusterSpec.Id, NodeList> loadBalancedClustersOf(NodeList nodes) { if (nodes.stream().anyMatch(node -> node.type() == NodeType.config)) { nodes = nodes.nodeType(NodeType.config).type(ClusterSpec.Type.admin); } else if (nodes.stream().anyMatch(node -> node.type() == NodeType.controller)) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java index 85a43e38e07..fe4eb5d68c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java @@ -30,13 +30,14 @@ import java.util.stream.Collectors; */ public class NodePrioritizer { - private final List<NodeCandidate> nodes = new ArrayList<>(); + private final List<NodeCandidate> candidates = new ArrayList<>(); private final NodesAndHosts<LockedNodeList> allNodesAndHosts; private final HostCapacity capacity; private final NodeSpec requestedNodes; private final ApplicationId application; private final ClusterSpec clusterSpec; private final NameResolver nameResolver; + private final Nodes nodes; private final boolean dynamicProvisioning; /** Whether node specification allows new nodes to be allocated. */ private final boolean canAllocateNew; @@ -46,7 +47,7 @@ public class NodePrioritizer { private final Set<Node> spareHosts; public NodePrioritizer(NodesAndHosts<LockedNodeList> allNodesAndHosts, ApplicationId application, ClusterSpec clusterSpec, NodeSpec nodeSpec, - int wantedGroups, boolean dynamicProvisioning, NameResolver nameResolver, + int wantedGroups, boolean dynamicProvisioning, NameResolver nameResolver, Nodes nodes, HostResourcesCalculator hostResourcesCalculator, int spareCount) { this.allNodesAndHosts = allNodesAndHosts; this.capacity = new HostCapacity(this.allNodesAndHosts, hostResourcesCalculator); @@ -58,6 +59,7 @@ public class NodePrioritizer { capacity.findSpareHostsInDynamicallyProvisionedZones(this.allNodesAndHosts.nodes().asList()) : capacity.findSpareHosts(this.allNodesAndHosts.nodes().asList(), spareCount); this.nameResolver = nameResolver; + this.nodes = nodes; NodeList nodesInCluster = this.allNodesAndHosts.nodes().owner(application).type(clusterSpec.type()).cluster(clusterSpec.id()); NodeList nonRetiredNodesInCluster = nodesInCluster.not().retired(); @@ -95,12 +97,12 @@ public class NodePrioritizer { /** Returns the list of nodes sorted by {@link NodeCandidate#compareTo(NodeCandidate)} */ private List<NodeCandidate> prioritize() { // Group candidates by their switch hostname - Map<String, List<NodeCandidate>> candidatesBySwitch = this.nodes.stream() + Map<String, List<NodeCandidate>> candidatesBySwitch = this.candidates.stream() .collect(Collectors.groupingBy(candidate -> candidate.parent.orElseGet(candidate::toNode) .switchHostname() .orElse(""))); // Mark lower priority nodes on shared switch as non-exclusive - List<NodeCandidate> nodes = new ArrayList<>(this.nodes.size()); + List<NodeCandidate> nodes = new ArrayList<>(this.candidates.size()); for (var clusterSwitch : candidatesBySwitch.keySet()) { List<NodeCandidate> switchCandidates = candidatesBySwitch.get(clusterSwitch); if (clusterSwitch.isEmpty()) { @@ -126,7 +128,7 @@ public class NodePrioritizer { for (Node node : surplusNodes) { NodeCandidate candidate = candidateFrom(node, true); if (!candidate.violatesSpares || canAllocateToSpareHosts) { - nodes.add(candidate); + candidates.add(candidate); } } } @@ -136,7 +138,7 @@ public class NodePrioritizer { if ( !canAllocateNew) return; for (Node host : allNodesAndHosts.nodes()) { - if ( ! Nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue; + if ( ! nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue; if (host.reservedTo().isPresent() && !host.reservedTo().get().equals(application.tenant())) continue; if (host.reservedTo().isPresent() && application.instance().isTester()) continue; if (host.exclusiveToApplicationId().isPresent()) continue; // Never allocate new nodes to exclusive hosts @@ -144,7 +146,7 @@ public class NodePrioritizer { if (spareHosts.contains(host) && !canAllocateToSpareHosts) continue; if ( ! capacity.hasCapacity(host, requestedNodes.resources().get())) continue; if ( ! allNodesAndHosts.childrenOf(host).owner(application).cluster(clusterSpec.id()).isEmpty()) continue; - nodes.add(NodeCandidate.createNewChild(requestedNodes.resources().get(), + candidates.add(NodeCandidate.createNewChild(requestedNodes.resources().get(), capacity.availableCapacityOf(host), host, spareHosts.contains(host), @@ -164,7 +166,7 @@ public class NodePrioritizer { .filter(node -> node.allocation().get().membership().cluster().id().equals(clusterSpec.id())) .filter(node -> node.state() == Node.State.active || canStillAllocate(node)) .map(node -> candidateFrom(node, false)) - .forEach(nodes::add); + .forEach(candidates::add); } /** Add nodes already provisioned, but not allocated to any application */ @@ -174,7 +176,7 @@ public class NodePrioritizer { .filter(node -> node.state() == Node.State.ready) .map(node -> candidateFrom(node, false)) .filter(n -> !n.violatesSpares || canAllocateToSpareHosts) - .forEach(nodes::add); + .forEach(candidates::add); } /** Create a candidate from given pre-existing node */ @@ -218,7 +220,7 @@ public class NodePrioritizer { private boolean canStillAllocate(Node node) { if (node.type() != NodeType.tenant || node.parentHostname().isEmpty()) return true; Optional<Node> parent = allNodesAndHosts.parentOf(node); - return parent.isPresent() ? Nodes.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning) : null; + return parent.isPresent() && nodes.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java index d5dbe08dca9..310f921367e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeResourceLimits.java @@ -71,7 +71,6 @@ public class NodeResourceLimits { } private double minAdvertisedMemoryGb(ClusterSpec.Type clusterType) { - if (zone().system() == SystemName.dev) return 1; // Allow small containers in dev system if (clusterType == ClusterSpec.Type.admin) return 1; return 4; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java index aa6209ae80d..1ba686772c7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LoadBalancersV1ApiHandler.java @@ -2,7 +2,7 @@ package com.yahoo.vespa.hosted.provision.restapi; import com.yahoo.container.jdisc.HttpResponse; -import com.yahoo.container.jdisc.LoggingRequestHandler; +import com.yahoo.container.jdisc.ThreadedHttpRequestHandler; import com.yahoo.restapi.RestApi; import com.yahoo.restapi.RestApiRequestHandler; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -18,7 +18,7 @@ public class LoadBalancersV1ApiHandler extends RestApiRequestHandler<LoadBalance private final NodeRepository nodeRepository; @Inject - public LoadBalancersV1ApiHandler(LoggingRequestHandler.Context parentCtx, NodeRepository nodeRepository) { + public LoadBalancersV1ApiHandler(ThreadedHttpRequestHandler.Context parentCtx, NodeRepository nodeRepository) { super(parentCtx, LoadBalancersV1ApiHandler::createRestApiDefinition); this.nodeRepository = nodeRepository; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java index be011c886a5..6282c072001 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java @@ -53,7 +53,6 @@ public class NodeSerializer { case "confighost": return NodeType.confighost; case "controller": return NodeType.controller; case "controllerhost": return NodeType.controllerhost; - case "devhost": return NodeType.devhost; default: throw new IllegalArgumentException("Unknown node type '" + nodeType + "'"); } } @@ -68,7 +67,6 @@ public class NodeSerializer { case confighost: return "confighost"; case controller: return "controller"; case controllerhost: return "controllerhost"; - case devhost: return "devhost"; default: throw new IllegalArgumentException("Unknown node type '" + type.name() + "'"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 15e1061f5e1..1304b85be6b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -11,7 +11,7 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TenantName; import com.yahoo.container.jdisc.HttpRequest; import com.yahoo.container.jdisc.HttpResponse; -import com.yahoo.container.jdisc.LoggingRequestHandler; +import com.yahoo.container.jdisc.ThreadedHttpRequestHandler; import com.yahoo.io.IOUtils; import com.yahoo.restapi.ErrorResponse; import com.yahoo.restapi.MessageResponse; @@ -67,7 +67,7 @@ import static com.yahoo.slime.SlimeUtils.optionalString; * * @author bratseth */ -public class NodesV2ApiHandler extends LoggingRequestHandler { +public class NodesV2ApiHandler extends ThreadedHttpRequestHandler { private final Orchestrator orchestrator; private final NodeRepository nodeRepository; @@ -75,7 +75,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { private final NodeFlavors nodeFlavors; @Inject - public NodesV2ApiHandler(LoggingRequestHandler.Context parentCtx, Orchestrator orchestrator, + public NodesV2ApiHandler(ThreadedHttpRequestHandler.Context parentCtx, Orchestrator orchestrator, NodeRepository nodeRepository, MetricsDb metricsDb, NodeFlavors flavors) { super(parentCtx); this.orchestrator = orchestrator; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index 1a2d5294aa5..ff406efdc39 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -70,6 +70,7 @@ public class MockNodeRepository extends NodeRepository { Optional.empty(), new InMemoryFlagSource(), new MemoryMetricsDb(Clock.fixed(Instant.ofEpochMilli(123), ZoneId.of("Z"))), + new OrchestratorMock(), true, 0, 1000); this.flavors = flavors; |