diff options
Diffstat (limited to 'node-repository/src/main/java/com')
26 files changed, 421 insertions, 237 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java index 085b89d1253..1460ce70686 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java @@ -26,16 +26,23 @@ import java.util.Set; */ public class NodeRepoStats { + private final double totalCost; + private final double totalAllocatedCost; private final Load load; private final Load activeLoad; private final List<ApplicationStats> applicationStats; - private NodeRepoStats(Load load, Load activeLoad, List<ApplicationStats> applicationStats) { + private NodeRepoStats(double totalCost, double totalAllocatedCost, Load load, Load activeLoad, List<ApplicationStats> applicationStats) { + this.totalCost = totalCost; + this.totalAllocatedCost = totalAllocatedCost; this.load = load; this.activeLoad = activeLoad; this.applicationStats = List.copyOf(applicationStats); } + public double totalCost() { return totalCost; } + public double totalAllocatedCost() { return totalAllocatedCost; } + /** * Returns the current average work-extracting utilization in this node repo over all nodes. * Capacity not allocated to active nodes are taken to have 0 utilization as it provides no useful work. 
@@ -50,11 +57,15 @@ public class NodeRepoStats { public static NodeRepoStats computeOver(NodeRepository nodeRepository) { NodeList allNodes = nodeRepository.nodes().list(); - List<NodeTimeseries> allNodeTimeseries = nodeRepository.metricsDb().getNodeTimeseries(Duration.ofHours(1), Set.of()); + double totalCost = allNodes.hosts().stream().mapToDouble(host -> host.resources().cost()).sum(); + double totalAllocatedCost = allNodes.not().hosts().stream() + .filter(node -> node.allocation().isPresent()) + .mapToDouble(node -> node.resources().cost()).sum(); + List<NodeTimeseries> allNodeTimeseries = nodeRepository.metricsDb().getNodeTimeseries(Duration.ofHours(1), Set.of()); Pair<Load, Load> load = computeLoad(allNodes, allNodeTimeseries); List<ApplicationStats> applicationStats = computeApplicationStats(allNodes, allNodeTimeseries); - return new NodeRepoStats(load.getFirst(), load.getSecond(), applicationStats); + return new NodeRepoStats(totalCost, totalAllocatedCost, load.getFirst(), load.getSecond(), applicationStats); } private static Pair<Load, Load> computeLoad(NodeList allNodes, List<NodeTimeseries> allNodeTimeseries) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java index 88a4b492a0b..4a1545cc66c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java @@ -64,6 +64,33 @@ public class AllocatableClusterResources { this.fulfilment = fulfilment(realResources, idealResources); } + private AllocatableClusterResources(int nodes, + int groups, + NodeResources realResources, + NodeResources advertisedResources, + ClusterSpec clusterSpec, + double fulfilment) { + this.nodes = nodes; + this.groups = groups; + 
this.realResources = realResources; + this.advertisedResources = advertisedResources; + this.clusterSpec = clusterSpec; + this.fulfilment = fulfilment; + } + + /** Returns this with the redundant node or group removed from counts. */ + public AllocatableClusterResources withoutRedundancy() { + int groupSize = nodes / groups; + int nodesAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; + int groupsAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups; + return new AllocatableClusterResources(nodesAdjustedForRedundancy, + groupsAdjustedForRedundancy, + realResources, + advertisedResources, + clusterSpec, + fulfilment); + } + /** * Returns the resources which will actually be available per node in this cluster with this allocation. * These should be used for reasoning about allocation to meet measured demand. @@ -83,11 +110,6 @@ public class AllocatableClusterResources { public int nodes() { return nodes; } public int groups() { return groups; } - public int groupSize() { - // ceil: If the division does not produce a whole number we assume some node is missing - return (int)Math.ceil((double)nodes / groups); - } - public ClusterSpec clusterSpec() { return clusterSpec; } public double cost() { return nodes * advertisedResources.cost(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java index 41fa9499353..2befd69f893 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java @@ -2,7 +2,6 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterResources; -import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import 
com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -20,11 +19,6 @@ public class AllocationOptimizer { private static final int minimumNodes = 2; // Since this number includes redundancy it cannot be lower than 2 private static final int maximumNodes = 150; - // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component - // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. - // TODO: Measure this, and only take it into account with queries - private static final double fixedCpuCostFraction = 0.1; - private final NodeRepository nodeRepository; public AllocationOptimizer(NodeRepository nodeRepository) { @@ -32,13 +26,13 @@ public class AllocationOptimizer { } /** - * Searches the space of possible allocations given a target + * Searches the space of possible allocations given a target relative load * and (optionally) cluster limits and returns the best alternative. * * @return the best allocation, if there are any possible legal allocations, fulfilling the target * fully or partially, within the limits */ - public Optional<AllocatableClusterResources> findBestAllocation(ResourceTarget target, + public Optional<AllocatableClusterResources> findBestAllocation(Load targetLoad, AllocatableClusterResources current, ClusterModel clusterModel, Limits limits) { @@ -53,18 +47,11 @@ public class AllocationOptimizer { for (int groups = limits.min().groups(); groups <= limits.max().groups(); groups++) { for (int nodes = limits.min().nodes(); nodes <= limits.max().nodes(); nodes++) { if (nodes % groups != 0) continue; - int groupSize = nodes / groups; - - // Adjust for redundancy: Node in group if groups = 1, an extra group if multiple groups - // TODO: Make the best choice based on size and redundancy setting instead - int nodesAdjustedForRedundancy = target.adjustForRedundancy() ? (groups == 1 ? 
nodes - 1 : nodes - groupSize) : nodes; - int groupsAdjustedForRedundancy = target.adjustForRedundancy() ? (groups == 1 ? 1 : groups - 1) : groups; ClusterResources next = new ClusterResources(nodes, groups, - nodeResourcesWith(nodesAdjustedForRedundancy, - groupsAdjustedForRedundancy, - limits, target, current, clusterModel)); + nodeResourcesWith(nodes, groups, + limits, targetLoad, current, clusterModel)); var allocatableResources = AllocatableClusterResources.from(next, current.clusterSpec(), limits, hosts, nodeRepository); if (allocatableResources.isEmpty()) continue; @@ -77,42 +64,25 @@ public class AllocationOptimizer { /** * For the observed load this instance is initialized with, returns the resources needed per node to be at - * ideal load given a target node count + * the target relative load, given a target node and group count. */ private NodeResources nodeResourcesWith(int nodes, int groups, Limits limits, - ResourceTarget target, + Load targetLoad, AllocatableClusterResources current, ClusterModel clusterModel) { - double cpu, memory, disk; - int groupSize = nodes / groups; - - if (current.clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content - // Cpu: Query cpu scales with cluster size, write cpu scales with group size - // Memory and disk: Scales with group size - - // The fixed cost portion of cpu does not scale with changes to the node count - double queryCpuPerGroup = fixedCpuCostFraction * target.resources().vcpu() + - (1 - fixedCpuCostFraction) * target.resources().vcpu() * current.groupSize() / groupSize; - double queryCpu = queryCpuPerGroup * current.groups() / groups; - double writeCpu = target.resources().vcpu() * current.groupSize() / groupSize; - cpu = clusterModel.queryCpuFraction() * queryCpu + (1 - clusterModel.queryCpuFraction()) * writeCpu; - memory = target.resources().memoryGb() * current.groupSize() / groupSize; - disk = target.resources().diskGb() * current.groupSize() / groupSize; - } - 
else { - cpu = target.resources().vcpu() * current.nodes() / nodes; - memory = target.resources().memoryGb(); - disk = target.resources().diskGb(); - } + var scaled = targetLoad // redundancy aware target relative to current load + .multiply(clusterModel.loadWith(nodes, groups)) // redundancy aware adjustment with these counts + .divide(clusterModel.redundancyAdjustment()) // correct for double redundancy adjustment + .scaled(current.realResources().nodeResources()); // Combine the scaled resource values computed here // with the currently configured non-scaled values, given in the limits, if any - NodeResources nonScaled = limits.isEmpty() || limits.min().nodeResources().isUnspecified() - ? current.advertisedResources().nodeResources() - : limits.min().nodeResources(); // min=max for non-scaled - return nonScaled.withVcpu(cpu).withMemoryGb(memory).withDiskGb(disk); + var nonScaled = limits.isEmpty() || limits.min().nodeResources().isUnspecified() + ? current.advertisedResources().nodeResources() + : limits.min().nodeResources(); // min=max for non-scaled + return nonScaled.withVcpu(scaled.vcpu()).withMemoryGb(scaled.memoryGb()).withDiskGb(scaled.diskGb()); } /** Returns a copy of the given limits where the minimum nodes are at least the given value when allowed */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index c2e66d39861..7a02fa9eb7e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -26,7 +26,7 @@ public class Autoscaler { /** What cost difference is worth a reallocation? */ private static final double costDifferenceWorthReallocation = 0.1; /** What resource difference is worth a reallocation? 
*/ - private static final double resourceDifferenceWorthReallocation = 0.1; + private static final double resourceDifferenceWorthReallocation = 0.03; private final NodeRepository nodeRepository; private final AllocationOptimizer allocationOptimizer; @@ -61,8 +61,8 @@ public class Autoscaler { private Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) { ClusterModel clusterModel = new ClusterModel(application, - cluster, clusterNodes.clusterSpec(), + cluster, clusterNodes, nodeRepository.metricsDb(), nodeRepository.clock()); @@ -89,10 +89,8 @@ public class Autoscaler { " nodes, but require from " + clusterNodes.size()); var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository); - var target = ResourceTarget.idealLoad(clusterModel, currentAllocation); - Optional<AllocatableClusterResources> bestAllocation = - allocationOptimizer.findBestAllocation(target, currentAllocation, clusterModel, limits); + allocationOptimizer.findBestAllocation(clusterModel.loadAdjustment(), currentAllocation, clusterModel, limits); if (bestAllocation.isEmpty()) return Advice.dontScale(Status.insufficient, "No allocations are possible within configured limits"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index b7a5c1e7fe7..ae18e7ffb91 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -27,14 +27,17 @@ public class ClusterModel { /** Containers typically use more cpu right after generation change, so discard those metrics */ public static final Duration warmupDuration = Duration.ofMinutes(5); - private static final Duration currentLoadDuration = Duration.ofMinutes(5); - static final double idealQueryCpuLoad = 
0.8; static final double idealWriteCpuLoad = 0.95; static final double idealMemoryLoad = 0.65; static final double idealContainerDiskLoad = 0.95; static final double idealContentDiskLoad = 0.6; + // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component + // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. + // TODO: Measure this, and only take it into account with queries + private static final double fixedCpuCostFraction = 0.1; + private final Application application; private final ClusterSpec clusterSpec; private final Cluster cluster; @@ -50,8 +53,8 @@ public class ClusterModel { private Double maxQueryGrowthRate = null; public ClusterModel(Application application, - Cluster cluster, ClusterSpec clusterSpec, + Cluster cluster, NodeList clusterNodes, MetricsDb metricsDb, Clock clock) { @@ -76,7 +79,7 @@ public class ClusterModel { this.application = application; this.clusterSpec = clusterSpec; this.cluster = cluster; - this.nodes = null; + this.nodes = NodeList.of(); this.clock = clock; this.scalingDuration = scalingDuration; @@ -88,6 +91,20 @@ public class ClusterModel { public ClusterSpec clusterSpec() { return clusterSpec; } public Cluster cluster() { return cluster; } + /** Returns the relative load adjustment that should be made to this cluster given available measurements. */ + public Load loadAdjustment() { + if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change + /* + // Should we scale up? + Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad()); + if (relativePeak.any(v -> v > 1)) + return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale + + // Should we scale down? 
+ */ + return averageLoad().divide(idealLoad()); + } + /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } @@ -110,14 +127,78 @@ public class ClusterModel { return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock); } - /** Returns average load during the last {@link #currentLoadDuration} */ - public Load currentLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(currentLoadDuration)); } + /** Returns average of the last load reading from each node. */ + public Load currentLoad() { return nodeTimeseries().currentLoad(); } /** Returns average load during the last {@link #scalingDuration()} */ public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); } + /** The number of nodes this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int nodeCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().count(); + return cluster.minResources().nodes(); + } + + /** The number of groups this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int groupCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count(); + return cluster.minResources().groups(); + } + + public int groupSize() { + // ceil: If the division does not produce a whole number we assume some node is missing + return (int)Math.ceil((double)nodeCount() / groupCount()); + } + + /** Returns the relative load adjustment accounting for redundancy in this. */ + public Load redundancyAdjustment() { + return loadWith(nodeCount(), groupCount()); + } + + /** + * Returns the relative load adjustment accounting for redundancy given these nodes+groups + * relative to the nodes+groups in this. 
+ */ + public Load loadWith(int trueNodes, int trueGroups) { + int nodes = nodesAdjustedForRedundancy(trueNodes, trueGroups); + int groups = groupsAdjustedForRedundancy(trueNodes, trueGroups); + if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content + int groupSize = nodes / groups; + + // Cpu: Query cpu scales with cluster size, write cpu scales with group size + // Memory and disk: Scales with group size + + // The fixed cost portion of cpu does not scale with changes to the node count + double queryCpuPerGroup = fixedCpuCostFraction + (1 - fixedCpuCostFraction) * groupSize() / groupSize; + + double queryCpu = queryCpuPerGroup * groupCount() / groups; + double writeCpu = (double)groupSize() / groupSize; + return new Load(queryCpuFraction() * queryCpu + (1 - queryCpuFraction()) * writeCpu, + (double)groupSize() / groupSize, + (double)groupSize() / groupSize); + } + else { + return new Load((double)nodeCount() / nodes, 1, 1); + } + } + + /** + * Returns the ideal load across the nodes of this such that each node will be at ideal load + * if one of the nodes goes down. + */ public Load idealLoad() { - return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()); + return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()).divide(redundancyAdjustment()); + } + + public int nodesAdjustedForRedundancy(int nodes, int groups) { + int groupSize = (int)Math.ceil((double)nodes / groups); + return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; + } + + public int groupsAdjustedForRedundancy(int nodes, int groups) { + return nodes > 1 ? (groups == 1 ? 
1 : groups - 1) : groups; } /** Ideal cpu load must take the application traffic fraction into account */ @@ -171,7 +252,6 @@ public class ClusterModel { // Assume we have missed timely recording completion if it is longer than 4 days totalDuration = totalDuration.plus(maximum(Duration.ofDays(4), event.duration().get())); } - if (completedEventCount == 0) { // Use defaults if (clusterSpec.isStateful()) return Duration.ofHours(12); return Duration.ofMinutes(10); @@ -212,13 +292,13 @@ public class ClusterModel { * as QuestDb is known to temporarily fail during reading of data. */ public static Optional<ClusterModel> create(Application application, - Cluster cluster, ClusterSpec clusterSpec, + Cluster cluster, NodeList clusterNodes, MetricsDb metricsDb, Clock clock) { try { - return Optional.of(new ClusterModel(application, cluster, clusterSpec, clusterNodes, metricsDb, clock)); + return Optional.of(new ClusterModel(application, clusterSpec, cluster, clusterNodes, metricsDb, clock)); } catch (Exception e) { log.log(Level.WARNING, "Failed creating a cluster model for " + application + " " + cluster, e); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 5ad4ef2e263..ab5be045dd4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -7,6 +7,8 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster; import java.time.Duration; import java.time.Instant; import java.util.List; +import java.util.Optional; +import java.util.OptionalDouble; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -48,6 +50,7 @@ public class ClusterNodesTimeseries { /** Returns the average number of measurements per node */ public 
int measurementsPerNode() { + if (clusterNodes.size() == 0) return 0; int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); return measurementCount / clusterNodes.size(); } @@ -69,6 +72,41 @@ public class ClusterNodesTimeseries { return total.divide(count); } + /** Returns average of the latest load reading from each node */ + public Load currentLoad() { + Load total = Load.zero(); + int count = 0; + for (var nodeTimeseries : timeseries) { + Optional<NodeMetricSnapshot> last = nodeTimeseries.last(); + if (last.isEmpty()) continue; + + total = total.add(last.get().load()); + count++; + } + return total.divide(count); + } + + /** + * Returns the "peak load" in this: Which is for each load dimension, + * the average of the highest reading for that dimension on each node. + */ + public Load peakLoad() { + return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk)); + } + + private double peakLoad(Load.Dimension dimension) { + double total = 0; + int count = 0; + for (var nodeTimeseries : timeseries) { + OptionalDouble value = nodeTimeseries.peak(dimension); + if (value.isEmpty()) continue; + total += value.getAsDouble(); + count++; + } + if (count == 0) return 0; + return total / count; + } + private static List<NodeTimeseries> keep(List<NodeTimeseries> timeseries, Predicate<NodeMetricSnapshot> filter) { return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.keep(filter)).collect(Collectors.toList()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index a52b048a9e0..88c7e70cd35 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -3,6 +3,12 @@ package com.yahoo.vespa.hosted.provision.autoscale; import 
com.yahoo.config.provision.NodeResources; +import java.util.Objects; +import java.util.function.DoubleBinaryOperator; +import java.util.function.DoubleFunction; +import java.util.function.DoubleUnaryOperator; +import java.util.function.Predicate; + /** * The load of a node or system, measured as fractions of max (1.0) in three dimensions. * @@ -10,6 +16,8 @@ import com.yahoo.config.provision.NodeResources; */ public class Load { + public enum Dimension { cpu, memory, disk } + private final double cpu, memory, disk; public Load(double cpu, double memory, double disk) { @@ -23,27 +31,51 @@ public class Load { public double disk() { return disk; } public Load add(Load other) { - return new Load(cpu + other.cpu(), memory + other.memory(), disk + other.disk()); + return join(other, (a, b) -> a + b); } public Load multiply(NodeResources resources) { return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb()); } - public Load multiply(double factor) { - return new Load(cpu * factor, memory * factor, disk * factor); + return map(v -> v * factor); + } + public Load multiply(Load other) { + return join(other, (a, b) -> a * b); } + public Load divide(Load divisor) { + return join(divisor, (a, b) -> divide(a, b)); + } + public Load divide(double divisor) { + return map(v -> divide(v, divisor)); + } public Load divide(NodeResources resources) { return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb())); } - public Load divide(Load divisor) { - return new Load(divide(cpu, divisor.cpu()), divide(memory, divisor.memory()), divide(disk, divisor.disk())); + /** Returns the load having the max value of this and the given load in each dimension. 
*/ + public Load max(Load other) { + return join(other, (a, b) -> Math.max(a, b)); } - public Load divide(double divisor) { - return new Load(divide(cpu, divisor), divide(memory, divisor), divide(disk, divisor)); + /** Returns the load where the given function is applied to each dimension of this. */ + public Load map(DoubleUnaryOperator f) { + return new Load(f.applyAsDouble(cpu), + f.applyAsDouble(memory), + f.applyAsDouble(disk)); + } + + /** Returns the load where the given function is applied to each dimension of this and the given load. */ + public Load join(Load other, DoubleBinaryOperator f) { + return new Load(f.applyAsDouble(this.cpu(), other.cpu()), + f.applyAsDouble(this.memory(), other.memory()), + f.applyAsDouble(this.disk(), other.disk())); + } + + /** Returns true if any dimension matches the predicate. */ + public boolean any(Predicate<Double> test) { + return test.test(cpu) || test.test(memory) || test.test(disk); } public NodeResources scaled(NodeResources resources) { @@ -52,6 +84,14 @@ public class Load { .withDiskGb(disk * resources.diskGb()); } + public double get(Dimension dimension) { + return switch (dimension) { + case cpu -> cpu(); + case memory -> memory(); + case disk -> disk(); + }; + } + private double requireNormalized(double value, String name) { if (Double.isNaN(value)) throw new IllegalArgumentException(name + " must be a number but is NaN"); @@ -60,17 +100,31 @@ public class Load { return value; } + private static double divide(double a, double b) { + if (a == 0 && b == 0) return 0; + return a / b; + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if ( ! 
(o instanceof Load other)) return false; + if (other.cpu() != this.cpu()) return false; + if (other.memory() != this.memory()) return false; + if (other.disk() != this.disk()) return false; + return true; + } + + @Override + public int hashCode() { return Objects.hash(cpu, memory, disk); } + @Override public String toString() { return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk"; } public static Load zero() { return new Load(0, 0, 0); } - - private static double divide(double a, double b) { - if (a == 0 && b == 0) return 0; - return a / b; - } + public static Load one() { return new Load(1, 1, 1); } public static Load byDividing(NodeResources a, NodeResources b) { return new Load(divide(a.vcpu(), b.vcpu()), divide(a.memoryGb(), b.memoryGb()), divide(a.diskGb(), b.diskGb())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java index 4a5f8972e11..500dbf0f66f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java @@ -6,6 +6,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; +import java.util.OptionalDouble; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -40,6 +41,10 @@ public class NodeTimeseries { return Optional.of(snapshots.get(snapshots.size() - 1)); } + public OptionalDouble peak(Load.Dimension dimension) { + return snapshots.stream().mapToDouble(snapshot -> snapshot.load().get(dimension)).max(); + } + public List<NodeMetricSnapshot> asList() { return snapshots; } public String hostname() { return hostname; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java deleted file mode 100644 index 72836baaf5b..00000000000 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.provision.autoscale; - -import com.yahoo.config.provision.NodeResources; -import com.yahoo.vespa.hosted.provision.applications.Application; - -import java.time.Clock; -import java.time.Duration; -import java.util.OptionalDouble; - -/** - * A resource target to hit for the allocation optimizer. - * The target is measured in cpu, memory and disk per node in the allocation given by current. - * - * @author bratseth - */ -public class ResourceTarget { - - private final boolean adjustForRedundancy; - - /** The target real resources per node, assuming the node assignment where this was decided */ - private final NodeResources resources; - - private ResourceTarget(NodeResources resources, boolean adjustForRedundancy) { - this.resources = resources; - this.adjustForRedundancy = adjustForRedundancy; - } - - /** Are the target resources given by this including redundancy or not */ - public boolean adjustForRedundancy() { return adjustForRedundancy; } - - /** Returns the target resources per node in terms of the current allocation */ - public NodeResources resources() { return resources; } - - @Override - public String toString() { - return "target " + resources + (adjustForRedundancy ? 
"(with redundancy adjustment) " : ""); - } - - /** Create a target of achieving ideal load given a current load */ - public static ResourceTarget idealLoad(ClusterModel clusterModel, - AllocatableClusterResources current) { - var loadAdjustment = clusterModel.averageLoad().divide(clusterModel.idealLoad()); - return new ResourceTarget(loadAdjustment.scaled(current.realResources().nodeResources()), true); - } - - /** Crete a target of preserving a current allocation */ - public static ResourceTarget preserve(AllocatableClusterResources current) { - return new ResourceTarget(current.realResources().nodeResources(), false); - } - -} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 9c6eb2199f5..a9e7ded66e6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -12,7 +12,6 @@ import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.jdisc.Metric; import com.yahoo.lang.MutableInteger; -import com.yahoo.transaction.Mutex; import com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.JacksonFlag; import com.yahoo.vespa.flags.ListFlag; @@ -77,16 +76,14 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { @Override protected double maintain() { - try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { - NodeList nodes = nodeRepository().nodes().list(); - resumeProvisioning(nodes, lock); - convergeToCapacity(nodes); - } + NodeList nodes = nodeRepository().nodes().list(); + resumeProvisioning(nodes); + convergeToCapacity(nodes); return 1.0; } /** Resume provisioning of already provisioned hosts and their 
children */ - private void resumeProvisioning(NodeList nodes, Mutex lock) { + private void resumeProvisioning(NodeList nodes) { Map<String, Set<Node>> nodesByProvisionedParentHostname = nodes.nodeType(NodeType.tenant, NodeType.config, NodeType.controller) .asList() @@ -97,9 +94,11 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { nodes.state(Node.State.provisioned).nodeType(NodeType.host, NodeType.confighost, NodeType.controllerhost).forEach(host -> { Set<Node> children = nodesByProvisionedParentHostname.getOrDefault(host.hostname(), Set.of()); try { - List<Node> updatedNodes = hostProvisioner.provision(host, children); - verifyDns(updatedNodes); - nodeRepository().nodes().write(updatedNodes, lock); + try (var lock = nodeRepository().nodes().lockUnallocated()) { + List<Node> updatedNodes = hostProvisioner.provision(host, children); + verifyDns(updatedNodes); + nodeRepository().nodes().write(updatedNodes, lock); + } } catch (IllegalArgumentException | IllegalStateException e) { log.log(Level.INFO, "Could not provision " + host.hostname() + " with " + children.size() + " children, will retry in " + interval() + ": " + Exceptions.toMessageString(e)); @@ -108,7 +107,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { " children, failing out the host recursively", e); // Fail out as operator to force a quick redeployment nodeRepository().nodes().failOrMarkRecursively( - host.hostname(), Agent.operator, "Failed by HostProvisioner due to provisioning failure"); + host.hostname(), Agent.DynamicProvisioningMaintainer, "Failed by HostProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NameNotFoundException) log.log(Level.INFO, "Could not provision " + host.hostname() + ", will retry in " + interval() + ": " + Exceptions.toMessageString(e)); @@ -187,29 +186,38 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { 
.collect(Collectors.toList()); } - private List<Node> candidatesForRemoval(List<Node> nodes) { - Map<String, Node> hostsByHostname = new HashMap<>(nodes.stream() - .filter(node -> { - switch (node.type()) { - case host: - // TODO: Mark empty tenant hosts as wanttoretire & wanttodeprovision elsewhere, then handle as confighost here - return node.state() != Node.State.parked || node.status().wantToDeprovision(); - case confighost: - case controllerhost: - return node.state() == Node.State.parked && node.status().wantToDeprovision(); - default: - return false; - } - }) - .collect(Collectors.toMap(Node::hostname, Function.identity()))); + private static List<Node> candidatesForRemoval(List<Node> nodes) { + Map<String, Node> removableHostsByHostname = new HashMap<>(); + for (var node : nodes) { + if (canRemoveHost(node)) { + removableHostsByHostname.put(node.hostname(), node); + } + } + for (var node : nodes) { + if (node.parentHostname().isPresent() && !canRemoveNode(node)) { + removableHostsByHostname.remove(node.parentHostname().get()); + } + } + return List.copyOf(removableHostsByHostname.values()); + } - nodes.stream() - .filter(node -> node.allocation().isPresent()) - .flatMap(node -> node.parentHostname().stream()) - .distinct() - .forEach(hostsByHostname::remove); + private static boolean canRemoveHost(Node host) { + return switch (host.type()) { + // TODO: Mark empty tenant hosts as wanttoretire & wanttodeprovision elsewhere, then handle as confighost here + case host -> host.state() != Node.State.parked || host.status().wantToDeprovision(); + case confighost, controllerhost -> canDeprovision(host); + default -> false; + }; + } + + private static boolean canRemoveNode(Node node) { + if (node.type().isHost()) throw new IllegalArgumentException("Node " + node + " is not a child"); + return node.allocation().isEmpty() || canDeprovision(node); + } - return List.copyOf(hostsByHostname.values()); + private static boolean canDeprovision(Node node) { + return 
node.status().wantToDeprovision() && (node.state() == Node.State.parked || + node.state() == Node.State.failed); } private Map<String, Node> findSharedHosts(NodeList nodeList) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 62557b275c8..67c1c7359f7 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -198,11 +198,6 @@ public class MetricsReporter extends NodeRepositoryMaintainer { metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context); metric.set("failReport", NodeFailer.reasonsToFailHost(node).isEmpty() ? 0 : 1, context); - if (node.type().isHost()) { - metric.set("wantToEncrypt", node.reports().getReport("wantToEncrypt").isPresent() ? 1 : 0, context); - metric.set("diskEncrypted", node.reports().getReport("diskEncrypted").isPresent() ? 
1 : 0, context); - } - HostName hostname = new HostName(node.hostname()); serviceModel.getApplication(hostname) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 3c5b20da4d0..3e7abe8f053 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; +import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; @@ -19,9 +20,11 @@ import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.time.Instant; +import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.logging.Level; @@ -106,26 +109,6 @@ public class NodeFailer extends NodeRepositoryMaintainer { failActive(failing); } - // Active hosts - NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); - for (Node host : activeNodes.hosts().failing()) { - if ( ! 
activeNodes.childrenOf(host).isEmpty()) continue; - Optional<NodeMutex> locked = Optional.empty(); - try { - attempts++; - locked = nodeRepository().nodes().lockAndGet(host); - if (locked.isEmpty()) continue; - nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, - "Host should be failed and have no tenant nodes"); - } - catch (Exception e) { - failures++; - } - finally { - locked.ifPresent(NodeMutex::close); - } - } - int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures); metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); @@ -153,6 +136,9 @@ public class NodeFailer extends NodeRepositoryMaintainer { Set<FailingNode> failingNodes = new HashSet<>(); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); + for (Node host : activeNodes.hosts().failing()) + failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes")); + for (Node node : activeNodes) { Instant graceTimeStart = clock().instant().minus(nodeRepository().nodes().suspended(node) ? suspendedDownTimeLimit : downTimeLimit); if (node.isDown() && node.history().hasEventBefore(History.Event.Type.down, graceTimeStart) && !applicationSuspended(node)) { @@ -241,42 +227,61 @@ public class NodeFailer extends NodeRepositoryMaintainer { deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(30)); if (deployment.isEmpty()) return false; - try (Mutex lock = nodeRepository().nodes().lock(failing.node().allocation().get().owner())) { - // If the active node that we are trying to fail is of type host, we need to successfully fail all - // the children nodes running on it before we fail the host - boolean allTenantNodesFailedOutSuccessfully = true; + // If the active node that we are trying to fail is of type host, we need to successfully fail all + // the children nodes running on it before we fail the host. 
Failing a child node in a dynamically + // provisioned zone may require provisioning new hosts that require the host application lock to be held, + // so we must release ours before failing the children. + List<FailingNode> activeChildrenToFail = new ArrayList<>(); + try (NodeMutex lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) { + // Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail + if (!Objects.equals(failing.node().allocation().map(Allocation::owner), lock.node().allocation().map(Allocation::owner))) + return false; + if (lock.node().state() == Node.State.failed) + return true; + if (!Objects.equals(failing.node().state(), lock.node().state())) + return false; + failing = new FailingNode(lock.node(), failing.reason); + String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason(); for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(failing.node())) { if (failingTenantNode.state() == Node.State.active) { - allTenantNodesFailedOutSuccessfully &= failActive(new FailingNode(failingTenantNode, reasonForChildFailure)); - } else { + activeChildrenToFail.add(new FailingNode(failingTenantNode, reasonForChildFailure)); + } else if (failingTenantNode.state() != Node.State.failed) { nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } - if (! 
allTenantNodesFailedOutSuccessfully) return false; - wantToFail(failing.node(), true, lock); - try { - deployment.get().activate(); - return true; - } catch (TransientException e) { - log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + - " with a transient error, will be retried by application maintainer: " + - Exceptions.toMessageString(e)); - return true; - } catch (RuntimeException e) { - // Reset want to fail: We'll retry failing unless it heals in the meantime - nodeRepository().nodes().node(failing.node().hostname()) - .ifPresent(n -> wantToFail(n, false, lock)); - log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + - " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); - return false; + if (activeChildrenToFail.isEmpty()) { + wantToFail(failing.node(), true, lock); + try { + deployment.get().activate(); + return true; + } catch (TransientException e) { + log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() + + " with a transient error, will be retried by application maintainer: " + + Exceptions.toMessageString(e)); + return true; + } catch (RuntimeException e) { + // Reset want to fail: We'll retry failing unless it heals in the meantime + nodeRepository().nodes().node(failing.node().hostname()) + .ifPresent(n -> wantToFail(n, false, lock)); + log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() + + " for " + failing.reason() + ": " + Exceptions.toMessageString(e)); + return false; + } } } + + // In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned, + // so failActive() may take a long time to complete, but the remaining children should be fast. 
+ activeChildrenToFail.forEach(this::failActive); + + return false; } private void wantToFail(Node node, boolean wantToFail, Mutex lock) { - nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); + if (!node.status().wantToFail()) + nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock); } /** Returns true if node failing should be throttled */ @@ -284,16 +289,24 @@ public class NodeFailer extends NodeRepositoryMaintainer { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow); NodeList allNodes = nodeRepository().nodes().list(); - NodeList recentlyFailedNodes = allNodes.state(Node.State.failed) - .matching(n -> n.history().hasEventAfter(History.Event.Type.failed, - startOfThrottleWindow)); + NodeList recentlyFailedNodes = allNodes + .matching(n -> n.status().wantToFail() || + (n.state() == Node.State.failed && + n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow))); // Allow failing any node within policy if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false; // Always allow failing a minimum number of hosts - if (node.parentHostname().isEmpty() && - recentlyFailedNodes.parents().size() < throttlePolicy.minimumAllowedToFail) return false; + if (node.parentHostname().isEmpty()) { + Set<String> parentsOfRecentlyFailedNodes = recentlyFailedNodes.stream() + .map(n -> n.parentHostname().orElse(n.hostname())) + .collect(Collectors.toSet()); + long potentiallyFailed = parentsOfRecentlyFailedNodes.contains(node.hostname()) ? 
+ parentsOfRecentlyFailedNodes.size() : + parentsOfRecentlyFailedNodes.size() + 1; + if (potentiallyFailed <= throttlePolicy.minimumAllowedToFail) return false; + } // Always allow failing children of a failed host if (recentlyFailedNodes.parentOf(node).isPresent()) return false; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index f32fd225427..aa1abb18d8c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -124,14 +124,14 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final NodeFailer.ThrottlePolicy throttlePolicy; DefaultTimes(Zone zone, Deployer deployer) { - autoscalingInterval = Duration.ofMinutes(15); + autoscalingInterval = Duration.ofMinutes(5); dynamicProvisionerInterval = Duration.ofMinutes(3); failedExpirerInterval = Duration.ofMinutes(10); failGrace = Duration.ofMinutes(30); infrastructureProvisionInterval = Duration.ofMinutes(3); loadBalancerExpirerInterval = Duration.ofMinutes(5); metricsInterval = Duration.ofMinutes(1); - nodeFailerInterval = Duration.ofMinutes(15); + nodeFailerInterval = Duration.ofMinutes(9); nodeFailureStatusUpdateInterval = Duration.ofMinutes(2); nodeMetricsCollectionInterval = Duration.ofMinutes(1); expeditedChangeRedeployInterval = Duration.ofMinutes(3); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index ac804f99cd3..c2d4506a28c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ 
-4,6 +4,7 @@ package com.yahoo.vespa.hosted.provision.node; import com.google.common.collect.ImmutableMap; import com.yahoo.vespa.hosted.provision.Node; +import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; @@ -50,6 +51,12 @@ public class History { return builder.build(); } + /** Returns the age of this node as best as we can determine: The time since the first event registered for it */ + public Duration age(Instant now) { + Instant oldestEventTime = events.values().stream().map(event -> event.at()).sorted().findFirst().orElse(now); + return Duration.between(oldestEventTime, now); + } + /** Returns the last event of given type, if it is present in this history */ public Optional<Event> event(Event.Type type) { return Optional.ofNullable(events.get(type)); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index d750a3ef737..41a23ac21ff 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -212,12 +212,12 @@ public class Nodes { return setReady(List.of(nodeToReady), agent, reason).get(0); } - /** Reserve nodes. This method does <b>not</b> lock the node repository */ + /** Reserve nodes. This method does <b>not</b> lock the node repository. */ public List<Node> reserve(List<Node> nodes) { return db.writeTo(Node.State.reserved, nodes, Agent.application, Optional.empty()); } - /** Activate nodes. This method does <b>not</b> lock the node repository */ + /** Activate nodes. This method does <b>not</b> lock the node repository. 
*/ public List<Node> activate(List<Node> nodes, NestedTransaction transaction) { return db.writeTo(Node.State.active, nodes, Agent.application, Optional.empty(), transaction); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeListFilter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeListFilter.java index 8578e3eb5ec..2b790ff7392 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeListFilter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeListFilter.java @@ -30,4 +30,5 @@ public class NodeListFilter { public static Predicate<Node> from(List<Node> nodes) { return makePredicate(nodes); } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java index 30fd2713017..4178d4a6328 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingOsUpgrader.java @@ -7,6 +7,7 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter; +import java.time.Instant; import java.util.Objects; import java.util.Optional; import java.util.logging.Logger; @@ -39,8 +40,10 @@ public class DelegatingOsUpgrader implements OsUpgrader { public void upgradeTo(OsVersionTarget target) { NodeList activeNodes = nodeRepository.nodes().list(Node.State.active).nodeType(target.nodeType()); int numberToUpgrade = Math.max(0, maxActiveUpgrades - activeNodes.changingOsVersionTo(target.version()).size()); + Instant now = nodeRepository.clock().instant(); NodeList nodesToUpgrade = activeNodes.not().changingOsVersionTo(target.version()) 
.osVersionIsBefore(target.version()) + .matching(node -> canUpgradeAt(now, node)) .byIncreasingOsVersion() .first(numberToUpgrade); if (nodesToUpgrade.size() == 0) return; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java index 5310ef339ed..4140de76368 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsUpgrader.java @@ -2,6 +2,9 @@ package com.yahoo.vespa.hosted.provision.os; import com.yahoo.config.provision.NodeType; +import com.yahoo.vespa.hosted.provision.Node; + +import java.time.Instant; /** * Interface for an OS upgrader. @@ -16,4 +19,9 @@ public interface OsUpgrader { /** Disable OS upgrade for all nodes of given type */ void disableUpgrade(NodeType type); + /** Returns whether node can upgrade at given instant */ + default boolean canUpgradeAt(Instant instant, Node node) { + return true; + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java index 7c6d1cb69db..440046ab818 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/OsVersions.java @@ -84,7 +84,7 @@ public class OsVersions { Version target = Optional.ofNullable(change.targets().get(nodeType)) .map(OsVersionTarget::version) .orElse(Version.emptyVersion); - chooseUpgrader(nodeType, target).disableUpgrade(nodeType); + chooseUpgrader(nodeType, Optional.of(target)).disableUpgrade(nodeType); return change.withoutTarget(nodeType); }); } @@ -120,7 +120,7 @@ public class OsVersions { try (Lock lock = db.lockOsVersionChange()) { OsVersionTarget target = readChange().targets().get(nodeType); if (target == null) return; // No 
target set for this type - OsUpgrader upgrader = chooseUpgrader(nodeType, target.version()); + OsUpgrader upgrader = chooseUpgrader(nodeType, Optional.of(target.version())); if (resume) { upgrader.upgradeTo(target); } else { @@ -129,17 +129,23 @@ public class OsVersions { } } + /** Returns whether node can be upgraded now */ + public boolean canUpgrade(Node node) { + return chooseUpgrader(node.type(), Optional.empty()).canUpgradeAt(nodeRepository.clock().instant(), node); + } + /** Returns the upgrader to use when upgrading given node type to target */ - private OsUpgrader chooseUpgrader(NodeType nodeType, Version target) { + private OsUpgrader chooseUpgrader(NodeType nodeType, Optional<Version> target) { if (reprovisionToUpgradeOs) { return new RetiringOsUpgrader(nodeRepository); } // Require rebuild if we have any nodes of this type on a major version lower than target - boolean rebuildRequired = nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).stream() + boolean rebuildRequired = target.isPresent() && + nodeRepository.nodes().list(Node.State.active).nodeType(nodeType).stream() .map(Node::status) .map(Status::osVersion) .anyMatch(osVersion -> osVersion.current().isPresent() && - osVersion.current().get().getMajor() < target.getMajor()); + osVersion.current().get().getMajor() < target.get().getMajor()); if (rebuildRequired) { return new RebuildingOsUpgrader(nodeRepository); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java index efc377e6cc3..f96effe9e10 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RebuildingOsUpgrader.java @@ -47,7 +47,7 @@ public class RebuildingOsUpgrader implements OsUpgrader { public void upgradeTo(OsVersionTarget target) { NodeList allNodes = 
nodeRepository.nodes().list(); Instant now = nodeRepository.clock().instant(); - rebuildableHosts(target, allNodes).forEach(host -> rebuild(host, target.version(), now)); + rebuildableHosts(target, allNodes, now).forEach(host -> rebuild(host, target.version(), now)); } @Override @@ -62,7 +62,7 @@ public class RebuildingOsUpgrader implements OsUpgrader { return Math.max(0, limit - hostsOfType.rebuilding().size()); } - private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes) { + private List<Node> rebuildableHosts(OsVersionTarget target, NodeList allNodes, Instant now) { NodeList hostsOfTargetType = allNodes.nodeType(target.nodeType()); int rebuildLimit = rebuildLimit(target.nodeType(), hostsOfTargetType); @@ -76,6 +76,7 @@ public class RebuildingOsUpgrader implements OsUpgrader { NodeList candidates = hostsOfTargetType.state(Node.State.active) .not().rebuilding() .osVersionIsBefore(target.version()) + .matching(node -> canUpgradeAt(now, node)) .byIncreasingOsVersion(); for (Node host : candidates) { if (hostsToRebuild.size() == rebuildLimit) break; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java index d923c78a929..79b7441cc34 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java @@ -26,6 +26,9 @@ public class RetiringOsUpgrader implements OsUpgrader { private static final Logger LOG = Logger.getLogger(RetiringOsUpgrader.class.getName()); + /** The duration this leaves new nodes alone before scheduling any upgrade */ + static final Duration GRACE_PERIOD = Duration.ofDays(30); + protected final NodeRepository nodeRepository; public RetiringOsUpgrader(NodeRepository nodeRepository) { @@ -33,21 +36,27 @@ public class RetiringOsUpgrader implements OsUpgrader { } 
@Override - public final void upgradeTo(OsVersionTarget target) { + public void upgradeTo(OsVersionTarget target) { NodeList allNodes = nodeRepository.nodes().list(); Instant now = nodeRepository.clock().instant(); NodeList candidates = candidates(now, target, allNodes); candidates.not().deprovisioning() + .matching(node -> canUpgradeAt(now, node)) .byIncreasingOsVersion() .first(1) .forEach(node -> deprovision(node, target.version(), now)); } @Override - public final void disableUpgrade(NodeType type) { + public void disableUpgrade(NodeType type) { // No action needed in this implementation. } + @Override + public boolean canUpgradeAt(Instant instant, Node node) { + return node.history().age(instant).compareTo(GRACE_PERIOD) > 0; + } + /** Returns nodes that are candidates for upgrade */ private NodeList candidates(Instant instant, OsVersionTarget target, NodeList allNodes) { NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ArchiveUris.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ArchiveUris.java index 1cd8bb0fccd..e057fabc4fc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ArchiveUris.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/ArchiveUris.java @@ -7,11 +7,11 @@ import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; - import java.time.Duration; import java.util.Map; import java.util.Optional; import java.util.TreeMap; +import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; @@ -63,7 +63,7 @@ public class ArchiveUris { })); } - /** Set the docker image for nodes of given type */ + /** Set (or remove, if archiveURI is empty) 
archive URI to use for given tenant */ public void setArchiveUri(TenantName tenant, Optional<String> archiveUri) { try (Lock lock = db.lockArchiveUris()) { Map<TenantName, String> archiveUris = new TreeMap<>(db.readArchiveUris()); @@ -73,7 +73,8 @@ public class ArchiveUris { () -> archiveUris.remove(tenant)); db.writeArchiveUris(archiveUris); this.archiveUris.invalidate(); // Throw away current cache - log.info("Set archive URI for " + tenant + " to " + archiveUri.orElse(null)); + log.log(Level.FINE, () -> archiveUri.map(s -> "Set archive URI for " + tenant + " to " + s) + .orElseGet(() -> "Remove archive URI for " + tenant)); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java index 64865c15529..065293ca5d8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java @@ -27,7 +27,7 @@ import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources; import com.yahoo.vespa.hosted.provision.autoscale.AllocationOptimizer; import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel; import com.yahoo.vespa.hosted.provision.autoscale.Limits; -import com.yahoo.vespa.hosted.provision.autoscale.ResourceTarget; +import com.yahoo.vespa.hosted.provision.autoscale.Load; import com.yahoo.vespa.hosted.provision.node.Allocation; import com.yahoo.vespa.hosted.provision.node.filter.ApplicationFilter; import com.yahoo.vespa.hosted.provision.node.filter.NodeHostFilter; @@ -171,7 +171,7 @@ public class NodeRepositoryProvisioner implements Provisioner { firstDeployment // start at min, preserve current resources otherwise ? 
new AllocatableClusterResources(initialResourcesFrom(requested, clusterSpec, application.id()), clusterSpec, nodeRepository) : new AllocatableClusterResources(nodes.asList(), nodeRepository); - var clusterModel = new ClusterModel(application, cluster, clusterSpec, nodes, nodeRepository.metricsDb(), nodeRepository.clock()); + var clusterModel = new ClusterModel(application, clusterSpec, cluster, nodes, nodeRepository.metricsDb(), nodeRepository.clock()); return within(Limits.of(requested), currentResources, firstDeployment, clusterModel); } @@ -196,7 +196,7 @@ public class NodeRepositoryProvisioner implements Provisioner { if (! firstDeployment && currentAsAdvertised.isWithin(limits.min(), limits.max())) return currentAsAdvertised; // Otherwise, find an allocation that preserves the current resources as well as possible - return allocationOptimizer.findBestAllocation(ResourceTarget.preserve(current), + return allocationOptimizer.findBestAllocation(Load.one(), current, clusterModel, limits) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index f0b8e77ee56..1c10de8498a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -62,7 +62,7 @@ public class ApplicationSerializer { NodeList nodes = applicationNodes.not().retired().cluster(cluster.id()); if (nodes.isEmpty()) return; ClusterResources currentResources = nodes.toResources(); - Optional<ClusterModel> clusterModel = ClusterModel.create(application, cluster, nodes.clusterSpec(), nodes, metricsDb, nodeRepository.clock()); + Optional<ClusterModel> clusterModel = ClusterModel.create(application, nodes.clusterSpec(), cluster, nodes, metricsDb, nodeRepository.clock()); Cursor clusterObject = 
clustersObject.setObject(cluster.id().value()); clusterObject.setString("type", nodes.clusterSpec().type().name()); toSlime(cluster.minResources(), clusterObject.setObject("min")); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java index 3659166c9da..87a9735f91e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java @@ -162,6 +162,9 @@ class NodesResponse extends SlimeJsonResponse { object.setLong("currentRebootGeneration", node.status().reboot().current()); node.status().osVersion().current().ifPresent(version -> object.setString("currentOsVersion", version.toFullString())); node.status().osVersion().wanted().ifPresent(version -> object.setString("wantedOsVersion", version.toFullString())); + if (node.type().isHost()) { + object.setBool("deferOsUpgrade", !nodeRepository.osVersions().canUpgrade(node)); + } node.status().firmwareVerifiedAt().ifPresent(instant -> object.setLong("currentFirmwareCheck", instant.toEpochMilli())); if (node.type().isHost()) nodeRepository.firmwareChecks().requiredAfter().ifPresent(after -> object.setLong("wantedFirmwareCheck", after.toEpochMilli())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index c5d8b2518e5..b3fbe124493 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -44,6 +44,7 @@ import com.yahoo.vespa.hosted.provision.node.filter.ParentHostFilter; import com.yahoo.vespa.hosted.provision.restapi.NodesResponse.ResponseType; import 
com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.yolean.Exceptions; + import javax.inject.Inject; import java.io.IOException; import java.io.UncheckedIOException; @@ -453,6 +454,8 @@ public class NodesV2ApiHandler extends ThreadedHttpRequestHandler { Slime slime = new Slime(); Cursor root = slime.setObject(); + root.setDouble("totalCost", stats.totalCost()); + root.setDouble("totalAllocatedCost", stats.totalAllocatedCost()); toSlime(stats.load(), root.setObject("load")); toSlime(stats.activeLoad(), root.setObject("activeLoad")); Cursor applicationsArray = root.setArray("applications"); |