diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-06-04 19:40:59 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-04 19:40:59 +0200 |
commit | b14315a6f51bc8e5bce22e0d9d11d0e730aaf96d (patch) | |
tree | 5f604f87bf9d37bb2d37c161b15c03b73d770dd2 /node-repository | |
parent | e304fe27f449fae9cb71d5c969e662e601e61d2b (diff) | |
parent | ba36521578a55088c6e38d50b616af85eb33cf19 (diff) |
Merge pull request #18113 from vespa-engine/bratseth/maintainer-success-degree
Emit a success factor from maintainers
Diffstat (limited to 'node-repository')
21 files changed, 114 insertions, 77 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 9ac1ca2b4c1..24160c19dfa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -41,9 +41,9 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer { } @Override - protected final boolean maintain() { + protected final double maintain() { applicationsNeedingMaintenance().forEach(this::deploy); - return true; + return 1.0; } /** Returns the number of deployments that are pending execution */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 05eb878e1b1..3914a0c9f07 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -44,14 +44,13 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; - if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return success; + if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return 1.0; activeNodesByApplication().forEach(this::autoscale); - return success; + return 1.0; } private void autoscale(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index af32c285d99..0eb2038e233 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -74,13 +74,13 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } - return true; + return 1.0; } /** Resume provisioning of already provisioned hosts and their children */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index 2443a12d198..25108425e6e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -40,7 +40,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { NodeList expired = nodeRepository().nodes().list(fromState).matching(this::isExpired); if ( ! expired.isEmpty()) { @@ -49,7 +49,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } metric.add("expired." + fromState, expired.size(), null); - return true; + return 1.0; } protected boolean isExpired(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index e98da35aa6a..7505ce42668 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -66,7 +66,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { List<Node> remainingNodes = new ArrayList<>(nodeRepository.nodes().list(Node.State.failed) .nodeType(NodeType.tenant, NodeType.host) .asList()); @@ -78,7 +78,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { recycleIf(remainingNodes, node -> node.allocation().get().membership().cluster().isStateful() && node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry))); - return true; + return 1.0; } /** Recycle the nodes matching condition, and remove those nodes from the nodes list. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java index caf20463f60..80f74a011c0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java @@ -43,7 +43,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { Instant now = nodeRepository().clock().instant(); NodeList allNodes = nodeRepository().nodes().list(); for (var nodeType : NodeType.values()) { @@ -51,7 +51,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { if (upgradingVespa(allNodes, nodeType)) continue; unencryptedHosts(allNodes, nodeType).forEach(host -> encrypt(host, now)); } - return true; + return 1.0; } /** Returns whether any node of given type is currently upgrading its Vespa version */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java index e317333135c..d9f5ea6a7a9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java @@ -39,9 +39,9 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { infraDeployer.activateAllSupportedInfraApplications(false); - return true; + return 1.0; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java index 10069fd1a18..ac9d8d6671a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.google.common.collect.Sets; import com.yahoo.jdisc.Metric; +import com.yahoo.lang.MutableInteger; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.lb.LoadBalancer; @@ -54,9 +55,9 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { expireReserved(); - return removeInactive() & pruneReals(); + return ( removeInactive() + pruneReals() ) / 2; } /** Move reserved load balancer that have expired to inactive */ @@ -68,7 +69,8 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } /** Deprovision inactive load balancers that have expired */ - private boolean removeInactive() { + private double removeInactive() { + MutableInteger attempts = new MutableInteger(0); var failed = new ArrayList<LoadBalancerId>(); var lastException = new AtomicReference<Exception>(); var expiry = nodeRepository().clock().instant().minus(inactiveExpiry); @@ -76,6 +78,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { lb.changedAt().isBefore(expiry) && allocatedNodes(lb.id()).isEmpty(), lb -> { try { + attempts.add(1); log.log(Level.INFO, () -> "Removing expired inactive load balancer " + lb.id()); service.remove(lb.id().application(), lb.id().cluster()); db.removeLoadBalancer(lb.id()); @@ -92,11 +95,12 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { .collect(Collectors.joining(", ")), interval())); } - return lastException.get() == null; + return asSuccessFactor(attempts.get(), failed.size()); } /** Remove reals from inactive load balancers */ - private boolean pruneReals() { + private double pruneReals() { + var attempts = new MutableInteger(0); var failed = new ArrayList<LoadBalancerId>(); var lastException = new AtomicReference<Exception>(); patchLoadBalancers(lb -> lb.state() == State.inactive, lb -> { @@ -107,6 +111,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { reals.removeIf(real -> !allocatedNodes.contains(real.hostname().value())); if (reals.equals(lb.instance().get().reals())) return; // Nothing to remove try { + attempts.add(1); LOG.log(Level.INFO, () -> "Removing reals from inactive load balancer " + lb.id() + ": " + Sets.difference(lb.instance().get().reals(), reals)); service.create(new LoadBalancerSpec(lb.id().application(), lb.id().cluster(), reals), true); db.writeLoadBalancer(lb.with(lb.instance().map(instance -> instance.withReals(reals)))); @@ -124,7 +129,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { interval()), lastException.get()); } - return lastException.get() == null; + return asSuccessFactor(attempts.get(), failed.size()); } /** Patch load balancers matching given filter, while holding lock */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 85437b3e78a..3990c5099eb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -65,7 +65,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } @Override - public boolean maintain() { + public double maintain() { NodeList nodes = nodeRepository().nodes().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); @@ -80,7 +80,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { updateRepairTicketMetrics(nodes); updateAllocationMetrics(nodes); updateExclusiveSwitchMetrics(nodes); - return true; + return 1.0; } private void updateAllocationMetrics(NodeList nodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index effa41dc69f..f16459ee8b9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -72,17 +72,21 @@ public class NodeFailer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; + int attempts = 0; + int failures = 0; int throttledHostFailures = 0; int throttledNodeFailures = 0; // Ready nodes try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) { + attempts++; Node node = entry.getKey(); if (throttle(node)) { + failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -96,10 +100,12 @@ public class NodeFailer extends NodeRepositoryMaintainer { // Active nodes for (Map.Entry<Node, String> entry : getActiveNodesByFailureReason().entrySet()) { + attempts++; Node node = entry.getKey(); if (!failAllowedFor(node.type())) continue; if (throttle(node)) { + failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -116,11 +122,15 @@ public class NodeFailer extends NodeRepositoryMaintainer { if ( ! activeNodes.childrenOf(host).isEmpty()) continue; Optional<NodeMutex> locked = Optional.empty(); try { + attempts++; locked = nodeRepository().nodes().lockAndGet(host); if (locked.isEmpty()) continue; nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, "Host should be failed and have no tenant nodes"); } + catch (Exception e) { + failures++; + } finally { locked.ifPresent(NodeMutex::close); } @@ -130,7 +140,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); - return throttlingActive == 0; + return asSuccessFactor(attempts, failures); } private Map<Node, String> getReadyNodesByFailureReason() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index fe2fb5229f9..37969a30b81 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -5,6 +5,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationLockException; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.jdisc.Metric; +import com.yahoo.lang.MutableInteger; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; @@ -48,13 +49,11 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - updateReadyNodeLivenessEvents(); - updateActiveNodeDownState(); - return true; + protected double maintain() { + return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2; } - private void updateReadyNodeLivenessEvents() { + private double updateReadyNodeLivenessEvents() { // Update node last request events through ZooKeeper to collect request to all config servers. // We do this here ("lazily") to avoid writing to zk for each config request. try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { @@ -69,13 +68,16 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } } } + return 1.0; } /** * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it. * Otherwise we remove any "down" history record. */ - private void updateActiveNodeDownState() { + private double updateActiveNodeDownState() { + var attempts = new MutableInteger(0); + var failures = new MutableInteger(0); NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional<Node> node = activeNodes.node(hostname.toString()); @@ -90,6 +92,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { try (var lock = nodeRepository().nodes().lock(owner)) { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation + attempts.add(1); if (isDown) { recordAsDown(node.get(), lock); } else { @@ -98,8 +101,10 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e)); + failures.add(1); } }); + return asSuccessFactor(attempts.get(), failures.get()); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java index 1ea4577f7fe..d671900d08c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java @@ -33,19 +33,21 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { + int attempts = 0; + var failures = new MutableInteger(0); try { - var warnings = new MutableInteger(0); Set<ApplicationId> applications = activeNodesByApplication().keySet(); - if (applications.isEmpty()) return true; + if (applications.isEmpty()) return 1.0; long pauseMs = interval().toMillis() / applications.size() - 1; // spread requests over interval int done = 0; for (ApplicationId application : applications) { + attempts++; metricsFetcher.fetchMetrics(application) .whenComplete((metricsResponse, exception) -> handleResponse(metricsResponse, exception, - warnings, + failures, application)); if (++done < applications.size()) Thread.sleep(pauseMs); @@ -56,23 +58,22 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { nodeRepository().metricsDb().gc(); - // Suppress failures for manual zones for now to avoid noise - return nodeRepository().zone().environment().isManuallyDeployed() || warnings.get() == 0; + return asSuccessFactor(attempts, failures.get()); } catch (InterruptedException e) { - return false; + return asSuccessFactor(attempts, failures.get()); } } private void handleResponse(MetricsResponse response, Throwable exception, - MutableInteger warnings, + MutableInteger failures, ApplicationId application) { if (exception != null) { - if (warnings.get() < maxWarningsPerInvocation) + if (failures.get() < maxWarningsPerInvocation) log.log(Level.WARNING, "Could not update metrics for " + application + ": " + Exceptions.toMessageString(exception)); - warnings.add(1); + failures.add(1); } else if (response != null) { nodeRepository().metricsDb().addNodeMetrics(response.nodeMetrics()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java index 6ee657beadd..c282fcdb7fc 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java @@ -37,7 +37,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { // Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot List<Node> nodesToReboot = nodeRepository().nodes().list(Node.State.active, Node.State.ready).stream() .filter(node -> node.type().isHost()) @@ -46,7 +46,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { if (!nodesToReboot.isEmpty()) nodeRepository().nodes().reboot(NodeListFilter.from(nodesToReboot)); - return true; + return 1.0; } private boolean shouldReboot(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index 0a1f6961f9f..fe5cd419b31 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -25,7 +25,7 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) { super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), - jobMetrics(metric), nodeRepository.database().cluster(), true); + new NodeRepositoryJobMetrics(metric), nodeRepository.database().cluster(), true); this.nodeRepository = nodeRepository; } @@ -48,10 +48,20 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { .groupingBy(node -> node.allocation().get().owner()); } - private static JobMetrics jobMetrics(Metric metric) { - return new JobMetrics((job, consecutiveFailures) -> { + private static class NodeRepositoryJobMetrics extends JobMetrics { + + private final Metric metric; + + public NodeRepositoryJobMetrics(Metric metric) { + this.metric = metric; + } + + @Override + protected void recordCompletion(String job, Long consecutiveFailures, double successFactor) { metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job))); - }); + metric.set("maintenance.successFactor", successFactor, metric.createContext(Map.of("job", job))); + } + } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java index 4eba15307cb..749603a373d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java @@ -23,13 +23,13 @@ public class OsUpgradeActivator extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { for (var nodeType : NodeType.values()) { if (!nodeType.isHost()) continue; boolean resume = canUpgradeOsOf(nodeType); nodeRepository().osVersions().resumeUpgradeOf(nodeType, resume); } - return true; + return 1.0; } /** Returns whether to allow OS upgrade of nodes of given type */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 1543506a78e..7bb748c92c9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -33,19 +33,18 @@ public class Rebalancer extends NodeMover<Rebalancer.Move> { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary - if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; // Rebalancing not necessary + if (nodeRepository().zone().environment().isTest()) return 1.0; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().nodes().list(); updateSkewMetric(allNodes); - if ( ! zoneIsStable(allNodes)) return success; + if ( ! zoneIsStable(allNodes)) return 1.0; findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); - return success; + return 1.0; } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index f72daf1bc2b..3f5893b368a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -48,7 +48,10 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { + protected double maintain() { + int attempts = 0; + int successes = 0; + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); Map<ApplicationId, NodeList> retiredNodesByApplication = activeNodes.retired().groupingBy(node -> node.allocation().get().owner()); for (Map.Entry<ApplicationId, NodeList> entry : retiredNodesByApplication.entrySet()) { @@ -57,17 +60,19 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; + attempts++; try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { if ( ! deployment.isValid()) continue; nodeRepository().nodes().setRemovable(application, nodesToRemove); boolean success = deployment.activate().isPresent(); - if ( ! success) return success; + if ( ! success) continue; String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); + successes++; } } - return true; + return attempts == 0 ? 1.0 : ((double)successes / attempts); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index c217580872b..888f06a5004 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -36,13 +36,16 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().zone().environment().isProduction()) return true; + protected double maintain() { + if ( ! nodeRepository().zone().environment().isProduction()) return 1.0; + int attempts = 0; int successes = 0; - for (var application : activeNodesByApplication().entrySet()) + for (var application : activeNodesByApplication().entrySet()) { + attempts++; successes += suggest(application.getKey(), application.getValue()); - return successes > 0; + } + return attempts == 0 ? 1.0 : ((double)successes / attempts); } private int suggest(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 0307ae13b24..0589571e9d8 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -66,12 +66,11 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } @Override - protected boolean maintain() { - if ( ! nodeRepository().nodes().isWorking()) return false; + protected double maintain() { + if ( ! nodeRepository().nodes().isWorking()) return 0.0; - boolean success = true; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; NodeList allNodes = nodeRepository().nodes().list(); CapacityChecker capacityChecker = new CapacityChecker(allNodes); @@ -80,6 +79,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { metric.set("overcommittedHosts", overcommittedHosts.size(), null); retireOvercommitedHosts(allNodes, overcommittedHosts); + boolean success = true; Optional<CapacityChecker.HostFailurePath> failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); if (failurePath.isPresent()) { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; @@ -96,7 +96,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } metric.set("spareHostCapacity", spareHostCapacity, null); } - return success; + return success ? 1.0 : 0.0; } private boolean execute(List<Move> mitigation, CapacityChecker.HostFailurePath failurePath) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index cfab980570d..44890f2f5af 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -33,14 +33,14 @@ public class SwitchRebalancer extends NodeMover<Move> { } @Override - protected boolean maintain() { - if (!nodeRepository().nodes().isWorking()) return false; - if (!nodeRepository().zone().environment().isProduction()) return true; + protected double maintain() { + if (!nodeRepository().nodes().isWorking()) return 0.0; + if (!nodeRepository().zone().environment().isProduction()) return 1.0; NodeList allNodes = nodeRepository().nodes().list(); // Lockless as strong consistency is not needed - if (!zoneIsStable(allNodes)) return true; + if (!zoneIsStable(allNodes)) return 1.0; findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository()); - return true; + return 1.0; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java index 5b2f7ce91e8..cfe6e4d348d 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java @@ -43,7 +43,7 @@ public class NodeMetricsDbMaintainerTest { fetcher, Duration.ofHours(1), new TestMetric()); - assertTrue(maintainer.maintain()); + assertEquals(maintainer.maintain(), 1.0, 0.0000001); List<NodeTimeseries> timeseriesList = tester.nodeRepository().metricsDb().getNodeTimeseries(Duration.ofDays(1), Set.of("host-1.yahoo.com", "host-2.yahoo.com")); assertEquals(2, timeseriesList.size()); |