From cd1b747b4f65fa3a6ed6aace23235db7591638c5 Mon Sep 17 00:00:00 2001 From: Arnstein Ressem Date: Fri, 4 Jun 2021 21:57:32 +0200 Subject: Revert "Emit a success factor from maintainers" --- .../maintenance/ApplicationMaintainer.java | 4 ++-- .../maintenance/AutoscalingMaintainer.java | 9 +++++---- .../maintenance/DynamicProvisioningMaintainer.java | 4 ++-- .../vespa/hosted/provision/maintenance/Expirer.java | 4 ++-- .../hosted/provision/maintenance/FailedExpirer.java | 4 ++-- .../hosted/provision/maintenance/HostEncrypter.java | 4 ++-- .../maintenance/InfrastructureProvisioner.java | 4 ++-- .../provision/maintenance/LoadBalancerExpirer.java | 17 ++++++----------- .../provision/maintenance/MetricsReporter.java | 4 ++-- .../hosted/provision/maintenance/NodeFailer.java | 16 +++------------- .../provision/maintenance/NodeHealthTracker.java | 17 ++++++----------- .../maintenance/NodeMetricsDbMaintainer.java | 21 ++++++++++----------- .../hosted/provision/maintenance/NodeRebooter.java | 4 ++-- .../maintenance/NodeRepositoryMaintainer.java | 18 ++++-------------- .../provision/maintenance/OsUpgradeActivator.java | 4 ++-- .../hosted/provision/maintenance/Rebalancer.java | 13 +++++++------ .../provision/maintenance/RetiredExpirer.java | 11 +++-------- .../maintenance/ScalingSuggestionsMaintainer.java | 11 ++++------- .../maintenance/SpareCapacityMaintainer.java | 10 +++++----- .../provision/maintenance/SwitchRebalancer.java | 10 +++++----- .../maintenance/NodeMetricsDbMaintainerTest.java | 2 +- 21 files changed, 77 insertions(+), 114 deletions(-) (limited to 'node-repository/src') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 24160c19dfa..9ac1ca2b4c1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -41,9 +41,9 @@ public abstract class ApplicationMaintainer extends NodeRepositoryMaintainer { } @Override - protected final double maintain() { + protected final boolean maintain() { applicationsNeedingMaintenance().forEach(this::deploy); - return 1.0; + return true; } /** Returns the number of deployments that are pending execution */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 3914a0c9f07..05eb878e1b1 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -44,13 +44,14 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - if ( ! nodeRepository().nodes().isWorking()) return 0.0; + protected boolean maintain() { + if ( ! nodeRepository().nodes().isWorking()) return false; - if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return 1.0; + boolean success = true; + if ( ! nodeRepository().zone().environment().isAnyOf(Environment.dev, Environment.prod)) return success; activeNodesByApplication().forEach(this::autoscale); - return 1.0; + return success; } private void autoscale(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 0eb2038e233..af32c285d99 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -74,13 +74,13 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } - return 1.0; + return true; } /** Resume provisioning of already provisioned hosts and their children */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index 25108425e6e..2443a12d198 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -40,7 +40,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { NodeList expired = nodeRepository().nodes().list(fromState).matching(this::isExpired); if ( ! expired.isEmpty()) { @@ -49,7 +49,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { } metric.add("expired." + fromState, expired.size(), null); - return 1.0; + return true; } protected boolean isExpired(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 7505ce42668..e98da35aa6a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -66,7 +66,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { List remainingNodes = new ArrayList<>(nodeRepository.nodes().list(Node.State.failed) .nodeType(NodeType.tenant, NodeType.host) .asList()); @@ -78,7 +78,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { recycleIf(remainingNodes, node -> node.allocation().get().membership().cluster().isStateful() && node.history().hasEventBefore(History.Event.Type.failed, clock().instant().minus(statefulExpiry))); - return 1.0; + return true; } /** Recycle the nodes matching condition, and remove those nodes from the nodes list. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java index 80f74a011c0..caf20463f60 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostEncrypter.java @@ -43,7 +43,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { Instant now = nodeRepository().clock().instant(); NodeList allNodes = nodeRepository().nodes().list(); for (var nodeType : NodeType.values()) { @@ -51,7 +51,7 @@ public class HostEncrypter extends NodeRepositoryMaintainer { if (upgradingVespa(allNodes, nodeType)) continue; unencryptedHosts(allNodes, nodeType).forEach(host -> encrypt(host, now)); } - return 1.0; + return true; } /** Returns whether any node of given type is currently upgrading its Vespa version */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java index d9f5ea6a7a9..e317333135c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InfrastructureProvisioner.java @@ -39,9 +39,9 @@ public class InfrastructureProvisioner extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { infraDeployer.activateAllSupportedInfraApplications(false); - return 1.0; + return true; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java index ac9d8d6671a..10069fd1a18 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.google.common.collect.Sets; import com.yahoo.jdisc.Metric; -import com.yahoo.lang.MutableInteger; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.lb.LoadBalancer; @@ -55,9 +54,9 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { expireReserved(); - return ( removeInactive() + pruneReals() ) / 2; + return removeInactive() & pruneReals(); } /** Move reserved load balancer that have expired to inactive */ @@ -69,8 +68,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } /** Deprovision inactive load balancers that have expired */ - private double removeInactive() { - MutableInteger attempts = new MutableInteger(0); + private boolean removeInactive() { var failed = new ArrayList(); var lastException = new AtomicReference(); var expiry = nodeRepository().clock().instant().minus(inactiveExpiry); @@ -78,7 +76,6 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { lb.changedAt().isBefore(expiry) && allocatedNodes(lb.id()).isEmpty(), lb -> { try { - attempts.add(1); log.log(Level.INFO, () -> "Removing expired inactive load balancer " + lb.id()); service.remove(lb.id().application(), lb.id().cluster()); db.removeLoadBalancer(lb.id()); @@ -95,12 +92,11 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { .collect(Collectors.joining(", ")), interval())); } - return asSuccessFactor(attempts.get(), failed.size()); + return lastException.get() == null; } /** Remove reals from inactive load balancers */ - private double pruneReals() { - var attempts = new MutableInteger(0); + private boolean pruneReals() { var failed = new ArrayList(); var lastException = new AtomicReference(); patchLoadBalancers(lb -> lb.state() == State.inactive, lb -> { @@ -111,7 +107,6 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { reals.removeIf(real -> !allocatedNodes.contains(real.hostname().value())); if (reals.equals(lb.instance().get().reals())) return; // Nothing to remove try { - attempts.add(1); LOG.log(Level.INFO, () -> "Removing reals from inactive load balancer " + lb.id() + ": " + Sets.difference(lb.instance().get().reals(), reals)); service.create(new LoadBalancerSpec(lb.id().application(), lb.id().cluster(), reals), true); db.writeLoadBalancer(lb.with(lb.instance().map(instance -> instance.withReals(reals)))); @@ -129,7 +124,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { interval()), lastException.get()); } - return asSuccessFactor(attempts.get(), failed.size()); + return lastException.get() == null; } /** Patch load balancers matching given filter, while holding lock */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 3990c5099eb..85437b3e78a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -65,7 +65,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } @Override - public double maintain() { + public boolean maintain() { NodeList nodes = nodeRepository().nodes().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); @@ -80,7 +80,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { updateRepairTicketMetrics(nodes); updateAllocationMetrics(nodes); updateExclusiveSwitchMetrics(nodes); - return 1.0; + return true; } private void updateAllocationMetrics(NodeList nodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index f16459ee8b9..effa41dc69f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -72,21 +72,17 @@ public class NodeFailer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - if ( ! nodeRepository().nodes().isWorking()) return 0.0; + protected boolean maintain() { + if ( ! nodeRepository().nodes().isWorking()) return false; - int attempts = 0; - int failures = 0; int throttledHostFailures = 0; int throttledNodeFailures = 0; // Ready nodes try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { for (Map.Entry entry : getReadyNodesByFailureReason().entrySet()) { - attempts++; Node node = entry.getKey(); if (throttle(node)) { - failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -100,12 +96,10 @@ public class NodeFailer extends NodeRepositoryMaintainer { // Active nodes for (Map.Entry entry : getActiveNodesByFailureReason().entrySet()) { - attempts++; Node node = entry.getKey(); if (!failAllowedFor(node.type())) continue; if (throttle(node)) { - failures++; if (node.type().isHost()) throttledHostFailures++; else @@ -122,15 +116,11 @@ public class NodeFailer extends NodeRepositoryMaintainer { if ( ! activeNodes.childrenOf(host).isEmpty()) continue; Optional locked = Optional.empty(); try { - attempts++; locked = nodeRepository().nodes().lockAndGet(host); if (locked.isEmpty()) continue; nodeRepository().nodes().fail(List.of(locked.get().node()), Agent.NodeFailer, "Host should be failed and have no tenant nodes"); } - catch (Exception e) { - failures++; - } finally { locked.ifPresent(NodeMutex::close); } @@ -140,7 +130,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { metric.set(throttlingActiveMetric, throttlingActive, null); metric.set(throttledHostFailuresMetric, throttledHostFailures, null); metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null); - return asSuccessFactor(attempts, failures); + return throttlingActive == 0; } private Map getReadyNodesByFailureReason() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 37969a30b81..fe2fb5229f9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -5,7 +5,6 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.ApplicationLockException; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.jdisc.Metric; -import com.yahoo.lang.MutableInteger; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.ServiceInstance; import com.yahoo.vespa.applicationmodel.ServiceStatus; @@ -49,11 +48,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - return ( updateReadyNodeLivenessEvents() + updateActiveNodeDownState() ) / 2; + protected boolean maintain() { + updateReadyNodeLivenessEvents(); + updateActiveNodeDownState(); + return true; } - private double updateReadyNodeLivenessEvents() { + private void updateReadyNodeLivenessEvents() { // Update node last request events through ZooKeeper to collect request to all config servers. // We do this here ("lazily") to avoid writing to zk for each config request. try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { @@ -68,16 +69,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } } } - return 1.0; } /** * If the node is down (see {@link #allDown}), and there is no "down" history record, we add it. * Otherwise we remove any "down" history record. */ - private double updateActiveNodeDownState() { - var attempts = new MutableInteger(0); - var failures = new MutableInteger(0); + private void updateActiveNodeDownState() { NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional node = activeNodes.node(hostname.toString()); @@ -92,7 +90,6 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { try (var lock = nodeRepository().nodes().lock(owner)) { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation - attempts.add(1); if (isDown) { recordAsDown(node.get(), lock); } else { @@ -101,10 +98,8 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { } catch (ApplicationLockException e) { // Fine, carry on with other nodes. We'll try updating this one in the next run log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e)); - failures.add(1); } }); - return asSuccessFactor(attempts.get(), failures.get()); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java index d671900d08c..1ea4577f7fe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainer.java @@ -33,21 +33,19 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - int attempts = 0; - var failures = new MutableInteger(0); + protected boolean maintain() { try { + var warnings = new MutableInteger(0); Set applications = activeNodesByApplication().keySet(); - if (applications.isEmpty()) return 1.0; + if (applications.isEmpty()) return true; long pauseMs = interval().toMillis() / applications.size() - 1; // spread requests over interval int done = 0; for (ApplicationId application : applications) { - attempts++; metricsFetcher.fetchMetrics(application) .whenComplete((metricsResponse, exception) -> handleResponse(metricsResponse, exception, - failures, + warnings, application)); if (++done < applications.size()) Thread.sleep(pauseMs); @@ -58,22 +56,23 @@ public class NodeMetricsDbMaintainer extends NodeRepositoryMaintainer { nodeRepository().metricsDb().gc(); - return asSuccessFactor(attempts, failures.get()); + // Suppress failures for manual zones for now to avoid noise + return nodeRepository().zone().environment().isManuallyDeployed() || warnings.get() == 0; } catch (InterruptedException e) { - return asSuccessFactor(attempts, failures.get()); + return false; } } private void handleResponse(MetricsResponse response, Throwable exception, - MutableInteger failures, + MutableInteger warnings, ApplicationId application) { if (exception != null) { - if (failures.get() < maxWarningsPerInvocation) + if (warnings.get() < maxWarningsPerInvocation) log.log(Level.WARNING, "Could not update metrics for " + application + ": " + Exceptions.toMessageString(exception)); - failures.add(1); + warnings.add(1); } else if (response != null) { nodeRepository().metricsDb().addNodeMetrics(response.nodeMetrics()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java index c282fcdb7fc..6ee657beadd 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java @@ -37,7 +37,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { // Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot List nodesToReboot = nodeRepository().nodes().list(Node.State.active, Node.State.ready).stream() .filter(node -> node.type().isHost()) @@ -46,7 +46,7 @@ public class NodeRebooter extends NodeRepositoryMaintainer { if (!nodesToReboot.isEmpty()) nodeRepository().nodes().reboot(NodeListFilter.from(nodesToReboot)); - return 1.0; + return true; } private boolean shouldReboot(Node node) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index fe5cd419b31..0a1f6961f9f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -25,7 +25,7 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) { super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), - new NodeRepositoryJobMetrics(metric), nodeRepository.database().cluster(), true); + jobMetrics(metric), nodeRepository.database().cluster(), true); this.nodeRepository = nodeRepository; } @@ -48,20 +48,10 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { .groupingBy(node -> node.allocation().get().owner()); } - private static class NodeRepositoryJobMetrics extends JobMetrics { - - private final Metric metric; - - public NodeRepositoryJobMetrics(Metric metric) { - this.metric = metric; - } - - @Override - protected void recordCompletion(String job, Long consecutiveFailures, double successFactor) { + private static JobMetrics jobMetrics(Metric metric) { + return new JobMetrics((job, consecutiveFailures) -> { metric.set("maintenance.consecutiveFailures", consecutiveFailures, metric.createContext(Map.of("job", job))); - metric.set("maintenance.successFactor", successFactor, metric.createContext(Map.of("job", job))); - } - + }); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java index 749603a373d..4eba15307cb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java @@ -23,13 +23,13 @@ public class OsUpgradeActivator extends NodeRepositoryMaintainer { } @Override - protected double maintain() { + protected boolean maintain() { for (var nodeType : NodeType.values()) { if (!nodeType.isHost()) continue; boolean resume = canUpgradeOsOf(nodeType); nodeRepository().osVersions().resumeUpgradeOf(nodeType, resume); } - return 1.0; + return true; } /** Returns whether to allow OS upgrade of nodes of given type */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 7bb748c92c9..1543506a78e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -33,18 +33,19 @@ public class Rebalancer extends NodeMover { } @Override - protected double maintain() { - if ( ! nodeRepository().nodes().isWorking()) return 0.0; + protected boolean maintain() { + if ( ! nodeRepository().nodes().isWorking()) return false; - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; // Rebalancing not necessary - if (nodeRepository().zone().environment().isTest()) return 1.0; // Short lived deployments; no need to rebalance + boolean success = true; + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary + if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed NodeList allNodes = nodeRepository().nodes().list(); updateSkewMetric(allNodes); - if ( ! zoneIsStable(allNodes)) return 1.0; + if ( ! zoneIsStable(allNodes)) return success; findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); - return 1.0; + return success; } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index 3f5893b368a..f72daf1bc2b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -48,10 +48,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - int attempts = 0; - int successes = 0; - + protected boolean maintain() { NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); Map retiredNodesByApplication = activeNodes.retired().groupingBy(node -> node.allocation().get().owner()); for (Map.Entry entry : retiredNodesByApplication.entrySet()) { @@ -60,19 +57,17 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { List nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; - attempts++; try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { if ( ! deployment.isValid()) continue; nodeRepository().nodes().setRemovable(application, nodesToRemove); boolean success = deployment.activate().isPresent(); - if ( ! success) continue; + if ( ! success) return success; String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); - successes++; } } - return attempts == 0 ? 1.0 : ((double)successes / attempts); + return true; } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index 888f06a5004..c217580872b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -36,16 +36,13 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - if ( ! nodeRepository().zone().environment().isProduction()) return 1.0; + protected boolean maintain() { + if ( ! nodeRepository().zone().environment().isProduction()) return true; - int attempts = 0; int successes = 0; - for (var application : activeNodesByApplication().entrySet()) { - attempts++; + for (var application : activeNodesByApplication().entrySet()) successes += suggest(application.getKey(), application.getValue()); - } - return attempts == 0 ? 1.0 : ((double)successes / attempts); + return successes > 0; } private int suggest(ApplicationId application, NodeList applicationNodes) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index 0589571e9d8..0307ae13b24 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -66,11 +66,12 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } @Override - protected double maintain() { - if ( ! nodeRepository().nodes().isWorking()) return 0.0; + protected boolean maintain() { + if ( ! nodeRepository().nodes().isWorking()) return false; + boolean success = true; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. - if (nodeRepository().zone().getCloud().dynamicProvisioning()) return 1.0; + if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; NodeList allNodes = nodeRepository().nodes().list(); CapacityChecker capacityChecker = new CapacityChecker(allNodes); @@ -79,7 +80,6 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { metric.set("overcommittedHosts", overcommittedHosts.size(), null); retireOvercommitedHosts(allNodes, overcommittedHosts); - boolean success = true; Optional failurePath = capacityChecker.worstCaseHostLossLeadingToFailure(); if (failurePath.isPresent()) { int spareHostCapacity = failurePath.get().hostsCausingFailure.size() - 1; @@ -96,7 +96,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { } metric.set("spareHostCapacity", spareHostCapacity, null); } - return success ? 1.0 : 0.0; + return success; } private boolean execute(List mitigation, CapacityChecker.HostFailurePath failurePath) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index 44890f2f5af..cfab980570d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -33,14 +33,14 @@ public class SwitchRebalancer extends NodeMover { } @Override - protected double maintain() { - if (!nodeRepository().nodes().isWorking()) return 0.0; - if (!nodeRepository().zone().environment().isProduction()) return 1.0; + protected boolean maintain() { + if (!nodeRepository().nodes().isWorking()) return false; + if (!nodeRepository().zone().environment().isProduction()) return true; NodeList allNodes = nodeRepository().nodes().list(); // Lockless as strong consistency is not needed - if (!zoneIsStable(allNodes)) return 1.0; + if (!zoneIsStable(allNodes)) return true; findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository()); - return 1.0; + return true; } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java index cfe6e4d348d..5b2f7ce91e8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMetricsDbMaintainerTest.java @@ -43,7 +43,7 @@ public class NodeMetricsDbMaintainerTest { fetcher, Duration.ofHours(1), new TestMetric()); - assertEquals(maintainer.maintain(), 1.0, 0.0000001); + assertTrue(maintainer.maintain()); List timeseriesList = tester.nodeRepository().metricsDb().getNodeTimeseries(Duration.ofDays(1), Set.of("host-1.yahoo.com", "host-2.yahoo.com")); assertEquals(2, timeseriesList.size()); -- cgit v1.2.3