diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-01 15:36:37 +0100 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-11-02 15:58:06 +0100 |
commit | 6688797036b4239ba58c8774f4c0893ed660bbc9 (patch) | |
tree | 9140fceebfa305b3997e9ddd1bfb232411afb17e /node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance | |
parent | 56e467f72da59b3efbfaff574c22e79542f9d17e (diff) |
Move disk replacer to separate maintainer
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance')
3 files changed, 63 insertions, 28 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java new file mode 100644 index 00000000000..acd5cb61d81 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.maintenance; + +import com.yahoo.jdisc.Metric; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeMutex; +import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; + +import java.time.Duration; +import java.util.Optional; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Rebuilds hosts by replacing the root disk (only supports hosts with remote storage). + * + * @author mpolden + */ +public class DiskReplacer extends NodeRepositoryMaintainer { + + private static final Logger log = Logger.getLogger(DiskReplacer.class.getName()); + + private final HostProvisioner hostProvisioner; + + DiskReplacer(NodeRepository nodeRepository, Duration interval, Metric metric, HostProvisioner hostProvisioner) { + super(nodeRepository, interval, metric); + this.hostProvisioner = hostProvisioner; + } + + @Override + protected double maintain() { + NodeList nodes = nodeRepository().nodes().list().rebuilding(true); + int failures = 0; + for (var host : nodes) { + Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10)); + if (optionalMutex.isEmpty()) continue; + try (NodeMutex mutex = optionalMutex.get()) { + // Re-check flag while holding lock + host = mutex.node(); + if (!host.status().wantToRebuild()) { + continue; + } + Node updatedNode = hostProvisioner.replaceRootDisk(host); + if (!updatedNode.status().wantToRebuild()) { + nodeRepository().nodes().write(updatedNode, mutex); + } + } catch (RuntimeException e) { + failures++; + log.log(Level.WARNING, "Failed to rebuild " + host.hostname() + ", will retry in " + interval(), e); + } + } + return this.asSuccessFactor(nodes.size(), failures); + } +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index d6cfeab0cd7..6470e4fdb23 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -79,7 +79,6 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes); convergeToCapacity(nodes); - replaceRootDisk(nodes); return 1.0; } @@ -152,28 +151,6 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { }); } - /** Replace the root disk of hosts that have requested soft-rebuild */ - private void replaceRootDisk(NodeList nodes) { - NodeList softRebuildingHosts = nodes.rebuilding(true); - for (var host : softRebuildingHosts) { - Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10)); - if (optionalMutex.isEmpty()) return; - try (NodeMutex mutex = optionalMutex.get()) { - // Re-check flag while holding lock - host = mutex.node(); - if (!host.status().wantToRebuild()) { - continue; - } - Node updatedNode = hostProvisioner.replaceRootDisk(host); - if (!updatedNode.status().wantToRebuild()) { - nodeRepository().nodes().write(updatedNode, mutex); - } - } catch (RuntimeException e) { - log.log(Level.WARNING, "Failed to rebuild " + host.hostname() + ", will retry in " + interval(), e); - } - } - } - /** * Provision hosts to ensure there is room to allocate spare nodes. * diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 7c748b60527..6175531fc65 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -69,11 +69,11 @@ public class NodeRepositoryMaintenance extends AbstractComponent { .map(lbService -> new LoadBalancerExpirer(nodeRepository, defaults.loadBalancerExpirerInterval, lbService, metric)) .ifPresent(maintainers::add); provisionServiceProvider.getHostProvisioner() - .map(hostProvisioner -> new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric)) - .ifPresent(maintainers::add); - provisionServiceProvider.getHostProvisioner() - .map(hostProvisioner -> new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner)) - .ifPresent(maintainers::add); + .map(hostProvisioner -> List.of( + new DynamicProvisioningMaintainer(nodeRepository, defaults.dynamicProvisionerInterval, hostProvisioner, flagSource, metric), + new HostRetirer(nodeRepository, defaults.hostRetirerInterval, metric, hostProvisioner), + new DiskReplacer(nodeRepository, defaults.diskReplacerInterval, metric, hostProvisioner))) + .ifPresent(maintainers::addAll); // The DuperModel is filled with infrastructure applications by the infrastructure provisioner, so explicitly run that now infrastructureProvisioner.maintainButThrowOnException(); } @@ -112,6 +112,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { private final Duration infrastructureProvisionInterval; private final Duration loadBalancerExpirerInterval; private final Duration dynamicProvisionerInterval; + private final Duration diskReplacerInterval; private final Duration osUpgradeActivatorInterval; private final Duration rebalancerInterval; private final Duration nodeMetricsCollectionInterval; @@ -125,6 +126,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { DefaultTimes(Zone zone, Deployer deployer) { autoscalingInterval = Duration.ofMinutes(5); dynamicProvisionerInterval = Duration.ofMinutes(3); + diskReplacerInterval = Duration.ofMinutes(3); failedExpirerInterval = Duration.ofMinutes(10); failGrace = Duration.ofMinutes(20); infrastructureProvisionInterval = Duration.ofMinutes(3); |