diff options
author | jonmv <venstad@gmail.com> | 2023-08-29 14:28:25 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-08-29 14:28:25 +0200 |
commit | 20e6b984a523865381561b999168fcaeee85d960 (patch) | |
tree | 00bf08ebb43f58f58f5cffc9634a4bc7ff307a9c /node-repository | |
parent | e7c66eb91bd479cee4d499954ecb176c4361956a (diff) |
Batch disk replacer
Diffstat (limited to 'node-repository')
3 files changed, 37 insertions, 39 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java index 0e67027f8ca..0c1d6291baa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java @@ -8,17 +8,14 @@ import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; +import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.RebuildResult; import com.yahoo.yolean.Exceptions; -import com.yahoo.yolean.UncheckedInterruptedException; import java.time.Duration; -import java.util.HashMap; -import java.util.Map; +import java.util.List; import java.util.Optional; -import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.logging.Level; import java.util.logging.Logger; @@ -43,34 +40,23 @@ public class DiskReplacer extends NodeRepositoryMaintainer { @Override protected double maintain() { NodeList nodes = nodeRepository().nodes().list().rebuilding(true); - int rebuilding = 0; int failures = 0; + List<Node> rebuilding; try (var locked = nodeRepository().nodes().lockAndGetAll(nodes.asList(), Optional.of(Duration.ofSeconds(10)))) { - Map<String, Future<Node>> rebuilt = new HashMap<>(); - for (NodeMutex node : locked.nodes()) { - if (node.node().status().wantToRebuild() && ++rebuilding <= maxBatchSize) { - rebuilt.put(node.node().hostname(), executor.submit(() -> hostProvisioner.replaceRootDisk(node.node()))); - } - } + rebuilding = locked.nodes().stream().map(NodeMutex::node).toList(); + RebuildResult result = hostProvisioner.replaceRootDisk(rebuilding); + + for (Node updated : result.rebuilt()) + if (!updated.status().wantToRebuild()) + nodeRepository().nodes().write(updated, () -> { }); - for (var node : rebuilt.entrySet()) { - try { - Node updated = node.getValue().get(); - if ( ! updated.status().wantToRebuild()) { - nodeRepository().nodes().write(updated, () -> { }); - } - } - catch (ExecutionException e) { - ++failures; - log.log(Level.WARNING, "Failed to rebuild " + node.getKey() + ", will retry in " + - interval() + ": " + Exceptions.toMessageString(e.getCause())); - } - catch (InterruptedException e) { - throw new UncheckedInterruptedException(e, true); - } + for (var entry : result.failed().entrySet()) { + ++failures; + log.log(Level.WARNING, "Failed to rebuild " + entry.getKey() + ", will retry in " + + interval() + ": " + Exceptions.toMessageString(entry.getValue())); } } - return this.asSuccessFactorDeviation(rebuilding, failures); + return this.asSuccessFactorDeviation(rebuilding.size(), failures); } @Override diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java index 66d1a4e8bc8..630c8670bdf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java @@ -7,7 +7,9 @@ import com.yahoo.config.provision.NodeAllocationException; import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.Node; +import java.util.Collection; import java.util.List; +import java.util.Map; import java.util.function.Consumer; import java.util.function.Predicate; @@ -65,11 +67,13 @@ public interface HostProvisioner { */ void deprovision(Node host); - /** Replace the root (OS) disk of host. Implementations of this are expected to be idempotent. + /** Replace the root (OS) disk of hosts. Implementations of this are expected to be idempotent. * - * @return the updated node object + * @return the node objects for which updates were made */ - Node replaceRootDisk(Node host); + default RebuildResult replaceRootDisk(Collection<Node> hosts) { throw new UnsupportedOperationException(); } + + record RebuildResult(List<Node> rebuilt, Map<Node, Exception> failed) { } /** * Returns the maintenance events scheduled for hosts in this zone, in given cloud accounts. Host events in the diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java index 965611b9a6e..03923853594 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java @@ -21,9 +21,11 @@ import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost; import java.time.Instant; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -32,6 +34,8 @@ import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.IntStream; +import static com.yahoo.config.provision.NodeType.host; + /** * @author mpolden */ @@ -82,7 +86,7 @@ public class MockHostProvisioner implements HostProvisioner { List<ProvisionedHost> hosts = new ArrayList<>(); for (int index : request.indices()) { - String hostHostname = request.type() == NodeType.host ? "host" + index : request.type().name() + index; + String hostHostname = request.type() == host ? "host" + index : request.type().name() + index; hosts.add(new ProvisionedHost("id-of-" + request.type().name() + index, hostHostname, hostFlavor, @@ -117,13 +121,17 @@ public class MockHostProvisioner implements HostProvisioner { } @Override - public Node replaceRootDisk(Node host) { - if (!host.type().isHost()) throw new IllegalArgumentException(host + " is not a host"); - if (rebuildsCompleted.remove(host.hostname())) { - return host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(), - false, false, Agent.system, Instant.ofEpochMilli(123)); + public RebuildResult replaceRootDisk(Collection<Node> hosts) { + List<Node> updated = new ArrayList<>(); + Map<Node, Exception> failed = new LinkedHashMap<>(); + for (Node host : hosts) { + if ( ! host.type().isHost()) failed.put(host, new IllegalArgumentException(host + " is not a host")); + if (rebuildsCompleted.remove(host.hostname())) { + updated.add(host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(), + false, false, Agent.system, Instant.ofEpochMilli(123))); + } } - return host; + return new RebuildResult(updated, failed); } @Override @@ -219,7 +227,7 @@ public class MockHostProvisioner implements HostProvisioner { long numAddresses = Math.max(2, Math.round(flavor.resources().bandwidthGbps())); return IntStream.range(1, (int) numAddresses) .mapToObj(i -> { - String hostname = hostType == NodeType.host + String hostname = hostType == host ? "host" + hostIndex + "-" + i : hostType.childNodeType().name() + i; return HostName.of(hostname); |