summary refs log tree commit diff stats
path: root/node-repository
diff options
context:
space:
mode:
author jonmv <venstad@gmail.com> 2023-08-29 14:28:25 +0200
committer jonmv <venstad@gmail.com> 2023-08-29 14:28:25 +0200
commit 20e6b984a523865381561b999168fcaeee85d960 (patch)
tree 00bf08ebb43f58f58f5cffc9634a4bc7ff307a9c /node-repository
parent e7c66eb91bd479cee4d499954ecb176c4361956a (diff)
Batch disk replacer
Diffstat (limited to 'node-repository')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java 42
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java 10
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java 24
3 files changed, 37 insertions, 39 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java
index 0e67027f8ca..0c1d6291baa 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DiskReplacer.java
@@ -8,17 +8,14 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;
+import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.RebuildResult;
import com.yahoo.yolean.Exceptions;
-import com.yahoo.yolean.UncheckedInterruptedException;
import java.time.Duration;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.List;
import java.util.Optional;
-import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -43,34 +40,23 @@ public class DiskReplacer extends NodeRepositoryMaintainer {
@Override
protected double maintain() {
NodeList nodes = nodeRepository().nodes().list().rebuilding(true);
- int rebuilding = 0;
int failures = 0;
+ List<Node> rebuilding; // the batch handed to the host provisioner; assigned inside the lock scope below
try (var locked = nodeRepository().nodes().lockAndGetAll(nodes.asList(), Optional.of(Duration.ofSeconds(10)))) {
- Map<String, Future<Node>> rebuilt = new HashMap<>();
- for (NodeMutex node : locked.nodes()) {
- if (node.node().status().wantToRebuild() && ++rebuilding <= maxBatchSize) {
- rebuilt.put(node.node().hostname(), executor.submit(() -> hostProvisioner.replaceRootDisk(node.node())));
- }
- }
+ rebuilding = locked.nodes().stream().map(NodeMutex::node).toList(); // every node returned by lockAndGetAll, unfiltered
+ RebuildResult result = hostProvisioner.replaceRootDisk(rebuilding); // single batched call; NOTE(review): maxBatchSize is no longer applied in this method — confirm the provisioner bounds the batch
+
+ for (Node updated : result.rebuilt())
+ if (!updated.status().wantToRebuild()) // only persist nodes whose wantToRebuild flag is now cleared
+ nodeRepository().nodes().write(updated, () -> { });
- for (var node : rebuilt.entrySet()) {
- try {
- Node updated = node.getValue().get();
- if ( ! updated.status().wantToRebuild()) {
- nodeRepository().nodes().write(updated, () -> { });
- }
- }
- catch (ExecutionException e) {
- ++failures;
- log.log(Level.WARNING, "Failed to rebuild " + node.getKey() + ", will retry in " +
- interval() + ": " + Exceptions.toMessageString(e.getCause()));
- }
- catch (InterruptedException e) {
- throw new UncheckedInterruptedException(e, true);
- }
+ for (var entry : result.failed().entrySet()) {
+ ++failures; // one failure per entry reported in result.failed()
+ log.log(Level.WARNING, "Failed to rebuild " + entry.getKey() + ", will retry in " +
+ interval() + ": " + Exceptions.toMessageString(entry.getValue()));
}
}
- return this.asSuccessFactorDeviation(rebuilding, failures);
+ return this.asSuccessFactorDeviation(rebuilding.size(), failures); // NOTE(review): the attempt count is now all locked nodes — confirm this is intended
}
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
index 66d1a4e8bc8..630c8670bdf 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/HostProvisioner.java
@@ -7,7 +7,9 @@ import com.yahoo.config.provision.NodeAllocationException;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
+import java.util.Collection;
import java.util.List;
+import java.util.Map;
import java.util.function.Consumer;
import java.util.function.Predicate;
@@ -65,11 +67,13 @@ public interface HostProvisioner {
*/
void deprovision(Node host);
- /** Replace the root (OS) disk of host. Implementations of this are expected to be idempotent.
+ /** Replace the root (OS) disk of hosts. Implementations of this are expected to be idempotent.
*
- * @return the updated node object
+ * @return the nodes that were updated, and the hosts whose replacement failed, each mapped to its exception
*/
- Node replaceRootDisk(Node host);
+ default RebuildResult replaceRootDisk(Collection<Node> hosts) { throw new UnsupportedOperationException(); } // the default always throws; implementations that support rebuilds override it
+
+ record RebuildResult(List<Node> rebuilt, Map<Node, Exception> failed) { } // rebuilt: updated node objects; failed: host -> exception describing why it failed
/**
* Returns the maintenance events scheduled for hosts in this zone, in given cloud accounts. Host events in the
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
index 965611b9a6e..03923853594 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockHostProvisioner.java
@@ -21,9 +21,11 @@ import com.yahoo.vespa.hosted.provision.provisioning.ProvisionedHost;
import java.time.Instant;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -32,6 +34,8 @@ import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.stream.IntStream;
+import static com.yahoo.config.provision.NodeType.host;
+
/**
* @author mpolden
*/
@@ -82,7 +86,7 @@ public class MockHostProvisioner implements HostProvisioner {
List<ProvisionedHost> hosts = new ArrayList<>();
for (int index : request.indices()) {
- String hostHostname = request.type() == NodeType.host ? "host" + index : request.type().name() + index;
+ String hostHostname = request.type() == host ? "host" + index : request.type().name() + index;
hosts.add(new ProvisionedHost("id-of-" + request.type().name() + index,
hostHostname,
hostFlavor,
@@ -117,13 +121,17 @@ public class MockHostProvisioner implements HostProvisioner {
}
@Override
- public Node replaceRootDisk(Node host) {
- if (!host.type().isHost()) throw new IllegalArgumentException(host + " is not a host");
- if (rebuildsCompleted.remove(host.hostname())) {
- return host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(),
- false, false, Agent.system, Instant.ofEpochMilli(123));
+ public RebuildResult replaceRootDisk(Collection<Node> hosts) { // mock: a host counts as rebuilt once its hostname is present in rebuildsCompleted
+ List<Node> updated = new ArrayList<>();
+ Map<Node, Exception> failed = new LinkedHashMap<>();
+ for (Node host : hosts) {
+ if ( ! host.type().isHost()) failed.put(host, new IllegalArgumentException(host + " is not a host"));
+ else if (rebuildsCompleted.remove(host.hostname())) { // else: a rejected non-host must never also be reported as rebuilt, nor consume a completed-rebuild entry
+ updated.add(host.withWantToRetire(host.status().wantToRetire(), host.status().wantToDeprovision(),
+ false, false, Agent.system, Instant.ofEpochMilli(123)));
+ }
}
- return host;
+ return new RebuildResult(updated, failed);
}
@Override
@@ -219,7 +227,7 @@ public class MockHostProvisioner implements HostProvisioner {
long numAddresses = Math.max(2, Math.round(flavor.resources().bandwidthGbps()));
return IntStream.range(1, (int) numAddresses)
.mapToObj(i -> {
- String hostname = hostType == NodeType.host
+ String hostname = hostType == host
? "host" + hostIndex + "-" + i
: hostType.childNodeType().name() + i;
return HostName.of(hostname);