summary | refs | log | tree | commit | diff | stats
path: root/controller-server
diff options
context:
space:
mode:
author: Ola Aunrønning <olaa@verizonmedia.com> 2022-03-22 09:41:33 +0100
committer: Ola Aunrønning <olaa@verizonmedia.com> 2022-03-22 09:41:33 +0100
commit: 8cd41ef8ba2c36a01f1a12f9772435e06db92223 (patch)
tree: 026109d033035a3f47a099f716c2e573da0c3eb0 /controller-server
parent: 4a6506e8e721d75d91e1398a876653df1e749d2b (diff)
Support multizone CMR
Diffstat (limited to 'controller-server')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java 55
-rw-r--r-- controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java 27
2 files changed, 57 insertions, 25 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java
index 9c3a5aa2831..b764d16483e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java
@@ -31,6 +31,7 @@ import java.util.Set;
import java.util.function.Predicate;
import java.util.logging.Logger;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
/**
*
@@ -80,7 +81,8 @@ public class VcmrMaintainer extends ControllerMaintainer {
var updatedVcmr = vcmr.withActionPlan(nextActions)
.withStatus(status);
curator.writeChangeRequest(updatedVcmr);
- approveChangeRequest(updatedVcmr);
+ if (nodes.keySet().size() == 1)
+ approveChangeRequest(updatedVcmr);
});
}
});
@@ -119,19 +121,30 @@ public class VcmrMaintainer extends ControllerMaintainer {
return Status.NOOP;
}
- private List<HostAction> getNextActions(List<Node> nodes, VespaChangeRequest changeRequest) {
- var spareCapacity = hasSpareCapacity(changeRequest.getZoneId(), nodes);
- return nodes.stream()
- .map(node -> nextAction(node, changeRequest, spareCapacity))
- .collect(Collectors.toList());
+ private List<HostAction> getNextActions(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) {
+ return nodesByZone.entrySet()
+ .stream()
+ .flatMap(entry -> {
+ var zone = entry.getKey();
+ var nodes = entry.getValue();
+ if (nodes.isEmpty()) {
+ return Stream.empty();
+ }
+ var spareCapacity = hasSpareCapacity(zone, nodes);
+ return nodes.stream().map(node -> nextAction(zone, node, changeRequest, spareCapacity));
+ }).collect(Collectors.toList());
+
}
// Get the superset of impacted hosts by looking at impacted switches
- private List<Node> impactedNodes(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) {
- return nodesByZone.get(changeRequest.getZoneId())
+ private Map<ZoneId, List<Node>> impactedNodes(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) {
+ return nodesByZone.entrySet()
.stream()
- .filter(isImpacted(changeRequest))
- .collect(Collectors.toList());
+ .filter(entry -> entry.getValue().stream().anyMatch(isImpacted(changeRequest))) // Skip zones without impacted nodes
+ .collect(Collectors.toMap(
+ Map.Entry::getKey,
+ entry -> entry.getValue().stream().filter(isImpacted(changeRequest)).collect(Collectors.toList())
+ ));
}
private Optional<HostAction> getPreviousAction(Node node, VespaChangeRequest changeRequest) {
@@ -141,25 +154,25 @@ public class VcmrMaintainer extends ControllerMaintainer {
.findFirst();
}
- private HostAction nextAction(Node node, VespaChangeRequest changeRequest, boolean spareCapacity) {
+ private HostAction nextAction(ZoneId zoneId, Node node, VespaChangeRequest changeRequest, boolean spareCapacity) {
var hostAction = getPreviousAction(node, changeRequest)
.orElse(new HostAction(node.hostname().value(), State.NONE, Instant.now()));
if (changeRequest.getChangeRequestSource().isClosed()) {
LOG.fine(() -> changeRequest.getChangeRequestSource().getId() + " is closed, recycling " + node.hostname());
- recycleNode(changeRequest.getZoneId(), node, hostAction);
- removeReport(changeRequest, node);
+ recycleNode(zoneId, node, hostAction);
+ removeReport(zoneId, changeRequest, node);
return hostAction.withState(State.COMPLETE);
}
if (isLowImpact(changeRequest))
return hostAction;
- addReport(changeRequest, node);
+ addReport(zoneId, changeRequest, node);
if (isPostponed(changeRequest, hostAction)) {
LOG.fine(() -> changeRequest.getChangeRequestSource().getId() + " is postponed, recycling " + node.hostname());
- recycleNode(changeRequest.getZoneId(), node, hostAction);
+ recycleNode(zoneId, node, hostAction);
return hostAction.withState(State.PENDING_RETIREMENT);
}
@@ -172,11 +185,11 @@ public class VcmrMaintainer extends ControllerMaintainer {
LOG.info(Text.format("Retiring %s due to %s", node.hostname().value(), changeRequest.getChangeRequestSource().getId()));
// TODO: Remove try/catch once retirement is stabilized
try {
- setWantToRetire(changeRequest.getZoneId(), node, true);
+ setWantToRetire(zoneId, node, true);
} catch (Exception e) {
LOG.warning("Failed to retire host " + node.hostname() + ": " + Exceptions.toMessageString(e));
// Check if retirement actually failed
- if (!nodeRepository.getNode(changeRequest.getZoneId(), node.hostname().value()).wantToRetire()) {
+ if (!nodeRepository.getNode(zoneId, node.hostname().value()).wantToRetire()) {
return hostAction;
}
}
@@ -291,20 +304,20 @@ public class VcmrMaintainer extends ControllerMaintainer {
changeRequestClient.approveChangeRequest(changeRequest);
}
- private void removeReport(VespaChangeRequest changeRequest, Node node) {
+ private void removeReport(ZoneId zoneId, VespaChangeRequest changeRequest, Node node) {
var report = VcmrReport.fromReports(node.reports());
if (report.removeVcmr(changeRequest.getChangeRequestSource().getId())) {
- updateReport(changeRequest.getZoneId(), node, report);
+ updateReport(zoneId, node, report);
}
}
- private void addReport(VespaChangeRequest changeRequest, Node node) {
+ private void addReport(ZoneId zoneId, VespaChangeRequest changeRequest, Node node) {
var report = VcmrReport.fromReports(node.reports());
var source = changeRequest.getChangeRequestSource();
if (report.addVcmr(source.getId(), source.getPlannedStartTime(), source.getPlannedEndTime())) {
- updateReport(changeRequest.getZoneId(), node, report);
+ updateReport(zoneId, node, report);
}
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java
index 581d10d4f88..5d40e7d5580 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java
@@ -38,8 +38,10 @@ public class VcmrMaintainerTest {
private VcmrMaintainer maintainer;
private NodeRepositoryMock nodeRepo;
private final ZoneId zoneId = ZoneId.from("prod.us-east-3");
+ private final ZoneId zone2 = ZoneId.from("prod.us-west-1");
private final HostName host1 = HostName.from("host1");
private final HostName host2 = HostName.from("host2");
+ private final HostName host3 = HostName.from("host3");
private final String changeRequestId = "id123";
@Before
@@ -55,9 +57,7 @@ public class VcmrMaintainerTest {
vcmrReport.addVcmr("id123", ZonedDateTime.now(), ZonedDateTime.now());
var parkedNode = createNode(host1, NodeType.host, Node.State.parked, true);
var failedNode = createNode(host2, NodeType.host, Node.State.failed, false);
- Map<String, String> reports = vcmrReport.toNodeReports().entrySet().stream()
- .collect(Collectors.toMap(Map.Entry::getKey,
- kv -> kv.getValue().toString()));
+ var reports = vcmrReport.toNodeReports();
parkedNode = Node.builder(parkedNode)
.reports(reports)
.build();
@@ -89,6 +89,7 @@ public class VcmrMaintainerTest {
maintainer.maintain();
var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get();
+ assertEquals(2, writtenChangeRequest.getHostActionPlan().size());
var configAction = writtenChangeRequest.getHostActionPlan().get(0);
var tenantHostAction = writtenChangeRequest.getHostActionPlan().get(1);
assertEquals(State.REQUIRES_OPERATOR_ACTION, configAction.getState());
@@ -201,6 +202,24 @@ public class VcmrMaintainerTest {
assertFalse(retiringNode.wantToRetire());
}
+ @Test
+ public void handle_multizone_vcmr() {
+ var node1 = createNode(host1, NodeType.config, Node.State.active, false);
+ var node2 = createNode(host2, NodeType.host, Node.State.active, false);
+ var node3 = createNode(host3, NodeType.host, Node.State.active, false);
+ nodeRepo.putNodes(zoneId, List.of(node1, node2));
+ nodeRepo.putNodes(zone2, List.of(node3));
+ nodeRepo.hasSpareCapacity(true);
+
+ tester.curator().writeChangeRequest(futureChangeRequest());
+ maintainer.maintain();
+
+ var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get();
+ var actionPlan = writtenChangeRequest.getHostActionPlan();
+ assertEquals(State.REQUIRES_OPERATOR_ACTION, actionPlan.get(0).getState());
+ assertEquals(State.PENDING_RETIREMENT, actionPlan.get(1).getState());
+ assertEquals(State.PENDING_RETIREMENT, actionPlan.get(2).getState());
+ }
private VespaChangeRequest canceledChangeRequest() {
return newChangeRequest(ChangeRequestSource.Status.CANCELED, State.RETIRED, State.RETIRING, ZonedDateTime.now());
@@ -233,7 +252,7 @@ public class VcmrMaintainerTest {
changeRequestId,
source,
List.of("switch1"),
- List.of("host1", "host2"),
+ List.of("host1", "host2", "host3"),
ChangeRequest.Approval.REQUESTED,
ChangeRequest.Impact.VERY_HIGH,
VespaChangeRequest.Status.IN_PROGRESS,