diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2022-03-22 09:41:33 +0100 |
---|---|---|
committer | Ola Aunrønning <olaa@verizonmedia.com> | 2022-03-22 09:41:33 +0100 |
commit | 8cd41ef8ba2c36a01f1a12f9772435e06db92223 (patch) | |
tree | 026109d033035a3f47a099f716c2e573da0c3eb0 /controller-server | |
parent | 4a6506e8e721d75d91e1398a876653df1e749d2b (diff) |
Support multizone CMR
Diffstat (limited to 'controller-server')
2 files changed, 57 insertions, 25 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java index 9c3a5aa2831..b764d16483e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainer.java @@ -31,6 +31,7 @@ import java.util.Set; import java.util.function.Predicate; import java.util.logging.Logger; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * @@ -80,7 +81,8 @@ public class VcmrMaintainer extends ControllerMaintainer { var updatedVcmr = vcmr.withActionPlan(nextActions) .withStatus(status); curator.writeChangeRequest(updatedVcmr); - approveChangeRequest(updatedVcmr); + if (nodes.keySet().size() == 1) + approveChangeRequest(updatedVcmr); }); } }); @@ -119,19 +121,30 @@ public class VcmrMaintainer extends ControllerMaintainer { return Status.NOOP; } - private List<HostAction> getNextActions(List<Node> nodes, VespaChangeRequest changeRequest) { - var spareCapacity = hasSpareCapacity(changeRequest.getZoneId(), nodes); - return nodes.stream() - .map(node -> nextAction(node, changeRequest, spareCapacity)) - .collect(Collectors.toList()); + private List<HostAction> getNextActions(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) { + return nodesByZone.entrySet() + .stream() + .flatMap(entry -> { + var zone = entry.getKey(); + var nodes = entry.getValue(); + if (nodes.isEmpty()) { + return Stream.empty(); + } + var spareCapacity = hasSpareCapacity(zone, nodes); + return nodes.stream().map(node -> nextAction(zone, node, changeRequest, spareCapacity)); + }).collect(Collectors.toList()); + } // Get the superset of impacted hosts by looking at impacted switches - private List<Node> impactedNodes(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) { - return nodesByZone.get(changeRequest.getZoneId()) + private Map<ZoneId, List<Node>> impactedNodes(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) { + return nodesByZone.entrySet() .stream() - .filter(isImpacted(changeRequest)) - .collect(Collectors.toList()); + .filter(entry -> entry.getValue().stream().anyMatch(isImpacted(changeRequest))) // Skip zones without impacted nodes + .collect(Collectors.toMap( + Map.Entry::getKey, + entry -> entry.getValue().stream().filter(isImpacted(changeRequest)).collect(Collectors.toList()) + )); } private Optional<HostAction> getPreviousAction(Node node, VespaChangeRequest changeRequest) { @@ -141,25 +154,25 @@ public class VcmrMaintainer extends ControllerMaintainer { .findFirst(); } - private HostAction nextAction(Node node, VespaChangeRequest changeRequest, boolean spareCapacity) { + private HostAction nextAction(ZoneId zoneId, Node node, VespaChangeRequest changeRequest, boolean spareCapacity) { var hostAction = getPreviousAction(node, changeRequest) .orElse(new HostAction(node.hostname().value(), State.NONE, Instant.now())); if (changeRequest.getChangeRequestSource().isClosed()) { LOG.fine(() -> changeRequest.getChangeRequestSource().getId() + " is closed, recycling " + node.hostname()); - recycleNode(changeRequest.getZoneId(), node, hostAction); - removeReport(changeRequest, node); + recycleNode(zoneId, node, hostAction); + removeReport(zoneId, changeRequest, node); return hostAction.withState(State.COMPLETE); } if (isLowImpact(changeRequest)) return hostAction; - addReport(changeRequest, node); + addReport(zoneId, changeRequest, node); if (isPostponed(changeRequest, hostAction)) { LOG.fine(() -> changeRequest.getChangeRequestSource().getId() + " is postponed, recycling " + node.hostname()); - recycleNode(changeRequest.getZoneId(), node, hostAction); + recycleNode(zoneId, node, hostAction); return hostAction.withState(State.PENDING_RETIREMENT); } @@ -172,11 +185,11 @@ public class VcmrMaintainer extends ControllerMaintainer { LOG.info(Text.format("Retiring %s due to %s", node.hostname().value(), changeRequest.getChangeRequestSource().getId())); // TODO: Remove try/catch once retirement is stabilized try { - setWantToRetire(changeRequest.getZoneId(), node, true); + setWantToRetire(zoneId, node, true); } catch (Exception e) { LOG.warning("Failed to retire host " + node.hostname() + ": " + Exceptions.toMessageString(e)); // Check if retirement actually failed - if (!nodeRepository.getNode(changeRequest.getZoneId(), node.hostname().value()).wantToRetire()) { + if (!nodeRepository.getNode(zoneId, node.hostname().value()).wantToRetire()) { return hostAction; } } @@ -291,20 +304,20 @@ public class VcmrMaintainer extends ControllerMaintainer { changeRequestClient.approveChangeRequest(changeRequest); } - private void removeReport(VespaChangeRequest changeRequest, Node node) { + private void removeReport(ZoneId zoneId, VespaChangeRequest changeRequest, Node node) { var report = VcmrReport.fromReports(node.reports()); if (report.removeVcmr(changeRequest.getChangeRequestSource().getId())) { - updateReport(changeRequest.getZoneId(), node, report); + updateReport(zoneId, node, report); } } - private void addReport(VespaChangeRequest changeRequest, Node node) { + private void addReport(ZoneId zoneId, VespaChangeRequest changeRequest, Node node) { var report = VcmrReport.fromReports(node.reports()); var source = changeRequest.getChangeRequestSource(); if (report.addVcmr(source.getId(), source.getPlannedStartTime(), source.getPlannedEndTime())) { - updateReport(changeRequest.getZoneId(), node, report); + updateReport(zoneId, node, report); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java index 581d10d4f88..5d40e7d5580 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VcmrMaintainerTest.java @@ -38,8 +38,10 @@ public class VcmrMaintainerTest { private VcmrMaintainer maintainer; private NodeRepositoryMock nodeRepo; private final ZoneId zoneId = ZoneId.from("prod.us-east-3"); + private final ZoneId zone2 = ZoneId.from("prod.us-west-1"); private final HostName host1 = HostName.from("host1"); private final HostName host2 = HostName.from("host2"); + private final HostName host3 = HostName.from("host3"); private final String changeRequestId = "id123"; @Before @@ -55,9 +57,7 @@ public class VcmrMaintainerTest { vcmrReport.addVcmr("id123", ZonedDateTime.now(), ZonedDateTime.now()); var parkedNode = createNode(host1, NodeType.host, Node.State.parked, true); var failedNode = createNode(host2, NodeType.host, Node.State.failed, false); - Map<String, String> reports = vcmrReport.toNodeReports().entrySet().stream() - .collect(Collectors.toMap(Map.Entry::getKey, - kv -> kv.getValue().toString())); + var reports = vcmrReport.toNodeReports(); parkedNode = Node.builder(parkedNode) .reports(reports) .build(); @@ -89,6 +89,7 @@ public class VcmrMaintainerTest { maintainer.maintain(); var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get(); + assertEquals(2, writtenChangeRequest.getHostActionPlan().size()); var configAction = writtenChangeRequest.getHostActionPlan().get(0); var tenantHostAction = writtenChangeRequest.getHostActionPlan().get(1); assertEquals(State.REQUIRES_OPERATOR_ACTION, configAction.getState()); @@ -201,6 +202,24 @@ public class VcmrMaintainerTest { assertFalse(retiringNode.wantToRetire()); } + @Test + public void handle_multizone_vcmr() { + var node1 = createNode(host1, NodeType.config, Node.State.active, false); + var node2 = createNode(host2, NodeType.host, Node.State.active, false); + var node3 = createNode(host3, NodeType.host, Node.State.active, false); + nodeRepo.putNodes(zoneId, List.of(node1, node2)); + nodeRepo.putNodes(zone2, List.of(node3)); + nodeRepo.hasSpareCapacity(true); + + tester.curator().writeChangeRequest(futureChangeRequest()); + maintainer.maintain(); + + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get(); + var actionPlan = writtenChangeRequest.getHostActionPlan(); + assertEquals(State.REQUIRES_OPERATOR_ACTION, actionPlan.get(0).getState()); + assertEquals(State.PENDING_RETIREMENT, actionPlan.get(1).getState()); + assertEquals(State.PENDING_RETIREMENT, actionPlan.get(2).getState()); + } private VespaChangeRequest canceledChangeRequest() { return newChangeRequest(ChangeRequestSource.Status.CANCELED, State.RETIRED, State.RETIRING, ZonedDateTime.now()); @@ -233,7 +252,7 @@ public class VcmrMaintainerTest { changeRequestId, source, List.of("switch1"), - List.of("host1", "host2"), + List.of("host1", "host2", "host3"), ChangeRequest.Approval.REQUESTED, ChangeRequest.Impact.VERY_HIGH, VespaChangeRequest.Status.IN_PROGRESS, |