diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2021-04-16 11:26:01 +0200 |
---|---|---|
committer | Ola Aunrønning <olaa@verizonmedia.com> | 2021-04-16 11:38:53 +0200 |
commit | 46a45826fea31ca587f65fb00c5681c9e2455c62 (patch) | |
tree | 48c7718febb7c1d5a1c51a10467b5a31aea881a9 /controller-server | |
parent | dc8cf2848a7bd3e535b6296128cb28064e66bf6f (diff) |
Default action is no-op. Add tests
Diffstat (limited to 'controller-server')
5 files changed, 236 insertions, 23 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java index 4a4bee03ee5..9513348303f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java @@ -69,6 +69,7 @@ public class ControllerMaintenance extends AbstractComponent { maintainers.add(new ArchiveAccessMaintainer(controller, metric, intervals.archiveAccessMaintainer)); maintainers.add(new TenantRoleMaintainer(controller, intervals.tenantRoleMaintainer)); maintainers.add(new ChangeRequestMaintainer(controller, intervals.changeRequestMaintainer)); + maintainers.add(new VCMRMaintainer(controller, intervals.vcmrMaintainer)); } public Upgrader upgrader() { return upgrader; } @@ -123,6 +124,7 @@ public class ControllerMaintenance extends AbstractComponent { private final Duration archiveAccessMaintainer; private final Duration tenantRoleMaintainer; private final Duration changeRequestMaintainer; + private final Duration vcmrMaintainer; public Intervals(SystemName system) { this.system = Objects.requireNonNull(system); @@ -154,6 +156,7 @@ public class ControllerMaintenance extends AbstractComponent { this.archiveAccessMaintainer = duration(10, MINUTES); this.tenantRoleMaintainer = duration(5, MINUTES); this.changeRequestMaintainer = duration(12, HOURS); + this.vcmrMaintainer = duration(12, HOURS); } private Duration duration(long amount, TemporalUnit unit) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java index 994f0ec603f..e5d99e38be5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java @@ -10,7 +10,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; -import com.yahoo.vespa.hosted.controller.api.integration.vcmr.ChangeRequest; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.ChangeRequest.Impact; import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction; import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction.State; import com.yahoo.vespa.hosted.controller.api.integration.vcmr.VespaChangeRequest; @@ -32,7 +32,7 @@ import java.util.stream.Collectors; * @author olaa * * Maintains status and execution of VCMRs - * For now only executes + * For now only retires all affected tenant hosts if zone capacity allows it */ public class VCMRMaintainer extends ControllerMaintainer { @@ -46,7 +46,6 @@ public class VCMRMaintainer extends ControllerMaintainer { this.nodeRepository = controller.serviceRegistry().configServer().nodeRepository(); } - @Override protected boolean maintain() { var changeRequests = curator.readChangeRequests() @@ -120,20 +119,18 @@ public class VCMRMaintainer extends ControllerMaintainer { private HostAction nextAction(Node node, VespaChangeRequest changeRequest, boolean spareCapacity) { var hostAction = getPreviousAction(node, changeRequest) - .orElse(new HostAction(node.hostname().value(), State.PENDING_RETIREMENT, Instant.now())); - - if (node.type() != NodeType.host || !spareCapacity) { - return hostAction.withState(State.REQUIRES_OPERATOR_ACTION); - } + .orElse(new HostAction(node.hostname().value(), State.NONE, Instant.now())); if (changeRequest.getChangeRequestSource().isClosed()) { - recycleNode(changeRequest.getZoneId(), node); + recycleNode(changeRequest.getZoneId(), node, hostAction); return hostAction.withState(State.COMPLETE); } + if (node.type() != NodeType.host || !spareCapacity) { + return hostAction.withState(State.REQUIRES_OPERATOR_ACTION); + } + if (shouldRetire(changeRequest, hostAction)) { - if (node.state() != Node.State.active) - return hostAction.withState(State.RETIRED); if (!node.wantToRetire()) { logger.info(String.format("Retiring %s due to %s", node.hostname().value(), changeRequest.getChangeRequestSource().getId())); setWantToRetire(changeRequest.getZoneId(), node, true); @@ -141,15 +138,25 @@ public class VCMRMaintainer extends ControllerMaintainer { return hostAction.withState(State.RETIRING); } + if (hasRetired(node, hostAction)) { + return hostAction.withState(State.RETIRED); + } + + if (pendingRetirement(node)) { + return hostAction.withState(State.PENDING_RETIREMENT); + } + return hostAction; } - private void recycleNode(ZoneId zoneId, Node node) { - if (node.state() == Node.State.parked) { + // Dirty host iff the parked host was retired by this maintainer + private void recycleNode(ZoneId zoneId, Node node, HostAction hostAction) { + if (hostAction.getState() == State.RETIRED && + node.state() == Node.State.parked) { logger.info("Setting " + node.hostname() + " to dirty"); nodeRepository.setState(zoneId, NodeState.dirty, node.hostname().value()); } - if (node.wantToRetire()) + if (hostAction.getState() == State.RETIRING && node.wantToRetire()) setWantToRetire(zoneId, node, false); } @@ -160,6 +167,18 @@ public class VCMRMaintainer extends ControllerMaintainer { .isBefore(ZonedDateTime.now()); } + private boolean hasRetired(Node node, HostAction hostAction) { + return hostAction.getState() == State.RETIRING && + node.state() == Node.State.parked; + } + + /** + * TODO: For now, we choose to retire any active host + */ + private boolean pendingRetirement(Node node) { + return node.state() == Node.State.active; + } + private Map<ZoneId, List<Node>> nodesByZone() { return controller().zoneRegistry() .zones() @@ -181,7 +200,8 @@ public class VCMRMaintainer extends ControllerMaintainer { } private Predicate<VespaChangeRequest> shouldUpdate() { return changeRequest -> changeRequest.getStatus() != Status.COMPLETED && - List.of(ChangeRequest.Impact.HIGH, ChangeRequest.Impact.VERY_HIGH).contains(changeRequest.getImpact()); + List.of(Impact.HIGH, Impact.VERY_HIGH) + .contains(changeRequest.getImpact()); } private boolean hasSpareCapacity(ZoneId zoneId, List<Node> nodes) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/NodeRepositoryMock.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/NodeRepositoryMock.java index bb1ff5c0189..fe241976d13 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/NodeRepositoryMock.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/integration/NodeRepositoryMock.java @@ -55,6 +55,7 @@ public class NodeRepositoryMock implements NodeRepository { private final Map<ZoneId, List<NodeRepositoryNode>> nodeRepoNodes = new HashMap<>(); private boolean allowPatching = false; + private boolean hasSpareCapacity = false; /** Add or update given nodes in zone */ public void putNodes(ZoneId zone, List<Node> nodes) { @@ -161,8 +162,14 @@ public class NodeRepositoryMock implements NodeRepository { } @Override - public void setState(ZoneId zone, NodeState nodeState, String hostname) { - throw new UnsupportedOperationException(); + public void setState(ZoneId zone, NodeState nodeState, String hostName) { + var existing = list(zone, List.of(HostName.from(hostName))); + if (existing.size() != 1) throw new IllegalArgumentException("Node " + hostName + " not found in " + zone); + + var node = new Node.Builder(existing.get(0)) + .state(Node.State.valueOf(nodeState.name())) + .build(); + putNodes(zone, node); } @Override @@ -277,11 +284,16 @@ public class NodeRepositoryMock implements NodeRepository { List<Node> existing = list(zoneId, List.of(HostName.from(hostName))); if (existing.size() != 1) throw new IllegalArgumentException("Node " + hostName + " not found in " + zoneId); - // Note: Only supports switchHostname and modelName - Node newNode = new Node.Builder(existing.get(0)).switchHostname(node.getSwitchHostname()) - .modelName(node.getModelName()) - .build(); - putNodes(zoneId, newNode); + // Note: Only supports switchHostname, modelName and wantToRetire + Node.Builder newNode = new Node.Builder(existing.get(0)); + if (node.getSwitchHostname() != null) + newNode.switchHostname(node.getSwitchHostname()); + if (node.getModelName() != null) + newNode.modelName(node.getModelName()); + if (node.getWantToRetire() != null) + newNode.wantToRetire(node.getWantToRetire()); + + putNodes(zoneId, newNode.build()); } @Override @@ -291,7 +303,7 @@ public class NodeRepositoryMock implements NodeRepository { @Override public boolean isReplaceable(ZoneId zoneId, List<HostName> hostNames) { - return false; + return hasSpareCapacity; } public Optional<Duration> osUpgradeBudget(ZoneId zone, NodeType type, Version version) { @@ -341,4 +353,8 @@ public class NodeRepositoryMock implements NodeRepository { return this; } + public void hasSpareCapacity(boolean hasSpareCapacity) { + this.hasSpareCapacity = hasSpareCapacity; + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainerTest.java new file mode 100644 index 00000000000..f5cea0eb391 --- /dev/null +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainerTest.java @@ -0,0 +1,171 @@ +package com.yahoo.vespa.hosted.controller.maintenance; + +import com.yahoo.config.provision.HostName; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.zone.ZoneId; +import com.yahoo.vespa.hosted.controller.ControllerTester; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.ChangeRequest; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.ChangeRequestSource; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction.State; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.VespaChangeRequest; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.VespaChangeRequest.Status; +import com.yahoo.vespa.hosted.controller.integration.NodeRepositoryMock; +import org.junit.Test; + +import java.time.Duration; +import java.time.Instant; +import java.time.ZonedDateTime; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * @author olaa + */ +public class VCMRMaintainerTest { + + private final ControllerTester tester = new ControllerTester(); + private final VCMRMaintainer maintainer = new VCMRMaintainer(tester.controller(), Duration.ofMinutes(1)); + private final NodeRepositoryMock nodeRepo = tester.serviceRegistry().configServer().nodeRepository(); + private final ZoneId zoneId = ZoneId.from("prod.us-east-3"); + private final HostName host1 = HostName.from("host1"); + private final HostName host2 = HostName.from("host2"); + private final String changeRequestId = "id123"; + + @Test + public void recycle_hosts_after_completion() { + var parkedNode = createNode(host1, NodeType.host, Node.State.parked, true); + var failedNode = createNode(host2, NodeType.host, Node.State.failed, false); + nodeRepo.putNodes(zoneId, List.of(parkedNode, failedNode)); + + tester.curator().writeChangeRequest(canceledChangeRequest()); + maintainer.maintain(); + + // Only the parked node is recycled + var nodeList = nodeRepo.list(zoneId, List.of(host1, host2)); + assertEquals(Node.State.dirty, nodeList.get(0).state()); + assertEquals(Node.State.failed, nodeList.get(1).state()); + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get(); + assertEquals(Status.COMPLETED, writtenChangeRequest.getStatus()); + } + + @Test + public void infrastructure_hosts_require_maunal_intervention() { + var configNode = createNode(host1, NodeType.config, Node.State.active, false); + var activeNode = createNode(host2, NodeType.host, Node.State.active, false); + nodeRepo.putNodes(zoneId, List.of(configNode, activeNode)); + nodeRepo.hasSpareCapacity(true); + + tester.curator().writeChangeRequest(openChangeRequest()); + maintainer.maintain(); + + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).get(); + var configAction = writtenChangeRequest.getHostActionPlan().get(0); + var tenantHostAction = writtenChangeRequest.getHostActionPlan().get(1); + assertEquals(State.REQUIRES_OPERATOR_ACTION, configAction.getState()); + assertEquals(State.PENDING_RETIREMENT, tenantHostAction.getState()); + assertEquals(Status.REQUIRES_OPERATOR_ACTION, writtenChangeRequest.getStatus()); + } + + @Test + public void retires_hosts_when_near_vcmr() { + var activeNode = createNode(host1, NodeType.host, Node.State.active, false); + var failedNode = createNode(host2, NodeType.host, Node.State.failed, false); + nodeRepo.putNodes(zoneId, List.of(activeNode, failedNode)); + nodeRepo.allowPatching(true).hasSpareCapacity(true); + + tester.curator().writeChangeRequest(startingChangeRequest()); + maintainer.maintain(); + + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).orElseThrow(); + var parkedNodeAction = writtenChangeRequest.getHostActionPlan().get(0); + var failedNodeAction = writtenChangeRequest.getHostActionPlan().get(1); + assertEquals(State.RETIRING, parkedNodeAction.getState()); + assertEquals(State.NONE, failedNodeAction.getState()); + assertEquals(Status.IN_PROGRESS, writtenChangeRequest.getStatus()); + + activeNode = nodeRepo.list(zoneId, List.of(activeNode.hostname())).get(0); + assertTrue(activeNode.wantToRetire()); + + } + + @Test + public void no_spare_capacity_requires_operator_action() { + var activeNode = createNode(host1, NodeType.host, Node.State.active, false); + var failedNode = createNode(host2, NodeType.host, Node.State.failed, false); + nodeRepo.putNodes(zoneId, List.of(activeNode, failedNode)); + nodeRepo.hasSpareCapacity(false); + + tester.curator().writeChangeRequest(startingChangeRequest()); + maintainer.maintain(); + + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).orElseThrow(); + var parkedNodeAction = writtenChangeRequest.getHostActionPlan().get(0); + var failedNodeAction = writtenChangeRequest.getHostActionPlan().get(1); + assertEquals(State.REQUIRES_OPERATOR_ACTION, parkedNodeAction.getState()); + assertEquals(State.REQUIRES_OPERATOR_ACTION, failedNodeAction.getState()); + assertEquals(Status.REQUIRES_OPERATOR_ACTION, writtenChangeRequest.getStatus()); + } + + @Test + public void updates_status_when_retiring_host_is_parked() { + var parkedNode = createNode(host1, NodeType.host, Node.State.parked, true); + nodeRepo.putNodes(zoneId, parkedNode); + nodeRepo.hasSpareCapacity(true); + + tester.curator().writeChangeRequest(inProgressChangeRequest()); + maintainer.maintain(); + + var writtenChangeRequest = tester.curator().readChangeRequest(changeRequestId).orElseThrow(); + var parkedNodeAction = writtenChangeRequest.getHostActionPlan().get(0); + assertEquals(State.RETIRED, parkedNodeAction.getState()); + assertEquals(Status.IN_PROGRESS, writtenChangeRequest.getStatus()); + } + + private VespaChangeRequest canceledChangeRequest() { + return newChangeRequest(ChangeRequestSource.Status.CANCELED, State.RETIRED, State.RETIRING, ZonedDateTime.now()); + } + + private VespaChangeRequest openChangeRequest() { + return newChangeRequest(ChangeRequestSource.Status.WAITING_FOR_APPROVAL, State.NONE, State.NONE, ZonedDateTime.now().plus(Duration.ofDays(5L))); + } + + private VespaChangeRequest startingChangeRequest() { + return newChangeRequest(ChangeRequestSource.Status.STARTED, State.PENDING_RETIREMENT, State.NONE, ZonedDateTime.now()); + } + + private VespaChangeRequest inProgressChangeRequest() { + return newChangeRequest(ChangeRequestSource.Status.STARTED, State.RETIRING, State.RETIRING, ZonedDateTime.now()); + } + + + private VespaChangeRequest newChangeRequest(ChangeRequestSource.Status sourceStatus, State state1, State state2, ZonedDateTime startTime) { + var source = new ChangeRequestSource("aws", changeRequestId, "url", sourceStatus , startTime, ZonedDateTime.now()); + var actionPlan = List.of( + new HostAction(host1.value(), state1, Instant.now()), + new HostAction(host2.value(), state2, Instant.now()) + ); + return new VespaChangeRequest( + changeRequestId, + source, + List.of("switch1"), + List.of("host1", "host2"), + ChangeRequest.Approval.APPROVED, + ChangeRequest.Impact.VERY_HIGH, + VespaChangeRequest.Status.IN_PROGRESS, + actionPlan, + ZoneId.from("prod.us-east-3") + ); + } + + private Node createNode(HostName hostname, NodeType nodeType, Node.State state, boolean wantToRetire) { + return new Node.Builder() + .hostname(hostname) + .type(nodeType) + .state(state) + .wantToRetire(wantToRetire) + .build(); + } +}
\ No newline at end of file diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/controller/responses/maintenance.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/controller/responses/maintenance.json index 44c464cc5e7..3cf79977fb8 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/controller/responses/maintenance.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/controller/responses/maintenance.json @@ -91,6 +91,9 @@ "name": "Upgrader" }, { + "name": "VCMRMaintainer" + }, + { "name": "VersionStatusUpdater" } ], |