diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2021-04-14 12:46:57 +0200 |
---|---|---|
committer | Ola Aunrønning <olaa@verizonmedia.com> | 2021-04-16 11:34:44 +0200 |
commit | dc8cf2848a7bd3e535b6296128cb28064e66bf6f (patch) | |
tree | 2a6870065f7a9ec21934411eab7d6b7e02b4ce0c /controller-server | |
parent | 3aef0ca8a0ffa0423b0e2299e5fa139770d21793 (diff) |
Add VCMRMaintainer
Diffstat (limited to 'controller-server')
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java | 202 |
1 files changed, 202 insertions, 0 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java new file mode 100644 index 00000000000..994f0ec603f --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/VCMRMaintainer.java @@ -0,0 +1,202 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.maintenance; + +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.SystemName; +import com.yahoo.config.provision.zone.ZoneId; +import com.yahoo.vespa.hosted.controller.Controller; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; +import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.ChangeRequest; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.HostAction.State; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.VespaChangeRequest; +import com.yahoo.vespa.hosted.controller.api.integration.vcmr.VespaChangeRequest.Status; +import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; + +import java.time.Duration; +import java.time.Instant; +import java.time.ZonedDateTime; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Predicate; +import java.util.logging.Logger; +import java.util.stream.Collectors; + +/** + * @author olaa + * + * Maintains status and execution of VCMRs + * For now only executes + */ +public class VCMRMaintainer extends ControllerMaintainer { + + private final Logger logger = Logger.getLogger(VCMRMaintainer.class.getName()); + private final CuratorDb curator; + private final NodeRepository nodeRepository; + + public VCMRMaintainer(Controller controller, Duration interval) { + super(controller, interval, null, EnumSet.of(SystemName.main)); + this.curator = controller.curator(); + this.nodeRepository = controller.serviceRegistry().configServer().nodeRepository(); + } + + + @Override + protected boolean maintain() { + var changeRequests = curator.readChangeRequests() + .stream() + .filter(shouldUpdate()) + .collect(Collectors.toList()); + + var nodesByZone = nodesByZone(); + + changeRequests.forEach(changeRequest -> { + var nodes = impactedNodes(nodesByZone, changeRequest); + var nextActions = getNextActions(nodes, changeRequest); + var status = getStatus(nextActions, changeRequest); + + try (var lock = curator.lockChangeRequests()) { + curator.writeChangeRequest( + changeRequest + .withActionPlan(nextActions) + .withStatus(status)); + } + }); + + return true; + } + + /** + * Status is based on: + * 1. Whether the source has reportedly closed the request + * 2. Whether any host requires operator action + * 3. Whether any host has started/finished retiring + */ + private Status getStatus(List<HostAction> nextActions, VespaChangeRequest changeRequest) { + if (changeRequest.getChangeRequestSource().isClosed()) { + return Status.COMPLETED; + } + + var byActionState = nextActions.stream().collect(Collectors.groupingBy(HostAction::getState, Collectors.counting())); + + if (byActionState.getOrDefault(State.REQUIRES_OPERATOR_ACTION, 0L) > 0) { + return Status.REQUIRES_OPERATOR_ACTION; + } + + if (byActionState.getOrDefault(State.RETIRING, 0L) + byActionState.getOrDefault(State.RETIRED, 0L) > 0) { + return Status.IN_PROGRESS; + } + + return Status.PENDING_ACTION; + } + + private List<HostAction> getNextActions(List<Node> nodes, VespaChangeRequest changeRequest) { + var spareCapacity = hasSpareCapacity(changeRequest.getZoneId(), nodes); + return nodes.stream() + .map(node -> nextAction(node, changeRequest, spareCapacity)) + .collect(Collectors.toList()); + } + + // Get the superset of impacted hosts by looking at impacted switches + private List<Node> impactedNodes(Map<ZoneId, List<Node>> nodesByZone, VespaChangeRequest changeRequest) { + return nodesByZone.get(changeRequest.getZoneId()) + .stream() + .filter(isImpacted(changeRequest)) + .collect(Collectors.toList()); + } + + private Optional<HostAction> getPreviousAction(Node node, VespaChangeRequest changeRequest) { + return changeRequest.getHostActionPlan() + .stream() + .filter(hostAction -> hostAction.getHostname().equals(node.hostname().value())) + .findFirst(); + } + + private HostAction nextAction(Node node, VespaChangeRequest changeRequest, boolean spareCapacity) { + var hostAction = getPreviousAction(node, changeRequest) + .orElse(new HostAction(node.hostname().value(), State.PENDING_RETIREMENT, Instant.now())); + + if (node.type() != NodeType.host || !spareCapacity) { + return hostAction.withState(State.REQUIRES_OPERATOR_ACTION); + } + + if (changeRequest.getChangeRequestSource().isClosed()) { + recycleNode(changeRequest.getZoneId(), node); + return hostAction.withState(State.COMPLETE); + } + + if (shouldRetire(changeRequest, hostAction)) { + if (node.state() != Node.State.active) + return hostAction.withState(State.RETIRED); + if (!node.wantToRetire()) { + logger.info(String.format("Retiring %s due to %s", node.hostname().value(), changeRequest.getChangeRequestSource().getId())); + setWantToRetire(changeRequest.getZoneId(), node, true); + } + return hostAction.withState(State.RETIRING); + } + + return hostAction; + } + + private void recycleNode(ZoneId zoneId, Node node) { + if (node.state() == Node.State.parked) { + logger.info("Setting " + node.hostname() + " to dirty"); + nodeRepository.setState(zoneId, NodeState.dirty, node.hostname().value()); + } + if (node.wantToRetire()) + setWantToRetire(zoneId, node, false); + } + + private boolean shouldRetire(VespaChangeRequest changeRequest, HostAction action) { + return action.getState() == State.PENDING_RETIREMENT && + changeRequest.getChangeRequestSource().getPlannedStartTime() + .minus(Duration.ofDays(2)) + .isBefore(ZonedDateTime.now()); + } + + private Map<ZoneId, List<Node>> nodesByZone() { + return controller().zoneRegistry() + .zones() + .reachable() + .in(Environment.prod) + .ids() + .stream() + .collect(Collectors.toMap( + zone -> zone, + zone -> nodeRepository.list(zone, false) + )); + } + + private Predicate<Node> isImpacted(VespaChangeRequest changeRequest) { + return node -> changeRequest.getImpactedHosts().contains(node.hostname().value()) || + node.switchHostname() + .map(switchHostname -> changeRequest.getImpactedSwitches().contains(switchHostname)) + .orElse(false); + } + private Predicate<VespaChangeRequest> shouldUpdate() { + return changeRequest -> changeRequest.getStatus() != Status.COMPLETED && + List.of(ChangeRequest.Impact.HIGH, ChangeRequest.Impact.VERY_HIGH).contains(changeRequest.getImpact()); + } + + private boolean hasSpareCapacity(ZoneId zoneId, List<Node> nodes) { + var tenantHosts = nodes.stream() + .filter(node -> node.type() == NodeType.host) + .map(Node::hostname) + .collect(Collectors.toList()); + + return tenantHosts.isEmpty() || + nodeRepository.isReplaceable(zoneId, tenantHosts); + } + + private void setWantToRetire(ZoneId zoneId, Node node, boolean wantToRetire) { + var newNode = new NodeRepositoryNode(); + newNode.setWantToRetire(wantToRetire); + nodeRepository.patchNode(zoneId, node.hostname().value(), newNode); + } +} |