Diffstat (limited to 'node-repository/src/main/java')
48 files changed, 899 insertions, 873 deletions
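This commit moves node queries, state transitions and locking out of NodeRepository and into a new Nodes class reached through NodeRepository.nodes(); most hunks below are mechanical call-site updates. A minimal sketch of that caller-side migration, assuming only methods visible in the hunks below (the helper class, method and variable names here are hypothetical and not part of the commit):

import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;

import java.util.List;

// Hypothetical caller, not part of this commit: illustrates the call-site pattern applied throughout the diff.
class NodesAccessSketch {

    static List<Node> failAndInspect(NodeRepository nodeRepository, String hostname) {
        // Before this commit, these operations hung directly off NodeRepository:
        //   nodeRepository.fail(hostname, Agent.operator, "hardware issue");
        //   nodeRepository.getNodes(Node.State.failed);
        //   nodeRepository.list().childrenOf(hostname);

        // After this commit, the same operations are reached through the nodes() child object.
        nodeRepository.nodes().fail(hostname, Agent.operator, "hardware issue");
        NodeList children = nodeRepository.nodes().list().childrenOf(hostname);
        return nodeRepository.nodes().getNodes(Node.State.failed);
    }
}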
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index 2642983dd2a..beec04b3b29 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -2,36 +2,22 @@
 package com.yahoo.vespa.hosted.provision;
 
 import com.google.inject.Inject;
-import com.yahoo.collections.ListMap;
 import com.yahoo.component.AbstractComponent;
-import com.yahoo.component.Version;
 import com.yahoo.concurrent.maintenance.JobControl;
-import com.yahoo.config.provision.ApplicationId;
 import com.yahoo.config.provision.ApplicationTransaction;
 import com.yahoo.config.provision.DockerImage;
-import com.yahoo.config.provision.Flavor;
 import com.yahoo.config.provision.NodeFlavors;
-import com.yahoo.config.provision.NodeType;
 import com.yahoo.config.provision.Zone;
 import com.yahoo.config.provisioning.NodeRepositoryConfig;
-import com.yahoo.transaction.Mutex;
-import com.yahoo.transaction.NestedTransaction;
 import com.yahoo.vespa.curator.Curator;
 import com.yahoo.vespa.flags.FlagSource;
 import com.yahoo.vespa.hosted.provision.Node.State;
 import com.yahoo.vespa.hosted.provision.applications.Applications;
 import com.yahoo.vespa.hosted.provision.lb.LoadBalancers;
 import com.yahoo.vespa.hosted.provision.maintenance.InfrastructureVersions;
-import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer;
-import com.yahoo.vespa.hosted.provision.maintenance.PeriodicApplicationMaintainer;
 import com.yahoo.vespa.hosted.provision.node.Agent;
-import com.yahoo.vespa.hosted.provision.node.Allocation;
-import com.yahoo.vespa.hosted.provision.node.History;
-import com.yahoo.vespa.hosted.provision.node.IP;
 import com.yahoo.vespa.hosted.provision.node.NodeAcl;
-import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter;
-import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter;
-import com.yahoo.vespa.hosted.provision.node.filter.StateFilter;
+import com.yahoo.vespa.hosted.provision.node.Nodes;
 import com.yahoo.vespa.hosted.provision.os.OsVersions;
 import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
 import com.yahoo.vespa.hosted.provision.persistence.DnsNameResolver;
@@ -41,53 +27,21 @@ import com.yahoo.vespa.hosted.provision.provisioning.ContainerImages;
 import com.yahoo.vespa.hosted.provision.provisioning.FirmwareChecks;
 import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
 import com.yahoo.vespa.hosted.provision.provisioning.ProvisionServiceProvider;
-import com.yahoo.vespa.hosted.provision.restapi.NotFoundException;
 
 import java.time.Clock;
 import java.time.Duration;
 import java.time.Instant;
-import java.util.ArrayList;
-import java.util.EnumSet;
 import java.util.List;
-import java.util.Map;
-import java.util.Objects;
 import java.util.Optional;
-import java.util.Set;
-import java.util.function.BiFunction;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
 /**
- * The hosted Vespa production node repository, which stores its state in Zookeeper.
- * The node repository knows about all nodes in a zone, their states and manages all transitions between
- * node states.
- * <p>
- * Node repo locking: Locks must be acquired before making changes to the set of nodes, or to the content
- * of the nodes.
- * Unallocated states use a single lock, while application level locks are used for all allocated states
- * such that applications can mostly change in parallel.
- * If both locks are needed acquire the application lock first, then the unallocated lock.
- * <p>
- * Changes to the set of active nodes must be accompanied by changes to the config model of the application.
- * Such changes are not handled by the node repository but by the classes calling it - see
- * {@link com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner} for such changes initiated
- * by the application package and {@link PeriodicApplicationMaintainer}
- * for changes initiated by the node repository.
- * Refer to {@link com.yahoo.vespa.hosted.provision.maintenance.NodeRepositoryMaintenance} for timing details
- * of the node state transitions.
+ * The top level singleton in the node repo, providing access to all its state as child objects.
  *
  * @author bratseth
  */
-// Node state transitions:
-// 1) (new) | deprovisioned - > provisioned -> (dirty ->) ready -> reserved -> active -> inactive -> dirty -> ready
-// 2) inactive -> reserved | parked
-// 3) reserved -> dirty
-// 4) * -> failed | parked -> (breakfixed) -> dirty | active | deprovisioned
-// 5) deprovisioned -> (forgotten)
-// Nodes have an application assigned when in states reserved, active and inactive.
-// Nodes might have an application assigned in dirty.
 public class NodeRepository extends AbstractComponent {
 
     private static final Logger log = Logger.getLogger(NodeRepository.class.getName());
@@ -95,6 +49,7 @@ public class NodeRepository extends AbstractComponent {
     private final CuratorDatabaseClient db;
     private final Clock clock;
     private final Zone zone;
+    private final Nodes nodes;
     private final NodeFlavors flavors;
     private final HostResourcesCalculator resourcesCalculator;
     private final NameResolver nameResolver;
@@ -156,6 +111,7 @@ public class NodeRepository extends AbstractComponent {
         this.db = new CuratorDatabaseClient(flavors, curator, clock, zone, useCuratorClientCache, nodeCacheSize);
         this.zone = zone;
         this.clock = clock;
+        this.nodes = new Nodes(db, zone, clock);
         this.flavors = flavors;
         this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator();
         this.nameResolver = nameResolver;
@@ -187,7 +143,10 @@ public class NodeRepository extends AbstractComponent {
     /** Returns the curator database client used by this */
     public CuratorDatabaseClient database() { return db; }
 
-    /** @return The name resolver used to resolve hostname and ip addresses */
+    /** Returns the nodes of the node repo. */
+    public Nodes nodes() { return nodes; }
+
+    /** Returns the name resolver used to resolve hostname and ip addresses */
    public NameResolver nameResolver() { return nameResolver; }
 
     /** Returns the OS versions to use for nodes in this */
@@ -226,205 +185,17 @@ public class NodeRepository extends AbstractComponent {
      */
     public List<NodeAcl> getChildAcls(Node host) {
         if ( !
host.type().isHost()) throw new IllegalArgumentException("Only hosts have children"); - NodeList allNodes = list(); - return list().childrenOf(host).asList().stream() + NodeList allNodes = nodes().list(); + return nodes().list().childrenOf(host).asList().stream() .map(childNode -> childNode.acl(allNodes, loadBalancers)) .collect(Collectors.toUnmodifiableList()); } - // ---------------- Query API ---------------------------------------------------------------- - - /** - * Finds and returns the node with the hostname in any of the given states, or empty if not found - * - * @param hostname the full host name of the node - * @param inState the states the node may be in. If no states are given, it will be returned from any state - * @return the node, or empty if it was not found in any of the given states - */ - public Optional<Node> getNode(String hostname, State ... inState) { - return db.readNode(hostname, inState); - } - - /** - * Returns all nodes in any of the given states. - * - * @param inState the states to return nodes from. If no states are given, all nodes of the given type are returned - * @return the node, or empty if it was not found in any of the given states - */ - public List<Node> getNodes(State ... inState) { - return new ArrayList<>(db.readNodes(inState)); - } - /** - * Finds and returns the nodes of the given type in any of the given states. - * - * @param type the node type to return - * @param inState the states to return nodes from. If no states are given, all nodes of the given type are returned - * @return the node, or empty if it was not found in any of the given states - */ - public List<Node> getNodes(NodeType type, State ... inState) { - return db.readNodes(inState).stream().filter(node -> node.type().equals(type)).collect(Collectors.toList()); - } - - /** Returns a filterable list of nodes in this repository in any of the given states */ - public NodeList list(State ... inState) { - return NodeList.copyOf(getNodes(inState)); - } - - public NodeList list(ApplicationId application, State ... inState) { - return NodeList.copyOf(getNodes(application, inState)); - } - - /** Returns a filterable list of all nodes of an application */ - public NodeList list(ApplicationId application) { - return NodeList.copyOf(getNodes(application)); - } - - /** Returns a locked list of all nodes in this repository */ - public LockedNodeList list(Mutex lock) { - return new LockedNodeList(getNodes(), lock); - } - - public List<Node> getNodes(ApplicationId id, State ... inState) { return db.readNodes(id, inState); } - public List<Node> getInactive() { return db.readNodes(State.inactive); } - public List<Node> getFailed() { return db.readNodes(State.failed); } - - /** - * Returns whether the zone managed by this node repository seems to be working. - * If too many nodes are not responding, there is probably some zone-wide issue - * and we should probably refrain from making changes to it. - */ - public boolean isWorking() { - NodeList activeNodes = list(State.active); - if (activeNodes.size() <= 5) return true; // Not enough data to decide - NodeList downNodes = activeNodes.down(); - return ! ( (double)downNodes.size() / (double)activeNodes.size() > 0.2 ); - } - - // ----------------- Node lifecycle ----------------------------------------------------------- - - /** Adds a list of newly created docker container nodes to the node repository as <i>reserved</i> nodes */ - public List<Node> addDockerNodes(LockedNodeList nodes) { - for (Node node : nodes) { - if ( ! 
node.flavor().getType().equals(Flavor.Type.DOCKER_CONTAINER)) - illegal("Cannot add " + node + ": This is not a docker node"); - if (node.allocation().isEmpty()) - illegal("Cannot add " + node + ": Docker containers needs to be allocated"); - Optional<Node> existing = getNode(node.hostname()); - if (existing.isPresent()) - illegal("Cannot add " + node + ": A node with this name already exists (" + - existing.get() + ", " + existing.get().history() + "). Node to be added: " + - node + ", " + node.history()); - } - return db.addNodesInState(nodes.asList(), State.reserved, Agent.system); - } - - /** - * Adds a list of (newly created) nodes to the node repository as <i>provisioned</i> nodes. - * If any of the nodes already exists in the deprovisioned state, the new node will be merged - * with the history of that node. - */ - public List<Node> addNodes(List<Node> nodes, Agent agent) { - try (Mutex lock = lockUnallocated()) { - List<Node> nodesToAdd = new ArrayList<>(); - List<Node> nodesToRemove = new ArrayList<>(); - for (int i = 0; i < nodes.size(); i++) { - var node = nodes.get(i); - - // Check for duplicates - for (int j = 0; j < i; j++) { - if (node.equals(nodes.get(j))) - illegal("Cannot add nodes: " + node + " is duplicated in the argument list"); - } - - Optional<Node> existing = getNode(node.hostname()); - if (existing.isPresent()) { - if (existing.get().state() != State.deprovisioned) - illegal("Cannot add " + node + ": A node with this name already exists"); - node = node.with(existing.get().history()); - node = node.with(existing.get().reports()); - node = node.with(node.status().withFailCount(existing.get().status().failCount())); - if (existing.get().status().firmwareVerifiedAt().isPresent()) - node = node.with(node.status().withFirmwareVerifiedAt(existing.get().status().firmwareVerifiedAt().get())); - nodesToRemove.add(existing.get()); - } - - nodesToAdd.add(node); - } - List<Node> resultingNodes = db.addNodesInState(IP.Config.verify(nodesToAdd, list(lock)), State.provisioned, agent); - db.removeNodes(nodesToRemove); - return resultingNodes; - } - } - - /** Sets a list of nodes ready and returns the nodes in the ready state */ - public List<Node> setReady(List<Node> nodes, Agent agent, String reason) { - try (Mutex lock = lockUnallocated()) { - List<Node> nodesWithResetFields = nodes.stream() - .map(node -> { - if (node.state() != State.provisioned && node.state() != State.dirty) - illegal("Can not set " + node + " ready. It is not provisioned or dirty."); - if (node.type() == NodeType.host && node.ipConfig().pool().getIpSet().isEmpty()) - illegal("Can not set host " + node + " ready. Its IP address pool is empty."); - return node.withWantToRetire(false, false, Agent.system, clock.instant()); - }) - .collect(Collectors.toList()); - - return db.writeTo(State.ready, nodesWithResetFields, agent, Optional.of(reason)); - } - } - - public Node setReady(String hostname, Agent agent, String reason) { - Node nodeToReady = getNode(hostname).orElseThrow(() -> - new NoSuchNodeException("Could not move " + hostname + " to ready: Node not found")); - - if (nodeToReady.state() == State.ready) return nodeToReady; - return setReady(List.of(nodeToReady), agent, reason).get(0); - } - - /** Reserve nodes. This method does <b>not</b> lock the node repository */ - public List<Node> reserve(List<Node> nodes) { - return db.writeTo(State.reserved, nodes, Agent.application, Optional.empty()); - } - - /** Activate nodes. 
This method does <b>not</b> lock the node repository */ - public List<Node> activate(List<Node> nodes, NestedTransaction transaction) { - return db.writeTo(State.active, nodes, Agent.application, Optional.empty(), transaction); - } - - /** - * Sets a list of nodes to have their allocation removable (active to inactive) in the node repository. - * - * @param application the application the nodes belong to - * @param nodes the nodes to make removable. These nodes MUST be in the active state. - */ - public void setRemovable(ApplicationId application, List<Node> nodes) { - try (Mutex lock = lock(application)) { - List<Node> removableNodes = - nodes.stream().map(node -> node.with(node.allocation().get().removable(true))) - .collect(Collectors.toList()); - write(removableNodes, lock); - } - } - - /** - * Deactivates these nodes in a transaction and returns the nodes in the new state which will hold if the - * transaction commits. - */ - public List<Node> deactivate(List<Node> nodes, ApplicationTransaction transaction) { - var stateless = NodeList.copyOf(nodes).stateless(); - var stateful = NodeList.copyOf(nodes).stateful(); - List<Node> written = new ArrayList<>(); - written.addAll(deallocate(stateless.asList(), Agent.application, "Deactivated by application", transaction.nested())); - written.addAll(db.writeTo(State.inactive, stateful.asList(), Agent.application, Optional.empty(), transaction.nested())); - return written; - - } - /** Removes this application: Active nodes are deactivated while all non-active nodes are set dirty. */ public void remove(ApplicationTransaction transaction) { - NodeList applicationNodes = list(transaction.application()); + NodeList applicationNodes = nodes().list(transaction.application()); NodeList activeNodes = applicationNodes.state(State.active); - deactivate(activeNodes.asList(), transaction); + nodes().deactivate(activeNodes.asList(), transaction); db.writeTo(State.dirty, applicationNodes.except(activeNodes.asSet()).asList(), Agent.system, @@ -433,497 +204,10 @@ public class NodeRepository extends AbstractComponent { applications.remove(transaction); } - /** Move nodes to the dirty state */ - public List<Node> deallocate(List<Node> nodes, Agent agent, String reason) { - return performOn(NodeListFilter.from(nodes), (node, lock) -> deallocate(node, agent, reason)); - } - - public List<Node> deallocateRecursively(String hostname, Agent agent, String reason) { - Node nodeToDirty = getNode(hostname).orElseThrow(() -> - new IllegalArgumentException("Could not deallocate " + hostname + ": Node not found")); - - List<Node> nodesToDirty = - (nodeToDirty.type().isHost() ? - Stream.concat(list().childrenOf(hostname).asList().stream(), Stream.of(nodeToDirty)) : - Stream.of(nodeToDirty)) - .filter(node -> node.state() != State.dirty) - .collect(Collectors.toList()); - - List<String> hostnamesNotAllowedToDirty = nodesToDirty.stream() - .filter(node -> node.state() != State.provisioned) - .filter(node -> node.state() != State.failed) - .filter(node -> node.state() != State.parked) - .filter(node -> node.state() != State.breakfixed) - .map(Node::hostname) - .collect(Collectors.toList()); - if ( ! 
hostnamesNotAllowedToDirty.isEmpty()) - illegal("Could not deallocate " + nodeToDirty + ": " + - hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked, breakfixed]"); - - return nodesToDirty.stream().map(node -> deallocate(node, agent, reason)).collect(Collectors.toList()); - } - - /** - * Set a node dirty or parked, allowed if it is in the provisioned, inactive, failed or parked state. - * Use this to clean newly provisioned nodes or to recycle failed nodes which have been repaired or put on hold. - */ - public Node deallocate(Node node, Agent agent, String reason) { - NestedTransaction transaction = new NestedTransaction(); - Node deallocated = deallocate(node, agent, reason, transaction); - transaction.commit(); - return deallocated; - } - - public List<Node> deallocate(List<Node> nodes, Agent agent, String reason, NestedTransaction transaction) { - return nodes.stream().map(node -> deallocate(node, agent, reason, transaction)).collect(Collectors.toList()); - } - - public Node deallocate(Node node, Agent agent, String reason, NestedTransaction transaction) { - if (node.state() != State.parked && agent != Agent.operator - && (node.status().wantToDeprovision() || retiredByOperator(node))) - return park(node.hostname(), false, agent, reason, transaction); - else - return db.writeTo(State.dirty, List.of(node), agent, Optional.of(reason), transaction).get(0); - } - - private static boolean retiredByOperator(Node node) { - return node.status().wantToRetire() && node.history().event(History.Event.Type.wantToRetire) - .map(History.Event::agent) - .map(agent -> agent == Agent.operator) - .orElse(false); - } - - /** - * Fails this node and returns it in its new state. - * - * @return the node in its new state - * @throws NoSuchNodeException if the node is not found - */ - public Node fail(String hostname, Agent agent, String reason) { - return move(hostname, true, State.failed, agent, Optional.of(reason)); - } - - /** - * Fails all the nodes that are children of hostname before finally failing the hostname itself. - * - * @return List of all the failed nodes in their new state - */ - public List<Node> failRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, State.failed, agent, Optional.of(reason)); - } - - /** - * Parks this node and returns it in its new state. - * - * @return the node in its new state - * @throws NoSuchNodeException if the node is not found - */ - public Node park(String hostname, boolean keepAllocation, Agent agent, String reason) { - NestedTransaction transaction = new NestedTransaction(); - Node parked = park(hostname, keepAllocation, agent, reason, transaction); - transaction.commit(); - return parked; - } - - public Node park(String hostname, boolean keepAllocation, Agent agent, String reason, NestedTransaction transaction) { - return move(hostname, keepAllocation, State.parked, agent, Optional.of(reason), transaction); - } - - /** - * Parks all the nodes that are children of hostname before finally parking the hostname itself. - * - * @return List of all the parked nodes in their new state - */ - public List<Node> parkRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, State.parked, agent, Optional.of(reason)); - } - - /** - * Moves a previously failed or parked node back to the active state. 
- * - * @return the node in its new state - * @throws NoSuchNodeException if the node is not found - */ - public Node reactivate(String hostname, Agent agent, String reason) { - return move(hostname, true, State.active, agent, Optional.of(reason)); - } - - /** - * Moves a host to breakfixed state, removing any children. - */ - public List<Node> breakfixRecursively(String hostname, Agent agent, String reason) { - Node node = getNode(hostname).orElseThrow(() -> - new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); - - try (Mutex lock = lockUnallocated()) { - requireBreakfixable(node); - List<Node> removed = removeChildren(node, false); - removed.add(move(node, State.breakfixed, agent, Optional.of(reason))); - return removed; - } - } - - private List<Node> moveRecursively(String hostname, State toState, Agent agent, Optional<String> reason) { - List<Node> moved = list().childrenOf(hostname).asList().stream() - .map(child -> move(child, toState, agent, reason)) - .collect(Collectors.toList()); - - moved.add(move(hostname, true, toState, agent, reason)); - return moved; - } - - private Node move(String hostname, boolean keepAllocation, State toState, Agent agent, Optional<String> reason) { - NestedTransaction transaction = new NestedTransaction(); - Node moved = move(hostname, keepAllocation, toState, agent, reason, transaction); - transaction.commit(); - return moved; - } - - private Node move(String hostname, boolean keepAllocation, State toState, Agent agent, Optional<String> reason, - NestedTransaction transaction) { - Node node = getNode(hostname).orElseThrow(() -> - new NoSuchNodeException("Could not move " + hostname + " to " + toState + ": Node not found")); - - if (!keepAllocation && node.allocation().isPresent()) { - node = node.withoutAllocation(); - } - - return move(node, toState, agent, reason, transaction); - } - - private Node move(Node node, State toState, Agent agent, Optional<String> reason) { - NestedTransaction transaction = new NestedTransaction(); - Node moved = move(node, toState, agent, reason, transaction); - transaction.commit(); - return moved; - } - - private Node move(Node node, State toState, Agent agent, Optional<String> reason, NestedTransaction transaction) { - if (toState == Node.State.active && node.allocation().isEmpty()) - illegal("Could not set " + node + " active. It has no allocation."); - - // TODO: Work out a safe lock acquisition strategy for moves, e.g. migrate to lockNode. - try (Mutex lock = lock(node)) { - if (toState == State.active) { - for (Node currentActive : getNodes(node.allocation().get().owner(), State.active)) { - if (node.allocation().get().membership().cluster().equals(currentActive.allocation().get().membership().cluster()) - && node.allocation().get().membership().index() == currentActive.allocation().get().membership().index()) - illegal("Could not set " + node + " active: Same cluster and index as " + currentActive); - } - } - return db.writeTo(toState, List.of(node), agent, reason, transaction).get(0); - } - } - - /* - * This method is used by the REST API to handle readying nodes for new allocations. For tenant docker - * containers this will remove the node from node repository, otherwise the node will be moved to state ready. 
- */ - public Node markNodeAvailableForNewAllocation(String hostname, Agent agent, String reason) { - Node node = getNode(hostname).orElseThrow(() -> new NotFoundException("No node with hostname '" + hostname + "'")); - if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER && node.type() == NodeType.tenant) { - if (node.state() != State.dirty) - illegal("Cannot make " + node + " available for new allocation as it is not in state [dirty]"); - return removeRecursively(node, true).get(0); - } - - if (node.state() == State.ready) return node; - - Node parentHost = node.parentHostname().flatMap(this::getNode).orElse(node); - List<String> failureReasons = NodeFailer.reasonsToFailParentHost(parentHost); - if ( ! failureReasons.isEmpty()) - illegal(node + " cannot be readied because it has hard failures: " + failureReasons); - - return setReady(List.of(node), agent, reason).get(0); - } - - /** - * Removes all the nodes that are children of hostname before finally removing the hostname itself. - * - * @return a List of all the nodes that have been removed or (for hosts) deprovisioned - */ - public List<Node> removeRecursively(String hostname) { - Node node = getNode(hostname).orElseThrow(() -> new NotFoundException("No node with hostname '" + hostname + "'")); - return removeRecursively(node, false); - } - - public List<Node> removeRecursively(Node node, boolean force) { - try (Mutex lock = lockUnallocated()) { - requireRemovable(node, false, force); - - if (node.type().isHost()) { - List<Node> removed = removeChildren(node, force); - if (zone.getCloud().dynamicProvisioning() || node.type() != NodeType.host) - db.removeNodes(List.of(node)); - else { - node = node.with(IP.Config.EMPTY); - move(node, State.deprovisioned, Agent.system, Optional.empty()); - } - removed.add(node); - return removed; - } - else { - List<Node> removed = List.of(node); - db.removeNodes(removed); - return removed; - } - } - } - - /** Forgets a deprovisioned node. This removes all traces of the node in the node repository. */ - public void forget(Node node) { - if (node.state() != State.deprovisioned) - throw new IllegalArgumentException(node + " must be deprovisioned before it can be forgotten"); - db.removeNodes(List.of(node)); - } - - private List<Node> removeChildren(Node node, boolean force) { - List<Node> children = list().childrenOf(node).asList(); - children.forEach(child -> requireRemovable(child, true, force)); - db.removeNodes(children); - return new ArrayList<>(children); - } - - /** - * Throws if the given node cannot be removed. Removal is allowed if: - * - Tenant node: node is unallocated - * - Host node: iff in state provisioned|failed|parked - * - Child node: - * If only removing the container node: node in state ready - * If also removing the parent node: child is in state provisioned|failed|parked|dirty|ready - */ - private void requireRemovable(Node node, boolean removingAsChild, boolean force) { - if (force) return; - - if (node.type() == NodeType.tenant && node.allocation().isPresent()) - illegal(node + " is currently allocated and cannot be removed"); - - if (!node.type().isHost() && !removingAsChild) { - if (node.state() != State.ready) - illegal(node + " can not be removed as it is not in the state " + State.ready); - } - else if (!node.type().isHost()) { // removing a child node - Set<State> legalStates = EnumSet.of(State.provisioned, State.failed, State.parked, State.dirty, State.ready); - if ( ! 
legalStates.contains(node.state())) - illegal(node + " can not be removed as it is not in the states " + legalStates); - } - else { // a host - Set<State> legalStates = EnumSet.of(State.provisioned, State.failed, State.parked); - if (! legalStates.contains(node.state())) - illegal(node + " can not be removed as it is not in the states " + legalStates); - } - } - - /** - * Throws if given node cannot be breakfixed. - * Breakfix is allowed if the following is true: - * - Node is tenant host - * - Node is in zone without dynamic provisioning - * - Node is in parked or failed state - */ - private void requireBreakfixable(Node node) { - if (zone().getCloud().dynamicProvisioning()) { - illegal("Can not breakfix in zone: " + zone()); - } - - if (node.type() != NodeType.host) { - illegal(node + " can not be breakfixed as it is not a tenant host"); - } - - Set<State> legalStates = EnumSet.of(State.failed, State.parked); - if (! legalStates.contains(node.state())) { - illegal(node + " can not be removed as it is not in the states " + legalStates); - } - } - - /** - * Increases the restart generation of the active nodes matching the filter. - * - * @return the nodes in their new state - */ - public List<Node> restart(NodeFilter filter) { - return performOn(StateFilter.from(State.active, filter), - (node, lock) -> write(node.withRestart(node.allocation().get().restartGeneration().withIncreasedWanted()), - lock)); - } - - /** - * Increases the reboot generation of the nodes matching the filter. - * - * @return the nodes in their new state - */ - public List<Node> reboot(NodeFilter filter) { - return performOn(filter, (node, lock) -> write(node.withReboot(node.status().reboot().withIncreasedWanted()), lock)); - } - - /** - * Set target OS version of all nodes matching given filter. - * - * @return the nodes in their new state - */ - public List<Node> upgradeOs(NodeFilter filter, Optional<Version> version) { - return performOn(filter, (node, lock) -> { - var newStatus = node.status().withOsVersion(node.status().osVersion().withWanted(version)); - return write(node.with(newStatus), lock); - }); - } - - /** Retire nodes matching given filter */ - public List<Node> retire(NodeFilter filter, Agent agent, Instant instant) { - return performOn(filter, (node, lock) -> write(node.withWantToRetire(true, agent, instant), lock)); - } - - /** - * Writes this node after it has changed some internal state but NOT changed its state field. - * This does NOT lock the node repository implicitly, but callers are expected to already hold the lock. - * - * @param lock already acquired lock - * @return the written node for convenience - */ - public Node write(Node node, Mutex lock) { return write(List.of(node), lock).get(0); } - - /** - * Writes these nodes after they have changed some internal state but NOT changed their state field. - * This does NOT lock the node repository implicitly, but callers are expected to already hold the lock. - * - * @param lock already acquired lock - * @return the written nodes for convenience - */ - public List<Node> write(List<Node> nodes, @SuppressWarnings("unused") Mutex lock) { - return db.writeTo(nodes, Agent.system, Optional.empty()); - } - - /** - * Performs an operation requiring locking on all nodes matching some filter. 
- * - * @param filter the filter determining the set of nodes where the operation will be performed - * @param action the action to perform - * @return the set of nodes on which the action was performed, as they became as a result of the operation - */ - private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) { - List<Node> unallocatedNodes = new ArrayList<>(); - ListMap<ApplicationId, Node> allocatedNodes = new ListMap<>(); - - // Group matching nodes by the lock needed - for (Node node : db.readNodes()) { - if ( ! filter.matches(node)) continue; - if (node.allocation().isPresent()) - allocatedNodes.put(node.allocation().get().owner(), node); - else - unallocatedNodes.add(node); - } - - // perform operation while holding locks - List<Node> resultingNodes = new ArrayList<>(); - try (Mutex lock = lockUnallocated()) { - for (Node node : unallocatedNodes) { - Optional<Node> currentNode = db.readNode(node.hostname()); // Re-read while holding lock - if (currentNode.isEmpty()) continue; - resultingNodes.add(action.apply(currentNode.get(), lock)); - } - } - for (Map.Entry<ApplicationId, List<Node>> applicationNodes : allocatedNodes.entrySet()) { - try (Mutex lock = lock(applicationNodes.getKey())) { - for (Node node : applicationNodes.getValue()) { - Optional<Node> currentNode = db.readNode(node.hostname()); // Re-read while holding lock - if (currentNode.isEmpty()) continue; - resultingNodes.add(action.apply(currentNode.get(), lock)); - } - } - } - return resultingNodes; - } - - public boolean canAllocateTenantNodeTo(Node host) { - return canAllocateTenantNodeTo(host, zone.getCloud().dynamicProvisioning()); - } - - public static boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) { - if ( ! host.type().canRun(NodeType.tenant)) return false; - if (host.status().wantToRetire()) return false; - if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false; - - if (dynamicProvisioning) - return EnumSet.of(State.active, State.ready, State.provisioned).contains(host.state()); - else - return host.state() == State.active; - } - /** Returns the time keeper of this system */ public Clock clock() { return clock; } /** Returns the zone of this system */ public Zone zone() { return zone; } - /** Create a lock which provides exclusive rights to making changes to the given application */ - public Mutex lock(ApplicationId application) { - return db.lock(application); - } - - /** Create a lock with a timeout which provides exclusive rights to making changes to the given application */ - public Mutex lock(ApplicationId application, Duration timeout) { - return db.lock(application, timeout); - } - - /** Create a lock which provides exclusive rights to modifying unallocated nodes */ - public Mutex lockUnallocated() { return db.lockInactive(); } - - /** Returns the unallocated/application lock, and the node acquired under that lock. 
*/ - public Optional<NodeMutex> lockAndGet(Node node) { - Node staleNode = node; - - final int maxRetries = 4; - for (int i = 0; i < maxRetries; ++i) { - Mutex lockToClose = lock(staleNode); - try { - // As an optimization we first try finding the node in the same state - Optional<Node> freshNode = getNode(staleNode.hostname(), staleNode.state()); - if (freshNode.isEmpty()) { - freshNode = getNode(staleNode.hostname()); - if (freshNode.isEmpty()) { - return Optional.empty(); - } - } - - if (Objects.equals(freshNode.get().allocation().map(Allocation::owner), - staleNode.allocation().map(Allocation::owner))) { - NodeMutex nodeMutex = new NodeMutex(freshNode.get(), lockToClose); - lockToClose = null; - return Optional.of(nodeMutex); - } - - // The wrong lock was held when the fresh node was fetched, so try again - staleNode = freshNode.get(); - } finally { - if (lockToClose != null) lockToClose.close(); - } - } - - throw new IllegalStateException("Giving up (after " + maxRetries + " attempts) " + - "fetching an up to date node under lock: " + node.hostname()); - } - - /** Returns the unallocated/application lock, and the node acquired under that lock. */ - public Optional<NodeMutex> lockAndGet(String hostname) { - return getNode(hostname).flatMap(this::lockAndGet); - } - - /** Returns the unallocated/application lock, and the node acquired under that lock. */ - public NodeMutex lockAndGetRequired(Node node) { - return lockAndGet(node).orElseThrow(() -> new IllegalArgumentException("No such node: " + node.hostname())); - } - - /** Returns the unallocated/application lock, and the node acquired under that lock. */ - public NodeMutex lockAndGetRequired(String hostname) { - return lockAndGet(hostname).orElseThrow(() -> new IllegalArgumentException("No such node: " + hostname)); - } - - private Mutex lock(Node node) { - return node.allocation().isPresent() ? 
lock(node.allocation().get().owner()) : lockUnallocated(); - } - - private void illegal(String message) { - throw new IllegalArgumentException(message); - } - } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java index 7064a7d2e6a..84634b26c4a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java @@ -49,7 +49,7 @@ public class AllocationOptimizer { limits = atLeast(minimumNodes, limits); Optional<AllocatableClusterResources> bestAllocation = Optional.empty(); - NodeList hosts = nodeRepository.list().hosts(); + NodeList hosts = nodeRepository.nodes().list().hosts(); for (int groups = limits.min().groups(); groups <= limits.max().groups(); groups++) { for (int nodes = limits.min().nodes(); nodes <= limits.max().nodes(); nodes++) { if (nodes % groups != 0) continue; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index c4ff86a5390..81fa7ed2d4b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -176,7 +176,7 @@ public class Autoscaler { return false; // A deployment is ongoing - if (nodeRepository.getNodes(nodes.first().get().allocation().get().owner(), Node.State.reserved).size() > 0) + if (nodeRepository.nodes().getNodes(nodes.first().get().allocation().get().owner(), Node.State.reserved).size() > 0) return false; return true; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java index 999acad7ab0..a881bde2a33 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MemoryMetricsDb.java @@ -75,7 +75,7 @@ public class MemoryMetricsDb implements MetricsDb { private void add(String hostname, MetricSnapshot snapshot) { NodeTimeseries timeseries = db.get(hostname); if (timeseries == null) { // new node - Optional<Node> node = nodeRepository.getNode(hostname); + Optional<Node> node = nodeRepository.nodes().getNode(hostname); if (node.isEmpty()) return; if (node.get().allocation().isEmpty()) return; timeseries = new NodeTimeseries(hostname, new ArrayList<>()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java index 961c1393550..b93c7930b5b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcher.java @@ -51,7 +51,7 @@ public class MetricsV2MetricsFetcher extends AbstractComponent implements Metric @Override public CompletableFuture<MetricsResponse> fetchMetrics(ApplicationId application) { - NodeList applicationNodes = nodeRepository.list(application).state(Node.State.active); + NodeList 
applicationNodes = nodeRepository.nodes().list(application).state(Node.State.active); Optional<Node> metricsV2Container = applicationNodes.container() .matching(node -> expectedUp(node)) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/SharedLoadBalancerService.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/SharedLoadBalancerService.java index 559dbe63cba..da5591e0800 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/SharedLoadBalancerService.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/lb/SharedLoadBalancerService.java @@ -34,7 +34,7 @@ public class SharedLoadBalancerService implements LoadBalancerService { @Override public LoadBalancerInstance create(LoadBalancerSpec spec, boolean force) { - var proxyNodes = new ArrayList<>(nodeRepository.getNodes(NodeType.proxy)); + var proxyNodes = new ArrayList<>(nodeRepository.nodes().getNodes(NodeType.proxy)); proxyNodes.sort(hostnameComparator); if (proxyNodes.size() == 0) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java index 8eb92217356..1d2938742aa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java @@ -52,7 +52,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - if ( ! nodeRepository().isWorking()) return false; + if ( ! nodeRepository().nodes().isWorking()) return false; boolean success = true; if ( ! 
nodeRepository().zone().environment().isProduction()) return success; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java index 88bf3426ee0..22c8e49825d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java @@ -30,7 +30,7 @@ public class DirtyExpirer extends Expirer { @Override protected void expire(List<Node> expired) { for (Node expiredNode : expired) - nodeRepository().fail(expiredNode.hostname(), Agent.DirtyExpirer, "Node is stuck in dirty"); + nodeRepository().nodes().fail(expiredNode.hostname(), Agent.DirtyExpirer, "Node is stuck in dirty"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 509b0e1352f..5e54f09f7a3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -28,6 +28,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.hosted.provision.node.IP; +import com.yahoo.vespa.hosted.provision.node.Nodes; import com.yahoo.vespa.hosted.provision.provisioning.FatalProvisioningException; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner; import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner.HostSharing; @@ -81,8 +82,8 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - try (Mutex lock = nodeRepository().lockUnallocated()) { - NodeList nodes = nodeRepository().list(); + try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { + NodeList nodes = nodeRepository().nodes().list(); resumeProvisioning(nodes, lock); convergeToCapacity(nodes); } @@ -102,7 +103,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { try { List<Node> updatedNodes = hostProvisioner.provision(host, children); verifyDns(updatedNodes); - nodeRepository().write(updatedNodes, lock); + nodeRepository().nodes().write(updatedNodes, lock); } catch (IllegalArgumentException | IllegalStateException e) { log.log(Level.INFO, "Failed to provision " + host.hostname() + " with " + children.size() + " children: " + Exceptions.toMessageString(e)); @@ -110,7 +111,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() + " children, failing out the host recursively", e); // Fail out as operator to force a quick redeployment - nodeRepository().failRecursively( + nodeRepository().nodes().failRecursively( host.hostname(), Agent.operator, "Failed by HostProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NameNotFoundException) @@ -137,7 +138,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { excessHosts.forEach(host -> { try { hostProvisioner.deprovision(host); - 
nodeRepository().removeRecursively(host, true); + nodeRepository().nodes().removeRecursively(host, true); } catch (RuntimeException e) { log.log(Level.WARNING, "Failed to deprovision " + host.hostname() + ", will retry in " + interval(), e); } @@ -199,7 +200,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { private Map<String, Node> findSharedHosts(NodeList nodeList) { return nodeList.stream() - .filter(node -> NodeRepository.canAllocateTenantNodeTo(node, true)) + .filter(node -> Nodes.canAllocateTenantNodeTo(node, true)) .filter(node -> node.reservedTo().isEmpty()) .filter(node -> node.exclusiveTo().isEmpty()) .collect(Collectors.toMap(Node::hostname, Function.identity())); @@ -244,7 +245,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { .stream() .map(ProvisionedHost::generateHost) .collect(Collectors.toList()); - nodeRepository().addNodes(hosts, Agent.DynamicProvisioningMaintainer); + nodeRepository().nodes().addNodes(hosts, Agent.DynamicProvisioningMaintainer); return hosts; } catch (OutOfCapacityException | IllegalArgumentException | IllegalStateException e) { throw new OutOfCapacityException("Failed to provision " + count + " " + nodeResources + ": " + e.getMessage()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java index ba59ab02780..8ccb8980a71 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Expirer.java @@ -41,7 +41,7 @@ public abstract class Expirer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - List<Node> expired = nodeRepository().getNodes(fromState).stream() + List<Node> expired = nodeRepository().nodes().getNodes(fromState).stream() .filter(this::isExpired) .collect(Collectors.toList()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index 1a47af6b929..08edee0be8b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -68,7 +68,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - List<Node> remainingNodes = nodeRepository.getNodes(Node.State.failed).stream() + List<Node> remainingNodes = nodeRepository.nodes().getNodes(Node.State.failed).stream() .filter(node -> node.type() == NodeType.tenant || node.type() == NodeType.host) .collect(Collectors.toList()); @@ -95,14 +95,14 @@ public class FailedExpirer extends NodeRepositoryMaintainer { for (Node candidate : nodes) { if (NodeFailer.hasHardwareIssue(candidate, nodeRepository)) { List<String> unparkedChildren = !candidate.type().isHost() ? 
List.of() : - nodeRepository.list() + nodeRepository.nodes().list() .childrenOf(candidate) .matching(node -> node.state() != Node.State.parked) .mapToList(Node::hostname); if (unparkedChildren.isEmpty()) { - nodeRepository.park(candidate.hostname(), false, Agent.FailedExpirer, - "Parked by FailedExpirer due to hardware issue"); + nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer, + "Parked by FailedExpirer due to hardware issue"); } else { log.info(String.format("Expired failed node %s with hardware issue was not parked because of " + "unparked children: %s", candidate.hostname(), @@ -112,7 +112,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { nodesToRecycle.add(candidate); } } - nodeRepository.deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer"); + nodeRepository.nodes().deallocate(nodesToRecycle, Agent.FailedExpirer, "Expired by FailedExpirer"); } /** Returns whether the current node fail count should be used as an indicator of hardware issue */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java index 231d2ac08b1..ae6e716bffe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/InactiveExpirer.java @@ -39,7 +39,7 @@ public class InactiveExpirer extends Expirer { @Override protected void expire(List<Node> expired) { expired.forEach(node -> { - nodeRepository.deallocate(node, Agent.InactiveExpirer, "Expired by InactiveExpirer"); + nodeRepository.nodes().deallocate(node, Agent.InactiveExpirer, "Expired by InactiveExpirer"); }); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java index febdbbd4dc1..2ef12177eaf 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java @@ -133,7 +133,7 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer { } private List<Node> allocatedNodes(LoadBalancerId loadBalancer) { - return nodeRepository().list().owner(loadBalancer.application()).cluster(loadBalancer.cluster()).asList(); + return nodeRepository().nodes().list().owner(loadBalancer.application()).cluster(loadBalancer.cluster()).asList(); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java index 52c487c28cf..e8f216c793a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java @@ -104,7 +104,7 @@ class MaintenanceDeployment implements Closeable { private Optional<Mutex> tryLock(ApplicationId application, NodeRepository nodeRepository) { try { // Use a short lock to avoid interfering with change deployments - return Optional.of(nodeRepository.lock(application, Duration.ofSeconds(1))); + return Optional.of(nodeRepository.nodes().lock(application, Duration.ofSeconds(1))); } catch 
(ApplicationLockException e) { return Optional.empty(); @@ -116,7 +116,7 @@ class MaintenanceDeployment implements Closeable { Deployer deployer, NodeRepository nodeRepository) { if (lock.isEmpty()) return Optional.empty(); - if (nodeRepository.getNodes(application, Node.State.active).isEmpty()) return Optional.empty(); + if (nodeRepository.nodes().getNodes(application, Node.State.active).isEmpty()) return Optional.empty(); return deployer.deployFromLocalActive(application); } @@ -168,7 +168,7 @@ class MaintenanceDeployment implements Closeable { if ( ! deployment.prepare()) return false; if (verifyTarget) { expectedNewNode = - nodeRepository.getNodes(application, Node.State.reserved).stream() + nodeRepository.nodes().getNodes(application, Node.State.reserved).stream() .filter(n -> !n.hostname().equals(node.hostname())) .filter(n -> n.allocation().get().membership().cluster().id().equals(node.allocation().get().membership().cluster().id())) .findAny(); @@ -185,15 +185,15 @@ class MaintenanceDeployment implements Closeable { markWantToRetire(node, false, agent, nodeRepository); // Necessary if this failed, no-op otherwise // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host - expectedNewNode.flatMap(node -> nodeRepository.getNode(node.hostname(), Node.State.reserved)) - .ifPresent(node -> nodeRepository.deallocate(node, agent, "Expired by " + agent)); + expectedNewNode.flatMap(node -> nodeRepository.nodes().getNode(node.hostname(), Node.State.reserved)) + .ifPresent(node -> nodeRepository.nodes().deallocate(node, agent, "Expired by " + agent)); } } } /** Returns true only if this operation changes the state of the wantToRetire flag */ private boolean markWantToRetire(Node node, boolean wantToRetire, Agent agent, NodeRepository nodeRepository) { - Optional<NodeMutex> nodeMutex = nodeRepository.lockAndGet(node); + Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(node); if (nodeMutex.isEmpty()) return false; try (var nodeLock = nodeMutex.get()) { @@ -201,7 +201,7 @@ class MaintenanceDeployment implements Closeable { if (nodeLock.node().status().wantToRetire() == wantToRetire) return false; - nodeRepository.write(nodeLock.node().withWantToRetire(wantToRetire, agent, nodeRepository.clock().instant()), nodeLock); + nodeRepository.nodes().write(nodeLock.node().withWantToRetire(wantToRetire, agent, nodeRepository.clock().instant()), nodeLock); return true; } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index c14210bf5a7..9504a73f21e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -66,7 +66,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { @Override public boolean maintain() { - NodeList nodes = nodeRepository().list(); + NodeList nodes = nodeRepository().nodes().list(); ServiceModel serviceModel = serviceMonitor.getServiceModelSnapshot(); updateZoneMetrics(); @@ -126,7 +126,7 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } private void updateZoneMetrics() { - metric.set("zone.working", nodeRepository().isWorking() ? 1 : 0, null); + metric.set("zone.working", nodeRepository().nodes().isWorking() ? 
1 : 0, null); } private void updateCacheMetrics() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index 2999655e5fa..7c3e3eb4553 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -75,13 +75,13 @@ public class NodeFailer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - if ( ! nodeRepository().isWorking()) return false; + if ( ! nodeRepository().nodes().isWorking()) return false; int throttledHostFailures = 0; int throttledNodeFailures = 0; // Ready nodes - try (Mutex lock = nodeRepository().lockUnallocated()) { + try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { for (Map.Entry<Node, String> entry : getReadyNodesByFailureReason().entrySet()) { Node node = entry.getKey(); if (throttle(node)) { @@ -90,7 +90,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { continue; } String reason = entry.getValue(); - nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason); + nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason); } } @@ -129,11 +129,11 @@ public class NodeFailer extends NodeRepositoryMaintainer { clock().instant().minus(downTimeLimit).minus(nodeRequestInterval); Map<Node, String> nodesByFailureReason = new HashMap<>(); - for (Node node : nodeRepository().getNodes(Node.State.ready)) { + for (Node node : nodeRepository().nodes().getNodes(Node.State.ready)) { if (expectConfigRequests(node) && ! hasNodeRequestedConfigAfter(node, oldestAcceptableRequestTime)) { nodesByFailureReason.put(node, "Not receiving config requests from node"); } else { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().getNode(parent)).orElse(node); List<String> failureReports = reasonsToFailParentHost(hostNode); if (failureReports.size() > 0) { if (hostNode.equals(node)) { @@ -148,7 +148,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { } private Map<Node, String> getActiveNodesByFailureReason() { - List<Node> activeNodes = nodeRepository().getNodes(Node.State.active); + List<Node> activeNodes = nodeRepository().nodes().getNodes(Node.State.active); Instant graceTimeEnd = clock().instant().minus(downTimeLimit); Map<Node, String> nodesByFailureReason = new HashMap<>(); for (Node node : activeNodes) { @@ -158,7 +158,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { nodesByFailureReason.put(node, "Node has been down longer than " + downTimeLimit); } else if (hostSuspended(node, activeNodes)) { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().getNode(parent)).orElse(node); + Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository().nodes().getNode(parent)).orElse(node); if (hostNode.type().isHost()) { List<String> failureReports = reasonsToFailParentHost(hostNode); if (failureReports.size() > 0) { @@ -184,7 +184,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { /** Returns whether node has any kind of hardware issue */ static boolean hasHardwareIssue(Node node, NodeRepository nodeRepository) { - Node hostNode = node.parentHostname().flatMap(parent -> nodeRepository.getNode(parent)).orElse(node); + Node hostNode = 
node.parentHostname().flatMap(parent -> nodeRepository.nodes().getNode(parent)).orElse(node); return reasonsToFailParentHost(hostNode).size() > 0; } @@ -246,7 +246,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { return true; case proxy: case proxyhost: - return nodeRepository().getNodes(nodeType, Node.State.failed).size() == 0; + return nodeRepository().nodes().getNodes(nodeType, Node.State.failed).size() == 0; default: return false; } @@ -264,21 +264,21 @@ public class NodeFailer extends NodeRepositoryMaintainer { deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30)); if (deployment.isEmpty()) return false; - try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) { + try (Mutex lock = nodeRepository().nodes().lock(node.allocation().get().owner())) { // If the active node that we are trying to fail is of type host, we need to successfully fail all // the children nodes running on it before we fail the host boolean allTenantNodesFailedOutSuccessfully = true; String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + reason; - for (Node failingTenantNode : nodeRepository().list().childrenOf(node)) { + for (Node failingTenantNode : nodeRepository().nodes().list().childrenOf(node)) { if (failingTenantNode.state() == Node.State.active) { allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure); } else { - nodeRepository().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); + nodeRepository().nodes().fail(failingTenantNode.hostname(), Agent.NodeFailer, reasonForChildFailure); } } if (! allTenantNodesFailedOutSuccessfully) return false; - node = nodeRepository().fail(node.hostname(), Agent.NodeFailer, reason); + node = nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason); try { deployment.get().activate(); return true; @@ -290,8 +290,8 @@ public class NodeFailer extends NodeRepositoryMaintainer { } catch (RuntimeException e) { // The expected reason for deployment to fail here is that there is no capacity available to redeploy. // In that case we should leave the node in the active state to avoid failing additional nodes. 
- nodeRepository().reactivate(node.hostname(), Agent.NodeFailer, - "Failed to redeploy after being failed by NodeFailer"); + nodeRepository().nodes().reactivate(node.hostname(), Agent.NodeFailer, + "Failed to redeploy after being failed by NodeFailer"); log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() + ", but redeploying without the node failed", e); return false; @@ -303,7 +303,7 @@ public class NodeFailer extends NodeRepositoryMaintainer { private boolean throttle(Node node) { if (throttlePolicy == ThrottlePolicy.disabled) return false; Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow); - List<Node> nodes = nodeRepository().getNodes(); + List<Node> nodes = nodeRepository().nodes().getNodes(); NodeList recentlyFailedNodes = nodes.stream() .filter(n -> n.state() == Node.State.failed) .filter(n -> n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 24af0d60cf6..92131a1cd74 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -57,15 +57,15 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { private void updateReadyNodeLivenessEvents() { // Update node last request events through ZooKeeper to collect request to all config servers. // We do this here ("lazily") to avoid writing to zk for each config request. - try (Mutex lock = nodeRepository().lockUnallocated()) { - for (Node node : nodeRepository().getNodes(Node.State.ready)) { + try (Mutex lock = nodeRepository().nodes().lockUnallocated()) { + for (Node node : nodeRepository().nodes().getNodes(Node.State.ready)) { Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname()); if (lastLocalRequest.isEmpty()) continue; if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) { History updatedHistory = node.history() .with(new History.Event(History.Event.Type.requested, Agent.NodeHealthTracker, lastLocalRequest.get())); - nodeRepository().write(node.with(updatedHistory), lock); + nodeRepository().nodes().write(node.with(updatedHistory), lock); } } } @@ -76,7 +76,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { * Otherwise we remove any "down" history record. 
*/ private void updateActiveNodeDownState() { - NodeList activeNodes = nodeRepository().list(Node.State.active); + NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first(); if (node.isEmpty()) return; @@ -87,7 +87,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { // Lock and update status ApplicationId owner = node.get().allocation().get().owner(); - try (var lock = nodeRepository().lock(owner)) { + try (var lock = nodeRepository().nodes().lock(owner)) { node = getNode(hostname.toString(), owner, lock); // Re-get inside lock if (node.isEmpty()) return; // Node disappeared or changed allocation if (isDown) { @@ -116,7 +116,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Get node by given hostname and application. The applicationLock must be held when calling this */ private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) { - return nodeRepository().getNode(hostname, Node.State.active) + return nodeRepository().nodes().getNode(hostname, Node.State.active) .filter(node -> node.allocation().isPresent()) .filter(node -> node.allocation().get().owner().equals(application)); } @@ -124,13 +124,13 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { /** Record a node as down if not already recorded */ private void recordAsDown(Node node, Mutex lock) { if (node.history().event(History.Event.Type.down).isPresent()) return; // already down: Don't change down timestamp - nodeRepository().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock); + nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock); } /** Clear down record for node, if any */ private void clearDownRecord(Node node, Mutex lock) { if (node.history().event(History.Event.Type.down).isEmpty()) return; - nodeRepository().write(node.up(), lock); + nodeRepository().nodes().write(node.up(), lock); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java index 0ab5611327b..58711e14d7f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeMover.java @@ -44,7 +44,7 @@ public abstract class NodeMover<MOVE> extends NodeRepositoryMaintainer { ApplicationId applicationId = node.allocation().get().owner(); if (applicationId.instance().isTester()) continue; if (deployedRecently(applicationId)) continue; - for (Node toHost : allNodes.matching(nodeRepository()::canAllocateTenantNodeTo)) { + for (Node toHost : allNodes.matching(nodeRepository().nodes()::canAllocateTenantNodeTo)) { if (toHost.hostname().equals(node.parentHostname().get())) continue; if ( ! 
capacity.freeCapacityOf(toHost).satisfies(node.resources())) continue; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java index 96d10415e63..e2cafbb9406 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooter.java @@ -39,13 +39,13 @@ public class NodeRebooter extends NodeRepositoryMaintainer { @Override protected boolean maintain() { // Reboot candidates: Nodes in long-term states, where we know we can safely orchestrate a reboot - List<Node> nodesToReboot = nodeRepository().getNodes(Node.State.active, Node.State.ready).stream() + List<Node> nodesToReboot = nodeRepository().nodes().getNodes(Node.State.active, Node.State.ready).stream() .filter(node -> node.type().isHost()) .filter(this::shouldReboot) .collect(Collectors.toList()); if (!nodesToReboot.isEmpty()) - nodeRepository().reboot(NodeListFilter.from(nodesToReboot)); + nodeRepository().nodes().reboot(NodeListFilter.from(nodesToReboot)); return true; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index 6d93b54229f..e6338d73a17 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -42,7 +42,7 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { /** A utility to group active tenant nodes by application */ protected Map<ApplicationId, List<Node>> activeNodesByApplication() { - return nodeRepository().list(Node.State.active) + return nodeRepository().nodes().list(Node.State.active) .nodeType(NodeType.tenant) .asList() .stream() diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java index b5933196ecc..49a33c4d120 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java @@ -38,7 +38,7 @@ public class OperatorChangeApplicationMaintainer extends ApplicationMaintainer { @Override protected Set<ApplicationId> applicationsNeedingMaintenance() { - Map<ApplicationId, List<Node>> nodesByApplication = nodeRepository().list() + Map<ApplicationId, List<Node>> nodesByApplication = nodeRepository().nodes().list() .nodeType(NodeType.tenant, NodeType.proxy).asList().stream() .filter(node -> node.allocation().isPresent()) .collect(Collectors.groupingBy(node -> node.allocation().get().owner(), Collectors.toList())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java index 68c1c98d8ba..3ff4ca89ad4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java +++ 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OsUpgradeActivator.java @@ -34,7 +34,8 @@ public class OsUpgradeActivator extends NodeRepositoryMaintainer { /** Returns whether to allow OS upgrade of nodes of given type */ private boolean canUpgradeOsOf(NodeType type) { - return nodeRepository().list(Node.State.ready, Node.State.active) + return nodeRepository().nodes() + .list(Node.State.ready, Node.State.active) .nodeType(type) .changingVersion() .asList() diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java index d20b06becaf..8253b3def0a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java @@ -76,7 +76,7 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { } protected List<Node> nodesNeedingMaintenance() { - return nodeRepository().getNodes(Node.State.active); + return nodeRepository().nodes().getNodes(Node.State.active); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java index 00cccab6f74..c5a2bc97ef2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ProvisionedExpirer.java @@ -24,7 +24,7 @@ public class ProvisionedExpirer extends Expirer { @Override protected void expire(List<Node> expired) { for (Node expiredNode : expired) - nodeRepository().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned"); + nodeRepository().nodes().parkRecursively(expiredNode.hostname(), Agent.ProvisionedExpirer, "Node is stuck in provisioned"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java index 1651c494f4a..88e64331c82 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/Rebalancer.java @@ -34,14 +34,14 @@ public class Rebalancer extends NodeMover<Rebalancer.Move> { @Override protected boolean maintain() { - if ( ! nodeRepository().isWorking()) return false; + if ( ! nodeRepository().nodes().isWorking()) return false; boolean success = true; if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; // Rebalancing not necessary if (nodeRepository().zone().environment().isTest()) return success; // Short lived deployments; no need to rebalance // Work with an unlocked snapshot as this can take a long time and full consistency is not needed - NodeList allNodes = nodeRepository().list(); + NodeList allNodes = nodeRepository().nodes().list(); updateSkewMetric(allNodes); if ( ! 
zoneIsStable(allNodes)) return success; findBestMove(allNodes).execute(true, Agent.Rebalancer, deployer, metric, nodeRepository()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java index 1967615de02..e55e735bda6 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ReservationExpirer.java @@ -25,6 +25,6 @@ public class ReservationExpirer extends Expirer { } @Override - protected void expire(List<Node> expired) { nodeRepository().deallocate(expired, Agent.ReservationExpirer, "Expired by ReservationExpirer"); } + protected void expire(List<Node> expired) { nodeRepository().nodes().deallocate(expired, Agent.ReservationExpirer, "Expired by ReservationExpirer"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index abda7ed6120..337d25ca732 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -45,7 +45,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - List<Node> activeNodes = nodeRepository().getNodes(Node.State.active); + List<Node> activeNodes = nodeRepository().nodes().getNodes(Node.State.active); Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream() .filter(node -> node.allocation().isPresent()) @@ -62,7 +62,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList()); if (nodesToRemove.isEmpty()) continue; - nodeRepository().setRemovable(application, nodesToRemove); + nodeRepository().nodes().setRemovable(application, nodesToRemove); boolean success = deployment.activate().isPresent(); if ( ! 
success) return success; @@ -83,7 +83,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { */ private boolean canRemove(Node node) { if (node.type().isHost()) { - if (nodeRepository() + if (nodeRepository().nodes() .list().childrenOf(node).asList().stream() .allMatch(child -> child.state() == Node.State.parked || child.state() == Node.State.failed)) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java index 3bbebd7798d..4f7ab498599 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java @@ -69,7 +69,7 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer { var suggestion = autoscaler.suggest(cluster.get(), clusterNodes); if (suggestion.isEmpty()) return false; // Wait only a short time for the lock to avoid interfering with change deployments - try (Mutex lock = nodeRepository().lock(applicationId, Duration.ofSeconds(1))) { + try (Mutex lock = nodeRepository().nodes().lock(applicationId, Duration.ofSeconds(1))) { // empty suggested resources == keep the current allocation, so we record that var suggestedResources = suggestion.target().orElse(clusterNodes.not().retired().toResources()); applications().get(applicationId).ifPresent(a -> updateSuggestion(suggestedResources, clusterId, a, lock)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java index ce2ab1dfa87..debc1484e58 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SpareCapacityMaintainer.java @@ -67,13 +67,13 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { @Override protected boolean maintain() { - if ( ! nodeRepository().isWorking()) return false; + if ( ! nodeRepository().nodes().isWorking()) return false; boolean success = true; // Don't need to maintain spare capacity in dynamically provisioned zones; can provision more on demand. if (nodeRepository().zone().getCloud().dynamicProvisioning()) return success; - NodeList allNodes = nodeRepository().list(); + NodeList allNodes = nodeRepository().nodes().list(); CapacityChecker capacityChecker = new CapacityChecker(allNodes); List<Node> overcommittedHosts = capacityChecker.findOvercommittedHosts(); @@ -116,7 +116,7 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { if (nodeWhichCantMove.isEmpty()) return List.of(); Node node = nodeWhichCantMove.get(); - NodeList allNodes = nodeRepository().list(); + NodeList allNodes = nodeRepository().nodes().list(); // Allocation will assign the spareCount most empty nodes as "spares", which will not be allocated on // unless needed for node failing. 
Our goal here is to make room on these spares for the given node HostCapacity hostCapacity = new HostCapacity(allNodes, nodeRepository().resourcesCalculator()); @@ -165,11 +165,11 @@ public class SpareCapacityMaintainer extends NodeRepositoryMaintainer { try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) { if ( ! deployment.isValid()) return; // this will be done at another config server - Optional<Node> nodeWithWantToRetire = nodeRepository().getNode(nodeToRetire.get().hostname()) + Optional<Node> nodeWithWantToRetire = nodeRepository().nodes().getNode(nodeToRetire.get().hostname()) .map(node -> node.withWantToRetire(true, Agent.SpareCapacityMaintainer, nodeRepository().clock().instant())); if (nodeWithWantToRetire.isEmpty()) return; - nodeRepository().write(nodeWithWantToRetire.get(), deployment.applicationLock().get()); + nodeRepository().nodes().write(nodeWithWantToRetire.get(), deployment.applicationLock().get()); log.log(Level.INFO, String.format("Redeploying %s to relocate %s from overcommited host", application, nodeToRetire.get().hostname())); deployment.activate(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java index e545b3d97ee..50e0116d98d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/SwitchRebalancer.java @@ -34,9 +34,9 @@ public class SwitchRebalancer extends NodeMover<Move> { @Override protected boolean maintain() { - if (!nodeRepository().isWorking()) return false; + if (!nodeRepository().nodes().isWorking()) return false; if (!nodeRepository().zone().environment().isProduction()) return true; - NodeList allNodes = nodeRepository().list(); // Lockless as strong consistency is not needed + NodeList allNodes = nodeRepository().nodes().list(); // Lockless as strong consistency is not needed if (!zoneIsStable(allNodes)) return true; findBestMove(allNodes).execute(false, Agent.SwitchRebalancer, deployer, metric, nodeRepository()); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java new file mode 100644 index 00000000000..d61c6f38306 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -0,0 +1,739 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
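Every hunk above applies the same mechanical migration: node queries and mutations that previously lived directly on NodeRepository are now reached through the Nodes facade returned by nodeRepository.nodes(), whose implementation follows below. A minimal caller-side sketch of that idiom, not part of this patch, using only methods visible in this diff (lockAndGet, write, withWantToRetire); the class name and agent choice are illustrative:

import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;

import java.util.Optional;

class NodesFacadeSketch {

    /** Flags a node as wanting to retire, going through nodeRepository.nodes() instead of NodeRepository directly. */
    static boolean markWantToRetire(Node node, Agent agent, NodeRepository nodeRepository) {
        // Acquire the owning application lock (or the unallocated lock) and re-read the node under it
        Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(node);
        if (nodeMutex.isEmpty()) return false; // node disappeared or changed owner

        try (var lock = nodeMutex.get()) {
            if (lock.node().status().wantToRetire()) return false; // no change needed
            // Write back through the facade while still holding the lock
            nodeRepository.nodes().write(lock.node().withWantToRetire(true, agent,
                                                                      nodeRepository.clock().instant()), lock);
            return true;
        }
    }
}

The same lock, re-read, write pattern appears in MaintenanceDeployment above and RetiringUpgrader below.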
+package com.yahoo.vespa.hosted.provision.node; + +import com.yahoo.collections.ListMap; +import com.yahoo.component.Version; +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.ApplicationTransaction; +import com.yahoo.config.provision.Flavor; +import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.Zone; +import com.yahoo.transaction.Mutex; +import com.yahoo.transaction.NestedTransaction; +import com.yahoo.vespa.hosted.provision.LockedNodeList; +import com.yahoo.vespa.hosted.provision.NoSuchNodeException; +import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeMutex; +import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer; +import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter; +import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter; +import com.yahoo.vespa.hosted.provision.node.filter.StateFilter; +import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient; +import com.yahoo.vespa.hosted.provision.restapi.NotFoundException; + +import java.time.Clock; +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * The nodes in the node repo and their state transitions + * + * @author bratseth + */ +// Node state transitions: +// 1) (new) | deprovisioned -> provisioned -> (dirty ->) ready -> reserved -> active -> inactive -> dirty -> ready +// 2) inactive -> reserved | parked +// 3) reserved -> dirty +// 4) * -> failed | parked -> (breakfixed) -> dirty | active | deprovisioned +// 5) deprovisioned -> (forgotten) +// Nodes have an application assigned when in states reserved, active and inactive. +// Nodes might have an application assigned in dirty. +public class Nodes { + + private final Zone zone; + private final Clock clock; + private final CuratorDatabaseClient db; + + public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock) { + this.zone = zone; + this.clock = clock; + this.db = db; + } + + // ---------------- Query API ---------------------------------------------------------------- + + /** + * Finds and returns the node with the hostname in any of the given states, or empty if not found + * + * @param hostname the full host name of the node + * @param inState the states the node may be in. If no states are given, it will be returned from any state + * @return the node, or empty if it was not found in any of the given states + */ + public Optional<Node> getNode(String hostname, Node.State... inState) { + return db.readNode(hostname, inState); + } + + /** + * Returns all nodes in any of the given states. + * + * @param inState the states to return nodes from. If no states are given, all nodes are returned + * @return all nodes in any of the given states + */ + public List<Node> getNodes(Node.State... inState) { + return new ArrayList<>(db.readNodes(inState)); + } + /** + * Finds and returns the nodes of the given type in any of the given states. + * + * @param type the node type to return + * @param inState the states to return nodes from. 
If no states are given, all nodes of the given type are returned + * @return the nodes of the given type in any of the given states + */ + public List<Node> getNodes(NodeType type, Node.State... inState) { + return db.readNodes(inState).stream().filter(node -> node.type().equals(type)).collect(Collectors.toList()); + } + + /** Returns a filterable list of nodes in this repository in any of the given states */ + public NodeList list(Node.State... inState) { + return NodeList.copyOf(getNodes(inState)); + } + + public NodeList list(ApplicationId application, Node.State... inState) { + return NodeList.copyOf(getNodes(application, inState)); + } + + /** Returns a filterable list of all nodes of an application */ + public NodeList list(ApplicationId application) { + return NodeList.copyOf(getNodes(application)); + } + + /** Returns a locked list of all nodes in this repository */ + public LockedNodeList list(Mutex lock) { + return new LockedNodeList(getNodes(), lock); + } + + public List<Node> getNodes(ApplicationId id, Node.State... inState) { return db.readNodes(id, inState); } + public List<Node> getInactive() { return db.readNodes(Node.State.inactive); } + public List<Node> getFailed() { return db.readNodes(Node.State.failed); } + + /** + * Returns whether the zone managed by this node repository seems to be working. + * If too many nodes are not responding, there is probably some zone-wide issue + * and we should probably refrain from making changes to it. + */ + public boolean isWorking() { + NodeList activeNodes = list(Node.State.active); + if (activeNodes.size() <= 5) return true; // Not enough data to decide + NodeList downNodes = activeNodes.down(); + return ! ( (double)downNodes.size() / (double)activeNodes.size() > 0.2 ); + } + + // ----------------- Node lifecycle ----------------------------------------------------------- + + /** Adds a list of newly created docker container nodes to the node repository as <i>reserved</i> nodes */ + public List<Node> addDockerNodes(LockedNodeList nodes) { + for (Node node : nodes) { + if ( ! node.flavor().getType().equals(Flavor.Type.DOCKER_CONTAINER)) + illegal("Cannot add " + node + ": This is not a docker node"); + if (node.allocation().isEmpty()) + illegal("Cannot add " + node + ": Docker containers need to be allocated"); + Optional<Node> existing = getNode(node.hostname()); + if (existing.isPresent()) + illegal("Cannot add " + node + ": A node with this name already exists (" + + existing.get() + ", " + existing.get().history() + "). Node to be added: " + + node + ", " + node.history()); + } + return db.addNodesInState(nodes.asList(), Node.State.reserved, Agent.system); + } + + /** + * Adds a list of (newly created) nodes to the node repository as <i>provisioned</i> nodes. + * If any of the nodes already exists in the deprovisioned state, the new node will be merged + * with the history of that node. 
+ */ + public List<Node> addNodes(List<Node> nodes, Agent agent) { + try (Mutex lock = lockUnallocated()) { + List<Node> nodesToAdd = new ArrayList<>(); + List<Node> nodesToRemove = new ArrayList<>(); + for (int i = 0; i < nodes.size(); i++) { + var node = nodes.get(i); + + // Check for duplicates + for (int j = 0; j < i; j++) { + if (node.equals(nodes.get(j))) + illegal("Cannot add nodes: " + node + " is duplicated in the argument list"); + } + + Optional<Node> existing = getNode(node.hostname()); + if (existing.isPresent()) { + if (existing.get().state() != Node.State.deprovisioned) + illegal("Cannot add " + node + ": A node with this name already exists"); + node = node.with(existing.get().history()); + node = node.with(existing.get().reports()); + node = node.with(node.status().withFailCount(existing.get().status().failCount())); + if (existing.get().status().firmwareVerifiedAt().isPresent()) + node = node.with(node.status().withFirmwareVerifiedAt(existing.get().status().firmwareVerifiedAt().get())); + nodesToRemove.add(existing.get()); + } + + nodesToAdd.add(node); + } + List<Node> resultingNodes = db.addNodesInState(IP.Config.verify(nodesToAdd, list(lock)), Node.State.provisioned, agent); + db.removeNodes(nodesToRemove); + return resultingNodes; + } + } + + /** Sets a list of nodes ready and returns the nodes in the ready state */ + public List<Node> setReady(List<Node> nodes, Agent agent, String reason) { + try (Mutex lock = lockUnallocated()) { + List<Node> nodesWithResetFields = nodes.stream() + .map(node -> { + if (node.state() != Node.State.provisioned && node.state() != Node.State.dirty) + illegal("Can not set " + node + " ready. It is not provisioned or dirty."); + if (node.type() == NodeType.host && node.ipConfig().pool().getIpSet().isEmpty()) + illegal("Can not set host " + node + " ready. Its IP address pool is empty."); + return node.withWantToRetire(false, false, Agent.system, clock.instant()); + }) + .collect(Collectors.toList()); + + return db.writeTo(Node.State.ready, nodesWithResetFields, agent, Optional.of(reason)); + } + } + + public Node setReady(String hostname, Agent agent, String reason) { + Node nodeToReady = getNode(hostname).orElseThrow(() -> + new NoSuchNodeException("Could not move " + hostname + " to ready: Node not found")); + + if (nodeToReady.state() == Node.State.ready) return nodeToReady; + return setReady(List.of(nodeToReady), agent, reason).get(0); + } + + /** Reserve nodes. This method does <b>not</b> lock the node repository */ + public List<Node> reserve(List<Node> nodes) { + return db.writeTo(Node.State.reserved, nodes, Agent.application, Optional.empty()); + } + + /** Activate nodes. This method does <b>not</b> lock the node repository */ + public List<Node> activate(List<Node> nodes, NestedTransaction transaction) { + return db.writeTo(Node.State.active, nodes, Agent.application, Optional.empty(), transaction); + } + + /** + * Sets a list of nodes to have their allocation removable (active to inactive) in the node repository. + * + * @param application the application the nodes belong to + * @param nodes the nodes to make removable. These nodes MUST be in the active state. 
+ */ + public void setRemovable(ApplicationId application, List<Node> nodes) { + try (Mutex lock = lock(application)) { + List<Node> removableNodes = + nodes.stream().map(node -> node.with(node.allocation().get().removable(true))) + .collect(Collectors.toList()); + write(removableNodes, lock); + } + } + + /** + * Deactivates these nodes in a transaction and returns the nodes in the new state which will hold if the + * transaction commits. + */ + public List<Node> deactivate(List<Node> nodes, ApplicationTransaction transaction) { + var stateless = NodeList.copyOf(nodes).stateless(); + var stateful = NodeList.copyOf(nodes).stateful(); + List<Node> written = new ArrayList<>(); + written.addAll(deallocate(stateless.asList(), Agent.application, "Deactivated by application", transaction.nested())); + written.addAll(db.writeTo(Node.State.inactive, stateful.asList(), Agent.application, Optional.empty(), transaction.nested())); + return written; + + } + + /** Move nodes to the dirty state */ + public List<Node> deallocate(List<Node> nodes, Agent agent, String reason) { + return performOn(NodeListFilter.from(nodes), (node, lock) -> deallocate(node, agent, reason)); + } + + public List<Node> deallocateRecursively(String hostname, Agent agent, String reason) { + Node nodeToDirty = getNode(hostname).orElseThrow(() -> + new IllegalArgumentException("Could not deallocate " + hostname + ": Node not found")); + + List<Node> nodesToDirty = + (nodeToDirty.type().isHost() ? + Stream.concat(list().childrenOf(hostname).asList().stream(), Stream.of(nodeToDirty)) : + Stream.of(nodeToDirty)) + .filter(node -> node.state() != Node.State.dirty) + .collect(Collectors.toList()); + + List<String> hostnamesNotAllowedToDirty = nodesToDirty.stream() + .filter(node -> node.state() != Node.State.provisioned) + .filter(node -> node.state() != Node.State.failed) + .filter(node -> node.state() != Node.State.parked) + .filter(node -> node.state() != Node.State.breakfixed) + .map(Node::hostname) + .collect(Collectors.toList()); + if ( ! hostnamesNotAllowedToDirty.isEmpty()) + illegal("Could not deallocate " + nodeToDirty + ": " + + hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked, breakfixed]"); + + return nodesToDirty.stream().map(node -> deallocate(node, agent, reason)).collect(Collectors.toList()); + } + + /** + * Set a node dirty or parked, allowed if it is in the provisioned, inactive, failed or parked state. + * Use this to clean newly provisioned nodes or to recycle failed nodes which have been repaired or put on hold. 
+ */ + public Node deallocate(Node node, Agent agent, String reason) { + NestedTransaction transaction = new NestedTransaction(); + Node deallocated = deallocate(node, agent, reason, transaction); + transaction.commit(); + return deallocated; + } + + public List<Node> deallocate(List<Node> nodes, Agent agent, String reason, NestedTransaction transaction) { + return nodes.stream().map(node -> deallocate(node, agent, reason, transaction)).collect(Collectors.toList()); + } + + public Node deallocate(Node node, Agent agent, String reason, NestedTransaction transaction) { + if (node.state() != Node.State.parked && agent != Agent.operator + && (node.status().wantToDeprovision() || retiredByOperator(node))) + return park(node.hostname(), false, agent, reason, transaction); + else + return db.writeTo(Node.State.dirty, List.of(node), agent, Optional.of(reason), transaction).get(0); + } + + private static boolean retiredByOperator(Node node) { + return node.status().wantToRetire() && node.history().event(History.Event.Type.wantToRetire) + .map(History.Event::agent) + .map(agent -> agent == Agent.operator) + .orElse(false); + } + + /** + * Fails this node and returns it in its new state. + * + * @return the node in its new state + * @throws NoSuchNodeException if the node is not found + */ + public Node fail(String hostname, Agent agent, String reason) { + return move(hostname, true, Node.State.failed, agent, Optional.of(reason)); + } + + /** + * Fails all the nodes that are children of hostname before finally failing the hostname itself. + * + * @return List of all the failed nodes in their new state + */ + public List<Node> failRecursively(String hostname, Agent agent, String reason) { + return moveRecursively(hostname, Node.State.failed, agent, Optional.of(reason)); + } + + /** + * Parks this node and returns it in its new state. + * + * @return the node in its new state + * @throws NoSuchNodeException if the node is not found + */ + public Node park(String hostname, boolean keepAllocation, Agent agent, String reason) { + NestedTransaction transaction = new NestedTransaction(); + Node parked = park(hostname, keepAllocation, agent, reason, transaction); + transaction.commit(); + return parked; + } + + public Node park(String hostname, boolean keepAllocation, Agent agent, String reason, NestedTransaction transaction) { + return move(hostname, keepAllocation, Node.State.parked, agent, Optional.of(reason), transaction); + } + + /** + * Parks all the nodes that are children of hostname before finally parking the hostname itself. + * + * @return List of all the parked nodes in their new state + */ + public List<Node> parkRecursively(String hostname, Agent agent, String reason) { + return moveRecursively(hostname, Node.State.parked, agent, Optional.of(reason)); + } + + /** + * Moves a previously failed or parked node back to the active state. + * + * @return the node in its new state + * @throws NoSuchNodeException if the node is not found + */ + public Node reactivate(String hostname, Agent agent, String reason) { + return move(hostname, true, Node.State.active, agent, Optional.of(reason)); + } + + /** + * Moves a host to breakfixed state, removing any children. 
+ */ + public List<Node> breakfixRecursively(String hostname, Agent agent, String reason) { + Node node = getNode(hostname).orElseThrow(() -> + new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); + + try (Mutex lock = lockUnallocated()) { + requireBreakfixable(node); + List<Node> removed = removeChildren(node, false); + removed.add(move(node, Node.State.breakfixed, agent, Optional.of(reason))); + return removed; + } + } + + private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) { + List<Node> moved = list().childrenOf(hostname).asList().stream() + .map(child -> move(child, toState, agent, reason)) + .collect(Collectors.toList()); + + moved.add(move(hostname, true, toState, agent, reason)); + return moved; + } + + private Node move(String hostname, boolean keepAllocation, Node.State toState, Agent agent, Optional<String> reason) { + NestedTransaction transaction = new NestedTransaction(); + Node moved = move(hostname, keepAllocation, toState, agent, reason, transaction); + transaction.commit(); + return moved; + } + + private Node move(String hostname, boolean keepAllocation, Node.State toState, Agent agent, Optional<String> reason, + NestedTransaction transaction) { + Node node = getNode(hostname).orElseThrow(() -> + new NoSuchNodeException("Could not move " + hostname + " to " + toState + ": Node not found")); + + if (!keepAllocation && node.allocation().isPresent()) { + node = node.withoutAllocation(); + } + + return move(node, toState, agent, reason, transaction); + } + + private Node move(Node node, Node.State toState, Agent agent, Optional<String> reason) { + NestedTransaction transaction = new NestedTransaction(); + Node moved = move(node, toState, agent, reason, transaction); + transaction.commit(); + return moved; + } + + private Node move(Node node, Node.State toState, Agent agent, Optional<String> reason, NestedTransaction transaction) { + if (toState == Node.State.active && node.allocation().isEmpty()) + illegal("Could not set " + node + " active. It has no allocation."); + + // TODO: Work out a safe lock acquisition strategy for moves, e.g. migrate to lockNode. + try (Mutex lock = lock(node)) { + if (toState == Node.State.active) { + for (Node currentActive : getNodes(node.allocation().get().owner(), Node.State.active)) { + if (node.allocation().get().membership().cluster().equals(currentActive.allocation().get().membership().cluster()) + && node.allocation().get().membership().index() == currentActive.allocation().get().membership().index()) + illegal("Could not set " + node + " active: Same cluster and index as " + currentActive); + } + } + return db.writeTo(toState, List.of(node), agent, reason, transaction).get(0); + } + } + + /* + * This method is used by the REST API to handle readying nodes for new allocations. For tenant docker + * containers this will remove the node from node repository, otherwise the node will be moved to state ready. 
+ */ + public Node markNodeAvailableForNewAllocation(String hostname, Agent agent, String reason) { + Node node = getNode(hostname).orElseThrow(() -> new NotFoundException("No node with hostname '" + hostname + "'")); + if (node.flavor().getType() == Flavor.Type.DOCKER_CONTAINER && node.type() == NodeType.tenant) { + if (node.state() != Node.State.dirty) + illegal("Cannot make " + node + " available for new allocation as it is not in state [dirty]"); + return removeRecursively(node, true).get(0); + } + + if (node.state() == Node.State.ready) return node; + + Node parentHost = node.parentHostname().flatMap(this::getNode).orElse(node); + List<String> failureReasons = NodeFailer.reasonsToFailParentHost(parentHost); + if ( ! failureReasons.isEmpty()) + illegal(node + " cannot be readied because it has hard failures: " + failureReasons); + + return setReady(List.of(node), agent, reason).get(0); + } + + /** + * Removes all the nodes that are children of hostname before finally removing the hostname itself. + * + * @return a List of all the nodes that have been removed or (for hosts) deprovisioned + */ + public List<Node> removeRecursively(String hostname) { + Node node = getNode(hostname).orElseThrow(() -> new NotFoundException("No node with hostname '" + hostname + "'")); + return removeRecursively(node, false); + } + + public List<Node> removeRecursively(Node node, boolean force) { + try (Mutex lock = lockUnallocated()) { + requireRemovable(node, false, force); + + if (node.type().isHost()) { + List<Node> removed = removeChildren(node, force); + if (zone.getCloud().dynamicProvisioning() || node.type() != NodeType.host) + db.removeNodes(List.of(node)); + else { + node = node.with(IP.Config.EMPTY); + move(node, Node.State.deprovisioned, Agent.system, Optional.empty()); + } + removed.add(node); + return removed; + } + else { + List<Node> removed = List.of(node); + db.removeNodes(removed); + return removed; + } + } + } + + /** Forgets a deprovisioned node. This removes all traces of the node in the node repository. */ + public void forget(Node node) { + if (node.state() != Node.State.deprovisioned) + throw new IllegalArgumentException(node + " must be deprovisioned before it can be forgotten"); + db.removeNodes(List.of(node)); + } + + private List<Node> removeChildren(Node node, boolean force) { + List<Node> children = list().childrenOf(node).asList(); + children.forEach(child -> requireRemovable(child, true, force)); + db.removeNodes(children); + return new ArrayList<>(children); + } + + /** + * Throws if the given node cannot be removed. Removal is allowed if: + * - Tenant node: node is unallocated + * - Host node: iff in state provisioned|failed|parked + * - Child node: + * If only removing the container node: node in state ready + * If also removing the parent node: child is in state provisioned|failed|parked|dirty|ready + */ + private void requireRemovable(Node node, boolean removingAsChild, boolean force) { + if (force) return; + + if (node.type() == NodeType.tenant && node.allocation().isPresent()) + illegal(node + " is currently allocated and cannot be removed"); + + if (!node.type().isHost() && !removingAsChild) { + if (node.state() != Node.State.ready) + illegal(node + " can not be removed as it is not in the state " + Node.State.ready); + } + else if (!node.type().isHost()) { // removing a child node + Set<Node.State> legalStates = EnumSet.of(Node.State.provisioned, Node.State.failed, Node.State.parked, Node.State.dirty, Node.State.ready); + if ( ! 
legalStates.contains(node.state())) + illegal(node + " can not be removed as it is not in the states " + legalStates); + } + else { // a host + Set<Node.State> legalStates = EnumSet.of(Node.State.provisioned, Node.State.failed, Node.State.parked); + if (! legalStates.contains(node.state())) + illegal(node + " can not be removed as it is not in the states " + legalStates); + } + } + + /** + * Throws if given node cannot be breakfixed. + * Breakfix is allowed if the following is true: + * - Node is tenant host + * - Node is in zone without dynamic provisioning + * - Node is in parked or failed state + */ + private void requireBreakfixable(Node node) { + if (zone.getCloud().dynamicProvisioning()) { + illegal("Can not breakfix in zone: " + zone); + } + + if (node.type() != NodeType.host) { + illegal(node + " can not be breakfixed as it is not a tenant host"); + } + + Set<Node.State> legalStates = EnumSet.of(Node.State.failed, Node.State.parked); + if (! legalStates.contains(node.state())) { + illegal(node + " can not be removed as it is not in the states " + legalStates); + } + } + + /** + * Increases the restart generation of the active nodes matching the filter. + * + * @return the nodes in their new state + */ + public List<Node> restart(NodeFilter filter) { + return performOn(StateFilter.from(Node.State.active, filter), + (node, lock) -> write(node.withRestart(node.allocation().get().restartGeneration().withIncreasedWanted()), + lock)); + } + + /** + * Increases the reboot generation of the nodes matching the filter. + * + * @return the nodes in their new state + */ + public List<Node> reboot(NodeFilter filter) { + return performOn(filter, (node, lock) -> write(node.withReboot(node.status().reboot().withIncreasedWanted()), lock)); + } + + /** + * Set target OS version of all nodes matching given filter. + * + * @return the nodes in their new state + */ + public List<Node> upgradeOs(NodeFilter filter, Optional<Version> version) { + return performOn(filter, (node, lock) -> { + var newStatus = node.status().withOsVersion(node.status().osVersion().withWanted(version)); + return write(node.with(newStatus), lock); + }); + } + + /** Retire nodes matching given filter */ + public List<Node> retire(NodeFilter filter, Agent agent, Instant instant) { + return performOn(filter, (node, lock) -> write(node.withWantToRetire(true, agent, instant), lock)); + } + + /** + * Writes this node after it has changed some internal state but NOT changed its state field. + * This does NOT lock the node repository implicitly, but callers are expected to already hold the lock. + * + * @param lock already acquired lock + * @return the written node for convenience + */ + public Node write(Node node, Mutex lock) { return write(List.of(node), lock).get(0); } + + /** + * Writes these nodes after they have changed some internal state but NOT changed their state field. + * This does NOT lock the node repository implicitly, but callers are expected to already hold the lock. + * + * @param lock already acquired lock + * @return the written nodes for convenience + */ + public List<Node> write(List<Node> nodes, @SuppressWarnings("unused") Mutex lock) { + return db.writeTo(nodes, Agent.system, Optional.empty()); + } + + /** + * Performs an operation requiring locking on all nodes matching some filter. 
+ * + * @param filter the filter determining the set of nodes where the operation will be performed + * @param action the action to perform + * @return the set of nodes on which the action was performed, as they became as a result of the operation + */ + private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) { + List<Node> unallocatedNodes = new ArrayList<>(); + ListMap<ApplicationId, Node> allocatedNodes = new ListMap<>(); + + // Group matching nodes by the lock needed + for (Node node : db.readNodes()) { + if ( ! filter.matches(node)) continue; + if (node.allocation().isPresent()) + allocatedNodes.put(node.allocation().get().owner(), node); + else + unallocatedNodes.add(node); + } + + // perform operation while holding locks + List<Node> resultingNodes = new ArrayList<>(); + try (Mutex lock = lockUnallocated()) { + for (Node node : unallocatedNodes) { + Optional<Node> currentNode = db.readNode(node.hostname()); // Re-read while holding lock + if (currentNode.isEmpty()) continue; + resultingNodes.add(action.apply(currentNode.get(), lock)); + } + } + for (Map.Entry<ApplicationId, List<Node>> applicationNodes : allocatedNodes.entrySet()) { + try (Mutex lock = lock(applicationNodes.getKey())) { + for (Node node : applicationNodes.getValue()) { + Optional<Node> currentNode = db.readNode(node.hostname()); // Re-read while holding lock + if (currentNode.isEmpty()) continue; + resultingNodes.add(action.apply(currentNode.get(), lock)); + } + } + } + return resultingNodes; + } + + public boolean canAllocateTenantNodeTo(Node host) { + return canAllocateTenantNodeTo(host, zone.getCloud().dynamicProvisioning()); + } + + public static boolean canAllocateTenantNodeTo(Node host, boolean dynamicProvisioning) { + if ( ! host.type().canRun(NodeType.tenant)) return false; + if (host.status().wantToRetire()) return false; + if (host.allocation().map(alloc -> alloc.membership().retired()).orElse(false)) return false; + + if (dynamicProvisioning) + return EnumSet.of(Node.State.active, Node.State.ready, Node.State.provisioned).contains(host.state()); + else + return host.state() == Node.State.active; + } + + /** Create a lock which provides exclusive rights to making changes to the given application */ + public Mutex lock(ApplicationId application) { + return db.lock(application); + } + + /** Create a lock with a timeout which provides exclusive rights to making changes to the given application */ + public Mutex lock(ApplicationId application, Duration timeout) { + return db.lock(application, timeout); + } + + /** Create a lock which provides exclusive rights to modifying unallocated nodes */ + public Mutex lockUnallocated() { return db.lockInactive(); } + + /** Returns the unallocated/application lock, and the node acquired under that lock. 
*/ + public Optional<NodeMutex> lockAndGet(Node node) { + Node staleNode = node; + + final int maxRetries = 4; + for (int i = 0; i < maxRetries; ++i) { + Mutex lockToClose = lock(staleNode); + try { + // As an optimization we first try finding the node in the same state + Optional<Node> freshNode = getNode(staleNode.hostname(), staleNode.state()); + if (freshNode.isEmpty()) { + freshNode = getNode(staleNode.hostname()); + if (freshNode.isEmpty()) { + return Optional.empty(); + } + } + + if (Objects.equals(freshNode.get().allocation().map(Allocation::owner), + staleNode.allocation().map(Allocation::owner))) { + NodeMutex nodeMutex = new NodeMutex(freshNode.get(), lockToClose); + lockToClose = null; + return Optional.of(nodeMutex); + } + + // The wrong lock was held when the fresh node was fetched, so try again + staleNode = freshNode.get(); + } finally { + if (lockToClose != null) lockToClose.close(); + } + } + + throw new IllegalStateException("Giving up (after " + maxRetries + " attempts) " + + "fetching an up to date node under lock: " + node.hostname()); + } + + /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public Optional<NodeMutex> lockAndGet(String hostname) { + return getNode(hostname).flatMap(this::lockAndGet); + } + + /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public NodeMutex lockAndGetRequired(Node node) { + return lockAndGet(node).orElseThrow(() -> new IllegalArgumentException("No such node: " + node.hostname())); + } + + /** Returns the unallocated/application lock, and the node acquired under that lock. */ + public NodeMutex lockAndGetRequired(String hostname) { + return lockAndGet(hostname).orElseThrow(() -> new IllegalArgumentException("No such node: " + hostname)); + } + + private Mutex lock(Node node) { + return node.allocation().isPresent() ? 
lock(node.allocation().get().owner()) : lockUnallocated();
+    }
+
+    private void illegal(String message) {
+        throw new IllegalArgumentException(message);
+    }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingUpgrader.java
index 74b288d77c5..5410cb06269 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/DelegatingUpgrader.java
@@ -35,7 +35,7 @@ public class DelegatingUpgrader implements Upgrader {

     @Override
     public void upgradeTo(OsVersionTarget target) {
-        NodeList activeNodes = nodeRepository.list().nodeType(target.nodeType()).state(Node.State.active);
+        NodeList activeNodes = nodeRepository.nodes().list().nodeType(target.nodeType()).state(Node.State.active);
         int numberToUpgrade = Math.max(0, maxActiveUpgrades - activeNodes.changingOsVersionTo(target.version()).size());
         NodeList nodesToUpgrade = activeNodes.not().changingOsVersionTo(target.version())
                                              .osVersionIsBefore(target.version())
@@ -44,17 +44,17 @@ public class DelegatingUpgrader implements Upgrader {
         if (nodesToUpgrade.size() == 0) return;
         LOG.info("Upgrading " + nodesToUpgrade.size() + " nodes of type " + target.nodeType() +
                  " to OS version " + target.version().toFullString());
-        nodeRepository.upgradeOs(NodeListFilter.from(nodesToUpgrade.asList()), Optional.of(target.version()));
+        nodeRepository.nodes().upgradeOs(NodeListFilter.from(nodesToUpgrade.asList()), Optional.of(target.version()));
     }

     @Override
     public void disableUpgrade(NodeType type) {
-        NodeList nodesUpgrading = nodeRepository.list()
+        NodeList nodesUpgrading = nodeRepository.nodes().list()
                                                 .nodeType(type)
                                                 .changingOsVersion();
         if (nodesUpgrading.size() == 0) return;
         LOG.info("Disabling OS upgrade of all " + type + " nodes");
-        nodeRepository.upgradeOs(NodeListFilter.from(nodesUpgrading.asList()), Optional.empty());
+        nodeRepository.nodes().upgradeOs(NodeListFilter.from(nodesUpgrading.asList()), Optional.empty());
     }

 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringUpgrader.java
index 68ac96fdf1d..8615488b92f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringUpgrader.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringUpgrader.java
@@ -35,7 +35,7 @@ public class RetiringUpgrader implements Upgrader {

     @Override
     public void upgradeTo(OsVersionTarget target) {
-        NodeList allNodes = nodeRepository.list();
+        NodeList allNodes = nodeRepository.nodes().list();
         NodeList activeNodes = allNodes.state(Node.State.active).nodeType(target.nodeType());
         if (activeNodes.size() == 0) return; // No nodes eligible for upgrade

@@ -62,7 +62,7 @@ public class RetiringUpgrader implements Upgrader {
     /** Retire and deprovision given host and its children */
     private void retire(Node host, Version target, Instant now, NodeList allNodes) {
         if (!host.type().isHost()) throw new IllegalArgumentException("Cannot retire non-host " + host);
-        Optional<NodeMutex> nodeMutex = nodeRepository.lockAndGet(host);
+        Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(host);
         if (nodeMutex.isEmpty()) return;
         try (var lock = nodeMutex.get()) {
             host = lock.node();
@@ -72,10 +72,10 @@ public class RetiringUpgrader implements Upgrader {
                      host.status().osVersion().current().map(Version::toFullString).orElse("<unset>") +
                      ", want " + target);
             NodeList children = allNodes.childrenOf(host);
-            nodeRepository.retire(NodeListFilter.from(children.asList()), Agent.RetiringUpgrader, now);
+            nodeRepository.nodes().retire(NodeListFilter.from(children.asList()), Agent.RetiringUpgrader, now);
             host = host.withWantToRetire(true, true, Agent.RetiringUpgrader, now);
             host = host.with(host.status().withOsVersion(host.status().osVersion().withWanted(Optional.of(target))));
-            nodeRepository.write(host, lock);
+            nodeRepository.nodes().write(host, lock);
             nodeRepository.osVersions().writeChange((change) -> change.withRetirementAt(now, nodeType));
         }
     }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index 2ec3912181b..cad9faacf20 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -64,7 +64,7 @@ class Activator {
         Instant activationTime = nodeRepository.clock().instant(); // Use one timestamp for all activation changes
         ApplicationId application = transaction.application();
         Set<String> hostnames = hosts.stream().map(HostSpec::hostname).collect(Collectors.toSet());
-        NodeList allNodes = nodeRepository.list();
+        NodeList allNodes = nodeRepository.nodes().list();
         NodeList applicationNodes = allNodes.owner(application);

         List<Node> reserved = applicationNodes.state(Node.State.reserved).asList();
@@ -86,8 +86,8 @@ class Activator {

         List<Node> activeToRemove = removeHostsFromList(hostnames, oldActive);
         activeToRemove = activeToRemove.stream().map(Node::unretire).collect(Collectors.toList()); // only active nodes can be retired. TODO: Move this line to deactivate
-        nodeRepository.deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
-        nodeRepository.activate(newActive, transaction.nested()); // activate also continued active to update node state
+        nodeRepository.nodes().deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
+        nodeRepository.nodes().activate(newActive, transaction.nested()); // activate also continued active to update node state

         rememberResourceChange(transaction, generation, activationTime,
                                NodeList.copyOf(oldActive).not().retired(),
@@ -126,16 +126,16 @@ class Activator {
     private void unreserveParentsOf(List<Node> nodes) {
         for (Node node : nodes) {
             if ( node.parentHostname().isEmpty()) continue;
-            Optional<Node> parentNode = nodeRepository.getNode(node.parentHostname().get());
+            Optional<Node> parentNode = nodeRepository.nodes().getNode(node.parentHostname().get());
             if (parentNode.isEmpty()) continue;
             if (parentNode.get().reservedTo().isEmpty()) continue;

             // Above is an optimization to avoid unnecessary locking - now repeat all conditions under lock
-            Optional<NodeMutex> parent = nodeRepository.lockAndGet(node.parentHostname().get());
+            Optional<NodeMutex> parent = nodeRepository.nodes().lockAndGet(node.parentHostname().get());
             if (parent.isEmpty()) continue;
             try (var lock = parent.get()) {
                 if (lock.node().reservedTo().isEmpty()) continue;
-                nodeRepository.write(lock.node().withoutReservedTo(), lock);
+                nodeRepository.nodes().write(lock.node().withoutReservedTo(), lock);
             }
         }
     }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index d9ca0100402..cd381b467d4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -81,8 +81,8 @@ public class GroupPreparer {
         }

         // There were some changes, so re-do the allocation with locks
-        try (Mutex lock = nodeRepository.lock(application);
-             Mutex allocationLock = nodeRepository.lockUnallocated()) {
+        try (Mutex lock = nodeRepository.nodes().lock(application);
+             Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) {

             NodeAllocation allocation = prepareAllocation(application, cluster, requestedNodes, surplusActiveNodes,
                                                           highestIndex, wantedGroups, allocationLock,
@@ -109,7 +109,7 @@ public class GroupPreparer {
                 List<Node> hosts = provisionedHosts.stream()
                                                    .map(ProvisionedHost::generateHost)
                                                    .collect(Collectors.toList());
-                nodeRepository.addNodes(hosts, Agent.application);
+                nodeRepository.nodes().addNodes(hosts, Agent.application);

                 // Offer the nodes on the newly provisioned hosts, this should be enough to cover the deficit
                 List<NodeCandidate> candidates = provisionedHosts.stream()
@@ -124,8 +124,8 @@ public class GroupPreparer {
                                                allocation.outOfCapacityDetails());

             // Carry out and return allocation
-            nodeRepository.reserve(allocation.reservableNodes());
-            nodeRepository.addDockerNodes(new LockedNodeList(allocation.newNodes(), allocationLock));
+            nodeRepository.nodes().reserve(allocation.reservableNodes());
+            nodeRepository.nodes().addDockerNodes(new LockedNodeList(allocation.newNodes(), allocationLock));
             List<Node> acceptedNodes = allocation.finalNodes();
             surplusActiveNodes.removeAll(acceptedNodes);
             return acceptedNodes;
@@ -135,7 +135,7 @@ public class GroupPreparer {

     private NodeAllocation prepareAllocation(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
                                              List<Node> surplusActiveNodes, MutableInteger highestIndex, int wantedGroups, Mutex allocationLock, String allocateOsRequirement) {
-        LockedNodeList allNodes = nodeRepository.list(allocationLock);
+        LockedNodeList allNodes = nodeRepository.nodes().list(allocationLock);
         NodeAllocation allocation = new NodeAllocation(allNodes, application, cluster, requestedNodes, highestIndex,
                                                        nodeRepository);
         NodePrioritizer prioritizer = new NodePrioritizer(
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
index d81d16fe62e..1b98514e3a8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
@@ -89,7 +89,7 @@ public class InfraDeployerImpl implements InfraDeployer {
         public void prepare() {
             if (prepared) return;

-            try (Mutex lock = nodeRepository.lock(application.getApplicationId())) {
+            try (Mutex lock = nodeRepository.nodes().lock(application.getApplicationId())) {
                 NodeType nodeType = application.getCapacity().type();
                 Version targetVersion = infrastructureVersions.getTargetVersionFor(nodeType);
                 hostSpecs = provisioner.prepare(application.getApplicationId(),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
index 356b27dfe2c..156d1023bbc 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/LoadBalancerProvisioner.java
@@ -210,7 +210,7 @@ public class LoadBalancerProvisioner {

     /** Returns the load balanced clusters of given application and their nodes */
     private Map<ClusterSpec.Id, List<Node>> loadBalancedClustersOf(ApplicationId application) {
-        NodeList nodes = NodeList.copyOf(nodeRepository.getNodes(Node.State.reserved, Node.State.active))
+        NodeList nodes = NodeList.copyOf(nodeRepository.nodes().getNodes(Node.State.reserved, Node.State.active))
                                  .owner(application);
         if (nodes.stream().anyMatch(node -> node.type() == NodeType.config)) {
             nodes = nodes.nodeType(NodeType.config).type(ClusterSpec.Type.admin);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index a8db081dbfd..9578e43609b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -9,6 +9,7 @@ import com.yahoo.vespa.hosted.provision.LockedNodeList;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.vespa.hosted.provision.node.Nodes;
 import com.yahoo.vespa.hosted.provision.persistence.NameResolver;

 import java.util.ArrayList;
@@ -136,7 +137,7 @@ public class NodePrioritizer {
         if ( !canAllocateNew) return;

         for (Node host : allNodes) {
-            if ( ! NodeRepository.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
+            if ( ! Nodes.canAllocateTenantNodeTo(host, dynamicProvisioning)) continue;
             if (host.reservedTo().isPresent() && !host.reservedTo().get().equals(application.tenant())) continue;
             if (host.reservedTo().isPresent() && application.instance().isTester()) continue;
             if (host.exclusiveTo().isPresent()) continue; // Never allocate new nodes to exclusive hosts
@@ -221,7 +222,7 @@ public class NodePrioritizer {
         if (node.type() != NodeType.tenant || node.parentHostname().isEmpty()) return true;
         Optional<Node> parent = allNodes.parentOf(node);
         if (parent.isEmpty()) return false;
-        return NodeRepository.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning);
+        return Nodes.canAllocateTenantNodeTo(parent.get(), dynamicProvisioning);
     }

 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index 05c20ee69f1..825ea82e95c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -118,7 +118,7 @@ public class NodeRepositoryProvisioner implements Provisioner {

     @Override
     public void restart(ApplicationId application, HostFilter filter) {
-        nodeRepository.restart(ApplicationFilter.from(application, NodeHostFilter.from(filter)));
+        nodeRepository.nodes().restart(ApplicationFilter.from(application, NodeHostFilter.from(filter)));
     }

     @Override
@@ -129,7 +129,7 @@ public class NodeRepositoryProvisioner implements Provisioner {

     @Override
     public ProvisionLock lock(ApplicationId application) {
-        return new ProvisionLock(application, nodeRepository.lock(application));
+        return new ProvisionLock(application, nodeRepository.nodes().lock(application));
     }

     /**
@@ -137,7 +137,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
      * and updates the application store with the received min and max.
      */
     private ClusterResources decideTargetResources(ApplicationId applicationId, ClusterSpec clusterSpec, Capacity requested) {
-        try (Mutex lock = nodeRepository.lock(applicationId)) {
+        try (Mutex lock = nodeRepository.nodes().lock(applicationId)) {
             Application application = nodeRepository.applications().get(applicationId).orElse(new Application(applicationId));
             application = application.withCluster(clusterSpec.id(), clusterSpec.isExclusive(), requested.minResources(), requested.maxResources());
             nodeRepository.applications().put(application, lock);
@@ -150,7 +150,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
     }

     private ClusterResources currentResources(ApplicationId applicationId, ClusterSpec clusterSpec, Capacity requested) {
-        List<Node> nodes = NodeList.copyOf(nodeRepository.getNodes(applicationId, Node.State.active))
+        List<Node> nodes = NodeList.copyOf(nodeRepository.nodes().getNodes(applicationId, Node.State.active))
                                    .cluster(clusterSpec.id())
                                    .not().retired()
                                    .not().removable()
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
index 6597c64a399..18ab9b70491 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Preparer.java
@@ -85,7 +85,7 @@ class Preparer {
      */
     private List<Node> findNodesInRemovableGroups(ApplicationId application, ClusterSpec requestedCluster, int wantedGroups) {
         List<Node> surplusNodes = new ArrayList<>(0);
-        for (Node node : nodeRepository.getNodes(application, Node.State.active)) {
+        for (Node node : nodeRepository.nodes().getNodes(application, Node.State.active)) {
             ClusterSpec nodeCluster = node.allocation().get().membership().cluster();
             if ( ! nodeCluster.id().equals(requestedCluster.id())) continue;
             if ( ! nodeCluster.type().equals(requestedCluster.type())) continue;
@@ -127,7 +127,7 @@ class Preparer {
      */
     private int findHighestIndex(ApplicationId application, ClusterSpec cluster) {
         int highestIndex = -1;
-        for (Node node : nodeRepository.getNodes(application, Node.State.allocatedStates().toArray(new Node.State[0]))) {
+        for (Node node : nodeRepository.nodes().getNodes(application, Node.State.allocatedStates().toArray(new Node.State[0]))) {
             ClusterSpec nodeCluster = node.allocation().get().membership().cluster();
             if ( ! nodeCluster.id().equals(cluster.id())) continue;
             if ( ! nodeCluster.type().equals(cluster.type())) continue;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
index 219fdaf9663..3c06533b8f5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/HostCapacityResponse.java
@@ -28,7 +28,7 @@ public class HostCapacityResponse extends HttpResponse {

     public HostCapacityResponse(NodeRepository nodeRepository, HttpRequest request) {
         super(200);
-        capacityChecker = new CapacityChecker(nodeRepository.list());
+        capacityChecker = new CapacityChecker(nodeRepository.nodes().list());

         json = request.getBooleanProperty("json");
         String hostsJson = request.getProperty("hosts");
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeAclResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeAclResponse.java
index 2d2feccc114..708a2f73ee6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeAclResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeAclResponse.java
@@ -39,11 +39,11 @@ public class NodeAclResponse extends HttpResponse {
     }

     private void toSlime(String hostname, Cursor object) {
-        Node node = nodeRepository.getNode(hostname)
+        Node node = nodeRepository.nodes().getNode(hostname)
                 .orElseThrow(() -> new NotFoundException("No node with hostname '" + hostname + "'"));

         List<NodeAcl> acls = aclsForChildren ? nodeRepository.getChildAcls(node) :
-                List.of(node.acl(nodeRepository.list(), nodeRepository.loadBalancers()));
+                List.of(node.acl(nodeRepository.nodes().list(), nodeRepository.loadBalancers()));

         Cursor trustedNodesArray = object.setArray("trustedNodes");
         acls.forEach(nodeAcl -> toSlime(nodeAcl, trustedNodesArray));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java
index 47b0021874d..ea951798da0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java
@@ -73,9 +73,9 @@ public class NodePatcher implements AutoCloseable {
             throw new UncheckedIOException("Error reading request body", e);
         }

-        this.patchedNodes = new PatchedNodes(nodeRepository.lockAndGetRequired(node));
+        this.patchedNodes = new PatchedNodes(nodeRepository.nodes().lockAndGetRequired(node));
         try {
-            this.memoizedNodes = Suppliers.memoize(() -> nodeRepository.list(patchedNodes.nodeMutex()));
+            this.memoizedNodes = Suppliers.memoize(() -> nodeRepository.nodes().list(patchedNodes.nodeMutex()));
         } catch (RuntimeException e) {
             patchedNodes.close();
             throw e;
@@ -312,7 +312,7 @@ public class NodePatcher implements AutoCloseable {
             if (!fetchedChildren) {
                 memoizedNodes.get()
                         .childrenOf(hostname)
-                        .forEach(node -> nodeRepository.lockAndGet(node)
+                        .forEach(node -> nodeRepository.nodes().lockAndGet(node)
                                 .ifPresent(nodeMutex -> nodes.put(nodeMutex.node().hostname(), nodeMutex)));
                 fetchedChildren = true;
             }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
index e12952062b8..e71902f908b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesResponse.java
@@ -110,13 +110,13 @@ class NodesResponse extends HttpResponse {
     private void nodesToSlime(Node.State state, Cursor parentObject) {
         Cursor nodeArray = parentObject.setArray("nodes");
         for (NodeType type : NodeType.values())
-            toSlime(nodeRepository.getNodes(type, state), nodeArray);
+            toSlime(nodeRepository.nodes().getNodes(type, state), nodeArray);
     }

     /** Outputs all the nodes to a node array */
     private void nodesToSlime(Cursor parentObject) {
         Cursor nodeArray = parentObject.setArray("nodes");
-        toSlime(nodeRepository.getNodes(), nodeArray);
+        toSlime(nodeRepository.nodes().getNodes(), nodeArray);
     }

     private void toSlime(List<Node> nodes, Cursor array) {
@@ -127,7 +127,7 @@ class NodesResponse extends HttpResponse {
     }

     private void nodeToSlime(String hostname, Cursor object) {
-        Node node = nodeRepository.getNode(hostname).orElseThrow(() ->
+        Node node = nodeRepository.nodes().getNode(hostname).orElseThrow(() ->
                 new NotFoundException("No node with hostname '" + hostname + "'"));
         toSlime(node, true, object);
     }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 08bfd104863..a6a58b6a9dd 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -130,27 +130,27 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {

         // Check paths to disallow illegal state changes
         if (path.startsWith("/nodes/v2/state/ready/") || path.startsWith("/nodes/v2/state/availablefornewallocations/")) {
-            nodeRepository.markNodeAvailableForNewAllocation(lastElement(path), Agent.operator, "Readied through the nodes/v2 API");
+            nodeRepository.nodes().markNodeAvailableForNewAllocation(lastElement(path), Agent.operator, "Readied through the nodes/v2 API");
             return new MessageResponse("Moved " + lastElement(path) + " to ready");
         }
         else if (path.startsWith("/nodes/v2/state/failed/")) {
-            List<Node> failedNodes = nodeRepository.failRecursively(lastElement(path), Agent.operator, "Failed through the nodes/v2 API");
+            List<Node> failedNodes = nodeRepository.nodes().failRecursively(lastElement(path), Agent.operator, "Failed through the nodes/v2 API");
             return new MessageResponse("Moved " + hostnamesAsString(failedNodes) + " to failed");
         }
         else if (path.startsWith("/nodes/v2/state/parked/")) {
-            List<Node> parkedNodes = nodeRepository.parkRecursively(lastElement(path), Agent.operator, "Parked through the nodes/v2 API");
+            List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(lastElement(path), Agent.operator, "Parked through the nodes/v2 API");
             return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to parked");
         }
         else if (path.startsWith("/nodes/v2/state/dirty/")) {
-            List<Node> dirtiedNodes = nodeRepository.deallocateRecursively(lastElement(path), Agent.operator, "Dirtied through the nodes/v2 API");
+            List<Node> dirtiedNodes = nodeRepository.nodes().deallocateRecursively(lastElement(path), Agent.operator, "Dirtied through the nodes/v2 API");
             return new MessageResponse("Moved " + hostnamesAsString(dirtiedNodes) + " to dirty");
         }
         else if (path.startsWith("/nodes/v2/state/active/")) {
-            nodeRepository.reactivate(lastElement(path), Agent.operator, "Reactivated through nodes/v2 API");
+            nodeRepository.nodes().reactivate(lastElement(path), Agent.operator, "Reactivated through nodes/v2 API");
             return new MessageResponse("Moved " + lastElement(path) + " to active");
         }
         else if (path.startsWith("/nodes/v2/state/breakfixed/")) {
-            List<Node> breakfixedNodes = nodeRepository.breakfixRecursively(lastElement(path), Agent.operator, "Breakfixed through the nodes/v2 API");
+            List<Node> breakfixedNodes = nodeRepository.nodes().breakfixRecursively(lastElement(path), Agent.operator, "Breakfixed through the nodes/v2 API");
             return new MessageResponse("Breakfixed " + hostnamesAsString(breakfixedNodes));
         }

@@ -162,7 +162,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
         if (path.startsWith("/nodes/v2/node/")) {
             try (NodePatcher patcher = new NodePatcher(nodeFlavors, request.getData(), nodeFromRequest(request), nodeRepository)) {
                 var patchedNodes = patcher.apply();
-                nodeRepository.write(patchedNodes, patcher.nodeMutexOfHost());
+                nodeRepository.nodes().write(patchedNodes, patcher.nodeMutexOfHost());

                 return new MessageResponse("Updated " + patcher.nodeMutexOfHost().node().hostname());
             }
@@ -177,11 +177,11 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
     private HttpResponse handlePOST(HttpRequest request) {
         Path path = new Path(request.getUri());
         if (path.matches("/nodes/v2/command/restart")) {
-            int restartCount = nodeRepository.restart(toNodeFilter(request)).size();
+            int restartCount = nodeRepository.nodes().restart(toNodeFilter(request)).size();
             return new MessageResponse("Scheduled restart of " + restartCount + " matching nodes");
         }
         if (path.matches("/nodes/v2/command/reboot")) {
-            int rebootCount = nodeRepository.reboot(toNodeFilter(request)).size();
+            int rebootCount = nodeRepository.nodes().reboot(toNodeFilter(request)).size();
             return new MessageResponse("Scheduled reboot of " + rebootCount + " matching nodes");
         }
         if (path.matches("/nodes/v2/node")) {
@@ -208,14 +208,14 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
     }

     private HttpResponse deleteNode(String hostname) {
-        Optional<NodeMutex> nodeMutex = nodeRepository.lockAndGet(hostname);
+        Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(hostname);
         if (nodeMutex.isEmpty()) throw new NotFoundException("No node with hostname '" + hostname + "'");
         try (var lock = nodeMutex.get()) {
             if (lock.node().state() == Node.State.deprovisioned) {
-                nodeRepository.forget(lock.node());
+                nodeRepository.nodes().forget(lock.node());
                 return new MessageResponse("Permanently removed " + hostname);
             } else {
-                List<Node> removedNodes = nodeRepository.removeRecursively(hostname);
+                List<Node> removedNodes = nodeRepository.nodes().removeRecursively(hostname);
                 return new MessageResponse("Removed " + removedNodes.stream().map(Node::hostname).collect(Collectors.joining(", ")));
             }
         }
@@ -223,13 +223,13 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {

     private Node nodeFromRequest(HttpRequest request) {
         String hostname = lastElement(request.getUri().getPath());
-        return nodeRepository.getNode(hostname).orElseThrow(() ->
+        return nodeRepository.nodes().getNode(hostname).orElseThrow(() ->
                 new NotFoundException("No node found with hostname " + hostname));
     }

     public int addNodes(InputStream jsonStream) {
         List<Node> nodes = createNodesFromSlime(toSlime(jsonStream).get());
-        return nodeRepository.addNodes(nodes, Agent.operator).size();
+        return nodeRepository.nodes().addNodes(nodes, Agent.operator).size();
     }

     private Slime toSlime(InputStream jsonStream) {
@@ -435,7 +435,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {

         if (application.isEmpty()) return ErrorResponse.notFoundError("No application '" + id + "'");
         Slime slime = ApplicationSerializer.toSlime(application.get(),
-                                                    nodeRepository.getNodes(id, Node.State.active),
+                                                    nodeRepository.nodes().getNodes(id, Node.State.active),
                                                     withPath("/nodes/v2/applications/" + id, uri));
         return new SlimeJsonResponse(slime);
     }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
index 9461cb00c5f..c31ebbb2c11 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
@@ -195,9 +195,9 @@ public class MockDeployer implements Deployer {
         public long activate() {
             lastDeployTimes.put(applicationId, clock.instant());

-            for (Node node : nodeRepository.list().owner(applicationId).state(Node.State.active).wantToRetire().asList()) {
-                try (NodeMutex lock = nodeRepository.lockAndGetRequired(node)) {
-                    nodeRepository.write(lock.node().retire(nodeRepository.clock().instant()), lock);
+            for (Node node : nodeRepository.nodes().list().owner(applicationId).state(Node.State.active).wantToRetire().asList()) {
+                try (NodeMutex lock = nodeRepository.nodes().lockAndGetRequired(node)) {
+                    nodeRepository.nodes().write(lock.node().retire(nodeRepository.clock().instant()), lock);
                 }
             }
             return redeployments++;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
index cf1ccf9a052..f6649b44c0b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
@@ -139,17 +139,17 @@ public class MockNodeRepository extends NodeRepository {
         nodes.add(Node.create("cfg2", ipConfig(202), "cfg2.yahoo.com", flavors.getFlavorOrThrow("default"), NodeType.config).build());

         // Ready all nodes, except 7 and 55
-        nodes = addNodes(nodes, Agent.system);
+        nodes = nodes().addNodes(nodes, Agent.system);
         nodes.remove(node7);
         nodes.remove(node55);
-        nodes = deallocate(nodes, Agent.system, getClass().getSimpleName());
-        setReady(nodes, Agent.system, getClass().getSimpleName());
+        nodes = nodes().deallocate(nodes, Agent.system, getClass().getSimpleName());
+        nodes().setReady(nodes, Agent.system, getClass().getSimpleName());

-        fail(node5.hostname(), Agent.system, getClass().getSimpleName());
-        deallocateRecursively(node55.hostname(), Agent.system, getClass().getSimpleName());
+        nodes().fail(node5.hostname(), Agent.system, getClass().getSimpleName());
+        nodes().deallocateRecursively(node55.hostname(), Agent.system, getClass().getSimpleName());

-        fail("dockerhost6.yahoo.com", Agent.operator, getClass().getSimpleName());
-        removeRecursively("dockerhost6.yahoo.com");
+        nodes().fail("dockerhost6.yahoo.com", Agent.operator, getClass().getSimpleName());
+        nodes().removeRecursively("dockerhost6.yahoo.com");

         ApplicationId zoneApp = ApplicationId.from(TenantName.from("zoneapp"), ApplicationName.from("zoneapp"), InstanceName.from("zoneapp"));
         ClusterSpec zoneCluster = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build();
@@ -169,7 +169,7 @@ public class MockNodeRepository extends NodeRepository {
                                                                           clock().instant())));
         cluster1 = cluster1.withTarget(Optional.of(new ClusterResources(4, 1, new NodeResources(3, 16, 100, 1))));
-        try (Mutex lock = lock(app1Id)) {
+        try (Mutex lock = nodes().lock(app1Id)) {
             applications().put(app1.with(cluster1), lock);
         }

@@ -184,8 +184,8 @@ public class MockNodeRepository extends NodeRepository {
         List<Node> largeNodes = new ArrayList<>();
         largeNodes.add(Node.create("node13", ipConfig(13), "host13.yahoo.com", resources(10, 48, 500, 1, fast, local), NodeType.tenant).build());
         largeNodes.add(Node.create("node14", ipConfig(14), "host14.yahoo.com", resources(10, 48, 500, 1, fast, local), NodeType.tenant).build());
-        addNodes(largeNodes, Agent.system);
-        setReady(largeNodes, Agent.system, getClass().getSimpleName());
+        nodes().addNodes(largeNodes, Agent.system);
+        nodes().setReady(largeNodes, Agent.system, getClass().getSimpleName());
         ApplicationId app4 = ApplicationId.from(TenantName.from("tenant4"), ApplicationName.from("application4"), InstanceName.from("instance4"));
         ClusterSpec cluster4 = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("id4")).vespaVersion("6.42").build();
         activate(provisioner.prepare(app4, cluster4, Capacity.from(new ClusterResources(2, 1, new NodeResources(10, 48, 500, 1)), false, true), null), app4, provisioner);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
index e18f5adb257..ce30baa3862 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/ServiceMonitorStub.java
@@ -70,7 +70,7 @@ public class ServiceMonitorStub implements ServiceMonitor {
         Map<ApplicationInstanceReference, ApplicationInstance> status = new HashMap<>();
         for (Map.Entry<ApplicationId, MockDeployer.ApplicationContext> app : apps.entrySet()) {
             Set<ServiceInstance> serviceInstances = new HashSet<>();
-            for (Node node : nodeRepository.getNodes(app.getValue().id(), Node.State.active)) {
+            for (Node node : nodeRepository.nodes().getNodes(app.getValue().id(), Node.State.active)) {
                 serviceInstances.add(new ServiceInstance(new ConfigId("configid"),
                                                          new HostName(node.hostname()),
                                                          getHostStatus(node.hostname())));