| author | Valerij Fredriksen <freva@users.noreply.github.com> | 2022-10-28 10:40:52 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-10-28 10:40:52 +0200 |
| commit | d66f9b3bcd71c8fecd0afd6583c699d7d2bcd8bb (patch) | |
| tree | 7fecbee597be570911b52af4be9ccf2ba50dc7c1 | |
| parent | b7f2bb194faa7d52042c3acbd0dc0bbc64908c82 (diff) | |
| parent | b18d1072413ffbcf96a7f4a33ed40015c038459c (diff) | |
Merge pull request #24628 from vespa-engine/freva/ensure-nodes-allocated
Node-Repo: Always lock by application ID
29 files changed, 117 insertions, 99 deletions
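
The theme of the change: every write to application state must happen under a per-application lock, and that lock now lives on `Applications` instead of `Nodes`. The sketch below is a minimal, self-contained model of the pattern, not Vespa's real `Applications`/`Mutex` API; the class, the string IDs, and `main` are all illustrative stand-ins.

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

// Hypothetical stand-in for Vespa's curator-backed application locks: one mutex
// per application ID, acquired with try-with-resources so it is always released.
final class ApplicationLocks {
    private final ConcurrentHashMap<String, ReentrantLock> locks = new ConcurrentHashMap<>();

    /** Blocks until the per-application lock is held; close() releases it. */
    AutoCloseable lock(String applicationId) {
        ReentrantLock lock = locks.computeIfAbsent(applicationId, id -> new ReentrantLock());
        lock.lock();
        return lock::unlock;
    }

    public static void main(String[] args) throws Exception {
        ApplicationLocks applications = new ApplicationLocks();
        try (AutoCloseable lock = applications.lock("tenant.app.instance")) {
            // All reads and writes of this application's state happen here, mirroring
            // try (Mutex lock = nodeRepository.applications().lock(id)) { ... } in the diff.
            System.out.println("holding lock for tenant.app.instance");
        }
    }
}
```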
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 6548e8ae16b..768036fd284 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -402,15 +402,6 @@ public final class Node implements Nodelike {
                         exclusiveToClusterType, switchHostname, trustStoreItems, cloudAccount);
     }
 
-    /** Returns a new Node without an allocation. */
-    public Node withoutAllocation() {
-        return new Node(id, ipConfig, hostname, parentHostname, flavor, status, state,
-                        Optional.empty(), history, type, reports, modelName, reservedTo,
-                        exclusiveToApplicationId, exclusiveToClusterType, switchHostname, trustStoreItems,
-                        cloudAccount);
-    }
-
-
     /** Returns a copy of this node with IP config set to the given value. */
     public Node with(IP.Config ipConfig) {
         return new Node(id, ipConfig, hostname, parentHostname, flavor, status, state,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
index ff156751f5e..6fa02d4c1c3 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java
@@ -118,7 +118,8 @@ public class NodeRepository extends AbstractComponent {
         this.db = new CuratorDatabaseClient(flavors, curator, clock, useCuratorClientCache, nodeCacheSize);
         this.zone = zone;
         this.clock = clock;
-        this.nodes = new Nodes(db, zone, clock, orchestrator);
+        this.applications = new Applications(db);
+        this.nodes = new Nodes(db, zone, clock, orchestrator, applications);
         this.flavors = flavors;
         this.resourcesCalculator = provisionServiceProvider.getHostResourcesCalculator();
         this.nameResolver = nameResolver;
@@ -128,7 +129,6 @@ public class NodeRepository extends AbstractComponent {
         this.containerImages = new ContainerImages(containerImage, tenantContainerImage);
         this.archiveUris = new ArchiveUris(db);
         this.jobControl = new JobControl(new JobControlFlags(db, flagSource));
-        this.applications = new Applications(db);
         this.loadBalancers = new LoadBalancers(db);
         this.metricsDb = metricsDb;
         this.orchestrator = orchestrator;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Applications.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Applications.java
index 2770a382383..2cf6d290dc8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Applications.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Applications.java
@@ -8,6 +8,7 @@ import com.yahoo.transaction.Mutex;
 import com.yahoo.transaction.NestedTransaction;
 import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
 
+import java.time.Duration;
 import java.util.List;
 import java.util.Optional;
 
@@ -62,4 +63,14 @@ public class Applications {
         db.deleteApplication(transaction);
     }
 
+    /** Create a lock which provides exclusive rights to making changes to the given application */
+    public Mutex lock(ApplicationId application) {
+        return db.lock(application);
+    }
+
+    /** Create a lock with a timeout which provides exclusive rights to making changes to the given application */
+    public Mutex lock(ApplicationId application, Duration timeout) {
+        return db.lock(application, timeout);
+    }
+
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 466582b88e6..97691c84be8 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -73,7 +73,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
         // Lock and write if there are state updates and/or we should autoscale now
         if (advice.isPresent() && !cluster.get().targetResources().equals(advice.target()) ||
             (updatedCluster != cluster.get() || !advice.reason().equals(cluster.get().autoscalingStatus()))) {
-            try (var lock = nodeRepository().nodes().lock(applicationId)) {
+            try (var lock = nodeRepository().applications().lock(applicationId)) {
                 application = nodeRepository().applications().get(applicationId);
                 if (application.isEmpty()) return;
                 cluster = application.get().cluster(clusterId);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
index c4703102f47..517458afd00 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java
@@ -23,18 +23,18 @@ import java.util.List;
  */
 public class DirtyExpirer extends Expirer {
 
-    private final boolean keepAllocationOnExpiry;
+    private final boolean wantToDeprovisionOnExpiry;
 
     DirtyExpirer(NodeRepository nodeRepository, Duration dirtyTimeout, Metric metric) {
         super(Node.State.dirty, History.Event.Type.deallocated, nodeRepository, dirtyTimeout, metric);
-        // Do not keep allocation in dynamically provisioned zones so that the hosts can be deprovisioned
-        this.keepAllocationOnExpiry = ! nodeRepository.zone().cloud().dynamicProvisioning();
+        // Deprovision hosts in dynamically provisioned zones on expiry
+        this.wantToDeprovisionOnExpiry = nodeRepository.zone().cloud().dynamicProvisioning();
     }
 
     @Override
     protected void expire(List<Node> expired) {
         for (Node expiredNode : expired)
-            nodeRepository().nodes().fail(expiredNode.hostname(), keepAllocationOnExpiry, Agent.DirtyExpirer, "Node is stuck in dirty");
+            nodeRepository().nodes().fail(expiredNode.hostname(), wantToDeprovisionOnExpiry, Agent.DirtyExpirer, "Node is stuck in dirty");
     }
 
 }
```
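
The new `Applications.lock(application, timeout)` overload above supports the fail-fast acquisition that several maintainers rely on. A sketch of that contract, assuming behavior like Vespa's `ApplicationLockException` (throw rather than block indefinitely); `TimedLock` and the exception choice are illustrative:

```java
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

// Timed variant of the per-application lock: if the lock cannot be taken within
// the timeout, throw so the caller (e.g. a maintainer) can skip this round.
final class TimedLock {
    static AutoCloseable lock(ReentrantLock lock, Duration timeout) throws InterruptedException {
        if (!lock.tryLock(timeout.toMillis(), TimeUnit.MILLISECONDS))
            throw new IllegalStateException("Could not acquire lock within " + timeout);
        return lock::unlock; // released by try-with-resources
    }
}
```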
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index 000809ef5af..d6cfeab0cd7 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -132,7 +132,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
         }
 
         excessHosts.forEach(host -> {
-            Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Optional.of(Duration.ofSeconds(10)));
+            Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10));
             if (optionalMutex.isEmpty()) return;
             try (NodeMutex mutex = optionalMutex.get()) {
                 if (host.state() != mutex.node().state()) return;
@@ -156,7 +156,8 @@
     private void replaceRootDisk(NodeList nodes) {
         NodeList softRebuildingHosts = nodes.rebuilding(true);
         for (var host : softRebuildingHosts) {
-            Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Optional.of(Duration.ofSeconds(10)));
+            Optional<NodeMutex> optionalMutex = nodeRepository().nodes().lockAndGet(host, Duration.ofSeconds(10));
+            if (optionalMutex.isEmpty()) return;
             try (NodeMutex mutex = optionalMutex.get()) {
                 // Re-check flag while holding lock
                 host = mutex.node();
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
index ef2d5bb798d..2fb2f016c95 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java
@@ -102,7 +102,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer {
                                                  .mapToList(Node::hostname);
 
             if (unparkedChildren.isEmpty()) {
-                nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer,
+                nodeRepository.nodes().park(candidate.hostname(), true, Agent.FailedExpirer,
                                             "Parked by FailedExpirer due to hardware issue or high fail count");
             } else {
                 log.info(String.format("Expired failed node %s with hardware issue was not parked because of " +
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
index 36ca58e6ccd..5314d36a3c2 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/LoadBalancerExpirer.java
@@ -146,7 +146,11 @@ public class LoadBalancerExpirer extends NodeRepositoryMaintainer {
     }
 
     private List<Node> allocatedNodes(LoadBalancerId loadBalancer) {
-        return nodeRepository().nodes().list().owner(loadBalancer.application()).cluster(loadBalancer.cluster()).asList();
+        return nodeRepository().nodes()
+                               .list(Node.State.active, Node.State.inactive, Node.State.reserved)
+                               .owner(loadBalancer.application())
+                               .cluster(loadBalancer.cluster())
+                               .asList();
     }
 
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
index b2c3859e33a..a94adc5aaae 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MaintenanceDeployment.java
@@ -107,7 +107,7 @@ class MaintenanceDeployment implements Closeable {
         Duration timeout = Duration.ofSeconds(3);
         try {
             // Use a short lock to avoid interfering with change deployments
-            return Optional.of(nodeRepository.nodes().lock(application, timeout));
+            return Optional.of(nodeRepository.applications().lock(application, timeout));
         } catch (ApplicationLockException e) {
             log.log(Level.INFO, () -> "Could not lock " + application + " for maintenance deployment within " + timeout);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
index 624492a14f3..b602d2ac7cd 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
@@ -63,7 +63,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer {
 
             // Lock and update status
             ApplicationId owner = node.get().allocation().get().owner();
-            try (var lock = nodeRepository().nodes().lock(owner)) {
+            try (var lock = nodeRepository().applications().lock(owner)) {
                 node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
                 if (node.isEmpty()) return; // Node disappeared or changed allocation
                 attempts.add(1);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
index 9cb5f887e48..531201f9484 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainer.java
@@ -64,7 +64,7 @@ public class ScalingSuggestionsMaintainer extends NodeRepositoryMaintainer {
         var suggestion = autoscaler.suggest(application, cluster.get(), clusterNodes);
         if (suggestion.isEmpty()) return true;
         // Wait only a short time for the lock to avoid interfering with change deployments
-        try (Mutex lock = nodeRepository().nodes().lock(applicationId, Duration.ofSeconds(1))) {
+        try (Mutex lock = nodeRepository().applications().lock(applicationId, Duration.ofSeconds(1))) {
            // empty suggested resources == keep the current allocation, so we record that
            var suggestedResources = suggestion.target().orElse(clusterNodes.not().retired().toResources());
            applications().get(applicationId).ifPresent(a -> updateSuggestion(suggestedResources, clusterId, a, lock));
```
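
The maintainers above share one discipline, visible in NodeHealthTracker's "Re-get inside lock" comment: inspect state without the lock, and only if a write looks necessary, take the application lock and re-read before mutating. A self-contained sketch of that pattern; the map-backed store and status strings are stand-ins for the node repository:

```java
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

// Read-then-relock-then-reread: the cheap check runs unlocked, the decisive
// check and the write run under the lock, because state may change in between.
final class RereadUnderLock {
    private final Map<String, String> store = new ConcurrentHashMap<>();
    private final ReentrantLock applicationLock = new ReentrantLock();

    void retireIfStillActive(String hostname) {
        if (!"active".equals(store.get(hostname))) return; // cheap unlocked check
        applicationLock.lock();
        try {
            // Re-read inside the lock: the node may have disappeared or changed
            // allocation between the unlocked check and lock acquisition.
            Optional<String> fresh = Optional.ofNullable(store.get(hostname));
            if (fresh.isEmpty() || !"active".equals(fresh.get())) return;
            store.put(hostname, "retired");
        } finally {
            applicationLock.unlock();
        }
    }
}
```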
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index 016ca8232a2..a271dfe571e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -11,11 +11,13 @@ import com.yahoo.config.provision.Zone;
 import com.yahoo.transaction.Mutex;
 import com.yahoo.transaction.NestedTransaction;
 import com.yahoo.vespa.applicationmodel.HostName;
+import com.yahoo.vespa.applicationmodel.InfrastructureApplication;
 import com.yahoo.vespa.hosted.provision.LockedNodeList;
 import com.yahoo.vespa.hosted.provision.NoSuchNodeException;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeMutex;
+import com.yahoo.vespa.hosted.provision.applications.Applications;
 import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer;
 import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter;
 import com.yahoo.vespa.hosted.provision.persistence.CuratorDatabaseClient;
@@ -60,12 +62,14 @@ public class Nodes {
     private final Zone zone;
     private final Clock clock;
     private final Orchestrator orchestrator;
+    private final Applications applications;
 
-    public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator) {
+    public Nodes(CuratorDatabaseClient db, Zone zone, Clock clock, Orchestrator orchestrator, Applications applications) {
         this.zone = zone;
         this.clock = clock;
         this.db = db;
         this.orchestrator = orchestrator;
+        this.applications = applications;
     }
 
     /** Read and write all nodes to make sure they are stored in the latest version of the serialized format */
@@ -228,7 +232,7 @@ public class Nodes {
      * @param reusable move the node directly to {@link Node.State#dirty} after removal
      */
     public void setRemovable(ApplicationId application, List<Node> nodes, boolean reusable) {
-        try (Mutex lock = lock(application)) {
+        try (Mutex lock = applications.lock(application)) {
             List<Node> removableNodes = nodes.stream()
                                              .map(node -> node.with(node.allocation().get().removable(true, reusable)))
                                              .toList();
@@ -320,7 +324,7 @@ public class Nodes {
 
     public Node deallocate(Node node, Agent agent, String reason, NestedTransaction transaction) {
         if (parkOnDeallocationOf(node, agent)) {
-            return park(node.hostname(), false, agent, reason, transaction);
+            return park(node.hostname(), true, agent, reason, transaction);
         } else {
             Node.State toState = Node.State.dirty;
             if (node.state() == Node.State.parked) {
@@ -338,11 +342,11 @@ public class Nodes {
      * @throws NoSuchNodeException if the node is not found
      */
     public Node fail(String hostname, Agent agent, String reason) {
-        return fail(hostname, true, agent, reason);
+        return fail(hostname, false, agent, reason);
     }
 
-    public Node fail(String hostname, boolean keepAllocation, Agent agent, String reason) {
-        return move(hostname, Node.State.failed, agent, keepAllocation, Optional.of(reason));
+    public Node fail(String hostname, boolean wantToDeprovision, Agent agent, String reason) {
+        return move(hostname, Node.State.failed, agent, wantToDeprovision, Optional.of(reason));
     }
 
     /**
@@ -357,7 +361,7 @@ public class Nodes {
         List<Node> changed = performOn(children, (node, lock) -> failOrMark(node, agent, reason, lock));
 
         if (children.state(Node.State.active).isEmpty())
-            changed.add(move(hostname, Node.State.failed, agent, true, Optional.of(reason)));
+            changed.add(move(hostname, Node.State.failed, agent, false, Optional.of(reason)));
         else
             changed.addAll(performOn(NodeList.of(node(hostname).orElseThrow()), (node, lock) -> failOrMark(node, agent, reason, lock)));
 
@@ -370,7 +374,7 @@ public class Nodes {
             write(node, lock);
             return node;
         } else {
-            return move(node.hostname(), Node.State.failed, agent, true, Optional.of(reason));
+            return move(node.hostname(), Node.State.failed, agent, false, Optional.of(reason));
         }
     }
 
@@ -380,15 +384,15 @@ public class Nodes {
      * @return the node in its new state
      * @throws NoSuchNodeException if the node is not found
     */
-    public Node park(String hostname, boolean keepAllocation, Agent agent, String reason) {
+    public Node park(String hostname, boolean wantToDeprovision, Agent agent, String reason) {
         NestedTransaction transaction = new NestedTransaction();
-        Node parked = park(hostname, keepAllocation, agent, reason, transaction);
+        Node parked = park(hostname, wantToDeprovision, agent, reason, transaction);
         transaction.commit();
         return parked;
     }
 
-    private Node park(String hostname, boolean keepAllocation, Agent agent, String reason, NestedTransaction transaction) {
-        return move(hostname, Node.State.parked, agent, keepAllocation, Optional.of(reason), transaction);
+    private Node park(String hostname, boolean wantToDeprovision, Agent agent, String reason, NestedTransaction transaction) {
+        return move(hostname, Node.State.parked, agent, wantToDeprovision, Optional.of(reason), transaction);
     }
 
     /**
@@ -407,7 +411,7 @@ public class Nodes {
      * @throws NoSuchNodeException if the node is not found
     */
     public Node reactivate(String hostname, Agent agent, String reason) {
-        return move(hostname, Node.State.active, agent, true, Optional.of(reason));
+        return move(hostname, Node.State.active, agent, false, Optional.of(reason));
     }
 
     /**
@@ -419,7 +423,7 @@ public class Nodes {
         requireBreakfixable(node);
         NestedTransaction transaction = new NestedTransaction();
         List<Node> removed = removeChildren(node, false, transaction);
-        removed.add(move(node.hostname(), Node.State.breakfixed, agent, true, Optional.of(reason), transaction));
+        removed.add(move(node.hostname(), Node.State.breakfixed, agent, false, Optional.of(reason), transaction));
         transaction.commit();
         return removed;
     }
@@ -428,39 +432,37 @@ public class Nodes {
     private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) {
         NestedTransaction transaction = new NestedTransaction();
         List<Node> moved = list().childrenOf(hostname).asList().stream()
-                                 .map(child -> move(child.hostname(), toState, agent, true, reason, transaction))
+                                 .map(child -> move(child.hostname(), toState, agent, false, reason, transaction))
                                  .collect(Collectors.toList());
-        moved.add(move(hostname, toState, agent, true, reason, transaction));
+        moved.add(move(hostname, toState, agent, false, reason, transaction));
         transaction.commit();
         return moved;
     }
 
     /** Move a node to given state */
-    private Node move(String hostname, Node.State toState, Agent agent, boolean keepAllocation, Optional<String> reason) {
+    private Node move(String hostname, Node.State toState, Agent agent, boolean wantToDeprovision, Optional<String> reason) {
         NestedTransaction transaction = new NestedTransaction();
-        Node moved = move(hostname, toState, agent, keepAllocation, reason, transaction);
+        Node moved = move(hostname, toState, agent, wantToDeprovision, reason, transaction);
         transaction.commit();
         return moved;
     }
 
     /** Move a node to given state as part of a transaction */
-    private Node move(String hostname, Node.State toState, Agent agent, boolean keepAllocation, Optional<String> reason, NestedTransaction transaction) {
+    private Node move(String hostname, Node.State toState, Agent agent, boolean wantToDeprovision, Optional<String> reason, NestedTransaction transaction) {
         // TODO: Work out a safe lock acquisition strategy for moves. Lock is only held while adding operations to
         // transaction, but lock must also be held while committing
         try (NodeMutex lock = lockAndGetRequired(hostname)) {
             Node node = lock.node();
             if (toState == Node.State.active) {
                 if (node.allocation().isEmpty()) illegal("Could not set " + node + " active: It has no allocation");
-                if (!keepAllocation) illegal("Could not set " + node + " active: Requested to discard allocation");
                 for (Node currentActive : list(Node.State.active).owner(node.allocation().get().owner())) {
                     if (node.allocation().get().membership().cluster().equals(currentActive.allocation().get().membership().cluster())
                         && node.allocation().get().membership().index() == currentActive.allocation().get().membership().index())
                         illegal("Could not set " + node + " active: Same cluster and index as " + currentActive);
                 }
             }
-            if (!keepAllocation && node.allocation().isPresent()) {
-                node = node.withoutAllocation();
-            }
+            if (wantToDeprovision)
+                node = node.withWantToRetire(wantToDeprovision, wantToDeprovision, agent, clock.instant());
             if (toState == Node.State.deprovisioned) {
                 node = node.with(IP.Config.EMPTY);
             }
@@ -708,8 +710,9 @@ public class Nodes {
 
         // Group matching nodes by the lock needed
         for (Node node : nodes) {
-            if (node.allocation().isPresent())
-                allocatedNodes.put(node.allocation().get().owner(), node);
+            Optional<ApplicationId> applicationId = applicationIdForLock(node);
+            if (applicationId.isPresent())
+                allocatedNodes.put(applicationId.get(), node);
             else
                 unallocatedNodes.add(node);
         }
@@ -724,7 +727,7 @@ public class Nodes {
             }
         }
         for (Map.Entry<ApplicationId, List<Node>> applicationNodes : allocatedNodes.entrySet()) {
-            try (Mutex lock = lock(applicationNodes.getKey())) {
+            try (Mutex lock = applications.lock(applicationNodes.getKey())) {
                 for (Node node : applicationNodes.getValue()) {
                     Optional<Node> currentNode = db.readNode(node.hostname());  // Re-read while holding lock
                     if (currentNode.isEmpty()) continue;
@@ -760,30 +763,16 @@ public class Nodes {
         }
     }
 
-    /** Create a lock which provides exclusive rights to making changes to the given application */
-    // TODO: Move to Applications
-    public Mutex lock(ApplicationId application) {
-        return db.lock(application);
-    }
-
-    /** Create a lock with a timeout which provides exclusive rights to making changes to the given application */
-    public Mutex lock(ApplicationId application, Duration timeout) {
-        return db.lock(application, timeout);
-    }
-
     /** Create a lock which provides exclusive rights to modifying unallocated nodes */
     public Mutex lockUnallocated() { return db.lockInactive(); }
 
     /** Returns the unallocated/application lock, and the node acquired under that lock. */
-    public Optional<NodeMutex> lockAndGet(Node node) { return lockAndGet(node, Optional.empty()); }
-
-    /** Returns the unallocated/application lock, and the node acquired under that lock. */
-    public Optional<NodeMutex> lockAndGet(Node node, Optional<Duration> timeout) {
+    private Optional<NodeMutex> lockAndGet(Node node, Optional<Duration> timeout) {
         Node staleNode = node;
         final int maxRetries = 4;
         for (int i = 0; i < maxRetries; ++i) {
-            Mutex lockToClose = timeout.isPresent() ? lock(staleNode, timeout.get()) : lock(staleNode);
+            Mutex lockToClose = lock(staleNode, timeout);
             try {
                 // As an optimization we first try finding the node in the same state
                 Optional<Node> freshNode = node(staleNode.hostname(), staleNode.state());
@@ -794,8 +783,9 @@ public class Nodes {
                 }
             }
 
-            if (Objects.equals(freshNode.get().allocation().map(Allocation::owner),
-                               staleNode.allocation().map(Allocation::owner))) {
+            if (node.type() != NodeType.tenant ||
+                Objects.equals(freshNode.get().allocation().map(Allocation::owner),
+                               staleNode.allocation().map(Allocation::owner))) {
                 NodeMutex nodeMutex = new NodeMutex(freshNode.get(), lockToClose);
                 lockToClose = null;
                 return Optional.of(nodeMutex);
@@ -823,6 +813,12 @@ public class Nodes {
     }
 
     /** Returns the unallocated/application lock, and the node acquired under that lock. */
+    public Optional<NodeMutex> lockAndGet(Node node) { return lockAndGet(node, Optional.empty()); }
+
+    /** Returns the unallocated/application lock, and the node acquired under that lock. */
+    public Optional<NodeMutex> lockAndGet(Node node, Duration timeout) { return lockAndGet(node, Optional.of(timeout)); }
+
+    /** Returns the unallocated/application lock, and the node acquired under that lock. */
     public NodeMutex lockAndGetRequired(Node node) {
         return lockAndGet(node).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + node.hostname() + "'"));
     }
@@ -837,19 +833,34 @@ public class Nodes {
         return lockAndGet(hostname, timeout).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + hostname + "'"));
     }
 
-    private Mutex lock(Node node) {
-        return node.allocation().isPresent() ? lock(node.allocation().get().owner()) : lockUnallocated();
-    }
-
-    private Mutex lock(Node node, Duration timeout) {
-        return node.allocation().isPresent() ? lock(node.allocation().get().owner(), timeout) : lockUnallocated();
+    private Mutex lock(Node node, Optional<Duration> timeout) {
+        Optional<ApplicationId> application = applicationIdForLock(node);
+        if (application.isPresent())
+            return timeout.map(t -> applications.lock(application.get(), t))
+                          .orElseGet(() -> applications.lock(application.get()));
+        else
+            return timeout.map(db::lockInactive).orElseGet(db::lockInactive);
     }
 
     private Node requireNode(String hostname) {
         return node(hostname).orElseThrow(() -> new NoSuchNodeException("No node with hostname '" + hostname + "'"));
     }
 
-    private void illegal(String message) {
+    /** Returns the application ID that should be used for locking when modifying this node */
+    private static Optional<ApplicationId> applicationIdForLock(Node node) {
+        return switch (node.type()) {
+            case tenant -> node.allocation().map(Allocation::owner);
+            case host -> Optional.of(InfrastructureApplication.TENANT_HOST.id());
+            case config -> Optional.of(InfrastructureApplication.CONFIG_SERVER.id());
+            case confighost -> Optional.of(InfrastructureApplication.CONFIG_SERVER_HOST.id());
+            case controller -> Optional.of(InfrastructureApplication.CONTROLLER.id());
+            case controllerhost -> Optional.of(InfrastructureApplication.CONTROLLER_HOST.id());
+            case proxy -> Optional.of(InfrastructureApplication.PROXY.id());
+            case proxyhost -> Optional.of(InfrastructureApplication.PROXY_HOST.id());
+        };
+    }
+
+    private static void illegal(String message) {
         throw new IllegalArgumentException(message);
     }
```
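
The new `applicationIdForLock` switch is the heart of the PR: tenant nodes lock on their allocation's owner, and every other node type locks on a fixed infrastructure application, so all node mutations serialize per application. A self-contained model of that mapping; the enum, record, and application-ID strings below are stand-ins, not Vespa's real `InfrastructureApplication` values:

```java
import java.util.Optional;

// Model of applicationIdForLock: every node type resolves to the application
// whose lock guards it. Only tenant nodes can be unallocated (empty Optional).
final class LockOwner {
    enum NodeType { tenant, host, config, confighost, controller, controllerhost, proxy, proxyhost }

    record Node(NodeType type, Optional<String> allocationOwner) { }

    static Optional<String> applicationIdForLock(Node node) {
        return switch (node.type()) {
            case tenant -> node.allocationOwner();                      // may be unallocated
            case host -> Optional.of("infra:tenant-host");              // illustrative IDs
            case config -> Optional.of("infra:config-server");
            case confighost -> Optional.of("infra:config-server-host");
            case controller -> Optional.of("infra:controller");
            case controllerhost -> Optional.of("infra:controller-host");
            case proxy -> Optional.of("infra:proxy");
            case proxyhost -> Optional.of("infra:proxy-host");
        };
    }
}
```

The exhaustive switch is the design point: adding a node type without deciding its lock owner becomes a compile error rather than a silent fall-through to the unallocated lock.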
```diff
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
index a8fb0a5e4c2..fbe61c24bf9 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/GroupPreparer.java
@@ -98,7 +98,7 @@ public class GroupPreparer {
     /// Note that this will write to the node repo.
     private List<Node> prepareWithLocks(ApplicationId application, ClusterSpec cluster, NodeSpec requestedNodes,
                                         List<Node> surplusActiveNodes, NodeIndices indices, int wantedGroups) {
-        try (Mutex lock = nodeRepository.nodes().lock(application);
+        try (Mutex lock = nodeRepository.applications().lock(application);
              Mutex allocationLock = nodeRepository.nodes().lockUnallocated()) {
             NodesAndHosts<LockedNodeList> allNodesAndHosts = NodesAndHosts.create(nodeRepository.nodes().list(allocationLock));
             NodeAllocation allocation = prepareAllocation(application, cluster, requestedNodes, surplusActiveNodes,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
index 53defef6fc7..605ef280c2e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/InfraDeployerImpl.java
@@ -89,7 +89,7 @@ public class InfraDeployerImpl implements InfraDeployer {
         public void prepare() {
             if (prepared) return;
 
-            try (Mutex lock = nodeRepository.nodes().lock(application.getApplicationId())) {
+            try (Mutex lock = nodeRepository.applications().lock(application.getApplicationId())) {
                 NodeType nodeType = application.getCapacity().type();
                 Version targetVersion = infrastructureVersions.getTargetVersionFor(nodeType);
                 hostSpecs = provisioner.prepare(application.getApplicationId(),
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index 59063f1ce85..ca483fe27b4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -139,7 +139,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
 
     @Override
     public ProvisionLock lock(ApplicationId application) {
-        return new ProvisionLock(application, nodeRepository.nodes().lock(application));
+        return new ProvisionLock(application, nodeRepository.applications().lock(application));
     }
 
     /**
@@ -147,7 +147,7 @@ public class NodeRepositoryProvisioner implements Provisioner {
      * and updates the application store with the received min and max.
     */
     private ClusterResources decideTargetResources(ApplicationId applicationId, ClusterSpec clusterSpec, Capacity requested) {
-        try (Mutex lock = nodeRepository.nodes().lock(applicationId)) {
+        try (Mutex lock = nodeRepository.applications().lock(applicationId)) {
             var application = nodeRepository.applications().get(applicationId).orElse(Application.empty(applicationId))
                                             .withCluster(clusterSpec.id(), clusterSpec.isExclusive(), requested);
             nodeRepository.applications().put(application, lock);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcher.java
index 7b5146955fb..eee1d364c03 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcher.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcher.java
@@ -35,7 +35,7 @@ public class ApplicationPatcher implements AutoCloseable {
             throw new UncheckedIOException("Error reading request body", e);
         }
         // Use same timeout for acquiring lock as client timeout for patch request
-        this.lock = nodeRepository.nodes().lock(applicationId, Duration.ofSeconds(30));
+        this.lock = nodeRepository.applications().lock(applicationId, Duration.ofSeconds(30));
         try {
             this.application = nodeRepository.applications().require(applicationId);
         }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
index 8ba00760905..26e1cd71a3d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java
@@ -179,7 +179,7 @@ public class MockNodeRepository extends NodeRepository {
                                                                       clock().instant())));
         cluster1 = cluster1.withTarget(Optional.of(new ClusterResources(4, 1,
                                                                         new NodeResources(3, 16, 100, 1))));
-        try (Mutex lock = nodes().lock(app1Id)) {
+        try (Mutex lock = applications().lock(app1Id)) {
             applications().put(app1.with(cluster1), lock);
         }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
index 078df5264a1..b338527b0fd 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
@@ -40,7 +40,7 @@ public class NodeRepositoryTest {
         assertEquals(4, tester.nodeRepository().nodes().list().size());
 
         for (var hostname : List.of("host2", "cfghost1")) {
-            tester.nodeRepository().nodes().park(hostname, true, Agent.system, "Parking to unit test");
+            tester.nodeRepository().nodes().park(hostname, false, Agent.system, "Parking to unit test");
             tester.nodeRepository().nodes().removeRecursively(hostname);
         }
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 3d363e8a96b..e75ef10e689 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -111,7 +111,7 @@ class AutoscalingTester {
     }
 
     public void deactivateRetired(ApplicationId application, ClusterSpec cluster, Capacity capacity) {
-        try (Mutex lock = nodeRepository().nodes().lock(application)) {
+        try (Mutex lock = nodeRepository().applications().lock(application)) {
             for (Node node : nodeRepository().nodes().list(Node.State.active).owner(application)) {
                 if (node.allocation().get().membership().retired())
                     nodeRepository().nodes().write(node.with(node.allocation().get().removable(true, true)), lock);
@@ -137,14 +137,14 @@ class AutoscalingTester {
                                                                        0,
                                                                        clock().instant().minus(Duration.ofDays(1).minus(duration))).withCompletion(clock().instant().minus(Duration.ofDays(1))));
         application = application.with(cluster);
-        nodeRepository().applications().put(application, nodeRepository().nodes().lock(applicationId));
+        nodeRepository().applications().put(application, nodeRepository().applications().lock(applicationId));
     }
 
     public Autoscaler.Advice autoscale(ApplicationId applicationId, ClusterSpec cluster, Capacity capacity) {
         capacity = capacityPolicies.applyOn(capacity, applicationId, capacityPolicies.decideExclusivity(capacity, cluster.isExclusive()));
         Application application = nodeRepository().applications().get(applicationId).orElse(Application.empty(applicationId))
                                                   .withCluster(cluster.id(), false, capacity);
-        try (Mutex lock = nodeRepository().nodes().lock(applicationId)) {
+        try (Mutex lock = nodeRepository().applications().lock(applicationId)) {
             nodeRepository().applications().put(application, lock);
         }
         return autoscaler.autoscale(application, application.clusters().get(cluster.id()),
@@ -155,7 +155,7 @@ class AutoscalingTester {
                                              ClusterResources min, ClusterResources max) {
         Application application = nodeRepository().applications().get(applicationId).orElse(Application.empty(applicationId))
                                                   .withCluster(clusterId, false, Capacity.from(min, max));
-        try (Mutex lock = nodeRepository().nodes().lock(applicationId)) {
+        try (Mutex lock = nodeRepository().applications().lock(applicationId)) {
             nodeRepository().applications().put(application, lock);
         }
         return autoscaler.suggest(application, application.clusters().get(clusterId),
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index 4c7f26f9396..35881c59c5e 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -113,7 +113,7 @@ public class Fixture {
         var application = application();
         application = application.with(application.status().withCurrentReadShare(currentReadShare)
                                                   .withMaxReadShare(maxReadShare));
-        tester.nodeRepository().applications().put(application, tester.nodeRepository().nodes().lock(applicationId));
+        tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId));
     }
 
     public static class Builder {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
index 41851f6888e..e83880404f4 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsV2MetricsFetcherTest.java
@@ -81,7 +81,7 @@ public class MetricsV2MetricsFetcherTest {
 
         {
             httpClient.cannedResponse = cannedResponseForApplication2;
-            try (Mutex lock = tester.nodeRepository().nodes().lock(application1)) {
+            try (Mutex lock = tester.nodeRepository().applications().lock(application1)) {
                 tester.nodeRepository().nodes().write(tester.nodeRepository().nodes().list(Node.State.active).owner(application2)
                                                             .first().get().retire(tester.clock().instant()), lock);
             }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java
index 275708d9ae0..ac20b9164f8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java
@@ -62,7 +62,7 @@ public class DirtyExpirerTest {
         tester.clock().advance(expiryTimeout.plusSeconds(1));
         expirer.run();
         assertEquals(Node.State.failed, tester.nodeRepository().nodes().list().first().get().state());
-        assertEquals(dynamicProvisioning, tester.nodeRepository().nodes().list().first().get().allocation().isEmpty());
+        assertEquals(dynamicProvisioning, tester.nodeRepository().nodes().list().first().get().status().wantToDeprovision());
     }
 
 }
\ No newline at end of file
```
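
The `ApplicationPatcher` change above also shows the third lock-holding shape in this PR: acquire the application lock in the constructor (with the same timeout the client uses), mutate under it, and release it in `close()`. A self-contained sketch, with `ReentrantLock` standing in for the curator-backed `Mutex` and all names illustrative:

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

// Lock held for the lifetime of the object rather than a single block:
// the constructor acquires, close() releases, try-with-resources guarantees it.
final class Patcher implements AutoCloseable {
    private final ReentrantLock lock;

    Patcher(ReentrantLock applicationLock, long timeoutSeconds) throws InterruptedException {
        if (!applicationLock.tryLock(timeoutSeconds, TimeUnit.SECONDS))
            throw new IllegalStateException("Could not lock application within " + timeoutSeconds + "s");
        this.lock = applicationLock;
    }

    /** Runs a mutation while the application lock is held. */
    void apply(Runnable mutation) {
        mutation.run();
    }

    @Override
    public void close() {
        lock.unlock();
    }
}
```

Callers use it as `try (var patcher = new Patcher(lock, 30)) { patcher.apply(...); }`, so every code path between constructor and close runs under the lock.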
```diff
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
index bced4daed34..de4df1992ee 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java
@@ -594,7 +594,7 @@ public class DynamicProvisioningMaintainerTest {
         assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(host43.hostname()).get().state());
 
         // Last child is parked
-        tester.nodeRepository.nodes().park(host42.hostname(), true, Agent.system, getClass().getSimpleName());
+        tester.nodeRepository.nodes().park(host42.hostname(), false, Agent.system, getClass().getSimpleName());
 
         // Host and children can now be removed
         tester.maintainer.maintain();
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
index 2b808917597..2df38a38d49 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java
@@ -66,7 +66,7 @@ public class PeriodicApplicationMaintainerTest {
         // Fail and park some nodes
         nodeRepository.nodes().fail(nodeRepository.nodes().list().owner(fixture.app1).asList().get(3).hostname(), Agent.system, "Failing to unit test");
         nodeRepository.nodes().fail(nodeRepository.nodes().list().owner(fixture.app2).asList().get(0).hostname(), Agent.system, "Failing to unit test");
-        nodeRepository.nodes().park(nodeRepository.nodes().list().owner(fixture.app2).asList().get(4).hostname(), true, Agent.system, "Parking to unit test");
+        nodeRepository.nodes().park(nodeRepository.nodes().list().owner(fixture.app2).asList().get(4).hostname(), false, Agent.system, "Parking to unit test");
         int failedInApp1 = 1;
         int failedOrParkedInApp2 = 2;
         assertEquals(fixture.wantedNodesApp1 - failedInApp1, nodeRepository.nodes().list(Node.State.active).owner(fixture.app1).size());
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
index 8f34b0ae259..259277925f4 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
@@ -592,7 +592,7 @@ public class OsVersionsTest {
             Optional<Version> wantedOsVersion = node.status().osVersion().wanted();
             if (node.status().wantToDeprovision()) {
                 ApplicationId application = node.allocation().get().owner();
-                tester.nodeRepository().nodes().park(node.hostname(), false, Agent.system,
+                tester.nodeRepository().nodes().park(node.hostname(), true, Agent.system,
                                                      getClass().getSimpleName());
                 tester.nodeRepository().nodes().removeRecursively(node.hostname());
                 node = provisionInfraApplication(1, application, nodeType).get(0);
@@ -607,7 +607,7 @@ public class OsVersionsTest {
             Optional<Version> wantedOsVersion = node.status().osVersion().wanted();
             if (node.status().wantToRebuild()) {
                 ApplicationId application = node.allocation().get().owner();
-                tester.nodeRepository().nodes().park(node.hostname(), false, Agent.system,
+                tester.nodeRepository().nodes().park(node.hostname(), true, Agent.system,
                                                      getClass().getSimpleName());
                 tester.nodeRepository().nodes().removeRecursively(node.hostname());
                 Node newNode = Node.create(node.id(), node.ipConfig(), node.hostname(), node.flavor(), node.type())
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
index c1c4b1e96ea..49a820f3291 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java
@@ -165,10 +165,10 @@ public class ProvisioningTest {
         Set<Integer> state1Indexes = state1.allHosts.stream().map(hostSpec -> hostSpec.membership().get().index()).collect(Collectors.toSet());
 
         // deallocate 2 nodes with index 0
-        NodeMutex node1 = tester.nodeRepository().nodes().lockAndGet(tester.removeOne(state1.container0).hostname()).get();
-        NodeMutex node2 = tester.nodeRepository().nodes().lockAndGet(tester.removeOne(state1.content0).hostname()).get();
-        tester.nodeRepository().nodes().write(node1.node().withoutAllocation(), node1);
-        tester.nodeRepository().nodes().write(node2.node().withoutAllocation(), node1);
+        Node node1 = tester.nodeRepository().nodes().node(tester.removeOne(state1.container0).hostname()).get();
+        Node node2 = tester.nodeRepository().nodes().node(tester.removeOne(state1.content0).hostname()).get();
+        tester.nodeRepository().nodes().removeRecursively(node1, true);
+        tester.nodeRepository().nodes().removeRecursively(node2, true);
 
         // redeploy to get new nodes
         SystemState state2 = prepare(app1, 2, 2, 3, 3, defaultResources, tester);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
index faf3f60e8af..a7c4e8ecd7a 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTester.java
@@ -250,7 +250,7 @@ public class ProvisioningTester {
     }
 
     public void deactivate(ApplicationId applicationId) {
-        try (var lock = nodeRepository.nodes().lock(applicationId)) {
+        try (var lock = nodeRepository.applications().lock(applicationId)) {
             NestedTransaction deactivateTransaction = new NestedTransaction();
             nodeRepository.remove(new ApplicationTransaction(new ProvisionLock(applicationId, lock),
                                                              deactivateTransaction));
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
index 7728e0ac9c8..aaef785704d 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/VirtualNodeProvisioningTest.java
@@ -599,7 +599,7 @@ public class VirtualNodeProvisioningTest {
         tester.activate(app1, cluster1, Capacity.from(new ClusterResources(5, 1, r)));
         tester.activate(app1, cluster1, Capacity.from(new ClusterResources(2, 1, r)));
 
-        var tx = new ApplicationTransaction(new ProvisionLock(app1, tester.nodeRepository().nodes().lock(app1)), new NestedTransaction());
+        var tx = new ApplicationTransaction(new ProvisionLock(app1, tester.nodeRepository().applications().lock(app1)), new NestedTransaction());
         tester.nodeRepository().nodes().deactivate(tester.nodeRepository().nodes().list(Node.State.active).owner(app1).retired().asList(), tx);
         tx.nested().commit();
 
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcherTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcherTest.java
index 244bb40b2bf..79722a747e2 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcherTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationPatcherTest.java
@@ -20,7 +20,7 @@ public class ApplicationPatcherTest {
     public void testPatching() {
         NodeRepositoryTester tester = new NodeRepositoryTester();
         Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
-        tester.nodeRepository().applications().put(application, tester.nodeRepository().nodes().lock(application.id()));
+        tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(application.id()));
         String patch = "{ \"currentReadShare\" :0.4, \"maxReadShare\": 1.0 }";
         ApplicationPatcher patcher = new ApplicationPatcher(new ByteArrayInputStream(patch.getBytes()), application.id(),
```