diff options
author | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-10-27 14:21:56 +0200 |
---|---|---|
committer | Valerij Fredriksen <valerijf@yahooinc.com> | 2022-10-27 14:51:04 +0200 |
commit | 85186ac2df4521f9376fd637faf3d6e92ab4fea9 (patch) | |
tree | 1a6bf72cfdeb4b6c47df22d37879fb5833fb0f8d /node-repository | |
parent | 30a93b4910c1de1a10ebccfa545d03e1deac8056 (diff) |
Never remove allocation from nodes
Diffstat (limited to 'node-repository')
10 files changed, 34 insertions, 45 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 6548e8ae16b..768036fd284 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -402,15 +402,6 @@ public final class Node implements Nodelike { exclusiveToClusterType, switchHostname, trustStoreItems, cloudAccount); } - /** Returns a new Node without an allocation. */ - public Node withoutAllocation() { - return new Node(id, ipConfig, hostname, parentHostname, flavor, status, state, - Optional.empty(), history, type, reports, modelName, reservedTo, - exclusiveToApplicationId, exclusiveToClusterType, switchHostname, trustStoreItems, - cloudAccount); - } - - /** Returns a copy of this node with IP config set to the given value. */ public Node with(IP.Config ipConfig) { return new Node(id, ipConfig, hostname, parentHostname, flavor, status, state, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java index c4703102f47..517458afd00 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirer.java @@ -23,18 +23,18 @@ import java.util.List; */ public class DirtyExpirer extends Expirer { - private final boolean keepAllocationOnExpiry; + private final boolean wantToDeprovisionOnExpiry; DirtyExpirer(NodeRepository nodeRepository, Duration dirtyTimeout, Metric metric) { super(Node.State.dirty, History.Event.Type.deallocated, nodeRepository, dirtyTimeout, metric); - // Do not keep allocation in dynamically provisioned zones so that the hosts can be deprovisioned - this.keepAllocationOnExpiry = ! nodeRepository.zone().cloud().dynamicProvisioning(); + // Deprovision hosts in dynamically provisioned on expiry + this.wantToDeprovisionOnExpiry = nodeRepository.zone().cloud().dynamicProvisioning(); } @Override protected void expire(List<Node> expired) { for (Node expiredNode : expired) - nodeRepository().nodes().fail(expiredNode.hostname(), keepAllocationOnExpiry, Agent.DirtyExpirer, "Node is stuck in dirty"); + nodeRepository().nodes().fail(expiredNode.hostname(), wantToDeprovisionOnExpiry, Agent.DirtyExpirer, "Node is stuck in dirty"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index ef2d5bb798d..2fb2f016c95 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -102,7 +102,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { .mapToList(Node::hostname); if (unparkedChildren.isEmpty()) { - nodeRepository.nodes().park(candidate.hostname(), false, Agent.FailedExpirer, + nodeRepository.nodes().park(candidate.hostname(), true, Agent.FailedExpirer, "Parked by FailedExpirer due to hardware issue or high fail count"); } else { log.info(String.format("Expired failed node %s with hardware issue was not parked because of " + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index 016ca8232a2..e9b07858019 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -320,7 +320,7 @@ public class Nodes { public Node deallocate(Node node, Agent agent, String reason, NestedTransaction transaction) { if (parkOnDeallocationOf(node, agent)) { - return park(node.hostname(), false, agent, reason, transaction); + return park(node.hostname(), true, agent, reason, transaction); } else { Node.State toState = Node.State.dirty; if (node.state() == Node.State.parked) { @@ -338,11 +338,11 @@ public class Nodes { * @throws NoSuchNodeException if the node is not found */ public Node fail(String hostname, Agent agent, String reason) { - return fail(hostname, true, agent, reason); + return fail(hostname, false, agent, reason); } - public Node fail(String hostname, boolean keepAllocation, Agent agent, String reason) { - return move(hostname, Node.State.failed, agent, keepAllocation, Optional.of(reason)); + public Node fail(String hostname, boolean wantToDeprovision, Agent agent, String reason) { + return move(hostname, Node.State.failed, agent, wantToDeprovision, Optional.of(reason)); } /** @@ -357,7 +357,7 @@ public class Nodes { List<Node> changed = performOn(children, (node, lock) -> failOrMark(node, agent, reason, lock)); if (children.state(Node.State.active).isEmpty()) - changed.add(move(hostname, Node.State.failed, agent, true, Optional.of(reason))); + changed.add(move(hostname, Node.State.failed, agent, false, Optional.of(reason))); else changed.addAll(performOn(NodeList.of(node(hostname).orElseThrow()), (node, lock) -> failOrMark(node, agent, reason, lock))); @@ -370,7 +370,7 @@ public class Nodes { write(node, lock); return node; } else { - return move(node.hostname(), Node.State.failed, agent, true, Optional.of(reason)); + return move(node.hostname(), Node.State.failed, agent, false, Optional.of(reason)); } } @@ -380,15 +380,15 @@ public class Nodes { * @return the node in its new state * @throws NoSuchNodeException if the node is not found */ - public Node park(String hostname, boolean keepAllocation, Agent agent, String reason) { + public Node park(String hostname, boolean wantToDeprovision, Agent agent, String reason) { NestedTransaction transaction = new NestedTransaction(); - Node parked = park(hostname, keepAllocation, agent, reason, transaction); + Node parked = park(hostname, wantToDeprovision, agent, reason, transaction); transaction.commit(); return parked; } - private Node park(String hostname, boolean keepAllocation, Agent agent, String reason, NestedTransaction transaction) { - return move(hostname, Node.State.parked, agent, keepAllocation, Optional.of(reason), transaction); + private Node park(String hostname, boolean wantToDeprovision, Agent agent, String reason, NestedTransaction transaction) { + return move(hostname, Node.State.parked, agent, wantToDeprovision, Optional.of(reason), transaction); } /** @@ -407,7 +407,7 @@ public class Nodes { * @throws NoSuchNodeException if the node is not found */ public Node reactivate(String hostname, Agent agent, String reason) { - return move(hostname, Node.State.active, agent, true, Optional.of(reason)); + return move(hostname, Node.State.active, agent, false, Optional.of(reason)); } /** @@ -419,7 +419,7 @@ public class Nodes { requireBreakfixable(node); NestedTransaction transaction = new NestedTransaction(); List<Node> removed = removeChildren(node, false, transaction); - removed.add(move(node.hostname(), Node.State.breakfixed, agent, true, Optional.of(reason), transaction)); + removed.add(move(node.hostname(), Node.State.breakfixed, agent, false, Optional.of(reason), transaction)); transaction.commit(); return removed; } @@ -428,39 +428,37 @@ public class Nodes { private List<Node> moveRecursively(String hostname, Node.State toState, Agent agent, Optional<String> reason) { NestedTransaction transaction = new NestedTransaction(); List<Node> moved = list().childrenOf(hostname).asList().stream() - .map(child -> move(child.hostname(), toState, agent, true, reason, transaction)) + .map(child -> move(child.hostname(), toState, agent, false, reason, transaction)) .collect(Collectors.toList()); - moved.add(move(hostname, toState, agent, true, reason, transaction)); + moved.add(move(hostname, toState, agent, false, reason, transaction)); transaction.commit(); return moved; } /** Move a node to given state */ - private Node move(String hostname, Node.State toState, Agent agent, boolean keepAllocation, Optional<String> reason) { + private Node move(String hostname, Node.State toState, Agent agent, boolean wantToDeprovision, Optional<String> reason) { NestedTransaction transaction = new NestedTransaction(); - Node moved = move(hostname, toState, agent, keepAllocation, reason, transaction); + Node moved = move(hostname, toState, agent, wantToDeprovision, reason, transaction); transaction.commit(); return moved; } /** Move a node to given state as part of a transaction */ - private Node move(String hostname, Node.State toState, Agent agent, boolean keepAllocation, Optional<String> reason, NestedTransaction transaction) { + private Node move(String hostname, Node.State toState, Agent agent, boolean wantToDeprovision, Optional<String> reason, NestedTransaction transaction) { // TODO: Work out a safe lock acquisition strategy for moves. Lock is only held while adding operations to // transaction, but lock must also be held while committing try (NodeMutex lock = lockAndGetRequired(hostname)) { Node node = lock.node(); if (toState == Node.State.active) { if (node.allocation().isEmpty()) illegal("Could not set " + node + " active: It has no allocation"); - if (!keepAllocation) illegal("Could not set " + node + " active: Requested to discard allocation"); for (Node currentActive : list(Node.State.active).owner(node.allocation().get().owner())) { if (node.allocation().get().membership().cluster().equals(currentActive.allocation().get().membership().cluster()) && node.allocation().get().membership().index() == currentActive.allocation().get().membership().index()) illegal("Could not set " + node + " active: Same cluster and index as " + currentActive); } } - if (!keepAllocation && node.allocation().isPresent()) { - node = node.withoutAllocation(); - } + if (wantToDeprovision) + node = node.withWantToRetire(wantToDeprovision, wantToDeprovision, agent, clock.instant()); if (toState == Node.State.deprovisioned) { node = node.with(IP.Config.EMPTY); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index 078df5264a1..b338527b0fd 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -40,7 +40,7 @@ public class NodeRepositoryTest { assertEquals(4, tester.nodeRepository().nodes().list().size()); for (var hostname : List.of("host2", "cfghost1")) { - tester.nodeRepository().nodes().park(hostname, true, Agent.system, "Parking to unit test"); + tester.nodeRepository().nodes().park(hostname, false, Agent.system, "Parking to unit test"); tester.nodeRepository().nodes().removeRecursively(hostname); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java index 275708d9ae0..ac20b9164f8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DirtyExpirerTest.java @@ -62,7 +62,7 @@ public class DirtyExpirerTest { tester.clock().advance(expiryTimeout.plusSeconds(1)); expirer.run(); assertEquals(Node.State.failed, tester.nodeRepository().nodes().list().first().get().state()); - assertEquals(dynamicProvisioning, tester.nodeRepository().nodes().list().first().get().allocation().isEmpty()); + assertEquals(dynamicProvisioning, tester.nodeRepository().nodes().list().first().get().status().wantToDeprovision()); } }
\ No newline at end of file diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index bced4daed34..de4df1992ee 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -594,7 +594,7 @@ public class DynamicProvisioningMaintainerTest { assertEquals(Node.State.failed, tester.nodeRepository.nodes().node(host43.hostname()).get().state()); // Last child is parked - tester.nodeRepository.nodes().park(host42.hostname(), true, Agent.system, getClass().getSimpleName()); + tester.nodeRepository.nodes().park(host42.hostname(), false, Agent.system, getClass().getSimpleName()); // Host and children can now be removed tester.maintainer.maintain(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java index 2b808917597..2df38a38d49 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainerTest.java @@ -66,7 +66,7 @@ public class PeriodicApplicationMaintainerTest { // Fail and park some nodes nodeRepository.nodes().fail(nodeRepository.nodes().list().owner(fixture.app1).asList().get(3).hostname(), Agent.system, "Failing to unit test"); nodeRepository.nodes().fail(nodeRepository.nodes().list().owner(fixture.app2).asList().get(0).hostname(), Agent.system, "Failing to unit test"); - nodeRepository.nodes().park(nodeRepository.nodes().list().owner(fixture.app2).asList().get(4).hostname(), true, Agent.system, "Parking to unit test"); + nodeRepository.nodes().park(nodeRepository.nodes().list().owner(fixture.app2).asList().get(4).hostname(), false, Agent.system, "Parking to unit test"); int failedInApp1 = 1; int failedOrParkedInApp2 = 2; assertEquals(fixture.wantedNodesApp1 - failedInApp1, nodeRepository.nodes().list(Node.State.active).owner(fixture.app1).size()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java index 8f34b0ae259..259277925f4 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java @@ -592,7 +592,7 @@ public class OsVersionsTest { Optional<Version> wantedOsVersion = node.status().osVersion().wanted(); if (node.status().wantToDeprovision()) { ApplicationId application = node.allocation().get().owner(); - tester.nodeRepository().nodes().park(node.hostname(), false, Agent.system, + tester.nodeRepository().nodes().park(node.hostname(), true, Agent.system, getClass().getSimpleName()); tester.nodeRepository().nodes().removeRecursively(node.hostname()); node = provisionInfraApplication(1, application, nodeType).get(0); @@ -607,7 +607,7 @@ public class OsVersionsTest { Optional<Version> wantedOsVersion = node.status().osVersion().wanted(); if (node.status().wantToRebuild()) { ApplicationId application = node.allocation().get().owner(); - tester.nodeRepository().nodes().park(node.hostname(), false, Agent.system, + tester.nodeRepository().nodes().park(node.hostname(), true, Agent.system, getClass().getSimpleName()); tester.nodeRepository().nodes().removeRecursively(node.hostname()); Node newNode = Node.create(node.id(), node.ipConfig(), node.hostname(), node.flavor(), node.type()) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java index c1c4b1e96ea..49a820f3291 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/ProvisioningTest.java @@ -165,10 +165,10 @@ public class ProvisioningTest { Set<Integer> state1Indexes = state1.allHosts.stream().map(hostSpec -> hostSpec.membership().get().index()).collect(Collectors.toSet()); // deallocate 2 nodes with index 0 - NodeMutex node1 = tester.nodeRepository().nodes().lockAndGet(tester.removeOne(state1.container0).hostname()).get(); - NodeMutex node2 = tester.nodeRepository().nodes().lockAndGet(tester.removeOne(state1.content0).hostname()).get(); - tester.nodeRepository().nodes().write(node1.node().withoutAllocation(), node1); - tester.nodeRepository().nodes().write(node2.node().withoutAllocation(), node1); + Node node1 = tester.nodeRepository().nodes().node(tester.removeOne(state1.container0).hostname()).get(); + Node node2 = tester.nodeRepository().nodes().node(tester.removeOne(state1.content0).hostname()).get(); + tester.nodeRepository().nodes().removeRecursively(node1, true); + tester.nodeRepository().nodes().removeRecursively(node2, true); // redeploy to get new nodes SystemState state2 = prepare(app1, 2, 2, 3, 3, defaultResources, tester); |