diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-08 15:44:39 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-08 15:44:39 +0200 |
commit | 2296580fa6544bc73b38f26b5731c7576bf9ae57 (patch) | |
tree | 268302cd83ce2fe61a0518cbbd9b688c56f53590 /node-repository | |
parent | e969caac12895478af9ef627c84abce90d2f21ef (diff) | |
parent | 50ba6295c808cf9cbe0e0a02daa96fb0ed16105f (diff) |
Merge with master
Diffstat (limited to 'node-repository')
18 files changed, 209 insertions, 63 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index b38473504ad..de72024cb77 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -210,9 +210,20 @@ public final class Node implements Nodelike { * If both given wantToRetire and wantToDeprovision are equal to the current values, the method is no-op. */ public Node withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, Agent agent, Instant at) { + return withWantToRetire(wantToRetire, wantToDeprovision, false, agent, at); + } + + /** + * Returns a copy of this node with wantToRetire, wantToDeprovision and wantToRebuild set to the given values + * and updated history. + * + * If all given values are equal to the current ones, the method is no-op. + */ + public Node withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild, Agent agent, Instant at) { if (wantToRetire == status.wantToRetire() && - wantToDeprovision == status.wantToDeprovision()) return this; - Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision)); + wantToDeprovision == status.wantToDeprovision() && + wantToRebuild == status.wantToRebuild()) return this; + Node node = this.with(status.withWantToRetire(wantToRetire, wantToDeprovision, wantToRebuild)); if (wantToRetire) node = node.with(history.with(new History.Event(History.Event.Type.wantToRetire, agent, at))); return node; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index e09ad4d6e4e..e3f8803a838 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -134,6 +134,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { return matching(node -> node.allocation().map(a -> a.owner().equals(application)).orElse(false)); } + /** Returns the subset of nodes allocated to a tester instance */ + public NodeList tester() { + return matching(node -> node.allocation().isPresent() && node.allocation().get().owner().instance().isTester()); + } + /** Returns the subset of nodes matching the given node type(s) */ public NodeList nodeType(NodeType first, NodeType... rest) { if (rest.length == 0) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java index ca18028ad5a..05a98c5b4da 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepoStats.java @@ -83,7 +83,7 @@ public class NodeRepoStats { Map<String, NodeMetricSnapshot> snapshotsByHost = byHost(allNodeTimeseries); for (var applicationNodes : allNodes.state(Node.State.active) .nodeType(NodeType.tenant) - .matching(node -> ! node.allocation().get().owner().instance().isTester()) + .not().tester() .groupingBy(node -> node.allocation().get().owner()).entrySet()) { NodeResources totalResources = NodeResources.zero(); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java index fb31d51abfc..e2316f96c16 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/FailedExpirer.java @@ -97,7 +97,7 @@ public class FailedExpirer extends NodeRepositoryMaintainer { List<String> unparkedChildren = !candidate.type().isHost() ? List.of() : nodeRepository.nodes().list() .childrenOf(candidate) - .matching(node -> node.state() != Node.State.parked) + .not().state(Node.State.parked) .mapToList(Node::hostname); if (unparkedChildren.isEmpty()) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java index 2950de285b9..fe2fb5229f9 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java @@ -78,7 +78,7 @@ public class NodeHealthTracker extends NodeRepositoryMaintainer { private void updateActiveNodeDownState() { NodeList activeNodes = nodeRepository().nodes().list(Node.State.active); serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> { - Optional<Node> node = activeNodes.matching(n -> n.hostname().equals(hostname.toString())).first(); + Optional<Node> node = activeNodes.node(hostname.toString()); if (node.isEmpty()) return; // Already correct record, nothing to do diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index 025c8be449c..0a1f6961f9f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -44,7 +44,7 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { return nodeRepository().nodes() .list(Node.State.active) .nodeType(NodeType.tenant) - .matching(node -> ! node.allocation().get().owner().instance().isTester()) + .not().tester() .groupingBy(node -> node.allocation().get().owner()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/IP.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/IP.java index a04e305242f..1b7c629416a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/IP.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/IP.java @@ -220,6 +220,18 @@ public class IP { ipv6 } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + IpAddresses that = (IpAddresses) o; + return ipAddresses.equals(that.ipAddresses) && protocol == that.protocol; + } + + @Override + public int hashCode() { + return Objects.hash(ipAddresses, protocol); + } } /** @@ -346,13 +358,13 @@ public class IP { public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - Pool that = (Pool) o; - return Objects.equals(ipAddresses, that.ipAddresses); + Pool pool = (Pool) o; + return ipAddresses.equals(pool.ipAddresses) && addresses.equals(pool.addresses); } @Override public int hashCode() { - return Objects.hash(ipAddresses); + return Objects.hash(ipAddresses, addresses); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index fe61e2f5920..b4d72a35b80 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -15,7 +15,6 @@ import com.yahoo.vespa.hosted.provision.NoSuchNodeException; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; -import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.maintenance.NodeFailer; import com.yahoo.vespa.hosted.provision.node.filter.NodeFilter; import com.yahoo.vespa.hosted.provision.node.filter.NodeListFilter; @@ -198,6 +197,22 @@ public class Nodes { return setReady(List.of(nodeToReady), agent, reason).get(0); } + /** Restore a node that has been rebuilt */ + public Node restore(String hostname, Agent agent, String reason) { + // A deprovisioned host has no children so this doesn't need to to be recursive + try (NodeMutex lock = lockAndGetRequired(hostname)) { + Node existing = lock.node(); + if (existing.state() != Node.State.deprovisioned) illegal("Can not move node " + hostname + " to " + + Node.State.provisioned + ". It is not in " + + Node.State.deprovisioned); + if (!existing.status().wantToRebuild()) illegal("Can not move node " + hostname + " to " + + Node.State.provisioned + + ". Rebuild has not been requested"); + Node nodeWithResetFields = existing.withWantToRetire(false, false, false, agent, clock.instant()); + return db.writeTo(Node.State.provisioned, nodeWithResetFields, agent, Optional.of(reason)); + } + } + /** Reserve nodes. This method does <b>not</b> lock the node repository */ public List<Node> reserve(List<Node> nodes) { return db.writeTo(Node.State.reserved, nodes, Agent.application, Optional.empty()); @@ -291,20 +306,12 @@ public class Nodes { } public Node deallocate(Node node, Agent agent, String reason, NestedTransaction transaction) { - if (node.state() != Node.State.parked && agent != Agent.operator - && (node.status().wantToDeprovision() || retiredByOperator(node))) + if (parkOnDeallocationOf(node, agent)) return park(node.hostname(), false, agent, reason, transaction); else return db.writeTo(Node.State.dirty, List.of(node), agent, Optional.of(reason), transaction).get(0); } - private static boolean retiredByOperator(Node node) { - return node.status().wantToRetire() && node.history().event(History.Event.Type.wantToRetire) - .map(History.Event::agent) - .map(agent -> agent == Agent.operator) - .orElse(false); - } - /** * Fails this node and returns it in its new state. * @@ -368,9 +375,7 @@ public class Nodes { * Moves a host to breakfixed state, removing any children. */ public List<Node> breakfixRecursively(String hostname, Agent agent, String reason) { - Node node = node(hostname).orElseThrow(() -> - new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); - + Node node = node(hostname).orElseThrow(() -> new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); try (Mutex lock = lockUnallocated()) { requireBreakfixable(node); List<Node> removed = removeChildren(node, false); @@ -397,9 +402,7 @@ public class Nodes { private Node move(String hostname, boolean keepAllocation, Node.State toState, Agent agent, Optional<String> reason, NestedTransaction transaction) { - Node node = node(hostname).orElseThrow(() -> - new NoSuchNodeException("Could not move " + hostname + " to " + toState + ": Node not found")); - + Node node = node(hostname).orElseThrow(() -> new NoSuchNodeException("Could not move " + hostname + " to " + toState + ": Node not found")); if (!keepAllocation && node.allocation().isPresent()) { node = node.withoutAllocation(); } @@ -472,7 +475,9 @@ public class Nodes { if (zone.getCloud().dynamicProvisioning() || node.type() != NodeType.host) db.removeNodes(List.of(node)); else { - node = node.with(IP.Config.EMPTY); + if (!node.status().wantToRebuild()) { // Keep IP addresses if we're rebuilding + node = node.with(IP.Config.EMPTY); + } move(node, Node.State.deprovisioned, Agent.system, Optional.empty()); } removed.add(node); @@ -590,19 +595,31 @@ public class Nodes { } /** Retire and deprovision given host and all of its children */ - public List<Node> deprovision(Node host, Agent agent, Instant instant) { - if (!host.type().isHost()) throw new IllegalArgumentException("Cannot deprovision non-host " + host); - Optional<NodeMutex> nodeMutex = lockAndGet(host); + public List<Node> deprovision(String hostname, Agent agent, Instant instant) { + return decomission(hostname, DecommisionOperation.deprovision, agent, instant); + } + + /** Retire and rebuild given host and all of its children */ + public List<Node> rebuild(String hostname, Agent agent, Instant instant) { + return decomission(hostname, DecommisionOperation.rebuild, agent, instant); + } + + private List<Node> decomission(String hostname, DecommisionOperation op, Agent agent, Instant instant) { + Optional<NodeMutex> nodeMutex = lockAndGet(hostname); if (nodeMutex.isEmpty()) return List.of(); + Node host = nodeMutex.get().node(); + if (!host.type().isHost()) throw new IllegalArgumentException("Cannot " + op + " non-host " + host); List<Node> result; + boolean wantToDeprovision = op == DecommisionOperation.deprovision; + boolean wantToRebuild = op == DecommisionOperation.rebuild; try (NodeMutex lock = nodeMutex.get(); Mutex allocationLock = lockUnallocated()) { // This takes allocationLock to prevent any further allocation of nodes on this host host = lock.node(); NodeList children = list(allocationLock).childrenOf(host); result = performOn(NodeListFilter.from(children.asList()), - (node, nodeLock) -> write(node.withWantToRetire(true, true, agent, instant), + (node, nodeLock) -> write(node.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant), nodeLock)); - result.add(write(host.withWantToRetire(true, true, agent, instant), lock)); + result.add(write(host.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant), lock)); } return result; } @@ -755,4 +772,24 @@ public class Nodes { throw new IllegalArgumentException(message); } + /** Returns whether node should be parked when deallocated by given agent */ + private static boolean parkOnDeallocationOf(Node node, Agent agent) { + if (node.state() == Node.State.parked) return false; + if (agent == Agent.operator) return false; + boolean retirementRequestedByOperator = node.status().wantToRetire() && + node.history().event(History.Event.Type.wantToRetire) + .map(History.Event::agent) + .map(a -> a == Agent.operator) + .orElse(false); + return node.status().wantToDeprovision() || + node.status().wantToRebuild() || + retirementRequestedByOperator; + } + + /** The different ways a host can be decomissioned */ + private enum DecommisionOperation { + deprovision, + rebuild, + } + } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java index 999ba83c2b2..39d0d80b88f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java @@ -21,6 +21,7 @@ public class Status { private final int failCount; private final boolean wantToRetire; private final boolean wantToDeprovision; + private final boolean wantToRebuild; private final boolean preferToRetire; private final boolean wantToFail; private final OsVersion osVersion; @@ -32,6 +33,7 @@ public class Status { int failCount, boolean wantToRetire, boolean wantToDeprovision, + boolean wantToRebuild, boolean preferToRetire, boolean wantToFail, OsVersion osVersion, @@ -40,11 +42,15 @@ public class Status { this.vespaVersion = Objects.requireNonNull(vespaVersion, "Vespa version must be non-null").filter(v -> !Version.emptyVersion.equals(v)); this.containerImage = Objects.requireNonNull(containerImage, "Container image must be non-null").filter(d -> !DockerImage.EMPTY.equals(d)); this.failCount = failCount; - if (wantToDeprovision && !wantToRetire) { - throw new IllegalArgumentException("Node cannot be marked wantToDeprovision unless it's also marked wantToRetire"); + if (wantToDeprovision && wantToRebuild) { + throw new IllegalArgumentException("Node cannot be marked both wantToDeprovision and wantToRebuild"); + } + if ((wantToDeprovision || wantToRebuild) && !wantToRetire) { + throw new IllegalArgumentException("Node cannot be marked wantToDeprovision or wantToRebuild unless it's also marked wantToRetire"); } this.wantToRetire = wantToRetire; this.wantToDeprovision = wantToDeprovision; + this.wantToRebuild = wantToRebuild; this.preferToRetire = preferToRetire; this.wantToFail = wantToFail; this.osVersion = Objects.requireNonNull(osVersion, "OS version must be non-null"); @@ -52,35 +58,35 @@ public class Status { } /** Returns a copy of this with the reboot generation changed */ - public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns the reboot generation of this node */ public Generation reboot() { return reboot; } /** Returns a copy of this with the vespa version changed */ - public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns the Vespa version installed on the node, if known */ public Optional<Version> vespaVersion() { return vespaVersion; } /** Returns a copy of this with the container image changed */ - public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns the container image the node is running, if any */ public Optional<DockerImage> containerImage() { return containerImage; } - public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } - public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } - public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } + public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns how many times this node has been moved to the failed state. */ public int failCount() { return failCount; } - /** Returns a copy of this with the want to retire/deprovision flags changed */ - public Status withWantToRetire(boolean wantToRetire, boolean wantToDeprovision) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + /** Returns a copy of this with the want to retire/deprovision/rebuild flags changed */ + public Status withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild) { + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** @@ -94,6 +100,9 @@ public class Status { */ public boolean wantToDeprovision() { return wantToDeprovision; } + /** Returns whether this node should be rebuilt when possible. */ + public boolean wantToRebuild() { return wantToRebuild; } + /** * Returns whether this node is requested to retire. Unlike {@link Status#wantToRetire()}, this is a soft * request to retire, which will not allow any replacement to increase node skew in the cluster. @@ -102,7 +111,7 @@ public class Status { /** Returns a copy of this with want to fail set to the given value */ public Status withWantToFail(boolean wantToFail) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns whether this node should be failed */ @@ -110,12 +119,12 @@ public class Status { /** Returns a copy of this with prefer-to-retire set to given value */ public Status withPreferToRetire(boolean preferToRetire) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); } /** Returns a copy of this with the OS version set to given version */ public Status withOsVersion(OsVersion version) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, version, firmwareVerifiedAt); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, version, firmwareVerifiedAt); } /** Returns the OS version of this node */ @@ -125,7 +134,7 @@ public class Status { /** Returns a copy of this with the firmwareVerifiedAt set to the given instant. */ public Status withFirmwareVerifiedAt(Instant instant) { - return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, preferToRetire, wantToFail, osVersion, Optional.of(instant)); + return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, Optional.of(instant)); } /** Returns the last time this node had firmware that was verified to be up to date. */ @@ -136,7 +145,7 @@ public class Status { /** Returns the initial status of a newly provisioned node */ public static Status initial() { return new Status(Generation.initial(), Optional.empty(), Optional.empty(), 0, false, - false, false, false, OsVersion.EMPTY, Optional.empty()); + false, false, false, false, OsVersion.EMPTY, Optional.empty()); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java index 930db265066..f378a4249f4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/os/RetiringOsUpgrader.java @@ -63,7 +63,7 @@ public class RetiringOsUpgrader implements OsUpgrader { LOG.info("Retiring and deprovisioning " + host + ": On stale OS version " + host.status().osVersion().current().map(Version::toFullString).orElse("<unset>") + ", want " + target); - nodeRepository.nodes().deprovision(host, Agent.RetiringUpgrader, now); + nodeRepository.nodes().deprovision(host.hostname(), Agent.RetiringUpgrader, now); nodeRepository.nodes().upgradeOs(NodeListFilter.from(host), Optional.of(target)); nodeRepository.osVersions().writeChange((change) -> change.withRetirementAt(now, host.type())); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index c4885703cf2..cc3fd75a22c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -82,6 +82,7 @@ public class NodeSerializer { private static final String nodeTypeKey = "type"; private static final String wantToRetireKey = "wantToRetire"; private static final String wantToDeprovisionKey = "wantToDeprovision"; + private static final String wantToRebuildKey = "wantToRebuild"; private static final String preferToRetireKey = "preferToRetire"; private static final String wantToFailKey = "wantToFailKey"; private static final String osVersionKey = "osVersion"; @@ -166,6 +167,7 @@ public class NodeSerializer { object.setBool(preferToRetireKey, node.status().preferToRetire()); object.setBool(wantToDeprovisionKey, node.status().wantToDeprovision()); object.setBool(wantToFailKey, node.status().wantToFail()); + object.setBool(wantToRebuildKey, node.status().wantToRebuild()); node.allocation().ifPresent(allocation -> toSlime(allocation, object.setObject(instanceKey))); toSlime(node.history(), object.setArray(historyKey)); object.setString(nodeTypeKey, toString(node.type())); @@ -273,6 +275,7 @@ public class NodeSerializer { (int) object.field(failCountKey).asLong(), object.field(wantToRetireKey).asBool(), object.field(wantToDeprovisionKey).asBool(), + object.field(wantToRebuildKey).asBool(), object.field(preferToRetireKey).asBool(), object.field(wantToFailKey).asBool(), new OsVersion(versionFromSlime(object.field(osVersionKey)), diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java index e8fa4f04eed..1bea7056790 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodePatcher.java @@ -54,6 +54,7 @@ public class NodePatcher implements AutoCloseable { private static final String WANT_TO_RETIRE = "wantToRetire"; private static final String WANT_TO_DEPROVISION = "wantToDeprovision"; + private static final String WANT_TO_REBUILD = "wantToRebuild"; private static final Set<String> RECURSIVE_FIELDS = Set.of(WANT_TO_RETIRE); private final NodeRepository nodeRepository; @@ -141,11 +142,17 @@ public class NodePatcher implements AutoCloseable { return IP.Config.verify(node.with(node.ipConfig().withPool(node.ipConfig().pool().withIpAddresses(asStringSet(value)))), memoizedNodes.get()); case "additionalHostnames" : return IP.Config.verify(node.with(node.ipConfig().withPool(node.ipConfig().pool().withAddresses(asAddressList(value)))), memoizedNodes.get()); - case WANT_TO_RETIRE : - case WANT_TO_DEPROVISION : + case WANT_TO_RETIRE: + case WANT_TO_DEPROVISION: + case WANT_TO_REBUILD: boolean wantToRetire = asOptionalBoolean(root.field(WANT_TO_RETIRE)).orElse(node.status().wantToRetire()); boolean wantToDeprovision = asOptionalBoolean(root.field(WANT_TO_DEPROVISION)).orElse(node.status().wantToDeprovision()); - return node.withWantToRetire(wantToRetire, wantToDeprovision && !applyingAsChild, Agent.operator, clock.instant()); + boolean wantToRebuild = asOptionalBoolean(root.field(WANT_TO_REBUILD)).orElse(node.status().wantToRebuild()); + return node.withWantToRetire(wantToRetire, + wantToDeprovision && !applyingAsChild, + wantToRebuild && !applyingAsChild, + Agent.operator, + clock.instant()); case "reports" : return nodeWithPatchedReports(node, value); case "openStackId" : diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index c9483a99a69..0875bf3815d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -135,27 +135,31 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { // Check paths to disallow illegal state changes if (path.matches("/nodes/v2/state/ready/{hostname}")) { nodeRepository.nodes().markNodeAvailableForNewAllocation(path.get("hostname"), Agent.operator, "Readied through the nodes/v2 API"); - return new MessageResponse("Moved " + path.get("hostname") + " to ready"); + return new MessageResponse("Moved " + path.get("hostname") + " to " + Node.State.ready); } else if (path.matches("/nodes/v2/state/failed/{hostname}")) { List<Node> failedNodes = nodeRepository.nodes().failRecursively(path.get("hostname"), Agent.operator, "Failed through the nodes/v2 API"); - return new MessageResponse("Moved " + hostnamesAsString(failedNodes) + " to failed"); + return new MessageResponse("Moved " + hostnamesAsString(failedNodes) + " to " + Node.State.failed); } else if (path.matches("/nodes/v2/state/parked/{hostname}")) { List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), Agent.operator, "Parked through the nodes/v2 API"); - return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to parked"); + return new MessageResponse("Moved " + hostnamesAsString(parkedNodes) + " to " + Node.State.parked); } else if (path.matches("/nodes/v2/state/dirty/{hostname}")) { List<Node> dirtiedNodes = nodeRepository.nodes().deallocateRecursively(path.get("hostname"), Agent.operator, "Dirtied through the nodes/v2 API"); - return new MessageResponse("Moved " + hostnamesAsString(dirtiedNodes) + " to dirty"); + return new MessageResponse("Moved " + hostnamesAsString(dirtiedNodes) + " to " + Node.State.dirty); } else if (path.matches("/nodes/v2/state/active/{hostname}")) { nodeRepository.nodes().reactivate(path.get("hostname"), Agent.operator, "Reactivated through nodes/v2 API"); - return new MessageResponse("Moved " + path.get("hostname") + " to active"); + return new MessageResponse("Moved " + path.get("hostname") + " to " + Node.State.active); } else if (path.matches("/nodes/v2/state/breakfixed/{hostname}")) { List<Node> breakfixedNodes = nodeRepository.nodes().breakfixRecursively(path.get("hostname"), Agent.operator, "Breakfixed through the nodes/v2 API"); - return new MessageResponse("Breakfixed " + hostnamesAsString(breakfixedNodes)); + return new MessageResponse("Moved " + hostnamesAsString(breakfixedNodes) + " to " + Node.State.breakfixed); + } + else if (path.matches("/nodes/v2/state/provisioned/{hostname}")) { + Node restoredNode = nodeRepository.nodes().restore(path.get("hostname"), Agent.operator, "Restored through the nodes/v2 API"); + return new MessageResponse("Moved " + hostnamesAsString(List.of(restoredNode)) + " to " + Node.State.provisioned); } throw new NotFoundException("Cannot put to path '" + path + "'"); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index 6d27acf77d1..79a6101750e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -119,7 +119,7 @@ public class MockNodeRepository extends NodeRepository { nodes.add(node10); Node node55 = Node.create("node55", ipConfig(55), "host55.yahoo.com", resources(2, 8, 50, 1, fast, local), NodeType.tenant) - .status(Status.initial().withWantToRetire(true, true)).build(); + .status(Status.initial().withWantToRetire(true, true, false)).build(); nodes.add(node55); /* Setup docker hosts (two of these will be reserved for spares */ diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index d18d2bf101d..c0699ebf835 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -17,6 +17,7 @@ import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -204,6 +205,42 @@ public class NodeRepositoryTest { } @Test + public void restore_rebuilt_host() { + NodeRepositoryTester tester = new NodeRepositoryTester(); + assertEquals(0, tester.nodeRepository().nodes().list().size()); + + String host1 = "host1"; + String host2 = "host2"; + tester.addHost("id1", host1, "default", NodeType.host); + tester.addHost("id2", host2, "default", NodeType.host); + assertEquals(2, tester.nodeRepository().nodes().list().size()); + + // One host is requested to rebuild, two hosts are parked + tester.nodeRepository().nodes().rebuild(host2, Agent.system, tester.clock().instant()); + tester.nodeRepository().nodes().park(host1, false, Agent.system, getClass().getSimpleName()); + tester.nodeRepository().nodes().park(host2, false, Agent.system, getClass().getSimpleName()); + IP.Config ipConfigOfHost2 = tester.nodeRepository().nodes().node(host2).get().ipConfig(); + + // Two hosts are removed + tester.nodeRepository().nodes().removeRecursively(host1); + tester.nodeRepository().nodes().removeRecursively(host2); + assertEquals(2, tester.nodeRepository().nodes().list(Node.State.deprovisioned).size()); + + // Host not rebuilding cannot be restored + try { + tester.nodeRepository().nodes().restore(host1, Agent.system, getClass().getSimpleName()); + fail("Expected exception"); + } catch (IllegalArgumentException ignored) {} + + // Other host is restored + Node node = tester.nodeRepository().nodes().restore(host2, Agent.system, getClass().getSimpleName()); + assertSame(Node.State.provisioned, node.state()); + assertEquals("IP addresses are preserved", ipConfigOfHost2, node.ipConfig()); + assertFalse(node.status().wantToRetire()); + assertFalse(node.status().wantToRebuild()); + } + + @Test public void dirty_host_only_if_we_can_dirty_children() { NodeRepositoryTester tester = new NodeRepositoryTester(); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java index 305f1b5952e..b761f743687 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainerTest.java @@ -475,7 +475,7 @@ public class DynamicProvisioningMaintainerTest { Supplier<Node> nodeToRemove = () -> tester.nodeRepository().nodes().node(configNodes.childrenOf(hostnameToRemove).first().get().hostname()).get(); // Set want to retire and deprovision on host and children - tester.nodeRepository().nodes().deprovision(hostToRemove.get(), Agent.system, tester.clock().instant()); + tester.nodeRepository().nodes().deprovision(hostToRemove.get().hostname(), Agent.system, tester.clock().instant()); // Redeployment of config server application retires node tester.prepareAndActivateInfraApplication(configSrvApp, hostType.childNodeType()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java index 881646fa546..523ceeb94ce 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializerTest.java @@ -228,7 +228,7 @@ public class NodeSerializerTest { } @Test - public void serialize_parentHostname() { + public void serialize_parent_hostname() { final String parentHostname = "parent.yahoo.com"; Node node = Node.create("myId", new IP.Config(Set.of("127.0.0.1"), Set.of()), "myHostname", nodeFlavors.getFlavorOrThrow("default"), NodeType.tenant) .parentHostname(parentHostname) @@ -307,6 +307,17 @@ public class NodeSerializerTest { } @Test + public void want_to_rebuild() { + Node node = nodeSerializer.fromJson(State.active, nodeSerializer.toJson(createNode())); + assertFalse(node.status().wantToRebuild()); + node = node.with(node.status().withWantToRetire(true, false, true)); + node = nodeSerializer.fromJson(State.active, nodeSerializer.toJson(node)); + assertTrue(node.status().wantToRetire()); + assertFalse(node.status().wantToDeprovision()); + assertTrue(node.status().wantToRebuild()); + } + + @Test public void vespa_version_serialization() { String nodeWithWantedVespaVersion = "{\n" + diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java index 9ac85b9a99c..e0715702722 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java @@ -221,6 +221,9 @@ public class NodesV2ApiTest { // Make sure that wantToRetire is applied recursively, but wantToDeprovision isn't tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/host5.yahoo.com"), "\"wantToRetire\":true,\"preferToRetire\":false,\"wantToDeprovision\":false,"); + assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", + Utf8.toBytes("{\"wantToRebuild\": true, \"wantToRetire\": true}"), Request.Method.PATCH), + "{\"message\":\"Updated dockerhost1.yahoo.com\"}"); tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com"), "\"modelName\":\"foo\""); assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", @@ -234,7 +237,16 @@ public class NodesV2ApiTest { assertFile(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com"), "node4-after-changes.json"); - // move the docker host to deprovisioned + // move a host marked as wantToRebuild to deprovisioned + assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", + new byte[0], Request.Method.DELETE), + "{\"message\":\"Removed dockerhost1.yahoo.com\"}"); + // ... and then restore it + assertResponse(new Request("http://localhost:8080/nodes/v2/state/provisioned/dockerhost1.yahoo.com", + new byte[0], Request.Method.PUT), + "{\"message\":\"Moved dockerhost1.yahoo.com to provisioned\"}"); + + // move a host to deprovisioned assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", new byte[0], Request.Method.DELETE), "{\"message\":\"Removed dockerhost1.yahoo.com\"}"); @@ -242,8 +254,6 @@ public class NodesV2ApiTest { assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", new byte[0], Request.Method.DELETE), "{\"message\":\"Permanently removed dockerhost1.yahoo.com\"}"); - - } @Test |