From e3a0dff247baa58b42b17c472aed52e3432ab402 Mon Sep 17 00:00:00 2001 From: Ola Aunrønning Date: Mon, 2 Nov 2020 15:42:28 +0100 Subject: Added 'breakfixed' node state --- .../com/yahoo/vespa/hosted/provision/Node.java | 5 ++- .../vespa/hosted/provision/NodeRepository.java | 52 +++++++++++++++++++--- .../yahoo/vespa/hosted/provision/node/History.java | 5 ++- .../persistence/CuratorDatabaseClient.java | 1 + .../provision/persistence/NodeSerializer.java | 2 + .../hosted/provision/restapi/NodeSerializer.java | 2 + .../provision/restapi/NodesV2ApiHandler.java | 4 ++ .../vespa/hosted/provision/NodeRepositoryTest.java | 33 ++++++++++++++ .../provision/maintenance/MetricsReporterTest.java | 3 +- .../restapi/responses/states-recursive.json | 5 +++ .../hosted/provision/restapi/responses/states.json | 3 ++ 11 files changed, 107 insertions(+), 8 deletions(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 7520a30716b..7703da4aab2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -476,7 +476,10 @@ public final class Node implements Nodelike { parked, /** This host has previously been in use but is now removed. */ - deprovisioned; + deprovisioned, + + /** This host is currently undergoing repair. */ + breakfixed; /** Returns whether this is a state where the node is assigned to an application */ public boolean isAllocated() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 3cf5d77de69..6e2c9a24ed2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -534,11 +534,12 @@ public class NodeRepository extends AbstractComponent { .filter(node -> node.state() != State.provisioned) .filter(node -> node.state() != State.failed) .filter(node -> node.state() != State.parked) + .filter(node -> node.state() != State.breakfixed) .map(Node::hostname) .collect(Collectors.toList()); if ( ! hostnamesNotAllowedToDirty.isEmpty()) illegal("Could not deallocate " + nodeToDirty + ": " + - hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked]"); + hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked, breakfixed]"); return nodesToDirty.stream().map(node -> setDirty(node, agent, reason)).collect(Collectors.toList()); } @@ -591,6 +592,21 @@ public class NodeRepository extends AbstractComponent { return move(hostname, true, State.active, agent, Optional.of(reason)); } + /** + * Moves a host to breakfixed state, removing any children. + */ + public List breakfixRecursively(String hostname, Agent agent, String reason) { + Node node = getNode(hostname).orElseThrow(() -> + new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); + + try (Mutex lock = lockUnallocated()) { + requireBreakfixable(node); + List removed = removeChildren(node, false); + removed.add(move(node, State.breakfixed, agent, Optional.of(reason))); + return removed; + } + } + private List moveRecursively(String hostname, State toState, Agent agent, Optional reason) { List moved = list().childrenOf(hostname).asList().stream() .map(child -> move(child, toState, agent, reason)) @@ -664,10 +680,7 @@ public class NodeRepository extends AbstractComponent { requireRemovable(node, false, force); if (node.type().isHost()) { - List children = list().childrenOf(node).asList(); - children.forEach(child -> requireRemovable(child, true, force)); - db.removeNodes(children); - List removed = new ArrayList<>(children); + List removed = removeChildren(node, force); if (zone.getCloud().dynamicProvisioning() || node.type() != NodeType.host) db.removeNodes(List.of(node)); else { @@ -692,6 +705,13 @@ public class NodeRepository extends AbstractComponent { db.removeNodes(List.of(node)); } + private List removeChildren(Node node, boolean force) { + List children = list().childrenOf(node).asList(); + children.forEach(child -> requireRemovable(child, true, force)); + db.removeNodes(children); + return new ArrayList<>(children); + } + /** * Throws if the given node cannot be removed. Removal is allowed if: * - Tenant node: node is unallocated @@ -722,6 +742,28 @@ public class NodeRepository extends AbstractComponent { } } + /** + * Throws if given node cannot be breakfixed. + * Breakfix is allowed if the following is true: + * - Node is tenant host + * - Node is in zone without dynamic provisioning + * - Node is in parked or failed state + */ + private void requireBreakfixable(Node node) { + if (zone().getCloud().dynamicProvisioning()) { + illegal("Can not breakfix in zone: " + zone()); + } + + if (node.type() != NodeType.host) { + illegal(node + " can not be breakfixed as it is not a tenant host"); + } + + Set legalStates = EnumSet.of(State.failed, State.parked); + if (! legalStates.contains(node.state())) { + illegal(node + " can not be removed as it is not in the states " + legalStates); + } + } + /** * Increases the restart generation of the active nodes matching the filter. * diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 959071c83c4..e92415d6538 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -89,6 +89,7 @@ public class History { case failed: return this.with(new Event(Event.Type.failed, agent, at)); case dirty: return this.with(new Event(Event.Type.deallocated, agent, at)); case parked: return this.with(new Event(Event.Type.parked, agent, at)); + case breakfixed: return this.with(new Event(Event.Type.breakfixed, agent, at)); default: return this; } } @@ -145,7 +146,9 @@ public class History { // The node verified its firmware (whether this resulted in a reboot depends on the node model) firmwareVerified(false), // The node was failed - failed(false); + failed(false), + // The node was breakfixed + breakfixed(false); private final boolean applicationLevel; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index b892a998ec8..42e26814d41 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -339,6 +339,7 @@ public class CuratorDatabaseClient { case ready: return "ready"; case reserved: return "reserved"; case deprovisioned: return "deprovisioned"; + case breakfixed: return "breakfixed"; default: throw new RuntimeException("Node state " + state + " does not map to a directory name"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index d2256344854..c555d0281a5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -400,6 +400,7 @@ public class NodeSerializer { case "rebooted" : return History.Event.Type.rebooted; case "osUpgraded" : return History.Event.Type.osUpgraded; case "firmwareVerified" : return History.Event.Type.firmwareVerified; + case "breakfixed" : return History.Event.Type.breakfixed; } throw new IllegalArgumentException("Unknown node event type '" + eventTypeString + "'"); } @@ -422,6 +423,7 @@ public class NodeSerializer { case rebooted: return "rebooted"; case osUpgraded: return "osUpgraded"; case firmwareVerified: return "firmwareVerified"; + case breakfixed: return "breakfixed"; } throw new IllegalArgumentException("Serialized form of '" + nodeEventType + "' not defined"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java index bd65894101c..b72d021e4f5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java @@ -24,6 +24,7 @@ public class NodeSerializer { case "ready": return Node.State.ready; case "reserved": return Node.State.reserved; case "deprovisioned": return Node.State.deprovisioned; + case "breakfixed": return Node.State.breakfixed; default: throw new IllegalArgumentException("Unknown node state '" + state + "'"); } } @@ -39,6 +40,7 @@ public class NodeSerializer { case ready: return "ready"; case reserved: return "reserved"; case deprovisioned: return "deprovisioned"; + case breakfixed: return "breakfixed"; default: throw new IllegalArgumentException("Unknown node state '" + state + "'"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index f861539526d..59604d094fa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -148,6 +148,10 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { nodeRepository.reactivate(lastElement(path), Agent.operator, "Reactivated through nodes/v2 API"); return new MessageResponse("Moved " + lastElement(path) + " to active"); } + else if (path.startsWith("/nodes/v2/state/breakfixed/")) { + List breakfixedNodes = nodeRepository.breakfixRecursively(lastElement(path), Agent.operator, "Breakfixed through the nodes/v2 API"); + return new MessageResponse("Breakfixed " + hostnamesAsString(breakfixedNodes)); + } throw new NotFoundException("Cannot put to path '" + path + "'"); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index 1ab8291d71f..e6e19d0407e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -241,6 +241,39 @@ public class NodeRepositoryTest { assertEquals(asSet("host2", "node20"), filterNodes(tester, node -> node.state() == Node.State.dirty)); } + @Test + public void breakfix_tenant_host() { + NodeRepositoryTester tester = new NodeRepositoryTester(); + tester.addHost("host1", "host1", "default", NodeType.host); + tester.addNode("node1", "node1", "host1", "docker", NodeType.tenant); + String reason = NodeRepositoryTest.class.getSimpleName(); + + try { + tester.nodeRepository().breakfixRecursively("node1", Agent.system, reason); + fail("Should not be able to breakfix tenant node"); + } catch (IllegalArgumentException ignored) {} + + try { + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + fail("Should not be able to breakfix host in state not in [parked, failed]"); + } catch (IllegalArgumentException ignored) {} + + tester.setNodeState("host1", Node.State.failed); + tester.setNodeState("node1", Node.State.active); + try { + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + fail("Should not be able to breakfix host with active tenant node"); + } catch (IllegalArgumentException ignored) {} + + tester.setNodeState("node1", Node.State.failed); + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + + assertEquals(1, tester.nodeRepository().getNodes().size()); + Node node = tester.nodeRepository().getNodes().get(0); + assertEquals("host1", node.hostname()); + assertEquals(Node.State.breakfixed, node.state()); + } + private static Set asSet(String... elements) { return new HashSet<>(Arrays.asList(elements)); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index a5cd922300c..653791c971a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -96,6 +96,7 @@ public class MetricsReporterTest { expectedMetrics.put("hostedVespa.dirtyHosts", 0); expectedMetrics.put("hostedVespa.failedHosts", 0); expectedMetrics.put("hostedVespa.deprovisionedHosts", 0); + expectedMetrics.put("hostedVespa.breakfixedHosts", 0); expectedMetrics.put("hostedVespa.pendingRedeployments", 42); expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0); expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0); @@ -122,7 +123,7 @@ public class MetricsReporterTest { nodeRepository.list(); expectedMetrics.put("cache.curator.hitRate", 0.5D); expectedMetrics.put("cache.curator.evictionCount", 0L); - expectedMetrics.put("cache.curator.size", 11L); + expectedMetrics.put("cache.curator.size", 12L); ManualClock clock = new ManualClock(Instant.ofEpochSecond(124)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json index 4fb742cbb4a..27767be6315 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json @@ -63,6 +63,11 @@ "url": "http://localhost:8080/nodes/v2/state/deprovisioned", "nodes": [ ] + }, + "breakfixed": { + "url": "http://localhost:8080/nodes/v2/state/breakfixed", + "nodes": [ + ] } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json index 69579148df3..fb1282f5195 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json @@ -26,6 +26,9 @@ }, "deprovisioned": { "url": "http://localhost:8080/nodes/v2/state/deprovisioned" + }, + "breakfixed": { + "url": "http://localhost:8080/nodes/v2/state/breakfixed" } } } \ No newline at end of file -- cgit v1.2.3 From b403f2fb1222377848cf1296ec8a420698cd30d4 Mon Sep 17 00:00:00 2001 From: Ola Aunrønning Date: Wed, 4 Nov 2020 09:57:30 +0100 Subject: Include breakfixed in state transition comment --- .../src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 6e2c9a24ed2..d8ea19b7677 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -88,7 +88,7 @@ import java.util.stream.Stream; // 1) (new) | deprovisioned - > provisioned -> (dirty ->) ready -> reserved -> active -> inactive -> dirty -> ready // 2) inactive -> reserved | parked // 3) reserved -> dirty -// 4) * -> failed | parked -> dirty | active | deprovisioned +// 4) * -> failed | parked -> (breakfixed) -> dirty | active | deprovisioned // 5) deprovisioned -> (forgotten) // Nodes have an application assigned when in states reserved, active and inactive. // Nodes might have an application assigned in dirty. -- cgit v1.2.3