diff options
author | Ola Aunrønning <olaa@verizonmedia.com> | 2020-11-02 15:42:28 +0100 |
---|---|---|
committer | Ola Aunrønning <olaa@verizonmedia.com> | 2020-11-02 15:43:14 +0100 |
commit | e3a0dff247baa58b42b17c472aed52e3432ab402 (patch) | |
tree | 2665d59713f7c8e45cefc1fc2fae7bff300b9420 | |
parent | fe688d87e4ff96dc63d797eaba5582d2594d7076 (diff) |
Added 'breakfixed' node state
13 files changed, 110 insertions, 10 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeState.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeState.java index ab4ab07ce85..3e5de9501f3 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeState.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeState.java @@ -11,6 +11,7 @@ public enum NodeState { dirty, failed, parked, - deprovisioned; + deprovisioned, + breakfixed } diff --git a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeState.java b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeState.java index e13785fdab9..6ef59fd8521 100644 --- a/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeState.java +++ b/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/configserver/noderepository/NodeState.java @@ -9,5 +9,5 @@ package com.yahoo.vespa.hosted.node.admin.configserver.noderepository; * @author freva */ public enum NodeState { - provisioned, ready, reserved, active, inactive, dirty, failed, parked, deprovisioned + provisioned, ready, reserved, active, inactive, dirty, failed, parked, deprovisioned, breakfixed } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java index 7520a30716b..7703da4aab2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java @@ -476,7 +476,10 @@ public final class Node implements Nodelike { parked, /** This host has previously been in use but is now removed. */ - deprovisioned; + deprovisioned, + + /** This host is currently undergoing repair. */ + breakfixed; /** Returns whether this is a state where the node is assigned to an application */ public boolean isAllocated() { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java index 3cf5d77de69..6e2c9a24ed2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeRepository.java @@ -534,11 +534,12 @@ public class NodeRepository extends AbstractComponent { .filter(node -> node.state() != State.provisioned) .filter(node -> node.state() != State.failed) .filter(node -> node.state() != State.parked) + .filter(node -> node.state() != State.breakfixed) .map(Node::hostname) .collect(Collectors.toList()); if ( ! hostnamesNotAllowedToDirty.isEmpty()) illegal("Could not deallocate " + nodeToDirty + ": " + - hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked]"); + hostnamesNotAllowedToDirty + " are not in states [provisioned, failed, parked, breakfixed]"); return nodesToDirty.stream().map(node -> setDirty(node, agent, reason)).collect(Collectors.toList()); } @@ -591,6 +592,21 @@ public class NodeRepository extends AbstractComponent { return move(hostname, true, State.active, agent, Optional.of(reason)); } + /** + * Moves a host to breakfixed state, removing any children. + */ + public List<Node> breakfixRecursively(String hostname, Agent agent, String reason) { + Node node = getNode(hostname).orElseThrow(() -> + new NoSuchNodeException("Could not breakfix " + hostname + ": Node not found")); + + try (Mutex lock = lockUnallocated()) { + requireBreakfixable(node); + List<Node> removed = removeChildren(node, false); + removed.add(move(node, State.breakfixed, agent, Optional.of(reason))); + return removed; + } + } + private List<Node> moveRecursively(String hostname, State toState, Agent agent, Optional<String> reason) { List<Node> moved = list().childrenOf(hostname).asList().stream() .map(child -> move(child, toState, agent, reason)) @@ -664,10 +680,7 @@ public class NodeRepository extends AbstractComponent { requireRemovable(node, false, force); if (node.type().isHost()) { - List<Node> children = list().childrenOf(node).asList(); - children.forEach(child -> requireRemovable(child, true, force)); - db.removeNodes(children); - List<Node> removed = new ArrayList<>(children); + List<Node> removed = removeChildren(node, force); if (zone.getCloud().dynamicProvisioning() || node.type() != NodeType.host) db.removeNodes(List.of(node)); else { @@ -692,6 +705,13 @@ public class NodeRepository extends AbstractComponent { db.removeNodes(List.of(node)); } + private List<Node> removeChildren(Node node, boolean force) { + List<Node> children = list().childrenOf(node).asList(); + children.forEach(child -> requireRemovable(child, true, force)); + db.removeNodes(children); + return new ArrayList<>(children); + } + /** * Throws if the given node cannot be removed. Removal is allowed if: * - Tenant node: node is unallocated @@ -723,6 +743,28 @@ public class NodeRepository extends AbstractComponent { } /** + * Throws if given node cannot be breakfixed. + * Breakfix is allowed if the following is true: + * - Node is tenant host + * - Node is in zone without dynamic provisioning + * - Node is in parked or failed state + */ + private void requireBreakfixable(Node node) { + if (zone().getCloud().dynamicProvisioning()) { + illegal("Can not breakfix in zone: " + zone()); + } + + if (node.type() != NodeType.host) { + illegal(node + " can not be breakfixed as it is not a tenant host"); + } + + Set<State> legalStates = EnumSet.of(State.failed, State.parked); + if (! legalStates.contains(node.state())) { + illegal(node + " can not be removed as it is not in the states " + legalStates); + } + } + + /** * Increases the restart generation of the active nodes matching the filter. * * @return the nodes in their new state. diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java index 959071c83c4..e92415d6538 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java @@ -89,6 +89,7 @@ public class History { case failed: return this.with(new Event(Event.Type.failed, agent, at)); case dirty: return this.with(new Event(Event.Type.deallocated, agent, at)); case parked: return this.with(new Event(Event.Type.parked, agent, at)); + case breakfixed: return this.with(new Event(Event.Type.breakfixed, agent, at)); default: return this; } } @@ -145,7 +146,9 @@ public class History { // The node verified its firmware (whether this resulted in a reboot depends on the node model) firmwareVerified(false), // The node was failed - failed(false); + failed(false), + // The node was breakfixed + breakfixed(false); private final boolean applicationLevel; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java index b892a998ec8..42e26814d41 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/CuratorDatabaseClient.java @@ -339,6 +339,7 @@ public class CuratorDatabaseClient { case ready: return "ready"; case reserved: return "reserved"; case deprovisioned: return "deprovisioned"; + case breakfixed: return "breakfixed"; default: throw new RuntimeException("Node state " + state + " does not map to a directory name"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java index d2256344854..c555d0281a5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java @@ -400,6 +400,7 @@ public class NodeSerializer { case "rebooted" : return History.Event.Type.rebooted; case "osUpgraded" : return History.Event.Type.osUpgraded; case "firmwareVerified" : return History.Event.Type.firmwareVerified; + case "breakfixed" : return History.Event.Type.breakfixed; } throw new IllegalArgumentException("Unknown node event type '" + eventTypeString + "'"); } @@ -422,6 +423,7 @@ public class NodeSerializer { case rebooted: return "rebooted"; case osUpgraded: return "osUpgraded"; case firmwareVerified: return "firmwareVerified"; + case breakfixed: return "breakfixed"; } throw new IllegalArgumentException("Serialized form of '" + nodeEventType + "' not defined"); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java index bd65894101c..b72d021e4f5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodeSerializer.java @@ -24,6 +24,7 @@ public class NodeSerializer { case "ready": return Node.State.ready; case "reserved": return Node.State.reserved; case "deprovisioned": return Node.State.deprovisioned; + case "breakfixed": return Node.State.breakfixed; default: throw new IllegalArgumentException("Unknown node state '" + state + "'"); } } @@ -39,6 +40,7 @@ public class NodeSerializer { case ready: return "ready"; case reserved: return "reserved"; case deprovisioned: return "deprovisioned"; + case breakfixed: return "breakfixed"; default: throw new IllegalArgumentException("Unknown node state '" + state + "'"); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index f861539526d..59604d094fa 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -148,6 +148,10 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { nodeRepository.reactivate(lastElement(path), Agent.operator, "Reactivated through nodes/v2 API"); return new MessageResponse("Moved " + lastElement(path) + " to active"); } + else if (path.startsWith("/nodes/v2/state/breakfixed/")) { + List<Node> breakfixedNodes = nodeRepository.breakfixRecursively(lastElement(path), Agent.operator, "Breakfixed through the nodes/v2 API"); + return new MessageResponse("Breakfixed " + hostnamesAsString(breakfixedNodes)); + } throw new NotFoundException("Cannot put to path '" + path + "'"); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index 1ab8291d71f..e6e19d0407e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -241,6 +241,39 @@ public class NodeRepositoryTest { assertEquals(asSet("host2", "node20"), filterNodes(tester, node -> node.state() == Node.State.dirty)); } + @Test + public void breakfix_tenant_host() { + NodeRepositoryTester tester = new NodeRepositoryTester(); + tester.addHost("host1", "host1", "default", NodeType.host); + tester.addNode("node1", "node1", "host1", "docker", NodeType.tenant); + String reason = NodeRepositoryTest.class.getSimpleName(); + + try { + tester.nodeRepository().breakfixRecursively("node1", Agent.system, reason); + fail("Should not be able to breakfix tenant node"); + } catch (IllegalArgumentException ignored) {} + + try { + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + fail("Should not be able to breakfix host in state not in [parked, failed]"); + } catch (IllegalArgumentException ignored) {} + + tester.setNodeState("host1", Node.State.failed); + tester.setNodeState("node1", Node.State.active); + try { + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + fail("Should not be able to breakfix host with active tenant node"); + } catch (IllegalArgumentException ignored) {} + + tester.setNodeState("node1", Node.State.failed); + tester.nodeRepository().breakfixRecursively("host1", Agent.system, reason); + + assertEquals(1, tester.nodeRepository().getNodes().size()); + Node node = tester.nodeRepository().getNodes().get(0); + assertEquals("host1", node.hostname()); + assertEquals(Node.State.breakfixed, node.state()); + } + private static Set<String> asSet(String... elements) { return new HashSet<>(Arrays.asList(elements)); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java index a5cd922300c..653791c971a 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporterTest.java @@ -96,6 +96,7 @@ public class MetricsReporterTest { expectedMetrics.put("hostedVespa.dirtyHosts", 0); expectedMetrics.put("hostedVespa.failedHosts", 0); expectedMetrics.put("hostedVespa.deprovisionedHosts", 0); + expectedMetrics.put("hostedVespa.breakfixedHosts", 0); expectedMetrics.put("hostedVespa.pendingRedeployments", 42); expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0); expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0); @@ -122,7 +123,7 @@ public class MetricsReporterTest { nodeRepository.list(); expectedMetrics.put("cache.curator.hitRate", 0.5D); expectedMetrics.put("cache.curator.evictionCount", 0L); - expectedMetrics.put("cache.curator.size", 11L); + expectedMetrics.put("cache.curator.size", 12L); ManualClock clock = new ManualClock(Instant.ofEpochSecond(124)); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json index 4fb742cbb4a..27767be6315 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states-recursive.json @@ -63,6 +63,11 @@ "url": "http://localhost:8080/nodes/v2/state/deprovisioned", "nodes": [ ] + }, + "breakfixed": { + "url": "http://localhost:8080/nodes/v2/state/breakfixed", + "nodes": [ + ] } } } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json index 69579148df3..fb1282f5195 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/states.json @@ -26,6 +26,9 @@ }, "deprovisioned": { "url": "http://localhost:8080/nodes/v2/state/deprovisioned" + }, + "breakfixed": { + "url": "http://localhost:8080/nodes/v2/state/breakfixed" } } }
\ No newline at end of file |