diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-04-09 13:57:01 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-04-09 13:57:01 +0200 |
commit | 0d2381234ad4d0a18b08ce15efd550474249e657 (patch) | |
tree | c7bd7206582f3dfeff8b9efb4affb91d49b95682 | |
parent | cd395de6588bcb68bc4fb61cbdebd54153090e53 (diff) |
Mark active as wantToFail on operator actions
10 files changed, 83 insertions, 31 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java b/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java index 655ee1d1d57..706d9a01c82 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java @@ -11,8 +11,8 @@ import com.yahoo.vespa.config.search.core.ProtonConfig; */ public class Redundancy implements StorDistributionConfig.Producer, ProtonConfig.Producer { - // This numbers are all per group as wanted numbers. - private final int initialRedundancy ; + // These numbers are all per group as wanted numbers. + private final int initialRedundancy; private final int finalRedundancy; private final int readyCopies; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java index 5ab2c272b00..ffdd680497b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java @@ -9,6 +9,9 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.EnumSet; import java.util.List; @@ -277,6 +280,12 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> { /** Returns the nodes of this as a stream */ public Stream<Node> stream() { return asList().stream(); } + public static NodeList of(Node ... nodes) { + List<Node> nodeList = new ArrayList<>(); + Collections.addAll(nodeList, nodes); + return copyOf(nodeList); + } + public static NodeList copyOf(List<Node> nodes) { return new NodeList(nodes, false); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java index 55548e70ddd..4224667a726 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java @@ -110,7 +110,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer { log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() + " children, failing out the host recursively", e); // Fail out as operator to force a quick redeployment - nodeRepository().nodes().failRecursively( + nodeRepository().nodes().failOrMarkRecursively( host.hostname(), Agent.operator, "Failed by HostProvisioner due to provisioning failure"); } catch (RuntimeException e) { if (e.getCause() instanceof NameNotFoundException) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java index b4d72a35b80..9ac33643148 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java @@ -262,7 +262,7 @@ public class Nodes { /** Move nodes to the dirty state */ public List<Node> deallocate(List<Node> nodes, Agent agent, String reason) { - return performOn(NodeListFilter.from(nodes), (node, lock) -> deallocate(node, agent, reason)); + return performOn(NodeList.copyOf(nodes), (node, lock) -> deallocate(node, agent, reason)); } public List<Node> deallocateRecursively(String hostname, Agent agent, String reason) { @@ -328,11 +328,32 @@ public class Nodes { /** * Fails all the nodes that are children of hostname before finally failing the hostname itself. + * Non-active nodes are failed immediately, while active nodes are marked as wantToFail. + * The host is failed if it has no active nodes and marked wantToFail if it has. * - * @return List of all the failed nodes in their new state + * @return all the nodes that were changed by this request */ - public List<Node> failRecursively(String hostname, Agent agent, String reason) { - return moveRecursively(hostname, Node.State.failed, agent, Optional.of(reason)); + public List<Node> failOrMarkRecursively(String hostname, Agent agent, String reason) { + NodeList children = list().childrenOf(hostname); + List<Node> changed = performOn(children, (node, lock) -> failOrMark(node, agent, reason, lock)); + + if (children.state(Node.State.active).isEmpty()) + changed.add(move(hostname, true, Node.State.failed, agent, Optional.of(reason))); + else + changed.addAll(performOn(NodeList.of(node(hostname).orElseThrow()), (node, lock) -> failOrMark(node, agent, reason, lock))); + + return changed; + } + + private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) { + if (node.state() == Node.State.active) { + node = node.withWantToFail(true, agent, reason, clock.instant()); + write(node, lock); + return node; + } + else { + return move(node, Node.State.failed, agent, Optional.of(reason)); + } } /** @@ -615,8 +636,7 @@ public class Nodes { try (NodeMutex lock = nodeMutex.get(); Mutex allocationLock = lockUnallocated()) { // This takes allocationLock to prevent any further allocation of nodes on this host host = lock.node(); - NodeList children = list(allocationLock).childrenOf(host); - result = performOn(NodeListFilter.from(children.asList()), + result = performOn(list(allocationLock).childrenOf(host), (node, nodeLock) -> write(node.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant), nodeLock)); result.add(write(host.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant), lock)); @@ -644,20 +664,22 @@ public class Nodes { return db.writeTo(nodes, Agent.system, Optional.empty()); } + private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) { + return performOn(list().matching(filter), action); + } + /** * Performs an operation requiring locking on all nodes matching some filter. * - * @param filter the filter determining the set of nodes where the operation will be performed * @param action the action to perform * @return the set of nodes on which the action was performed, as they became as a result of the operation */ - private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) { + private List<Node> performOn(NodeList nodes, BiFunction<Node, Mutex, Node> action) { List<Node> unallocatedNodes = new ArrayList<>(); ListMap<ApplicationId, Node> allocatedNodes = new ListMap<>(); // Group matching nodes by the lock needed - for (Node node : db.readNodes()) { - if ( ! filter.matches(node)) continue; + for (Node node : nodes) { if (node.allocation().isPresent()) allocatedNodes.put(node.allocation().get().owner(), node); else diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java index bc433c83b2e..296b3ab798a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java @@ -3,12 +3,14 @@ package com.yahoo.vespa.hosted.provision.node.filter; import com.yahoo.vespa.hosted.provision.Node; +import java.util.function.Predicate; + /** * A chainable node filter * * @author bratseth */ -public abstract class NodeFilter { +public abstract class NodeFilter implements Predicate<Node> { private final NodeFilter next; @@ -20,6 +22,11 @@ public abstract class NodeFilter { /** Returns whether this node matches this filter */ public abstract boolean matches(Node node); + @Override + public final boolean test(Node node) { + return matches(node); + } + /** Returns whether this is a match according to the chained filter */ protected final boolean nextMatches(Node node) { if (next == null) return true; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 0875bf3815d..4b6c5472f6a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -26,6 +26,7 @@ import com.yahoo.slime.Slime; import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.hosted.provision.NoSuchNodeException; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeMutex; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; @@ -138,8 +139,9 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { return new MessageResponse("Moved " + path.get("hostname") + " to " + Node.State.ready); } else if (path.matches("/nodes/v2/state/failed/{hostname}")) { - List<Node> failedNodes = nodeRepository.nodes().failRecursively(path.get("hostname"), Agent.operator, "Failed through the nodes/v2 API"); - return new MessageResponse("Moved " + hostnamesAsString(failedNodes) + " to " + Node.State.failed); + var failedOrMarkedNodes = NodeList.copyOf(nodeRepository.nodes().failOrMarkRecursively(path.get("hostname"), Agent.operator, "Failed through the nodes/v2 API")); + return new MessageResponse("Moved " + hostnamesAsString(failedOrMarkedNodes.state(Node.State.failed).asList()) + " to " + Node.State.failed + + " and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail"); } else if (path.matches("/nodes/v2/state/parked/{hostname}")) { List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), Agent.operator, "Parked through the nodes/v2 API"); @@ -430,6 +432,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { } private static String hostnamesAsString(List<Node> nodes) { + if (nodes.isEmpty()) return "none"; return nodes.stream().map(Node::hostname).sorted().collect(Collectors.joining(", ")); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java index c0699ebf835..f9e7da9b563 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java @@ -158,7 +158,17 @@ public class NodeRepositoryTest { assertEquals(2, tester.nodeRepository().nodes().list().size()); // Fail host and container - tester.nodeRepository().nodes().failRecursively(cfghost1, Agent.system, getClass().getSimpleName()); + tester.nodeRepository().nodes().failOrMarkRecursively(cfghost1, Agent.system, getClass().getSimpleName()); + + assertEquals("cfg1 is not failed yet as it active", + Node.State.active, tester.nodeRepository().nodes().node(cfg1).get().state()); + assertEquals("cfghost1 is not failed yet as it active", + Node.State.active, tester.nodeRepository().nodes().node(cfghost1).get().state()); + assertTrue(tester.nodeRepository().nodes().node(cfg1).get().status().wantToFail()); + assertTrue(tester.nodeRepository().nodes().node(cfghost1).get().status().wantToFail()); + + tester.nodeRepository().nodes().fail(cfg1, Agent.system, "test"); + tester.nodeRepository().nodes().fail(cfghost1, Agent.system, "test"); // Remove recursively tester.nodeRepository().nodes().removeRecursively(cfghost1); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java index 72410c204a3..0c1466e7bf0 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java @@ -130,7 +130,7 @@ public class DynamicDockerAllocationTest { // App 2 and 3 should have been allocated to the same nodes - fail one of the parent hosts from there String parent = "host-1.yahoo.com"; - tester.nodeRepository().nodes().failRecursively(parent, Agent.system, "Testing"); + tester.nodeRepository().nodes().failOrMarkRecursively(parent, Agent.system, "Testing"); // Redeploy all applications deployApp(application1, clusterSpec1, resources, tester, 3); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java index e0715702722..554295a8062 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java @@ -133,7 +133,7 @@ public class NodesV2ApiTest { // PUT a node in failed ... assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/host2.yahoo.com", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved host2.yahoo.com to failed\"}"); + "{\"message\":\"Moved host2.yahoo.com to failed and marked none as wantToFail\"}"); tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/host2.yahoo.com"), "\"state\":\"failed\""); // ... and put it back in active (after fixing). This is useful to restore data when multiple nodes fail. @@ -149,7 +149,7 @@ public class NodesV2ApiTest { // or, PUT a node in failed ... assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/test-node-pool-102-2", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved test-node-pool-102-2 to failed\"}"); + "{\"message\":\"Moved test-node-pool-102-2 to failed and marked none as wantToFail\"}"); tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/test-node-pool-102-2"), "\"state\":\"failed\""); // ... and deallocate it such that it moves to dirty and is recycled @@ -165,14 +165,12 @@ public class NodesV2ApiTest { tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/test-node-pool-102-2", new byte[0], Request.Method.GET), 404, "{\"error-code\":\"NOT_FOUND\",\"message\":\"No node with hostname 'test-node-pool-102-2'\"}"); - // Put a host in failed and make sure its children are also failed + // Mark a node and its children as want to fail assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/dockerhost1.yahoo.com", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved dockerhost1.yahoo.com, host4.yahoo.com to failed\"}"); - + "{\"message\":\"Moved none to failed and marked dockerhost1.yahoo.com, host4.yahoo.com as wantToFail\"}"); + // Nodes are not failed yet assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed"), "{\"nodes\":[" + - "{\"url\":\"http://localhost:8080/nodes/v2/node/host5.yahoo.com\"}," + - "{\"url\":\"http://localhost:8080/nodes/v2/node/host4.yahoo.com\"}," + - "{\"url\":\"http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com\"}]}"); + "{\"url\":\"http://localhost:8080/nodes/v2/node/host5.yahoo.com\"}]}"); // Update (PATCH) a node (multiple fields can also be sent in one request body) assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com", @@ -238,6 +236,9 @@ public class NodesV2ApiTest { assertFile(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com"), "node4-after-changes.json"); // move a host marked as wantToRebuild to deprovisioned + assertResponse(new Request("http://localhost:8080/nodes/v2/state/parked/dockerhost1.yahoo.com", + new byte[0], Request.Method.PUT), + "{\"message\":\"Moved dockerhost1.yahoo.com to parked\"}"); assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com", new byte[0], Request.Method.DELETE), "{\"message\":\"Removed dockerhost1.yahoo.com\"}"); @@ -423,7 +424,7 @@ public class NodesV2ApiTest { "{\"message\":\"Added 1 nodes to the provisioned state\"}"); assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/foo.yahoo.com", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved foo.yahoo.com to failed\"}"); + "{\"message\":\"Moved foo.yahoo.com to failed and marked none as wantToFail\"}"); assertResponse(new Request("http://localhost:8080/nodes/v2/state/dirty/foo.yahoo.com", new byte[0], Request.Method.PUT), "{\"message\":\"Moved foo.yahoo.com to dirty\"}"); @@ -471,7 +472,7 @@ public class NodesV2ApiTest { // Attempt to fail and ready an allocated node without going through dirty assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/host1.yahoo.com", new byte[0], Request.Method.PUT), - "{\"message\":\"Moved host1.yahoo.com to failed\"}"); + "{\"message\":\"Moved host1.yahoo.com to failed and marked none as wantToFail\"}"); tester.assertResponse(new Request("http://localhost:8080/nodes/v2/state/ready/host1.yahoo.com", new byte[0], Request.Method.PUT), 400, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json index a2f43480605..17484824d9e 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json @@ -1,7 +1,7 @@ { "url": "http://localhost:8080/nodes/v2/node/host4.yahoo.com", "id": "host4.yahoo.com", - "state": "failed", + "state": "active", "type": "tenant", "hostname": "host4.yahoo.com", "parentHostname": "parent.yahoo.com", @@ -34,7 +34,7 @@ "currentRebootGeneration": 1, "vespaVersion": "6.43.0", "currentDockerImage": "docker-registry.domain.tld:8080/dist/vespa:6.45.0", - "failCount": 1, + "failCount": 0, "wantToRetire": true, "preferToRetire": false, "wantToDeprovision": false, @@ -60,7 +60,7 @@ "agent": "application" }, { - "event": "failed", + "event": "wantToFail", "at": 123, "agent": "operator" }, |