summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-04-09 13:57:01 +0200
committerJon Bratseth <bratseth@gmail.com>2021-04-09 13:57:01 +0200
commit0d2381234ad4d0a18b08ce15efd550474249e657 (patch)
treec7bd7206582f3dfeff8b9efb4affb91d49b95682
parentcd395de6588bcb68bc4fb61cbdebd54153090e53 (diff)
Mark active as wantToFail on operator actions
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java42
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java9
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java7
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java12
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java21
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json6
10 files changed, 83 insertions, 31 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java b/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java
index 655ee1d1d57..706d9a01c82 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/content/Redundancy.java
@@ -11,8 +11,8 @@ import com.yahoo.vespa.config.search.core.ProtonConfig;
*/
public class Redundancy implements StorDistributionConfig.Producer, ProtonConfig.Producer {
- // This numbers are all per group as wanted numbers.
- private final int initialRedundancy ;
+ // These numbers are all per group as wanted numbers.
+ private final int initialRedundancy;
private final int finalRedundancy;
private final int readyCopies;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 5ab2c272b00..ffdd680497b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -9,6 +9,9 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.List;
@@ -277,6 +280,12 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
/** Returns the nodes of this as a stream */
public Stream<Node> stream() { return asList().stream(); }
+ public static NodeList of(Node ... nodes) {
+ List<Node> nodeList = new ArrayList<>();
+ Collections.addAll(nodeList, nodes);
+ return copyOf(nodeList);
+ }
+
public static NodeList copyOf(List<Node> nodes) {
return new NodeList(nodes, false);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
index 55548e70ddd..4224667a726 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/DynamicProvisioningMaintainer.java
@@ -110,7 +110,7 @@ public class DynamicProvisioningMaintainer extends NodeRepositoryMaintainer {
log.log(Level.SEVERE, "Failed to provision " + host.hostname() + " with " + children.size() +
" children, failing out the host recursively", e);
// Fail out as operator to force a quick redeployment
- nodeRepository().nodes().failRecursively(
+ nodeRepository().nodes().failOrMarkRecursively(
host.hostname(), Agent.operator, "Failed by HostProvisioner due to provisioning failure");
} catch (RuntimeException e) {
if (e.getCause() instanceof NameNotFoundException)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index b4d72a35b80..9ac33643148 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -262,7 +262,7 @@ public class Nodes {
/** Move nodes to the dirty state */
public List<Node> deallocate(List<Node> nodes, Agent agent, String reason) {
- return performOn(NodeListFilter.from(nodes), (node, lock) -> deallocate(node, agent, reason));
+ return performOn(NodeList.copyOf(nodes), (node, lock) -> deallocate(node, agent, reason));
}
public List<Node> deallocateRecursively(String hostname, Agent agent, String reason) {
@@ -328,11 +328,32 @@ public class Nodes {
/**
* Fails all the nodes that are children of hostname before finally failing the hostname itself.
+ * Non-active nodes are failed immediately, while active nodes are marked as wantToFail.
+ * The host is failed if it has no active nodes and marked wantToFail if it has.
*
- * @return List of all the failed nodes in their new state
+ * @return all the nodes that were changed by this request
*/
- public List<Node> failRecursively(String hostname, Agent agent, String reason) {
- return moveRecursively(hostname, Node.State.failed, agent, Optional.of(reason));
+ public List<Node> failOrMarkRecursively(String hostname, Agent agent, String reason) {
+ NodeList children = list().childrenOf(hostname);
+ List<Node> changed = performOn(children, (node, lock) -> failOrMark(node, agent, reason, lock));
+
+ if (children.state(Node.State.active).isEmpty())
+ changed.add(move(hostname, true, Node.State.failed, agent, Optional.of(reason)));
+ else
+ changed.addAll(performOn(NodeList.of(node(hostname).orElseThrow()), (node, lock) -> failOrMark(node, agent, reason, lock)));
+
+ return changed;
+ }
+
+ private Node failOrMark(Node node, Agent agent, String reason, Mutex lock) {
+ if (node.state() == Node.State.active) {
+ node = node.withWantToFail(true, agent, reason, clock.instant());
+ write(node, lock);
+ return node;
+ }
+ else {
+ return move(node, Node.State.failed, agent, Optional.of(reason));
+ }
}
/**
@@ -615,8 +636,7 @@ public class Nodes {
try (NodeMutex lock = nodeMutex.get(); Mutex allocationLock = lockUnallocated()) {
// This takes allocationLock to prevent any further allocation of nodes on this host
host = lock.node();
- NodeList children = list(allocationLock).childrenOf(host);
- result = performOn(NodeListFilter.from(children.asList()),
+ result = performOn(list(allocationLock).childrenOf(host),
(node, nodeLock) -> write(node.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant),
nodeLock));
result.add(write(host.withWantToRetire(true, wantToDeprovision, wantToRebuild, agent, instant), lock));
@@ -644,20 +664,22 @@ public class Nodes {
return db.writeTo(nodes, Agent.system, Optional.empty());
}
+ private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) {
+ return performOn(list().matching(filter), action);
+ }
+
/**
* Performs an operation requiring locking on all nodes matching some filter.
*
- * @param filter the filter determining the set of nodes where the operation will be performed
* @param action the action to perform
* @return the set of nodes on which the action was performed, as they became as a result of the operation
*/
- private List<Node> performOn(NodeFilter filter, BiFunction<Node, Mutex, Node> action) {
+ private List<Node> performOn(NodeList nodes, BiFunction<Node, Mutex, Node> action) {
List<Node> unallocatedNodes = new ArrayList<>();
ListMap<ApplicationId, Node> allocatedNodes = new ListMap<>();
// Group matching nodes by the lock needed
- for (Node node : db.readNodes()) {
- if ( ! filter.matches(node)) continue;
+ for (Node node : nodes) {
if (node.allocation().isPresent())
allocatedNodes.put(node.allocation().get().owner(), node);
else
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java
index bc433c83b2e..296b3ab798a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/filter/NodeFilter.java
@@ -3,12 +3,14 @@ package com.yahoo.vespa.hosted.provision.node.filter;
import com.yahoo.vespa.hosted.provision.Node;
+import java.util.function.Predicate;
+
/**
* A chainable node filter
*
* @author bratseth
*/
-public abstract class NodeFilter {
+public abstract class NodeFilter implements Predicate<Node> {
private final NodeFilter next;
@@ -20,6 +22,11 @@ public abstract class NodeFilter {
/** Returns whether this node matches this filter */
public abstract boolean matches(Node node);
+ @Override
+ public final boolean test(Node node) {
+ return matches(node);
+ }
+
/** Returns whether this is a match according to the chained filter */
protected final boolean nextMatches(Node node) {
if (next == null) return true;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 0875bf3815d..4b6c5472f6a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -26,6 +26,7 @@ import com.yahoo.slime.Slime;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.hosted.provision.NoSuchNodeException;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
@@ -138,8 +139,9 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
return new MessageResponse("Moved " + path.get("hostname") + " to " + Node.State.ready);
}
else if (path.matches("/nodes/v2/state/failed/{hostname}")) {
- List<Node> failedNodes = nodeRepository.nodes().failRecursively(path.get("hostname"), Agent.operator, "Failed through the nodes/v2 API");
- return new MessageResponse("Moved " + hostnamesAsString(failedNodes) + " to " + Node.State.failed);
+ var failedOrMarkedNodes = NodeList.copyOf(nodeRepository.nodes().failOrMarkRecursively(path.get("hostname"), Agent.operator, "Failed through the nodes/v2 API"));
+ return new MessageResponse("Moved " + hostnamesAsString(failedOrMarkedNodes.state(Node.State.failed).asList()) + " to " + Node.State.failed +
+ " and marked " + hostnamesAsString(failedOrMarkedNodes.failing().asList()) + " as wantToFail");
}
else if (path.matches("/nodes/v2/state/parked/{hostname}")) {
List<Node> parkedNodes = nodeRepository.nodes().parkRecursively(path.get("hostname"), Agent.operator, "Parked through the nodes/v2 API");
@@ -430,6 +432,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
}
private static String hostnamesAsString(List<Node> nodes) {
+ if (nodes.isEmpty()) return "none";
return nodes.stream().map(Node::hostname).sorted().collect(Collectors.joining(", "));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
index c0699ebf835..f9e7da9b563 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/NodeRepositoryTest.java
@@ -158,7 +158,17 @@ public class NodeRepositoryTest {
assertEquals(2, tester.nodeRepository().nodes().list().size());
// Fail host and container
- tester.nodeRepository().nodes().failRecursively(cfghost1, Agent.system, getClass().getSimpleName());
+ tester.nodeRepository().nodes().failOrMarkRecursively(cfghost1, Agent.system, getClass().getSimpleName());
+
+ assertEquals("cfg1 is not failed yet as it active",
+ Node.State.active, tester.nodeRepository().nodes().node(cfg1).get().state());
+ assertEquals("cfghost1 is not failed yet as it active",
+ Node.State.active, tester.nodeRepository().nodes().node(cfghost1).get().state());
+ assertTrue(tester.nodeRepository().nodes().node(cfg1).get().status().wantToFail());
+ assertTrue(tester.nodeRepository().nodes().node(cfghost1).get().status().wantToFail());
+
+ tester.nodeRepository().nodes().fail(cfg1, Agent.system, "test");
+ tester.nodeRepository().nodes().fail(cfghost1, Agent.system, "test");
// Remove recursively
tester.nodeRepository().nodes().removeRecursively(cfghost1);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java
index 72410c204a3..0c1466e7bf0 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/provisioning/DynamicDockerAllocationTest.java
@@ -130,7 +130,7 @@ public class DynamicDockerAllocationTest {
// App 2 and 3 should have been allocated to the same nodes - fail one of the parent hosts from there
String parent = "host-1.yahoo.com";
- tester.nodeRepository().nodes().failRecursively(parent, Agent.system, "Testing");
+ tester.nodeRepository().nodes().failOrMarkRecursively(parent, Agent.system, "Testing");
// Redeploy all applications
deployApp(application1, clusterSpec1, resources, tester, 3);
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
index e0715702722..554295a8062 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiTest.java
@@ -133,7 +133,7 @@ public class NodesV2ApiTest {
// PUT a node in failed ...
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/host2.yahoo.com",
new byte[0], Request.Method.PUT),
- "{\"message\":\"Moved host2.yahoo.com to failed\"}");
+ "{\"message\":\"Moved host2.yahoo.com to failed and marked none as wantToFail\"}");
tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/host2.yahoo.com"),
"\"state\":\"failed\"");
// ... and put it back in active (after fixing). This is useful to restore data when multiple nodes fail.
@@ -149,7 +149,7 @@ public class NodesV2ApiTest {
// or, PUT a node in failed ...
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/test-node-pool-102-2",
new byte[0], Request.Method.PUT),
- "{\"message\":\"Moved test-node-pool-102-2 to failed\"}");
+ "{\"message\":\"Moved test-node-pool-102-2 to failed and marked none as wantToFail\"}");
tester.assertResponseContains(new Request("http://localhost:8080/nodes/v2/node/test-node-pool-102-2"),
"\"state\":\"failed\"");
// ... and deallocate it such that it moves to dirty and is recycled
@@ -165,14 +165,12 @@ public class NodesV2ApiTest {
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/node/test-node-pool-102-2", new byte[0], Request.Method.GET),
404, "{\"error-code\":\"NOT_FOUND\",\"message\":\"No node with hostname 'test-node-pool-102-2'\"}");
- // Put a host in failed and make sure its children are also failed
+ // Mark a node and its children as want to fail
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/dockerhost1.yahoo.com", new byte[0], Request.Method.PUT),
- "{\"message\":\"Moved dockerhost1.yahoo.com, host4.yahoo.com to failed\"}");
-
+ "{\"message\":\"Moved none to failed and marked dockerhost1.yahoo.com, host4.yahoo.com as wantToFail\"}");
+ // Nodes are not failed yet
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed"), "{\"nodes\":[" +
- "{\"url\":\"http://localhost:8080/nodes/v2/node/host5.yahoo.com\"}," +
- "{\"url\":\"http://localhost:8080/nodes/v2/node/host4.yahoo.com\"}," +
- "{\"url\":\"http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com\"}]}");
+ "{\"url\":\"http://localhost:8080/nodes/v2/node/host5.yahoo.com\"}]}");
// Update (PATCH) a node (multiple fields can also be sent in one request body)
assertResponse(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com",
@@ -238,6 +236,9 @@ public class NodesV2ApiTest {
assertFile(new Request("http://localhost:8080/nodes/v2/node/host4.yahoo.com"), "node4-after-changes.json");
// move a host marked as wantToRebuild to deprovisioned
+ assertResponse(new Request("http://localhost:8080/nodes/v2/state/parked/dockerhost1.yahoo.com",
+ new byte[0], Request.Method.PUT),
+ "{\"message\":\"Moved dockerhost1.yahoo.com to parked\"}");
assertResponse(new Request("http://localhost:8080/nodes/v2/node/dockerhost1.yahoo.com",
new byte[0], Request.Method.DELETE),
"{\"message\":\"Removed dockerhost1.yahoo.com\"}");
@@ -423,7 +424,7 @@ public class NodesV2ApiTest {
"{\"message\":\"Added 1 nodes to the provisioned state\"}");
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/foo.yahoo.com",
new byte[0], Request.Method.PUT),
- "{\"message\":\"Moved foo.yahoo.com to failed\"}");
+ "{\"message\":\"Moved foo.yahoo.com to failed and marked none as wantToFail\"}");
assertResponse(new Request("http://localhost:8080/nodes/v2/state/dirty/foo.yahoo.com",
new byte[0], Request.Method.PUT),
"{\"message\":\"Moved foo.yahoo.com to dirty\"}");
@@ -471,7 +472,7 @@ public class NodesV2ApiTest {
// Attempt to fail and ready an allocated node without going through dirty
assertResponse(new Request("http://localhost:8080/nodes/v2/state/failed/host1.yahoo.com",
new byte[0], Request.Method.PUT),
- "{\"message\":\"Moved host1.yahoo.com to failed\"}");
+ "{\"message\":\"Moved host1.yahoo.com to failed and marked none as wantToFail\"}");
tester.assertResponse(new Request("http://localhost:8080/nodes/v2/state/ready/host1.yahoo.com",
new byte[0], Request.Method.PUT),
400,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json
index a2f43480605..17484824d9e 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/restapi/responses/node4-after-changes.json
@@ -1,7 +1,7 @@
{
"url": "http://localhost:8080/nodes/v2/node/host4.yahoo.com",
"id": "host4.yahoo.com",
- "state": "failed",
+ "state": "active",
"type": "tenant",
"hostname": "host4.yahoo.com",
"parentHostname": "parent.yahoo.com",
@@ -34,7 +34,7 @@
"currentRebootGeneration": 1,
"vespaVersion": "6.43.0",
"currentDockerImage": "docker-registry.domain.tld:8080/dist/vespa:6.45.0",
- "failCount": 1,
+ "failCount": 0,
"wantToRetire": true,
"preferToRetire": false,
"wantToDeprovision": false,
@@ -60,7 +60,7 @@
"agent": "application"
},
{
- "event": "failed",
+ "event": "wantToFail",
"at": 123,
"agent": "operator"
},