about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2021-04-08 16:51:37 +0200
committerGitHub <noreply@github.com>2021-04-08 16:51:37 +0200
commit67c724dfc6f20ecde3fdf8a5997f7fe37eb83a06 (patch)
treecdd770b4fed1bddc6f8387a6bcaa8705d487de2f
parent248b9d5278345c4932b6cc9f598e3943be44b850 (diff)
parentf58f74b95f60702587fe053eda8f5c2ad4fc5f9f (diff)
Merge pull request #17310 from vespa-engine/bratseth/wantToFail
Move nodes to 'failed' during activate
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java11
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java17
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java51
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java5
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java1
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java12
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java11
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java5
13 files changed, 99 insertions, 39 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
index 80dfafb5116..de72024cb77 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/Node.java
@@ -194,6 +194,17 @@ public final class Node implements Nodelike {
}
/**
+ * Returns a copy of this where wantToFail is set to true and history is updated to reflect this.
+ */
+ public Node withWantToFail(boolean wantToFail, Agent agent, String reason, Instant at) {
+ Node node = this.with(status.withWantToFail(wantToFail));
+ if (wantToFail)
+ node = node.with(history.with(new History.Event(History.Event.Type.wantToFail, agent, at)));
+ return node;
+
+ }
+
+ /**
* Returns a copy of this node with wantToRetire and wantToDeprovision set to the given values and updated history.
*
* If both given wantToRetire and wantToDeprovision are equal to the current values, the method is no-op.
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
index 24f94b9d63b..5ab2c272b00 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/NodeList.java
@@ -91,6 +91,11 @@ public class NodeList extends AbstractFilteringList<Node, NodeList> {
!node.status().vespaVersion().get().equals(node.allocation().get().membership().cluster().vespaVersion()));
}
+ /** Returns the subset of nodes with want to fail set to true */
+ public NodeList failing() {
+ return matching(node -> node.status().wantToFail());
+ }
+
/** Returns the subset of nodes that are currently changing their OS version to given version */
public NodeList changingOsVersionTo(Version version) {
return matching(node -> node.status().osVersion().changingTo(version));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index eb9c8300724..ac6ecd98fac 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -277,7 +277,7 @@ public class NodeFailer extends NodeRepositoryMaintainer {
}
if (! allTenantNodesFailedOutSuccessfully) return false;
- node = nodeRepository().nodes().fail(node.hostname(), Agent.NodeFailer, reason);
+ wantToFail(node, true, reason, lock);
try {
deployment.get().activate();
return true;
@@ -287,17 +287,20 @@ public class NodeFailer extends NodeRepositoryMaintainer {
Exceptions.toMessageString(e));
return true;
} catch (RuntimeException e) {
- // The expected reason for deployment to fail here is that there is no capacity available to redeploy.
- // In that case we should leave the node in the active state to avoid failing additional nodes.
- nodeRepository().nodes().reactivate(node.hostname(), Agent.NodeFailer,
- "Failed to redeploy after being failed by NodeFailer");
- log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() +
- ", but redeploying without the node failed", e);
+ // Reset want to fail: We'll retry failing unless it heals in the meantime
+ nodeRepository().nodes().node(node.hostname())
+ .ifPresent(n -> wantToFail(n, false, "Could not fail", lock));
+ log.log(Level.WARNING, "Could not fail " + node + " for " + node.allocation().get().owner() +
+ " for " + reason + ": " + Exceptions.toMessageString(e));
return false;
}
}
}
+ private void wantToFail(Node node, boolean wantToFail, String reason, Mutex lock) {
+ nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, reason, clock().instant()), lock);
+ }
+
/** Returns true if node failing should be throttled */
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
index 0c5a8ea1d9f..b5780cd86a0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/History.java
@@ -152,6 +152,8 @@ public class History {
wantToRetire(false),
// The node was scheduled for retirement (soft)
preferToRetire(false),
+ // This node was scheduled for failing
+ wantToFail,
// The active node was retired
retired,
// The active node went down according to the service monitor
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
index cd0928ef320..b4d72a35b80 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Nodes.java
@@ -252,6 +252,14 @@ public class Nodes {
}
+ /**
+ * Fails these nodes in a transaction and returns the nodes in the new state which will hold if the
+ * transaction commits.
+ */
+ public List<Node> fail(List<Node> nodes, ApplicationTransaction transaction) {
+ return db.writeTo(Node.State.failed, nodes, Agent.application, Optional.of("Failed by application"), transaction.nested());
+ }
+
/** Move nodes to the dirty state */
public List<Node> deallocate(List<Node> nodes, Agent agent, String reason) {
return performOn(NodeListFilter.from(nodes), (node, lock) -> deallocate(node, agent, reason));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
index 8964977091c..39d0d80b88f 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/node/Status.java
@@ -23,6 +23,7 @@ public class Status {
private final boolean wantToDeprovision;
private final boolean wantToRebuild;
private final boolean preferToRetire;
+ private final boolean wantToFail;
private final OsVersion osVersion;
private final Optional<Instant> firmwareVerifiedAt;
@@ -34,6 +35,7 @@ public class Status {
boolean wantToDeprovision,
boolean wantToRebuild,
boolean preferToRetire,
+ boolean wantToFail,
OsVersion osVersion,
Optional<Instant> firmwareVerifiedAt) {
this.reboot = Objects.requireNonNull(generation, "Generation must be non-null");
@@ -50,76 +52,79 @@ public class Status {
this.wantToDeprovision = wantToDeprovision;
this.wantToRebuild = wantToRebuild;
this.preferToRetire = preferToRetire;
+ this.wantToFail = wantToFail;
this.osVersion = Objects.requireNonNull(osVersion, "OS version must be non-null");
this.firmwareVerifiedAt = Objects.requireNonNull(firmwareVerifiedAt, "Firmware check instant must be non-null");
}
/** Returns a copy of this with the reboot generation changed */
- public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withReboot(Generation reboot) { return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
/** Returns the reboot generation of this node */
public Generation reboot() { return reboot; }
/** Returns a copy of this with the vespa version changed */
- public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withVespaVersion(Version version) { return new Status(reboot, Optional.of(version), containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
/** Returns the Vespa version installed on the node, if known */
public Optional<Version> vespaVersion() { return vespaVersion; }
/** Returns a copy of this with the container image changed */
- public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withContainerImage(DockerImage containerImage) { return new Status(reboot, vespaVersion, Optional.of(containerImage), failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
/** Returns the container image the node is running, if any */
public Optional<DockerImage> containerImage() { return containerImage; }
- public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withIncreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount + 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
- public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withDecreasedFailCount() { return new Status(reboot, vespaVersion, containerImage, failCount - 1, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
- public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt); }
+ public Status withFailCount(int value) { return new Status(reboot, vespaVersion, containerImage, value, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt); }
/** Returns how many times this node has been moved to the failed state. */
public int failCount() { return failCount; }
/** Returns a copy of this with the want to retire/deprovision/rebuild flags changed */
public Status withWantToRetire(boolean wantToRetire, boolean wantToDeprovision, boolean wantToRebuild) {
- return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt);
+ return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt);
}
/**
* Returns whether this node is requested to retire. This is a hard request to retire, which allows any replacement
* to increase node skew in the cluster.
*/
- public boolean wantToRetire() {
- return wantToRetire;
- }
+ public boolean wantToRetire() { return wantToRetire; }
- /** Returns whether this node should be de-provisioned when possible. */
- public boolean wantToDeprovision() {
- return wantToDeprovision;
- }
+ /**
+ * Returns whether this node should be de-provisioned when possible.
+ */
+ public boolean wantToDeprovision() { return wantToDeprovision; }
/** Returns whether this node should be rebuilt when possible. */
- public boolean wantToRebuild() {
- return wantToRebuild;
- }
+ public boolean wantToRebuild() { return wantToRebuild; }
/**
* Returns whether this node is requested to retire. Unlike {@link Status#wantToRetire()}, this is a soft
* request to retire, which will not allow any replacement to increase node skew in the cluster.
*/
- public boolean preferToRetire() {
- return preferToRetire;
+ public boolean preferToRetire() { return preferToRetire; }
+
+ /** Returns a copy of this with want to fail set to the given value */
+ public Status withWantToFail(boolean wantToFail) {
+ return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt);
}
+ /** Returns whether this node should be failed */
+ public boolean wantToFail() { return wantToFail; }
+
/** Returns a copy of this with prefer-to-retire set to given value */
public Status withPreferToRetire(boolean preferToRetire) {
- return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, firmwareVerifiedAt);
+ return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, firmwareVerifiedAt);
}
/** Returns a copy of this with the OS version set to given version */
public Status withOsVersion(OsVersion version) {
- return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, version, firmwareVerifiedAt);
+ return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, version, firmwareVerifiedAt);
}
/** Returns the OS version of this node */
@@ -129,7 +134,7 @@ public class Status {
/** Returns a copy of this with the firmwareVerifiedAt set to the given instant. */
public Status withFirmwareVerifiedAt(Instant instant) {
- return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, osVersion, Optional.of(instant));
+ return new Status(reboot, vespaVersion, containerImage, failCount, wantToRetire, wantToDeprovision, wantToRebuild, preferToRetire, wantToFail, osVersion, Optional.of(instant));
}
/** Returns the last time this node had firmware that was verified to be up to date. */
@@ -140,7 +145,7 @@ public class Status {
/** Returns the initial status of a newly provisioned node */
public static Status initial() {
return new Status(Generation.initial(), Optional.empty(), Optional.empty(), 0, false,
- false, false, false, OsVersion.EMPTY, Optional.empty());
+ false, false, false, false, OsVersion.EMPTY, Optional.empty());
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
index 5c006f6d6a0..cc3fd75a22c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/NodeSerializer.java
@@ -84,6 +84,7 @@ public class NodeSerializer {
private static final String wantToDeprovisionKey = "wantToDeprovision";
private static final String wantToRebuildKey = "wantToRebuild";
private static final String preferToRetireKey = "preferToRetire";
+ private static final String wantToFailKey = "wantToFail";
private static final String osVersionKey = "osVersion";
private static final String wantedOsVersionKey = "wantedOsVersion";
private static final String firmwareCheckKey = "firmwareCheck";
@@ -165,6 +166,7 @@ public class NodeSerializer {
object.setBool(wantToRetireKey, node.status().wantToRetire());
object.setBool(preferToRetireKey, node.status().preferToRetire());
object.setBool(wantToDeprovisionKey, node.status().wantToDeprovision());
+ object.setBool(wantToFailKey, node.status().wantToFail());
object.setBool(wantToRebuildKey, node.status().wantToRebuild());
node.allocation().ifPresent(allocation -> toSlime(allocation, object.setObject(instanceKey)));
toSlime(node.history(), object.setArray(historyKey));
@@ -275,6 +277,7 @@ public class NodeSerializer {
object.field(wantToDeprovisionKey).asBool(),
object.field(wantToRebuildKey).asBool(),
object.field(preferToRetireKey).asBool(),
+ object.field(wantToFailKey).asBool(),
new OsVersion(versionFromSlime(object.field(osVersionKey)),
versionFromSlime(object.field(wantedOsVersionKey))),
instantFromSlime(object.field(firmwareCheckKey)));
@@ -416,6 +419,7 @@ public class NodeSerializer {
case "reserved" : return History.Event.Type.reserved;
case "activated" : return History.Event.Type.activated;
case "wantToRetire": return History.Event.Type.wantToRetire;
+ case "wantToFail": return History.Event.Type.wantToFail;
case "retired" : return History.Event.Type.retired;
case "deactivated" : return History.Event.Type.deactivated;
case "parked" : return History.Event.Type.parked;
@@ -441,6 +445,7 @@ public class NodeSerializer {
case reserved : return "reserved";
case activated : return "activated";
case wantToRetire: return "wantToRetire";
+ case wantToFail: return "wantToFail";
case retired : return "retired";
case deactivated : return "deactivated";
case parked : return "parked";
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
index 03db7e0e7e5..cd0ec2c5d66 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java
@@ -88,7 +88,7 @@ class Activator {
List<Node> activeToRemove = removeHostsFromList(hostnames, oldActive);
activeToRemove = activeToRemove.stream().map(Node::unretire).collect(Collectors.toList()); // only active nodes can be retired. TODO: Move this line to deactivate
- nodeRepository.nodes().deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
+ deactivate(activeToRemove, transaction); // TODO: Pass activation time in this call and next line
nodeRepository.nodes().activate(newActive, transaction.nested()); // activate also continued active to update node state
rememberResourceChange(transaction, generation, activationTime,
@@ -97,6 +97,12 @@ class Activator {
unreserveParentsOf(reservedToActivate);
}
+ private void deactivate(List<Node> toDeactivateList, ApplicationTransaction transaction) {
+ NodeList toDeactivate = NodeList.copyOf(toDeactivateList);
+ nodeRepository.nodes().deactivate(toDeactivate.not().failing().asList(), transaction);
+ nodeRepository.nodes().fail(toDeactivate.failing().asList(), transaction);
+ }
+
private void rememberResourceChange(ApplicationTransaction transaction, long generation, Instant at,
NodeList oldNodes, NodeList newNodes) {
Optional<Application> application = nodeRepository.applications().get(transaction.application());
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
index 3d3a54774e4..720548c2d99 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeAllocation.java
@@ -111,6 +111,7 @@ class NodeAllocation {
if ( ! membership.cluster().satisfies(cluster)) continue; // wrong cluster id/type
if ((! candidate.isSurplus || saturated()) && ! membership.cluster().group().equals(cluster.group())) continue; // wrong group and we can't or have no reason to change it
if ( candidate.state() == Node.State.active && allocation.isRemovable()) continue; // don't accept; causes removal
+ if ( candidate.state() == Node.State.active && candidate.wantToFail()) continue; // don't accept; causes failing
if ( indexes.contains(membership.index())) continue; // duplicate index (just to be sure)
boolean resizeable = requestedNodes.considerRetiring() && candidate.isResizable;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
index 85e73245508..5adde885276 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeCandidate.java
@@ -57,7 +57,6 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
/** This node can be resized to the new NodeResources */
final boolean isResizable;
-
private NodeCandidate(NodeResources freeParentCapacity, Optional<Node> parent, boolean violatesSpares, boolean exclusiveSwitch, boolean isSurplus, boolean isNew, boolean isResizeable) {
if (isResizeable && isNew)
throw new IllegalArgumentException("A new node cannot be resizable");
@@ -79,6 +78,8 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
public abstract boolean preferToRetire();
+ public abstract boolean wantToFail();
+
public abstract Flavor flavor();
public abstract NodeCandidate allocate(ApplicationId owner, ClusterMembership membership, NodeResources requestedResources, Instant at);
@@ -301,6 +302,9 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
public boolean preferToRetire() { return node.status().preferToRetire(); }
@Override
+ public boolean wantToFail() { return node.status().wantToFail(); }
+
+ @Override
public Flavor flavor() { return node.flavor(); }
@Override
@@ -387,6 +391,9 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
public boolean preferToRetire() { return false; }
@Override
+ public boolean wantToFail() { return false; }
+
+ @Override
public Flavor flavor() { return new Flavor(resources); }
@Override
@@ -483,6 +490,9 @@ public abstract class NodeCandidate implements Nodelike, Comparable<NodeCandidat
public boolean preferToRetire() { return false; }
@Override
+ public boolean wantToFail() { return false; }
+
+ @Override
public Flavor flavor() { return new Flavor(resources); }
@Override
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
index db7bc394089..6ab8fc8ad49 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodePrioritizer.java
@@ -206,7 +206,7 @@ public class NodePrioritizer {
/** Returns whether we are allocating to replace a failed node */
private boolean isReplacement(NodeList nodesInCluster) {
- int failedNodesInCluster = nodesInCluster.state(Node.State.failed).size();
+ int failedNodesInCluster = nodesInCluster.failing().size();
if (failedNodesInCluster == 0) return false;
return ! requestedNodes.fulfilledBy(nodesInCluster.size() - failedNodesInCluster);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
index 27f5b45173b..3f66f9cadc4 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailTester.java
@@ -52,6 +52,7 @@ public class NodeFailTester {
public static final ApplicationId tenantHostApp = ApplicationId.from("hosted-vespa", "tenant-host", "default");
public static final ApplicationId app1 = ApplicationId.from("foo1", "bar", "fuz");
public static final ApplicationId app2 = ApplicationId.from("foo2", "bar", "fuz");
+ public static final ClusterSpec.Id testCluster = ClusterSpec.Id.from("test");
public static final NodeFlavors hostFlavors = FlavorConfigBuilder.createDummies("default", "docker");
private static final Duration downtimeLimitOneHour = Duration.ofMinutes(60);
@@ -96,8 +97,8 @@ public class NodeFailTester {
tester.createHostNodes(3);
// Create applications
- ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
- ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+ ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
+ ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, testCluster).vespaVersion("6.42").build();
Capacity capacity1 = Capacity.from(new ClusterResources(5, 1, nodeResources), false, true);
Capacity capacity2 = Capacity.from(new ClusterResources(7, 1, nodeResources), false, true);
@@ -125,8 +126,8 @@ public class NodeFailTester {
// Create applications
ClusterSpec clusterNodeAdminApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("node-admin")).vespaVersion("6.42").build();
- ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.75.0").build();
- ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test")).vespaVersion("6.75.0").build();
+ ClusterSpec clusterApp1 = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.75.0").build();
+ ClusterSpec clusterApp2 = ClusterSpec.request(ClusterSpec.Type.content, testCluster).vespaVersion("6.75.0").build();
Capacity allHosts = Capacity.fromRequiredNodeType(NodeType.host);
Capacity capacity1 = Capacity.from(new ClusterResources(3, 1, new NodeResources(1, 4, 100, 0.3)), false, true);
Capacity capacity2 = Capacity.from(new ClusterResources(5, 1, new NodeResources(1, 4, 100, 0.3)), false, true);
@@ -150,7 +151,7 @@ public class NodeFailTester {
NodeFailTester tester = new NodeFailTester();
// Create applications
- ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, ClusterSpec.Id.from("test")).vespaVersion("6.42").build();
+ ClusterSpec clusterApp = ClusterSpec.request(ClusterSpec.Type.container, testCluster).vespaVersion("6.42").build();
Map<ApplicationId, MockDeployer.ApplicationContext> apps = Map.of(app1, new MockDeployer.ApplicationContext(app1, clusterApp, capacity));
tester.initializeMaintainers(apps);
return tester;
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
index ca1fa2831b8..bb954058916 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailerTest.java
@@ -42,7 +42,6 @@ import static org.mockito.Mockito.when;
*/
public class NodeFailerTest {
-
private static final Report badTotalMemorySizeReport = Report.basicReport(
"badTotalMemorySize", HARD_FAIL, Instant.now(), "too low");
@@ -308,6 +307,10 @@ public class NodeFailerTest {
tester.highestIndex(tester.nodeRepository.nodes().list(Node.State.active).owner(NodeFailTester.app1)).allocation().get().membership().index()
>
lastNode.allocation().get().membership().index());
+
+ assertEquals("Node failing does not cause recording of scaling events",
+ 1,
+ tester.nodeRepository.applications().get(NodeFailTester.app1).get().cluster(NodeFailTester.testCluster).get().scalingEvents().size());
}
@Test