aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2021-03-15 11:12:28 +0100
committerHåkon Hallingstad <hakon@verizonmedia.com>2021-03-15 11:12:28 +0100
commit8a593f1d3082b543cc488e627696d7b9cb8c05d1 (patch)
treebd843bc47c640c9e81c6a9e8322aef30154757a1 /node-repository
parentb7027ef7f782df562f60a74cf09d77aab2105193 (diff)
Require 3 active config servers/controllers when allowing removal
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java28
1 files changed, 26 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index c8c3cc0daa5..93014b93669 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -27,6 +27,7 @@ import java.util.stream.Collectors;
*/
public class RetiredExpirer extends NodeRepositoryMaintainer {
+ public static final int NUM_CONFIG_SERVERS = 3;
private final Deployer deployer;
private final Metric metric;
private final Orchestrator orchestrator;
@@ -57,7 +58,7 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
ApplicationId application = entry.getKey();
List<Node> retiredNodes = entry.getValue();
- List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
+ List<Node> nodesToRemove = retiredNodes.stream().filter(n -> canRemove(n, activeNodes)).collect(Collectors.toList());
if (nodesToRemove.isEmpty()) continue;
try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository())) {
@@ -81,11 +82,34 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
* - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry}
* - Orchestrator allows it
*/
- private boolean canRemove(Node node) {
+ private boolean canRemove(Node node, NodeList activeNodes) {
if (node.type().isHost()) {
if (nodeRepository().nodes().list().childrenOf(node).asList().stream()
.allMatch(child -> child.state() == Node.State.parked ||
child.state() == Node.State.failed)) {
+
+ if (node.type().isConfigServerLike() && activeNodes.nodeType(node.type()).asSet().size() < NUM_CONFIG_SERVERS) {
+ // Scenario: All 3 controllers want to retire.
+ //
+ // Say RetiredExpirer runs on cfg1 and gives cfg2 permission to be removed (PERMANENTLY_DOWN in ZK).
+ // The consequent redeployment moves cfg2 to inactive, removing cfg2 from the application,
+ // and PERMANENTLY_DOWN for cfg2 is cleaned up.
+ //
+ // If the RetiredExpirer on cfg3 now runs before its InfrastructureProvisioner, then
+ // a. But the duper model still contains cfg2
+ // b. The service model still monitors cfg2 for health and it is UP
+ // c. The Orchestrator has no host status (like PERMANENTLY_DOWN) for cfg2,
+ // which is equivalent to NO_REMARKS
+ // Therefore, from the point of view of the Orchestrator invoked below, any cfg will
+ // be allowed to be removed, say cfg1. In the subsequent redeployment, both cfg2
+ // and cfg1 are now inactive.
+ //
+ // A proper solution would be to ensure the duper model is changed atomically
+ // with node states across all config servers. As this would require some work,
+ // we will instead verify here that there are 3 active config servers before
+ // allowing the removal of any config server.
+ return false;
+ }
log.info("Host " + node + " has no non-parked/failed children");
return true;
}