summary refs log tree commit diff stats
path: root/node-repository/src
diff options
context:
space:
mode:
author Håkon Hallingstad <hakon@verizonmedia.com> 2021-03-15 11:25:56 +0100
committer Håkon Hallingstad <hakon@verizonmedia.com> 2021-03-15 11:25:56 +0100
commit edefe87ca60b99db431055037a1e02b6ce3f9fbe (patch)
tree 505d007f088f8031ebe25f9c8752047c88205260 /node-repository/src
parent 8a593f1d3082b543cc488e627696d7b9cb8c05d1 (diff)
Avoid eventual expiry of cfglike node
Diffstat (limited to 'node-repository/src')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java 53
1 files changed, 28 insertions, 25 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index 93014b93669..c13c2f9731e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -27,7 +27,8 @@ import java.util.stream.Collectors;
*/
public class RetiredExpirer extends NodeRepositoryMaintainer {
- public static final int NUM_CONFIG_SERVERS = 3;
+ private static final int NUM_CONFIG_SERVERS = 3;
+
private final Deployer deployer;
private final Metric metric;
private final Orchestrator orchestrator;
@@ -87,29 +88,6 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
if (nodeRepository().nodes().list().childrenOf(node).asList().stream()
.allMatch(child -> child.state() == Node.State.parked ||
child.state() == Node.State.failed)) {
-
- if (node.type().isConfigServerLike() && activeNodes.nodeType(node.type()).asSet().size() < NUM_CONFIG_SERVERS) {
- // Scenario: All 3 controllers want to retire.
- //
- // Say RetiredExpirer runs on cfg1 and gives cfg2 permission to be removed (PERMANENTLY_DOWN in ZK).
- // The consequent redeployment moves cfg2 to inactive, removing cfg2 from the application,
- // and PERMANENTLY_DOWN for cfg2 is cleaned up.
- //
- // If the RetiredExpirer on cfg3 now runs before its InfrastructureProvisioner, then
- // a. But the duper model still contains cfg2
- // b. The service model still monitors cfg2 for health and it is UP
- // c. The Orchestrator has no host status (like PERMANENTLY_DOWN) for cfg2,
- // which is equivalent to NO_REMARKS
- // Therefore, from the point of view of the Orchestrator invoked below, any cfg will
- // be allowed to be removed, say cfg1. In the subsequent redeployment, both cfg2
- // and cfg1 are now inactive.
- //
- // A proper solution would be to ensure the duper model is changed atomically
- // with node states across all config servers. As this would require some work,
- // we will instead verify here that there are 3 active config servers before
- // allowing the removal of any config server.
- return false;
- }
log.info("Host " + node + " has no non-parked/failed children");
return true;
}
@@ -117,7 +95,32 @@ public class RetiredExpirer extends NodeRepositoryMaintainer {
return false;
}
- if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) {
+ if (node.type().isConfigServerLike()) {
+ // Avoid eventual expiry of configserver-like nodes
+
+ if (activeNodes.nodeType(node.type()).asSet().size() < NUM_CONFIG_SERVERS) {
+ // Scenario: All 3 config servers want to retire.
+ //
+ // Say RetiredExpirer runs on cfg1 and gives cfg2 permission to be removed (PERMANENTLY_DOWN in ZK).
+ // The consequent redeployment moves cfg2 to inactive, removing cfg2 from the application,
+ // and PERMANENTLY_DOWN for cfg2 is cleaned up.
+ //
+ // If the RetiredExpirer on cfg3 now runs before its InfrastructureProvisioner, then
+ // a. The duper model still contains cfg2
+ // b. The service model still monitors cfg2 for health and it is UP
+ // c. The Orchestrator has no host status (like PERMANENTLY_DOWN) for cfg2,
+ // which is equivalent to NO_REMARKS
+ // Therefore, from the point of view of the Orchestrator invoked below, any cfg will
+ // be allowed to be removed, say cfg1. In the subsequent redeployment, both cfg2
+ // and cfg1 are now inactive.
+ //
+ // A proper solution would be to ensure the duper model is changed atomically
+ // with node states across all config servers. As this would require some work,
+ // we will instead verify here that there are 3 active config servers before
+ // allowing the removal of any config server.
+ return false;
+ }
+ } else if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) {
log.warning("Node " + node + " has been retired longer than " + retiredExpiry + ": Allowing removal. This may cause data loss");
return true;
}