diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2021-03-15 11:25:56 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2021-03-15 11:25:56 +0100 |
commit | edefe87ca60b99db431055037a1e02b6ce3f9fbe (patch) | |
tree | 505d007f088f8031ebe25f9c8752047c88205260 /node-repository/src | |
parent | 8a593f1d3082b543cc488e627696d7b9cb8c05d1 (diff) |
Avoid eventual expiry of cfglike node
Diffstat (limited to 'node-repository/src')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java | 53 |
1 file changed, 28 insertions, 25 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index 93014b93669..c13c2f9731e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -27,7 +27,8 @@ import java.util.stream.Collectors; */ public class RetiredExpirer extends NodeRepositoryMaintainer { - public static final int NUM_CONFIG_SERVERS = 3; + private static final int NUM_CONFIG_SERVERS = 3; + private final Deployer deployer; private final Metric metric; private final Orchestrator orchestrator; @@ -87,29 +88,6 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { if (nodeRepository().nodes().list().childrenOf(node).asList().stream() .allMatch(child -> child.state() == Node.State.parked || child.state() == Node.State.failed)) { - - if (node.type().isConfigServerLike() && activeNodes.nodeType(node.type()).asSet().size() < NUM_CONFIG_SERVERS) { - // Scenario: All 3 controllers want to retire. - // - // Say RetiredExpirer runs on cfg1 and gives cfg2 permission to be removed (PERMANENTLY_DOWN in ZK). - // The consequent redeployment moves cfg2 to inactive, removing cfg2 from the application, - // and PERMANENTLY_DOWN for cfg2 is cleaned up. - // - // If the RetiredExpirer on cfg3 now runs before its InfrastructureProvisioner, then - // a. But the duper model still contains cfg2 - // b. The service model still monitors cfg2 for health and it is UP - // c. The Orchestrator has no host status (like PERMANENTLY_DOWN) for cfg2, - // which is equivalent to NO_REMARKS - // Therefore, from the point of view of the Orchestrator invoked below, any cfg will - // be allowed to be removed, say cfg1. In the subsequent redeployment, both cfg2 - // and cfg1 are now inactive. 
- // - // A proper solution would be to ensure the duper model is changed atomically - // with node states across all config servers. As this would require some work, - // we will instead verify here that there are 3 active config servers before - // allowing the removal of any config server. - return false; - } log.info("Host " + node + " has no non-parked/failed children"); return true; } @@ -117,7 +95,32 @@ public class RetiredExpirer extends NodeRepositoryMaintainer { return false; } - if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) { + if (node.type().isConfigServerLike()) { + // Avoid eventual expiry of configserver-like nodes + + if (activeNodes.nodeType(node.type()).asSet().size() < NUM_CONFIG_SERVERS) { + // Scenario: All 3 config servers want to retire. + // + // Say RetiredExpirer runs on cfg1 and gives cfg2 permission to be removed (PERMANENTLY_DOWN in ZK). + // The consequent redeployment moves cfg2 to inactive, removing cfg2 from the application, + // and PERMANENTLY_DOWN for cfg2 is cleaned up. + // + // If the RetiredExpirer on cfg3 now runs before its InfrastructureProvisioner, then + // a. But the duper model still contains cfg2 + // b. The service model still monitors cfg2 for health and it is UP + // c. The Orchestrator has no host status (like PERMANENTLY_DOWN) for cfg2, + // which is equivalent to NO_REMARKS + // Therefore, from the point of view of the Orchestrator invoked below, any cfg will + // be allowed to be removed, say cfg1. In the subsequent redeployment, both cfg2 + // and cfg1 are now inactive. + // + // A proper solution would be to ensure the duper model is changed atomically + // with node states across all config servers. As this would require some work, + // we will instead verify here that there are 3 active config servers before + // allowing the removal of any config server. 
+ return false; + } + } else if (node.history().hasEventBefore(History.Event.Type.retired, clock().instant().minus(retiredExpiry))) { log.warning("Node " + node + " has been retired longer than " + retiredExpiry + ": Allowing removal. This may cause data loss"); return true; } |