summary | refs | log | tree | commit | diff | stats
path: root/clustercontroller-core
diff options
context:
space:
mode:
Diffstat (limited to 'clustercontroller-core')
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java | 18
1 files changed, 14 insertions, 4 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
index 2af9fe1f091..7edff399633 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
@@ -376,14 +376,24 @@ public class SystemStateGenerator {
// to be taken up at this point.
return false;
}
+ // There exists an edge in watchTimers where a node in Maintenance is implicitly
+ // transitioned into Down without being Down in either reported or wanted states
+ // iff isRpcAddressOutdated() is true. To avoid getting into an edge where we
+ // inadvertently clear this state because its reported/wanted states seem fine,
+ // we must also check if that particular edge could have happened. I.e. whether
+ // the node's RPC address is marked as outdated.
+ // It also makes sense in general to not allow taking a node back up automatically
+ // if its RPC connectivity appears to be bad.
+ if (info.isRpcAddressOutdated()) {
+ return false;
+ }
// Rationale: we can only enter this statement if the _current_ (generated) state
// of the node is Down. Aside from the group take-down logic, there should not exist
// any other edges in the cluster controller state transition logic where a node
- // may be set Down while both its reported state, RPC connectivity and wanted state
- // imply that a better state should already have been chosen. Consequently we allow
- // the node to have its Down-state cleared.
+ // may be set Down while both its reported state and wanted state imply that a better
+ // state should already have been chosen. Consequently we allow the node to have its
+ // Down-state cleared.
return (info.getReportedState().getState() != State.DOWN
- && !info.isRpcAddressOutdated()
&& !info.getWantedState().getState().oneOf("d"));
}