summary | refs | log | tree | commit | diff | stats
path: root/clustercontroller-core
diff options
context:
space:
mode:
author: Tor Brede Vekterli <vekterli@yahoo-inc.com>, 2016-06-22 15:41:20 +0200
committer: Tor Brede Vekterli <vekterli@yahoo-inc.com>, 2016-06-22 15:41:20 +0200
commit: 594a017b330b5b13c374fb8a4785651d7da2d26a (patch)
tree: 14900ad655d03b6f7c02158dffddb202244d44df /clustercontroller-core
parent: 90c3b21d21d83f2cc4b148afe4d4e6278d1f8394 (diff)
Clarify predicate on isRpcAddressOutdated() for clearing node state
The logic is unchanged, but a comment has been added with the rationale and a cross-reference to the other method that we're trying to be symmetrical with in terms of state transition behavior.
Diffstat (limited to 'clustercontroller-core')
-rw-r--r-- clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java | 18
1 files changed, 14 insertions, 4 deletions
diff --git a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
index 2af9fe1f091..7edff399633 100644
--- a/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
+++ b/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/SystemStateGenerator.java
@@ -376,14 +376,24 @@ public class SystemStateGenerator {
// to be taken up at this point.
return false;
}
+ // There exists an edge in watchTimers where a node in Maintenance is implicitly
+ // transitioned into Down without being Down in either reported or wanted states
+ // iff isRpcAddressOutdated() is true. To avoid getting into an edge where we
+ // inadvertently clear this state because its reported/wanted states seem fine,
+ // we must also check if that particular edge could have happened. I.e. whether
+ // the node's RPC address is marked as outdated.
+ // It also makes sense in general to not allow taking a node back up automatically
+ // if its RPC connectivity appears to be bad.
+ if (info.isRpcAddressOutdated()) {
+ return false;
+ }
// Rationale: we can only enter this statement if the _current_ (generated) state
// of the node is Down. Aside from the group take-down logic, there should not exist
// any other edges in the cluster controller state transition logic where a node
- // may be set Down while both its reported state, RPC connectivity and wanted state
- // imply that a better state should already have been chosen. Consequently we allow
- // the node to have its Down-state cleared.
+ // may be set Down while both its reported state and wanted state imply that a better
+ // state should already have been chosen. Consequently we allow the node to have its
+ // Down-state cleared.
return (info.getReportedState().getState() != State.DOWN
- && !info.isRpcAddressOutdated()
&& !info.getWantedState().getState().oneOf("d"));
}