diff options
-rw-r--r-- | config-model/src/main/java/com/yahoo/vespa/model/ConfigSentinel.java | 8 | ||||
-rw-r--r-- | configd/src/apps/sentinel/connectivity.cpp | 22 | ||||
-rw-r--r-- | configd/src/apps/sentinel/connectivity.h | 2 | ||||
-rw-r--r-- | configdefinitions/src/vespa/sentinel.def | 8 |
4 files changed, 20 insertions, 20 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/ConfigSentinel.java b/config-model/src/main/java/com/yahoo/vespa/model/ConfigSentinel.java index 800bf73cdbb..d05913143e4 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/ConfigSentinel.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/ConfigSentinel.java @@ -86,11 +86,11 @@ public class ConfigSentinel extends AbstractService implements SentinelConfig.Pr private SentinelConfig.Connectivity.Builder getConnectivityConfig(boolean enable) { var builder = new SentinelConfig.Connectivity.Builder(); if (enable) { - builder.maxBadOutPercent(60); - builder.maxBadReverseCount(3); + builder.minOkPercent(40); + builder.maxBadCount(3); } else { - builder.maxBadOutPercent(100); - builder.maxBadReverseCount(Integer.MAX_VALUE); + builder.minOkPercent(0); + builder.maxBadCount(Integer.MAX_VALUE); } return builder; } diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp index 8d1aa0e9673..2cdd6f8a914 100644 --- a/configd/src/apps/sentinel/connectivity.cpp +++ b/configd/src/apps/sentinel/connectivity.cpp @@ -120,8 +120,8 @@ void classifyConnFails(ConnectivityMap &connectivityMap, void Connectivity::configure(const SentinelConfig::Connectivity &config) { _config = config; - LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount); - LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent); + LOG(config, "connectivity.maxBadCount = %d", _config.maxBadCount); + LOG(config, "connectivity.minOkPercent = %d", _config.minOkPercent); if (auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS)) { _checkSpecs = specsFrom(*up); } @@ -165,31 +165,31 @@ void Connectivity::accumulate(Accumulated &target, CcResult value) { case CcResult::UNREACHABLE_UP: case CcResult::INDIRECT_PING_FAIL: ++target.numSeriousIssues; - ++target.numIssues; break; case CcResult::CONN_FAIL: - ++target.numIssues; + // not OK, but not a serious issue either break; case CcResult::INDIRECT_PING_UNAVAIL: case CcResult::ALL_OK: + ++target.numUpAndOk; break; } } bool Connectivity::enoughOk(const Accumulated &results, size_t clusterSize) { bool enough = true; - if (results.numSeriousIssues > size_t(_config.maxBadReverseCount)) { + if (results.numSeriousIssues > size_t(_config.maxBadCount)) { LOG(warning, "%zu of %zu nodes up but with network connectivity problems (max is %d)", - results.numSeriousIssues, clusterSize, _config.maxBadReverseCount); + results.numSeriousIssues, clusterSize, _config.maxBadCount); enough = false; } - if (results.numIssues * 100.0 > _config.maxBadOutPercent * clusterSize) { - double pct = results.numIssues * 100.0 / clusterSize; - LOG(warning, "Problems with connection to %zu of %zu nodes, %.1f%% (max is %d%%)", - results.numIssues, clusterSize, pct, _config.maxBadOutPercent); + if (results.numUpAndOk * 100.0 < _config.minOkPercent * clusterSize) { + double pct = results.numUpAndOk * 100.0 / clusterSize; + LOG(warning, "Only %zu of %zu nodes are up and OK, %.1f%% (min is %d%%)", + results.numUpAndOk, clusterSize, pct, _config.minOkPercent); enough = false; } - if (results.numIssues == 0) { + if (results.numUpAndOk == clusterSize) { LOG(info, "All connectivity checks OK, proceeding with service startup"); } else if (enough) { LOG(info, "Enough connectivity checks OK, proceeding with service startup"); diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h index a1e454a255a..6b0e2d7523e 100644 --- a/configd/src/apps/sentinel/connectivity.h +++ b/configd/src/apps/sentinel/connectivity.h @@ -28,7 +28,7 @@ public: bool checkConnectivity(RpcServer &rpcServer); private: struct Accumulated { - size_t numIssues = 0; + size_t numUpAndOk = 0; size_t numSeriousIssues = 0; }; void accumulate(Accumulated &target, CcResult value); diff --git a/configdefinitions/src/vespa/sentinel.def b/configdefinitions/src/vespa/sentinel.def index 45ef9b21cfd..cf19e701717 100644 --- a/configdefinitions/src/vespa/sentinel.def +++ b/configdefinitions/src/vespa/sentinel.def @@ -22,11 +22,11 @@ application.region string default="default" # those that can connect back to us. We delay starting services # if we have more problems than the following limits allow: -## Percentage we fail to talk to, maximum -connectivity.maxBadOutPercent int default=100 +## Percentage of nodes that must be up and fully OK, minimum +connectivity.minOkPercent int default=0 -## Absolute number of nodes that fail to talk back to us, maximum -connectivity.maxBadReverseCount int default=999999999 +## Absolute number of nodes with confirmed network connectivity problems, maximum +connectivity.maxBadCount int default=999999999 ## The command to run. This will be run by sh -c, and the following ## environment variables are defined: $ROOT, $VESPA_SERVICE_NAME, |