diff options
author | Arne Juul <arnej@yahooinc.com> | 2023-05-12 10:33:01 +0000 |
---|---|---|
committer | Arne Juul <arnej@yahooinc.com> | 2023-05-12 10:33:01 +0000 |
commit | 93cdbc4255040bca62645df21d7b210910cdd6ac (patch) | |
tree | 772c0bc29d142af119b11525ef6e4348f897782e /client/go/internal/admin/vespa-wrapper/configserver/check.go | |
parent | 5a80d77bc5d6df103e9beb7ca9b21fd8b8670234 (diff) |
wait for ping OK to majority of configserver hosts
Diffstat (limited to 'client/go/internal/admin/vespa-wrapper/configserver/check.go')
-rw-r--r-- | client/go/internal/admin/vespa-wrapper/configserver/check.go | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/client/go/internal/admin/vespa-wrapper/configserver/check.go b/client/go/internal/admin/vespa-wrapper/configserver/check.go index a0248dd128f..2a444019261 100644 --- a/client/go/internal/admin/vespa-wrapper/configserver/check.go +++ b/client/go/internal/admin/vespa-wrapper/configserver/check.go @@ -5,10 +5,15 @@ package configserver import ( "fmt" + "os" + "strings" + "time" "github.com/vespa-engine/vespa/client/go/internal/admin/defaults" + "github.com/vespa-engine/vespa/client/go/internal/admin/envvars" "github.com/vespa-engine/vespa/client/go/internal/admin/trace" "github.com/vespa-engine/vespa/client/go/internal/util" + "github.com/vespa-engine/vespa/client/go/internal/vespa" ) func checkIsConfigserver(myname string) { @@ -22,3 +27,91 @@ func checkIsConfigserver(myname string) { trace.Warning("only these hosts should run a config server:", onlyHosts) util.JustExitMsg(fmt.Sprintf("this host [%s] should not run a config server", myname)) } + +type pingChecker struct { + hostNames []string + lastErr map[string]error + lastOut map[string]string + backticks util.BackTicks +} + +func (pc *pingChecker) ping(hostname string) bool { + out, err := pc.backticks.Run("ping", "-c", "1", "-q", hostname) + pc.lastErr[hostname] = err + pc.lastOut[hostname] = strings.TrimSuffix(out, "\n") + return err == nil +} + +func (pc *pingChecker) pingAll() { + for _, hn := range pc.hostNames { + pc.ping(hn) + } +} + +func (pc *pingChecker) countOk() int { + isOk := 0 + for _, err := range pc.lastErr { + if err == nil { + isOk++ + } + } + return isOk +} + +func (pc *pingChecker) requiredOk() int { + return len(pc.hostNames)/2 + 1 +} + +func (pc *pingChecker) printErrors() { + for hn, err := range pc.lastErr { + if err != nil { + out := pc.lastOut[hn] + trace.Warning("failed to 'ping' host:", hn, "=>", err, "command output:", out) + } + } +} + +func waitForDnsResolving() { + onlyHosts := defaults.VespaConfigserverHosts() + if len(onlyHosts) < 2 { + // no wait in single-node case + return + } + if os.Getenv(envvars.VESPA_SKIP_PING) != "" { + trace.Debug("skipping DNS resolution check") + return + } + helper := pingChecker{ + hostNames: onlyHosts, + lastErr: make(map[string]error), + lastOut: make(map[string]string), + backticks: util.BackTicksWithStderr, + } + myname, _ := vespa.FindOurHostname() + if !helper.ping(myname) { + trace.Warning("self-ping failed, consider skipping this check") + } + trace.Debug("check DNS resolution, require", helper.requiredOk(), "OK answers") + for i := 0; i < 180; i++ { + helper.pingAll() + isOk := helper.countOk() + if isOk >= helper.requiredOk() { + if i > 2 || isOk < len(onlyHosts) { + trace.Info("successful 'ping' of", isOk, "configservers after", i, "retries") + } + helper.printErrors() + return + } + if i%10 == 2 { + trace.Warning("waiting for successful 'ping' of configservers", onlyHosts) + } + if i%40 == 3 { + helper.printErrors() + } + if i == 2 { + trace.Warning(fmt.Sprintf("set %s=true in environment to skip this check", envvars.VESPA_SKIP_PING)) + } + time.Sleep(1000 * time.Millisecond) + } + util.JustExitMsg("Giving up waiting for working 'ping' of enough configservers") +} |