diff options
Diffstat (limited to 'configd/src/apps/sentinel/service.cpp')
-rw-r--r-- | configd/src/apps/sentinel/service.cpp | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/configd/src/apps/sentinel/service.cpp b/configd/src/apps/sentinel/service.cpp new file mode 100644 index 00000000000..9a1f3dc9c82 --- /dev/null +++ b/configd/src/apps/sentinel/service.cpp @@ -0,0 +1,432 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <sys/types.h> +#include <signal.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/wait.h> + +#include <vespa/log/log.h> +LOG_SETUP(".service", "$Id$"); +#include <vespa/log/llparser.h> + +#include "service.h" +#include "output-connection.h" + +extern sig_atomic_t stop; + +namespace config { +namespace sentinel { + + +Service::Service(const SentinelConfig::Service& service, const SentinelConfig::Application& application, + std::list<OutputConnection *> &ocs, StartMetrics &metrics) + : _pid(-1), + _rawState(READY), + _state(_rawState), + _exitStatus(0), + _config(new SentinelConfig::Service(service)), + _isAutomatic(true), + _restartPenalty(0), + _last_start(0), + _application(application), + _outputConnections(ocs), + _metrics(metrics) +{ + LOG(debug, "%s: created", name().c_str()); + LOG(debug, "autostart: %s", _config->autostart ? "YES" : "NO"); + LOG(debug, " restart: %s", _config->autorestart ? "YES" : "NO"); + LOG(debug, " command: %s", _config->command.c_str()); + LOG(debug, " configid: %s", _config->id.c_str()); + + if (_config->autostart) { + start(); + } +} + +void +Service::reconfigure(const SentinelConfig::Service& config) +{ + if (config.command != _config->command) { + LOG(debug, "%s: reconfigured command '%s' -> '%s' - this will " + "take effect at next restart", name().c_str(), + _config->command.c_str(), config.command.c_str()); + } + if (config.autostart != _config->autostart) { + LOG(debug, "%s: reconfigured autostart %s", name().c_str(), + config.autostart ? "OFF -> ON" : "ON -> OFF"); + } + if (config.autorestart != _config->autorestart) { + LOG(debug, "%s: reconfigured autorestart %s", name().c_str(), + config.autorestart ? "OFF -> ON" : "ON -> OFF"); + } + if (config.id != _config->id) { + LOG(warning, "%s: reconfigured config id '%s' -> '%s' - signaling service restart", + name().c_str(), _config->id.c_str(), config.id.c_str()); + terminate(true); + } + + delete _config; + _config = new SentinelConfig::Service(config); + + if (_isAutomatic + && ((_config->autostart && _state == READY) + || (_config->autorestart && _state == FINISHED))) + { + LOG(debug, "%s: Restarting due to new config", name().c_str()); + start(); + } +} + +Service::~Service() +{ + terminate(false); + delete _config; +} + +int +Service::terminate(bool catchable) +{ + if (isRunning()) { + runPreShutdownCommand(); + LOG(debug, "%s: terminate(%s)", name().c_str(), catchable ? "cleanly" : "NOW"); + resetRestartPenalty(); + if (catchable) { + setState(TERMINATING); + int ret = kill(_pid, SIGTERM); + LOG(debug, "%s: kill -SIGTERM %d: %s", name().c_str(), (int)_pid, + ret == 0 ? "OK" : strerror(errno)); + return ret; + } else { + setState(KILLING); + kill(_pid, SIGCONT); // if it was stopped for some reason + int ret = kill(_pid, SIGKILL); + LOG(debug, "%s: kill -SIGKILL %d: %s", name().c_str(), (int)_pid, + ret == 0 ? "OK" : strerror(errno)); + return ret; + } + } + + return 0; // Not running, so all is ok. +} + +void +Service::runPreShutdownCommand() +{ + if (_config->preShutdownCommand.length() > 0) { + LOG(debug, "%s: runPreShutdownCommand(%s)", name().c_str(), _config->preShutdownCommand.c_str()); + runCommand(_config->preShutdownCommand); + } +} + +void +Service::runCommand(const std::string & command) +{ + int ret = system(command.c_str()); + if (ret != 0) { + LOG(info, "%s: unable to run showdown command (%s): %d (%s)", name().c_str(), command.c_str(), ret, strerror(ret)); + } +} + +int +Service::start() +{ + // make sure the service does not restart in a tight loop: + time_t now = time(0); + int diff = now - _last_start; + if (diff < 10) { + incrementRestartPenalty(); + now += _restartPenalty; // will delay start this much + } + _last_start = now; + +// make a pipe, close the good ends of it, mark it close-on-exec +// if exec fails, write a complaint on the fd (which will then be read +// by mother program). +// +// Return 0 on success, -1 on failure + setState(STARTING); + + int pipes[2]; + int err = pipe(pipes); + int stdoutpipes[2]; + err |= pipe(stdoutpipes); + int stderrpipes[2]; + err |= pipe(stderrpipes); + + if (err == -1) { + LOG(error, "%s: Attempted to start, but pipe() failed: %s", name().c_str(), + strerror(errno)); + setState(FAILED); + return -1; + } + + fflush(NULL); + _pid = fork(); + if (_pid == -1) { + LOG(error, "%s: Attempted to start, but fork() failed: %s", name().c_str(), + strerror(errno)); + setState(FAILED); + close(pipes[0]); + close(pipes[1]); + close(stdoutpipes[0]); + close(stdoutpipes[1]); + close(stderrpipes[0]); + close(stderrpipes[1]); + return -1; + } + + if (_pid == 0) { + close(pipes[0]); // Close reading end + close(stdoutpipes[0]); + close(stderrpipes[0]); + + close(1); + dup2(stdoutpipes[1], 1); + close(stdoutpipes[1]); + + close(2); + dup2(stderrpipes[1], 2); + close(stderrpipes[1]); + + LOG(debug, "%s: Started as pid %d", name().c_str(), + static_cast<int>(getpid())); + signal(SIGTERM, SIG_DFL); + signal(SIGINT, SIG_DFL); + if (stop) { + kill(getpid(), SIGTERM); + } + if (_restartPenalty > 0) { + LOG(debug, "%s: Applying %u sec restart penalty", name().c_str(), + _restartPenalty); + sleep(_restartPenalty); + } + EV_STARTING(name().c_str()); + runChild(pipes); // This function should not return. + _exit(EXIT_FAILURE); + } + + close(pipes[1]); // close writing end + close(stdoutpipes[1]); + close(stderrpipes[1]); + + // do not call ensureChildRuns, as the pipe magic did not work as intended + // This also ensures that the process does not wait while the service process waits in penalty. + // ensureChildRuns(pipes[0]); // This will wait until the execl goes through + setState(RUNNING); + _metrics.currentlyRunningServices++; + close(pipes[0]); // close reading end + + using ns_log::LLParser; + LLParser *p = new LLParser(); + p->setService(_config->name.c_str()); + p->setComponent("stdout"); + p->setPid(_pid); + fcntl(stdoutpipes[0], F_SETFL, + fcntl(stdoutpipes[0], F_GETFL) | O_NONBLOCK); + OutputConnection *c = new OutputConnection(stdoutpipes[0], p); + _outputConnections.push_back(c); + + p = new LLParser(); + p->setService(_config->name.c_str()); + p->setComponent("stderr"); + p->setPid(_pid); + p->setDefaultLevel(ns_log::Logger::warning); + fcntl(stderrpipes[0], F_SETFL, + fcntl(stderrpipes[0], F_GETFL) | O_NONBLOCK); + c = new OutputConnection(stderrpipes[0], p); + _outputConnections.push_back(c); + + return (_state == RUNNING) ? 0 : -1; +} + + +// TODO: Garbage collect this, since it did not work as intended when execl'ing /bin/sh +void +Service::ensureChildRuns(int fd) +{ + char buf[200]; + int len; + do { + len = read(fd, buf, sizeof buf); + } while (len == -1 && errno == EINTR); + if (len > 0) { + // Failed to do an execl.. pick up the remains + _exitStatus = 0; + waitpid(_pid, &_exitStatus, 0); + setState(FAILED); + } else { + setState(RUNNING); + } +} + + +void +Service::youExited(int status) +{ + // Someone did a waitpid() and figured out that we exited. + _exitStatus = status; + if (WIFEXITED(status)) { + LOG(debug, "%s: Exited with exit code %d", name().c_str(), + WEXITSTATUS(status)); + EV_STOPPED(name().c_str(), _pid, WEXITSTATUS(status)); + setState(FINISHED); + } else if (WIFSIGNALED(status)) { + bool expectedDeath = (_state == KILLING || _state == TERMINATING + || _state == KILLED || _state == TERMINATED); + if (expectedDeath) { + EV_STOPPED(name().c_str(), _pid, WTERMSIG(status)); + LOG(debug, "%s: Exited expectedly by signal %d", name().c_str(), + WTERMSIG(status)); + } else { + EV_CRASH(name().c_str(), _pid, WTERMSIG(status)); + setState(FAILED); + } + } else if (WIFSTOPPED(status)) { + LOG(warning, "%s: STOPPED by signal %d!", name().c_str(), WSTOPSIG(status)); + setState(FAILED); + } else { + LOG(error, "%s: Weird exit code %d", name().c_str(), status); + setState(FAILED); + } + _metrics.currentlyRunningServices--; + + if (_state == TERMINATING) { + setState(TERMINATED); + } else if (_state == KILLING) { + setState(KILLED); + } + if (_isAutomatic && _config->autorestart && !stop) { + // ### Implement some rate limiting here maybe? + LOG(debug, "%s: Has autorestart flag, restarting.", name().c_str()); + setState(READY); + _metrics.totalRestartsCounter++; + _metrics.totalRestartsLastPeriod++; + start(); + } +} + +void +Service::runChild(int pipes[2]) +{ + // child process - this should exec or signal error + for (int n = 3; n < 1024; ++n) { // Close all open fds on exec() + fcntl(n, F_SETFD, FD_CLOEXEC); + } + + // TODO: Garbage collect the clever pipes magic, as it does not work when the execl target is /bin/sh + fcntl(pipes[1], F_SETFD, FD_CLOEXEC); // close on exec() + + // Set up environment + setenv("VESPA_SERVICE_NAME", _config->name.c_str(), 1); + setenv("VESPA_CONFIG_ID", _config->id.c_str(), 1); + setenv("VESPA_APPLICATION_TENANT", _application.tenant.c_str(), 1); + setenv("VESPA_APPLICATION_NAME", _application.name.c_str(), 1); + setenv("VESPA_APPLICATION_ENVIRONMENT", _application.environment.c_str(), 1); + setenv("VESPA_APPLICATION_REGION", _application.region.c_str(), 1); + setenv("VESPA_APPLICATION_INSTANCE", _application.instance.c_str(), 1); + if (_config->affinity.cpuSocket >= 0) { + setenv("VESPA_AFFINITY_CPU_SOCKET", std::to_string(_config->affinity.cpuSocket).c_str(), 1); + } + // ROOT is already set + + // Set up file descriptor 0 (1 and 2 should be setup already) + close(0); + int fd = open("/dev/null", O_RDONLY | O_NOCTTY, 0666); + if (fd != 0) { + char buf[200]; + snprintf(buf, sizeof buf, "open /dev/null for fd 0: got %d " + "(%s)", fd, strerror(errno)); + write(pipes[1], buf, strlen(buf)); + _exit(EXIT_FAILURE); + } + fcntl(0, F_SETFD, 0); // Don't close on exec + + execl("/bin/sh", "/bin/sh", "-c", _config->command.c_str(), NULL); + + char buf[200]; + snprintf(buf, sizeof buf, "exec error: %s for /bin/sh -c '%s'", + strerror(errno), _config->command.c_str()); + write(pipes[1], buf, strlen(buf)); + _exit(EXIT_FAILURE); +} + +const vespalib::string & +Service::name() const +{ + return _config->name; +} + +bool +Service::isRunning() const +{ + switch (_state) { + case READY: + case FINISHED: + case KILLED: + case TERMINATED: + case FAILED: + return false; + + case STARTING: + case RUNNING: + case TERMINATING: + case KILLING: + return true; + } + return true; // this will not be reached +} + +void +Service::setAutomatic(bool autoStatus) +{ + _isAutomatic = autoStatus; + resetRestartPenalty(); +} + + +void +Service::incrementRestartPenalty() +{ + if (_restartPenalty < MAX_RESTART_PENALTY) { + _restartPenalty++; + } else { + _restartPenalty = MAX_RESTART_PENALTY; + } +} + + +void +Service::setState(ServiceState state) +{ + if (state != _state) { + LOG(debug, "%s: %s->%s", name().c_str(), stateName(_state), stateName(state)); + _rawState = state; + } + + // penalize failed services + if (state == FAILED) { + incrementRestartPenalty(); + } +} + +const char * +Service::stateName(ServiceState state) const +{ + switch (state) { + case READY: return "READY"; + case STARTING: return "STARTING"; + case RUNNING: return "RUNNING"; + case TERMINATING: return "TERMINATING"; + case KILLING: return "KILLING"; + case FINISHED: return "FINISHED"; + case TERMINATED: return "TERMINATED"; + case KILLED: return "KILLED"; + case FAILED: return "FAILED"; + } + return "--BAD--"; +} + + +} // end namespace sentinel +} // end namespace config |