summaryrefslogtreecommitdiffstats
path: root/configd
diff options
context:
space:
mode:
authorArne Juul <arnej@yahoo-inc.com>2019-05-07 10:24:59 +0000
committerArne Juul <arnej@yahoo-inc.com>2019-05-07 12:07:44 +0000
commit9ef76bca3d69abd12226e6711172c6ffe529da16 (patch)
tree49b340d660c82f77338534a6c0b37dbc6161c8c7 /configd
parentd15201a325696298a415b1c854bbda8a79ecc2fe (diff)
re-implement delayed restart
* instead of sleeping in the forked process, make "restarting" a new service state and check if the restart penalty has been applied periodically * reset restart penalty on config generation changes, meaning application deployment triggers immediate startup of delayed services * also, keep removed services after re-configuration in a separate "orphans" list, to get better tracking of their final shutdown
Diffstat (limited to 'configd')
-rw-r--r--configd/src/apps/sentinel/config-handler.cpp26
-rw-r--r--configd/src/apps/sentinel/config-handler.h2
-rw-r--r--configd/src/apps/sentinel/service.cpp96
-rw-r--r--configd/src/apps/sentinel/service.h7
4 files changed, 97 insertions, 34 deletions
diff --git a/configd/src/apps/sentinel/config-handler.cpp b/configd/src/apps/sentinel/config-handler.cpp
index a434835c656..0f624da2bf0 100644
--- a/configd/src/apps/sentinel/config-handler.cpp
+++ b/configd/src/apps/sentinel/config-handler.cpp
@@ -155,6 +155,13 @@ ConfigHandler::doConfigure()
}
}
_services.swap(services);
+ for (auto & entry : services) {
+ Service::UP svc = std::move(entry.second);
+ if (svc && svc->isRunning()) {
+ svc->remove();
+ _orphans[entry.first] = std::move(svc);
+ }
+ }
vespalib::ComponentConfigProducer::Config current("sentinel", _subscriber.getGeneration(), "ok");
_stateApi.myComponents.addConfig(current);
}
@@ -168,7 +175,7 @@ ConfigHandler::doWork()
if (_subscriber.nextGeneration(0)) {
doConfigure();
}
-
+ handleRestarts();
handleCommands();
handleOutputs();
handleChildDeaths();
@@ -183,6 +190,16 @@ ConfigHandler::doWork()
return false;
}
+void
+ConfigHandler::handleRestarts()
+{
+ for (const auto & entry : _services) {
+ Service & svc = *(entry.second);
+ if (svc.wantsRestart()) {
+ svc.start();
+ }
+ }
+}
void
ConfigHandler::handleChildDeaths()
@@ -198,6 +215,7 @@ ConfigHandler::handleChildDeaths()
LOG(debug, "pid %d finished, Service:%s", (int)pid,
service->name().c_str());
service->youExited(status);
+ _orphans.erase(service->name());
} else {
LOG(warning, "Unknown child pid %d exited (wait-status = %d)",
(int)pid, status);
@@ -268,6 +286,12 @@ ConfigHandler::serviceByPid(pid_t pid)
return service;
}
}
+ for (const auto & it : _orphans) {
+ Service *service = it.second.get();
+ if (service->pid() == pid) {
+ return service;
+ }
+ }
return nullptr;
}
diff --git a/configd/src/apps/sentinel/config-handler.h b/configd/src/apps/sentinel/config-handler.h
index a1ae054f888..0af19a6363a 100644
--- a/configd/src/apps/sentinel/config-handler.h
+++ b/configd/src/apps/sentinel/config-handler.h
@@ -29,6 +29,7 @@ private:
ConfigSubscriber _subscriber;
ConfigHandle<SentinelConfig>::UP _sentinelHandle;
ServiceMap _services;
+ ServiceMap _orphans;
std::list<OutputConnection *> _outputConnections;
CommandQueue _cmdQ;
std::unique_ptr<RpcServer> _rpcServer;
@@ -46,6 +47,7 @@ private:
void handleCmd(const Cmd& cmd);
void handleOutputs();
void handleChildDeaths();
+ void handleRestarts();
static int listen(int port);
void configure_port(int port);
diff --git a/configd/src/apps/sentinel/service.cpp b/configd/src/apps/sentinel/service.cpp
index 20c2a6c9b00..941ea690024 100644
--- a/configd/src/apps/sentinel/service.cpp
+++ b/configd/src/apps/sentinel/service.cpp
@@ -72,11 +72,12 @@ Service::reconfigure(const SentinelConfig::Service& config)
delete _config;
_config = new SentinelConfig::Service(config);
- if (_isAutomatic
- && ((_state == READY) || (_state == FINISHED)))
- {
- LOG(debug, "%s: Restarting due to new config", name().c_str());
- start();
+ if ((_state == READY) || (_state == FINISHED) || (_state == RESTARTING)) {
+ resetRestartPenalty();
+ if (_isAutomatic) {
+ LOG(debug, "%s: Restarting due to new config", name().c_str());
+ start();
+ }
}
}
@@ -148,19 +149,14 @@ Service::runCommand(const std::string & command)
}
}
-int
+void
Service::start()
{
- // make sure the service does not restart in a tight loop:
- time_t now = time(0);
- int diff = now - _last_start;
- if (diff < MAX_RESTART_PENALTY) {
- incrementRestartPenalty();
+ if (_state == REMOVING) {
+ LOG(warning, "tried to start '%s' in REMOVING state", name().c_str());
+ return;
}
- if (diff > 10 * MAX_RESTART_PENALTY) {
- resetRestartPenalty();
- }
- now += _restartPenalty; // will delay start this much
+ time_t now = time(0);
_last_start = now;
// make a pipe, close the good ends of it, mark it close-on-exec
@@ -181,7 +177,7 @@ Service::start()
LOG(error, "%s: Attempted to start, but pipe() failed: %s", name().c_str(),
strerror(errno));
setState(FAILED);
- return -1;
+ return;
}
fflush(nullptr);
@@ -196,7 +192,7 @@ Service::start()
close(stdoutpipes[1]);
close(stderrpipes[0]);
close(stderrpipes[1]);
- return -1;
+ return;
}
if (_pid == 0) {
@@ -219,11 +215,6 @@ Service::start()
if (stop()) {
kill(getpid(), SIGTERM);
}
- if (_restartPenalty > 0) {
- LOG(info, "%s: Applying %u sec restart penalty", name().c_str(),
- _restartPenalty);
- sleep(_restartPenalty);
- }
EV_STARTING(name().c_str());
runChild(pipes); // This function should not return.
_exit(EXIT_FAILURE);
@@ -260,8 +251,15 @@ Service::start()
fcntl(stderrpipes[0], F_GETFL) | O_NONBLOCK);
c = new OutputConnection(stderrpipes[0], p);
_outputConnections.push_back(c);
+}
- return (_state == RUNNING) ? 0 : -1;
+void
+Service::remove()
+{
+ LOG(info, "%s: removed from config", name().c_str());
+ setAutomatic(false);
+ terminate(false, false);
+ setState(REMOVING);
}
@@ -290,18 +288,24 @@ Service::youExited(int status)
{
// Someone did a waitpid() and figured out that we exited.
_exitStatus = status;
+ bool expectedDeath = (_state == KILLING || _state == TERMINATING
+ || _state == REMOVING
+ || _state == KILLED || _state == TERMINATED);
if (WIFEXITED(status)) {
LOG(debug, "%s: Exited with exit code %d", name().c_str(),
WEXITSTATUS(status));
EV_STOPPED(name().c_str(), _pid, WEXITSTATUS(status));
setState(FINISHED);
} else if (WIFSIGNALED(status)) {
- bool expectedDeath = (_state == KILLING || _state == TERMINATING
- || _state == KILLED || _state == TERMINATED);
if (expectedDeath) {
EV_STOPPED(name().c_str(), _pid, WTERMSIG(status));
LOG(debug, "%s: Exited expectedly by signal %d", name().c_str(),
WTERMSIG(status));
+ if (_state == TERMINATING) {
+ setState(TERMINATED);
+ } else if (_state == KILLING) {
+ setState(KILLED);
+ }
} else {
EV_CRASH(name().c_str(), _pid, WTERMSIG(status));
setState(FAILED);
@@ -316,18 +320,26 @@ Service::youExited(int status)
_metrics.currentlyRunningServices--;
_metrics.sentinel_running.sample(_metrics.currentlyRunningServices);
- if (_state == TERMINATING) {
- setState(TERMINATED);
- } else if (_state == KILLING) {
- setState(KILLED);
+ if (! expectedDeath) {
+ // make sure the service does not restart in a tight loop:
+ time_t now = time(0);
+ unsigned int diff = now - _last_start;
+ if (diff < MAX_RESTART_PENALTY) {
+ incrementRestartPenalty();
+ }
+ if (diff > 10 * MAX_RESTART_PENALTY) {
+ resetRestartPenalty();
+ }
+ if (diff < _restartPenalty) {
+ LOG(info, "%s: will delay start by %u seconds", name().c_str(), _restartPenalty - diff);
+ }
}
if (_isAutomatic && !stop()) {
// ### Implement some rate limiting here maybe?
LOG(debug, "%s: Restarting.", name().c_str());
- setState(READY);
+ setState(RESTARTING);
_metrics.totalRestartsCounter++;
_metrics.sentinel_restarts.add();
- start();
}
}
@@ -391,6 +403,8 @@ Service::isRunning() const
case KILLED:
case TERMINATED:
case FAILED:
+ case RESTARTING:
+ case REMOVING:
return false;
case STARTING:
@@ -402,6 +416,19 @@ Service::isRunning() const
return true; // this will not be reached
}
+bool
+Service::wantsRestart() const
+{
+ if (_state == RESTARTING) {
+ time_t now = time(0);
+ if (now > _last_start + _restartPenalty) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
void
Service::setAutomatic(bool autoStatus)
{
@@ -418,12 +445,17 @@ Service::incrementRestartPenalty()
if (_restartPenalty > MAX_RESTART_PENALTY) {
_restartPenalty = MAX_RESTART_PENALTY;
}
+ LOG(info, "%s: incremented restart penalty to %u seconds", name().c_str(), _restartPenalty);
}
void
Service::setState(ServiceState state)
{
+ if (_state == REMOVING) {
+ // ignore further changes
+ return;
+ }
if (state != _state) {
LOG(debug, "%s: %s->%s", name().c_str(), stateName(_state), stateName(state));
_rawState = state;
@@ -448,6 +480,8 @@ Service::stateName(ServiceState state) const
case TERMINATED: return "TERMINATED";
case KILLED: return "KILLED";
case FAILED: return "FAILED";
+ case RESTARTING: return "RESTARTING";
+ case REMOVING: return "REMOVING";
}
return "--BAD--";
}
diff --git a/configd/src/apps/sentinel/service.h b/configd/src/apps/sentinel/service.h
index 54bf1105a77..5a188f217ff 100644
--- a/configd/src/apps/sentinel/service.h
+++ b/configd/src/apps/sentinel/service.h
@@ -20,13 +20,14 @@ private:
pid_t _pid;
enum ServiceState { READY, STARTING, RUNNING, TERMINATING, KILLING,
+ RESTARTING, REMOVING,
FINISHED, TERMINATED, KILLED, FAILED } _rawState;
const enum ServiceState& _state;
int _exitStatus;
SentinelConfig::Service *_config;
bool _isAutomatic;
- static const int MAX_RESTART_PENALTY = 1800;
+ static const unsigned int MAX_RESTART_PENALTY = 1800;
unsigned int _restartPenalty;
time_t _last_start;
@@ -55,11 +56,13 @@ public:
int terminate() {
return terminate(true, false);
}
- int start();
+ void start();
+ void remove();
void youExited(int status); // Call this if waitpid says it exited
const vespalib::string & name() const;
const char *stateName() const { return stateName(_state); }
bool isRunning() const;
+ bool wantsRestart() const;
int exitStatus() const { return _exitStatus; }
const SentinelConfig::Service& serviceConfig() const { return *_config; }
void setAutomatic(bool autoStatus);