diff options
author | Håkon Hallingstad <hakon@oath.com> | 2019-01-25 09:27:12 +0100 |
---|---|---|
committer | Håkon Hallingstad <hakon@oath.com> | 2019-01-25 09:27:12 +0100 |
commit | cd7f0447817eaf21898f172bdc2a4fc8bb721d1a (patch) | |
tree | dd02bf7436bbf07671cd4b2f115eb9ce943b7ce2 /service-monitor/src/main | |
parent | ee29f449256f4d9d21abe8e1c461399b2cb303ca (diff) |
Metadata about /state/v1/health status
The service monitor uses /state/v1/health to monitor config servers and the
host admins (but not yet tenant host admins).
This commit adds some metadata about the status of a service:
- The time the status was last checked
- The time the status changed to its current value
This can be used to make more intelligent decisions in the Orchestrator,
e.g. only allowing a service to suspend if it has been DOWN longer than X
seconds (to prevent a spurious DOWN from breaking redundancy and uptime guarantees).
Diffstat (limited to 'service-monitor/src/main')
12 files changed, 73 insertions, 80 deletions
diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/ApplicationHealthMonitor.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/ApplicationHealthMonitor.java index 5eac6fbb000..e728d1ea914 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/ApplicationHealthMonitor.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/ApplicationHealthMonitor.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.service.model.ServiceId; import com.yahoo.vespa.service.monitor.ServiceStatusProvider; @@ -47,14 +48,14 @@ class ApplicationHealthMonitor implements ServiceStatusProvider, AutoCloseable { } @Override - public ServiceStatus getStatus(ApplicationId applicationId, - ClusterId clusterId, - ServiceType serviceType, - ConfigId configId) { + public ServiceStatusInfo getStatus(ApplicationId applicationId, + ClusterId clusterId, + ServiceType serviceType, + ConfigId configId) { ServiceId serviceId = new ServiceId(applicationId, clusterId, serviceType, configId); HealthMonitor monitor = monitors.get(serviceId); if (monitor == null) { - return ServiceStatus.NOT_CHECKED; + return new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); } return monitor.getStatus(); diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthInfo.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthInfo.java index 17d9e6b7b49..17670c705d8 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthInfo.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthInfo.java @@ -1,10 +1,8 @@ // Copyright 2018 Yahoo Holdings. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.service.health; -import com.yahoo.vespa.applicationmodel.ServiceStatus; import com.yahoo.yolean.Exceptions; -import java.time.Instant; import java.util.Optional; import java.util.OptionalInt; @@ -19,7 +17,6 @@ public class HealthInfo { private final Optional<Exception> exception; private final OptionalInt httpStatusCode; private final Optional<String> healthStatusCode; - private final Instant time; static HealthInfo fromException(Exception exception) { return new HealthInfo(Optional.of(exception), OptionalInt.empty(), Optional.empty()); @@ -33,39 +30,18 @@ public class HealthInfo { return new HealthInfo(Optional.empty(), OptionalInt.empty(), Optional.of(healthStatusCode)); } - static HealthInfo empty() { - return new HealthInfo(Optional.empty(), OptionalInt.empty(), Optional.empty()); - } - - private HealthInfo(Optional<Exception> exception, - OptionalInt httpStatusCode, - Optional<String> healthStatusCode) { + private HealthInfo(Optional<Exception> exception, OptionalInt httpStatusCode, Optional<String> healthStatusCode) { this.exception = exception; this.httpStatusCode = httpStatusCode; this.healthStatusCode = healthStatusCode; - this.time = Instant.now(); } public boolean isHealthy() { return healthStatusCode.map(UP_STATUS_CODE::equals).orElse(false); } - public ServiceStatus toServiceStatus() { - // Bootstrapping ServiceStatus: To avoid thundering herd problem at startup, - // the clients will not fetch the health immediately. What should the ServiceStatus - // be before the first health has been fetched? - // - // NOT_CHECKED: Logically the right thing, but if an Orchestrator gets a suspend request - // in this window, and another service within the cluster is down, it ends up allowing - // suspension when it shouldn't have done so. 
- // - // DOWN: Only safe initial value, possibly except if the first initial delay is long, - // as that could indicate it has been down for too long. - return isHealthy() ? ServiceStatus.UP : ServiceStatus.DOWN; - } - - public Instant time() { - return time; + public Optional<String> getErrorDescription() { + return isHealthy() ? Optional.empty() : Optional.of(toString()); } @Override diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitor.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitor.java index f0e13548f58..14ec977c98c 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitor.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitor.java @@ -1,13 +1,13 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.service.health; -import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; /** * @author hakonhall */ interface HealthMonitor extends AutoCloseable { - ServiceStatus getStatus(); + ServiceStatusInfo getStatus(); @Override void close(); diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitorManager.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitorManager.java index ddfbdf59b3c..b802c6c5413 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitorManager.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthMonitorManager.java @@ -7,6 +7,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; import 
com.yahoo.vespa.flags.FlagSource; import com.yahoo.vespa.flags.Flags; @@ -97,20 +98,20 @@ public class HealthMonitorManager implements MonitorManager { } @Override - public ServiceStatus getStatus(ApplicationId applicationId, - ClusterId clusterId, - ServiceType serviceType, - ConfigId configId) { + public ServiceStatusInfo getStatus(ApplicationId applicationId, + ClusterId clusterId, + ServiceType serviceType, + ConfigId configId) { ApplicationHealthMonitor monitor = healthMonitors.get(applicationId); if (!monitorTenantHostHealth && ZoneApplication.isNodeAdminService(applicationId, clusterId, serviceType)) { // Legacy: The zone app is not health monitored (monitor == null), but the node-admin cluster's services // are hard-coded to be UP - return ServiceStatus.UP; + return new ServiceStatusInfo(ServiceStatus.UP); } if (monitor == null) { - return ServiceStatus.NOT_CHECKED; + return new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); } if (monitorTenantHostHealth && applicationId.equals(ZoneApplication.getApplicationId())) { @@ -120,7 +121,7 @@ public class HealthMonitorManager implements MonitorManager { if (ZoneApplication.isNodeAdminService(applicationId, clusterId, serviceType)) { return monitor.getStatus(applicationId, clusterId, serviceType, configId); } else { - return ServiceStatus.NOT_CHECKED; + return new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); } } diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthUpdater.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthUpdater.java index 4ed49e17e9f..8101336c638 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthUpdater.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/HealthUpdater.java @@ -1,14 +1,15 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.service.health; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.service.executor.Runlet; /** * A {@link HealthUpdater} will probe the health with {@link #run()}, whose result can be fetched with the - * thread-safe method {@link #getLatestHealthInfo()}. + * thread-safe method {@link #getServiceStatusInfo()}. * * @author hakonhall */ interface HealthUpdater extends Runlet { - HealthInfo getLatestHealthInfo(); + ServiceStatusInfo getServiceStatusInfo(); } diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthMonitor.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthMonitor.java index d37797c7be9..7a6494e0122 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthMonitor.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthMonitor.java @@ -1,7 +1,7 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.service.health; -import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.service.executor.Cancellable; import com.yahoo.vespa.service.executor.RunletExecutor; @@ -22,8 +22,8 @@ class StateV1HealthMonitor implements HealthMonitor { } @Override - public ServiceStatus getStatus() { - return updater.getLatestHealthInfo().toServiceStatus(); + public ServiceStatusInfo getStatus() { + return updater.getServiceStatusInfo(); } @Override diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthUpdater.java b/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthUpdater.java index 011ec3b3212..972e81ce822 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthUpdater.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/health/StateV1HealthUpdater.java @@ -1,8 +1,13 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.service.health; +import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; + import java.net.URL; import java.time.Duration; +import java.time.Instant; +import java.util.Optional; /** * @author hakonhall @@ -10,7 +15,7 @@ import java.time.Duration; class StateV1HealthUpdater implements HealthUpdater { private final StateV1HealthClient healthClient; - private volatile HealthInfo lastHealthInfo = HealthInfo.empty(); + private volatile ServiceStatusInfo serviceStatusInfo = new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); StateV1HealthUpdater(URL url, Duration requestTimeout, Duration connectionKeepAlive) { this(new StateV1HealthClient(url, requestTimeout, connectionKeepAlive)); @@ -21,17 +26,27 @@ class StateV1HealthUpdater implements HealthUpdater { } @Override - public HealthInfo getLatestHealthInfo() { - return lastHealthInfo; + public ServiceStatusInfo getServiceStatusInfo() { + return serviceStatusInfo; } @Override public void run() { + // Get time before fetching rather than after, to make the resulting age be an upper limit. + Instant now = Instant.now(); + + HealthInfo healthInfo; try { - lastHealthInfo = healthClient.get(); + healthInfo = healthClient.get(); } catch (Exception e) { - lastHealthInfo = HealthInfo.fromException(e); + healthInfo = HealthInfo.fromException(e); } + + ServiceStatus newServiceStatus = healthInfo.isHealthy() ? ServiceStatus.UP : ServiceStatus.DOWN; + Optional<Instant> newSince = newServiceStatus == serviceStatusInfo.serviceStatus() ? 
+ serviceStatusInfo.since() : Optional.of(now); + + serviceStatusInfo = new ServiceStatusInfo(newServiceStatus, newSince, Optional.of(now), healthInfo.getErrorDescription()); } @Override diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/manager/UnionMonitorManager.java b/service-monitor/src/main/java/com/yahoo/vespa/service/manager/UnionMonitorManager.java index cfd9269d9c4..3490ad4a5d2 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/manager/UnionMonitorManager.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/manager/UnionMonitorManager.java @@ -1,11 +1,13 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.service.manager; +import com.google.inject.Inject; import com.yahoo.config.model.api.ApplicationInfo; import com.yahoo.config.provision.ApplicationId; import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.service.health.HealthMonitorManager; import com.yahoo.vespa.service.slobrok.SlobrokMonitorManagerImpl; @@ -17,19 +19,20 @@ public class UnionMonitorManager implements MonitorManager { private final SlobrokMonitorManagerImpl slobrokMonitorManager; private final HealthMonitorManager healthMonitorManager; + @Inject public UnionMonitorManager(SlobrokMonitorManagerImpl slobrokMonitorManager, HealthMonitorManager healthMonitorManager) { this.slobrokMonitorManager = slobrokMonitorManager; this.healthMonitorManager = healthMonitorManager; } @Override - public ServiceStatus getStatus(ApplicationId applicationId, - ClusterId clusterId, - ServiceType serviceType, - ConfigId configId) { + public ServiceStatusInfo getStatus(ApplicationId applicationId, + ClusterId clusterId, + ServiceType 
serviceType, + ConfigId configId) { // Trust the new health monitoring status if it actually monitors the particular service. - ServiceStatus status = healthMonitorManager.getStatus(applicationId, clusterId, serviceType, configId); - if (status != ServiceStatus.NOT_CHECKED) { + ServiceStatusInfo status = healthMonitorManager.getStatus(applicationId, clusterId, serviceType, configId); + if (status.serviceStatus() != ServiceStatus.NOT_CHECKED) { return status; } diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ApplicationInstanceGenerator.java b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ApplicationInstanceGenerator.java index 3ca9446df26..9e6b64f0c91 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ApplicationInstanceGenerator.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ApplicationInstanceGenerator.java @@ -14,7 +14,7 @@ import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceCluster; import com.yahoo.vespa.applicationmodel.ServiceClusterKey; import com.yahoo.vespa.applicationmodel.ServiceInstance; -import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.applicationmodel.TenantId; import com.yahoo.vespa.service.duper.ConfigServerApplication; @@ -99,11 +99,8 @@ public class ApplicationInstanceGenerator { HostName hostName, ServiceStatusProvider serviceStatusProvider) { ConfigId configId = toConfigId(serviceInfo); - - ServiceStatus status = serviceStatusProvider.getStatus( - applicationId, - clusterId, - toServiceType(serviceInfo), configId); + ServiceType serviceType = toServiceType(serviceInfo); + ServiceStatusInfo status = serviceStatusProvider.getStatus(applicationId, clusterId, serviceType, configId); return new ServiceInstance(configId, hostName, status); } diff --git 
a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java index cff98957590..50ea31eb9c4 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/model/ServiceMonitorImpl.java @@ -21,12 +21,10 @@ public class ServiceMonitorImpl implements ServiceMonitor { @Inject public ServiceMonitorImpl(DuperModelManager duperModelManager, - SlobrokMonitorManagerImpl slobrokMonitorManager, - HealthMonitorManager healthMonitorManager, + UnionMonitorManager monitorManager, Metric metric, Timer timer, Zone zone) { - UnionMonitorManager monitorManager = new UnionMonitorManager(slobrokMonitorManager, healthMonitorManager); duperModelManager.registerListener(monitorManager); ServiceModelProvider uncachedServiceModelProvider = new ServiceModelProvider( diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/ServiceStatusProvider.java b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/ServiceStatusProvider.java index 30a20cf9980..9486bf505ab 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/ServiceStatusProvider.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/monitor/ServiceStatusProvider.java @@ -5,6 +5,7 @@ import com.yahoo.config.provision.ApplicationId; import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; /** @@ -19,9 +20,9 @@ public interface ServiceStatusProvider { * service status provider does does not monitor the service status for * the particular application, cluster, service type, and config id. 
*/ - ServiceStatus getStatus(ApplicationId applicationId, - ClusterId clusterId, - ServiceType serviceType, - ConfigId configId); + ServiceStatusInfo getStatus(ApplicationId applicationId, + ClusterId clusterId, + ServiceType serviceType, + ConfigId configId); } diff --git a/service-monitor/src/main/java/com/yahoo/vespa/service/slobrok/SlobrokMonitorManagerImpl.java b/service-monitor/src/main/java/com/yahoo/vespa/service/slobrok/SlobrokMonitorManagerImpl.java index 4e47bf010d7..203ad7266db 100644 --- a/service-monitor/src/main/java/com/yahoo/vespa/service/slobrok/SlobrokMonitorManagerImpl.java +++ b/service-monitor/src/main/java/com/yahoo/vespa/service/slobrok/SlobrokMonitorManagerImpl.java @@ -9,6 +9,7 @@ import com.yahoo.log.LogLevel; import com.yahoo.vespa.applicationmodel.ClusterId; import com.yahoo.vespa.applicationmodel.ConfigId; import com.yahoo.vespa.applicationmodel.ServiceStatus; +import com.yahoo.vespa.applicationmodel.ServiceStatusInfo; import com.yahoo.vespa.applicationmodel.ServiceType; import com.yahoo.vespa.service.duper.DuperModelManager; import com.yahoo.vespa.service.manager.MonitorManager; @@ -84,27 +85,26 @@ public class SlobrokMonitorManagerImpl implements SlobrokApi, MonitorManager { } @Override - public ServiceStatus getStatus(ApplicationId applicationId, - ClusterId clusterId, - ServiceType serviceType, - ConfigId configId) { + public ServiceStatusInfo getStatus(ApplicationId applicationId, + ClusterId clusterId, + ServiceType serviceType, + ConfigId configId) { if (!wouldMonitor(applicationId)) { - return ServiceStatus.NOT_CHECKED; + return new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); } Optional<String> slobrokServiceName = findSlobrokServiceName(serviceType, configId); if (slobrokServiceName.isPresent()) { synchronized (monitor) { SlobrokMonitor slobrokMonitor = slobrokMonitors.get(applicationId); - if (slobrokMonitor != null && - slobrokMonitor.registeredInSlobrok(slobrokServiceName.get())) { - return ServiceStatus.UP; + if 
(slobrokMonitor != null && slobrokMonitor.registeredInSlobrok(slobrokServiceName.get())) { + return new ServiceStatusInfo(ServiceStatus.UP); } else { - return ServiceStatus.DOWN; + return new ServiceStatusInfo(ServiceStatus.DOWN); } } } else { - return ServiceStatus.NOT_CHECKED; + return new ServiceStatusInfo(ServiceStatus.NOT_CHECKED); } } |