diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-10-21 17:12:07 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2019-10-21 17:12:07 +0200 |
commit | 0848c72d2c06290decc2bf0999063040790f61d6 (patch) | |
tree | dc932474d4b7f0bbd6163dc81cbe1d4c9fee6e5c /orchestrator/src/main | |
parent | 38d3fedeb51ff66e86a8bd1ccdfeabbcc11b642d (diff) |
Add Orchestrator application lock metrics
Diffstat (limited to 'orchestrator/src/main')
-rw-r--r-- | orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java | 56 |
1 file changed, 50 insertions, 6 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java index e3d2a0827ed..0803e453da6 100644 --- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java +++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java @@ -2,7 +2,10 @@ package com.yahoo.vespa.orchestrator.status; import com.google.common.util.concurrent.UncheckedTimeoutException; +import com.yahoo.config.provision.ApplicationId; import com.yahoo.container.jaxrs.annotation.Component; +import com.yahoo.jdisc.Metric; +import com.yahoo.jdisc.Timer; import com.yahoo.log.LogLevel; import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference; import com.yahoo.vespa.applicationmodel.HostName; @@ -17,6 +20,7 @@ import org.apache.zookeeper.data.Stat; import javax.inject.Inject; import java.time.Duration; +import java.time.Instant; import java.util.Collections; import java.util.HashSet; import java.util.Map; @@ -42,18 +46,27 @@ public class ZookeeperStatusService implements StatusService { private final Curator curator; private final CuratorCounter counter; + private final Metric metric; + private final Timer timer; + + /** + * A cache of metric contexts for each possible dimension map. In practice, there is one dimension map + * for each application, so up to hundreds of elements. + */ + private final ConcurrentHashMap<Map<String, String>, Metric.Context> cachedContexts = new ConcurrentHashMap<>(); /** A cache of hosts allowed to be down. Access only through {@link #getValidCache()}! 
*/ - private final Map<ApplicationInstanceReference, Set<HostName>> hostsDown; + private final Map<ApplicationInstanceReference, Set<HostName>> hostsDown = new ConcurrentHashMap<>(); private volatile long cacheRefreshedAt; @Inject - public ZookeeperStatusService(@Component Curator curator) { + public ZookeeperStatusService(@Component Curator curator, @Component Metric metric, @Component Timer timer) { this.curator = curator; this.counter = new CuratorCounter(curator, HOST_STATUS_CACHE_COUNTER_PATH); this.cacheRefreshedAt = counter.get(); - this.hostsDown = new ConcurrentHashMap<>(); + this.metric = metric; + this.timer = timer; } @Override @@ -104,15 +117,42 @@ public class ZookeeperStatusService implements StatusService { public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly( OrchestratorContext context, ApplicationInstanceReference applicationInstanceReference) throws UncheckedTimeoutException { + ApplicationId applicationId = OrchestratorUtil.toApplicationId(applicationInstanceReference); + String app = applicationId.application().value() + "." + applicationId.instance().value(); + Map<String, String> dimensions = Map.of( + "tenantName", applicationId.tenant().value(), + "applicationId", applicationId.toFullString(), + "app", app); + Metric.Context metricContext = cachedContexts.computeIfAbsent(dimensions, metric::createContext); + Duration duration = context.getTimeLeft(); String lockPath = applicationInstanceLock2Path(applicationInstanceReference); Lock lock = new Lock(lockPath, curator); - lock.acquire(duration); + + Instant startTime = timer.currentTime(); + Instant acquireEndTime; + boolean lockAcquired = false; + try { + lock.acquire(duration); + lockAcquired = true; + } finally { + acquireEndTime = timer.currentTime(); + double seconds = Duration.between(startTime, acquireEndTime).toMillis() / 1000.0; + metric.set("orchestrator.lock.acquire-latency", seconds, metricContext); + metric.set("orchestrator.lock.acquired", lockAcquired ? 
1 : 0, metricContext); + } + + Runnable updateLockHoldMetric = () -> { + Instant lockReleasedTime = timer.currentTime(); + double seconds = Duration.between(acquireEndTime, lockReleasedTime).toMillis() / 1000.0; + metric.set("orchestrator.lock.hold-latency", seconds, metricContext); + }; try { - return new ZkMutableStatusRegistry(lock, applicationInstanceReference, context.isProbe()); + return new ZkMutableStatusRegistry(lock, applicationInstanceReference, context.isProbe(), updateLockHoldMetric); } catch (Throwable t) { // In case the constructor throws an exception. + updateLockHoldMetric.run(); lock.close(); throw t; } @@ -237,13 +277,16 @@ public class ZookeeperStatusService implements StatusService { private final Lock lock; private final ApplicationInstanceReference applicationInstanceReference; private final boolean probe; + private final Runnable onLockRelease; public ZkMutableStatusRegistry(Lock lock, ApplicationInstanceReference applicationInstanceReference, - boolean probe) { + boolean probe, + Runnable onLockRelease) { this.lock = lock; this.applicationInstanceReference = applicationInstanceReference; this.probe = probe; + this.onLockRelease = onLockRelease; } @Override @@ -293,6 +336,7 @@ public class ZookeeperStatusService implements StatusService { @Override public void close() { + onLockRelease.run(); try { lock.close(); } catch (RuntimeException e) { |