summary | refs | log | tree | commit | diff | stats
path: root/orchestrator/src/main
diff options
context:
space:
mode:
author: Håkon Hallingstad <hakon@verizonmedia.com> 2019-10-21 17:12:07 +0200
committer: Håkon Hallingstad <hakon@verizonmedia.com> 2019-10-21 17:12:07 +0200
commit: 0848c72d2c06290decc2bf0999063040790f61d6 (patch)
tree: dc932474d4b7f0bbd6163dc81cbe1d4c9fee6e5c /orchestrator/src/main
parent: 38d3fedeb51ff66e86a8bd1ccdfeabbcc11b642d (diff)
Add Orchestrator application lock metrics
Diffstat (limited to 'orchestrator/src/main')
-rw-r--r-- orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java | 56
1 file changed, 50 insertions, 6 deletions
diff --git a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java
index e3d2a0827ed..0803e453da6 100644
--- a/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java
+++ b/orchestrator/src/main/java/com/yahoo/vespa/orchestrator/status/ZookeeperStatusService.java
@@ -2,7 +2,10 @@
package com.yahoo.vespa.orchestrator.status;
import com.google.common.util.concurrent.UncheckedTimeoutException;
+import com.yahoo.config.provision.ApplicationId;
import com.yahoo.container.jaxrs.annotation.Component;
+import com.yahoo.jdisc.Metric;
+import com.yahoo.jdisc.Timer;
import com.yahoo.log.LogLevel;
import com.yahoo.vespa.applicationmodel.ApplicationInstanceReference;
import com.yahoo.vespa.applicationmodel.HostName;
@@ -17,6 +20,7 @@ import org.apache.zookeeper.data.Stat;
import javax.inject.Inject;
import java.time.Duration;
+import java.time.Instant;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
@@ -42,18 +46,27 @@ public class ZookeeperStatusService implements StatusService {
private final Curator curator;
private final CuratorCounter counter;
+ private final Metric metric;
+ private final Timer timer;
+
+ /**
+ * A cache of metric contexts for each possible dimension map. In practice, there is one dimension map
+ * for each application, so up to hundreds of elements.
+ */
+ private final ConcurrentHashMap<Map<String, String>, Metric.Context> cachedContexts = new ConcurrentHashMap<>();
/** A cache of hosts allowed to be down. Access only through {@link #getValidCache()}! */
- private final Map<ApplicationInstanceReference, Set<HostName>> hostsDown;
+ private final Map<ApplicationInstanceReference, Set<HostName>> hostsDown = new ConcurrentHashMap<>();
private volatile long cacheRefreshedAt;
@Inject
- public ZookeeperStatusService(@Component Curator curator) {
+ public ZookeeperStatusService(@Component Curator curator, @Component Metric metric, @Component Timer timer) {
this.curator = curator;
this.counter = new CuratorCounter(curator, HOST_STATUS_CACHE_COUNTER_PATH);
this.cacheRefreshedAt = counter.get();
- this.hostsDown = new ConcurrentHashMap<>();
+ this.metric = metric;
+ this.timer = timer;
}
@Override
@@ -104,15 +117,42 @@ public class ZookeeperStatusService implements StatusService {
public MutableStatusRegistry lockApplicationInstance_forCurrentThreadOnly(
OrchestratorContext context,
ApplicationInstanceReference applicationInstanceReference) throws UncheckedTimeoutException {
+ ApplicationId applicationId = OrchestratorUtil.toApplicationId(applicationInstanceReference);
+ String app = applicationId.application().value() + "." + applicationId.instance().value();
+ Map<String, String> dimensions = Map.of(
+ "tenantName", applicationId.tenant().value(),
+ "applicationId", applicationId.toFullString(),
+ "app", app);
+ Metric.Context metricContext = cachedContexts.computeIfAbsent(dimensions, metric::createContext);
+
Duration duration = context.getTimeLeft();
String lockPath = applicationInstanceLock2Path(applicationInstanceReference);
Lock lock = new Lock(lockPath, curator);
- lock.acquire(duration);
+
+ Instant startTime = timer.currentTime();
+ Instant acquireEndTime;
+ boolean lockAcquired = false;
+ try {
+ lock.acquire(duration);
+ lockAcquired = true;
+ } finally {
+ acquireEndTime = timer.currentTime();
+ double seconds = Duration.between(startTime, acquireEndTime).toMillis() / 1000.0;
+ metric.set("orchestrator.lock.acquire-latency", seconds, metricContext);
+ metric.set("orchestrator.lock.acquired", lockAcquired ? 1 : 0, metricContext);
+ }
+
+ Runnable updateLockHoldMetric = () -> {
+ Instant lockReleasedTime = timer.currentTime();
+ double seconds = Duration.between(acquireEndTime, lockReleasedTime).toMillis() / 1000.0;
+ metric.set("orchestrator.lock.hold-latency", seconds, metricContext);
+ };
try {
- return new ZkMutableStatusRegistry(lock, applicationInstanceReference, context.isProbe());
+ return new ZkMutableStatusRegistry(lock, applicationInstanceReference, context.isProbe(), updateLockHoldMetric);
} catch (Throwable t) {
// In case the constructor throws an exception.
+ updateLockHoldMetric.run();
lock.close();
throw t;
}
@@ -237,13 +277,16 @@ public class ZookeeperStatusService implements StatusService {
private final Lock lock;
private final ApplicationInstanceReference applicationInstanceReference;
private final boolean probe;
+ private final Runnable onLockRelease;
public ZkMutableStatusRegistry(Lock lock,
ApplicationInstanceReference applicationInstanceReference,
- boolean probe) {
+ boolean probe,
+ Runnable onLockRelease) {
this.lock = lock;
this.applicationInstanceReference = applicationInstanceReference;
this.probe = probe;
+ this.onLockRelease = onLockRelease;
}
@Override
@@ -293,6 +336,7 @@ public class ZookeeperStatusService implements StatusService {
@Override
public void close() {
+ onLockRelease.run();
try {
lock.close();
} catch (RuntimeException e) {