summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2020-09-24 13:53:40 +0200
committerHåkon Hallingstad <hakon@verizonmedia.com>2020-09-24 13:53:40 +0200
commit069716d610dbea6fa58e009cac8c5495c5e7d06d (patch)
tree2ee22c776e85084330da7dd1fad0a54d643e49f6
parent181b198578ce66e2759a450a9ee6adc8ca4f9134 (diff)
Expose locks info in REST API
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java73
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java1
-rw-r--r--zkfacade/abi-spec.json60
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java16
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java64
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java125
6 files changed, 337 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
new file mode 100644
index 00000000000..817063592b1
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
@@ -0,0 +1,73 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.restapi;
+
+import com.yahoo.container.jdisc.HttpResponse;
+import com.yahoo.slime.Cursor;
+import com.yahoo.slime.JsonFormat;
+import com.yahoo.slime.Slime;
+import com.yahoo.vespa.curator.LockInfo;
+import com.yahoo.vespa.curator.ThreadLockInfo;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.time.Instant;
+import java.util.List;
+
+public class LocksResponse extends HttpResponse {
+
+ private final Slime slime;
+
+ public LocksResponse() {
+ super(200);
+
+ this.slime = new Slime();
+ Cursor root = slime.setObject();
+
+ root.setLong("in-critical-region", ThreadLockInfo.inCriticalRegionCount());
+ root.setLong("invoke-acquire", ThreadLockInfo.invokeAcquireCount());
+ root.setLong("acquire-timed-out", ThreadLockInfo.acquireTimedOutCount());
+ root.setLong("lock-acquired", ThreadLockInfo.lockAcquiredCount());
+ root.setLong("locks-released", ThreadLockInfo.locksReleasedCount());
+ root.setLong("acquire-reentrant-lock-errors", ThreadLockInfo.failedToAcquireReentrantLockCount());
+ root.setLong("no-locks-errors", ThreadLockInfo.noLocksErrorCount());
+ root.setLong("timeout-on-reentrancy-errors", ThreadLockInfo.timeoutOnReentrancyErrorCount());
+
+ List<ThreadLockInfo> threadLockInfos = ThreadLockInfo.getThreadLockInfos();
+ if (!threadLockInfos.isEmpty()) {
+ Cursor threadsCursor = root.setArray("threads");
+ threadLockInfos.forEach(threadLockInfo -> {
+ Cursor threadLockInfoCursor = threadsCursor.addObject();
+ threadLockInfoCursor.setString("thread-name", threadLockInfo.getThreadName());
+ threadLockInfoCursor.setString("lock-path", threadLockInfo.getLockPath());
+
+ List<LockInfo> lockInfos = threadLockInfo.getLockInfos();
+ if (!lockInfos.isEmpty()) {
+ Cursor lockInfosCursor = threadLockInfoCursor.setArray("locks");
+ lockInfos.forEach(lockInfo -> {
+ Cursor lockInfoCursor = lockInfosCursor.addObject();
+ lockInfoCursor.setString("invoke-acquire-time", toString(lockInfo.getTimeAcquiredWasInvoked()));
+ lockInfoCursor.setLong("reentrancy-hold-count-on-acquire", lockInfo.getThreadHoldCountOnAcquire());
+ lockInfoCursor.setString("acquire-timeout", lockInfo.getAcquireTimeout().toString());
+ lockInfo.getTimeLockWasAcquired().ifPresent(instant -> lockInfoCursor.setString("lock-acquired-time", toString(instant)));
+ lockInfoCursor.setString("lock-state", lockInfo.getLockState().name());
+ lockInfo.getTimeTerminalStateWasReached().ifPresent(instant -> lockInfoCursor.setString("terminal-state-time", toString(instant)));
+ });
+ }
+ });
+ }
+ }
+
+ @Override
+ public void render(OutputStream stream) throws IOException {
+ new JsonFormat(true).encode(stream, slime);
+ }
+
+ @Override
+ public String getContentType() {
+ return "application/json";
+ }
+
+ private static String toString(Instant time) {
+ return Instant.ofEpochMilli(time.toEpochMilli()).toString();
+ }
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 1891b9d2f82..a2d599eab6e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -116,6 +116,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
if (pathS.startsWith("/nodes/v2/state/")) return new NodesResponse(ResponseType.nodesInStateList, request, orchestrator, nodeRepository);
if (pathS.startsWith("/nodes/v2/acl/")) return new NodeAclResponse(request, nodeRepository);
if (pathS.equals( "/nodes/v2/command/")) return new ResourceResponse(request.getUri(), "restart", "reboot");
+ if (pathS.equals( "/nodes/v2/locks")) return new LocksResponse();
if (pathS.equals( "/nodes/v2/maintenance/")) return new JobsResponse(nodeRepository.jobControl());
if (pathS.equals( "/nodes/v2/upgrade/")) return new UpgradeResponse(nodeRepository.infrastructureVersions(), nodeRepository.osVersions(), nodeRepository.dockerImages());
if (pathS.startsWith("/nodes/v2/capacity")) return new HostCapacityResponse(nodeRepository, request);
diff --git a/zkfacade/abi-spec.json b/zkfacade/abi-spec.json
index f4ad1ab4372..614c58f3cf6 100644
--- a/zkfacade/abi-spec.json
+++ b/zkfacade/abi-spec.json
@@ -109,5 +109,65 @@
"public void close()"
],
"fields": []
+ },
+ "com.yahoo.vespa.curator.LockInfo$LockState": {
+ "superClass": "java.lang.Enum",
+ "interfaces": [],
+ "attributes": [
+ "public",
+ "final",
+ "enum"
+ ],
+ "methods": [
+ "public static com.yahoo.vespa.curator.LockInfo$LockState[] values()",
+ "public static com.yahoo.vespa.curator.LockInfo$LockState valueOf(java.lang.String)",
+ "public boolean isTerminal()"
+ ],
+ "fields": [
+ "public static final enum com.yahoo.vespa.curator.LockInfo$LockState ACQUIRING",
+ "public static final enum com.yahoo.vespa.curator.LockInfo$LockState TIMED_OUT",
+ "public static final enum com.yahoo.vespa.curator.LockInfo$LockState ACQUIRED",
+ "public static final enum com.yahoo.vespa.curator.LockInfo$LockState FAILED_TO_REENTER",
+ "public static final enum com.yahoo.vespa.curator.LockInfo$LockState RELEASED"
+ ]
+ },
+ "com.yahoo.vespa.curator.LockInfo": {
+ "superClass": "java.lang.Object",
+ "interfaces": [],
+ "attributes": [
+ "public"
+ ],
+ "methods": [
+ "public static com.yahoo.vespa.curator.LockInfo invokingAcquire(int, java.time.Duration)",
+ "public int getThreadHoldCountOnAcquire()",
+ "public java.time.Instant getTimeAcquiredWasInvoked()",
+ "public java.time.Duration getAcquireTimeout()",
+ "public com.yahoo.vespa.curator.LockInfo$LockState getLockState()",
+ "public java.util.Optional getTimeLockWasAcquired()",
+ "public java.util.Optional getTimeTerminalStateWasReached()"
+ ],
+ "fields": []
+ },
+ "com.yahoo.vespa.curator.ThreadLockInfo": {
+ "superClass": "java.lang.Object",
+ "interfaces": [],
+ "attributes": [
+ "public"
+ ],
+ "methods": [
+ "public static int invokeAcquireCount()",
+ "public static int inCriticalRegionCount()",
+ "public static int acquireTimedOutCount()",
+ "public static int lockAcquiredCount()",
+ "public static int locksReleasedCount()",
+ "public static int noLocksErrorCount()",
+ "public static int failedToAcquireReentrantLockCount()",
+ "public static int timeoutOnReentrancyErrorCount()",
+ "public static java.util.List getThreadLockInfos()",
+ "public java.lang.String getThreadName()",
+ "public java.lang.String getLockPath()",
+ "public java.util.List getLockInfos()"
+ ],
+ "fields": []
}
} \ No newline at end of file
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
index d97d8f5ed71..299fceb8d7f 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
@@ -32,12 +32,20 @@ public class Lock implements Mutex {
/** Take the lock with the given timeout. This may be called multiple times from the same thread - each matched by a close */
public void acquire(Duration timeout) throws UncheckedTimeoutException {
+ ThreadLockInfo threadLockInfo = getThreadLockInfo();
+ threadLockInfo.invokingAcquire(timeout);
try {
- if ( ! mutex.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS))
+ if ( ! mutex.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
+ threadLockInfo.acquireTimedOut();
+
throw new UncheckedTimeoutException("Timed out after waiting " + timeout +
- " to acquire lock '" + lockPath + "'");
+ " to acquire lock '" + lockPath + "'");
+ }
+ threadLockInfo.lockAcquired();
+
if ( ! lock.tryLock()) { // Should be available to only this thread, while holding the above mutex.
release();
+ threadLockInfo.failedToAcquireReentrantLock();
throw new IllegalStateException("InterProcessMutex acquired, but guarded lock held by someone else, for lock '" + lockPath + "'");
}
}
@@ -60,6 +68,7 @@ public class Lock implements Mutex {
}
private void release() {
+ getThreadLockInfo().lockReleased();
try {
mutex.release();
}
@@ -68,6 +77,9 @@ public class Lock implements Mutex {
}
}
+ private ThreadLockInfo getThreadLockInfo() {
+ return ThreadLockInfo.getCurrentThreadLockInfo(lockPath, lock);
+ }
}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java
new file mode 100644
index 00000000000..870ee12ebda
--- /dev/null
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java
@@ -0,0 +1,64 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.curator;
+
+import java.time.Duration;
+import java.time.Instant;
+import java.util.Optional;
+
+/**
+ * Information about a lock.
+ *
+ * <p>Should be mutated by a single thread. Other threads may see an inconsistent state of this instance.</p>
+ */
+public class LockInfo {
+
+ private final int threadHoldCountOnAcquire;
+ private final Instant acquireInstant;
+ private final Duration timeout;
+
+ private volatile Optional<Instant> lockAcquiredInstant = Optional.empty();
+ private volatile Optional<Instant> terminalStateInstant = Optional.empty();
+
+ public static LockInfo invokingAcquire(int holdCount, Duration timeout) {
+ return new LockInfo(holdCount, timeout);
+ }
+
+ public enum LockState {
+ ACQUIRING(false), TIMED_OUT(true), ACQUIRED(false), FAILED_TO_REENTER(true), RELEASED(true);
+
+ private final boolean terminal;
+
+ LockState(boolean terminal) { this.terminal = terminal; }
+
+ public boolean isTerminal() { return terminal; }
+ }
+
+ private volatile LockState lockState = LockState.ACQUIRING;
+
+ private LockInfo(int threadHoldCountOnAcquire, Duration timeout) {
+ this.threadHoldCountOnAcquire = threadHoldCountOnAcquire;
+ this.acquireInstant = Instant.now();
+ this.timeout = timeout;
+ }
+
+ public int getThreadHoldCountOnAcquire() { return threadHoldCountOnAcquire; }
+ public Instant getTimeAcquiredWasInvoked() { return acquireInstant; }
+ public Duration getAcquireTimeout() { return timeout; }
+ public LockState getLockState() { return lockState; }
+ public Optional<Instant> getTimeLockWasAcquired() { return lockAcquiredInstant; }
+ public Optional<Instant> getTimeTerminalStateWasReached() { return terminalStateInstant; }
+
+ void timedOut() { setTerminalState(LockState.TIMED_OUT); }
+ void failedToAcquireReentrantLock() { setTerminalState(LockState.FAILED_TO_REENTER); }
+ void released() { setTerminalState(LockState.RELEASED); }
+
+ void lockAcquired() {
+ lockState = LockState.ACQUIRED;
+ lockAcquiredInstant = Optional.of(Instant.now());
+ }
+
+ void setTerminalState(LockState terminalState) {
+ lockState = terminalState;
+ terminalStateInstant = Optional.of(Instant.now());
+ }
+}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java
new file mode 100644
index 00000000000..9fbcb550ddd
--- /dev/null
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java
@@ -0,0 +1,125 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.curator;
+
+import java.time.Duration;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.locks.ReentrantLock;
+import java.util.function.Consumer;
+
+/**
+ * This class contains process-wide statistics and information related to acquiring and releasing
+ * {@link Lock}. Instances of this class contain information tied to a specific thread and lock path.
+ *
+ * <p>Instances of this class are thread-safe as long as foreign threads (!= this.thread) avoid mutable methods.</p>
+ */
+public class ThreadLockInfo {
+
+ private static final ConcurrentHashMap<Thread, ThreadLockInfo> locks = new ConcurrentHashMap<>();
+
+ private static final int MAX_COMPLETED_LOCK_INFOS_SIZE = 10;
+ private static final ConcurrentLinkedDeque<LockInfo> completedLockInfos = new ConcurrentLinkedDeque<>();
+
+ private static final AtomicInteger invokeAcquireCount = new AtomicInteger(0);
+ private static final AtomicInteger inCriticalRegionCount = new AtomicInteger(0);
+ private static final AtomicInteger acquireTimedOutCount = new AtomicInteger(0);
+ private static final AtomicInteger lockAcquiredCount = new AtomicInteger(0);
+ private static final AtomicInteger locksReleasedCount = new AtomicInteger(0);
+
+ private static final AtomicInteger failedToAcquireReentrantLockCount = new AtomicInteger(0);
+ private static final AtomicInteger noLocksErrorCount = new AtomicInteger(0);
+ private static final AtomicInteger timeoutOnReentrancyErrorCount = new AtomicInteger(0);
+
+ private final Thread thread;
+ private final String lockPath;
+ private final ReentrantLock lock;
+
+ /** The locks are reentrant so there may be more than 1 lock for this thread. */
+ private final ConcurrentLinkedQueue<LockInfo> lockInfos = new ConcurrentLinkedQueue<>();
+
+ public static int invokeAcquireCount() { return invokeAcquireCount.get(); }
+ public static int inCriticalRegionCount() { return inCriticalRegionCount.get(); }
+ public static int acquireTimedOutCount() { return acquireTimedOutCount.get(); }
+ public static int lockAcquiredCount() { return lockAcquiredCount.get(); }
+ public static int locksReleasedCount() { return locksReleasedCount.get(); }
+ public static int noLocksErrorCount() { return noLocksErrorCount.get(); }
+ public static int failedToAcquireReentrantLockCount() { return failedToAcquireReentrantLockCount.get(); }
+ public static int timeoutOnReentrancyErrorCount() { return timeoutOnReentrancyErrorCount.get(); }
+ public static List<ThreadLockInfo> getThreadLockInfos() { return List.copyOf(locks.values()); }
+
+ /** Returns the per-thread singleton ThreadLockInfo. */
+ static ThreadLockInfo getCurrentThreadLockInfo(String lockPath, ReentrantLock lock) {
+ return locks.computeIfAbsent(
+ Thread.currentThread(),
+ currentThread -> new ThreadLockInfo(currentThread, lockPath, lock));
+ }
+
+ ThreadLockInfo(Thread currentThread, String lockPath, ReentrantLock lock) {
+ this.thread = currentThread;
+ this.lockPath = lockPath;
+ this.lock = lock;
+ }
+
+ public String getThreadName() { return thread.getName(); }
+ public String getLockPath() { return lockPath; }
+ public List<LockInfo> getLockInfos() { return List.copyOf(lockInfos); }
+
+ /** Mutable method (see class doc) */
+ void invokingAcquire(Duration timeout) {
+ invokeAcquireCount.incrementAndGet();
+ inCriticalRegionCount.incrementAndGet();
+ lockInfos.add(LockInfo.invokingAcquire(lock.getHoldCount(), timeout));
+ }
+
+ /** Mutable method (see class doc) */
+ void acquireTimedOut() {
+ if (lockInfos.size() > 1) {
+ timeoutOnReentrancyErrorCount.incrementAndGet();
+ }
+
+ removeLastLockInfo(acquireTimedOutCount, LockInfo::timedOut);
+ }
+
+ /** Mutable method (see class doc) */
+ void lockAcquired() {
+ lockAcquiredCount.incrementAndGet();
+ getLastLockInfo().ifPresent(LockInfo::lockAcquired);
+ }
+
+ /** Mutable method (see class doc) */
+ void failedToAcquireReentrantLock() {
+ removeLastLockInfo(failedToAcquireReentrantLockCount, LockInfo::failedToAcquireReentrantLock);
+ }
+
+ /** Mutable method (see class doc) */
+ void lockReleased() {
+ removeLastLockInfo(locksReleasedCount, LockInfo::released);
+ }
+
+ private Optional<LockInfo> getLastLockInfo() {
+ return lockInfos.isEmpty() ? Optional.empty() : Optional.of(lockInfos.peek());
+ }
+
+ private void removeLastLockInfo(AtomicInteger metricToIncrement, Consumer<LockInfo> completeLockInfo) {
+ metricToIncrement.incrementAndGet();
+ inCriticalRegionCount.decrementAndGet();
+
+ if (lockInfos.isEmpty()) {
+ noLocksErrorCount.incrementAndGet();
+ return;
+ }
+
+ LockInfo lockInfo = lockInfos.poll();
+ completeLockInfo.accept(lockInfo);
+
+ if (completedLockInfos.size() >= MAX_COMPLETED_LOCK_INFOS_SIZE) {
+ // This is thread-safe, as no-one but currentThread mutates completedLockInfos
+ completedLockInfos.removeLast();
+ }
+ completedLockInfos.addFirst(lockInfo);
+ }
+}