diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-09-24 13:53:40 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-09-24 13:53:40 +0200 |
commit | 069716d610dbea6fa58e009cac8c5495c5e7d06d (patch) | |
tree | 2ee22c776e85084330da7dd1fad0a54d643e49f6 | |
parent | 181b198578ce66e2759a450a9ee6adc8ca4f9134 (diff) |
Expose locks info in REST API
6 files changed, 337 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java new file mode 100644 index 00000000000..817063592b1 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java @@ -0,0 +1,73 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.restapi; + +import com.yahoo.container.jdisc.HttpResponse; +import com.yahoo.slime.Cursor; +import com.yahoo.slime.JsonFormat; +import com.yahoo.slime.Slime; +import com.yahoo.vespa.curator.LockInfo; +import com.yahoo.vespa.curator.ThreadLockInfo; + +import java.io.IOException; +import java.io.OutputStream; +import java.time.Instant; +import java.util.List; + +public class LocksResponse extends HttpResponse { + + private final Slime slime; + + public LocksResponse() { + super(200); + + this.slime = new Slime(); + Cursor root = slime.setObject(); + + root.setLong("in-critical-region", ThreadLockInfo.inCriticalRegionCount()); + root.setLong("invoke-acquire", ThreadLockInfo.invokeAcquireCount()); + root.setLong("acquire-timed-out", ThreadLockInfo.acquireTimedOutCount()); + root.setLong("lock-acquired", ThreadLockInfo.lockAcquiredCount()); + root.setLong("locks-released", ThreadLockInfo.locksReleasedCount()); + root.setLong("acquire-reentrant-lock-errors", ThreadLockInfo.failedToAcquireReentrantLockCount()); + root.setLong("no-locks-errors", ThreadLockInfo.noLocksErrorCount()); + root.setLong("timeout-on-reentrancy-errors", ThreadLockInfo.timeoutOnReentrancyErrorCount()); + + List<ThreadLockInfo> threadLockInfos = ThreadLockInfo.getThreadLockInfos(); + if (!threadLockInfos.isEmpty()) { + Cursor threadsCursor = root.setArray("threads"); + threadLockInfos.forEach(threadLockInfo -> { + Cursor threadLockInfoCursor = threadsCursor.addObject(); + threadLockInfoCursor.setString("thread-name", threadLockInfo.getThreadName()); + threadLockInfoCursor.setString("lock-path", threadLockInfo.getLockPath()); + + List<LockInfo> lockInfos = threadLockInfo.getLockInfos(); + if (!lockInfos.isEmpty()) { + Cursor lockInfosCursor = threadLockInfoCursor.setArray("locks"); + lockInfos.forEach(lockInfo -> { + Cursor lockInfoCursor = lockInfosCursor.addObject(); + lockInfoCursor.setString("invoke-acquire-time", toString(lockInfo.getTimeAcquiredWasInvoked())); + lockInfoCursor.setLong("reentrancy-hold-count-on-acquire", lockInfo.getThreadHoldCountOnAcquire()); + lockInfoCursor.setString("acquire-timeout", lockInfo.getAcquireTimeout().toString()); + lockInfo.getTimeLockWasAcquired().ifPresent(instant -> lockInfoCursor.setString("lock-acquired-time", toString(instant))); + lockInfoCursor.setString("lock-state", lockInfo.getLockState().name()); + lockInfo.getTimeTerminalStateWasReached().ifPresent(instant -> lockInfoCursor.setString("terminal-state-time", toString(instant))); + }); + } + }); + } + } + + @Override + public void render(OutputStream stream) throws IOException { + new JsonFormat(true).encode(stream, slime); + } + + @Override + public String getContentType() { + return "application/json"; + } + + private static String toString(Instant time) { + return Instant.ofEpochMilli(time.toEpochMilli()).toString(); + } +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 1891b9d2f82..a2d599eab6e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -116,6 +116,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { if (pathS.startsWith("/nodes/v2/state/")) return new NodesResponse(ResponseType.nodesInStateList, request, orchestrator, nodeRepository); if (pathS.startsWith("/nodes/v2/acl/")) return new NodeAclResponse(request, nodeRepository); if (pathS.equals( "/nodes/v2/command/")) return new ResourceResponse(request.getUri(), "restart", "reboot"); + if (pathS.equals( "/nodes/v2/locks")) return new LocksResponse(); if (pathS.equals( "/nodes/v2/maintenance/")) return new JobsResponse(nodeRepository.jobControl()); if (pathS.equals( "/nodes/v2/upgrade/")) return new UpgradeResponse(nodeRepository.infrastructureVersions(), nodeRepository.osVersions(), nodeRepository.dockerImages()); if (pathS.startsWith("/nodes/v2/capacity")) return new HostCapacityResponse(nodeRepository, request); diff --git a/zkfacade/abi-spec.json b/zkfacade/abi-spec.json index f4ad1ab4372..614c58f3cf6 100644 --- a/zkfacade/abi-spec.json +++ b/zkfacade/abi-spec.json @@ -109,5 +109,65 @@ "public void close()" ], "fields": [] + }, + "com.yahoo.vespa.curator.LockInfo$LockState": { + "superClass": "java.lang.Enum", + "interfaces": [], + "attributes": [ + "public", + "final", + "enum" + ], + "methods": [ + "public static com.yahoo.vespa.curator.LockInfo$LockState[] values()", + "public static com.yahoo.vespa.curator.LockInfo$LockState valueOf(java.lang.String)", + "public boolean isTerminal()" + ], + "fields": [ + "public static final enum com.yahoo.vespa.curator.LockInfo$LockState ACQUIRING", + "public static final enum com.yahoo.vespa.curator.LockInfo$LockState TIMED_OUT", + "public static final enum com.yahoo.vespa.curator.LockInfo$LockState ACQUIRED", + "public static final enum com.yahoo.vespa.curator.LockInfo$LockState FAILED_TO_REENTER", + "public static final enum com.yahoo.vespa.curator.LockInfo$LockState RELEASED" + ] + }, + "com.yahoo.vespa.curator.LockInfo": { + "superClass": "java.lang.Object", + "interfaces": [], + "attributes": [ + "public" + ], + "methods": [ + "public static com.yahoo.vespa.curator.LockInfo invokingAcquire(int, java.time.Duration)", + "public int getThreadHoldCountOnAcquire()", + "public java.time.Instant getTimeAcquiredWasInvoked()", + "public java.time.Duration getAcquireTimeout()", + "public com.yahoo.vespa.curator.LockInfo$LockState getLockState()", + "public java.util.Optional getTimeLockWasAcquired()", + "public java.util.Optional getTimeTerminalStateWasReached()" + ], + "fields": [] + }, + "com.yahoo.vespa.curator.ThreadLockInfo": { + "superClass": "java.lang.Object", + "interfaces": [], + "attributes": [ + "public" + ], + "methods": [ + "public static int invokeAcquireCount()", + "public static int inCriticalRegionCount()", + "public static int acquireTimedOutCount()", + "public static int lockAcquiredCount()", + "public static int locksReleasedCount()", + "public static int noLocksErrorCount()", + "public static int failedToAcquireReentrantLockCount()", + "public static int timeoutOnReentrancyErrorCount()", + "public static java.util.List getThreadLockInfos()", + "public java.lang.String getThreadName()", + "public java.lang.String getLockPath()", + "public java.util.List getLockInfos()" + ], + "fields": [] } }
\ No newline at end of file diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java index d97d8f5ed71..299fceb8d7f 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java @@ -32,12 +32,20 @@ public class Lock implements Mutex { /** Take the lock with the given timeout. This may be called multiple times from the same thread - each matched by a close */ public void acquire(Duration timeout) throws UncheckedTimeoutException { + ThreadLockInfo threadLockInfo = getThreadLockInfo(); + threadLockInfo.invokingAcquire(timeout); try { - if ( ! mutex.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS)) + if ( ! mutex.acquire(timeout.toMillis(), TimeUnit.MILLISECONDS)) { + threadLockInfo.acquireTimedOut(); + throw new UncheckedTimeoutException("Timed out after waiting " + timeout + - " to acquire lock '" + lockPath + "'"); + " to acquire lock '" + lockPath + "'"); + } + threadLockInfo.lockAcquired(); + if ( ! lock.tryLock()) { // Should be available to only this thread, while holding the above mutex. release(); + threadLockInfo.failedToAcquireReentrantLock(); throw new IllegalStateException("InterProcessMutex acquired, but guarded lock held by someone else, for lock '" + lockPath + "'"); } } @@ -60,6 +68,7 @@ public class Lock implements Mutex { } private void release() { + getThreadLockInfo().lockReleased(); try { mutex.release(); } @@ -68,6 +77,9 @@ public class Lock implements Mutex { } } + private ThreadLockInfo getThreadLockInfo() { + return ThreadLockInfo.getCurrentThreadLockInfo(lockPath, lock); + } } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java new file mode 100644 index 00000000000..870ee12ebda --- /dev/null +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/LockInfo.java @@ -0,0 +1,64 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.curator; + +import java.time.Duration; +import java.time.Instant; +import java.util.Optional; + +/** + * Information about a lock. + * + * <p>Should be mutated by a single thread. Other threads may see an inconsistent state of this instance.</p> + */ +public class LockInfo { + + private final int threadHoldCountOnAcquire; + private final Instant acquireInstant; + private final Duration timeout; + + private volatile Optional<Instant> lockAcquiredInstant = Optional.empty(); + private volatile Optional<Instant> terminalStateInstant = Optional.empty(); + + public static LockInfo invokingAcquire(int holdCount, Duration timeout) { + return new LockInfo(holdCount, timeout); + } + + public enum LockState { + ACQUIRING(false), TIMED_OUT(true), ACQUIRED(false), FAILED_TO_REENTER(true), RELEASED(true); + + private final boolean terminal; + + LockState(boolean terminal) { this.terminal = terminal; } + + public boolean isTerminal() { return terminal; } + } + + private volatile LockState lockState = LockState.ACQUIRING; + + private LockInfo(int threadHoldCountOnAcquire, Duration timeout) { + this.threadHoldCountOnAcquire = threadHoldCountOnAcquire; + this.acquireInstant = Instant.now(); + this.timeout = timeout; + } + + public int getThreadHoldCountOnAcquire() { return threadHoldCountOnAcquire; } + public Instant getTimeAcquiredWasInvoked() { return acquireInstant; } + public Duration getAcquireTimeout() { return timeout; } + public LockState getLockState() { return lockState; } + public Optional<Instant> getTimeLockWasAcquired() { return lockAcquiredInstant; } + public Optional<Instant> getTimeTerminalStateWasReached() { return terminalStateInstant; } + + void timedOut() { setTerminalState(LockState.TIMED_OUT); } + void failedToAcquireReentrantLock() { setTerminalState(LockState.FAILED_TO_REENTER); } + void released() { setTerminalState(LockState.RELEASED); } + + void lockAcquired() { + lockState = LockState.ACQUIRED; + lockAcquiredInstant = Optional.of(Instant.now()); + } + + void setTerminalState(LockState terminalState) { + lockState = terminalState; + terminalStateInstant = Optional.of(Instant.now()); + } +} diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java new file mode 100644 index 00000000000..9fbcb550ddd --- /dev/null +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/ThreadLockInfo.java @@ -0,0 +1,125 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.curator; + +import java.time.Duration; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; + +/** + * This class contains process-wide statistics and information related to acquiring and releasing + * {@link Lock}. Instances of this class contain information tied to a specific thread and lock path. + * + * <p>Instances of this class are thread-safe as long as foreign threads (!= this.thread) avoid mutable methods.</p> + */ +public class ThreadLockInfo { + + private static final ConcurrentHashMap<Thread, ThreadLockInfo> locks = new ConcurrentHashMap<>(); + + private static final int MAX_COMPLETED_LOCK_INFOS_SIZE = 10; + private static final ConcurrentLinkedDeque<LockInfo> completedLockInfos = new ConcurrentLinkedDeque<>(); + + private static final AtomicInteger invokeAcquireCount = new AtomicInteger(0); + private static final AtomicInteger inCriticalRegionCount = new AtomicInteger(0); + private static final AtomicInteger acquireTimedOutCount = new AtomicInteger(0); + private static final AtomicInteger lockAcquiredCount = new AtomicInteger(0); + private static final AtomicInteger locksReleasedCount = new AtomicInteger(0); + + private static final AtomicInteger failedToAcquireReentrantLockCount = new AtomicInteger(0); + private static final AtomicInteger noLocksErrorCount = new AtomicInteger(0); + private static final AtomicInteger timeoutOnReentrancyErrorCount = new AtomicInteger(0); + + private final Thread thread; + private final String lockPath; + private final ReentrantLock lock; + + /** The locks are reentrant so there may be more than 1 lock for this thread. */ + private final ConcurrentLinkedQueue<LockInfo> lockInfos = new ConcurrentLinkedQueue<>(); + + public static int invokeAcquireCount() { return invokeAcquireCount.get(); } + public static int inCriticalRegionCount() { return inCriticalRegionCount.get(); } + public static int acquireTimedOutCount() { return acquireTimedOutCount.get(); } + public static int lockAcquiredCount() { return lockAcquiredCount.get(); } + public static int locksReleasedCount() { return locksReleasedCount.get(); } + public static int noLocksErrorCount() { return noLocksErrorCount.get(); } + public static int failedToAcquireReentrantLockCount() { return failedToAcquireReentrantLockCount.get(); } + public static int timeoutOnReentrancyErrorCount() { return timeoutOnReentrancyErrorCount.get(); } + public static List<ThreadLockInfo> getThreadLockInfos() { return List.copyOf(locks.values()); } + + /** Returns the per-thread singleton ThreadLockInfo. */ + static ThreadLockInfo getCurrentThreadLockInfo(String lockPath, ReentrantLock lock) { + return locks.computeIfAbsent( + Thread.currentThread(), + currentThread -> new ThreadLockInfo(currentThread, lockPath, lock)); + } + + ThreadLockInfo(Thread currentThread, String lockPath, ReentrantLock lock) { + this.thread = currentThread; + this.lockPath = lockPath; + this.lock = lock; + } + + public String getThreadName() { return thread.getName(); } + public String getLockPath() { return lockPath; } + public List<LockInfo> getLockInfos() { return List.copyOf(lockInfos); } + + /** Mutable method (see class doc) */ + void invokingAcquire(Duration timeout) { + invokeAcquireCount.incrementAndGet(); + inCriticalRegionCount.incrementAndGet(); + lockInfos.add(LockInfo.invokingAcquire(lock.getHoldCount(), timeout)); + } + + /** Mutable method (see class doc) */ + void acquireTimedOut() { + if (lockInfos.size() > 1) { + timeoutOnReentrancyErrorCount.incrementAndGet(); + } + + removeLastLockInfo(acquireTimedOutCount, LockInfo::timedOut); + } + + /** Mutable method (see class doc) */ + void lockAcquired() { + lockAcquiredCount.incrementAndGet(); + getLastLockInfo().ifPresent(LockInfo::lockAcquired); + } + + /** Mutable method (see class doc) */ + void failedToAcquireReentrantLock() { + removeLastLockInfo(failedToAcquireReentrantLockCount, LockInfo::failedToAcquireReentrantLock); + } + + /** Mutable method (see class doc) */ + void lockReleased() { + removeLastLockInfo(locksReleasedCount, LockInfo::released); + } + + private Optional<LockInfo> getLastLockInfo() { + return lockInfos.isEmpty() ? Optional.empty() : Optional.of(lockInfos.peek()); + } + + private void removeLastLockInfo(AtomicInteger metricToIncrement, Consumer<LockInfo> completeLockInfo) { + metricToIncrement.incrementAndGet(); + inCriticalRegionCount.decrementAndGet(); + + if (lockInfos.isEmpty()) { + noLocksErrorCount.incrementAndGet(); + return; + } + + LockInfo lockInfo = lockInfos.poll(); + completeLockInfo.accept(lockInfo); + + if (completedLockInfos.size() >= MAX_COMPLETED_LOCK_INFOS_SIZE) { + // This is thread-safe, as no-one but currentThread mutates completedLockInfos + completedLockInfos.removeLast(); + } + completedLockInfos.addFirst(lockInfo); + } +} |