diff options
author | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-10-19 08:52:39 +0200 |
---|---|---|
committer | Håkon Hallingstad <hakon@verizonmedia.com> | 2020-10-19 08:52:39 +0200 |
commit | 7c04dcf468ffb100bc1d6bda87058b442ed5cce1 (patch) | |
tree | 0c94773157b11bc68520e54354434059a86cc117 /zkfacade/src | |
parent | 411aa57af0a5d3f8d9441e22c8cdb59d482702d0 (diff) |
Replace deadlock avoidance with metrics
Diffstat (limited to 'zkfacade/src')
4 files changed, 91 insertions, 29 deletions
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java index 36758354171..ce3676433cd 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java @@ -30,6 +30,16 @@ public class LockMetrics { private final LatencyStats acquireStats = new LatencyStats(); private final LatencyStats lockedStats = new LatencyStats(); + private final AtomicInteger deadlockCount = new AtomicInteger(0); + private final AtomicInteger acquireWithoutReleaseCount = new AtomicInteger(0); + private final AtomicInteger nakedReleaseCount = new AtomicInteger(0); + private final AtomicInteger foreignReleaseCount = new AtomicInteger(0); + + private final AtomicInteger cumulativeDeadlockCount = new AtomicInteger(0); + private final AtomicInteger cumulativeAcquireWithoutReleaseCount = new AtomicInteger(0); + private final AtomicInteger cumulativeNakedReleaseCount = new AtomicInteger(0); + private final AtomicInteger cumulativeForeignReleaseCount = new AtomicInteger(0); + /** Returns a Runnable that must be invoked when the acquire() finishes. */ ActiveInterval acquireInvoked(boolean reentry) { if (reentry) { @@ -78,6 +88,26 @@ public class LockMetrics { cumulativeReleaseFailedCount.incrementAndGet(); } + void incrementDeadlockCount() { + deadlockCount.incrementAndGet(); + cumulativeDeadlockCount.incrementAndGet(); + } + + void incrementAcquireWithoutReleaseCount() { + acquireWithoutReleaseCount.incrementAndGet(); + cumulativeAcquireWithoutReleaseCount.incrementAndGet(); + } + + void incrementNakedReleaseCount() { + nakedReleaseCount.incrementAndGet(); + cumulativeNakedReleaseCount.incrementAndGet(); + } + + void incrementForeignReleaseCount() { + foreignReleaseCount.incrementAndGet(); + cumulativeForeignReleaseCount.incrementAndGet(); + } + public int getAndResetAcquireCount() { return acquireCount.getAndSet(0); } public int getAndResetAcquireFailedCount() { return acquireFailedCount.getAndSet(0); } public int getAndResetAcquireTimedOutCount() { return acquireTimedOutCount.getAndSet(0); } @@ -85,6 +115,10 @@ public class LockMetrics { public int getAndResetReleaseCount() { return releaseCount.getAndSet(0); } public int getAndResetReleaseFailedCount() { return releaseFailedCount.getAndSet(0); } public int getAndResetReentryCount() { return reentryCount.getAndSet(0); } + public int getAndResetDeadlockCount() { return deadlockCount.getAndSet(0); } + public int getAndResetAcquireWithoutReleaseCount() { return acquireWithoutReleaseCount.getAndSet(0); } + public int getAndResetNakedReleaseCount() { return nakedReleaseCount.getAndSet(0); } + public int getAndResetForeignReleaseCount() { return foreignReleaseCount.getAndSet(0); } public int getCumulativeAcquireCount() { return cumulativeAcquireCount.get(); } public int getCumulativeAcquireFailedCount() { return cumulativeAcquireFailedCount.get(); } @@ -93,6 +127,10 @@ public class LockMetrics { public int getCumulativeReleaseCount() { return cumulativeReleaseCount.get(); } public int getCumulativeReleaseFailedCount() { return cumulativeReleaseFailedCount.get(); } public int getCumulativeReentryCount() { return cumulativeReentryCount.get(); } + public int getCumulativeDeadlockCount() { return cumulativeDeadlockCount.get(); } + public int getCumulativeAcquireWithoutReleaseCount() { return cumulativeAcquireWithoutReleaseCount.get(); } + public int getCumulativeNakedReleaseCount() { return cumulativeNakedReleaseCount.get(); } + public int getCumulativeForeignReleaseCount() { return cumulativeForeignReleaseCount.get(); } public LatencyMetrics getAcquireLatencyMetrics() { return acquireStats.getLatencyMetrics(); } public LatencyMetrics getLockedLatencyMetrics() { return lockedStats.getLatencyMetrics(); } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java index 2ed6655d68f..0c51cfc685d 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java @@ -7,6 +7,7 @@ import java.util.Map; import java.util.Optional; import java.util.PriorityQueue; import java.util.concurrent.ConcurrentHashMap; +import java.util.logging.Logger; /** * This class manages statistics related to lock attempts on {@link com.yahoo.vespa.curator.Lock}. @@ -14,6 +15,8 @@ import java.util.concurrent.ConcurrentHashMap; * @author hakon */ public class LockStats { + private static final Logger logger = Logger.getLogger(LockStats.class.getName()); + // No 'volatile' is needed because field is only ever changed for testing which is single-threaded. private static LockStats stats = new LockStats(); @@ -64,7 +67,8 @@ public class LockStats { void notifyOfThreadHoldingLock(Thread currentThread, String lockPath) { Thread oldThread = lockPathsHeld.put(lockPath, currentThread); if (oldThread != null) { - throw new IllegalStateException("Thread " + currentThread.getName() + + getLockMetrics(lockPath).incrementAcquireWithoutReleaseCount(); + logger.warning("Thread " + currentThread.getName() + " reports it has the lock on " + lockPath + ", but thread " + oldThread.getName() + " has not reported it released the lock"); } @@ -74,10 +78,12 @@ public class LockStats { void notifyOfThreadReleasingLock(Thread currentThread, String lockPath) { Thread oldThread = lockPathsHeld.remove(lockPath); if (oldThread == null) { - throw new IllegalStateException("Thread " + currentThread.getName() + - " is releasing the lock " + lockPath + ", but nobody own that lock"); + getLockMetrics(lockPath).incrementNakedReleaseCount(); + logger.warning("Thread " + currentThread.getName() + + " is releasing the lock " + lockPath + ", but nobody owns that lock"); } else if (oldThread != currentThread) { - throw new IllegalStateException("Thread " + currentThread.getName() + + getLockMetrics(lockPath).incrementForeignReleaseCount(); + logger.warning("Thread " + currentThread.getName() + " is releasing the lock " + lockPath + ", but it was owned by thread " + oldThread.getName()); } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java index f38811c4b3c..70442e4833d 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java @@ -1,12 +1,13 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.curator.stats; -import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.vespa.curator.Lock; import java.time.Duration; +import java.util.HashSet; import java.util.List; import java.util.Optional; +import java.util.Set; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.function.Consumer; import java.util.logging.Logger; @@ -70,7 +71,7 @@ public class ThreadLockStats { boolean reentry = lockAttemptsStack.stream().anyMatch(lockAttempt -> lockAttempt.getLockPath().equals(lockPath)); if (!reentry) { - throwOnDeadlock(lockPath); + testForDeadlock(lockPath); } LockAttempt lockAttempt = LockAttempt.invokingAcquire(this, lockPath, timeout, @@ -151,40 +152,48 @@ public class ThreadLockStats { } /** - * Throws a timeout exception if acquiring the path would cause a deadlock. + * Tries to detect whether acquiring a given lock path would deadlock. * * <p>Thread T0 will deadlock if it tries to acquire a lock on a path L1 held by T1, * and T1 is waiting on L2 held by T2, and so forth, and TN is waiting on L0 held by T0.</p> * + * + * <p>Since the underlying data structures are concurrently being modified (as an optimization, + * no lock is taken for this calculation), a cycle may be detected not involving T0.</p> + * * <p>This method is a best-effort attempt at detecting deadlocks: A deadlock may in fact be * resolved even though this method throws, if e.g. locks are released just after this - * method </p> - * - * @throws com.google.common.util.concurrent.UncheckedTimeoutException + * method.</p> */ - private void throwOnDeadlock(String pathToAcquire) { + private void testForDeadlock(String pathToAcquire) { LockStats globalLockStats = LockStats.getGlobal(); var errorMessage = new StringBuilder().append("Deadlock detected: Thread ").append(thread.getName()); + // The set of all threads waiting. If we're waiting in a cycle, there is a deadlock... + Set<Thread> threadsAcquiring = new HashSet<>(); + Thread threadAcquiringLockPath = thread; String lockPath = pathToAcquire; + while (true) { Optional<ThreadLockStats> threadLockStats = globalLockStats.getThreadLockStatsHolding(lockPath); if (threadLockStats.isEmpty()) { return; } + Thread threadHoldingLockPath = threadLockStats.get().thread; errorMessage.append(", trying to acquire lock ") .append(lockPath) .append(" held by thread ") - .append(threadLockStats.get().thread.getName()); - - if (threadLockStats.get().thread == thread) { - if (lockPath.equals(pathToAcquire)) { - // reentry, ignore - return; - } else { - throw new UncheckedTimeoutException(errorMessage.toString()); - } + .append(threadHoldingLockPath.getName()); + + if (threadAcquiringLockPath == threadHoldingLockPath) { + // reentry + return; + } else if (threadsAcquiring.contains(threadAcquiringLockPath)) { + // deadlock + getGlobalLockMetrics(pathToAcquire).incrementDeadlockCount(); + logger.warning(errorMessage.toString()); + return; } Optional<String> nextLockPath = threadLockStats.get().acquiringLockPath(); @@ -192,7 +201,9 @@ public class ThreadLockStats { return; } + threadsAcquiring.add(threadHoldingLockPath); lockPath = nextLockPath.get(); + threadAcquiringLockPath = threadHoldingLockPath; } } diff --git a/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java b/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java index c28691fe655..5340232a369 100644 --- a/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java +++ b/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java @@ -31,7 +31,7 @@ public class LockTest { private final InterProcessLock mutex = mock(InterProcessLock.class); private final String lockPath = "/lock/path"; private final String lock2Path = "/lock2/path"; - private static final Duration acquireTimeout = Duration.ofMinutes(10); + private static final Duration acquireTimeout = Duration.ofMillis(1000); private final Lock lock = new Lock(lockPath, mutex); private final Lock lock2 = new Lock(lock2Path, mutex); @@ -93,6 +93,10 @@ public class LockTest { assertEquals(expected.getCumulativeReleaseCount(), actual.getCumulativeReleaseCount()); assertEquals(expected.getCumulativeReleaseFailedCount(), actual.getCumulativeReleaseFailedCount()); assertEquals(expected.getCumulativeReentryCount(), actual.getCumulativeReentryCount()); + assertEquals(expected.getCumulativeDeadlockCount(), actual.getCumulativeDeadlockCount()); + assertEquals(expected.getCumulativeNakedReleaseCount(), actual.getCumulativeNakedReleaseCount()); + assertEquals(expected.getCumulativeAcquireWithoutReleaseCount(), actual.getCumulativeAcquireWithoutReleaseCount()); + assertEquals(expected.getCumulativeForeignReleaseCount(), actual.getCumulativeForeignReleaseCount()); assertEquals(expected.getAndResetAcquireCount(), actual.getAndResetAcquireCount()); assertEquals(expected.getAndResetAcquireFailedCount(), actual.getAndResetAcquireFailedCount()); @@ -228,14 +232,13 @@ public class LockTest { lock1.acquire(acquireTimeout); fail(); } catch (UncheckedTimeoutException e) { - assertEquals( - "Unexpected timeout exception message: " + e.getMessage(), - "Deadlock detected: Thread main, " + - "trying to acquire lock /lock/path/1 held by thread LockTest-async-thread, " + - "trying to acquire lock /lock/path/2 held by thread main", - e.getMessage()); + assertEquals("Timed out after waiting PT1S to acquire lock '/lock/path/1'", e.getMessage()); } + LockMetrics lockMetrics = LockStats.getGlobal().getLockMetrics("/lock/path/1"); + assertEquals(1, lockMetrics.getAndResetDeadlockCount()); + assertEquals(1, lockMetrics.getCumulativeDeadlockCount()); + // Unlock, which unblocks thread lock2.close(); thread.join(); @@ -245,7 +248,9 @@ public class LockTest { lock1.acquire(acquireTimeout); // This will block - lock2.acquire(acquireTimeout); + try { + lock2.acquire(acquireTimeout); + } catch (UncheckedTimeoutException ignored) {} lock2.close(); @@ -255,7 +260,9 @@ public class LockTest { private static class InterProcessMutexMock implements InterProcessLock { private final ReentrantLock lock = new ReentrantLock(); @Override public void acquire() throws Exception { lock.lock(); } - @Override public boolean acquire(long time, TimeUnit unit) throws Exception { acquire(); return true; } + @Override public boolean acquire(long time, TimeUnit unit) throws Exception { + return lock.tryLock(time, unit); + } @Override public void release() throws Exception { lock.unlock(); } @Override public boolean isAcquiredInThisProcess() { return lock.isLocked(); } } |