summaryrefslogtreecommitdiffstats
path: root/zkfacade
diff options
context:
space:
mode:
authorHåkon Hallingstad <hakon@verizonmedia.com>2020-10-19 08:52:39 +0200
committerHåkon Hallingstad <hakon@verizonmedia.com>2020-10-19 08:52:39 +0200
commit7c04dcf468ffb100bc1d6bda87058b442ed5cce1 (patch)
tree0c94773157b11bc68520e54354434059a86cc117 /zkfacade
parent411aa57af0a5d3f8d9441e22c8cdb59d482702d0 (diff)
Replace deadlock avoidance with metrics
Diffstat (limited to 'zkfacade')
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java38
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java14
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java43
-rw-r--r--zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java25
4 files changed, 91 insertions, 29 deletions
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java
index 36758354171..ce3676433cd 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockMetrics.java
@@ -30,6 +30,16 @@ public class LockMetrics {
private final LatencyStats acquireStats = new LatencyStats();
private final LatencyStats lockedStats = new LatencyStats();
+ private final AtomicInteger deadlockCount = new AtomicInteger(0);
+ private final AtomicInteger acquireWithoutReleaseCount = new AtomicInteger(0);
+ private final AtomicInteger nakedReleaseCount = new AtomicInteger(0);
+ private final AtomicInteger foreignReleaseCount = new AtomicInteger(0);
+
+ private final AtomicInteger cumulativeDeadlockCount = new AtomicInteger(0);
+ private final AtomicInteger cumulativeAcquireWithoutReleaseCount = new AtomicInteger(0);
+ private final AtomicInteger cumulativeNakedReleaseCount = new AtomicInteger(0);
+ private final AtomicInteger cumulativeForeignReleaseCount = new AtomicInteger(0);
+
/** Returns a Runnable that must be invoked when the acquire() finishes. */
ActiveInterval acquireInvoked(boolean reentry) {
if (reentry) {
@@ -78,6 +88,26 @@ public class LockMetrics {
cumulativeReleaseFailedCount.incrementAndGet();
}
+ void incrementDeadlockCount() {
+ deadlockCount.incrementAndGet();
+ cumulativeDeadlockCount.incrementAndGet();
+ }
+
+ void incrementAcquireWithoutReleaseCount() {
+ acquireWithoutReleaseCount.incrementAndGet();
+ cumulativeAcquireWithoutReleaseCount.incrementAndGet();
+ }
+
+ void incrementNakedReleaseCount() {
+ nakedReleaseCount.incrementAndGet();
+ cumulativeNakedReleaseCount.incrementAndGet();
+ }
+
+ void incrementForeignReleaseCount() {
+ foreignReleaseCount.incrementAndGet();
+ cumulativeForeignReleaseCount.incrementAndGet();
+ }
+
public int getAndResetAcquireCount() { return acquireCount.getAndSet(0); }
public int getAndResetAcquireFailedCount() { return acquireFailedCount.getAndSet(0); }
public int getAndResetAcquireTimedOutCount() { return acquireTimedOutCount.getAndSet(0); }
@@ -85,6 +115,10 @@ public class LockMetrics {
public int getAndResetReleaseCount() { return releaseCount.getAndSet(0); }
public int getAndResetReleaseFailedCount() { return releaseFailedCount.getAndSet(0); }
public int getAndResetReentryCount() { return reentryCount.getAndSet(0); }
+ public int getAndResetDeadlockCount() { return deadlockCount.getAndSet(0); }
+ public int getAndResetAcquireWithoutReleaseCount() { return acquireWithoutReleaseCount.getAndSet(0); }
+ public int getAndResetNakedReleaseCount() { return nakedReleaseCount.getAndSet(0); }
+ public int getAndResetForeignReleaseCount() { return foreignReleaseCount.getAndSet(0); }
public int getCumulativeAcquireCount() { return cumulativeAcquireCount.get(); }
public int getCumulativeAcquireFailedCount() { return cumulativeAcquireFailedCount.get(); }
@@ -93,6 +127,10 @@ public class LockMetrics {
public int getCumulativeReleaseCount() { return cumulativeReleaseCount.get(); }
public int getCumulativeReleaseFailedCount() { return cumulativeReleaseFailedCount.get(); }
public int getCumulativeReentryCount() { return cumulativeReentryCount.get(); }
+ public int getCumulativeDeadlockCount() { return cumulativeDeadlockCount.get(); }
+ public int getCumulativeAcquireWithoutReleaseCount() { return cumulativeAcquireWithoutReleaseCount.get(); }
+ public int getCumulativeNakedReleaseCount() { return cumulativeNakedReleaseCount.get(); }
+ public int getCumulativeForeignReleaseCount() { return cumulativeForeignReleaseCount.get(); }
public LatencyMetrics getAcquireLatencyMetrics() { return acquireStats.getLatencyMetrics(); }
public LatencyMetrics getLockedLatencyMetrics() { return lockedStats.getLatencyMetrics(); }
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
index 2ed6655d68f..0c51cfc685d 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
@@ -7,6 +7,7 @@ import java.util.Map;
import java.util.Optional;
import java.util.PriorityQueue;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.logging.Logger;
/**
* This class manages statistics related to lock attempts on {@link com.yahoo.vespa.curator.Lock}.
@@ -14,6 +15,8 @@ import java.util.concurrent.ConcurrentHashMap;
* @author hakon
*/
public class LockStats {
+ private static final Logger logger = Logger.getLogger(LockStats.class.getName());
+
// No 'volatile' is needed because field is only ever changed for testing which is single-threaded.
private static LockStats stats = new LockStats();
@@ -64,7 +67,8 @@ public class LockStats {
void notifyOfThreadHoldingLock(Thread currentThread, String lockPath) {
Thread oldThread = lockPathsHeld.put(lockPath, currentThread);
if (oldThread != null) {
- throw new IllegalStateException("Thread " + currentThread.getName() +
+ getLockMetrics(lockPath).incrementAcquireWithoutReleaseCount();
+ logger.warning("Thread " + currentThread.getName() +
" reports it has the lock on " + lockPath + ", but thread " + oldThread.getName() +
" has not reported it released the lock");
}
@@ -74,10 +78,12 @@ public class LockStats {
void notifyOfThreadReleasingLock(Thread currentThread, String lockPath) {
Thread oldThread = lockPathsHeld.remove(lockPath);
if (oldThread == null) {
- throw new IllegalStateException("Thread " + currentThread.getName() +
- " is releasing the lock " + lockPath + ", but nobody own that lock");
+ getLockMetrics(lockPath).incrementNakedReleaseCount();
+ logger.warning("Thread " + currentThread.getName() +
+ " is releasing the lock " + lockPath + ", but nobody owns that lock");
} else if (oldThread != currentThread) {
- throw new IllegalStateException("Thread " + currentThread.getName() +
+ getLockMetrics(lockPath).incrementForeignReleaseCount();
+ logger.warning("Thread " + currentThread.getName() +
" is releasing the lock " + lockPath + ", but it was owned by thread "
+ oldThread.getName());
}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
index f38811c4b3c..70442e4833d 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
@@ -1,12 +1,13 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.curator.stats;
-import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.yahoo.vespa.curator.Lock;
import java.time.Duration;
+import java.util.HashSet;
import java.util.List;
import java.util.Optional;
+import java.util.Set;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.function.Consumer;
import java.util.logging.Logger;
@@ -70,7 +71,7 @@ public class ThreadLockStats {
boolean reentry = lockAttemptsStack.stream().anyMatch(lockAttempt -> lockAttempt.getLockPath().equals(lockPath));
if (!reentry) {
- throwOnDeadlock(lockPath);
+ testForDeadlock(lockPath);
}
LockAttempt lockAttempt = LockAttempt.invokingAcquire(this, lockPath, timeout,
@@ -151,40 +152,48 @@ public class ThreadLockStats {
}
/**
- * Throws a timeout exception if acquiring the path would cause a deadlock.
+ * Tries to detect whether acquiring a given lock path would deadlock.
*
* <p>Thread T0 will deadlock if it tries to acquire a lock on a path L1 held by T1,
* and T1 is waiting on L2 held by T2, and so forth, and TN is waiting on L0 held by T0.</p>
*
+ *
+ * <p>Since the underlying data structures are concurrently being modified (as an optimization,
+ * no lock is taken for this calculation), a cycle may be detected not involving T0.</p>
+ *
* <p>This method is a best-effort attempt at detecting deadlocks: A deadlock may in fact be
* resolved even though this method throws, if e.g. locks are released just after this
- * method </p>
- *
- * @throws com.google.common.util.concurrent.UncheckedTimeoutException
+ * method.</p>
*/
- private void throwOnDeadlock(String pathToAcquire) {
+ private void testForDeadlock(String pathToAcquire) {
LockStats globalLockStats = LockStats.getGlobal();
var errorMessage = new StringBuilder().append("Deadlock detected: Thread ").append(thread.getName());
+ // The set of all threads waiting. If we're waiting in a cycle, there is a deadlock...
+ Set<Thread> threadsAcquiring = new HashSet<>();
+ Thread threadAcquiringLockPath = thread;
String lockPath = pathToAcquire;
+
while (true) {
Optional<ThreadLockStats> threadLockStats = globalLockStats.getThreadLockStatsHolding(lockPath);
if (threadLockStats.isEmpty()) {
return;
}
+ Thread threadHoldingLockPath = threadLockStats.get().thread;
errorMessage.append(", trying to acquire lock ")
.append(lockPath)
.append(" held by thread ")
- .append(threadLockStats.get().thread.getName());
-
- if (threadLockStats.get().thread == thread) {
- if (lockPath.equals(pathToAcquire)) {
- // reentry, ignore
- return;
- } else {
- throw new UncheckedTimeoutException(errorMessage.toString());
- }
+ .append(threadHoldingLockPath.getName());
+
+ if (threadAcquiringLockPath == threadHoldingLockPath) {
+ // reentry
+ return;
+ } else if (threadsAcquiring.contains(threadAcquiringLockPath)) {
+ // deadlock
+ getGlobalLockMetrics(pathToAcquire).incrementDeadlockCount();
+ logger.warning(errorMessage.toString());
+ return;
}
Optional<String> nextLockPath = threadLockStats.get().acquiringLockPath();
@@ -192,7 +201,9 @@ public class ThreadLockStats {
return;
}
+ threadsAcquiring.add(threadHoldingLockPath);
lockPath = nextLockPath.get();
+ threadAcquiringLockPath = threadHoldingLockPath;
}
}
diff --git a/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java b/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java
index c28691fe655..5340232a369 100644
--- a/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java
+++ b/zkfacade/src/test/java/com/yahoo/vespa/curator/stats/LockTest.java
@@ -31,7 +31,7 @@ public class LockTest {
private final InterProcessLock mutex = mock(InterProcessLock.class);
private final String lockPath = "/lock/path";
private final String lock2Path = "/lock2/path";
- private static final Duration acquireTimeout = Duration.ofMinutes(10);
+ private static final Duration acquireTimeout = Duration.ofMillis(1000);
private final Lock lock = new Lock(lockPath, mutex);
private final Lock lock2 = new Lock(lock2Path, mutex);
@@ -93,6 +93,10 @@ public class LockTest {
assertEquals(expected.getCumulativeReleaseCount(), actual.getCumulativeReleaseCount());
assertEquals(expected.getCumulativeReleaseFailedCount(), actual.getCumulativeReleaseFailedCount());
assertEquals(expected.getCumulativeReentryCount(), actual.getCumulativeReentryCount());
+ assertEquals(expected.getCumulativeDeadlockCount(), actual.getCumulativeDeadlockCount());
+ assertEquals(expected.getCumulativeNakedReleaseCount(), actual.getCumulativeNakedReleaseCount());
+ assertEquals(expected.getCumulativeAcquireWithoutReleaseCount(), actual.getCumulativeAcquireWithoutReleaseCount());
+ assertEquals(expected.getCumulativeForeignReleaseCount(), actual.getCumulativeForeignReleaseCount());
assertEquals(expected.getAndResetAcquireCount(), actual.getAndResetAcquireCount());
assertEquals(expected.getAndResetAcquireFailedCount(), actual.getAndResetAcquireFailedCount());
@@ -228,14 +232,13 @@ public class LockTest {
lock1.acquire(acquireTimeout);
fail();
} catch (UncheckedTimeoutException e) {
- assertEquals(
- "Unexpected timeout exception message: " + e.getMessage(),
- "Deadlock detected: Thread main, " +
- "trying to acquire lock /lock/path/1 held by thread LockTest-async-thread, " +
- "trying to acquire lock /lock/path/2 held by thread main",
- e.getMessage());
+ assertEquals("Timed out after waiting PT1S to acquire lock '/lock/path/1'", e.getMessage());
}
+ LockMetrics lockMetrics = LockStats.getGlobal().getLockMetrics("/lock/path/1");
+ assertEquals(1, lockMetrics.getAndResetDeadlockCount());
+ assertEquals(1, lockMetrics.getCumulativeDeadlockCount());
+
// Unlock, which unblocks thread
lock2.close();
thread.join();
@@ -245,7 +248,9 @@ public class LockTest {
lock1.acquire(acquireTimeout);
// This will block
- lock2.acquire(acquireTimeout);
+ try {
+ lock2.acquire(acquireTimeout);
+ } catch (UncheckedTimeoutException ignored) {}
lock2.close();
@@ -255,7 +260,9 @@ public class LockTest {
private static class InterProcessMutexMock implements InterProcessLock {
private final ReentrantLock lock = new ReentrantLock();
@Override public void acquire() throws Exception { lock.lock(); }
- @Override public boolean acquire(long time, TimeUnit unit) throws Exception { acquire(); return true; }
+ @Override public boolean acquire(long time, TimeUnit unit) throws Exception {
+ return lock.tryLock(time, unit);
+ }
@Override public void release() throws Exception { lock.unlock(); }
@Override public boolean isAcquiredInThisProcess() { return lock.isLocked(); }
}