diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2022-04-26 15:25:04 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-26 15:25:04 +0200 |
commit | e2756ecd8289704764a7c44bb7c83c576730cb76 (patch) | |
tree | ac23f005b3e268976260a03adf8c9e59e29c55cd | |
parent | 0166052d0b344b16e33d6d5c90571b3343ec9dad (diff) | |
parent | 5fafbc12f4d63271534a4522ac136c50ba5027a0 (diff) |
Merge pull request #22283 from vespa-engine/hakon/lock-debugging
Debugging of double-locking
5 files changed, 74 insertions, 20 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java index 91d127976ce..b0966f7db21 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java @@ -438,8 +438,15 @@ public class JobController { }); } finally { - for (Mutex lock : locks) - lock.close(); + for (Mutex lock : locks) { + try { + lock.close(); + } catch (Throwable t) { + log.log(WARNING, "Failed to close the lock " + lock + ": the lock may or may not " + + "have been released in ZooKeeper, and if not this controller " + + "must be restarted to release the lock", t); + } + } } } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java index 6f914a8e9a3..05294a5435b 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java @@ -9,6 +9,8 @@ import com.yahoo.vespa.curator.stats.ThreadLockStats; import org.apache.curator.framework.recipes.locks.InterProcessLock; import java.time.Duration; +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.TimeUnit; /** @@ -21,6 +23,10 @@ import java.util.concurrent.TimeUnit; */ public class Lock implements Mutex { + private final Object monitor = new Object(); + private long nextSequenceNumber = 0; + private final Map<Long, Long> reentriesByThreadId = new HashMap<>(); + private final InterProcessLock mutex; private final String lockPath; @@ -52,14 +58,43 @@ public class Lock implements Mutex { throw new UncheckedTimeoutException("Timed out after waiting " + timeout + " to acquire lock '" + lockPath + "'"); } - threadLockStats.lockAcquired(); + + invoke(+1L, threadLockStats::lockAcquired); + } + + @FunctionalInterface + private interface TriConsumer { + void accept(String lockId, long reentryCountDiff, Map<Long, Long> reentriesByThreadId); + } + + // TODO(hakon): Remove once debugging is unnecessary + private void invoke(long reentryCountDiff, TriConsumer consumer) { + long threadId = Thread.currentThread().getId(); + final long sequenceNumber; + final Map<Long, Long> reentriesByThreadIdCopy; + synchronized (monitor) { + sequenceNumber = nextSequenceNumber++; + reentriesByThreadId.merge(threadId, reentryCountDiff, (oldValue, argumentValue) -> { + long sum = oldValue + argumentValue /* == reentryCountDiff */; + if (sum == 0) { + // Remove from map + return null; + } else { + return sum; + } + }); + reentriesByThreadIdCopy = Map.copyOf(reentriesByThreadId); + } + + String lockId = Integer.toHexString(System.identityHashCode(this)); + consumer.accept(lockId, sequenceNumber, reentriesByThreadIdCopy); } @Override public void close() { ThreadLockStats threadLockStats = LockStats.getForCurrentThread(); // Update metrics now before release() to avoid double-counting time in locked state. - threadLockStats.preRelease(); + invoke(-1L, threadLockStats::preRelease); try { mutex.release(); threadLockStats.postRelease(); @@ -72,6 +107,10 @@ public class Lock implements Mutex { protected String lockPath() { return lockPath; } + @Override + public String toString() { + return "Lock{" + lockPath + "}"; + } } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java index 887e2cd2700..1bbd3c7c734 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java @@ -66,7 +66,7 @@ public class LockAttempt { public String getLockPath() { return lockPath; } public Instant getTimeAcquiredWasInvoked() { return callAcquireInstant; } public Duration getAcquireTimeout() { return timeout; } - public boolean getReentry() { return reentry; } + public boolean isReentry() { return reentry; } public LockState getLockState() { return lockState; } public Optional<Instant> getTimeLockWasAcquired() { return lockAcquiredInstant; } public boolean isAcquiring() { return lockAcquiredInstant.isEmpty(); } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java index e4f78a4f9e9..ecb344dedb9 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java @@ -64,28 +64,33 @@ public class LockStats { } /** Must be invoked only after the first and non-reentry acquisition of the lock. */ - void notifyOfThreadHoldingLock(Thread currentThread, String lockPath) { + void notifyOfThreadHoldingLock(Thread currentThread, String lockPath, String lockId, + long sequenceNumber, Map<Long, Long> reentriesByThreadId) { Thread oldThread = lockPathsHeld.put(lockPath, currentThread); if (oldThread != null) { getLockMetrics(lockPath).incrementAcquireWithoutReleaseCount(); - logger.warning("Thread " + currentThread.getName() + - " reports it has the lock on " + lockPath + ", but thread " + oldThread.getName() + - " has not reported it released the lock"); + logger.warning("Thread " + currentThread.getName() + " reports it has the lock on " + + lockPath + ", but thread " + oldThread.getName() + + " has not reported it released the lock. " + lockId + "#" + sequenceNumber + + ", reentries by thread ID = " + reentriesByThreadId); } } /** Must be invoked only before the last and non-reentry release of the lock. */ - void notifyOfThreadReleasingLock(Thread currentThread, String lockPath) { + void notifyOfThreadReleasingLock(Thread currentThread, String lockPath, String lockId, + long sequenceNumber, Map<Long, Long> reentriesByThreadId) { Thread oldThread = lockPathsHeld.remove(lockPath); if (oldThread == null) { getLockMetrics(lockPath).incrementNakedReleaseCount(); - logger.warning("Thread " + currentThread.getName() + - " is releasing the lock " + lockPath + ", but nobody owns that lock"); + logger.warning("Thread " + currentThread.getName() + " is releasing the lock " + lockPath + + ", but nobody owns that lock. " + lockId + "#" + sequenceNumber + + ", reentries by thread ID = " + reentriesByThreadId); } else if (oldThread != currentThread) { getLockMetrics(lockPath).incrementForeignReleaseCount(); logger.warning("Thread " + currentThread.getName() + - " is releasing the lock " + lockPath + ", but it was owned by thread " - + oldThread.getName()); + " is releasing the lock " + lockPath + ", but it was owned by thread " + + oldThread.getName() + ". " + lockId + "#" + sequenceNumber + + ", reentries by thread ID = " + reentriesByThreadId); } } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java index d4511bd04fb..7f8eafdcf7f 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java @@ -6,6 +6,7 @@ import com.yahoo.vespa.curator.Lock; import java.time.Duration; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentLinkedDeque; @@ -97,7 +98,7 @@ public class ThreadLockStats { } /** Mutable method (see class doc) */ - public void lockAcquired() { + public void lockAcquired(String lockId, long sequenceNumber, Map<Long, Long> reentriesByThreadId) { withLastLockAttempt(lockAttempt -> { // Note on the order of lockAcquired() vs notifyOfThreadHoldingLock(): When the latter is // invoked, other threads may query e.g. isAcquired() on the lockAttempt, which would @@ -105,19 +106,21 @@ public class ThreadLockStats { // but seems better to ensure LockAttempt is updated first. lockAttempt.lockAcquired(); - if (!lockAttempt.getReentry()) { - LockStats.getGlobal().notifyOfThreadHoldingLock(thread, lockAttempt.getLockPath()); + if (!lockAttempt.isReentry()) { + LockStats.getGlobal().notifyOfThreadHoldingLock(thread, lockAttempt.getLockPath(), + lockId, sequenceNumber, reentriesByThreadId); } }); } /** Mutable method (see class doc) */ - public void preRelease() { + public void preRelease(String lockId, long sequenceNumber, Map<Long, Long> reentriesByThreadId) { withLastLockAttempt(lockAttempt -> { // Note on the order of these two statement: Same concerns apply here as in lockAcquired(). - if (!lockAttempt.getReentry()) { - LockStats.getGlobal().notifyOfThreadReleasingLock(thread, lockAttempt.getLockPath()); + if (!lockAttempt.isReentry()) { + LockStats.getGlobal().notifyOfThreadReleasingLock(thread, lockAttempt.getLockPath(), + lockId, sequenceNumber, reentriesByThreadId); } lockAttempt.preRelease(); |