summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Marius Venstad <jonmv@users.noreply.github.com>2022-04-26 15:25:04 +0200
committerGitHub <noreply@github.com>2022-04-26 15:25:04 +0200
commite2756ecd8289704764a7c44bb7c83c576730cb76 (patch)
treeac23f005b3e268976260a03adf8c9e59e29c55cd
parent0166052d0b344b16e33d6d5c90571b3343ec9dad (diff)
parent5fafbc12f4d63271534a4522ac136c50ba5027a0 (diff)
Merge pull request #22283 from vespa-engine/hakon/lock-debugging
Debugging of double-locking
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java11
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java43
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java2
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java23
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java15
5 files changed, 74 insertions, 20 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
index 91d127976ce..b0966f7db21 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
@@ -438,8 +438,15 @@ public class JobController {
});
}
finally {
- for (Mutex lock : locks)
- lock.close();
+ for (Mutex lock : locks) {
+ try {
+ lock.close();
+ } catch (Throwable t) {
+ log.log(WARNING, "Failed to close the lock " + lock + ": the lock may or may not " +
+ "have been released in ZooKeeper, and if not this controller " +
+ "must be restarted to release the lock", t);
+ }
+ }
}
}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
index 6f914a8e9a3..05294a5435b 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/Lock.java
@@ -9,6 +9,8 @@ import com.yahoo.vespa.curator.stats.ThreadLockStats;
import org.apache.curator.framework.recipes.locks.InterProcessLock;
import java.time.Duration;
+import java.util.HashMap;
+import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
@@ -21,6 +23,10 @@ import java.util.concurrent.TimeUnit;
*/
public class Lock implements Mutex {
+ private final Object monitor = new Object();
+ private long nextSequenceNumber = 0;
+ private final Map<Long, Long> reentriesByThreadId = new HashMap<>();
+
private final InterProcessLock mutex;
private final String lockPath;
@@ -52,14 +58,43 @@ public class Lock implements Mutex {
throw new UncheckedTimeoutException("Timed out after waiting " + timeout +
" to acquire lock '" + lockPath + "'");
}
- threadLockStats.lockAcquired();
+
+ invoke(+1L, threadLockStats::lockAcquired);
+ }
+
+ @FunctionalInterface
+ private interface TriConsumer {
+ void accept(String lockId, long reentryCountDiff, Map<Long, Long> reentriesByThreadId);
+ }
+
+ // TODO(hakon): Remove once debugging is unnecessary
+ private void invoke(long reentryCountDiff, TriConsumer consumer) {
+ long threadId = Thread.currentThread().getId();
+ final long sequenceNumber;
+ final Map<Long, Long> reentriesByThreadIdCopy;
+ synchronized (monitor) {
+ sequenceNumber = nextSequenceNumber++;
+ reentriesByThreadId.merge(threadId, reentryCountDiff, (oldValue, argumentValue) -> {
+ long sum = oldValue + argumentValue /* == reentryCountDiff */;
+ if (sum == 0) {
+ // Remove from map
+ return null;
+ } else {
+ return sum;
+ }
+ });
+ reentriesByThreadIdCopy = Map.copyOf(reentriesByThreadId);
+ }
+
+ String lockId = Integer.toHexString(System.identityHashCode(this));
+ consumer.accept(lockId, sequenceNumber, reentriesByThreadIdCopy);
}
@Override
public void close() {
ThreadLockStats threadLockStats = LockStats.getForCurrentThread();
// Update metrics now before release() to avoid double-counting time in locked state.
- threadLockStats.preRelease();
+ invoke(-1L, threadLockStats::preRelease);
try {
mutex.release();
threadLockStats.postRelease();
@@ -72,6 +107,10 @@ public class Lock implements Mutex {
protected String lockPath() { return lockPath; }
+ @Override
+ public String toString() {
+ return "Lock{" + lockPath + "}";
+ }
}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java
index 887e2cd2700..1bbd3c7c734 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockAttempt.java
@@ -66,7 +66,7 @@ public class LockAttempt {
public String getLockPath() { return lockPath; }
public Instant getTimeAcquiredWasInvoked() { return callAcquireInstant; }
public Duration getAcquireTimeout() { return timeout; }
- public boolean getReentry() { return reentry; }
+ public boolean isReentry() { return reentry; }
public LockState getLockState() { return lockState; }
public Optional<Instant> getTimeLockWasAcquired() { return lockAcquiredInstant; }
public boolean isAcquiring() { return lockAcquiredInstant.isEmpty(); }
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
index e4f78a4f9e9..ecb344dedb9 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LockStats.java
@@ -64,28 +64,33 @@ public class LockStats {
}
/** Must be invoked only after the first and non-reentry acquisition of the lock. */
- void notifyOfThreadHoldingLock(Thread currentThread, String lockPath) {
+ void notifyOfThreadHoldingLock(Thread currentThread, String lockPath, String lockId,
+ long sequenceNumber, Map<Long, Long> reentriesByThreadId) {
Thread oldThread = lockPathsHeld.put(lockPath, currentThread);
if (oldThread != null) {
getLockMetrics(lockPath).incrementAcquireWithoutReleaseCount();
- logger.warning("Thread " + currentThread.getName() +
- " reports it has the lock on " + lockPath + ", but thread " + oldThread.getName() +
- " has not reported it released the lock");
+ logger.warning("Thread " + currentThread.getName() + " reports it has the lock on " +
+ lockPath + ", but thread " + oldThread.getName() +
+ " has not reported it released the lock. " + lockId + "#" + sequenceNumber +
+ ", reentries by thread ID = " + reentriesByThreadId);
}
}
/** Must be invoked only before the last and non-reentry release of the lock. */
- void notifyOfThreadReleasingLock(Thread currentThread, String lockPath) {
+ void notifyOfThreadReleasingLock(Thread currentThread, String lockPath, String lockId,
+ long sequenceNumber, Map<Long, Long> reentriesByThreadId) {
Thread oldThread = lockPathsHeld.remove(lockPath);
if (oldThread == null) {
getLockMetrics(lockPath).incrementNakedReleaseCount();
- logger.warning("Thread " + currentThread.getName() +
- " is releasing the lock " + lockPath + ", but nobody owns that lock");
+ logger.warning("Thread " + currentThread.getName() + " is releasing the lock " + lockPath +
+ ", but nobody owns that lock. " + lockId + "#" + sequenceNumber +
+ ", reentries by thread ID = " + reentriesByThreadId);
} else if (oldThread != currentThread) {
getLockMetrics(lockPath).incrementForeignReleaseCount();
logger.warning("Thread " + currentThread.getName() +
- " is releasing the lock " + lockPath + ", but it was owned by thread "
- + oldThread.getName());
+ " is releasing the lock " + lockPath + ", but it was owned by thread " +
+ oldThread.getName() + ". " + lockId + "#" + sequenceNumber +
+ ", reentries by thread ID = " + reentriesByThreadId);
}
}
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
index d4511bd04fb..7f8eafdcf7f 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/ThreadLockStats.java
@@ -6,6 +6,7 @@ import com.yahoo.vespa.curator.Lock;
import java.time.Duration;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedDeque;
@@ -97,7 +98,7 @@ public class ThreadLockStats {
}
/** Mutable method (see class doc) */
- public void lockAcquired() {
+ public void lockAcquired(String lockId, long sequenceNumber, Map<Long, Long> reentriesByThreadId) {
withLastLockAttempt(lockAttempt -> {
// Note on the order of lockAcquired() vs notifyOfThreadHoldingLock(): When the latter is
// invoked, other threads may query e.g. isAcquired() on the lockAttempt, which would
@@ -105,19 +106,21 @@ public class ThreadLockStats {
// but seems better to ensure LockAttempt is updated first.
lockAttempt.lockAcquired();
- if (!lockAttempt.getReentry()) {
- LockStats.getGlobal().notifyOfThreadHoldingLock(thread, lockAttempt.getLockPath());
+ if (!lockAttempt.isReentry()) {
+ LockStats.getGlobal().notifyOfThreadHoldingLock(thread, lockAttempt.getLockPath(),
+ lockId, sequenceNumber, reentriesByThreadId);
}
});
}
/** Mutable method (see class doc) */
- public void preRelease() {
+ public void preRelease(String lockId, long sequenceNumber, Map<Long, Long> reentriesByThreadId) {
withLastLockAttempt(lockAttempt -> {
// Note on the order of these two statement: Same concerns apply here as in lockAcquired().
- if (!lockAttempt.getReentry()) {
- LockStats.getGlobal().notifyOfThreadReleasingLock(thread, lockAttempt.getLockPath());
+ if (!lockAttempt.isReentry()) {
+ LockStats.getGlobal().notifyOfThreadReleasingLock(thread, lockAttempt.getLockPath(),
+ lockId, sequenceNumber, reentriesByThreadId);
}
lockAttempt.preRelease();