summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjonmv <venstad@gmail.com>2023-09-07 16:52:51 +0200
committerjonmv <venstad@gmail.com>2023-09-07 16:52:51 +0200
commitc226beefe3ac3005b566a3b668d1521ae314e06b (patch)
tree176ba69d67e840c3b6891bfdd075d51fb8bae322
parent0ae2ba10acbd196f77459f0da2813d2b44590e8f (diff)
Add thread name template as dimension for locked load for curator locks
-rw-r--r--container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java41
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java7
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java14
-rw-r--r--zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java18
5 files changed, 62 insertions, 20 deletions
diff --git a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java
index 83e9e496411..303febe1e39 100644
--- a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java
+++ b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java
@@ -50,7 +50,7 @@ public class ContainerThreadpoolImpl extends AbstractComponent implements AutoCl
ThreadPoolMetric threadPoolMetric = new ThreadPoolMetric(metric, name);
WorkerCompletionTimingThreadPoolExecutor executor =
new WorkerCompletionTimingThreadPoolExecutor(minThreads, maxThreads,
- (int)config.keepAliveTime() * 1000, TimeUnit.MILLISECONDS,
+ (long) config.keepAliveTime() * 1000, TimeUnit.MILLISECONDS,
createQueue(queueSize),
ThreadFactoryFactory.getThreadFactory(name),
threadPoolMetric);
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
index 15913fec5ed..a7e82250275 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java
@@ -310,42 +310,47 @@ public class MetricsReporter extends NodeRepositoryMaintainer {
}
private void updateLockMetrics() {
+ Set<Pair<Metric.Context, String>> currentNonZeroMetrics = new HashSet<>();
LockStats.getGlobal().getLockMetricsByPath()
.forEach((lockPath, lockMetrics) -> {
Metric.Context context = getContext(Map.of("lockPath", lockPath));
LatencyMetrics acquireLatencyMetrics = lockMetrics.getAndResetAcquireLatencyMetrics();
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_MAX_ACTIVE_LATENCY.baseName(), acquireLatencyMetrics.maxActiveLatencySeconds(), context);
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_HZ.baseName(), acquireLatencyMetrics.startHz(), context);
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_LOAD.baseName(), acquireLatencyMetrics.load(), context);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_MAX_ACTIVE_LATENCY.baseName(), acquireLatencyMetrics.maxActiveLatencySeconds(), context, currentNonZeroMetrics);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_HZ.baseName(), acquireLatencyMetrics.startHz(), context, currentNonZeroMetrics);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_LOAD.baseName(), acquireLatencyMetrics.load(), context, currentNonZeroMetrics);
LatencyMetrics lockedLatencyMetrics = lockMetrics.getAndResetLockedLatencyMetrics();
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LATENCY.baseName(), lockedLatencyMetrics.maxLatencySeconds(), context);
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD.baseName(), lockedLatencyMetrics.load(), context);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LATENCY.baseName(), lockedLatencyMetrics.maxLatencySeconds(), context, currentNonZeroMetrics);
+ lockedLatencyMetrics.loadByThread().forEach((name, load) -> {
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD.baseName(), load, getContext(Map.of("lockPath", lockPath, "thread", name)), currentNonZeroMetrics);
+ });
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_TIMED_OUT.baseName(), lockMetrics.getAndResetAcquireTimedOutCount(), context);
- setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_DEADLOCK.baseName(), lockMetrics.getAndResetDeadlockCount(), context);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_TIMED_OUT.baseName(), lockMetrics.getAndResetAcquireTimedOutCount(), context, currentNonZeroMetrics);
+ setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_DEADLOCK.baseName(), lockMetrics.getAndResetDeadlockCount(), context, currentNonZeroMetrics);
// bucket for various rare errors - to reduce #metrics
setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ERRORS.baseName(),
- lockMetrics.getAndResetAcquireFailedCount() +
- lockMetrics.getAndResetReleaseFailedCount() +
- lockMetrics.getAndResetNakedReleaseCount() +
- lockMetrics.getAndResetAcquireWithoutReleaseCount() +
- lockMetrics.getAndResetForeignReleaseCount(),
- context);
+ lockMetrics.getAndResetAcquireFailedCount() +
+ lockMetrics.getAndResetReleaseFailedCount() +
+ lockMetrics.getAndResetNakedReleaseCount() +
+ lockMetrics.getAndResetAcquireWithoutReleaseCount() +
+ lockMetrics.getAndResetForeignReleaseCount(),
+ context,
+ currentNonZeroMetrics);
});
+ // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying a non-zero 'last' from earlier periods.
+ nonZeroMetrics.removeIf(currentNonZeroMetrics::contains); // Retain those that turned zero for this period.
+ nonZeroMetrics.forEach(metricKey -> metric.set(metricKey.getSecond(), 0, metricKey.getFirst()));
+ nonZeroMetrics.clear();
+ nonZeroMetrics.addAll(currentNonZeroMetrics);
}
- private void setNonZero(String key, Number value, Metric.Context context) {
+ private void setNonZero(String key, Number value, Metric.Context context, Set<Pair<Metric.Context, String>> nonZeroMetrics) {
var metricKey = new Pair<>(context, key);
if (Double.compare(value.doubleValue(), 0.0) != 0) {
metric.set(key, value, context);
nonZeroMetrics.add(metricKey);
- } else if (nonZeroMetrics.remove(metricKey)) {
- // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying
- // a non-zero 'last' from earlier periods.
- metric.set(key, value, context);
}
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
index 42904bb6d68..8e4bcd6d942 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java
@@ -115,6 +115,13 @@ public class LocksResponse extends HttpResponse {
setNonZeroDouble(cursor, name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds());
setNonZeroDouble(cursor, name + "Hz", latencyMetrics.endHz());
setNonZeroDouble(cursor, name + "Load", latencyMetrics.load());
+ if (latencyMetrics.loadByThread().isEmpty()) return;
+ Cursor loadByThreadCursor = cursor.setArray(name + "LoadByThread");
+ latencyMetrics.loadByThread().forEach((threadName, load) -> {
+ Cursor loadForThreadCursor = loadByThreadCursor.addObject();
+ loadForThreadCursor.setString("name", threadName);
+ loadForThreadCursor.setDouble("load", load);
+ });
}
private static void setNonZeroDouble(Cursor cursor, String fieldName, double value) {
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java
index 0d5320c387a..58780f3906e 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java
@@ -2,6 +2,9 @@
package com.yahoo.vespa.curator.stats;
import java.time.Duration;
+import java.util.Collections;
+import java.util.Map;
+import java.util.TreeMap;
import static java.lang.Math.round;
@@ -22,18 +25,20 @@ public class LatencyMetrics {
private final Duration maxActiveLatency;
private final double startHz;
private final double endHz;
+ private final Map<String, Double> loadByThread;
private final double load;
private final int maxLoad;
private final int currentLoad;
public LatencyMetrics(Duration latency, Duration maxLatency, Duration maxActiveLatency,
- double startHz, double endHz,
+ double startHz, double endHz, Map<String, Double> loadByThread,
double load, int maxLoad, int currentLoad) {
this.latency = latency;
this.maxLatency = maxLatency;
this.maxActiveLatency = maxActiveLatency;
this.startHz = startHz;
this.endHz = endHz;
+ this.loadByThread = new TreeMap<>(loadByThread);
this.load = load;
this.maxLoad = maxLoad;
this.currentLoad = currentLoad;
@@ -54,6 +59,13 @@ public class LatencyMetrics {
/** Returns the average number of intervals that ended in the period per second. */
public double endHz() { return roundTo3DecimalPlaces(endHz); }
+ /** Returns the average load of the implied time periond, per thread, with 3 decimal places precision. */
+ public Map<String, Double> loadByThread() {
+ Map<String, Double> result = new TreeMap<>();
+ loadByThread.forEach((name, load) -> result.put(name, roundTo3DecimalPlaces(load)));
+ return Collections.unmodifiableMap(result);
+ }
+
/** The average load of the implied time period, with 3 decimal places precision. */
public double load() { return roundTo3DecimalPlaces(load); }
diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java
index 367d5ab2b9f..999b3b4c6d2 100644
--- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java
+++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java
@@ -3,7 +3,9 @@ package com.yahoo.vespa.curator.stats;
import java.time.Duration;
import java.util.Comparator;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Optional;
import java.util.function.LongSupplier;
import java.util.logging.Level;
@@ -36,6 +38,7 @@ public class LatencyStats {
private long startOfPeriodNanos;
private long endOfPeriodNanos;
private double cumulativeLoadNanos;
+ private final Map<String, Long> cumulativeLoadNanosByThread = new HashMap<>();
private Duration cumulativeLatency;
private Duration maxLatency;
private int numIntervalsStarted;
@@ -92,6 +95,8 @@ public class LatencyStats {
private static class ActiveIntervalInfo {
private final long startNanos;
+ // Poor man's attempt at collapsing thread names into their pool names, as that is the relevant (task) level here.
+ private final String threadNameTemplate = Thread.currentThread().getName().replaceAll("\\d+", "*");
public ActiveIntervalInfo(long startOfIntervalNanos) { this.startNanos = startOfIntervalNanos; }
public long startOfIntervalNanos() { return startNanos; }
}
@@ -109,6 +114,11 @@ public class LatencyStats {
private void pushEndOfPeriodToNow() {
long currentNanos = nanoTimeSupplier.getAsLong();
cumulativeLoadNanos += activeIntervals.size() * (currentNanos - endOfPeriodNanos);
+ for (ActiveIntervalInfo activeInterval : activeIntervals) {
+ cumulativeLoadNanosByThread.merge(activeInterval.threadNameTemplate,
+ currentNanos - endOfPeriodNanos,
+ Long::sum);
+ }
endOfPeriodNanos = currentNanos;
}
@@ -146,15 +156,22 @@ public class LatencyStats {
.orElse(maxLatency);
final double startHz, endHz, load;
+ final Map<String, Double> loadByThread = new HashMap<>();
long periodNanos = endOfPeriodNanos - startOfPeriodNanos;
if (periodNanos > 0) {
double periodSeconds = periodNanos / 1_000_000_000.0;
startHz = numIntervalsStarted / periodSeconds;
endHz = numIntervalsEnded / periodSeconds;
load = cumulativeLoadNanos / periodNanos;
+ cumulativeLoadNanosByThread.forEach((name, threadLoad) -> {
+ if (threadLoad > 0) loadByThread.put(name, threadLoad / (double) periodNanos);
+ });
} else {
startHz = endHz = 0.0;
load = activeIntervals.size();
+ for (ActiveIntervalInfo activeInterval : activeIntervals) {
+ loadByThread.put(activeInterval.threadNameTemplate, 1.0);
+ }
}
return new LatencyMetrics(latency,
@@ -162,6 +179,7 @@ public class LatencyStats {
maxActiveLatency,
startHz,
endHz,
+ loadByThread,
load,
maxLoad,
activeIntervals.size());