diff options
author | jonmv <venstad@gmail.com> | 2023-09-07 16:52:51 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-09-07 16:52:51 +0200 |
commit | c226beefe3ac3005b566a3b668d1521ae314e06b (patch) | |
tree | 176ba69d67e840c3b6891bfdd075d51fb8bae322 | |
parent | 0ae2ba10acbd196f77459f0da2813d2b44590e8f (diff) |
Add thread name template as dimension for locked load for curator locks
5 files changed, 62 insertions, 20 deletions
diff --git a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java index 83e9e496411..303febe1e39 100644 --- a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java +++ b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java @@ -50,7 +50,7 @@ public class ContainerThreadpoolImpl extends AbstractComponent implements AutoCl ThreadPoolMetric threadPoolMetric = new ThreadPoolMetric(metric, name); WorkerCompletionTimingThreadPoolExecutor executor = new WorkerCompletionTimingThreadPoolExecutor(minThreads, maxThreads, - (int)config.keepAliveTime() * 1000, TimeUnit.MILLISECONDS, + (long) config.keepAliveTime() * 1000, TimeUnit.MILLISECONDS, createQueue(queueSize), ThreadFactoryFactory.getThreadFactory(name), threadPoolMetric); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java index 15913fec5ed..a7e82250275 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/MetricsReporter.java @@ -310,42 +310,47 @@ public class MetricsReporter extends NodeRepositoryMaintainer { } private void updateLockMetrics() { + Set<Pair<Metric.Context, String>> currentNonZeroMetrics = new HashSet<>(); LockStats.getGlobal().getLockMetricsByPath() .forEach((lockPath, lockMetrics) -> { Metric.Context context = getContext(Map.of("lockPath", lockPath)); LatencyMetrics acquireLatencyMetrics = lockMetrics.getAndResetAcquireLatencyMetrics(); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_MAX_ACTIVE_LATENCY.baseName(), acquireLatencyMetrics.maxActiveLatencySeconds(), context); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_HZ.baseName(), acquireLatencyMetrics.startHz(), context); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_LOAD.baseName(), acquireLatencyMetrics.load(), context); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_MAX_ACTIVE_LATENCY.baseName(), acquireLatencyMetrics.maxActiveLatencySeconds(), context, currentNonZeroMetrics); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_HZ.baseName(), acquireLatencyMetrics.startHz(), context, currentNonZeroMetrics); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_LOAD.baseName(), acquireLatencyMetrics.load(), context, currentNonZeroMetrics); LatencyMetrics lockedLatencyMetrics = lockMetrics.getAndResetLockedLatencyMetrics(); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LATENCY.baseName(), lockedLatencyMetrics.maxLatencySeconds(), context); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD.baseName(), lockedLatencyMetrics.load(), context); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LATENCY.baseName(), lockedLatencyMetrics.maxLatencySeconds(), context, currentNonZeroMetrics); + lockedLatencyMetrics.loadByThread().forEach((name, load) -> { + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_LOCKED_LOAD.baseName(), load, getContext(Map.of("lockPath", lockPath, "thread", name)), currentNonZeroMetrics); + }); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_TIMED_OUT.baseName(), lockMetrics.getAndResetAcquireTimedOutCount(), context); - setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_DEADLOCK.baseName(), lockMetrics.getAndResetDeadlockCount(), context); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ACQUIRE_TIMED_OUT.baseName(), lockMetrics.getAndResetAcquireTimedOutCount(), context, currentNonZeroMetrics); + setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_DEADLOCK.baseName(), lockMetrics.getAndResetDeadlockCount(), context, currentNonZeroMetrics); // bucket for various rare errors - to reduce #metrics setNonZero(ConfigServerMetrics.LOCK_ATTEMPT_ERRORS.baseName(), - lockMetrics.getAndResetAcquireFailedCount() + - lockMetrics.getAndResetReleaseFailedCount() + - lockMetrics.getAndResetNakedReleaseCount() + - lockMetrics.getAndResetAcquireWithoutReleaseCount() + - lockMetrics.getAndResetForeignReleaseCount(), - context); + lockMetrics.getAndResetAcquireFailedCount() + + lockMetrics.getAndResetReleaseFailedCount() + + lockMetrics.getAndResetNakedReleaseCount() + + lockMetrics.getAndResetAcquireWithoutReleaseCount() + + lockMetrics.getAndResetForeignReleaseCount(), + context, + currentNonZeroMetrics); }); + // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying a non-zero 'last' from earlier periods. + nonZeroMetrics.removeIf(currentNonZeroMetrics::contains); // Retain those that turned zero for this period. + nonZeroMetrics.forEach(metricKey -> metric.set(metricKey.getSecond(), 0, metricKey.getFirst())); + nonZeroMetrics.clear(); + nonZeroMetrics.addAll(currentNonZeroMetrics); } - private void setNonZero(String key, Number value, Metric.Context context) { + private void setNonZero(String key, Number value, Metric.Context context, Set<Pair<Metric.Context, String>> nonZeroMetrics) { var metricKey = new Pair<>(context, key); if (Double.compare(value.doubleValue(), 0.0) != 0) { metric.set(key, value, context); nonZeroMetrics.add(metricKey); - } else if (nonZeroMetrics.remove(metricKey)) { - // Need to set the metric to 0 after it has been set to non-zero, to avoid carrying - // a non-zero 'last' from earlier periods. - metric.set(key, value, context); } } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java index 42904bb6d68..8e4bcd6d942 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/LocksResponse.java @@ -115,6 +115,13 @@ public class LocksResponse extends HttpResponse { setNonZeroDouble(cursor, name + "MaxActiveLatency", latencyMetrics.maxActiveLatencySeconds()); setNonZeroDouble(cursor, name + "Hz", latencyMetrics.endHz()); setNonZeroDouble(cursor, name + "Load", latencyMetrics.load()); + if (latencyMetrics.loadByThread().isEmpty()) return; + Cursor loadByThreadCursor = cursor.setArray(name + "LoadByThread"); + latencyMetrics.loadByThread().forEach((threadName, load) -> { + Cursor loadForThreadCursor = loadByThreadCursor.addObject(); + loadForThreadCursor.setString("name", threadName); + loadForThreadCursor.setDouble("load", load); + }); } private static void setNonZeroDouble(Cursor cursor, String fieldName, double value) { diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java index 0d5320c387a..58780f3906e 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyMetrics.java @@ -2,6 +2,9 @@ package com.yahoo.vespa.curator.stats; import java.time.Duration; +import java.util.Collections; +import java.util.Map; +import java.util.TreeMap; import static java.lang.Math.round; @@ -22,18 +25,20 @@ public class LatencyMetrics { private final Duration maxActiveLatency; private final double startHz; private final double endHz; + private final Map<String, Double> loadByThread; private final double load; private final int maxLoad; private final int currentLoad; public LatencyMetrics(Duration latency, Duration maxLatency, Duration maxActiveLatency, - double startHz, double endHz, + double startHz, double endHz, Map<String, Double> loadByThread, double load, int maxLoad, int currentLoad) { this.latency = latency; this.maxLatency = maxLatency; this.maxActiveLatency = maxActiveLatency; this.startHz = startHz; this.endHz = endHz; + this.loadByThread = new TreeMap<>(loadByThread); this.load = load; this.maxLoad = maxLoad; this.currentLoad = currentLoad; @@ -54,6 +59,13 @@ public class LatencyMetrics { /** Returns the average number of intervals that ended in the period per second. */ public double endHz() { return roundTo3DecimalPlaces(endHz); } + /** Returns the average load of the implied time periond, per thread, with 3 decimal places precision. */ + public Map<String, Double> loadByThread() { + Map<String, Double> result = new TreeMap<>(); + loadByThread.forEach((name, load) -> result.put(name, roundTo3DecimalPlaces(load))); + return Collections.unmodifiableMap(result); + } + /** The average load of the implied time period, with 3 decimal places precision. */ public double load() { return roundTo3DecimalPlaces(load); } diff --git a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java index 367d5ab2b9f..999b3b4c6d2 100644 --- a/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java +++ b/zkfacade/src/main/java/com/yahoo/vespa/curator/stats/LatencyStats.java @@ -3,7 +3,9 @@ package com.yahoo.vespa.curator.stats; import java.time.Duration; import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Optional; import java.util.function.LongSupplier; import java.util.logging.Level; @@ -36,6 +38,7 @@ public class LatencyStats { private long startOfPeriodNanos; private long endOfPeriodNanos; private double cumulativeLoadNanos; + private final Map<String, Long> cumulativeLoadNanosByThread = new HashMap<>(); private Duration cumulativeLatency; private Duration maxLatency; private int numIntervalsStarted; @@ -92,6 +95,8 @@ public class LatencyStats { private static class ActiveIntervalInfo { private final long startNanos; + // Poor man's attempt at collapsing thread names into their pool names, as that is the relevant (task) level here. + private final String threadNameTemplate = Thread.currentThread().getName().replaceAll("\\d+", "*"); public ActiveIntervalInfo(long startOfIntervalNanos) { this.startNanos = startOfIntervalNanos; } public long startOfIntervalNanos() { return startNanos; } } @@ -109,6 +114,11 @@ public class LatencyStats { private void pushEndOfPeriodToNow() { long currentNanos = nanoTimeSupplier.getAsLong(); cumulativeLoadNanos += activeIntervals.size() * (currentNanos - endOfPeriodNanos); + for (ActiveIntervalInfo activeInterval : activeIntervals) { + cumulativeLoadNanosByThread.merge(activeInterval.threadNameTemplate, + currentNanos - endOfPeriodNanos, + Long::sum); + } endOfPeriodNanos = currentNanos; } @@ -146,15 +156,22 @@ public class LatencyStats { .orElse(maxLatency); final double startHz, endHz, load; + final Map<String, Double> loadByThread = new HashMap<>(); long periodNanos = endOfPeriodNanos - startOfPeriodNanos; if (periodNanos > 0) { double periodSeconds = periodNanos / 1_000_000_000.0; startHz = numIntervalsStarted / periodSeconds; endHz = numIntervalsEnded / periodSeconds; load = cumulativeLoadNanos / periodNanos; + cumulativeLoadNanosByThread.forEach((name, threadLoad) -> { + if (threadLoad > 0) loadByThread.put(name, threadLoad / (double) periodNanos); + }); } else { startHz = endHz = 0.0; load = activeIntervals.size(); + for (ActiveIntervalInfo activeInterval : activeIntervals) { + loadByThread.put(activeInterval.threadNameTemplate, 1.0); + } } return new LatencyMetrics(latency, @@ -162,6 +179,7 @@ public class LatencyStats { maxActiveLatency, startHz, endHz, + loadByThread, load, maxLoad, activeIntervals.size()); |