diff options
author | Harald Musum <musum@verizonmedia.com> | 2023-10-11 23:15:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-11 23:15:01 +0200 |
commit | d46e3ce2535ffcb11c71b6781f8ca4816ba6f71d (patch) | |
tree | 4f33d356b7ca0e1323fd0a897af4dbab9d551a54 | |
parent | f47b44fb69228c7a7ec316d5213508bd563bf907 (diff) | |
parent | 9c8cdbe382aafb770cfa642ed51f12883b70c539 (diff) |
Merge pull request #28883 from vespa-engine/jonmv/reapply-job-runner-metrics
Jonmv/reapply job runner metrics
13 files changed, 121 insertions, 19 deletions
diff --git a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java index befbda28ac0..f92d218390f 100644 --- a/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java +++ b/container-core/src/main/java/com/yahoo/container/handler/threadpool/ContainerThreadpoolImpl.java @@ -54,7 +54,7 @@ public class ContainerThreadpoolImpl extends AbstractComponent implements AutoCl createQueue(queueSize), ThreadFactoryFactory.getThreadFactory(name), threadPoolMetric); - // Prestart needed, if not all threads will be created by the fist N tasks and hence they might also + // Pre-start needed, if not all threads will be created by the fist N tasks and hence they might also // get the dreaded thread locals initialized even if they will never run. // That counters what we want to achieve with the Q that will prefer thread locality. executor.prestartAllCoreThreads(); diff --git a/container-core/src/main/java/com/yahoo/container/handler/threadpool/WorkerCompletionTimingThreadPoolExecutor.java b/container-core/src/main/java/com/yahoo/container/handler/threadpool/WorkerCompletionTimingThreadPoolExecutor.java index e3c2c78abec..cee2cc54b5b 100644 --- a/container-core/src/main/java/com/yahoo/container/handler/threadpool/WorkerCompletionTimingThreadPoolExecutor.java +++ b/container-core/src/main/java/com/yahoo/container/handler/threadpool/WorkerCompletionTimingThreadPoolExecutor.java @@ -8,8 +8,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; /** - * A thread pool executor which maintains the last time a worker completed - * package private for testing + * A thread pool executor which maintains the last time a worker completed. * * @author Steinar Knutsen * @author baldersheim diff --git a/container-core/src/main/java/com/yahoo/container/protect/ProcessTerminator.java b/container-core/src/main/java/com/yahoo/container/protect/ProcessTerminator.java index 315dc21ec38..3c44ba2eaa6 100644 --- a/container-core/src/main/java/com/yahoo/container/protect/ProcessTerminator.java +++ b/container-core/src/main/java/com/yahoo/container/protect/ProcessTerminator.java @@ -4,8 +4,8 @@ package com.yahoo.container.protect; import com.yahoo.protect.Process; /** - * An injectable terminator of the Java vm. - * Components that encounters conditions where the vm should be terminated should + * An injectable terminator of the Java VM. + * Components that encounter conditions where the VM should be terminated should * request an instance of this injected. That makes termination testable * as tests can create subclasses of this which register the termination request * rather than terminating. diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 662b4018a34..1080b379c4d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -727,6 +727,8 @@ public class InternalStepRunner implements StepRunner { DeploymentSpec spec = controller.applications().requireApplication(TenantAndApplicationId.from(id.application())).deploymentSpec(); boolean requireTests = spec.steps().stream().anyMatch(step -> step.concerns(id.type().environment())); + logger.log(WARNING, "No tests were actually run, but this test suite is explicitly declared in 'deployment.xml'. " + + "Either add tests, ensure they're correctly configured, or remove the test declaration."); return Optional.of(requireTests ? testFailure : noTests); } case SUCCESS: diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index 6f00ff39637..0f482b1a015 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -1,7 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.concurrent.DaemonThreadFactory; +import com.yahoo.jdisc.Metric; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; import com.yahoo.vespa.hosted.controller.deployment.InternalStepRunner; @@ -11,11 +13,14 @@ import com.yahoo.vespa.hosted.controller.deployment.Step; import com.yahoo.vespa.hosted.controller.deployment.StepRunner; import java.time.Duration; +import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; @@ -32,22 +37,29 @@ public class JobRunner extends ControllerMaintainer { private final JobController jobs; private final ExecutorService executors; private final StepRunner runner; + private final Metrics metrics; public JobRunner(Controller controller, Duration duration) { - this(controller, duration, Executors.newFixedThreadPool(32, new DaemonThreadFactory("job-runner-")), new InternalStepRunner(controller)); + this(controller, duration, Executors.newFixedThreadPool(32, new DaemonThreadFactory("job-runner-")), + new InternalStepRunner(controller)); } public JobRunner(Controller controller, Duration duration, ExecutorService executors, StepRunner runner) { + this(controller, duration, executors, runner, new Metrics(controller.metric(), Duration.ofMillis(100))); + } + + JobRunner(Controller controller, Duration duration, ExecutorService executors, StepRunner runner, Metrics metrics) { super(controller, duration); this.jobs = controller.jobController(); this.jobs.setRunner(this::advance); this.executors = executors; this.runner = runner; + this.metrics = metrics; } @Override protected double maintain() { - executors.execute(() -> jobs.active().forEach(this::advance)); + execute(() -> jobs.active().forEach(this::advance)); jobs.collectGarbage(); return 1.0; } @@ -55,6 +67,7 @@ public class JobRunner extends ControllerMaintainer { @Override public void shutdown() { super.shutdown(); + metrics.shutdown(); executors.shutdown(); } @@ -83,14 +96,14 @@ public class JobRunner extends ControllerMaintainer { jobs.locked(id, run -> { if ( ! run.hasFailed() && controller().clock().instant().isAfter(run.sleepUntil().orElse(run.start()).plus(jobTimeout))) - executors.execute(() -> { + execute(() -> { jobs.abort(run.id(), "job timeout of " + jobTimeout + " reached", false); advance(run.id()); }); else if (run.readySteps().isEmpty()) - executors.execute(() -> finish(run.id())); + execute(() -> finish(run.id())); else if (run.hasFailed() || run.sleepUntil().map(sleepUntil -> ! sleepUntil.isAfter(controller().clock().instant())).orElse(true)) - run.readySteps().forEach(step -> executors.execute(() -> advance(run.id(), step))); + run.readySteps().forEach(step -> execute(() -> advance(run.id(), step))); return null; }); @@ -145,4 +158,39 @@ public class JobRunner extends ControllerMaintainer { } } + private void execute(Runnable task) { + metrics.queued.incrementAndGet(); + executors.execute(() -> { + metrics.queued.decrementAndGet(); + metrics.active.incrementAndGet(); + try { task.run(); } + finally { metrics.active.decrementAndGet(); } + }); + } + + static class Metrics { + + private final AtomicInteger queued = new AtomicInteger(); + private final AtomicInteger active = new AtomicInteger(); + private final ScheduledExecutorService reporter = Executors.newSingleThreadScheduledExecutor(new DaemonThreadFactory("job-runner-metrics-")); + private final Metric metric; + private final Metric.Context context; + + Metrics(Metric metric, Duration interval) { + this.metric = metric; + this.context = metric.createContext(Map.of()); + reporter.scheduleAtFixedRate(this::report, interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); + } + + void report() { + metric.set(ControllerMetrics.DEPLOYMENT_JOBS_QUEUED.baseName(), queued.get(), context); + metric.set(ControllerMetrics.DEPLOYMENT_JOBS_ACTIVE.baseName(), active.get(), context); + } + + void shutdown() { + reporter.shutdown(); + } + + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index 1cc549ec6ca..c16234b3948 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -75,8 +75,7 @@ public class DeploymentTester { tester = controllerTester; jobs = tester.controller().jobController(); cloud = (MockTesterCloud) tester.controller().jobController().cloud(); - runner = new JobRunner(tester.controller(), maintenanceInterval, JobRunnerTest.inThreadExecutor(), - new InternalStepRunner(tester.controller())); + runner = new JobRunner(tester.controller(), maintenanceInterval, JobRunnerTest.inThreadExecutor(), new InternalStepRunner(tester.controller())); upgrader = new Upgrader(tester.controller(), maintenanceInterval); upgrader.setUpgradesPerMinute(1); // Anything that makes it at least one for any maintenance period is fine. readyJobsTrigger = new ReadyJobsTrigger(tester.controller(), maintenanceInterval); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java index e87d4f1f3f0..20717be598f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunnerTest.java @@ -1,8 +1,10 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.maintenance; +import ai.vespa.metrics.ControllerMetrics; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.jdisc.test.MockMetric; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId; @@ -22,6 +24,7 @@ import com.yahoo.vespa.hosted.controller.deployment.StepRunner; import com.yahoo.vespa.hosted.controller.deployment.Submission; import com.yahoo.vespa.hosted.controller.deployment.Versions; import com.yahoo.vespa.hosted.controller.integration.MetricsMock; +import com.yahoo.vespa.hosted.controller.maintenance.JobRunner.Metrics; import org.junit.jupiter.api.Test; import java.time.Duration; @@ -37,7 +40,9 @@ import java.util.concurrent.BrokenBarrierException; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Phaser; +import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; @@ -121,6 +126,51 @@ public class JobRunnerTest { } @Test + void metrics() { + Phaser phaser = new Phaser(4); + StepRunner runner = (step, id) -> { + phaser.arriveAndAwaitAdvance(); + phaser.arriveAndAwaitAdvance(); + return Optional.of(running); + }; + ExecutorService executor = new ThreadPoolExecutor(3, 3, 0, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), (task, pool) -> task.run()); + DeploymentTester tester = new DeploymentTester(); + MockMetric metric = new MockMetric(); + Metrics metrics = new Metrics(metric, Duration.ofDays(1)); + JobRunner jobs = new JobRunner(tester.controller(), Duration.ofDays(1), executor, runner, metrics); + tester.newDeploymentContext().submit(); + + assertEquals(Map.of(), metric.metrics()); + metrics.report(); + assertEquals(Map.of(ControllerMetrics.DEPLOYMENT_JOBS_QUEUED.baseName(), + Map.of(Map.of(), 0.0), + ControllerMetrics.DEPLOYMENT_JOBS_ACTIVE.baseName(), + Map.of(Map.of(), 0.0)), + metric.metrics()); + tester.triggerJobs(); + + assertEquals(2, tester.jobs().active().size()); + jobs.maintain(); + phaser.arriveAndAwaitAdvance(); + metrics.report(); + assertEquals(Map.of(ControllerMetrics.DEPLOYMENT_JOBS_QUEUED.baseName(), + Map.of(Map.of(), 1.0), + ControllerMetrics.DEPLOYMENT_JOBS_ACTIVE.baseName(), + Map.of(Map.of(), 3.0)), + metric.metrics()); + + jobs.shutdown(); + phaser.forceTermination(); + jobs.awaitShutdown(); + metrics.report(); + assertEquals(Map.of(ControllerMetrics.DEPLOYMENT_JOBS_QUEUED.baseName(), + Map.of(Map.of(), 0.0), + ControllerMetrics.DEPLOYMENT_JOBS_ACTIVE.baseName(), + Map.of(Map.of(), 0.0)), + metric.metrics()); + } + + @Test void stepLogic() { DeploymentTester tester = new DeploymentTester(); JobController jobs = tester.controller().jobController(); diff --git a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/AdaptiveLoadBalancer.java b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/AdaptiveLoadBalancer.java index 4a4cf0fd5c8..ae934857e2c 100644 --- a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/AdaptiveLoadBalancer.java +++ b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/AdaptiveLoadBalancer.java @@ -7,7 +7,7 @@ import java.util.List; import java.util.Random; /** - * Will pick 2 random candidates and select the one with least pending operations. + * Will pick 2 random candidates and select the one with the least pending operations. * * @author baldersheim */ diff --git a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/LocalServicePolicy.java b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/LocalServicePolicy.java index ddd04a3ca53..4f8227b35a0 100755 --- a/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/LocalServicePolicy.java +++ b/documentapi/src/main/java/com/yahoo/documentapi/messagebus/protocol/LocalServicePolicy.java @@ -23,7 +23,7 @@ public class LocalServicePolicy implements DocumentProtocolRoutingPolicy { private final Map<String, CacheEntry> cache = new HashMap<>(); /** - * Constructs a policy that will choose local services that match the slobrok pattern in which this policy occured. + * Constructs a policy that will choose local services that match the slobrok pattern in which this policy occurred. * If no local service can be found, this policy simply returns the asterisk to allow the network to choose any. * * @param param The address to use for this, if empty this will resolve to hostname. diff --git a/messagebus/src/main/java/com/yahoo/messagebus/ErrorCode.java b/messagebus/src/main/java/com/yahoo/messagebus/ErrorCode.java index 8fab523d4ae..88e3e1a89bc 100644 --- a/messagebus/src/main/java/com/yahoo/messagebus/ErrorCode.java +++ b/messagebus/src/main/java/com/yahoo/messagebus/ErrorCode.java @@ -20,7 +20,7 @@ public final class ErrorCode { /** No addresses found for the services of the message route. */ public static final int NO_ADDRESS_FOR_SERVICE = TRANSIENT_ERROR + 2; - /** A connection problem occured while sending. */ + /** A connection problem occurred while sending. */ public static final int CONNECTION_ERROR = TRANSIENT_ERROR + 3; /** The session specified for the message is unknown. */ @@ -50,10 +50,10 @@ public final class ErrorCode { /** No services found for the message route. */ public static final int NO_SERVICES_FOR_ROUTE = FATAL_ERROR + 3; - /** An error occured while encoding the message. */ + /** An error occurred while encoding the message. */ public static final int ENCODE_ERROR = FATAL_ERROR + 5; - /** A fatal network error occured while sending. */ + /** A fatal network error occurred while sending. */ public static final int NETWORK_ERROR = FATAL_ERROR + 6; /** The protocol specified for the message is unknown. */ @@ -77,7 +77,7 @@ public final class ErrorCode { /** Exception thrown by routing policy. */ public static final int POLICY_ERROR = FATAL_ERROR + 13; - /** An error occured while sequencing a message. */ + /** An error occurred while sequencing a message. */ public static final int SEQUENCE_ERROR = FATAL_ERROR + 14; /** An application specific non-recoverable error. */ diff --git a/messagebus/src/main/java/com/yahoo/messagebus/routing/RoutingContext.java b/messagebus/src/main/java/com/yahoo/messagebus/routing/RoutingContext.java index 227dd546ad8..18b5de34bb4 100755 --- a/messagebus/src/main/java/com/yahoo/messagebus/routing/RoutingContext.java +++ b/messagebus/src/main/java/com/yahoo/messagebus/routing/RoutingContext.java @@ -19,7 +19,7 @@ public class RoutingContext { private final RoutingNode node; private final int directive; - private final Set<Integer> consumableErrors = new HashSet<Integer>(); + private final Set<Integer> consumableErrors = new HashSet<>(); private boolean selectOnRetry = true; private Object context = null; diff --git a/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java b/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java index 3676be90cd4..f03c54aa822 100644 --- a/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java +++ b/metrics/src/main/java/ai/vespa/metrics/ControllerMetrics.java @@ -9,6 +9,8 @@ public enum ControllerMetrics implements VespaMetrics { ATHENZ_REQUEST_ERROR("athenz.request.error", Unit.REQUEST, "Controller: Athenz request error"), ARCHIVE_BUCKET_COUNT("archive.bucketCount", Unit.BUCKET, "Controller: Archive bucket count"), + DEPLOYMENT_JOBS_QUEUED("deployment.jobsQueued", Unit.TASK, "The number of deployment jobs queued"), + DEPLOYMENT_JOBS_ACTIVE("deployment.jobsActive", Unit.TASK, "The number of deployment jobs active"), DEPLOYMENT_START("deployment.start", Unit.DEPLOYMENT, "The number of started deployment jobs"), DEPLOYMENT_NODE_ALLOCATION_FAILURE("deployment.nodeAllocationFailure", Unit.DEPLOYMENT, "The number of deployments failed due to node allocation failures"), DEPLOYMENT_ENDPOINT_CERTIFICATE_TIMEOUT("deployment.endpointCertificateTimeout", Unit.DEPLOYMENT, "The number of deployments failed due to timeout acquiring endpoint certificate"), diff --git a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java index 6bffddb885a..9443a08e28b 100644 --- a/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java +++ b/metrics/src/main/java/ai/vespa/metrics/set/InfrastructureMetricSet.java @@ -143,6 +143,8 @@ public class InfrastructureMetricSet { addMetric(metrics, ControllerMetrics.ARCHIVE_BUCKET_COUNT.max()); addMetric(metrics, ControllerMetrics.BILLING_TENANTS.max()); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_JOBS_QUEUED, EnumSet.of(count, sum)); + addMetric(metrics, ControllerMetrics.DEPLOYMENT_JOBS_ACTIVE, EnumSet.of(count, sum)); addMetric(metrics, ControllerMetrics.DEPLOYMENT_ABORT.count()); addMetric(metrics, ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION.max()); addMetric(metrics, ControllerMetrics.DEPLOYMENT_CONVERGENCE_FAILURE.count()); |