diff options
7 files changed, 50 insertions, 41 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java index fdbf199898a..84dd2c6a8c3 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintainer.java @@ -25,8 +25,8 @@ public abstract class NodeRepositoryMaintainer extends Maintainer { private final NodeRepository nodeRepository; public NodeRepositoryMaintainer(NodeRepository nodeRepository, Duration interval, Metric metric) { - super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), - jobMetrics(metric), nodeRepository.database().cluster()); + super(null, interval, nodeRepository.clock().instant(), nodeRepository.jobControl(), jobMetrics(metric), + nodeRepository.database().cluster()); this.nodeRepository = nodeRepository; } diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobControl.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobControl.java index 2a682bcb4db..583337203ab 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobControl.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobControl.java @@ -51,7 +51,7 @@ public class JobControl { public void run(String jobSimpleClassName) { var job = startedJobs.get(jobSimpleClassName); if (job == null) throw new IllegalArgumentException("No such job '" + jobSimpleClassName + "'"); - job.lockAndMaintain(true); + job.lockAndMaintain(); } /** Acquire lock for running given job */ diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java index d4d60723cbe..483057a828d 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.function.BiConsumer; @@ -13,7 +14,7 @@ public class JobMetrics { private final BiConsumer<String, Long> metricConsumer; - private final ConcurrentHashMap<String, Long> incompleteRuns = new ConcurrentHashMap<>(); + private final Map<String, Long> incompleteRuns = new ConcurrentHashMap<>(); public JobMetrics(BiConsumer<String, Long> metricConsumer) { this.metricConsumer = metricConsumer; diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index daad1f8fb4b..9fb5172ab0a 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; +import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.net.HostName; import java.time.Duration; @@ -32,15 +33,15 @@ public abstract class Maintainer implements Runnable { private final ScheduledExecutorService service; private final AtomicBoolean shutDown = new AtomicBoolean(); - public Maintainer(String name, Duration interval, Instant startedAt, JobControl jobControl, - JobMetrics jobMetrics, List<String> clusterHostnames) { + public Maintainer(String name, Duration interval, Instant startedAt, JobControl jobControl, JobMetrics jobMetrics, List<String> clusterHostnames) { + this(name, interval, staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames), jobControl, jobMetrics); + } + + public Maintainer(String name, Duration interval, Duration initialDelay, JobControl jobControl, JobMetrics jobMetrics) { this.name = name; this.interval = requireInterval(interval); this.jobControl = Objects.requireNonNull(jobControl); this.jobMetrics = Objects.requireNonNull(jobMetrics); - Objects.requireNonNull(startedAt); - Objects.requireNonNull(clusterHostnames); - Duration initialDelay = staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames); service = new ScheduledThreadPoolExecutor(1, r -> new Thread(r, name() + "-worker")); service.scheduleAtFixedRate(this, initialDelay.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS); jobControl.started(name(), this); @@ -48,7 +49,17 @@ public abstract class Maintainer implements Runnable { @Override public void run() { - lockAndMaintain(false); + log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName()); + try { + if (jobControl.isActive(name())) { + lockAndMaintain(); + } + } catch (UncheckedTimeoutException ignored) { + // Another actor is running this job + } catch (Throwable e) { + log.log(Level.WARNING, this + " failed. Will retry in " + interval.toMinutes() + " minutes", e); + } + log.log(Level.FINE, () -> "Finished " + this.getClass().getSimpleName()); } /** Starts shutdown of this, typically by shutting down executors. {@link #awaitShutdown()} waits for shutdown to complete. */ @@ -81,18 +92,17 @@ public abstract class Maintainer implements Runnable { protected Duration interval() { return interval; } /** Run this while holding the job lock */ - public final void lockAndMaintain(boolean force) { - if (!force && !jobControl.isActive(name())) return; - log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName()); - jobMetrics.recordRunOf(name()); + @SuppressWarnings("unused") + public final void lockAndMaintain() { try (var lock = jobControl.lockJob(name())) { - if (maintain()) jobMetrics.recordSuccessOf(name()); - } catch (Throwable e) { - log.log(Level.WARNING, this + " failed. Will retry in " + interval.toMinutes() + " minutes", e); - } finally { - jobMetrics.forward(name()); + try { + jobMetrics.recordRunOf(name()); + if (maintain()) jobMetrics.recordSuccessOf(name()); + } finally { + // Always forward metrics + jobMetrics.forward(name()); + } } - log.log(Level.FINE, () -> "Finished " + this.getClass().getSimpleName()); } /** Returns the simple name of this job */ diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java index 139a2901cd3..a0ca9b529c5 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java @@ -19,9 +19,8 @@ public class JobControlTest { String job1 = "Job1"; String job2 = "Job2"; - JobMetrics metrics = new JobMetrics((job, instant) -> {}); - TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl, metrics); - TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl, metrics); + TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl); + TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl); assertEquals(2, jobControl.jobs().size()); assertTrue(jobControl.jobs().contains(job1)); assertTrue(jobControl.jobs().contains(job2)); @@ -62,7 +61,7 @@ public class JobControlTest { public void testJobControlMayDeactivateJobs() { JobControlStateMock state = new JobControlStateMock(); JobControl jobControl = new JobControl(state); - TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl, new JobMetrics((job, instant) -> {})); + TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl); assertTrue(jobControl.jobs().contains("TestMaintainer")); diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java index e881d4b3ff6..2bfaad894a5 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java @@ -1,7 +1,6 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import com.google.common.util.concurrent.UncheckedTimeoutException; import org.junit.Test; import java.time.Duration; @@ -16,8 +15,6 @@ import static org.junit.Assert.assertEquals; */ public class MaintainerTest { - private final JobControl jobControl = new JobControl(new JobControlStateMock()); - @Test public void staggering() { List<String> cluster = List.of("cfg1", "cfg2", "cfg3"); @@ -44,7 +41,7 @@ public class MaintainerTest { public void success_metric() { AtomicLong consecutiveFailures = new AtomicLong(); JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count)); - TestMaintainer maintainer = new TestMaintainer(null, jobControl, jobMetrics); + TestMaintainer maintainer = new TestMaintainer(jobMetrics); // Maintainer fails twice in a row maintainer.successOnNextRun(false).run(); @@ -61,16 +58,12 @@ public class MaintainerTest { assertEquals(0, consecutiveFailures.get()); // Maintainer throws - maintainer.throwOnNextRun(new RuntimeException()).run(); + maintainer.throwOnNextRun(true).run(); assertEquals(1, consecutiveFailures.get()); // Maintainer recovers - maintainer.throwOnNextRun(null).run(); + maintainer.throwOnNextRun(false).run(); assertEquals(0, consecutiveFailures.get()); - - // Lock exception is treated as a failure - maintainer.throwOnNextRun(new UncheckedTimeoutException()).run(); - assertEquals(1, consecutiveFailures.get()); } } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java index ea32af60208..5eae643fe40 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java @@ -2,8 +2,6 @@ package com.yahoo.concurrent.maintenance; import java.time.Duration; -import java.time.Instant; -import java.util.List; /** * @author mpolden @@ -12,10 +10,18 @@ class TestMaintainer extends Maintainer { private int totalRuns = 0; private boolean success = true; - private RuntimeException exceptionToThrow = null; + private boolean throwing = false; public TestMaintainer(String name, JobControl jobControl, JobMetrics jobMetrics) { - super(name, Duration.ofDays(1), Instant.now(), jobControl, jobMetrics, List.of()); + super(name, Duration.ofDays(1), Duration.ofDays(1), jobControl, jobMetrics); + } + + public TestMaintainer(JobMetrics jobMetrics) { + this(null, new JobControl(new JobControlStateMock()), jobMetrics); + } + + public TestMaintainer(String name, JobControl jobControl) { + this(name, jobControl, new JobMetrics((job, instant) -> {})); } public int totalRuns() { @@ -27,14 +33,14 @@ class TestMaintainer extends Maintainer { return this; } - public TestMaintainer throwOnNextRun(RuntimeException e) { - this.exceptionToThrow = e; + public TestMaintainer throwOnNextRun(boolean throwing) { + this.throwing = throwing; return this; } @Override protected boolean maintain() { - if (exceptionToThrow != null) throw exceptionToThrow; + if (throwing) throw new RuntimeException("Maintenance run failed"); totalRuns++; return success; } |