diff options
author | Martin Polden <mpolden@mpolden.no> | 2021-01-07 13:54:05 +0100 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2021-01-07 13:54:05 +0100 |
commit | 672d9a71ebac45a886c9ee45625890f18af8aa4f (patch) | |
tree | 1671b6dbe9554a9b743400476650bacd1ff60790 /vespajlib | |
parent | 5c1096afe9f2db748f6e8d3c559021101fd8ae6c (diff) |
Count lock timeout as unsuccessful run for exclusive maintainers
Diffstat (limited to 'vespajlib')
5 files changed, 60 insertions, 34 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java index 483057a828d..d4d60723cbe 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/JobMetrics.java @@ -1,7 +1,6 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; -import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.function.BiConsumer; @@ -14,7 +13,7 @@ public class JobMetrics { private final BiConsumer<String, Long> metricConsumer; - private final Map<String, Long> incompleteRuns = new ConcurrentHashMap<>(); + private final ConcurrentHashMap<String, Long> incompleteRuns = new ConcurrentHashMap<>(); public JobMetrics(BiConsumer<String, Long> metricConsumer) { this.metricConsumer = metricConsumer; diff --git a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java index 9fb5172ab0a..bd0e618ba05 100644 --- a/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java +++ b/vespajlib/src/main/java/com/yahoo/concurrent/maintenance/Maintainer.java @@ -27,18 +27,20 @@ public abstract class Maintainer implements Runnable { protected final Logger log = Logger.getLogger(this.getClass().getName()); private final String name; + private final Mode mode; private final JobControl jobControl; private final JobMetrics jobMetrics; private final Duration interval; private final ScheduledExecutorService service; private final AtomicBoolean shutDown = new AtomicBoolean(); - public Maintainer(String name, Duration interval, Instant startedAt, JobControl jobControl, JobMetrics jobMetrics, List<String> clusterHostnames) { - this(name, interval, staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames), jobControl, jobMetrics); + public Maintainer(String name, Mode mode, Duration interval, Instant startedAt, JobControl jobControl, JobMetrics jobMetrics, List<String> clusterHostnames) { + this(name, mode, interval, staggeredDelay(interval, startedAt, HostName.getLocalhost(), clusterHostnames), jobControl, jobMetrics); } - public Maintainer(String name, Duration interval, Duration initialDelay, JobControl jobControl, JobMetrics jobMetrics) { + public Maintainer(String name, Mode mode, Duration interval, Duration initialDelay, JobControl jobControl, JobMetrics jobMetrics) { this.name = name; + this.mode = Objects.requireNonNull(mode); this.interval = requireInterval(interval); this.jobControl = Objects.requireNonNull(jobControl); this.jobMetrics = Objects.requireNonNull(jobMetrics); @@ -49,13 +51,10 @@ public abstract class Maintainer implements Runnable { @Override public void run() { - log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName()); try { if (jobControl.isActive(name())) { lockAndMaintain(); } - } catch (UncheckedTimeoutException ignored) { - // Another actor is running this job } catch (Throwable e) { log.log(Level.WARNING, this + " failed. Will retry in " + interval.toMinutes() + " minutes", e); } @@ -94,14 +93,17 @@ public abstract class Maintainer implements Runnable { /** Run this while holding the job lock */ @SuppressWarnings("unused") public final void lockAndMaintain() { + log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName()); + jobMetrics.recordRunOf(name()); try (var lock = jobControl.lockJob(name())) { - try { - jobMetrics.recordRunOf(name()); - if (maintain()) jobMetrics.recordSuccessOf(name()); - } finally { - // Always forward metrics - jobMetrics.forward(name()); + if (maintain()) jobMetrics.recordSuccessOf(name()); + } catch (UncheckedTimeoutException ignored) { + if (mode == Mode.shared) { + // This is fine as we're colliding with a run on another node + jobMetrics.recordSuccessOf(name()); } + } finally { + jobMetrics.forward(name()); } } @@ -127,4 +129,14 @@ public abstract class Maintainer implements Runnable { return interval; } + public enum Mode { + + /** Completing a scheduled run on any node is sufficient */ + shared, + + /** Completing a scheduled run is always required */ + exclusive, + + } + } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java index a0ca9b529c5..61413c2973c 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/JobControlTest.java @@ -19,8 +19,9 @@ public class JobControlTest { String job1 = "Job1"; String job2 = "Job2"; - TestMaintainer maintainer1 = new TestMaintainer(job1, jobControl); - TestMaintainer maintainer2 = new TestMaintainer(job2, jobControl); + JobMetrics metrics = new JobMetrics((job, instant) -> {}); + TestMaintainer maintainer1 = new TestMaintainer(job1, Maintainer.Mode.shared, jobControl, metrics); + TestMaintainer maintainer2 = new TestMaintainer(job2, Maintainer.Mode.shared, jobControl, metrics); assertEquals(2, jobControl.jobs().size()); assertTrue(jobControl.jobs().contains(job1)); assertTrue(jobControl.jobs().contains(job2)); @@ -61,7 +62,7 @@ public class JobControlTest { public void testJobControlMayDeactivateJobs() { JobControlStateMock state = new JobControlStateMock(); JobControl jobControl = new JobControl(state); - TestMaintainer mockMaintainer = new TestMaintainer(null, jobControl); + TestMaintainer mockMaintainer = new TestMaintainer(null, Maintainer.Mode.shared, jobControl, new JobMetrics((job, instant) -> {})); assertTrue(jobControl.jobs().contains("TestMaintainer")); diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java index 2bfaad894a5..4b726eb6637 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/MaintainerTest.java @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.concurrent.maintenance; +import com.google.common.util.concurrent.UncheckedTimeoutException; import org.junit.Test; import java.time.Duration; @@ -15,6 +16,8 @@ import static org.junit.Assert.assertEquals; */ public class MaintainerTest { + private final JobControl jobControl = new JobControl(new JobControlStateMock()); + @Test public void staggering() { List<String> cluster = List.of("cfg1", "cfg2", "cfg3"); @@ -41,7 +44,7 @@ public class MaintainerTest { public void success_metric() { AtomicLong consecutiveFailures = new AtomicLong(); JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count)); - TestMaintainer maintainer = new TestMaintainer(jobMetrics); + TestMaintainer maintainer = new TestMaintainer(null, Maintainer.Mode.shared, jobControl, jobMetrics); // Maintainer fails twice in a row maintainer.successOnNextRun(false).run(); @@ -58,11 +61,30 @@ public class MaintainerTest { assertEquals(0, consecutiveFailures.get()); // Maintainer throws - maintainer.throwOnNextRun(true).run(); + maintainer.throwOnNextRun(new RuntimeException()).run(); + assertEquals(1, consecutiveFailures.get()); + + // Maintainer recovers + maintainer.throwOnNextRun(null).run(); + assertEquals(0, consecutiveFailures.get()); + + // Lock exception is considered successful for shared maintainer + maintainer.throwOnNextRun(new UncheckedTimeoutException()).run(); + assertEquals(0, consecutiveFailures.get()); + } + + @Test + public void success_metric_exclusive_maintainer() { + AtomicLong consecutiveFailures = new AtomicLong(); + JobMetrics jobMetrics = new JobMetrics((job, count) -> consecutiveFailures.set(count)); + TestMaintainer maintainer = new TestMaintainer(null, Maintainer.Mode.exclusive, jobControl, jobMetrics); + + // Timeout is considered a failure + maintainer.throwOnNextRun(new UncheckedTimeoutException()).run(); assertEquals(1, consecutiveFailures.get()); // Maintainer recovers - maintainer.throwOnNextRun(false).run(); + maintainer.throwOnNextRun(null).run(); assertEquals(0, consecutiveFailures.get()); } diff --git a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java index 5eae643fe40..dcf45ee9da0 100644 --- a/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java +++ b/vespajlib/src/test/java/com/yahoo/concurrent/maintenance/TestMaintainer.java @@ -10,18 +10,10 @@ class TestMaintainer extends Maintainer { private int totalRuns = 0; private boolean success = true; - private boolean throwing = false; + private RuntimeException exceptionToThrow = null; - public TestMaintainer(String name, JobControl jobControl, JobMetrics jobMetrics) { - super(name, Duration.ofDays(1), Duration.ofDays(1), jobControl, jobMetrics); - } - - public TestMaintainer(JobMetrics jobMetrics) { - this(null, new JobControl(new JobControlStateMock()), jobMetrics); - } - - public TestMaintainer(String name, JobControl jobControl) { - this(name, jobControl, new JobMetrics((job, instant) -> {})); + public TestMaintainer(String name, Mode mode, JobControl jobControl, JobMetrics jobMetrics) { + super(name, mode, Duration.ofDays(1), Duration.ofDays(1), jobControl, jobMetrics); } public int totalRuns() { @@ -33,14 +25,14 @@ class TestMaintainer extends Maintainer { return this; } - public TestMaintainer throwOnNextRun(boolean throwing) { - this.throwing = throwing; + public TestMaintainer throwOnNextRun(RuntimeException e) { + this.exceptionToThrow = e; return this; } @Override protected boolean maintain() { - if (throwing) throw new RuntimeException("Maintenance run failed"); + if (exceptionToThrow != null) throw exceptionToThrow; totalRuns++; return success; } |