diff options
author | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2018-06-25 09:27:52 +0200 |
---|---|---|
committer | Jon Marius Venstad <jvenstad@yahoo-inc.com> | 2018-07-02 13:42:47 +0200 |
commit | 61e0edbca9756906f1cae34c66365ce62269bca5 (patch) | |
tree | 0df1de1d421a589d3d3bde8ac18f087b58975bac /controller-server | |
parent | 3dee94396f87eae9ec5c421a407333b01d1c7853 (diff) |
Avoid hiding unexpected lock timeouts
Diffstat (limited to 'controller-server')
6 files changed, 105 insertions, 28 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java new file mode 100644 index 00000000000..f0093d20f56 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java @@ -0,0 +1,5 @@ +package com.yahoo.vespa.hosted.controller.deployment; + +public class JobProfile { + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java new file mode 100644 index 00000000000..cc2e46e5132 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java @@ -0,0 +1,5 @@ +package com.yahoo.vespa.hosted.controller.deployment; + +public class LockedStep { + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java deleted file mode 100644 index 5e1b8d904c6..00000000000 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.yahoo.vespa.hosted.controller.deployment; - -import com.yahoo.config.provision.ApplicationId; -import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; - -import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded; - -/** - * Executor which runs given {@link Step}s, for given {@link ApplicationId} and {@link JobType} combinations. - * - * @author jonmv - */ -public class StepRunner { - - /** Returns the new status of the given step for the implied job run. */ - Step.Status run(Step step, ApplicationId application, JobType jobType) { - switch (step) { - default: return succeeded; - } - } - -} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java index 40563c4cf95..f6ccbf6aa4e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java @@ -12,6 +12,7 @@ import java.time.Duration; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.logging.Level; import java.util.logging.Logger; @@ -50,7 +51,7 @@ public abstract class Maintainer extends AbstractComponent implements Runnable { } } } - catch (UncheckedTimeoutException e) { + catch (TimeoutException e) { // another controller instance is running this job at the moment; ok } catch (Throwable t) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java new file mode 100644 index 00000000000..14acaf97fa8 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java @@ -0,0 +1,76 @@ +package com.yahoo.vespa.hosted.controller.deployment; + +import com.yahoo.config.provision.ApplicationId; +import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; + +import java.time.Instant; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.Map; +import java.util.Optional; +import java.util.stream.IntStream; + +import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.aborted; +import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.failed; +import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded; +import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.unfinished; + +/** + * Advances a given job run by running the appropriate {@link Step}s, based on their current status. + * + * When an attempt is made to advance a given job, a lock for that job (application and type) is + * taken, and released again only when the attempt finishes. Multiple other attempts may be made in + * the meantime, but they should give up unless the lock is promptly acquired. + * + * @author jonmv + */ +public class JobRunner { + + /** + * Attempts to run the given step, and returns the new status. + * + * If the step fails, + */ + RunStatus run(Step step, RunStatus run) { + switch (step) { + default: throw new AssertionError(); + } + } + + private Step.Status deployInitialReal(ApplicationId id, JobType type) { + throw new AssertionError(); + } + + /** + * Attempts to advance the given job run by running the first eligible step, and returns the new status. + * + * Only the first unfinished step is attempted, to split the jobs into the smallest possible chunks, in case + * of sudden shutdown, etc.. + */ + public RunStatus advance(RunStatus run, Instant now) { + // If the run has failed, run any remaining alwaysRun steps, and return. + if (run.status().values().contains(failed)) + return JobProfile.of(run.id().type()).alwaysRun().stream() + .filter(step -> run.status().get(step) == unfinished) + .findFirst() + .map(step -> run(step, run)) + .orElse(run.with(now)); + + // Otherwise, try to run the first unfinished step. + return run.status().entrySet().stream() + .filter(entry -> entry.getValue() == unfinished + && entry.getKey().prerequisites().stream() + .filter(run.status().keySet()::contains) + .map(run.status()::get) + .allMatch(succeeded::equals)) + .findFirst() + .map(entry -> run(entry.getKey(), run)) + .orElse(run.with(now)); + } + + RunStatus forceEnd(RunStatus run) { + // Run each pending alwaysRun step. + throw new AssertionError(); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java index 184ac90691a..09daaef2af6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java @@ -1,6 +1,7 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.persistence; +import com.google.common.util.concurrent.UncheckedTimeoutException; import com.google.inject.Inject; import com.yahoo.component.Version; import com.yahoo.component.Vtag; @@ -31,6 +32,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeoutException; import java.util.function.Function; import java.util.function.Predicate; import java.util.logging.Level; @@ -113,11 +115,8 @@ public class CuratorDb { return lock(lockRoot.append("inactiveJobsLock"), defaultLockTimeout); } - public Lock lockMaintenanceJob(String jobName) { - // Use a short timeout such that if maintenance jobs are started at about the same time on different nodes - // and the maintenance job takes a long time to complete, only one of the nodes will run the job - // in each maintenance interval - return lock(lockRoot.append("maintenanceJobLocks").append(jobName), Duration.ofSeconds(1)); + public Lock lockMaintenanceJob(String jobName) throws TimeoutException { + return tryLock(lockRoot.append("maintenanceJobLocks").append(jobName)); } @SuppressWarnings("unused") // Called by internal code @@ -137,6 +136,19 @@ public class CuratorDb { // -------------- Helpers ------------------------------------------ + /** Try locking with a low timeout, meaning it is OK to fail lock acquisition. + * + * Useful for maintenance jobs, where there is no point in running the jobs back to back. + */ + private Lock tryLock(Path path) throws TimeoutException { + try { + return lock(path, Duration.ofSeconds(1)); + } + catch (UncheckedTimeoutException e) { + throw new TimeoutException(e.getMessage()); + } + } + private <T> Optional<T> read(Path path, Function<byte[], T> mapper) { return curator.getData(path).filter(data -> data.length > 0).map(mapper); } |