summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Marius Venstad <jvenstad@yahoo-inc.com> 2018-06-25 09:27:52 +0200
committer Jon Marius Venstad <jvenstad@yahoo-inc.com> 2018-07-02 13:42:47 +0200
commit 61e0edbca9756906f1cae34c66365ce62269bca5 (patch)
tree 0df1de1d421a589d3d3bde8ac18f087b58975bac
parent 3dee94396f87eae9ec5c421a407333b01d1c7853 (diff)
Avoid hiding unexpected lock timeouts
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java | 5
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java | 5
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java | 22
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java | 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java | 76
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java | 22
6 files changed, 105 insertions, 28 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java
new file mode 100644
index 00000000000..f0093d20f56
--- /dev/null
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobProfile.java
@@ -0,0 +1,5 @@
+package com.yahoo.vespa.hosted.controller.deployment;
+
+public class JobProfile {
+ // NOTE(review): empty in this change, yet JobRunner.advance (same change) calls JobProfile.of(...).alwaysRun() — neither is declared here.
+}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java
new file mode 100644
index 00000000000..cc2e46e5132
--- /dev/null
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/LockedStep.java
@@ -0,0 +1,5 @@
+package com.yahoo.vespa.hosted.controller.deployment;
+
+public class LockedStep {
+ // NOTE(review): empty class — no fields or methods are declared in this change.
+}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java
deleted file mode 100644
index 5e1b8d904c6..00000000000
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/StepRunner.java
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.yahoo.vespa.hosted.controller.deployment;
-
-import com.yahoo.config.provision.ApplicationId;
-import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
-
-import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded;
-
-/**
- * Executor which runs given {@link Step}s, for given {@link ApplicationId} and {@link JobType} combinations.
- *
- * @author jonmv
- */
-public class StepRunner {
-
- /** Returns the new status of the given step for the implied job run. */
- Step.Status run(Step step, ApplicationId application, JobType jobType) {
- switch (step) {
- default: return succeeded;
- }
- }
-
-}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
index 40563c4cf95..f6ccbf6aa4e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
@@ -12,6 +12,7 @@ import java.time.Duration;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -50,7 +51,7 @@ public abstract class Maintainer extends AbstractComponent implements Runnable {
}
}
}
- catch (UncheckedTimeoutException e) {
+ catch (TimeoutException e) {
// another controller instance is running this job at the moment; ok
}
catch (Throwable t) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java
new file mode 100644
index 00000000000..14acaf97fa8
--- /dev/null
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/StepRunner.java
@@ -0,0 +1,76 @@
+package com.yahoo.vespa.hosted.controller.deployment;
+
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
+
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.Map;
+import java.util.Optional;
+import java.util.stream.IntStream;
+
+import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.aborted;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.failed;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.succeeded;
+import static com.yahoo.vespa.hosted.controller.deployment.Step.Status.unfinished;
+
+/**
+ * Advances a given job run by running the appropriate {@link Step}s, based on their current status.
+ *
+ * When an attempt is made to advance a given job, a lock for that job (application and type) is
+ * taken, and released again only when the attempt finishes. Multiple other attempts may be made in
+ * the meantime, but they should give up unless the lock is promptly acquired.
+ *
+ * @author jonmv
+ */
+public class JobRunner {
+ // NOTE(review): this change adds the class to maintenance/StepRunner.java, but it is public, named JobRunner and declared in package ...controller.deployment — confirm the intended file.
+ /**
+ * Attempts to run the given step, and returns the new status.
+ *
+ * NOTE(review): stub — no step is handled yet, so every call reaches the default branch and throws AssertionError.
+ */
+ RunStatus run(Step step, RunStatus run) {
+ switch (step) {
+ default: throw new AssertionError();
+ }
+ }
+
+ private Step.Status deployInitialReal(ApplicationId id, JobType type) {
+ throw new AssertionError(); // stub: always throws
+ }
+
+ /**
+ * Attempts to advance the given job run by running the first eligible step, and returns the new status.
+ *
+ * Only the first unfinished step is attempted, to split the jobs into the smallest possible chunks, in case
+ * of sudden shutdown, etc.
+ */
+ public RunStatus advance(RunStatus run, Instant now) {
+ // If any step has failed, run only the first unfinished alwaysRun step of the job profile (if any), and return.
+ if (run.status().values().contains(failed))
+ return JobProfile.of(run.id().type()).alwaysRun().stream()
+ .filter(step -> run.status().get(step) == unfinished)
+ .findFirst()
+ .map(step -> run(step, run))
+ .orElse(run.with(now)); // no such step: return run.with(now) — presumably only a timestamp update, TODO confirm
+
+ // Otherwise, run the first unfinished step whose prerequisites that are part of this run have all succeeded.
+ return run.status().entrySet().stream()
+ .filter(entry -> entry.getValue() == unfinished
+ && entry.getKey().prerequisites().stream()
+ .filter(run.status().keySet()::contains)
+ .map(run.status()::get)
+ .allMatch(succeeded::equals))
+ .findFirst()
+ .map(entry -> run(entry.getKey(), run))
+ .orElse(run.with(now));
+ }
+
+ RunStatus forceEnd(RunStatus run) {
+ // TODO: run each pending alwaysRun step; until then this method always throws.
+ throw new AssertionError();
+ }
+
+}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
index 184ac90691a..09daaef2af6 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/CuratorDb.java
@@ -1,6 +1,7 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.persistence;
+import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.google.inject.Inject;
import com.yahoo.component.Version;
import com.yahoo.component.Vtag;
@@ -31,6 +32,7 @@ import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeoutException;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.logging.Level;
@@ -113,11 +115,8 @@ public class CuratorDb {
return lock(lockRoot.append("inactiveJobsLock"), defaultLockTimeout);
}
- public Lock lockMaintenanceJob(String jobName) {
- // Use a short timeout such that if maintenance jobs are started at about the same time on different nodes
- // and the maintenance job takes a long time to complete, only one of the nodes will run the job
- // in each maintenance interval
- return lock(lockRoot.append("maintenanceJobLocks").append(jobName), Duration.ofSeconds(1));
+ public Lock lockMaintenanceJob(String jobName) throws TimeoutException {
+ return tryLock(lockRoot.append("maintenanceJobLocks").append(jobName)); // gives up with a TimeoutException after tryLock's one-second timeout
}
@SuppressWarnings("unused") // Called by internal code
@@ -137,6 +136,19 @@ public class CuratorDb {
// -------------- Helpers ------------------------------------------
+    /** Tries to lock the given path with a low (one second) timeout, meaning it is OK to fail lock acquisition.
+     * Useful for maintenance jobs, where there is no point in running the jobs back to back.
+     * @throws TimeoutException if lock(...) throws UncheckedTimeoutException; that exception is kept as the cause
+     */
+    private Lock tryLock(Path path) throws TimeoutException {
+        try {
+            return lock(path, Duration.ofSeconds(1));
+        }
+        catch (UncheckedTimeoutException e) {
+            throw (TimeoutException) new TimeoutException(e.getMessage()).initCause(e); // no cause constructor; chain it
+        }
+    }
+
private <T> Optional<T> read(Path path, Function<byte[], T> mapper) {
return curator.getData(path).filter(data -> data.length > 0).map(mapper);
}