summaryrefslogtreecommitdiffstats
path: root/controller-server/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java115
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java (renamed from controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java)6
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java12
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java24
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java35
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java3
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java2
7 files changed, 169 insertions, 28 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index 97ac317d15b..1faaa15f054 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -145,24 +145,15 @@ public class DeploymentTrigger {
List<JobType> jobs = order.jobsFrom(application.deploymentSpec());
// Should the first step be triggered?
- if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) {
- JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest);
- if (application.deploying().get() instanceof Change.VersionChange) {
- Version target = ((Change.VersionChange) application.deploying().get()).version();
- if (systemTestStatus == null
- || ! systemTestStatus.lastTriggered().isPresent()
- || ! systemTestStatus.isSuccess()
- || ! systemTestStatus.lastTriggered().get().version().equals(target)) {
- application = trigger(JobType.systemTest, application, false, "Upgrade to " + target);
- controller.applications().store(application);
- }
- }
- else {
- JobStatus componentStatus = application.deploymentJobs().jobStatus().get(JobType.component);
- if (changesAvailable(application, componentStatus, systemTestStatus)) {
- application = trigger(JobType.systemTest, application, false, "Available change in component");
- controller.applications().store(application);
- }
+ // TODO: How can the first job not be systemTest (second condition)?
+ if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) &&
+ application.deploying().get() instanceof Change.VersionChange) {
+ Version target = ((Change.VersionChange)application.deploying().get()).version();
+ JobStatus jobStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest);
+ if (jobStatus == null || ! jobStatus.lastTriggered().isPresent()
+ || ! jobStatus.lastTriggered().get().version().equals(target)) {
+ application = trigger(JobType.systemTest, application, false, "Upgrade to " + target);
+ controller.applications().store(application);
}
}
@@ -216,6 +207,62 @@ public class DeploymentTrigger {
}
/**
+ * Called periodically to retry failing and dead jobs of the given application in the background
+ */
+ public void triggerFailing(ApplicationId applicationId) {
+ try (Lock lock = applications().lock(applicationId)) {
+ LockedApplication application = applications().require(applicationId, lock);
+ if ( ! application.deploying().isPresent()) return; // No ongoing change, no need to retry
+
+ // Retry first failing job
+ for (JobType jobType : order.jobsFrom(application.deploymentSpec())) {
+ JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
+ if (isFailing(application.deploying().get(), jobStatus)) {
+ if (shouldRetryNow(jobStatus)) {
+ application = trigger(jobType, application, false, "Retrying failing job");
+ applications().store(application);
+ }
+ break;
+ }
+ }
+
+ // Retry dead job
+ Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs());
+ if (firstDeadJob.isPresent()) {
+ application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job");
+ applications().store(application);
+ }
+ }
+ }
+
+ /** Triggers jobs that have been delayed according to deployment spec */
+ public void triggerDelayed() {
+ for (Application application : applications().asList()) {
+ if ( ! application.deploying().isPresent() ) continue;
+ if (application.deploymentJobs().hasFailures()) continue;
+ if (application.deploymentJobs().isRunning(controller.applications().deploymentTrigger().jobTimeoutLimit())) continue;
+ if (application.deploymentSpec().steps().stream().noneMatch(step -> step instanceof DeploymentSpec.Delay)) {
+ continue; // Application does not have any delayed deployments
+ }
+
+ Optional<JobStatus> lastSuccessfulJob = application.deploymentJobs().jobStatus().values()
+ .stream()
+ .filter(j -> j.lastSuccess().isPresent())
+ .sorted(Comparator.<JobStatus, Instant>comparing(j -> j.lastSuccess().get().at()).reversed())
+ .findFirst();
+ if ( ! lastSuccessfulJob.isPresent() ) continue;
+
+ // Trigger next
+ try (Lock lock = applications().lock(application.id())) {
+ LockedApplication lockedApplication = applications().require(application.id(), lock);
+ lockedApplication = trigger(order.nextAfter(lastSuccessfulJob.get().type(), lockedApplication),
+ lockedApplication, "Resuming delayed deployment");
+ applications().store(lockedApplication);
+ }
+ }
+ }
+
+ /**
* Triggers a change of this application
*
* @param applicationId the application to trigger
@@ -254,10 +301,42 @@ public class DeploymentTrigger {
private ApplicationController applications() { return controller.applications(); }
+ /** Returns whether the job with the given status is failing for the given change */
+ private boolean isFailing(Change change, JobStatus status) {
+ return status != null
+ && ! status.isSuccess()
+ && status.lastCompleted().isPresent()
+ && status.lastCompleted().get().lastCompletedWas(change);
+ }
+
private boolean isCapacityConstrained(JobType jobType) {
return jobType == JobType.stagingTest || jobType == JobType.systemTest;
}
+ /** Returns the oldest running job, if it was last triggered before the job timeout limit */
+ private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs) {
+ Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream()
+ .filter(job -> job.isRunning(Instant.ofEpochMilli(0)))
+ .sorted(Comparator.comparing(status -> status.lastTriggered().get().at()))
+ .findFirst();
+ return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(jobTimeoutLimit()));
+ }
+
+ /** Decide whether the job should be triggered by the periodic trigger */
+ private boolean shouldRetryNow(JobStatus job) {
+ if (job.isSuccess()) return false;
+ if (job.isRunning(jobTimeoutLimit())) return false;
+
+ // Retry after 10% of the time since it started failing
+ Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10);
+ if (job.lastCompleted().get().at().isBefore(clock.instant().minus(aTenthOfFailTime))) return true;
+
+ // ... or retry anyway if we haven't tried in 4 hours
+ if (job.lastCompleted().get().at().isBefore(clock.instant().minus(Duration.ofHours(4)))) return true;
+
+ return false;
+ }
+
/** Retry immediately only if this job just started failing. Otherwise retry periodically */
private boolean shouldRetryNow(Application application, JobType jobType) {
JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType);
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java
index f165b4e4ea3..4a68fd6cfab 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java
@@ -14,14 +14,14 @@ import java.time.Duration;
* @author bratseth
*/
@SuppressWarnings("unused")
-public class ReadyJobsTrigger extends Maintainer {
+public class BlockedChangeDeployer extends Maintainer {
- public ReadyJobsTrigger(Controller controller, Duration interval, JobControl jobControl) {
+ public BlockedChangeDeployer(Controller controller, Duration interval, JobControl jobControl) {
super(controller, interval, jobControl);
}
@Override
- public void maintain() {
+ protected void maintain() {
controller().applications().deploymentTrigger().triggerReadyJobs();
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
index 01edc269116..2fdce2802ab 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java
@@ -25,10 +25,12 @@ public class ControllerMaintenance extends AbstractComponent {
private final DeploymentExpirer deploymentExpirer;
private final DeploymentIssueReporter deploymentIssueReporter;
private final MetricsReporter metricsReporter;
+ private final FailureRedeployer failureRedeployer;
private final OutstandingChangeDeployer outstandingChangeDeployer;
private final VersionStatusUpdater versionStatusUpdater;
private final Upgrader upgrader;
- private final ReadyJobsTrigger readyJobsTrigger;
+ private final DelayedDeployer delayedDeployer;
+ private final BlockedChangeDeployer blockedChangeDeployer;
private final ClusterInfoMaintainer clusterInfoMaintainer;
private final ClusterUtilizationMaintainer clusterUtilizationMaintainer;
private final DeploymentMetricsMaintainer deploymentMetricsMaintainer;
@@ -42,10 +44,12 @@ public class ControllerMaintenance extends AbstractComponent {
deploymentExpirer = new DeploymentExpirer(controller, maintenanceInterval, jobControl);
deploymentIssueReporter = new DeploymentIssueReporter(controller, deploymentIssues, maintenanceInterval, jobControl);
metricsReporter = new MetricsReporter(controller, metric, chefClient, jobControl, controller.system());
+ failureRedeployer = new FailureRedeployer(controller, maintenanceInterval, jobControl);
outstandingChangeDeployer = new OutstandingChangeDeployer(controller, maintenanceInterval, jobControl);
versionStatusUpdater = new VersionStatusUpdater(controller, Duration.ofMinutes(3), jobControl);
upgrader = new Upgrader(controller, maintenanceInterval, jobControl, curator);
- readyJobsTrigger = new ReadyJobsTrigger(controller, maintenanceInterval, jobControl);
+ delayedDeployer = new DelayedDeployer(controller, maintenanceInterval, jobControl);
+ blockedChangeDeployer = new BlockedChangeDeployer(controller, maintenanceInterval, jobControl);
clusterInfoMaintainer = new ClusterInfoMaintainer(controller, Duration.ofHours(2), jobControl);
clusterUtilizationMaintainer = new ClusterUtilizationMaintainer(controller, Duration.ofHours(2), jobControl);
deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(controller, Duration.ofMinutes(10), jobControl);
@@ -61,10 +65,12 @@ public class ControllerMaintenance extends AbstractComponent {
deploymentExpirer.deconstruct();
deploymentIssueReporter.deconstruct();
metricsReporter.deconstruct();
+ failureRedeployer.deconstruct();
outstandingChangeDeployer.deconstruct();
versionStatusUpdater.deconstruct();
upgrader.deconstruct();
- readyJobsTrigger.deconstruct();
+ delayedDeployer.deconstruct();
+ blockedChangeDeployer.deconstruct();
clusterUtilizationMaintainer.deconstruct();
clusterInfoMaintainer.deconstruct();
deploymentMetricsMaintainer.deconstruct();
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java
new file mode 100644
index 00000000000..cb09c41a034
--- /dev/null
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java
@@ -0,0 +1,24 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.controller.maintenance;
+
+import com.yahoo.vespa.hosted.controller.Controller;
+
+import java.time.Duration;
+
+/**
+ * Maintenance job which triggers jobs that have been delayed according to the application's deployment spec.
+ *
+ * @author mpolden
+ */
+public class DelayedDeployer extends Maintainer {
+
+ public DelayedDeployer(Controller controller, Duration interval, JobControl jobControl) {
+ super(controller, interval, jobControl);
+ }
+
+ @Override
+ protected void maintain() {
+ controller().applications().deploymentTrigger().triggerDelayed();
+ }
+
+}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
new file mode 100644
index 00000000000..72f8faa5180
--- /dev/null
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java
@@ -0,0 +1,35 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.controller.maintenance;
+
+import com.yahoo.vespa.hosted.controller.Application;
+import com.yahoo.vespa.hosted.controller.Controller;
+import com.yahoo.vespa.hosted.controller.application.ApplicationList;
+
+import java.time.Duration;
+import java.util.List;
+
+/**
+ * Attempts redeployment of failed jobs and deployments.
+ *
+ * @author bratseth
+ * @author mpolden
+ */
+public class FailureRedeployer extends Maintainer {
+
+ public FailureRedeployer(Controller controller, Duration interval, JobControl jobControl) {
+ super(controller, interval, jobControl);
+ }
+
+ @Override
+ public void maintain() {
+ List<Application> applications = ApplicationList.from(controller().applications().asList())
+ .notPullRequest()
+ .asList();
+ applications.forEach(application -> triggerFailing(application));
+ }
+
+ private void triggerFailing(Application application) {
+ controller().applications().deploymentTrigger().triggerFailing(application.id());
+ }
+
+}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java
index 6aa1b89c605..d7396cb2acb 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java
@@ -5,7 +5,6 @@ import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
import java.util.HashSet;
-import java.util.LinkedHashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.logging.Logger;
@@ -41,7 +40,7 @@ public class JobControl {
* Returns a snapshot of the set of jobs started on this system (whether deactivated or not).
* Each job is represented by its simple (omitting package) class name.
*/
- public Set<String> jobs() { return new LinkedHashSet<>(startedJobs); }
+ public Set<String> jobs() { return new HashSet<>(startedJobs); }
/** Returns an unmodifiable set containing the currently inactive jobs in this */
public Set<String> inactiveJobs() { return curator.readInactiveJobs(); }
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
index 1d19d8ca522..bbef7980273 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.controller.maintenance;
import com.google.common.util.concurrent.UncheckedTimeoutException;
import com.yahoo.component.AbstractComponent;
-import com.yahoo.component.ComponentId;
import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
@@ -30,7 +29,6 @@ public abstract class Maintainer extends AbstractComponent implements Runnable {
private final ScheduledExecutorService service;
public Maintainer(Controller controller, Duration interval, JobControl jobControl) {
- initId(new ComponentId(name()));
this.controller = controller;
this.maintenanceInterval = interval;
this.jobControl = jobControl;