diff options
Diffstat (limited to 'controller-server/src/main/java/com/yahoo')
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java | 115 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java (renamed from controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java) | 6 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java | 12 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java | 24 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java | 35 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java | 3 | ||||
-rw-r--r-- | controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java | 2 |
7 files changed, 169 insertions, 28 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 97ac317d15b..1faaa15f054 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -145,24 +145,15 @@ public class DeploymentTrigger { List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); // Should the first step be triggered? - if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { - JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); - if (application.deploying().get() instanceof Change.VersionChange) { - Version target = ((Change.VersionChange) application.deploying().get()).version(); - if (systemTestStatus == null - || ! systemTestStatus.lastTriggered().isPresent() - || ! systemTestStatus.isSuccess() - || ! systemTestStatus.lastTriggered().get().version().equals(target)) { - application = trigger(JobType.systemTest, application, false, "Upgrade to " + target); - controller.applications().store(application); - } - } - else { - JobStatus componentStatus = application.deploymentJobs().jobStatus().get(JobType.component); - if (changesAvailable(application, componentStatus, systemTestStatus)) { - application = trigger(JobType.systemTest, application, false, "Available change in component"); - controller.applications().store(application); - } + // TODO: How can the first job not be systemTest (second ccondition)? + if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) && + application.deploying().get() instanceof Change.VersionChange) { + Version target = ((Change.VersionChange)application.deploying().get()).version(); + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); + if (jobStatus == null || ! jobStatus.lastTriggered().isPresent() + || ! jobStatus.lastTriggered().get().version().equals(target)) { + application = trigger(JobType.systemTest, application, false, "Upgrade to " + target); + controller.applications().store(application); } } @@ -216,6 +207,62 @@ public class DeploymentTrigger { } /** + * Called periodically to cause triggering of jobs in the background + */ + public void triggerFailing(ApplicationId applicationId) { + try (Lock lock = applications().lock(applicationId)) { + LockedApplication application = applications().require(applicationId, lock); + if ( ! application.deploying().isPresent()) return; // No ongoing change, no need to retry + + // Retry first failing job + for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { + JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); + if (isFailing(application.deploying().get(), jobStatus)) { + if (shouldRetryNow(jobStatus)) { + application = trigger(jobType, application, false, "Retrying failing job"); + applications().store(application); + } + break; + } + } + + // Retry dead job + Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs()); + if (firstDeadJob.isPresent()) { + application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job"); + applications().store(application); + } + } + } + + /** Triggers jobs that have been delayed according to deployment spec */ + public void triggerDelayed() { + for (Application application : applications().asList()) { + if ( ! application.deploying().isPresent() ) continue; + if (application.deploymentJobs().hasFailures()) continue; + if (application.deploymentJobs().isRunning(controller.applications().deploymentTrigger().jobTimeoutLimit())) continue; + if (application.deploymentSpec().steps().stream().noneMatch(step -> step instanceof DeploymentSpec.Delay)) { + continue; // Application does not have any delayed deployments + } + + Optional<JobStatus> lastSuccessfulJob = application.deploymentJobs().jobStatus().values() + .stream() + .filter(j -> j.lastSuccess().isPresent()) + .sorted(Comparator.<JobStatus, Instant>comparing(j -> j.lastSuccess().get().at()).reversed()) + .findFirst(); + if ( ! lastSuccessfulJob.isPresent() ) continue; + + // Trigger next + try (Lock lock = applications().lock(application.id())) { + LockedApplication lockedApplication = applications().require(application.id(), lock); + lockedApplication = trigger(order.nextAfter(lastSuccessfulJob.get().type(), lockedApplication), + lockedApplication, "Resuming delayed deployment"); + applications().store(lockedApplication); + } + } + } + + /** * Triggers a change of this application * * @param applicationId the application to trigger @@ -254,10 +301,42 @@ public class DeploymentTrigger { private ApplicationController applications() { return controller.applications(); } + /** Returns whether a job is failing for the current change in the given application */ + private boolean isFailing(Change change, JobStatus status) { + return status != null + && ! status.isSuccess() + && status.lastCompleted().isPresent() + && status.lastCompleted().get().lastCompletedWas(change); + } + private boolean isCapacityConstrained(JobType jobType) { return jobType == JobType.stagingTest || jobType == JobType.systemTest; } + /** Returns the first job that has been running for more than the given timeout */ + private Optional<JobStatus> firstDeadJob(DeploymentJobs jobs) { + Optional<JobStatus> oldestRunningJob = jobs.jobStatus().values().stream() + .filter(job -> job.isRunning(Instant.ofEpochMilli(0))) + .sorted(Comparator.comparing(status -> status.lastTriggered().get().at())) + .findFirst(); + return oldestRunningJob.filter(job -> job.lastTriggered().get().at().isBefore(jobTimeoutLimit())); + } + + /** Decide whether the job should be triggered by the periodic trigger */ + private boolean shouldRetryNow(JobStatus job) { + if (job.isSuccess()) return false; + if (job.isRunning(jobTimeoutLimit())) return false; + + // Retry after 10% of the time since it started failing + Duration aTenthOfFailTime = Duration.ofMillis( (clock.millis() - job.firstFailing().get().at().toEpochMilli()) / 10); + if (job.lastCompleted().get().at().isBefore(clock.instant().minus(aTenthOfFailTime))) return true; + + // ... or retry anyway if we haven't tried in 4 hours + if (job.lastCompleted().get().at().isBefore(clock.instant().minus(Duration.ofHours(4)))) return true; + + return false; + } + /** Retry immediately only if this job just started failing. Otherwise retry periodically */ private boolean shouldRetryNow(Application application, JobType jobType) { JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java index f165b4e4ea3..4a68fd6cfab 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ReadyJobsTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java @@ -14,14 +14,14 @@ import java.time.Duration; * @author bratseth */ @SuppressWarnings("unused") -public class ReadyJobsTrigger extends Maintainer { +public class BlockedChangeDeployer extends Maintainer { - public ReadyJobsTrigger(Controller controller, Duration interval, JobControl jobControl) { + public BlockedChangeDeployer(Controller controller, Duration interval, JobControl jobControl) { super(controller, interval, jobControl); } @Override - public void maintain() { + protected void maintain() { controller().applications().deploymentTrigger().triggerReadyJobs(); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java index 01edc269116..2fdce2802ab 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/ControllerMaintenance.java @@ -25,10 +25,12 @@ public class ControllerMaintenance extends AbstractComponent { private final DeploymentExpirer deploymentExpirer; private final DeploymentIssueReporter deploymentIssueReporter; private final MetricsReporter metricsReporter; + private final FailureRedeployer failureRedeployer; private final OutstandingChangeDeployer outstandingChangeDeployer; private final VersionStatusUpdater versionStatusUpdater; private final Upgrader upgrader; - private final ReadyJobsTrigger readyJobsTrigger; + private final DelayedDeployer delayedDeployer; + private final BlockedChangeDeployer blockedChangeDeployer; private final ClusterInfoMaintainer clusterInfoMaintainer; private final ClusterUtilizationMaintainer clusterUtilizationMaintainer; private final DeploymentMetricsMaintainer deploymentMetricsMaintainer; @@ -42,10 +44,12 @@ public class ControllerMaintenance extends AbstractComponent { deploymentExpirer = new DeploymentExpirer(controller, maintenanceInterval, jobControl); deploymentIssueReporter = new DeploymentIssueReporter(controller, deploymentIssues, maintenanceInterval, jobControl); metricsReporter = new MetricsReporter(controller, metric, chefClient, jobControl, controller.system()); + failureRedeployer = new FailureRedeployer(controller, maintenanceInterval, jobControl); outstandingChangeDeployer = new OutstandingChangeDeployer(controller, maintenanceInterval, jobControl); versionStatusUpdater = new VersionStatusUpdater(controller, Duration.ofMinutes(3), jobControl); upgrader = new Upgrader(controller, maintenanceInterval, jobControl, curator); - readyJobsTrigger = new ReadyJobsTrigger(controller, maintenanceInterval, jobControl); + delayedDeployer = new DelayedDeployer(controller, maintenanceInterval, jobControl); + blockedChangeDeployer = new BlockedChangeDeployer(controller, maintenanceInterval, jobControl); clusterInfoMaintainer = new ClusterInfoMaintainer(controller, Duration.ofHours(2), jobControl); clusterUtilizationMaintainer = new ClusterUtilizationMaintainer(controller, Duration.ofHours(2), jobControl); deploymentMetricsMaintainer = new DeploymentMetricsMaintainer(controller, Duration.ofMinutes(10), jobControl); @@ -61,10 +65,12 @@ public class ControllerMaintenance extends AbstractComponent { deploymentExpirer.deconstruct(); deploymentIssueReporter.deconstruct(); metricsReporter.deconstruct(); + failureRedeployer.deconstruct(); outstandingChangeDeployer.deconstruct(); versionStatusUpdater.deconstruct(); upgrader.deconstruct(); - readyJobsTrigger.deconstruct(); + delayedDeployer.deconstruct(); + blockedChangeDeployer.deconstruct(); clusterUtilizationMaintainer.deconstruct(); clusterInfoMaintainer.deconstruct(); deploymentMetricsMaintainer.deconstruct(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java new file mode 100644 index 00000000000..cb09c41a034 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DelayedDeployer.java @@ -0,0 +1,24 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.maintenance; + +import com.yahoo.vespa.hosted.controller.Controller; + +import java.time.Duration; + +/** + * Maintenance job which triggers jobs that have been delayed according to the applications deployment spec. + * + * @author mpolden + */ +public class DelayedDeployer extends Maintainer { + + public DelayedDeployer(Controller controller, Duration interval, JobControl jobControl) { + super(controller, interval, jobControl); + } + + @Override + protected void maintain() { + controller().applications().deploymentTrigger().triggerDelayed(); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java new file mode 100644 index 00000000000..72f8faa5180 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -0,0 +1,35 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.maintenance; + +import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.Controller; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; + +import java.time.Duration; +import java.util.List; + +/** + * Attempts redeployment of failed jobs and deployments. + * + * @author bratseth + * @author mpolden + */ +public class FailureRedeployer extends Maintainer { + + public FailureRedeployer(Controller controller, Duration interval, JobControl jobControl) { + super(controller, interval, jobControl); + } + + @Override + public void maintain() { + List<Application> applications = ApplicationList.from(controller().applications().asList()) + .notPullRequest() + .asList(); + applications.forEach(application -> triggerFailing(application)); + } + + private void triggerFailing(Application application) { + controller().applications().deploymentTrigger().triggerFailing(application.id()); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java index 6aa1b89c605..d7396cb2acb 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobControl.java @@ -5,7 +5,6 @@ import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import java.util.HashSet; -import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.ConcurrentSkipListSet; import java.util.logging.Logger; @@ -41,7 +40,7 @@ public class JobControl { * Returns a snapshot of the set of jobs started on this system (whether deactivated or not). * Each job is represented by its simple (omitting package) class name. */ - public Set<String> jobs() { return new LinkedHashSet<>(startedJobs); } + public Set<String> jobs() { return new HashSet<>(startedJobs); } /** Returns an unmodifiable set containing the currently inactive jobs in this */ public Set<String> inactiveJobs() { return curator.readInactiveJobs(); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java index 1d19d8ca522..bbef7980273 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Maintainer.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.google.common.util.concurrent.UncheckedTimeoutException; import com.yahoo.component.AbstractComponent; -import com.yahoo.component.ComponentId; import com.yahoo.vespa.curator.Lock; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; @@ -30,7 +29,6 @@ public abstract class Maintainer extends AbstractComponent implements Runnable { private final ScheduledExecutorService service; public Maintainer(Controller controller, Duration interval, JobControl jobControl) { - initId(new ComponentId(name())); this.controller = controller; this.maintenanceInterval = interval; this.jobControl = jobControl; |