diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-10-10 09:34:59 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-10-10 09:34:59 +0200 |
commit | ed7232a5571a44005da7fd3dbe8cbf649ea8bd5e (patch) | |
tree | abaefa4fdd13abc43669ba4369b8eefebfa3a692 /controller-server | |
parent | 9f31611891a8dfef8086f6184e042ec04363e340 (diff) |
Cleaner triggering logic
- Don't change from an upgrade to a revision change when an upgrade is in progress
but has failed so we accept as application change,
as this leads to possible uncertainty about what version we really deployed.
- Allow more application changes when an application change is in progress or we
are blocked.
- Don't trigger already running jobs.
Diffstat (limited to 'controller-server')
10 files changed, 150 insertions, 46 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java index 4bfbb345516..6f0273ddb20 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java @@ -14,7 +14,9 @@ import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobReport; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; +import com.yahoo.vespa.hosted.controller.application.JobStatus; +import java.time.Duration; import java.time.Instant; import java.util.Collections; import java.util.Comparator; @@ -287,5 +289,9 @@ public class Application { if ( ! deploying.isPresent()) return false; return deploying.get().blockedBy(deploymentSpec, instant); } - + + public boolean isBlocked(Instant instant) { + return ! deploymentSpec.canUpgradeAt(instant) || ! deploymentSpec.canChangeRevisionAt(instant); + } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index b4a83528816..9bb21864b0f 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -12,6 +12,7 @@ import com.yahoo.vespa.hosted.controller.ApplicationController; import java.time.Instant; import java.util.Comparator; import java.util.List; +import java.util.Optional; import java.util.stream.Stream; /** @@ -68,6 +69,15 @@ public class ApplicationList { return listOf(list.stream().filter(application -> ! isDeployingApplicationChange(application))); } + public ApplicationList notDeploying() { + return listOf(list.stream().filter(application -> ! application.deploying().isPresent())); + } + + /** Returns the subset of applications which has a different revision in staging tests than in any prod deployment */ + public ApplicationList hasUndeployedSuccessfulRevisionChange() { + return listOf(list.stream().filter(application -> hasUndeployedSuccessfulRevisionChange(application))); + } + /** Returns the subset of applications which currently does not have any failing jobs */ public ApplicationList notFailing() { return listOf(list.stream().filter(application -> ! application.deploymentJobs().hasFailures())); @@ -179,6 +189,21 @@ public class ApplicationList { .anyMatch(jobRun -> jobRun.version().equals(change.version())); } + private static boolean hasUndeployedSuccessfulRevisionChange(Application application) { + JobStatus stagingStatus = application.deploymentJobs().jobStatus().get(DeploymentJobs.JobType.stagingTest); + if (stagingStatus == null) return false; + if ( ! stagingStatus.lastSuccess().isPresent()) return false; + Optional<ApplicationRevision> lastStagingSuccessRevision = stagingStatus.lastSuccess().get().revision(); + if ( ! lastStagingSuccessRevision.isPresent()) return false; + + for (Deployment deployment : application.deployments().values()) { + if ( ! deployment.zone().environment().equals(Environment.prod)) continue; + + if ( ! deployment.revision().equals(lastStagingSuccessRevision.get())) return true; + } + return false; + } + /** Convenience converter from a stream to an ApplicationList */ private static ApplicationList listOf(Stream<Application> applications) { ImmutableList.Builder<Application> b = new ImmutableList.Builder<>(); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java index 5036493603a..22eff0d8db1 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java @@ -10,6 +10,7 @@ import com.yahoo.config.provision.SystemName; import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.controller.Controller; +import java.time.Duration; import java.time.Instant; import java.util.Collection; import java.util.Collections; @@ -310,4 +311,11 @@ public class DeploymentJobs { return id; } -} + /** Returns whether the given job type is currently running and was started after timeoutLimit */ + public boolean isRunning(JobType jobType, Instant timeoutLimit) { + JobStatus jobStatus = status.get(jobType); + if ( jobStatus == null) return false; + return jobStatus.isRunning(timeoutLimit); + } + +}
\ No newline at end of file diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java index 96e91d41584..38e993efc64 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java @@ -4,6 +4,7 @@ package com.yahoo.vespa.hosted.controller.application; import com.yahoo.component.Version; import com.yahoo.vespa.hosted.controller.Controller; +import java.time.Duration; import java.time.Instant; import java.util.Objects; import java.util.Optional; @@ -98,6 +99,14 @@ public class JobStatus { /** Returns true unless this job last completed with a failure */ public boolean isSuccess() { return ! jobError.isPresent(); } + + /** Returns true if last triggered is newer than last completed and was started after timeoutLimit */ + public boolean isRunning(Instant timeoutLimit) { + if ( ! lastTriggered.isPresent()) return false; + if (lastTriggered.get().at().isBefore(timeoutLimit)) return false; + if ( ! lastCompleted.isPresent()) return true; + return lastTriggered.get().at().isAfter(lastCompleted.get().at()); + } /** The error of the last completion, or empty if the last run succeeded */ public Optional<DeploymentJobs.JobError> jobError() { return jobError; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 5fe6edf84cc..bf0d7c0c5c5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -35,7 +35,10 @@ import java.util.logging.Logger; * @author mpolden */ public class DeploymentTrigger { - + + /** The max duration a job may run before we consider it dead/hanging */ + private final static Duration jobTimeout = Duration.ofHours(12); + private final static Logger log = Logger.getLogger(DeploymentTrigger.class.getName()); private final Controller controller; @@ -67,20 +70,18 @@ public class DeploymentTrigger { // Handle successful starting and ending if (report.success()) { - if (order.isFirst(report.jobType())) { - // the first job tells us that a change occurred - if (application.deploying().isPresent() && !application.deploymentJobs().hasFailures()) { // postpone until the current deployment is done + if (order.isFirst(report.jobType())) { // the first job tells us that a change occurred + if (acceptNewRevisionNow(application)) { + // Set this as the change we are doing, unless we are already pushing a platform change + if ( ! ( application.deploying().isPresent() && + (application.deploying().get() instanceof Change.VersionChange))) + application = application.withDeploying(Optional.of(Change.ApplicationChange.unknown())); + } + else { // postpone applications().store(application.withOutstandingChange(true), lock); return; - } else { // start a new change deployment - application = application.withDeploying(Optional.of(Change.ApplicationChange.unknown())); } } - else if (order.isLastBeforeProduction(report.jobType()) && application.deployingBlocked(clock.instant())) { - // run tests to provide feedback on errors but stop before production - applications().store(application.withDeploying(Optional.empty()), lock); - return; - } else if (order.isLast(report.jobType(), application) && application.deployingCompleted()) { // change completed application = application.withDeploying(Optional.empty()); @@ -108,10 +109,10 @@ public class DeploymentTrigger { /** * Called periodically to cause triggering of jobs in the background */ - public void triggerFailing(ApplicationId applicationId, Duration timeout) { + public void triggerFailing(ApplicationId applicationId) { try (Lock lock = applications().lock(applicationId)) { Application application = applications().require(applicationId); - if (!application.deploying().isPresent()) return; // No ongoing change, no need to retry + if ( ! application.deploying().isPresent()) return; // No ongoing change, no need to retry // Retry first failing job for (JobType jobType : order.jobsFrom(application.deploymentSpec())) { @@ -126,7 +127,7 @@ public class DeploymentTrigger { } // Retry dead job - Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), timeout); + Optional<JobStatus> firstDeadJob = firstDeadJob(application.deploymentJobs(), jobTimeout); if (firstDeadJob.isPresent()) { application = trigger(firstDeadJob.get().type(), application, false, "Retrying dead job", lock); @@ -253,12 +254,12 @@ public class DeploymentTrigger { .isAfter(clock.instant().minus(Duration.ofMinutes(15))); } - /** Decide whether job type should be triggered according to deployment spec */ + /** Returns whether the given job type should be triggered according to deployment spec */ private boolean deploysTo(Application application, JobType jobType) { Optional<Zone> zone = jobType.zone(controller.system()); if (zone.isPresent() && jobType.isProduction()) { // Skip triggering of jobs for zones where the application should not be deployed - if (!application.deploymentSpec().includes(jobType.environment(), Optional.of(zone.get().region()))) { + if ( ! application.deploymentSpec().includes(jobType.environment(), Optional.of(zone.get().region()))) { return false; } } @@ -275,28 +276,46 @@ public class DeploymentTrigger { * @return the application in the triggered state, which *must* be stored by the caller */ private Application trigger(JobType jobType, Application application, boolean first, String cause, Lock lock) { - if (jobType == null) return application; // previous was last job + if (jobType == null) { // previous was last job + return application; + } + + // Note: We could make a more fine-grained and more correct determination about whether to block + // by instead basing the decision on what is currently deployed in the zone. However, + // this leads to some additional corner cases, and the possibility of blocking an application + // fix to a version upgrade, so not doing it now + if (jobType.isProduction() && application.deployingBlocked(clock.instant())) { + return application; + } + + if (application.deploymentJobs().isRunning(jobType, clock.instant().minus(jobTimeout))) { + return application; + } + // TODO: Review code for retrying when no job is running but a change is in progress + // or add more ... + // TODO: Remove when we can determine why this occurs - if (jobType != JobType.component && !application.deploying().isPresent()) { + if (jobType != JobType.component && ! application.deploying().isPresent()) { log.warning(String.format("Want to trigger %s for %s with reason %s, but this application is not " + "currently deploying a change", jobType, application, cause)); return application; } - if (!deploysTo(application, jobType)) { + if ( ! deploysTo(application, jobType)) { return application; } - if (!application.deploymentJobs().isDeployableTo(jobType.environment(), application.deploying())) { + // Note that this allows a new change to catch up and prevent an older one from continuing + if ( ! application.deploymentJobs().isDeployableTo(jobType.environment(), application.deploying())) { log.warning(String.format("Want to trigger %s for %s with reason %s, but change is untested", jobType, application, cause)); return application; } // Ignore applications that are not associated with a project - if (!application.deploymentJobs().projectId().isPresent()) { + if ( ! application.deploymentJobs().projectId().isPresent()) { return application; } @@ -309,12 +328,20 @@ public class DeploymentTrigger { } private Application trigger(List<JobType> jobs, Application application, String cause, Lock lock) { - for (JobType job : jobs) { + for (JobType job : jobs) application = trigger(job, application, false, cause, lock); - } return application; } + private boolean acceptNewRevisionNow(Application application) { + if ( ! application.deploying().isPresent()) return true; + if ( application.deploying().get() instanceof Change.ApplicationChange) return true; // more changes are ok + + if ( application.deploymentJobs().hasFailures()) return true; // allow changes to fix upgrade problems + if ( application.isBlocked(clock.instant())) return true; // allow testing changes while upgrade blocked (debatable) + return false; + } + public BuildSystem buildSystem() { return buildSystem; } public DeploymentOrder deploymentOrder() { return order; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java new file mode 100644 index 00000000000..a59e86967d1 --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/BlockedChangeDeployer.java @@ -0,0 +1,33 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.maintenance; + +import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.Controller; +import com.yahoo.vespa.hosted.controller.application.ApplicationList; +import com.yahoo.vespa.hosted.controller.application.Change; + +import java.time.Duration; + +/** + * Deploys application changes which have not made it to production because of a revision change block. + * + * @author bratseth + */ +public class BlockedChangeDeployer extends Maintainer { + + public BlockedChangeDeployer(Controller controller, Duration interval, JobControl jobControl) { + super(controller, interval, jobControl); + } + + @Override + protected void maintain() { + ApplicationList applications = ApplicationList.from(controller().applications().asList()).notPullRequest(); + applications = applications.notDeploying(); + applications = applications.hasUndeployedSuccessfulRevisionChange(); + for (Application application : applications.asList()) { +// controller().applications().deploymentTrigger().triggerChange(application.id(), +// Change.ApplicationChange.of()); + } + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java index c8831bd6a56..72f8faa5180 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployer.java @@ -16,8 +16,6 @@ import java.util.List; */ public class FailureRedeployer extends Maintainer { - private final static Duration jobTimeout = Duration.ofHours(12); - public FailureRedeployer(Controller controller, Duration interval, JobControl jobControl) { super(controller, interval, jobControl); } @@ -27,11 +25,11 @@ public class FailureRedeployer extends Maintainer { List<Application> applications = ApplicationList.from(controller().applications().asList()) .notPullRequest() .asList(); - applications.forEach(application -> triggerFailing(application, jobTimeout)); + applications.forEach(application -> triggerFailing(application)); } - private void triggerFailing(Application application, Duration timeout) { - controller().applications().deploymentTrigger().triggerFailing(application.id(), timeout); + private void triggerFailing(Application application) { + controller().applications().deploymentTrigger().triggerFailing(application.id()); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index c4829e5594f..be14947de2b 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -156,9 +156,12 @@ public class DeploymentTester { if (failOnJob.isPresent()) { assertTrue(applications().require(application.id()).deploying().isPresent()); assertTrue(applications().require(application.id()).deploymentJobs().hasFailures()); - } else { + } else if (includingProductionZones) { assertFalse(applications().require(application.id()).deploying().isPresent()); } + else { + assertTrue(applications().require(application.id()).deploying().isPresent()); + } } public void notifyJobCompletion(JobType jobType, Application application, boolean success) { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 650a5865006..912aa9f2e1a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -286,12 +286,10 @@ public class DeploymentTriggerTest { ApplicationPackage changedApplication = applicationPackageBuilder.searchDefinition(searchDefinition).build(); tester.deployTestOnly(app, changedApplication); - assertFalse(tester.application("app1").deploying().isPresent()); tester.clock().advance(Duration.ofHours(2)); // Exit block window: 20:30 tester.deployCompletely(app, changedApplication); - assertFalse(tester.application("app1").deploying().isPresent()); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 77ec3579df0..a2a71f24275 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -136,9 +136,6 @@ public class UpgraderTest { // --- Failing application is repaired by changing the application, causing confidence to move above 'high' threshold // Deploy application change tester.deployCompletely("default0"); - // Complete upgrade - tester.upgrader().maintain(); - tester.completeUpgrade(default0, version, "default"); tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.high, tester.controller().versionStatus().systemVersion().get().confidence()); @@ -165,16 +162,16 @@ public class UpgraderTest { assertEquals("No applications: Nothing to do", 0, tester.buildSystem().jobs().size()); // Setup applications - Application canary0 = tester.createAndDeploy("canary0", 1, "canary"); - Application canary1 = tester.createAndDeploy("canary1", 2, "canary"); - Application default0 = tester.createAndDeploy("default0", 3, "default"); - Application default1 = tester.createAndDeploy("default1", 4, "default"); - Application default2 = tester.createAndDeploy("default2", 5, "default"); - Application default3 = tester.createAndDeploy("default3", 6, "default"); - Application default4 = tester.createAndDeploy("default4", 7, "default"); - Application default5 = tester.createAndDeploy("default5", 8, "default"); - Application default6 = tester.createAndDeploy("default6", 9, "default"); - Application default7 = tester.createAndDeploy("default7", 10, "default"); + Application canary0 = tester.createAndDeploy("canary0", 1, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 2, "canary"); + Application default0 = tester.createAndDeploy("default0", 3, "default"); + Application default1 = tester.createAndDeploy("default1", 4, "default"); + Application default2 = tester.createAndDeploy("default2", 5, "default"); + Application default3 = tester.createAndDeploy("default3", 6, "default"); + Application default4 = tester.createAndDeploy("default4", 7, "default"); + Application default5 = tester.createAndDeploy("default5", 8, "default"); + Application default6 = tester.createAndDeploy("default6", 9, "default"); + Application default7 = tester.createAndDeploy("default7", 10, "default"); Application default8 = tester.createAndDeploy("default8", 11, "default"); Application default9 = tester.createAndDeploy("default9", 12, "default"); |