From 8e303425a1d16efe02bac2e8e2400b13fdfe2f9f Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Mon, 8 Jan 2018 14:14:09 +0100 Subject: More robust upgrading - Support upgrades to a lower version than the highest deployed in an application - Check what's actually deployed when deciding whether a change is done --- .../hosted/controller/ApplicationController.java | 52 +++---- .../controller/application/DeploymentJobs.java | 15 +- .../controller/deployment/DeploymentOrder.java | 9 +- .../controller/deployment/DeploymentTrigger.java | 96 ++++++++---- .../controller/deployment/DeploymentTester.java | 21 ++- .../deployment/DeploymentTriggerTest.java | 16 +- .../ApplicationOwnershipConfirmerTest.java | 7 +- .../maintenance/FailureRedeployerTest.java | 10 +- .../controller/maintenance/UpgraderTest.java | 163 +++++++++++++++++---- .../restapi/application/responses/application.json | 2 +- .../responses/application1-recursive.json | 2 +- 11 files changed, 275 insertions(+), 118 deletions(-) (limited to 'controller-server') diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index 28fa311b841..03eb5689024 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -74,7 +74,7 @@ import java.util.stream.Collectors; /** * A singleton owned by the Controller which contains the methods and state for controlling applications. - * + * * @author bratseth */ public class ApplicationController { @@ -127,7 +127,7 @@ public class ApplicationController { /** * Returns the application with the given id - * + * * @throws IllegalArgumentException if it does not exist */ public Application require(ApplicationId id) { @@ -135,7 +135,7 @@ public class ApplicationController { } /** Returns a snapshot of all applications */ - public List asList() { + public List asList() { return db.listApplications(); } @@ -237,7 +237,7 @@ public class ApplicationController { throw new UnsupportedOperationException("Only the instance names 'default' and names starting with 'default-pr' are supported at the moment"); try (Lock lock = lock(id)) { com.yahoo.vespa.hosted.controller.api.identifiers.ApplicationId.validate(id.application().value()); - + Optional tenant = controller.tenants().tenant(new TenantId(id.tenant().value())); if ( ! tenant.isPresent()) throw new IllegalArgumentException("Could not create '" + id + "': This tenant does not exist"); @@ -250,12 +250,12 @@ public class ApplicationController { if (tenant.get().isAthensTenant()) { ZmsClient zmsClient = zmsClientFactory.createZmsClientWithAuthorizedServiceToken(token.get()); try { - zmsClient.deleteApplication(tenant.get().getAthensDomain().get(), + zmsClient.deleteApplication(tenant.get().getAthensDomain().get(), new com.yahoo.vespa.hosted.controller.api.identifiers.ApplicationId(id.application().value())); } catch (ZmsException ignored) { } - zmsClient.addApplication(tenant.get().getAthensDomain().get(), + zmsClient.addApplication(tenant.get().getAthensDomain().get(), new com.yahoo.vespa.hosted.controller.api.identifiers.ApplicationId(id.application().value())); } LockedApplication application = new LockedApplication(new Application(id), lock); @@ -326,7 +326,7 @@ public class ApplicationController { throw new IllegalArgumentException("Rejecting deployment of " + application + " to " + zone + " as " + application.deploying().get() + " is not tested"); Deployment existingDeployment = application.deployments().get(zone); - if (existingDeployment != null && existingDeployment.version().isAfter(version)) + if (zone.environment().isProduction() && existingDeployment != null && existingDeployment.version().isAfter(version)) throw new IllegalArgumentException("Rejecting deployment of " + application + " to " + zone + " as the requested version " + version + " is older than" + " the current version " + existingDeployment.version()); @@ -357,7 +357,7 @@ public class ApplicationController { // Carry out deployment options = withVersion(version, options); - ConfigServerClient.PreparedApplication preparedApplication = + ConfigServerClient.PreparedApplication preparedApplication = configserverClient.prepare(new DeploymentId(applicationId, zone), options, cnames, rotations, applicationPackage.zippedContent()); preparedApplication.activate(); @@ -382,22 +382,22 @@ public class ApplicationController { private LockedApplication deleteRemovedDeployments(LockedApplication application) { List deploymentsToRemove = application.productionDeployments().values().stream() - .filter(deployment -> ! application.deploymentSpec().includes(deployment.zone().environment(), + .filter(deployment -> ! application.deploymentSpec().includes(deployment.zone().environment(), Optional.of(deployment.zone().region()))) .collect(Collectors.toList()); if (deploymentsToRemove.isEmpty()) return application; - + if ( ! application.validationOverrides().allows(ValidationId.deploymentRemoval, clock.instant())) - throw new IllegalArgumentException(ValidationId.deploymentRemoval.value() + ": " + application + + throw new IllegalArgumentException(ValidationId.deploymentRemoval.value() + ": " + application + " is deployed in " + deploymentsToRemove.stream() .map(deployment -> deployment.zone().region().value()) - .collect(Collectors.joining(", ")) + - ", but does not include " + + .collect(Collectors.joining(", ")) + + ", but does not include " + (deploymentsToRemove.size() > 1 ? "these zones" : "this zone") + " in deployment.xml"); - + LockedApplication applicationWithRemoval = application; for (Deployment deployment : deploymentsToRemove) applicationWithRemoval = deactivate(applicationWithRemoval, deployment.zone()); @@ -416,7 +416,7 @@ public class ApplicationController { } /** - * Returns the existing triggering of the given type from this application, + * Returns the existing triggering of the given type from this application, * or an incomplete one created in this method if none is present * This is needed (only) in the case where some external entity triggers a job. */ @@ -428,13 +428,13 @@ public class ApplicationController { } private JobStatus.JobRun incompleteTriggeringEvent(Version version) { - return new JobStatus.JobRun(-1, version, Optional.empty(), false, "", clock.instant()); + return new JobStatus.JobRun(-1, version, Optional.empty(), false, "", clock.instant()); } - + private DeployOptions withVersion(Version version, DeployOptions options) { - return new DeployOptions(options.screwdriverBuildJob, - Optional.of(version), - options.ignoreValidationErrors, + return new DeployOptions(options.screwdriverBuildJob, + Optional.of(version), + options.ignoreValidationErrors, options.deployCurrentVersion); } @@ -442,7 +442,7 @@ public class ApplicationController { Optional screwDriverBuildJob) { if ( ! screwDriverBuildJob.isPresent()) return ApplicationRevision.from(applicationPackage.hash()); - + GitRevision gitRevision = screwDriverBuildJob.get().gitRevision; if (gitRevision.repository == null || gitRevision.branch == null || gitRevision.commit == null) return ApplicationRevision.from(applicationPackage.hash()); @@ -507,7 +507,7 @@ public class ApplicationController { /** * Deletes the the given application. All known instances of the applications will be deleted, * including PR instances. - * + * * @throws IllegalArgumentException if the application has deployments or the caller is not authorized * @throws NotExistsException if no instances of the application exist */ @@ -544,9 +544,9 @@ public class ApplicationController { })); } - /** - * Replace any previous version of this application by this instance - * + /** + * Replace any previous version of this application by this instance + * * @param application a locked application to store */ public void store(LockedApplication application) { @@ -580,7 +580,7 @@ public class ApplicationController { public void notifyJobCompletion(JobReport report) { if ( ! get(report.applicationId()).isPresent()) { - log.log(Level.WARNING, "Ignoring completion of job of project '" + report.projectId() + + log.log(Level.WARNING, "Ignoring completion of job of project '" + report.projectId() + "': Unknown application '" + report.applicationId() + "'"); return; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java index 5b104f19937..d1e2d91ddbd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java @@ -23,7 +23,7 @@ import java.util.stream.Stream; /** * Information about which deployment jobs an application should run and their current status. * This is immutable. - * + * * @author bratseth * @author mpolden */ @@ -129,7 +129,7 @@ public class DeploymentJobs { return true; // other environments do not have any preconditions } - /** Returns whether job has completed successfully */ + /** Returns whether the job of the given type has completed successfully for the given change */ public boolean isSuccessful(Change change, JobType jobType) { return Optional.ofNullable(jobStatus().get(jobType)) .flatMap(JobStatus::lastSuccess) @@ -138,7 +138,7 @@ public class DeploymentJobs { } /** - * Returns the id of the Screwdriver project running these deployment jobs + * Returns the id of the Screwdriver project running these deployment jobs * - or empty when this is not known or does not exist. * It is not known until the jobs have run once and reported back to the controller. */ @@ -194,7 +194,10 @@ public class DeploymentJobs { /** Returns whether this is a production job */ public boolean isProduction() { return environment() == Environment.prod; } - + + /** Returns whether this is an automated test job */ + public boolean isTest() { return environment() != null && environment().isTest(); } + /** Returns the environment of this job type, or null if it does not have an environment */ public Environment environment() { switch (this) { @@ -215,7 +218,7 @@ public class DeploymentJobs { .filter(jobType -> jobType.jobName.equals(jobName)) .findAny().orElseThrow(() -> new IllegalArgumentException("Unknown job name '" + jobName + "'")); } - + /** Returns the job type for the given zone */ public static Optional from(SystemName system, ZoneId zone) { return Stream.of(values()) @@ -270,4 +273,4 @@ public class DeploymentJobs { outOfCapacity; } -} \ No newline at end of file +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java index dd7befb6d63..25b3cd0fd1e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java @@ -2,10 +2,10 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.config.application.api.DeploymentSpec; -import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.LockedApplication; +import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; @@ -51,7 +51,7 @@ public class DeploymentOrder { return Collections.emptyList(); } - // Always trigger system test after component as deployment spec might not be available yet + // Always trigger system test after component as deployment spec might not be available yet // (e.g. if this is a new application with no previous deployments) if (job == JobType.component) { return Collections.singletonList(JobType.systemTest); @@ -70,11 +70,6 @@ public class DeploymentOrder { return Collections.emptyList(); } - // Postpone if step hasn't completed all its jobs for this change - if ( ! completedSuccessfully(currentStep.get(), application.deploying().get(), application)) { - return Collections.emptyList(); - } - // Postpone next job if delay has not passed yet Duration delay = delayAfter(currentStep.get(), application); if (shouldPostponeDeployment(delay, job, application)) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 6a9db3ae917..4279949741d 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -4,11 +4,11 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.component.Version; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.SystemName; -import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.LockedApplication; +import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Change.VersionChange; @@ -118,9 +118,31 @@ public class DeploymentTrigger { /** Returns whether all production zones listed in deployment spec last were successful on the currently deploying change. */ private boolean deploymentComplete(LockedApplication application) { if ( ! application.deploying().isPresent()) return true; - return order.jobsFrom(application.deploymentSpec()).stream() - .filter(JobType::isProduction) - .allMatch(jobType -> application.deploymentJobs().isSuccessful(application.deploying().get(), jobType)); + Change change = application.deploying().get(); + + for (JobType job : order.jobsFrom(application.deploymentSpec())) { + if ( ! job.isProduction()) continue; + + Optional zone = job.zone(this.controller.system()); + if ( ! zone.isPresent()) continue; + + Deployment deployment = application.deployments().get(zone.get()); + if (deployment == null) return false; + + // Check actual job outcome (the deployment) + if (change instanceof VersionChange) { + if (((VersionChange)change).version().isAfter(deployment.version())) return false; // later is ok + } + else if (((Change.ApplicationChange)change).revision().isPresent()) { + if ( ! deployment.revision().equals(((Change.ApplicationChange)change).revision().get())) return false; + } + else { + return false; // If we don't yet know the revision we are changing to, then we are not complete + } + + } + + return true; } /** @@ -134,7 +156,7 @@ public class DeploymentTrigger { } /** Find the next step to trigger if any, and triggers it */ - private void triggerReadyJobs(LockedApplication application) { + public void triggerReadyJobs(LockedApplication application) { if ( ! application.deploying().isPresent()) return; List jobs = order.jobsFrom(application.deploymentSpec()); @@ -186,29 +208,37 @@ public class DeploymentTrigger { */ private boolean changesAvailable(Application application, JobStatus previous, JobStatus next) { if ( ! application.deploying().isPresent()) return false; - Change change = application.deploying().get(); - if ( ! previous.lastSuccess().isPresent()) return false; + Change change = application.deploying().get(); - if (change instanceof Change.VersionChange) { + if (change instanceof Change.VersionChange) { // Propagate upgrade while making sure we never downgrade Version targetVersion = ((Change.VersionChange)change).version(); - if ( ! (targetVersion.equals(previous.lastSuccess().get().version())) ) - return false; // version is outdated - // The below is checked again in allowedTriggering, right before actual triggering. - if (next != null && isOnNewerVersionInProductionThan(targetVersion, application, next.type())) - return false; // Don't downgrade - } - if (next == null) return true; - if ( ! next.lastSuccess().isPresent()) return true; + // Is the target version tested? + if ( ! lastSuccessfulIs(targetVersion, JobType.stagingTest, application)) + return false; + + // Is it this job's turn to upgrade? + if ( previous.type().isProduction() && ! isOnAtLeastProductionVersion(targetVersion, application, previous.type())) + return false; + + // Is the next a test zone which already completed this upgrade successfully? + if (next != null && next.type().isTest() && lastSuccessfulIs(targetVersion, next.type(), application)) + return false; + + // Is the next a production job for a zone already upgraded to this version or a newer one? + if (next != null && next.type().isProduction() && isOnAtLeastProductionVersion(targetVersion, application, next.type())) + return false; - JobStatus.JobRun previousSuccess = previous.lastSuccess().get(); - JobStatus.JobRun nextSuccess = next.lastSuccess().get(); - if (previousSuccess.revision().isPresent() && ! previousSuccess.revision().equals(nextSuccess.revision())) - return true; - if ( ! previousSuccess.version().equals(nextSuccess.version())) return true; - return false; + } + else { // revision changes do not need to handle downgrading + if ( ! previous.lastSuccess().isPresent()) return false; + if (next == null) return true; + if ( ! next.lastSuccess().isPresent()) return true; + return previous.lastSuccess().get().revision().isPresent() && + ! previous.lastSuccess().get().revision().equals(next.lastSuccess().get().revision()); + } } /** @@ -332,8 +362,9 @@ public class DeploymentTrigger { if (jobType.isProduction() && application.deploying().isPresent() && application.deploying().get().blockedBy(application.deploymentSpec(), clock.instant())) return false; + // Don't downgrade or redeploy the same version in production if (application.deploying().isPresent() && application.deploying().get() instanceof VersionChange && - isOnNewerVersionInProductionThan(((VersionChange) application.deploying().get()).version(), application, jobType)) return false; + jobType.isProduction() && isOnAtLeastProductionVersion(((VersionChange) application.deploying().get()).version(), application, jobType)) return false; if (application.deploymentJobs().isRunning(jobType, jobTimeoutLimit())) return false; if ( ! hasJob(jobType, application)) return false; @@ -352,18 +383,19 @@ public class DeploymentTrigger { /** * Returns whether the current deployed version in the zone given by the job - * is newer than the given version. This may be the case even if the production job + * is newer or equal to the given version. This may be the case even if the production job * in question failed, if the failure happens after deployment. * In that case we should never deploy an earlier version as that may potentially - * downgrade production nodes which we are not guaranteed to support. + * downgrade production nodes which we are not guaranteed to support, and upgradibng to the current + * version is just unnecessary work. */ - private boolean isOnNewerVersionInProductionThan(Version version, Application application, JobType job) { - if ( ! job.isProduction()) return false; + private boolean isOnAtLeastProductionVersion(Version version, Application application, JobType job) { + if ( ! job.isProduction()) return false; // TODO: Throw if not prod, and change name/doc accordingly Optional zone = job.zone(controller.system()); if ( ! zone.isPresent()) return false; Deployment existingDeployment = application.deployments().get(zone.get()); if (existingDeployment == null) return false; - return existingDeployment.version().isAfter(version); + return existingDeployment.version().isAfter(version) || existingDeployment.version().equals(version); } private boolean acceptNewRevisionNow(LockedApplication application) { @@ -379,4 +411,12 @@ public class DeploymentTrigger { return false; } + private boolean lastSuccessfulIs(Version version, JobType jobType, Application application) { + JobStatus status = application.deploymentJobs().jobStatus().get(jobType); + if (status == null) return false; + Optional lastSuccessfulStagingRun = status.lastSuccess(); + if ( ! lastSuccessfulStagingRun.isPresent()) return false; + return lastSuccessfulStagingRun.get().version().equals(version); + } + } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index 2b0e953c12c..de8b6b9cd18 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -98,7 +98,7 @@ public class DeploymentTester { public void updateVersionStatus() { controller().updateVersionStatus(VersionStatus.compute(controller(), tester.controller().systemVersion())); } - + public void updateVersionStatus(Version currentVersion) { configServer().setDefaultVersion(currentVersion); controller().updateVersionStatus(VersionStatus.compute(controller(), currentVersion)); @@ -174,7 +174,7 @@ public class DeploymentTester { completeDeployment(application, applicationPackage, Optional.empty(), false); } - private void completeDeployment(Application application, ApplicationPackage applicationPackage, + private void completeDeployment(Application application, ApplicationPackage applicationPackage, Optional failOnJob, boolean includingProductionZones) { DeploymentOrder order = new DeploymentOrder(controller()); List jobs = order.jobsFrom(applicationPackage.deploymentSpec()); @@ -208,9 +208,13 @@ public class DeploymentTester { } public void completeUpgrade(Application application, Version version, String upgradePolicy) { - assertTrue(applications().require(application.id()).deploying().isPresent()); + completeUpgrade(application, version, applicationPackage(upgradePolicy)); + } + + public void completeUpgrade(Application application, Version version, ApplicationPackage applicationPackage) { + assertTrue(application + " has a deployment", applications().require(application.id()).deploying().isPresent()); assertEquals(new Change.VersionChange(version), applications().require(application.id()).deploying().get()); - completeDeployment(application, applicationPackage(upgradePolicy), Optional.empty(), true); + completeDeployment(application, applicationPackage, Optional.empty(), true); } public void completeUpgradeWithError(Application application, Version version, String upgradePolicy, JobType failOnJob) { @@ -235,6 +239,10 @@ public class DeploymentTester { job.zone(controller().system()).ifPresent(zone -> tester.deploy(application, zone, applicationPackage, deployCurrentVersion)); } + public void deployAndNotify(Application application, String upgradePolicy, boolean success, JobType... jobs) { + deployAndNotify(application, applicationPackage(upgradePolicy), success, true, jobs); + } + public void deployAndNotify(Application application, ApplicationPackage applicationPackage, boolean success, JobType... jobs) { deployAndNotify(application, applicationPackage, success, true, jobs); @@ -264,9 +272,10 @@ public class DeploymentTester { } private BuildService.BuildJob findJob(Application application, JobType jobType) { - for (BuildService.BuildJob job : buildSystem().jobs()) + for (BuildService.BuildJob job : buildSystem().jobs()) { if (job.projectId() == application.deploymentJobs().projectId().get() && job.jobName().equals(jobType.jobName())) return job; + } throw new NoSuchElementException(jobType + " is not scheduled for " + application); } @@ -281,6 +290,8 @@ public class DeploymentTester { .upgradePolicy(upgradePolicy) .environment(Environment.prod) .region("us-west-1") + .environment(Environment.prod) + .region("us-east-3") .build(); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 10f8e80f318..b71a9090c79 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -217,16 +217,14 @@ public class DeploymentTriggerTest { tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.systemTest); tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.stagingTest); - // Parallel deployment - tester.deploy(DeploymentJobs.JobType.productionUsWest1, app, applicationPackage); - tester.deploy(DeploymentJobs.JobType.productionUsEast3, app, applicationPackage); - // Last declared job completes first + tester.deploy(DeploymentJobs.JobType.productionUsWest1, app, applicationPackage); tester.notifyJobCompletion(DeploymentJobs.JobType.productionUsWest1, app, true); assertTrue("Change is present as not all jobs are complete", tester.applications().require(app.id()).deploying().isPresent()); // All jobs complete + tester.deploy(DeploymentJobs.JobType.productionUsEast3, app, applicationPackage); tester.notifyJobCompletion(DeploymentJobs.JobType.productionUsEast3, app, true); assertFalse("Change has been deployed", tester.applications().require(app.id()).deploying().isPresent()); @@ -287,13 +285,13 @@ public class DeploymentTriggerTest { Application app = tester.createAndDeploy("app1", 1, applicationPackageBuilder.build()); - - + + tester.clock().advance(Duration.ofHours(1)); // --------------- Enter block window: 18:30 - + readyJobsTrigger.run(); assertEquals(0, tester.buildSystem().jobs().size()); - + String searchDefinition = "search test {\n" + " document test {\n" + @@ -314,7 +312,7 @@ public class DeploymentTriggerTest { BuildService.BuildJob productionJob = tester.buildSystem().takeJobsToRun().get(0); assertEquals("production-us-west-1", productionJob.jobName()); } - + @Test public void testUpgradingButNoJobStarted() { DeploymentTester tester = new DeploymentTester(); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmerTest.java index f5be882fcb8..5039777ec7d 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/ApplicationOwnershipConfirmerTest.java @@ -1,11 +1,7 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.config.provision.ApplicationId; -import com.yahoo.config.provision.Environment; -import com.yahoo.config.provision.RegionName; -import com.yahoo.config.provision.Zone; import com.yahoo.vespa.hosted.controller.Application; -import com.yahoo.vespa.hosted.controller.ControllerTester; import com.yahoo.vespa.hosted.controller.api.Tenant; import com.yahoo.vespa.hosted.controller.api.identifiers.PropertyId; import com.yahoo.vespa.hosted.controller.api.identifiers.TenantId; @@ -74,9 +70,10 @@ public class ApplicationOwnershipConfirmerTest { assertEquals("Confirmation issue reference is not updated when no issue id is returned.", issueId, propertyApp.get().ownershipIssueId()); assertEquals("Confirmation issue reference is not updated when no issue id is returned.", issueId, userApp.get().ownershipIssueId()); - // The user deletes its production deployment — see that the issue is forgotten. + // The user deletes all production deployments — see that the issue is forgotten. assertEquals("Confirmation issue for user is sitll open.", issueId, userApp.get().ownershipIssueId()); tester.controller().applications().deactivate(userApp.get(), userApp.get().productionDeployments().keySet().stream().findAny().get()); + tester.controller().applications().deactivate(userApp.get(), userApp.get().productionDeployments().keySet().stream().findAny().get()); assertTrue("No production deployments are listed for user.", userApp.get().productionDeployments().isEmpty()); confirmer.maintain(); confirmer.maintain(); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java index fd00123c697..62e6d379c60 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/FailureRedeployerTest.java @@ -89,6 +89,7 @@ public class FailureRedeployerTest { tester.clock().advance(Duration.ofMinutes(5)); tester.readyJobTrigger().maintain(); assertEquals("Job is retried", 1, tester.buildSystem().jobs().size()); + assertEquals(DeploymentJobs.JobType.productionUsEast3.jobName(), tester.buildSystem().jobs().get(0).jobName()); // Production job finally succeeds tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); @@ -220,9 +221,14 @@ public class FailureRedeployerTest { tester.readyJobTrigger().maintain(); assertTrue("No jobs retried", tester.buildSystem().jobs().isEmpty()); - // Deployment completes + // Deployment notifies completeness but has not actually made a deployment + tester.notifyJobCompletion(DeploymentJobs.JobType.productionCdUsCentral1, application, true); + assertTrue("Change not really deployed", tester.application(application.id()).deploying().isPresent()); + + // Deployment actually deploys and notifies completeness + tester.deploy(DeploymentJobs.JobType.productionCdUsCentral1, application, applicationPackage); tester.notifyJobCompletion(DeploymentJobs.JobType.productionCdUsCentral1, application, true); - assertFalse("Change deployed", tester.application(application.id()).deploying().isPresent()); + assertFalse("Change not really deployed", tester.application(application.id()).deploying().isPresent()); } @Test diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 90721e7be6b..fd13b99f25e 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -4,10 +4,10 @@ package com.yahoo.vespa.hosted.controller.maintenance; import com.yahoo.component.Version; import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.RegionName; -import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.test.ManualClock; import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ControllerTester; +import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; @@ -95,7 +95,7 @@ public class UpgraderTest { assertEquals("New system version: Should upgrade Canaries", 2, tester.buildSystem().jobs().size()); tester.completeUpgradeWithError(canary0, version, "canary", DeploymentJobs.JobType.stagingTest); assertEquals("Other Canary was cancelled", 2, tester.buildSystem().jobs().size()); - // TODO: Cancelled would mean it was triggerd, removed from the build system, but never reported in. + // TODO: Cancelled would mean it was triggered, removed from the build system, but never reported in. // Thus, the expected number of jobs should be 1, above: the retrying canary0. // Further, canary1 should be retried after the timeout period of 12 hours, but verifying this is // not possible when jobs are consumed form the build system on notification, rather than on deploy. @@ -161,7 +161,7 @@ public class UpgraderTest { tester.updateVersionStatus(version); tester.upgrader().maintain(); assertEquals("Applications are on 5.3 - nothing to do", 0, tester.buildSystem().jobs().size()); - + // --- Starting upgrading to a new version which breaks, causing upgrades to commence on the previous version Version version54 = Version.fromString("5.4"); Application default3 = tester.createAndDeploy("default3", 5, "default"); // need 4 to break a version @@ -384,6 +384,101 @@ public class UpgraderTest { assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); } + /** + * Scenario: + * An application A is on version V0 + * Version V2 is released. + * A upgrades one production zone to V2. + * V2 is marked as broken and upgrade of A to V2 is cancelled. + * Upgrade of A to V1 is scheduled: Should skip the zone on V2 but upgrade the next zone to V1 + */ + @Test + public void testVersionIsBrokenAfterAZoneIsLive() { + DeploymentTester tester = new DeploymentTester(); + Version v0 = Version.fromString("5.0"); + tester.updateVersionStatus(v0); + + // Setup applications on V0 + Application canary0 = tester.createAndDeploy("canary0", 1, "canary"); + Application canary1 = tester.createAndDeploy("canary1", 2, "canary"); + Application default0 = tester.createAndDeploy("default0", 3, "default"); + Application default1 = tester.createAndDeploy("default1", 4, "default"); + Application default2 = tester.createAndDeploy("default2", 5, "default"); + Application default3 = tester.createAndDeploy("default3", 6, "default"); + Application default4 = tester.createAndDeploy("default4", 7, "default"); + + // V1 is released + Version v1 = Version.fromString("5.1"); + tester.updateVersionStatus(v1); + assertEquals(v1, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // Canaries upgrade and raise confidence of V+1 (other apps are not upgraded) + tester.completeUpgrade(canary0, v1, "canary"); + tester.completeUpgrade(canary1, v1, "canary"); + tester.updateVersionStatus(v1); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // V2 is released + Version v2 = Version.fromString("5.2"); + tester.updateVersionStatus(v2); + assertEquals(v2, tester.controller().versionStatus().systemVersion().get().versionNumber()); + tester.upgrader().maintain(); + + // We "manually" cancel upgrades to V1 so that we can use the applications to make V2 fail instead + // But we keep one (default4) to avoid V1 being garbage collected + tester.deploymentTrigger().cancelChange(default0.id()); + tester.deploymentTrigger().cancelChange(default1.id()); + tester.deploymentTrigger().cancelChange(default2.id()); + tester.deploymentTrigger().cancelChange(default3.id()); + tester.clock().advance(Duration.ofHours(13)); // Currently we don't cancel running jobs, so this is necessary to allow a new triggering below + + // Canaries upgrade and raise confidence of V2 + tester.completeUpgrade(canary0, v2, "canary"); + tester.completeUpgrade(canary1, v2, "canary"); + tester.updateVersionStatus(v2); + assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); + + // Applications with default policy start upgrading to V2 + tester.upgrader().maintain(); + assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); + + // 4/5 applications fail (in the last prod zone) and lowers confidence + tester.completeUpgradeWithError(default0, v2, "default", DeploymentJobs.JobType.productionUsEast3); + tester.completeUpgradeWithError(default1, v2, "default", DeploymentJobs.JobType.productionUsEast3); + tester.completeUpgradeWithError(default2, v2, "default", DeploymentJobs.JobType.productionUsEast3); + tester.completeUpgradeWithError(default3, v2, "default", DeploymentJobs.JobType.productionUsEast3); + tester.updateVersionStatus(v2); + assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); + + assertEquals(v2, tester.application("default0").deployments().get(ZoneId.from("prod.us-west-1")).version()); + assertEquals(v0, tester.application("default0").deployments().get(ZoneId.from("prod.us-east-3")).version()); + tester.upgrader().maintain(); + assertEquals("Upgrade to 5.1 scheduled for apps not completely on 5.1 or 5.2", 5, tester.buildSystem().jobs().size()); + + tester.deploymentTrigger().triggerReadyJobs(); + assertEquals("Testing of 5.1 for 5 applications is triggered", 5, tester.buildSystem().jobs().size()); + assertEquals(DeploymentJobs.JobType.systemTest.jobName(), tester.buildSystem().jobs().get(0).jobName()); + assertEquals(DeploymentJobs.JobType.systemTest.jobName(), tester.buildSystem().jobs().get(1).jobName()); + assertEquals(DeploymentJobs.JobType.systemTest.jobName(), tester.buildSystem().jobs().get(2).jobName()); + assertEquals(DeploymentJobs.JobType.systemTest.jobName(), tester.buildSystem().jobs().get(3).jobName()); + assertEquals(DeploymentJobs.JobType.systemTest.jobName(), tester.buildSystem().jobs().get(4).jobName()); + + // The tester code for completing upgrades does not handle this scenario, so we trigger each step manually (for one app) + tester.deployAndNotify(tester.application("default0"), "default", true, DeploymentJobs.JobType.systemTest); + tester.deployAndNotify(tester.application("default0"), "default", true, DeploymentJobs.JobType.stagingTest); + // prod zone on 5.2 (usWest1) is skipped, but we still trigger the next zone from triggerReadyJobs: + tester.clock().advance(Duration.ofHours(13)); // Currently we don't cancel running jobs, so this is necessary to allow a new triggering below + tester.deploymentTrigger().triggerReadyJobs(); + tester.deployAndNotify(tester.application("default0"), "default", true, DeploymentJobs.JobType.productionUsEast3); + + // Resulting state: + assertEquals(v2, tester.application("default0").deployments().get(ZoneId.from("prod.us-west-1")).version()); + assertEquals("Last zone is upgraded to v1", + v1, tester.application("default0").deployments().get(ZoneId.from("prod.us-east-3")).version()); + assertFalse(tester.application("default0").deploying().isPresent()); + } + @Test public void testConfidenceIgnoresFailingApplicationChanges() { DeploymentTester tester = new DeploymentTester(); @@ -581,7 +676,7 @@ public class UpgraderTest { tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsCentral1); tester.deployAndNotify(app, applicationPackage, true, DeploymentJobs.JobType.productionUsEast3); assertTrue("All jobs consumed", tester.buildSystem().jobs().isEmpty()); - + // App is completely upgraded to the latest version for (Deployment deployment : tester.applications().require(app.id()).deployments().values()) assertEquals(version, deployment.version()); @@ -593,20 +688,26 @@ public class UpgraderTest { Version version = Version.fromString("5.0"); tester.updateVersionStatus(version); - ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + ApplicationPackage canaryApplicationPackage = new ApplicationPackageBuilder() + .upgradePolicy("canary") + .environment(Environment.prod) + .region("us-west-1") + .build(); + ApplicationPackage defaultApplicationPackage = new ApplicationPackageBuilder() + .upgradePolicy("default") .environment(Environment.prod) .region("us-west-1") .build(); // Setup applications - Application canary0 = tester.createAndDeploy("canary0", 1, "canary"); - Application canary1 = tester.createAndDeploy("canary1", 2, "canary"); - Application default0 = tester.createAndDeploy("default0", 3, "default"); - Application default1 = tester.createAndDeploy("default1", 4, "default"); - Application default2 = tester.createAndDeploy("default2", 5, "default"); - Application default3 = tester.createAndDeploy("default3", 6, "default"); - Application default4 = tester.createAndDeploy("default4", 7, "default"); - + Application canary0 = tester.createAndDeploy("canary0", 1, canaryApplicationPackage); + Application canary1 = tester.createAndDeploy("canary1", 2, canaryApplicationPackage); + Application default0 = tester.createAndDeploy("default0", 3, defaultApplicationPackage); + Application default1 = tester.createAndDeploy("default1", 4, defaultApplicationPackage); + Application default2 = tester.createAndDeploy("default2", 5, defaultApplicationPackage); + Application default3 = tester.createAndDeploy("default3", 6, defaultApplicationPackage); + Application default4 = tester.createAndDeploy("default4", 7, defaultApplicationPackage); + assertEquals(version, default0.oldestDeployedVersion().get()); // New version is released @@ -616,8 +717,8 @@ public class UpgraderTest { tester.upgrader().maintain(); // Canaries upgrade and raise confidence - tester.completeUpgrade(canary0, version, "canary"); - tester.completeUpgrade(canary1, version, "canary"); + tester.completeUpgrade(canary0, version, canaryApplicationPackage); + tester.completeUpgrade(canary1, version, canaryApplicationPackage); tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); @@ -627,10 +728,10 @@ public class UpgraderTest { assertEquals("Upgrade scheduled for remaining apps", 5, tester.buildSystem().jobs().size()); // 4/5 applications fail, confidence is lowered and upgrade is cancelled - tester.completeUpgradeWithError(default0, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default1, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default2, version, "default", DeploymentJobs.JobType.systemTest); - tester.completeUpgradeWithError(default3, version, "default", DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default0, version, defaultApplicationPackage, DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default1, version, defaultApplicationPackage, DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default2, version, defaultApplicationPackage, DeploymentJobs.JobType.systemTest); + tester.completeUpgradeWithError(default3, version, defaultApplicationPackage, DeploymentJobs.JobType.systemTest); tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); @@ -648,21 +749,27 @@ public class UpgraderTest { assertTrue("Jobs in progress", deadLocked.deploymentJobs().isRunning(tester.controller().applications().deploymentTrigger().jobTimeoutLimit())); assertFalse("No change present", deadLocked.deploying().isPresent()); - // 4/5 applications are repaired and confidence is restored - tester.deployCompletely(default0, applicationPackage); - tester.deployCompletely(default1, applicationPackage); - tester.deployCompletely(default2, applicationPackage); - tester.deployCompletely(default3, applicationPackage); + // 4 out of 5 applications are repaired and confidence is restored + ApplicationPackage defaultApplicationPackageV2 = new ApplicationPackageBuilder() + .searchDefinition("search test { field test type string {} }") + .upgradePolicy("default") + .environment(Environment.prod) + .region("us-west-1") + .build(); + tester.deployCompletely(default0, defaultApplicationPackageV2); + tester.deployCompletely(default1, defaultApplicationPackageV2); + tester.deployCompletely(default2, defaultApplicationPackageV2); + tester.deployCompletely(default3, defaultApplicationPackageV2); tester.updateVersionStatus(version); assertEquals(VespaVersion.Confidence.normal, tester.controller().versionStatus().systemVersion().get().confidence()); tester.upgrader().maintain(); assertEquals("Upgrade scheduled for previously failing apps", 4, tester.buildSystem().jobs().size()); - tester.completeUpgrade(default0, version, "default"); - tester.completeUpgrade(default1, version, "default"); - tester.completeUpgrade(default2, version, "default"); - tester.completeUpgrade(default3, version, "default"); + tester.completeUpgrade(default0, version, defaultApplicationPackageV2); + tester.completeUpgrade(default1, version, defaultApplicationPackageV2); + tester.completeUpgrade(default2, version, defaultApplicationPackageV2); + tester.completeUpgrade(default3, version, defaultApplicationPackageV2); assertEquals(version, tester.application(default0.id()).oldestDeployedVersion().get()); assertEquals(version, tester.application(default1.id()).oldestDeployedVersion().get()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json index b5ed2d407df..66b8442730a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json @@ -111,7 +111,7 @@ "gitCommit": "commit1" } }, - "reason": "Immediate retry on failure", + "reason": "staging-test completed", "at": "(ignore)" }, "lastCompleted": { diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json index caca0ad8970..89c84a98543 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json @@ -111,7 +111,7 @@ "gitCommit": "commit1" } }, - "reason": "Immediate retry on failure", + "reason": "staging-test completed", "at": "(ignore)" }, "lastCompleted": { -- cgit v1.2.3