diff options
author | Martin Polden <mpolden@mpolden.no> | 2018-06-15 15:32:58 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-06-15 15:32:58 +0200 |
commit | fe1de99c5baab5147a49e20b1a5ddf3d7fada0f0 (patch) | |
tree | fe04443f97342442b13caf0d32debd786d4f924c | |
parent | 59b82da9c6fb37328a2ed1f7a7f485972537a9a7 (diff) | |
parent | ea00a066a41493412272dac1d03c4164796b615a (diff) |
Merge pull request #6201 from vespa-engine/mpolden/back-off-failed-job-retry
Back off when retrying failing jobs
11 files changed, 460 insertions, 261 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java index f44d2218145..8d15db888fd 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java @@ -2,8 +2,6 @@ package com.yahoo.vespa.hosted.controller.application; import com.yahoo.component.Version; -import com.yahoo.vespa.hosted.controller.Controller; -import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger; import java.time.Instant; import java.util.Objects; @@ -94,6 +92,11 @@ public class JobStatus { /** The error of the last completion, or empty if the last run succeeded */ public Optional<DeploymentJobs.JobError> jobError() { return jobError; } + /** Returns whether this last failed on out of capacity */ + public boolean isOutOfCapacity() { + return jobError.filter(error -> error == DeploymentJobs.JobError.outOfCapacity).isPresent(); + } + /** * Returns the last triggering of this job, or empty if the controller has never triggered it * and not seen a deployment for it diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java deleted file mode 100644 index 1c535a5a331..00000000000 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hosted.controller.deployment; - -import com.yahoo.config.application.api.DeploymentSpec; -import com.yahoo.config.provision.SystemName; -import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; -import com.yahoo.vespa.hosted.controller.application.Deployment; -import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; -import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; -import com.yahoo.vespa.hosted.controller.application.JobStatus; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Objects; -import java.util.function.Supplier; - -import static java.util.Comparator.comparingInt; -import static java.util.stream.Collectors.collectingAndThen; -import static java.util.stream.Collectors.toList; - -/** - * This class determines the order of deployments according to an application's deployment spec. - * - * @author mpolden - */ -public class DeploymentOrder { - - private final Supplier<SystemName> system; - - public DeploymentOrder(Supplier<SystemName> system) { - this.system = Objects.requireNonNull(system, "system may not be null"); - } - - /** Returns jobs for given deployment spec, in the order they are declared */ - public List<JobType> jobsFrom(DeploymentSpec deploymentSpec) { - return deploymentSpec.steps().stream() - .flatMap(step -> step.zones().stream()) - .map(this::toJob) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); - } - - /** Returns job status sorted according to deployment spec */ - public List<JobStatus> sortBy(DeploymentSpec deploymentSpec, Collection<JobStatus> jobStatus) { - List<DeploymentJobs.JobType> sortedJobs = jobsFrom(deploymentSpec); - return jobStatus.stream() - .sorted(comparingInt(job -> sortedJobs.indexOf(job.type()))) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); - } - - /** Returns deployments sorted according to declared zones */ - public List<Deployment> sortBy(List<DeploymentSpec.DeclaredZone> zones, Collection<Deployment> deployments) { - List<ZoneId> productionZones = zones.stream() - .filter(z -> z.region().isPresent()) - .map(z -> ZoneId.from(z.environment(), z.region().get())) - .collect(toList()); - return deployments.stream() - .sorted(comparingInt(deployment -> productionZones.indexOf(deployment.zone()))) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); - } - - /** Resolve job from deployment step */ - public JobType toJob(DeploymentSpec.DeclaredZone zone) { - return JobType.from(system.get(), zone.environment(), zone.region().orElse(null)) - .orElseThrow(() -> new IllegalArgumentException("Invalid zone " + zone)); - } - -} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentSteps.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentSteps.java new file mode 100644 index 00000000000..d399ef977ac --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentSteps.java @@ -0,0 +1,113 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.deployment; + +import com.yahoo.config.application.api.DeploymentSpec; +import com.yahoo.config.provision.Environment; +import com.yahoo.config.provision.SystemName; +import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; +import com.yahoo.vespa.hosted.controller.application.Deployment; +import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; +import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; +import com.yahoo.vespa.hosted.controller.application.JobStatus; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import static java.util.Collections.singletonList; +import static java.util.Comparator.comparingInt; +import static java.util.stream.Collectors.collectingAndThen; + +/** + * This class provides helper methods for reading a deployment spec. + * + * @author mpolden + */ +public class DeploymentSteps { + + private final DeploymentSpec spec; + private final Supplier<SystemName> system; + + public DeploymentSteps(DeploymentSpec spec, Supplier<SystemName> system) { + this.spec = Objects.requireNonNull(spec, "spec cannot be null"); + this.system = Objects.requireNonNull(system, "system cannot be null"); + } + + /** Returns jobs for this, in the order they are declared */ + public List<JobType> jobs() { + return spec.steps().stream() + .flatMap(step -> step.zones().stream()) + .map(this::toJob) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Returns job status sorted according to deployment spec */ + public List<JobStatus> sortBy(Collection<JobStatus> jobStatus) { + List<DeploymentJobs.JobType> sortedJobs = jobs(); + return jobStatus.stream() + .sorted(comparingInt(job -> sortedJobs.indexOf(job.type()))) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Returns deployments sorted according to declared zones */ + public List<Deployment> sortBy2(Collection<Deployment> deployments) { + List<ZoneId> productionZones = spec.zones().stream() + .filter(z -> z.region().isPresent()) + .map(z -> ZoneId.from(z.environment(), z.region().get())) + .collect(Collectors.toList()); + return deployments.stream() + .sorted(comparingInt(deployment -> productionZones.indexOf(deployment.zone()))) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Resolve jobs from step */ + public List<JobType> toJobs(DeploymentSpec.Step step) { + return step.zones().stream() + .map(this::toJob) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Returns test jobs in this */ + public List<JobType> testJobs() { + return toJobs(test()); + } + + /** Returns production jobs in this */ + public List<JobType> productionJobs() { + return toJobs(production()); + } + + /** Returns test steps in this */ + public List<DeploymentSpec.Step> test() { + if (spec.steps().isEmpty()) { + return singletonList(new DeploymentSpec.DeclaredZone(Environment.test)); + } + return spec.steps().stream() + .filter(step -> step.deploysTo(Environment.test) || step.deploysTo(Environment.staging)) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Returns production steps in this */ + public List<DeploymentSpec.Step> production() { + return spec.steps().stream() + .filter(step -> step.deploysTo(Environment.prod) || step.zones().isEmpty()) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + + /** Resolve job from deployment zone */ + private JobType toJob(DeploymentSpec.DeclaredZone zone) { + return JobType.from(system.get(), zone.environment(), zone.region().orElse(null)) + .orElseThrow(() -> new IllegalArgumentException("Invalid zone " + zone)); + } + + /** Resolve jobs from steps */ + private List<JobType> toJobs(List<DeploymentSpec.Step> steps) { + return steps.stream() + .flatMap(step -> toJobs(step).stream()) + .collect(collectingAndThen(Collectors.toList(), Collections::unmodifiableList)); + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index 63a6ac234ff..00b2a5c9b52 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -1,7 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.deployment; -import com.yahoo.component.Version; import com.yahoo.config.application.api.DeploymentSpec; import com.yahoo.config.application.api.DeploymentSpec.Step; import com.yahoo.config.provision.ApplicationId; @@ -15,7 +14,6 @@ import com.yahoo.vespa.hosted.controller.application.ApplicationList; import com.yahoo.vespa.hosted.controller.application.ApplicationVersion; import com.yahoo.vespa.hosted.controller.application.Change; import com.yahoo.vespa.hosted.controller.application.Deployment; -import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobError; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobReport; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; import com.yahoo.vespa.hosted.controller.application.JobStatus; @@ -26,6 +24,7 @@ import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; import java.util.List; import java.util.Map; @@ -33,14 +32,10 @@ import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; import java.util.OptionalLong; -import java.util.Set; import java.util.function.Supplier; import java.util.logging.Logger; import java.util.stream.Stream; -import static com.yahoo.config.provision.Environment.prod; -import static com.yahoo.config.provision.Environment.staging; -import static com.yahoo.config.provision.Environment.test; import static com.yahoo.vespa.hosted.controller.api.integration.BuildService.BuildJob; import static com.yahoo.vespa.hosted.controller.api.integration.BuildService.JobState.idle; import static com.yahoo.vespa.hosted.controller.api.integration.BuildService.JobState.queued; @@ -56,7 +51,6 @@ import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.joining; import static java.util.stream.Collectors.partitioningBy; import static java.util.stream.Collectors.toList; -import static java.util.stream.Collectors.toSet; /** * Responsible for scheduling deployment jobs in a build system and keeping @@ -74,18 +68,16 @@ public class DeploymentTrigger { private final Controller controller; private final Clock clock; - private final DeploymentOrder order; private final BuildService buildService; public DeploymentTrigger(Controller controller, BuildService buildService, Clock clock) { this.controller = Objects.requireNonNull(controller, "controller cannot be null"); this.buildService = Objects.requireNonNull(buildService, "buildService cannot be null"); this.clock = Objects.requireNonNull(clock, "clock cannot be null"); - this.order = new DeploymentOrder(controller::system); } - public DeploymentOrder deploymentOrder() { - return order; + public DeploymentSteps steps(DeploymentSpec spec) { + return new DeploymentSteps(spec, controller::system); } /** @@ -109,8 +101,8 @@ public class DeploymentTrigger { JobRun triggering; if (report.jobType() == component) { ApplicationVersion applicationVersion = ApplicationVersion.from(report.sourceRevision().get(), report.buildNumber()); - triggering = JobRun.triggering(controller.systemVersion(), applicationVersion, Optional - .empty(), Optional.empty(), "Application commit", clock.instant()); + triggering = JobRun.triggering(controller.systemVersion(), applicationVersion, Optional.empty(), + Optional.empty(), "Application commit", clock.instant()); if (report.success()) { if (acceptNewApplicationVersion(application.get())) application = application.withChange(application.get().change().with(applicationVersion)) @@ -195,7 +187,8 @@ public class DeploymentTrigger { buildService.trigger(BuildJob.of(applicationId, application.deploymentJobs().projectId().getAsLong(), jobType.jobName())); return singletonList(component); } - Versions versions = versions(application, application.change(), deploymentFor(application, jobType)); + Versions versions = Versions.from(application.change(), application, deploymentFor(application, jobType), + controller.systemVersion()); String reason = "Job triggered manually by " + user; return (jobType.isProduction() && ! isTested(application, versions) ? testJobs(application, versions, reason, clock.instant()).stream() @@ -241,7 +234,7 @@ public class DeploymentTrigger { private Optional<JobRun> successOn(Application application, JobType jobType, Versions versions) { return application.deploymentJobs().statusOf(jobType).flatMap(JobStatus::lastSuccess) - .filter(run -> targetsMatch(versions, run)); + .filter(versions::targetsMatch); } private Optional<Deployment> deploymentFor(Application application, JobType jobType) { @@ -279,47 +272,90 @@ public class DeploymentTrigger { .<Instant>flatMap(job -> job.lastSuccess().map(JobRun::at))); String reason = "New change available"; List<Job> testJobs = null; // null means "uninitialised", while empty means "don't run any jobs". + DeploymentSteps steps = steps(application.deploymentSpec()); - if (change.isPresent()) - for (Step step : productionStepsOf(application)) { - Set<JobType> stepJobs = step.zones().stream().map(order::toJob).collect(toSet()); - List<JobType> remainingJobs = stepJobs.stream().filter(job -> ! isComplete(change, application, job)).collect(toList()); - if ( ! remainingJobs.isEmpty()) { // Step is incomplete; trigger remaining jobs if ready, or their test jobs if untested. + if (change.isPresent()) { + for (Step step : steps.production()) { + List<JobType> stepJobs = steps.toJobs(step); + List<JobType> remainingJobs = stepJobs.stream().filter(job -> !isComplete(change, application, job)).collect(toList()); + if (!remainingJobs.isEmpty()) { // Step is incomplete; trigger remaining jobs if ready, or their test jobs if untested. for (JobType job : remainingJobs) { - Versions versions = versions(application, change, deploymentFor(application, job)); + Versions versions = Versions.from(change, application, deploymentFor(application, job), + controller.systemVersion()); if (isTested(application, versions)) { - if ( completedAt.isPresent() - && jobStateOf(application, job) == idle - && stepJobs.containsAll(runningProductionJobs(application))) + if (completedAt.isPresent() && canTrigger(job, versions, application, stepJobs)) { jobs.add(deploymentJob(application, versions, change, job, reason, completedAt.get())); - if ( ! alreadyTriggered(application, versions)) + } + if (!alreadyTriggered(application, versions)) { testJobs = emptyList(); - } - else if (testJobs == null) { - testJobs = testJobs(application, versions, String.format("Testing deployment for %s (%s)", job.jobName(), versions.toString()), - completedAt.orElse(clock.instant())); + } + } else if (testJobs == null) { + testJobs = testJobs(application, versions, + String.format("Testing deployment for %s (%s)", + job.jobName(), versions.toString()), + completedAt.orElseGet(clock::instant)); } } completedAt = Optional.empty(); - } - else { // All jobs are complete; find the time of completion of this step. + } else { // All jobs are complete; find the time of completion of this step. if (stepJobs.isEmpty()) { // No jobs means this is delay step. Duration delay = ((DeploymentSpec.Delay) step).duration(); - completedAt = completedAt.map(at -> at.plus(delay)).filter(at -> ! at.isAfter(clock.instant())); + completedAt = completedAt.map(at -> at.plus(delay)).filter(at -> !at.isAfter(clock.instant())); reason += " after a delay of " + delay; - } - else { + } else { completedAt = stepJobs.stream().map(job -> application.deploymentJobs().statusOf(job).get().lastCompleted().get().at()).max(naturalOrder()); reason = "Available change in " + stepJobs.stream().map(JobType::jobName).collect(joining(", ")); } } } - if (testJobs == null) - testJobs = testJobs(application, versions(application, application.change(), Optional.empty()), + } + if (testJobs == null) { + testJobs = testJobs(application, Versions.from(application, controller.systemVersion()), "Testing last changes outside prod", clock.instant()); + } jobs.addAll(testJobs); }); - return jobs; + return Collections.unmodifiableList(jobs); + } + + /** Returns whether given job should be triggered */ + private boolean canTrigger(JobType job, Versions versions, Application application, List<JobType> parallelJobs) { + if (jobStateOf(application, job) != idle) return false; + if (parallelJobs != null && !parallelJobs.containsAll(runningProductionJobs(application))) return false; + + return triggerAt(clock.instant(), job, versions, application); + } + + /** Returns whether given job should be triggered */ + private boolean canTrigger(JobType job, Versions versions, Application application) { + return canTrigger(job, versions, application, null); + } + + /** Returns whether job can trigger at given instant */ + private boolean triggerAt(Instant instant, JobType job, Versions versions, Application application) { + Optional<JobStatus> jobStatus = application.deploymentJobs().statusOf(job); + if (!jobStatus.isPresent()) return true; + if (jobStatus.get().isSuccess()) return true; // Success + if (!jobStatus.get().lastCompleted().isPresent()) return true; // Never completed + if (!jobStatus.get().firstFailing().isPresent()) return true; // Should not happen as firstFailing should be set for an unsuccessful job + if (!versions.targetsMatch(jobStatus.get().lastCompleted().get())) return true; // Always trigger as targets have changed + + Instant firstFailing = jobStatus.get().firstFailing().get().at(); + Instant lastCompleted = jobStatus.get().lastCompleted().get().at(); + + // Retry all errors immediately for 1 minute + if (firstFailing.isAfter(instant.minus(Duration.ofMinutes(1)))) return true; + + // Retry out of capacity errors in test environments every minute + if (job.isTest() && jobStatus.get().isOutOfCapacity()) { + return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(1))); + } + + // Retry other errors + if (firstFailing.isAfter(instant.minus(Duration.ofHours(1)))) { // If we failed within the last hour ... + return lastCompleted.isBefore(instant.minus(Duration.ofMinutes(10))); // ... retry every 10 minutes + } + return lastCompleted.isBefore(instant.minus(Duration.ofHours(2))); // Retry at most every 2 hours } // ---------- Job state helpers ---------- @@ -358,7 +394,7 @@ public class DeploymentTrigger { */ private boolean isComplete(Change change, Application application, JobType jobType) { Optional<Deployment> existingDeployment = deploymentFor(application, jobType); - return successOn(application, jobType, versions(application, change, existingDeployment)).isPresent() + return successOn(application, jobType, Versions.from(change, application, existingDeployment, controller.systemVersion())).isPresent() || jobType.isProduction() && existingDeployment.map(deployment -> ! isUpgrade(change, deployment) && isDowngrade(application.change(), deployment)) .orElse(false); @@ -379,7 +415,7 @@ public class DeploymentTrigger { private Optional<Instant> testedAt(Application application, Versions versions) { Optional<JobRun> testRun = successOn(application, systemTest, versions); Optional<JobRun> stagingRun = successOn(application, stagingTest, versions) - .filter(run -> sourcesMatchIfPresent(versions, run)); + .filter(versions::sourcesMatchIfPresent); return max(testRun.map(JobRun::at), stagingRun.map(JobRun::at)) .filter(__ -> testRun.isPresent() && stagingRun.isPresent()); } @@ -388,24 +424,11 @@ public class DeploymentTrigger { return application.deploymentJobs().jobStatus().values().stream() .filter(job -> job.type().isProduction()) .anyMatch(job -> job.lastTriggered() - .filter(run -> targetsMatch(versions, run)) - .filter(run -> sourcesMatchIfPresent(versions, run)) + .filter(versions::targetsMatch) + .filter(versions::sourcesMatchIfPresent) .isPresent()); } - /** If the given state's sources are present and differ from its targets, returns whether they are equal to those - * of the given job run. */ - private static boolean sourcesMatchIfPresent(Versions versions, JobRun jobRun) { - return ( ! versions.sourcePlatform.filter(version -> ! version.equals(versions.targetPlatform)).isPresent() - || versions.sourcePlatform.equals(jobRun.sourcePlatform())) - && ( ! versions.sourceApplication.filter(version -> ! version.equals(versions.targetApplication)).isPresent() - || versions.sourceApplication.equals(jobRun.sourceApplication())); - } - - private static boolean targetsMatch(Versions versions, JobRun jobRun) { - return versions.targetPlatform.equals(jobRun.platform()) && versions.targetApplication.equals(jobRun.application()); - } - // ---------- Change management o_O ---------- private boolean acceptNewApplicationVersion(Application application) { @@ -415,9 +438,10 @@ public class DeploymentTrigger { } private Change remainingChange(Application application) { - List<JobType> jobs = productionStepsOf(application).isEmpty() - ? jobsOf(testStepsOf(application)) - : jobsOf(productionStepsOf(application)); + DeploymentSteps steps = steps(application.deploymentSpec()); + List<JobType> jobs = steps.production().isEmpty() + ? steps.testJobs() + : steps.productionJobs(); Change change = application.change(); if (jobs.stream().allMatch(job -> isComplete(application.change().withoutApplication(), application, job))) @@ -436,61 +460,28 @@ public class DeploymentTrigger { */ private List<Job> testJobs(Application application, Versions versions, String reason, Instant availableSince) { List<Job> jobs = new ArrayList<>(); - for (JobType jobType : jobsOf(testStepsOf(application))) { + for (JobType jobType : steps(application.deploymentSpec()).testJobs()) { Optional<JobRun> completion = successOn(application, jobType, versions) - .filter(run -> sourcesMatchIfPresent(versions, run) || jobType == systemTest); - if ( ! completion.isPresent() && jobStateOf(application, jobType) == idle) + .filter(run -> versions.sourcesMatchIfPresent(run) || jobType == systemTest); + if (!completion.isPresent() && canTrigger(jobType, versions, application)) { jobs.add(deploymentJob(application, versions, application.change(), jobType, reason, availableSince)); + } } return jobs; } - private List<JobType> jobsOf(Collection<Step> steps) { - return steps.stream().flatMap(step -> step.zones().stream()).map(order::toJob).collect(toList()); - } - - private List<Step> testStepsOf(Application application) { - return application.deploymentSpec().steps().isEmpty() - ? singletonList(new DeploymentSpec.DeclaredZone(test)) - : application.deploymentSpec().steps().stream() - .filter(step -> step.deploysTo(test) || step.deploysTo(staging)) - .collect(toList()); - } - - private List<Step> productionStepsOf(Application application) { - return application.deploymentSpec().steps().stream() - .filter(step -> step.deploysTo(prod) || step.zones().isEmpty()) - .collect(toList()); - } - private Job deploymentJob(Application application, Versions versions, Change change, JobType jobType, String reason, Instant availableSince) { - boolean isRetry = application.deploymentJobs().statusOf(jobType).flatMap(JobStatus::jobError) - .filter(JobError.outOfCapacity::equals).isPresent(); + boolean isRetry = application.deploymentJobs().statusOf(jobType) + .map(JobStatus::isOutOfCapacity) + .orElse(false); if (isRetry) reason += "; retrying on out of capacity"; - JobRun triggering = JobRun.triggering(versions.targetPlatform, versions.targetApplication, versions.sourcePlatform, versions.sourceApplication, reason, clock.instant()); + JobRun triggering = JobRun.triggering(versions.targetPlatform(), versions.targetApplication(), + versions.sourcePlatform(), versions.sourceApplication(), + reason, clock.instant()); return new Job(application, triggering, jobType, availableSince, isRetry, change.application().isPresent()); } - private Versions versions(Application application, Change change, Optional<Deployment> deployment) { - return new Versions(targetPlatform(application, change, deployment), - targetApplication(application, change, deployment), - deployment.map(Deployment::version), - deployment.map(Deployment::applicationVersion)); - } - - private Version targetPlatform(Application application, Change change, Optional<Deployment> deployment) { - return max(deployment.map(Deployment::version), change.platform()) - .orElse(application.oldestDeployedPlatform() - .orElse(controller.systemVersion())); - } - - private ApplicationVersion targetApplication(Application application, Change change, Optional<Deployment> deployment) { - return max(deployment.map(Deployment::applicationVersion), change.application()) - .orElse(application.oldestDeployedApplication() - .orElse(application.deploymentJobs().jobStatus().get(component).lastSuccess().get().application())); - } - // ---------- Data containers ---------- @@ -519,34 +510,5 @@ public class DeploymentTrigger { } - - private static class Versions { - - private final Version targetPlatform; - private final ApplicationVersion targetApplication; - private final Optional<Version> sourcePlatform; - private final Optional<ApplicationVersion> sourceApplication; - - private Versions(Version targetPlatform, ApplicationVersion targetApplication, Optional<Version> sourcePlatform, - Optional<ApplicationVersion> sourceApplication) { - this.targetPlatform = targetPlatform; - this.targetApplication = targetApplication; - this.sourcePlatform = sourcePlatform; - this.sourceApplication = sourceApplication; - } - - @Override - public String toString() { - return String.format("platform %s%s, application %s%s", - sourcePlatform.filter(source -> ! source.equals(targetPlatform)) - .map(source -> source + " -> ").orElse(""), - targetPlatform, - sourceApplication.filter(source -> ! source.equals(targetApplication)) - .map(source -> source.id() + " -> ").orElse(""), - targetApplication.id()); - } - - } - } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Versions.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Versions.java new file mode 100644 index 00000000000..16f6378d2dc --- /dev/null +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Versions.java @@ -0,0 +1,115 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.controller.deployment; + +import com.yahoo.component.Version; +import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.vespa.hosted.controller.application.ApplicationVersion; +import com.yahoo.vespa.hosted.controller.application.Change; +import com.yahoo.vespa.hosted.controller.application.Deployment; +import com.yahoo.vespa.hosted.controller.application.JobStatus; + +import java.util.Optional; + +import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.component; + +/** + * Source and target versions for an application. + * + * @author jvenstad + * @author mpolden + */ +public class Versions { + + private final Version targetPlatform; + private final ApplicationVersion targetApplication; + private final Optional<Version> sourcePlatform; + private final Optional<ApplicationVersion> sourceApplication; + + public Versions(Version targetPlatform, ApplicationVersion targetApplication, Optional<Version> sourcePlatform, + Optional<ApplicationVersion> sourceApplication) { + this.targetPlatform = targetPlatform; + this.targetApplication = targetApplication; + this.sourcePlatform = sourcePlatform; + this.sourceApplication = sourceApplication; + } + + /** Target platform version for this */ + public Version targetPlatform() { + return targetPlatform; + } + + /** Target application version for this */ + public ApplicationVersion targetApplication() { + return targetApplication; + } + + /** Source platform version for this */ + public Optional<Version> sourcePlatform() { + return sourcePlatform; + } + + /** Source application version for this */ + public Optional<ApplicationVersion> sourceApplication() { + return sourceApplication; + } + + /** Returns whether source versions are present and match those of the given job run */ + public boolean sourcesMatchIfPresent(JobStatus.JobRun jobRun) { + return (!sourcePlatform.filter(version -> !version.equals(targetPlatform)).isPresent() || + sourcePlatform.equals(jobRun.sourcePlatform())) && + (!sourceApplication.filter(version -> !version.equals(targetApplication)).isPresent() || + sourceApplication.equals(jobRun.sourceApplication())); + } + + public boolean targetsMatch(JobStatus.JobRun jobRun) { + return targetPlatform.equals(jobRun.platform()) && + targetApplication.equals(jobRun.application()); + } + + @Override + public String toString() { + return String.format("platform %s%s, application %s%s", + sourcePlatform.filter(source -> !source.equals(targetPlatform)) + .map(source -> source + " -> ").orElse(""), + targetPlatform, + sourceApplication.filter(source -> !source.equals(targetApplication)) + .map(source -> source.id() + " -> ").orElse(""), + targetApplication.id()); + } + + /** Create versions using change contained in application */ + public static Versions from(Application application, Version defaultPlatformVersion) { + return from(application.change(), application, Optional.empty(), defaultPlatformVersion); + } + + /** Create versions using given change and application */ + public static Versions from(Change change, Application application, Optional<Deployment> deployment, + Version defaultPlatformVersion) { + return new Versions(targetPlatform(application, change, deployment, defaultPlatformVersion), + targetApplication(application, change, deployment), + deployment.map(Deployment::version), + deployment.map(Deployment::applicationVersion)); + } + + private static Version targetPlatform(Application application, Change change, Optional<Deployment> deployment, + Version defaultVersion) { + return max(deployment.map(Deployment::version), change.platform()) + .orElse(application.oldestDeployedPlatform() + .orElse(defaultVersion)); + } + + private static ApplicationVersion targetApplication(Application application, Change change, + Optional<Deployment> deployment) { + return max(deployment.map(Deployment::applicationVersion), change.application()) + .orElse(application.oldestDeployedApplication() + .orElse(application.deploymentJobs().jobStatus().get(component) + .lastSuccess() + .get() + .application())); + } + + private static <T extends Comparable<T>> Optional<T> max(Optional<T> o1, Optional<T> o2) { + return ! o1.isPresent() ? o2 : ! o2.isPresent() ? o1 : o1.get().compareTo(o2.get()) >= 0 ? o1 : o2; + } + +} diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java index edf68054481..76a8eaec5d9 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/ApplicationApiHandler.java @@ -355,8 +355,8 @@ public class ApplicationApiHandler extends LoggingRequestHandler { // Jobs sorted according to deployment spec List<JobStatus> jobStatus = controller.applications().deploymentTrigger() - .deploymentOrder() - .sortBy(application.deploymentSpec(), application.deploymentJobs().jobStatus().values()); + .steps(application.deploymentSpec()) + .sortBy(application.deploymentJobs().jobStatus().values()); Cursor deploymentsArray = object.setArray("deploymentJobs"); for (JobStatus job : jobStatus) { @@ -396,8 +396,8 @@ public class ApplicationApiHandler extends LoggingRequestHandler { // Deployments sorted according to deployment spec List<Deployment> deployments = controller.applications().deploymentTrigger() - .deploymentOrder() - .sortBy(application.deploymentSpec().zones(), application.deployments().values()); + .steps(application.deploymentSpec()) + .sortBy2(application.deployments().values()); Cursor instancesArray = object.setArray("instances"); for (Deployment deployment : deployments) { Cursor deploymentObject = instancesArray.addObject(); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index c24c8693688..225516644eb 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -16,10 +16,10 @@ import com.yahoo.vespa.config.SlimeUtils; import com.yahoo.vespa.hosted.controller.api.application.v4.model.DeployOptions; import com.yahoo.vespa.hosted.controller.api.application.v4.model.EndpointStatus; import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; -import com.yahoo.vespa.hosted.controller.api.identifiers.ScrewdriverId; import com.yahoo.vespa.hosted.controller.api.integration.BuildService; import com.yahoo.vespa.hosted.controller.api.integration.dns.Record; import com.yahoo.vespa.hosted.controller.api.integration.dns.RecordName; +import com.yahoo.vespa.hosted.controller.api.integration.stubs.MockBuildService; import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.ApplicationVersion; @@ -31,7 +31,6 @@ import com.yahoo.vespa.hosted.controller.application.JobStatus; import com.yahoo.vespa.hosted.controller.application.SourceRevision; import com.yahoo.vespa.hosted.controller.deployment.ApplicationPackageBuilder; import com.yahoo.vespa.hosted.controller.deployment.BuildJob; -import com.yahoo.vespa.hosted.controller.api.integration.stubs.MockBuildService; import com.yahoo.vespa.hosted.controller.deployment.DeploymentTester; import com.yahoo.vespa.hosted.controller.persistence.ApplicationSerializer; import com.yahoo.vespa.hosted.controller.rotation.RotationId; @@ -103,7 +102,7 @@ public class ControllerTest { assertEquals(4, applications.require(app1.id()).deploymentJobs().jobStatus().size()); ApplicationVersion applicationVersion = tester.controller().applications().require(app1.id()).change().application().get(); - assertTrue("Application version has been set during deployment", applicationVersion != ApplicationVersion.unknown); + assertFalse("Application version has been set during deployment", applicationVersion.isUnknown()); assertStatus(JobStatus.initial(stagingTest) .withTriggering(version1, applicationVersion, Optional.empty(),"", tester.clock().instant()) .withCompletion(42, Optional.empty(), tester.clock().instant()), app1.id(), tester.controller()); @@ -151,6 +150,7 @@ public class ControllerTest { .withTriggering(version1, applicationVersion, productionCorpUsEast1.zone(main).map(tester.application(app1.id()).deployments()::get), "", tester.clock().instant()) .withCompletion(42, Optional.empty(), tester.clock().instant()), app1.id(), tester.controller()); + tester.clock().advance(Duration.ofHours(1)); // Stop retrying tester.jobCompletion(productionCorpUsEast1).application(app1).unsuccessful().submit(); tester.deployAndNotify(app1, applicationPackage, true, stagingTest); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index ffc93f7b5ba..f91dce2b203 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -12,7 +12,6 @@ import com.yahoo.vespa.hosted.controller.ArtifactRepositoryMock; import com.yahoo.vespa.hosted.controller.ConfigServerMock; import com.yahoo.vespa.hosted.controller.Controller; import com.yahoo.vespa.hosted.controller.ControllerTester; -import com.yahoo.vespa.hosted.controller.api.integration.BuildService; import com.yahoo.vespa.hosted.controller.api.integration.stubs.MockBuildService; import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; @@ -193,8 +192,8 @@ public class DeploymentTester { private void completeDeployment(Application application, ApplicationPackage applicationPackage, Optional<JobType> failOnJob, boolean includingProductionZones) { - DeploymentOrder order = new DeploymentOrder(controller()::system); - List<JobType> jobs = order.jobsFrom(applicationPackage.deploymentSpec()); + DeploymentSteps steps = controller().applications().deploymentTrigger().steps(applicationPackage.deploymentSpec()); + List<JobType> jobs = steps.jobs(); if ( ! includingProductionZones) jobs = jobs.stream().filter(job -> ! job.isProduction()).collect(Collectors.toList()); for (JobType job : jobs) { @@ -262,6 +261,10 @@ public class DeploymentTester { deployAndNotify(application, Optional.of(applicationPackage), success, job); } + public void deployAndNotify(Application application, boolean success, JobType job) { + deployAndNotify(application, Optional.empty(), success, job); + } + public void deployAndNotify(Application application, Optional<ApplicationPackage> applicationPackage, boolean success, JobType job) { if (success) { // Staging deploys twice, once with current version and once with new version @@ -296,8 +299,16 @@ public class DeploymentTester { .build(); } - public void assertRunning(ApplicationId id, JobType jobType) { - assertTrue(buildService().jobs().contains(BuildService.BuildJob.of(id, application(id).deploymentJobs().projectId().getAsLong(), jobType.jobName()))); + public void assertRunning(JobType job, ApplicationId application) { + assertTrue(String.format("Job %s for %s is running", job, application), isRunning(job, application)); + } + + public void assertNotRunning(JobType job, ApplicationId application) { + assertFalse(String.format("Job %s for %s is not running", job, application), isRunning(job, application)); + } + + private boolean isRunning(JobType job, ApplicationId application) { + return buildService().jobs().contains(ControllerTester.buildJob(application(application), job)); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index c26852a2153..10f53f17153 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -21,7 +21,7 @@ import org.junit.Test; import java.time.Duration; import java.time.Instant; -import java.util.Arrays; +import java.util.Optional; import java.util.function.Supplier; import static com.yahoo.config.provision.SystemName.main; @@ -34,7 +34,6 @@ import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobTy import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.stagingTest; import static com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType.systemTest; import static java.util.Collections.singletonList; -import static java.util.Optional.empty; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotEquals; @@ -74,11 +73,11 @@ public class DeploymentTriggerTest { tester.buildService().remove(buildJob(app, stagingTest)); tester.readyJobTrigger().maintain(); assertEquals("Retried dead job", 2, tester.buildService().jobs().size()); - tester.assertRunning(app.id(), stagingTest); + tester.assertRunning(stagingTest, app.id()); tester.deployAndNotify(app, applicationPackage, true, stagingTest); // system-test is now the only running job -- production jobs haven't started yet, since it is unfinished. - tester.assertRunning(app.id(), systemTest); + tester.assertRunning(systemTest, app.id()); assertEquals(1, tester.buildService().jobs().size()); // system-test fails and is retried @@ -86,7 +85,7 @@ public class DeploymentTriggerTest { assertEquals("Job is retried on failure", 1, tester.buildService().jobs().size()); tester.deployAndNotify(app, applicationPackage, true, JobType.systemTest); - tester.assertRunning(app.id(), productionUsWest1); + tester.assertRunning(productionUsWest1, app.id()); } @Test @@ -145,13 +144,13 @@ public class DeploymentTriggerTest { // 30 seconds later, the first jobs may trigger. assertEquals(1, mockBuildService.jobs().size()); - tester.assertRunning(application.id(), productionUsWest1); + tester.assertRunning(productionUsWest1, application.id()); // 3 minutes pass, delayed trigger does nothing as us-west-1 is still in progress tester.clock().advance(Duration.ofMinutes(3)); tester.deploymentTrigger().triggerReadyJobs(); assertEquals(1, mockBuildService.jobs().size()); - tester.assertRunning(application.id(), productionUsWest1); + tester.assertRunning(productionUsWest1, application.id()); // us-west-1 completes tester.deployAndNotify(application, applicationPackage, true, productionUsWest1); @@ -202,8 +201,8 @@ public class DeploymentTriggerTest { // Deploys in two regions in parallel assertEquals(2, tester.buildService().jobs().size()); - tester.assertRunning(application.id(), productionUsEast3); - tester.assertRunning(application.id(), productionUsWest1); + tester.assertRunning(productionUsEast3, application.id()); + tester.assertRunning(productionUsWest1, application.id()); tester.deploy(JobType.productionUsWest1, application, applicationPackage, false); tester.jobCompletion(JobType.productionUsWest1).application(application).submit(); @@ -405,8 +404,8 @@ public class DeploymentTriggerTest { }); assertEquals(0, tester.buildService().jobs().size()); readyJobsTrigger.run(); - tester.assertRunning(app.id(), systemTest); - tester.assertRunning(app.id(), stagingTest); + tester.assertRunning(systemTest, app.id()); + tester.assertRunning(stagingTest, app.id()); } @Test @@ -426,8 +425,8 @@ public class DeploymentTriggerTest { tester.completeDeploymentWithError(application, applicationPackage, BuildJob.defaultBuildNumber + 1, productionUsCentral1); // deployAndNotify doesn't actually deploy if the job fails, so we need to do that manually. - tester.deployAndNotify(application, empty(), false, productionUsCentral1); - tester.deploy(productionUsCentral1, application, empty(), false); + tester.deployAndNotify(application, false, productionUsCentral1); + tester.deploy(productionUsCentral1, application, Optional.empty(), false); ApplicationVersion appVersion1 = ApplicationVersion.from(BuildJob.defaultSourceRevision, BuildJob.defaultBuildNumber + 1); assertEquals(appVersion1, app.get().deployments().get(ZoneId.from("prod.us-central-1")).applicationVersion()); @@ -444,18 +443,18 @@ public class DeploymentTriggerTest { Version version1 = new Version("6.2"); tester.upgradeSystem(version1); tester.jobCompletion(productionUsCentral1).application(application).unsuccessful().submit(); - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); - tester.deployAndNotify(application, empty(), false, productionUsCentral1); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); + tester.deployAndNotify(application, false, productionUsCentral1); // The last job has a different target, and the tests need to run again. // These may now start, since the first job has been triggered once, and thus is verified already. - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); // Finally, the two production jobs complete, in order. - tester.deployAndNotify(application, empty(), true, productionUsCentral1); - tester.deployAndNotify(application, empty(), true, productionEuWest1); + tester.deployAndNotify(application, true, productionUsCentral1); + tester.deployAndNotify(application, true, productionEuWest1); assertEquals(appVersion1, app.get().deployments().get(ZoneId.from("prod.us-central-1")).applicationVersion()); } @@ -500,22 +499,26 @@ public class DeploymentTriggerTest { tester.deployAndNotify(application, applicationPackage, true, systemTest); tester.deployAndNotify(application, applicationPackage, true, stagingTest); + tester.assertRunning(productionUsCentral1, application.id()); assertEquals(v2, app.get().deployments().get(productionUsCentral1.zone(main).get()).version()); - assertEquals((Long) 42L, app.get().deployments().get(productionUsCentral1.zone(main).get()).applicationVersion().buildNumber().get()); + assertEquals(Long.valueOf(42L), app.get().deployments().get(productionUsCentral1.zone(main).get()).applicationVersion().buildNumber().get()); assertNotEquals(triggered, app.get().deploymentJobs().jobStatus().get(productionUsCentral1).lastTriggered().get().at()); // Change has a higher application version than what is deployed -- deployment should trigger. tester.deployAndNotify(application, applicationPackage, false, productionUsCentral1); tester.deploy(productionUsCentral1, application, applicationPackage); assertEquals(v2, app.get().deployments().get(productionUsCentral1.zone(main).get()).version()); - assertEquals((Long) 43L, app.get().deployments().get(productionUsCentral1.zone(main).get()).applicationVersion().buildNumber().get()); + assertEquals(Long.valueOf(43), app.get().deployments().get(productionUsCentral1.zone(main).get()).applicationVersion().buildNumber().get()); // Change is again strictly dominated, and us-central-1 is skipped, even though it is still failing. - tester.deployAndNotify(application, applicationPackage, false, productionUsCentral1); + tester.clock().advance(Duration.ofHours(2).plus(Duration.ofSeconds(1))); // Enough time for retry + tester.readyJobTrigger().maintain(); + // Failing job is not retried as change has been deployed + tester.assertNotRunning(productionUsCentral1, application.id()); // Last job has a different deployment target, so tests need to run again. - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); tester.deployAndNotify(application, applicationPackage, true, productionEuWest1); assertFalse(app.get().change().isPresent()); assertFalse(app.get().deploymentJobs().jobStatus().get(productionUsCentral1).isSuccess()); @@ -538,8 +541,8 @@ public class DeploymentTriggerTest { Version v1 = new Version("6.1"); Version v2 = new Version("6.2"); tester.upgradeSystem(v2); - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); tester.deploymentTrigger().cancelChange(application.id(), true); tester.deploy(productionEuWest1, application, applicationPackage); assertEquals(v2, app.get().deployments().get(productionEuWest1.zone(main).get()).version()); @@ -550,8 +553,8 @@ public class DeploymentTriggerTest { Version firstTested = app.get().deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().platform(); assertEquals(firstTested, app.get().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); // Tests are not re-triggered, because the jobs they were run for has not yet been triggered with the tested versions. assertEquals(firstTested, app.get().deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().platform()); @@ -565,14 +568,14 @@ public class DeploymentTriggerTest { // New upgrade is already tested for one of the jobs, which has now been triggered, and tests may run for the other job. assertNotEquals(firstTested, app.get().deploymentJobs().jobStatus().get(systemTest).lastTriggered().get().platform()); assertNotEquals(firstTested, app.get().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().platform()); - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); // Both jobs fail again, and must be re-triggered -- this is ok, as they are both already triggered on their current targets. - tester.deployAndNotify(application, empty(), false, productionEuWest1); - tester.deployAndNotify(application, empty(), false, productionUsEast3); - tester.deployAndNotify(application, empty(), true, productionUsEast3); - tester.deployAndNotify(application, empty(), true, productionEuWest1); + tester.deployAndNotify(application, false, productionEuWest1); + tester.deployAndNotify(application, false, productionUsEast3); + tester.deployAndNotify(application, true, productionUsEast3); + tester.deployAndNotify(application, true, productionEuWest1); assertFalse(app.get().change().isPresent()); assertEquals(43, app.get().deploymentJobs().jobStatus().get(productionEuWest1).lastSuccess().get().application().buildNumber().get().longValue()); assertEquals(43, app.get().deploymentJobs().jobStatus().get(productionUsEast3).lastSuccess().get().application().buildNumber().get().longValue()); @@ -595,28 +598,86 @@ public class DeploymentTriggerTest { Version v1 = new Version("6.1"); Version v2 = new Version("6.2"); tester.upgradeSystem(v2); - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); - tester.deployAndNotify(application, empty(), true, productionUsCentral1); - tester.deployAndNotify(application, empty(), true, productionEuWest1); - tester.deployAndNotify(application, empty(), false, productionUsEast3); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); + tester.deployAndNotify(application, true, productionUsCentral1); + tester.deployAndNotify(application, true, productionEuWest1); + tester.deployAndNotify(application, false, productionUsEast3); assertEquals(v2, app.get().deployments().get(ZoneId.from("prod", "us-central-1")).version()); assertEquals(v2, app.get().deployments().get(ZoneId.from("prod", "eu-west-1")).version()); assertEquals(v1, app.get().deployments().get(ZoneId.from("prod", "us-east-3")).version()); Version v3 = new Version("6.3"); tester.upgradeSystem(v3); - tester.deployAndNotify(application, empty(), false, productionUsEast3); + tester.deployAndNotify(application, false, productionUsEast3); // See that sources for staging are: first v2, then v1. - tester.deployAndNotify(application, empty(), true, systemTest); - tester.deployAndNotify(application, empty(), true, stagingTest); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); assertEquals(v2, app.get().deploymentJobs().jobStatus().get(stagingTest).lastSuccess().get().sourcePlatform().get()); - tester.deployAndNotify(application, empty(), true, productionUsCentral1); + tester.deployAndNotify(application, true, productionUsCentral1); assertEquals(v1, app.get().deploymentJobs().jobStatus().get(stagingTest).lastTriggered().get().sourcePlatform().get()); - tester.deployAndNotify(application, empty(), true, stagingTest); - tester.deployAndNotify(application, empty(), true, productionEuWest1); - tester.deployAndNotify(application, empty(), true, productionUsEast3); + tester.deployAndNotify(application, true, stagingTest); + tester.deployAndNotify(application, true, productionEuWest1); + tester.deployAndNotify(application, true, productionUsEast3); + } + + @Test + public void retriesFailingJobs() { + DeploymentTester tester = new DeploymentTester(); + Application application = tester.createApplication("app1", "tenant1", 1, 1L); + ApplicationPackage applicationPackage = new ApplicationPackageBuilder() + .environment(Environment.prod) + .region("us-central-1") + .build(); + + // Deploy completely on default application and platform versions + tester.deployCompletely(application, applicationPackage); + + // New application change is deployed and fails in system-test for a while + tester.jobCompletion(component).application(application).nextBuildNumber().uploadArtifact(applicationPackage).submit(); + tester.deployAndNotify(application, false, systemTest); + tester.deployAndNotify(application, true, stagingTest); + + // Retries immediately in the first minute after failing + tester.clock().advance(Duration.ofSeconds(59)); + tester.jobCompletion(systemTest).application(application).unsuccessful().submit(); + tester.readyJobTrigger().maintain(); + tester.assertRunning(systemTest, application.id()); + + // Stops immediate retry after failing for 1 minute + tester.clock().advance(Duration.ofSeconds(1)); + tester.jobCompletion(systemTest).application(application).unsuccessful().submit(); + tester.readyJobTrigger().maintain(); + tester.assertNotRunning(systemTest, application.id()); + + // Retries after 10 minutes since previous completion as we failed within the last hour + tester.clock().advance(Duration.ofMinutes(10).plus(Duration.ofSeconds(1))); + tester.readyJobTrigger().maintain(); + tester.assertRunning(systemTest, application.id()); + + // Retries less frequently after 1 hour of failure + tester.clock().advance(Duration.ofMinutes(50)); + tester.jobCompletion(systemTest).application(application).unsuccessful().submit(); + tester.readyJobTrigger().maintain(); + tester.assertNotRunning(systemTest, application.id()); + + // Retries after two hours pass since last completion + tester.clock().advance(Duration.ofHours(2).plus(Duration.ofSeconds(1))); + tester.readyJobTrigger().maintain(); + tester.assertRunning(systemTest, application.id()); + + // Still fails and is not retried + tester.jobCompletion(systemTest).application(application).unsuccessful().submit(); + tester.readyJobTrigger().maintain(); + tester.assertNotRunning(systemTest, application.id()); + + // Another application change is deployed and fixes system-test. Change is triggered immediately as target changes + tester.jobCompletion(component).application(application).nextBuildNumber(2).uploadArtifact(applicationPackage).submit(); + tester.deployAndNotify(application, true, systemTest); + tester.deployAndNotify(application, true, stagingTest); + tester.deployAndNotify(application, true, productionUsCentral1); + assertTrue("Deployment completed", tester.buildService().jobs().isEmpty()); } } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java index 0b55d13a5ad..3b14ad4bf10 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/OutstandingChangeDeployerTest.java @@ -75,8 +75,8 @@ public class OutstandingChangeDeployerTest { List<BuildService.BuildJob> jobs = tester.buildService().jobs(); assertEquals(2, jobs.size()); assertEquals(11, jobs.get(0).projectId()); - tester.assertRunning(app.id(), DeploymentJobs.JobType.systemTest); - tester.assertRunning(app.id(), DeploymentJobs.JobType.stagingTest); + tester.assertRunning(DeploymentJobs.JobType.systemTest, app.id()); + tester.assertRunning(DeploymentJobs.JobType.stagingTest, app.id()); assertFalse(tester.application("app1").outstandingChange().isPresent()); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java index 5d6fb76cacf..cf32a18522f 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/maintenance/UpgraderTest.java @@ -229,6 +229,8 @@ public class UpgraderTest { tester.completeUpgradeWithError(default0, version55, "default", stagingTest); tester.completeUpgradeWithError(default1, version55, "default", stagingTest); tester.completeUpgradeWithError(default2, version55, "default", stagingTest); + tester.clock().advance(Duration.ofHours(2).plus(Duration.ofSeconds(1))); // Retry failing job for default3 + tester.readyJobTrigger().maintain(); tester.completeUpgradeWithError(default3, version55, "default", DeploymentJobs.JobType.productionUsWest1); tester.upgradeSystem(version55); assertEquals(VespaVersion.Confidence.broken, tester.controller().versionStatus().systemVersion().get().confidence()); @@ -801,7 +803,7 @@ public class UpgraderTest { // 5th app never reports back and has a dead job, but no ongoing change Application deadLocked = tester.applications().require(default4.id()); - tester.assertRunning(deadLocked.id(), systemTest); + tester.assertRunning(systemTest, deadLocked.id()); assertFalse("No change present", deadLocked.change().isPresent()); // 4 out of 5 applications are repaired and confidence is restored |