diff options
15 files changed, 200 insertions, 371 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java index 84bedabc4e6..f80a70b84ad 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/Application.java @@ -17,7 +17,6 @@ import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.rotation.RotationId; -import java.time.Instant; import java.util.Collections; import java.util.Comparator; import java.util.List; @@ -151,6 +150,7 @@ public class Application { /** Returns the version a new deployment to this zone should use for this application */ public Version deployVersionIn(ZoneId zone, Controller controller) { + // TODO jvenstad: Return max of current and change. return change.platform().orElse(versionIn(zone, controller)); } @@ -176,6 +176,7 @@ public class Application { public Optional<ApplicationVersion> deployApplicationVersionFor(DeploymentJobs.JobType jobType, Controller controller, boolean currentVersion) { + // TODO jvenstad: Return max of current and change's. if (jobType == DeploymentJobs.JobType.component) return Optional.empty(); @@ -216,9 +217,4 @@ public class Application { return "application '" + id + "'"; } - /** Returns whether changes to this are blocked in the given instant */ - public boolean isBlocked(Instant instant) { - return ! deploymentSpec().canUpgradeAt(instant) || ! deploymentSpec().canChangeRevisionAt(instant); - } - } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java index 1d3fff57a78..f63966703d9 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/ApplicationController.java @@ -314,7 +314,7 @@ public class ApplicationController { // TODO jvenstad: Use code from DeploymentTrigger? Also, validate application version. // Validate the change being deployed - if (!canDeployDirectly) { + if ( ! canDeployDirectly) { validateChange(application, zone, version); } @@ -360,9 +360,8 @@ public class ApplicationController { if (!job.isPresent()) { throw new IllegalArgumentException("No job found for zone " + zone); } - version = application - .deployApplicationVersionFor(job.get(), controller, deployCurrentVersion) - .orElseThrow(() -> new IllegalArgumentException("Cannot determine application version to use for " + + version = application.deployApplicationVersionFor(job.get(), controller, deployCurrentVersion) + .orElseThrow(() -> new IllegalArgumentException("Cannot determine application version to use for " + job.get())); pkg = new ApplicationPackage(artifactRepository.getApplicationPackage(application.id(), version.id())); } @@ -553,11 +552,6 @@ public class ApplicationController { } public void notifyJobCompletion(JobReport report) { - log.log(Level.INFO, String.format("Notified of %s of %s %d for '%s'.", - report.jobError().map(error -> error + " failure").orElse("success"), - report.jobType(), - report.buildNumber(), - report.applicationId())); if ( ! get(report.applicationId()).isPresent()) { log.log(Level.WARNING, "Ignoring completion of job of project '" + report.projectId() + "': Unknown application '" + report.applicationId() + "'"); @@ -655,6 +649,7 @@ public class ApplicationController { /** Verify that change is tested and that we aren't downgrading */ private void validateChange(Application application, ZoneId zone, Version version) { + // TODO jvenstad: Hmmm ... Make it possible to deploy only the legal parts of the Change. if (!application.deploymentJobs().isDeployableTo(zone.environment(), application.change())) { throw new IllegalArgumentException("Rejecting deployment of " + application + " to " + zone + " as " + application.change() + " is not tested"); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java index 777cb4011b6..287fd09f724 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/ApplicationList.java @@ -76,6 +76,11 @@ public class ApplicationList { return notUpgradingTo(version.get()); } + /** Returns the subset of applications which are currently deploying a change */ + public ApplicationList deploying() { + return listOf(list.stream().filter(application -> application.change().isPresent())); + } + /** Returns the subset of applications which is currently not deploying a change */ public ApplicationList notDeploying() { return listOf(list.stream().filter(application -> ! application.change().isPresent())); @@ -141,6 +146,11 @@ public class ApplicationList { return listOf(list.stream().filter(a -> ! a.id().instance().value().matches("^(default-pr)?\\d+$"))); } + /** Returns the subset of applications which have a project ID */ + public ApplicationList withProjectId() { + return listOf(list.stream().filter(a -> a.deploymentJobs().projectId().isPresent())); + } + /** Returns the subset of applications which have at least one production deployment */ public ApplicationList hasProductionDeployment() { return listOf(list.stream().filter(a -> ! a.productionDeployments().isEmpty())); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java index a15102148e2..d29e876ffa6 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/DeploymentJobs.java @@ -122,9 +122,9 @@ public class DeploymentJobs { return true; } if (environment == Environment.staging) { - return isSuccessful(change, JobType.systemTest); + return successAt(change, JobType.systemTest).isPresent(); } else if (environment == Environment.prod) { - return isSuccessful(change, JobType.stagingTest); + return successAt(change, JobType.stagingTest).isPresent(); } return true; // other environments do not have any preconditions } @@ -145,12 +145,12 @@ public class DeploymentJobs { public Optional<IssueId> issueId() { return issueId; } - /** Returns whether the job of the given type has completed successfully for the given change */ - private boolean isSuccessful(Change change, JobType jobType) { + /** Returns the time of success for the given change for the given job type, or empty if unsuccessful. */ + public Optional<Instant> successAt(Change change, JobType jobType) { return Optional.ofNullable(jobStatus().get(jobType)) .flatMap(JobStatus::lastSuccess) .filter(status -> status.lastCompletedWas(change)) - .isPresent(); + .map(JobStatus.JobRun::at); } private static Optional<Long> requireId(Optional<Long> id, String message) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java index d5d7cf1c0ef..a698b028786 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/application/JobStatus.java @@ -116,11 +116,6 @@ public class JobStatus { return ! lastTriggered.get().at().isBefore(lastCompleted.get().at()); } - /** Returns true if this is running and has been so since before the given limit */ - public boolean isHanging(Instant timeoutLimit) { - return isRunning(Instant.MIN) && lastTriggered.get().at().isBefore(timeoutLimit.plusMillis(1)); - } - /** The error of the last completion, or empty if the last run succeeded */ public Optional<DeploymentJobs.JobError> jobError() { return jobError; } @@ -185,7 +180,6 @@ public class JobStatus { this.at = at; } - // TODO: Replace with proper ID, and make the build number part optional, or something -- it's not there for lastTriggered! /** Returns the id of this run of this job, or -1 if not known */ public long id() { return id; } @@ -201,7 +195,6 @@ public class JobStatus { /** Returns the time if this triggering or completion */ public Instant at() { return at; } - // TODO: Consider a version and application version for each JobStatus, to compare against a Target (instead of Change, which is, really, a Target). /** Returns whether the job last completed for the given change */ public boolean lastCompletedWas(Change change) { if (change.platform().isPresent() && ! change.platform().get().equals(version())) return false; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java index 66bb05df308..405c8d17263 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentOrder.java @@ -2,24 +2,19 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.config.application.api.DeploymentSpec; -import com.yahoo.vespa.hosted.controller.Application; +import com.yahoo.config.provision.SystemName; import com.yahoo.vespa.hosted.controller.Controller; -import com.yahoo.vespa.hosted.controller.LockedApplication; import com.yahoo.vespa.hosted.controller.api.integration.zone.ZoneId; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs; import com.yahoo.vespa.hosted.controller.application.DeploymentJobs.JobType; import com.yahoo.vespa.hosted.controller.application.JobStatus; -import java.time.Clock; -import java.time.Duration; -import java.time.Instant; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Objects; -import java.util.Optional; -import java.util.logging.Logger; +import java.util.function.Supplier; import static java.util.Comparator.comparingInt; import static java.util.stream.Collectors.collectingAndThen; @@ -32,61 +27,19 @@ import static java.util.stream.Collectors.toList; */ public class DeploymentOrder { - private static final Logger log = Logger.getLogger(DeploymentOrder.class.getName()); + private final Supplier<SystemName> system; - private final Controller controller; - private final Clock clock; - - public DeploymentOrder(Controller controller) { - Objects.requireNonNull(controller, "controller cannot be null"); - this.controller = controller; - this.clock = controller.clock(); - } - - /** Returns a list of jobs to trigger after the given job */ - // TODO: This does too much - should just tell us the order, as advertised - public List<JobType> nextAfter(JobType job, LockedApplication application) { - if ( ! application.change().isPresent()) { // Change was cancelled - return Collections.emptyList(); - } - - // Always trigger system test after component as deployment spec might not be available yet - // (e.g. if this is a new application with no previous deployments) - if (job == JobType.component) { - return Collections.singletonList(JobType.systemTest); - } - - // At this point we have deployed to system test, so deployment spec is available - List<DeploymentSpec.Step> deploymentSteps = deploymentSteps(application); - Optional<DeploymentSpec.Step> currentStep = fromJob(job, application); - if ( ! currentStep.isPresent()) { - return Collections.emptyList(); - } - - // If this is the last deployment step there's nothing more to trigger - int currentIndex = deploymentSteps.indexOf(currentStep.get()); - if (currentIndex == deploymentSteps.size() - 1) { - return Collections.emptyList(); - } - - // Postpone next job if delay has not passed yet - Duration delay = delayAfter(currentStep.get(), application); - if (shouldPostponeDeployment(delay, job, application)) { - log.info(String.format("Delaying next job after %s of %s by %s", job, application, delay)); - return Collections.emptyList(); - } - - DeploymentSpec.Step nextStep = deploymentSteps.get(currentIndex + 1); - return nextStep.zones().stream() - .map(this::toJob) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); + public DeploymentOrder(Supplier<SystemName> system) { + Objects.requireNonNull(system, "system may not be null"); + this.system = system; } /** Returns jobs for given deployment spec, in the order they are declared */ public List<JobType> jobsFrom(DeploymentSpec deploymentSpec) { return deploymentSpec.steps().stream() - .flatMap(step -> jobsFrom(step).stream()) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); + .flatMap(step -> step.zones().stream()) + .map(this::toJob) + .collect(collectingAndThen(toList(), Collections::unmodifiableList)); } /** Returns job status sorted according to deployment spec */ @@ -108,60 +61,10 @@ public class DeploymentOrder { .collect(collectingAndThen(toList(), Collections::unmodifiableList)); } - /** Returns jobs for the given step */ - private List<JobType> jobsFrom(DeploymentSpec.Step step) { - return step.zones().stream() - .map(this::toJob) - .collect(collectingAndThen(toList(), Collections::unmodifiableList)); - } - - /** Resolve deployment step from job */ - private Optional<DeploymentSpec.Step> fromJob(JobType job, Application application) { - for (DeploymentSpec.Step step : application.deploymentSpec().steps()) { - if (step.deploysTo(job.environment(), job.isProduction() ? job.region(controller.system()) : Optional.empty())) { - return Optional.of(step); - } - } - return Optional.empty(); - } - /** Resolve job from deployment step */ - private JobType toJob(DeploymentSpec.DeclaredZone zone) { - return JobType.from(controller.system(), zone.environment(), zone.region().orElse(null)) + public JobType toJob(DeploymentSpec.DeclaredZone zone) { + return JobType.from(system.get(), zone.environment(), zone.region().orElse(null)) .orElseThrow(() -> new IllegalArgumentException("Invalid zone " + zone)); } - /** Returns whether deployment should be postponed according to delay */ - private boolean shouldPostponeDeployment(Duration delay, JobType job, Application application) { - Optional<Instant> lastSuccess = Optional.ofNullable(application.deploymentJobs().jobStatus().get(job)) - .flatMap(JobStatus::lastSuccess) - .map(JobStatus.JobRun::at); - return lastSuccess.isPresent() && lastSuccess.get().plus(delay).isAfter(clock.instant()); - } - - /** Find all steps that deploy to one or more zones */ - private static List<DeploymentSpec.Step> deploymentSteps(Application application) { - return application.deploymentSpec().steps().stream() - .filter(step -> ! step.zones().isEmpty()) - .collect(toList()); - } - - /** Determines the delay that should pass after the given step */ - private static Duration delayAfter(DeploymentSpec.Step step, Application application) { - int stepIndex = application.deploymentSpec().steps().indexOf(step); - if (stepIndex == -1 || stepIndex == application.deploymentSpec().steps().size() - 1) { - return Duration.ZERO; - } - Duration totalDelay = Duration.ZERO; - List<DeploymentSpec.Step> remainingSteps = application.deploymentSpec().steps() - .subList(stepIndex + 1, application.deploymentSpec().steps().size()); - for (DeploymentSpec.Step s : remainingSteps) { - if (! (s instanceof DeploymentSpec.Delay)) { - break; - } - totalDelay = totalDelay.plus(((DeploymentSpec.Delay) s).duration()); - } - return totalDelay; - } - } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java index f6f65df56b7..6ad511c13e5 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java @@ -1,9 +1,11 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.hosted.controller.deployment; -import com.yahoo.component.Version; +import com.yahoo.config.application.api.DeploymentSpec; import com.yahoo.config.provision.ApplicationId; +import com.yahoo.config.provision.Environment; import com.yahoo.config.provision.SystemName; + import com.yahoo.vespa.hosted.controller.Application; import com.yahoo.vespa.hosted.controller.ApplicationController; import com.yahoo.vespa.hosted.controller.Controller; @@ -22,19 +24,25 @@ import com.yahoo.vespa.hosted.controller.persistence.CuratorDb; import java.time.Clock; import java.time.Duration; import java.time.Instant; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.logging.Logger; -import java.util.stream.Stream; + +import static java.util.Comparator.naturalOrder; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toSet; /** * Responsible for scheduling deployment jobs in a build system and keeping - * Application.deploying() in sync with what is scheduled. + * {@link Application#change()} in sync with what is scheduled. * - * This class is multithread safe. + * This class is multi-thread safe. * * @author bratseth * @author mpolden @@ -42,7 +50,9 @@ import java.util.stream.Stream; */ public class DeploymentTrigger { - /** The max duration a job may run before we consider it dead/hanging */ + /** + * The max duration a job may run before we consider it dead/hanging + */ private final Duration jobTimeout; private final static Logger log = Logger.getLogger(DeploymentTrigger.class.getName()); @@ -53,54 +63,53 @@ public class DeploymentTrigger { private final DeploymentOrder order; public DeploymentTrigger(Controller controller, CuratorDb curator, Clock clock) { - Objects.requireNonNull(controller,"controller cannot be null"); - Objects.requireNonNull(curator,"curator cannot be null"); - Objects.requireNonNull(clock,"clock cannot be null"); + Objects.requireNonNull(controller, "controller cannot be null"); + Objects.requireNonNull(curator, "curator cannot be null"); + Objects.requireNonNull(clock, "clock cannot be null"); this.controller = controller; this.clock = clock; this.deploymentQueue = new DeploymentQueue(controller, curator); - this.order = new DeploymentOrder(controller); + this.order = new DeploymentOrder(controller::system); this.jobTimeout = controller.system().equals(SystemName.main) ? Duration.ofHours(12) : Duration.ofHours(1); } - /** Returns the time in the past before which jobs are at this moment considered unresponsive */ - public Instant jobTimeoutLimit() { return clock.instant().minus(jobTimeout); } + /** + * Returns the time in the past before which jobs are at this moment considered unresponsive + */ + public Instant jobTimeoutLimit() { + return clock.instant().minus(jobTimeout); + } - public DeploymentQueue deploymentQueue() { return deploymentQueue; } + public DeploymentQueue deploymentQueue() { + return deploymentQueue; + } - public DeploymentOrder deploymentOrder() { return order; } + public DeploymentOrder deploymentOrder() { + return order; + } //--- Start of methods which triggers deployment jobs ------------------------- /** - * Called each time a job completes (successfully or not) to cause triggering of one or more follow-up jobs - * (which may possibly the same job once over). + * Called each time a job completes (successfully or not) to record information used when deciding what to trigger. * * @param report information about the job that just completed */ public void triggerFromCompletion(JobReport report) { applications().lockOrThrow(report.applicationId(), application -> { - ApplicationVersion applicationVersion = applicationVersionFrom(report); + ApplicationVersion applicationVersion = report.sourceRevision().map(sr -> ApplicationVersion.from(sr, report.buildNumber())) + .orElse(ApplicationVersion.unknown); application = application.withJobCompletion(report, applicationVersion, clock.instant(), controller); application = application.withProjectId(report.projectId()); - // Handle successful starting and ending - if (report.jobType() == JobType.component) { - if (report.success()) { - if ( ! acceptNewApplicationVersionNow(application)) - application = application.withOutstandingChange(Change.of(applicationVersion)); - else - // Note that in case of an ongoing upgrade this may result in both the upgrade and application - // change being deployed together - application = application.withChange(application.change().with(applicationVersion)); - } + if (report.jobType() == JobType.component && report.success()) { + if ( ! acceptNewApplicationVersionNow(application)) + application = application.withOutstandingChange(Change.of(applicationVersion)); + else + // Note that in case of an ongoing upgrade this may result in both the upgrade and application + // change being deployed together + application = application.withChange(application.change().with(applicationVersion)); } - else if (report.jobType().isProduction() && deploymentComplete(application)) { - // change completed - // TODO jvenstad: Check for and remove individual parts of Change. - application = application.withChange(Change.empty()); - } - applications().store(application); }); } @@ -110,7 +119,9 @@ public class DeploymentTrigger { */ public void triggerReadyJobs() { ApplicationList applications = ApplicationList.from(applications().asList()); - applications = applications.notPullRequest(); + applications = applications.notPullRequest() + .withProjectId() + .deploying(); for (Application application : applications.asList()) applications().lockIfPresent(application.id(), this::triggerReadyJobs); } @@ -119,37 +130,25 @@ public class DeploymentTrigger { * Trigger a job for an application, if allowed * * @param triggering the triggering to execute, i.e., application, job type and reason - * @param concurrentlyWith production jobs that may run concurrently with the job to trigger - * @param force true to disable checks which should normally prevent this triggering from happening * @return the application in the triggered state, if actually triggered. This *must* be stored by the caller */ - public LockedApplication trigger(Triggering triggering, Collection<JobType> concurrentlyWith, boolean force) { - if (triggering.jobType == null) return triggering.application; // we are passed null when the last job has been reached - - List<JobType> runningProductionJobs = JobList.from(triggering.application) - .production() - .running(jobTimeoutLimit()) - .mapToList(JobStatus::type); - if ( ! force && triggering.jobType().isProduction() && ! concurrentlyWith.containsAll(runningProductionJobs)) - return triggering.application; - + public LockedApplication trigger(Triggering triggering, LockedApplication application) { // Never allow untested changes to go through // Note that this may happen because a new change catches up and prevents an older one from continuing - if ( ! triggering.application.deploymentJobs().isDeployableTo(triggering.jobType.environment(), triggering.application.change())) { + if ( ! application.deploymentJobs().isDeployableTo(triggering.jobType.environment(), application.change())) { log.warning(String.format("Want to trigger %s for %s with reason %s, but change is untested", triggering.jobType, - triggering.application, triggering.reason)); - return triggering.application; + application, triggering.reason)); + return application; } - if ( ! force && ! allowedTriggering(triggering.jobType, triggering.application)) return triggering.application; log.info(triggering.toString()); - deploymentQueue.addJob(triggering.application.id(), triggering.jobType, triggering.retry); + deploymentQueue.addJob(application.id(), triggering.jobType, triggering.retry); // TODO jvenstad: Let triggering set only time of triggering (and reason, for debugging?) when build system is polled for job status. - return triggering.application.withJobTriggering(triggering.jobType, + return application.withJobTriggering(triggering.jobType, clock.instant(), - triggering.application.deployVersionFor(triggering.jobType, controller), - triggering.application.deployApplicationVersionFor(triggering.jobType, controller, false) - .orElse(ApplicationVersion.unknown), + application.deployVersionFor(triggering.jobType, controller), + application.deployApplicationVersionFor(triggering.jobType, controller, false) + .orElse(ApplicationVersion.unknown), triggering.reason); } @@ -161,7 +160,7 @@ public class DeploymentTrigger { */ public void triggerChange(ApplicationId applicationId, Change change) { applications().lockOrThrow(applicationId, application -> { - if (application.change().isPresent() && ! application.deploymentJobs().hasFailures()) + if (application.change().isPresent() && !application.deploymentJobs().hasFailures()) throw new IllegalArgumentException("Could not start " + change + " on " + application + ": " + application.change() + " is already in progress"); application = application.withChange(change); @@ -190,222 +189,148 @@ public class DeploymentTrigger { //--- End of methods which triggers deployment jobs ---------------------------- - /** Find the next step to trigger if any, and triggers it */ + /** + * Finds the next step to trigger for the given application, if any, and triggers it + */ private void triggerReadyJobs(LockedApplication application) { - List<JobType> jobs = order.jobsFrom(application.deploymentSpec()); - - // Should the first step be triggered? - if ( ! jobs.isEmpty() && jobs.get(0).equals(JobType.systemTest) ) { - JobStatus systemTestStatus = application.deploymentJobs().jobStatus().get(JobType.systemTest); - if (application.change().platform().isPresent()) { - Version target = application.change().platform().get(); - if (systemTestStatus == null - || ! systemTestStatus.lastTriggered().isPresent() - || ! systemTestStatus.isSuccess() - || ! systemTestStatus.lastTriggered().get().version().equals(target) - || systemTestStatus.isHanging(jobTimeoutLimit())) { - application = trigger(new Triggering(application, JobType.systemTest, false, "Upgrade to " + target), Collections.emptySet(), false); - applications().store(application); + List<Triggering> triggerings = new ArrayList<>(); + Change change = application.change(); + + // Urgh. Empty spec means unknown spec. Should we write it at component completion? + List<DeploymentSpec.Step> steps = application.deploymentSpec().steps(); + if (steps.isEmpty()) steps = Collections.singletonList(new DeploymentSpec.DeclaredZone(Environment.test)); + + Optional<Instant> completedAt = Optional.of(clock.instant()); + String reason = "Deploying " + change.toString(); + + for (DeploymentSpec.Step step : steps) { + LockedApplication app = application; + Set<JobType> stepJobs = step.zones().stream().map(order::toJob).collect(toSet()); + Set<JobType> remainingJobs = stepJobs.stream().filter(job -> ! completedAt(app, job).isPresent()).collect(toSet()); + if (remainingJobs.isEmpty()) { // All jobs are complete -- find the time of completion for this step. + if (stepJobs.isEmpty()) { // No jobs means this is delay step. + Duration delay = ((DeploymentSpec.Delay) step).duration(); + completedAt = completedAt.map(at -> at.plus(delay)).filter(at -> ! at.isAfter(clock.instant())); + reason += " after a delay of " + delay; } - } - } - - // Find next steps to trigger based on the state of the previous step - for (JobType jobType : (Iterable<JobType>) Stream.concat(Stream.of(JobType.component), jobs.stream())::iterator) { - JobStatus jobStatus = application.deploymentJobs().jobStatus().get(jobType); - if (jobStatus == null) continue; // job has never run - - // Collect the subset of next jobs which have not run with the last changes - // TODO jvenstad: Change to be step-centric. - List<JobType> nextJobs = order.nextAfter(jobType, application); - for (JobType nextJobType : nextJobs) { - JobStatus nextStatus = application.deploymentJobs().jobStatus().get(nextJobType); - if (changesAvailable(application, jobStatus, nextStatus) || nextStatus.isHanging(jobTimeoutLimit())) { - boolean isRetry = nextStatus != null && nextStatus.jobError().filter(JobError.outOfCapacity::equals).isPresent(); - application = trigger(new Triggering(application, nextJobType, isRetry, isRetry ? "Retrying on out of capacity" : "Available change in " + jobType.jobName()), nextJobs, false); + else { + completedAt = stepJobs.stream().map(job -> completedAt(app, job).get()).max(naturalOrder()); + reason = "Available change in " + stepJobs.stream().map(JobType::jobName).collect(joining(", ")); } } - applications().store(application); + else if (completedAt.isPresent()) { // Some jobs remain, and this step is not complete -- trigger those jobs if the previous step was done. + for (JobType job : remainingJobs) + triggerings.add(new Triggering(app, job, reason, stepJobs)); + completedAt = Optional.empty(); + } } - } + if (completedAt.isPresent()) + application = application.withChange(Change.empty()); - private ApplicationController applications() { return controller.applications(); } + for (Triggering triggering : triggerings) + if (application.deploymentJobs().isDeployableTo(triggering.jobType.environment(), change) && allowedToTriggerNow(triggering, application)) + application = trigger(triggering, application); - /** Returns whether the given job type should be triggered according to deployment spec */ - private boolean hasJob(JobType jobType, Application application) { - if ( ! jobType.isProduction()) return true; // Deployment spec only determines this for production jobs. - return application.deploymentSpec().includes(jobType.environment(), jobType.region(controller.system())); - } - /** Create application version from job report */ - private ApplicationVersion applicationVersionFrom(JobReport report) { - return report.sourceRevision().map(sr -> ApplicationVersion.from(sr, report.buildNumber())) - .orElse(ApplicationVersion.unknown); + applications().store(application); } - /** Returns true if the given proposed job triggering should be effected */ - private boolean allowedTriggering(JobType jobType, LockedApplication application) { - // Note: We could make a more fine-grained and more correct determination about whether to block - // by instead basing the decision on what is currently deployed in the zone. However, - // this leads to some additional corner cases, and the possibility of blocking an application - // fix to a version upgrade, so not doing it now - - if (jobType.isProduction() && application.change().isPresent() && - application.change().blockedBy(application.deploymentSpec(), clock.instant())) return false; + private Optional<Instant> completedAt(Application application, JobType jobType) { + return jobType.isProduction() + ? changeCompletedAt(application, jobType) + : application.deploymentJobs().successAt(application.change(), jobType); + } - // Don't downgrade or redeploy the same version in production needlessly - if (jobType.isProduction() && changeDeployed(application, jobType)) return false; + private boolean allowedToTriggerNow(Triggering triggering, Application application) { + if (application.deploymentJobs().isRunning(triggering.jobType, jobTimeoutLimit())) + return false; - if (application.deploymentJobs().isRunning(jobType, jobTimeoutLimit())) return false; - if ( ! hasJob(jobType, application)) return false; - // Ignore applications that are not associated with a project - if ( ! application.deploymentJobs().projectId().isPresent()) return false; + if ( ! triggering.jobType.isProduction()) + return true; - return true; - } + if ( ! triggering.concurrentlyWith.containsAll(JobList.from(application) + .production() + .running(jobTimeoutLimit()) + .mapToList(JobStatus::type))) + return false; - /** - * Returns true if the previous job has completed successfully with a application version and/or Vespa version - * which is newer (different) than the one last completed successfully in next - */ - private boolean changesAvailable(Application application, JobStatus previous, JobStatus next) { - if ( ! application.change().isPresent()) return false; - if (next == null) return true; - - if (next.type().isTest()) { - // Is it not yet this job's turn to upgrade? - if ( ! lastSuccessfulIs(application.change(), previous.type(), application)) - return false; - - // Has the upgrade test already been done? - if (lastSuccessfulIs(application.change(), next.type(), application)) - return false; - } - else if (next.type().isProduction()) { - // Is the target version tested? - if ( ! lastSuccessfulIs(application.change(), JobType.stagingTest, application)) - return false; - - // Is the previous a job production which neither succeed with the target version, nor has a higher version? - if (previous.type().isProduction() && ! changeDeployed(application, previous.type())) - return false; - - // Did the next job already succeed on the target version, or does it already have a higher version? - if (changeDeployed(application, next.type())) - return false; - } - else - throw new IllegalStateException("Unclassified type of next job: " + next); + // TODO jvenstad: This blocks all changes when dual, and in block window. Should rather remove the blocked component. + // TODO jvenstad: If the above is implemented, take care not to deploy untested stuff? + if (application.change().blockedBy(application.deploymentSpec(), clock.instant())) + return false; return true; } - /** Returns whether all production zones listed in deployment spec has this change (or a newer version, if upgrade) */ - private boolean deploymentComplete(LockedApplication application) { - return order.jobsFrom(application.deploymentSpec()).stream() - .filter(JobType::isProduction) - .filter(job -> job.zone(controller.system()).isPresent()) - .allMatch(job -> changeDeployed(application, job)); + private ApplicationController applications() { + return controller.applications(); } - /** - * Returns whether the given application should skip deployment of its current change to the given production job zone. - * - * If the currently deployed application has a newer platform or application version than the application's - * current change, the method returns {@code true}, to avoid a downgrade. - * Otherwise, it returns whether the current change is redundant, i.e., all its components are already deployed. - */ - private boolean changeDeployed(Application application, JobType job) { + /** Returns the instant when the given application's current change was completed for the given job. */ + private Optional<Instant> changeCompletedAt(Application application, JobType job) { if ( ! job.isProduction()) throw new IllegalArgumentException(job + " is not a production job!"); Deployment deployment = application.deployments().get(job.zone(controller.system()).get()); if (deployment == null) - return false; + return Optional.empty(); int applicationComparison = application.change().application() .map(version -> version.compareTo(deployment.applicationVersion())) .orElse(0); - int platformComparion = application.change().platform() - .map(version -> version.compareTo(deployment.version())) - .orElse(0); + int platformComparison = application.change().platform() + .map(version -> version.compareTo(deployment.version())) + .orElse(0); - if (applicationComparison == -1 || platformComparion == -1) - return true; // Avoid downgrades! - - return applicationComparison == 0 && platformComparion == 0; + // TODO jvenstad: Allow downgrades when considering whether to trigger -- stop them at choice of deployment version. + // TODO jvenstad: This allows tests to be re-run, for instance, while keeping the deployment itself a no-op. + return Optional.of(deployment.at()) + .filter(ignored -> applicationComparison == -1 || platformComparison == -1 + || (applicationComparison == 0 && platformComparison == 0)); } private boolean acceptNewApplicationVersionNow(LockedApplication application) { if ( ! application.change().isPresent()) return true; - if (application.change().application().isPresent()) return true; // more application changes are ok + if (application.change().application().isPresent()) return true; // More application changes are ok. - if (application.deploymentJobs().hasFailures()) return true; // allow changes to fix upgrade problems + if (application.deploymentJobs().hasFailures()) return true; // Allow changes to fix upgrade problems. - if (application.isBlocked(clock.instant())) return true; // allow testing changes while upgrade blocked (debatable) + if ( ! application.deploymentSpec().canUpgradeAt(clock.instant()) + || ! application.deploymentSpec().canChangeRevisionAt(clock.instant())) + return true; // Allow testing changes while upgrade blocked (debatable). - // Otherwise, the application is currently upgrading, without failures, and we should wait with the new - // application version. + // Otherwise, the application is currently upgrading, without failures, and we should wait with the new application version. return false; } - private boolean lastSuccessfulIs(Change change, JobType jobType, Application application) { - JobStatus status = application.deploymentJobs().jobStatus().get(jobType); - if (status == null) - return false; - - Optional<JobStatus.JobRun> lastSuccessfulRun = status.lastSuccess(); - if ( ! lastSuccessfulRun.isPresent()) return false; - - if (change.platform().isPresent() && ! change.platform().get().equals(lastSuccessfulRun.get().version())) - return false; - - if (change.application().isPresent() && ! change.application().get().equals(lastSuccessfulRun.get().applicationVersion())) - return false; - - return true; - } - - public static class Triggering { private final LockedApplication application; // TODO jvenstad: Consider passing an ID instead. private final JobType jobType; private final boolean retry; private final String reason; + private final Collection<JobType> concurrentlyWith; - public Triggering(LockedApplication application, JobType jobType, boolean retry, String reason) { + public Triggering(LockedApplication application, JobType jobType, String reason, Collection<JobType> concurrentlyWith) { this.application = application; this.jobType = jobType; - this.retry = retry; - this.reason = reason; - } - - public LockedApplication application() { - return application; - } + this.concurrentlyWith = concurrentlyWith; - public JobType jobType() { - return jobType; + JobStatus status = application.deploymentJobs().jobStatus().get(jobType); + this.retry = status != null && status.jobError().filter(JobError.outOfCapacity::equals).isPresent(); + this.reason = retry ? "Retrying on out of capacity" : reason; } - public boolean isRetry() { - return retry; - } - - public String reason() { - return reason; + public Triggering(LockedApplication application, JobType jobType, String reason) { + this(application, jobType, reason, Collections.emptySet()); } public String toString() { - return String.format("Triggering %s for %s, %s: %s", - jobType, - application, - application.change().isPresent() ? "deploying " + application.change() : "restarted deployment", - reason); + return String.format("Triggering %s for %s, deploying %s: %s", jobType, application, application.change(), reason); } } } + diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/screwdriver/ScrewdriverApiHandler.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/screwdriver/ScrewdriverApiHandler.java index d7f224d15aa..bcabcf48e91 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/screwdriver/ScrewdriverApiHandler.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/screwdriver/ScrewdriverApiHandler.java @@ -91,9 +91,7 @@ public class ScrewdriverApiHandler extends LoggingRequestHandler { controller.applications().lockOrThrow(applicationId, application -> { // Since this is a manual operation we likely want it to trigger as soon as possible so we add it at to the // front of the queue - application = controller.applications().deploymentTrigger().trigger( - new DeploymentTrigger.Triggering(application, jobType, true, "Triggered from screwdriver/v1"), Collections.emptySet(), true - ); + application = controller.applications().deploymentTrigger().trigger(new DeploymentTrigger.Triggering(application, jobType, "Triggered from screwdriver/v1"), application); controller.applications().store(application); }); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java index 5c24b70fd65..4d6de0bf4ef 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/ControllerTest.java @@ -173,10 +173,10 @@ public class ControllerTest { .environment(Environment.prod) .region("us-east-3") .build(); - tester.jobCompletion(component).application(app1).nextBuildNumber().uploadArtifact(applicationPackage).submit(); + tester.jobCompletion(component).application(app1).nextBuildNumber().nextBuildNumber().uploadArtifact(applicationPackage).submit(); try { tester.deploy(systemTest, app1, applicationPackage); - fail("Expected exception due to unallowed production deployment removal"); + fail("Expected exception due to illegal production deployment removal"); } catch (IllegalArgumentException e) { assertEquals("deployment-removal: application 'tenant1.app1' is deployed in corp-us-east-1, but does not include this zone in deployment.xml", e.getMessage()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java index 6bc544605b6..c4e9240dfcb 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTester.java @@ -174,7 +174,7 @@ public class DeploymentTester { private void completeDeployment(Application application, ApplicationPackage applicationPackage, Optional<JobType> failOnJob, boolean includingProductionZones) { - DeploymentOrder order = new DeploymentOrder(controller()); + DeploymentOrder order = new DeploymentOrder(controller()::system); List<JobType> jobs = order.jobsFrom(applicationPackage.deploymentSpec()); if ( ! includingProductionZones) jobs = jobs.stream().filter(job -> ! job.isProduction()).collect(Collectors.toList()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java index 364cb66c3d1..dbe6c13bc68 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTriggerTest.java @@ -117,7 +117,7 @@ public class DeploymentTriggerTest { .environment(Environment.prod) .delay(Duration.ofSeconds(30)) .region("us-west-1") - .delay(Duration.ofMinutes(1)) + .delay(Duration.ofMinutes(2)) .delay(Duration.ofMinutes(2)) // Multiple delays are summed up .region("us-central-1") .delay(Duration.ofMinutes(10)) // Delays after last region are valid, but have no effect @@ -154,9 +154,14 @@ public class DeploymentTriggerTest { tester.deploymentTrigger().triggerReadyJobs(); assertTrue("No more jobs triggered at this time", deploymentQueue.jobs().isEmpty()); - // 3 minutes pass, us-central-1 is triggered + // 3 minutes pass, us-central-1 is still not triggered tester.clock().advance(Duration.ofMinutes(3)); tester.deploymentTrigger().triggerReadyJobs(); + assertTrue("No more jobs triggered at this time", deploymentQueue.jobs().isEmpty()); + + // 4 minutes pass, us-central-1 is triggered + tester.clock().advance(Duration.ofMinutes(1)); + tester.deploymentTrigger().triggerReadyJobs(); tester.deployAndNotify(application, applicationPackage, true, JobType.productionUsCentral1); assertTrue("All jobs consumed", deploymentQueue.jobs().isEmpty()); diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-without-change-multiple-deployments.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-without-change-multiple-deployments.json index 06d562d40f2..e1ff2712e4a 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-without-change-multiple-deployments.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application-without-change-multiple-deployments.json @@ -48,7 +48,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.101-commit1", "at": "(ignore)" }, "lastCompleted": { @@ -62,7 +62,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.101-commit1", "at": "(ignore)" }, "lastSuccess": { @@ -76,7 +76,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.101-commit1", "at": "(ignore)" } }, diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json index a067d19fb6a..a4a4d1932ff 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application.json @@ -58,7 +58,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" }, "lastCompleted": { @@ -72,7 +72,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" }, "lastSuccess": { @@ -86,7 +86,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" } }, diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json index 3c4cc32111c..3d644c0d9ba 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/application1-recursive.json @@ -58,7 +58,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" }, "lastCompleted": { @@ -72,7 +72,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" }, "lastSuccess": { @@ -86,7 +86,7 @@ "gitCommit": "commit1" } }, - "reason": "Available change in component", + "reason": "Deploying application change to 1.0.42-commit1", "at": "(ignore)" } }, diff --git a/filedistribution/src/main/java/com/yahoo/vespa/filedistribution/FileReferenceDownloader.java b/filedistribution/src/main/java/com/yahoo/vespa/filedistribution/FileReferenceDownloader.java index 04e5f1a9577..985b9a0069e 100644 --- a/filedistribution/src/main/java/com/yahoo/vespa/filedistribution/FileReferenceDownloader.java +++ b/filedistribution/src/main/java/com/yahoo/vespa/filedistribution/FileReferenceDownloader.java @@ -20,6 +20,7 @@ import java.util.Map; import java.util.Optional; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.logging.Level; import java.util.logging.Logger; /** @@ -55,11 +56,13 @@ public class FileReferenceDownloader { FileReference fileReference = fileReferenceDownload.fileReference(); long end = System.currentTimeMillis() + timeout.toMillis(); boolean downloadStarted = false; + int retryCount = 0; while ((System.currentTimeMillis() < end) && !downloadStarted) { try { - if (startDownloadRpc(fileReferenceDownload)) { + if (startDownloadRpc(fileReferenceDownload, retryCount)) { downloadStarted = true; } else { + retryCount++; Thread.sleep(sleepBetweenRetries.toMillis()); } } @@ -104,7 +107,7 @@ public class FileReferenceDownloader { } } - private boolean startDownloadRpc(FileReferenceDownload fileReferenceDownload) { + private boolean startDownloadRpc(FileReferenceDownload fileReferenceDownload, int retryCount) { Connection connection = connectionPool.getCurrent(); Request request = new Request("filedistribution.serveFile"); String fileReference = fileReferenceDownload.fileReference().value(); @@ -112,18 +115,19 @@ public class FileReferenceDownloader { request.parameters().add(new Int32Value(fileReferenceDownload.downloadFromOtherSourceIfNotFound() ? 0 : 1)); execute(request, connection); + Level logLevel = (retryCount > 0 ? LogLevel.INFO : LogLevel.DEBUG); if (validateResponse(request)) { - log.log(LogLevel.DEBUG, () -> "Request callback, OK. Req: " + request + "\nSpec: " + connection); + log.log(logLevel, () -> "Request callback, OK. Req: " + request + "\nSpec: " + connection); if (request.returnValues().get(0).asInt32() == 0) { - log.log(LogLevel.DEBUG, () -> "Found file reference '" + fileReference + "' available at " + connection.getAddress()); + log.log(logLevel, () -> "Found file reference '" + fileReference + "' available at " + connection.getAddress()); return true; } else { - log.log(LogLevel.DEBUG, "File reference '" + fileReference + "' not found for " + connection.getAddress()); + log.log(logLevel, "File reference '" + fileReference + "' not found for " + connection.getAddress()); connectionPool.setNewCurrentConnection(); return false; } } else { - log.log(LogLevel.DEBUG, "Request failed. Req: " + request + "\nSpec: " + connection.getAddress() + + log.log(logLevel, "Request failed. Req: " + request + "\nSpec: " + connection.getAddress() + ", error code: " + request.errorCode() + ", set error for connection and use another for next request"); connectionPool.setError(connection, request.errorCode()); return false; |