about summary refs log tree commit diff stats
path: root/controller-server/src/main/java/com
diff options
context:
space:
mode:
author Martin Polden <mpolden@mpolden.no> 2022-07-05 12:58:38 +0200
committer GitHub <noreply@github.com> 2022-07-05 12:58:38 +0200
commit 605fbd6498ececb73926aaea649d0d669259a297 (patch)
tree b18b3bc615914ba71dfc4b0e2dde0041f680d7d5 /controller-server/src/main/java/com
parent 0e8c824813bb49fbe399dbb0f9170511701960b1 (diff)
parent 3be71ffbfe3f2a3c147fafa4fb27d72e600743fe (diff)
Merge pull request #23320 from vespa-engine/jonmv/set-faiiling-revisions-aside
Jonmv/set faiiling revisions aside
Diffstat (limited to 'controller-server/src/main/java/com')
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java 13
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java 3
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java 9
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java 35
4 files changed, 43 insertions, 17 deletions
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java
index 22df5ca559e..4a00a272c75 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentStatusList.java
@@ -3,6 +3,7 @@ package com.yahoo.vespa.hosted.controller.deployment;
import com.yahoo.collections.AbstractFilteringList;
import com.yahoo.component.Version;
+import com.yahoo.vespa.hosted.controller.application.Change;
import java.time.Instant;
import java.util.Collection;
@@ -36,8 +37,10 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus
/** Returns the subset of applications which have been failing an application change since the given instant */
public DeploymentStatusList failingApplicationChangeSince(Instant threshold) {
- return matching(status -> status.instanceJobs().values().stream()
- .anyMatch(jobs -> failingApplicationChangeSince(jobs, threshold)));
+ return matching(status -> status.instanceJobs().entrySet().stream()
+ .anyMatch(jobs -> failingApplicationChangeSince(jobs.getValue(),
+ status.application().require(jobs.getKey().instance()).change(),
+ threshold)));
}
private static boolean failingUpgradeToVersionSince(JobList jobs, Version version, Instant threshold) {
@@ -47,10 +50,8 @@ public class DeploymentStatusList extends AbstractFilteringList<DeploymentStatus
.isEmpty();
}
- private static boolean failingApplicationChangeSince(JobList jobs, Instant threshold) {
- return ! jobs.failingApplicationChange()
- .firstFailing().endedNoLaterThan(threshold)
- .isEmpty();
+ private static boolean failingApplicationChangeSince(JobList jobs, Change change, Instant threshold) {
+ return change.revision().map(revision -> ! jobs.failingWithBrokenRevisionSince(revision, threshold).isEmpty()).orElse(false);
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
index c28f94bc4d7..7ceeda08d3a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/DeploymentTrigger.java
@@ -61,6 +61,7 @@ import static java.util.stream.Collectors.toMap;
public class DeploymentTrigger {
public static final Duration maxPause = Duration.ofDays(3);
+ public static final Duration maxFailingRevisionTime = Duration.ofDays(5);
private final static Logger log = Logger.getLogger(DeploymentTrigger.class.getName());
private final Controller controller;
@@ -448,6 +449,8 @@ public class DeploymentTrigger {
private boolean acceptNewRevision(DeploymentStatus status, InstanceName instance, RevisionId revision) {
if (status.application().deploymentSpec().instance(instance).isEmpty()) return false; // Unknown instance.
+ if ( ! status.jobs().failingWithBrokenRevisionSince(revision, clock.instant().minus(maxFailingRevisionTime))
+ .isEmpty()) return false; // Don't deploy a broken revision.
boolean isChangingRevision = status.application().require(instance).change().revision().isPresent();
DeploymentInstanceSpec spec = status.application().deploymentSpec().requireInstance(instance);
Predicate<RevisionId> revisionFilter = spec.revisionTarget() == DeploymentSpec.RevisionTarget.next
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
index 551f841233e..3074c9ac3ba 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobList.java
@@ -4,7 +4,6 @@ package com.yahoo.vespa.hosted.controller.deployment;
import com.yahoo.collections.AbstractFilteringList;
import com.yahoo.component.Version;
import com.yahoo.config.provision.InstanceName;
-import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RevisionId;
@@ -74,6 +73,14 @@ public class JobList extends AbstractFilteringList<JobStatus, JobList> {
return matching(JobList::failingApplicationChange);
}
+ /** Returns the subset of jobs which are failing because of an application change, and have been since the threshold, on the given revision. */
+ public JobList failingWithBrokenRevisionSince(RevisionId broken, Instant threshold) {
+ return failingApplicationChange().matching(job -> job.runs().values().stream()
+ .anyMatch(run -> run.versions().targetRevision().equals(broken)
+ && run.hasFailed()
+ && run.start().isBefore(threshold)));
+ }
+
/** Returns the subset of jobs which are failing with the given run status. */
public JobList withStatus(RunStatus status) {
return matching(job -> job.lastStatus().map(status::equals).orElse(false));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
index d654f63fff2..a2fb0df626f 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/Upgrader.java
@@ -5,17 +5,20 @@ import com.yahoo.component.Version;
import com.yahoo.config.application.api.DeploymentSpec.UpgradePolicy;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.transaction.Mutex;
-import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Change;
import com.yahoo.vespa.hosted.controller.application.InstanceList;
+import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList;
+import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger;
+import com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger.ChangesToCancel;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;
import com.yahoo.vespa.hosted.controller.versions.VersionStatus;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion.Confidence;
import java.time.Duration;
+import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
@@ -23,6 +26,7 @@ import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Random;
+import java.util.Set;
import java.util.function.UnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -58,18 +62,22 @@ public class Upgrader extends ControllerMaintainer {
cancelBrokenUpgrades(versionStatus);
OptionalInt targetMajorVersion = targetMajorVersion();
- InstanceList instances = instances(versionStatus);
+ DeploymentStatusList deploymentStatuses = deploymentStatuses(versionStatus);
for (UpgradePolicy policy : UpgradePolicy.values())
- updateTargets(versionStatus, instances, policy, targetMajorVersion);
+ updateTargets(versionStatus, deploymentStatuses, policy, targetMajorVersion);
return 1.0;
}
+ private DeploymentStatusList deploymentStatuses(VersionStatus versionStatus) {
+ return controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable())
+ .withProjectId(),
+ versionStatus);
+ }
+
/** Returns a list of all production application instances, except those which are pinned, which we should not manipulate here. */
- private InstanceList instances(VersionStatus versionStatus) {
- return InstanceList.from(controller().jobController().deploymentStatuses(ApplicationList.from(controller().applications().readable())
- .withProjectId(),
- versionStatus))
+ private InstanceList instances(DeploymentStatusList deploymentStatuses) {
+ return InstanceList.from(deploymentStatuses)
.withDeclaredJobs()
.shuffle(random)
.byIncreasingDeployedVersion()
@@ -78,7 +86,7 @@ public class Upgrader extends ControllerMaintainer {
private void cancelBrokenUpgrades(VersionStatus versionStatus) {
// Cancel upgrades to broken targets (let other ongoing upgrades complete to avoid starvation)
- InstanceList instances = instances(controller().readVersionStatus());
+ InstanceList instances = instances(deploymentStatuses(controller().readVersionStatus()));
for (VespaVersion version : versionStatus.versions()) {
if (version.confidence() == Confidence.broken)
cancelUpgradesOf(instances.upgradingTo(version.versionNumber()).not().with(UpgradePolicy.canary),
@@ -86,8 +94,12 @@ public class Upgrader extends ControllerMaintainer {
}
}
- private void updateTargets(VersionStatus versionStatus, InstanceList instances, UpgradePolicy policy, OptionalInt targetMajorVersion) {
+ private void updateTargets(VersionStatus versionStatus, DeploymentStatusList deploymentStatuses, UpgradePolicy policy, OptionalInt targetMajorVersion) {
+ InstanceList instances = instances(deploymentStatuses);
InstanceList remaining = instances.with(policy);
+ Instant failureThreshold = controller().clock().instant().minus(DeploymentTrigger.maxFailingRevisionTime);
+ Set<ApplicationId> failingRevision = InstanceList.from(deploymentStatuses.failingApplicationChangeSince(failureThreshold)).asSet();
+
List<Version> targetAndNewer = new ArrayList<>();
UnaryOperator<InstanceList> cancellationCriterion = policy == UpgradePolicy.canary ? i -> i.not().upgradingTo(targetAndNewer)
: i -> i.failing()
@@ -103,13 +115,16 @@ public class Upgrader extends ControllerMaintainer {
// Prefer the newest target for each instance.
remaining = remaining.not().matching(eligible.asList()::contains)
.not().hasCompleted(Change.of(version));
- for (ApplicationId id : outdated.and(eligible.not().upgrading()).not().changingRevision())
+ for (ApplicationId id : outdated.and(eligible.not().upgrading()))
targets.put(id, version);
}
int numberToUpgrade = policy == UpgradePolicy.canary ? instances.size() : numberOfApplicationsToUpgrade();
for (ApplicationId id : instances.matching(targets.keySet()::contains).first(numberToUpgrade)) {
log.log(Level.INFO, "Triggering upgrade to " + targets.get(id) + " for " + id);
+ if (failingRevision.contains(id))
+ controller().applications().deploymentTrigger().cancelChange(id, ChangesToCancel.APPLICATION);
+
controller().applications().deploymentTrigger().triggerChange(id, Change.of(targets.get(id)));
}
}