diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2020-01-25 21:44:05 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-01-25 21:44:05 +0100 |
commit | 3bf181256ef24570f14c977ec37d23b754a7df53 (patch) | |
tree | e275fdbdaad2163bc8ff86245024b0271b35d946 | |
parent | 4691c2159b80e9ba7dbdc2a0ccdda60cc9510790 (diff) | |
parent | 91d5b1c4b828b8c8a85c40ed0a507533e7c2aa47 (diff) |
Merge pull request #11940 from vespa-engine/jvenstad/adaptive-deployment-job-timeout
Jvenstad/adaptive deployment job timeout
16 files changed, 273 insertions, 62 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java index 3877119e8b0..d8103c864df 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TenantName; +import java.time.Instant; import java.util.Objects; import java.util.Optional; @@ -30,6 +31,9 @@ public class Node { private final Version currentOsVersion; private final Version wantedOsVersion; private final ServiceState serviceState; + private final Optional<Instant> suspendedSince; + private final Optional<Instant> currentFirmwareCheck; + private final Optional<Instant> wantedFirmwareCheck; private final long restartGeneration; private final long wantedRestartGeneration; private final long rebootGeneration; @@ -43,8 +47,9 @@ public class Node { private final Optional<TenantName> reservedTo; public Node(HostName hostname, Optional<HostName> parentHostname, State state, NodeType type, NodeResources resources, Optional<ApplicationId> owner, - Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion, ServiceState serviceState, - long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration, + Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion, + Optional<Instant> currentFirmwareCheck, Optional<Instant> wantedFirmwareCheck, ServiceState serviceState, + Optional<Instant> suspendedSince, long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration, int cost, String flavor, String clusterId, ClusterType clusterType, boolean wantToRetire, boolean wantToDeprovision, Optional<TenantName> reservedTo) { this.hostname = hostname; @@ -57,7 +62,10 @@ public class Node { this.wantedVersion = wantedVersion; this.currentOsVersion = currentOsVersion; this.wantedOsVersion = wantedOsVersion; + this.currentFirmwareCheck = currentFirmwareCheck; + this.wantedFirmwareCheck = wantedFirmwareCheck; this.serviceState = serviceState; + this.suspendedSince = suspendedSince; this.restartGeneration = restartGeneration; this.wantedRestartGeneration = wantedRestartGeneration; this.rebootGeneration = rebootGeneration; @@ -109,10 +117,22 @@ public class Node { return wantedOsVersion; } + public Optional<Instant> currentFirmwareCheck() { + return currentFirmwareCheck; + } + + public Optional<Instant> wantedFirmwareCheck() { + return wantedFirmwareCheck; + } + public ServiceState serviceState() { return serviceState; } + public Optional<Instant> suspendedSince() { + return suspendedSince; + } + public long restartGeneration() { return restartGeneration; } @@ -208,7 +228,10 @@ public class Node { private Version wantedVersion; private Version currentOsVersion; private Version wantedOsVersion; + private Optional<Instant> currentFirmwareCheck = Optional.empty(); + private Optional<Instant> wantedFirmwareCheck = Optional.empty(); private ServiceState serviceState; + private Optional<Instant> suspendedSince = Optional.empty(); private long restartGeneration; private long wantedRestartGeneration; private long rebootGeneration; @@ -234,7 +257,10 @@ public class Node { this.wantedVersion = node.wantedVersion; this.currentOsVersion = node.currentOsVersion; this.wantedOsVersion = node.wantedOsVersion; + this.currentFirmwareCheck = node.currentFirmwareCheck; + this.wantedFirmwareCheck = node.wantedFirmwareCheck; this.serviceState = node.serviceState; + this.suspendedSince = node.suspendedSince; this.restartGeneration = node.restartGeneration; this.wantedRestartGeneration = node.wantedRestartGeneration; this.rebootGeneration = node.rebootGeneration; @@ -298,11 +324,26 @@ public class Node { return this; } + public Builder currentFirmwareCheck(Instant currentFirmwareCheck) { + this.currentFirmwareCheck = Optional.ofNullable(currentFirmwareCheck); + return this; + } + + public Builder wantedFirmwareCheck(Instant wantedFirmwareCheck) { + this.wantedFirmwareCheck = Optional.ofNullable(wantedFirmwareCheck); + return this; + } + public Builder serviceState(ServiceState serviceState) { this.serviceState = serviceState; return this; } + public Builder suspendedSince(Instant suspendedSince) { + this.suspendedSince = Optional.ofNullable(suspendedSince); + return this; + } + public Builder restartGeneration(long restartGeneration) { this.restartGeneration = restartGeneration; return this; @@ -359,9 +400,10 @@ public class Node { } public Node build() { - return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion, currentOsVersion, - wantedOsVersion, serviceState, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, - cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo); + return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion, + currentOsVersion, wantedOsVersion, currentFirmwareCheck, wantedFirmwareCheck, serviceState, + suspendedSince, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, + cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo); } } diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java index 58122b04712..22c373e97ee 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import java.time.Instant; import java.util.Collection; import java.util.List; import java.util.Optional; @@ -107,7 +108,10 @@ public interface NodeRepository { versionFrom(node.getWantedVespaVersion()), versionFrom(node.getCurrentOsVersion()), versionFrom(node.getWantedOsVersion()), + Optional.ofNullable(node.getCurrentFirmwareCheck()).map(Instant::ofEpochMilli), + Optional.ofNullable(node.getWantedFirmwareCheck()).map(Instant::ofEpochMilli), fromBoolean(node.getAllowedToBeDown()), + Optional.ofNullable(node.suspendedSince()).map(Instant::ofEpochMilli), toInt(node.getCurrentRestartGeneration()), toInt(node.getRestartGeneration()), toInt(node.getCurrentRebootGeneration()), diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java index d9957d77ff3..2abf40be527 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java @@ -6,8 +6,10 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.JsonNode; +import java.time.Instant; import java.util.Arrays; import java.util.Map; +import java.util.Optional; import java.util.Set; @JsonIgnoreProperties(ignoreUnknown = true) @@ -54,6 +56,10 @@ public class NodeRepositoryNode { private String currentOsVersion; @JsonProperty("wantedOsVersion") private String wantedOsVersion; + @JsonProperty("currentFirmwareCheck") + private Long currentFirmwareCheck; + @JsonProperty("wantedFirmwareCheck") + private Long wantedFirmwareCheck; @JsonProperty("failCount") private Integer failCount; @JsonProperty("environment") @@ -76,6 +82,8 @@ public class NodeRepositoryNode { private NodeHistory[] history; @JsonProperty("allowedToBeDown") private Boolean allowedToBeDown; + @JsonProperty("suspendedSince") + private Long suspendedSince; @JsonProperty("reports") private Map<String, JsonNode> reports; @JsonProperty("modelName") @@ -309,6 +317,14 @@ public class NodeRepositoryNode { return allowedToBeDown; } + public Long suspendedSince() { + return suspendedSince; + } + + public void setSuspendedSince(long suspendedSinceMillis) { + this.suspendedSince = suspendedSinceMillis; + } + public String getCurrentOsVersion() { return currentOsVersion; } @@ -325,6 +341,22 @@ public class NodeRepositoryNode { this.wantedOsVersion = wantedOsVersion; } + public Long getCurrentFirmwareCheck() { + return currentFirmwareCheck; + } + + public void setCurrentFirmwareCheck(Long currentFirmwareCheck) { + this.currentFirmwareCheck = currentFirmwareCheck; + } + + public Long getWantedFirmwareCheck() { + return wantedFirmwareCheck; + } + + public void setWantedFirmwareCheck(Long wantedFirmwareCheck) { + this.wantedFirmwareCheck = wantedFirmwareCheck; + } + public Map<String, JsonNode> getReports() { return reports; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java index 789e6ab79af..d874a8042f2 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java @@ -13,6 +13,7 @@ public class ConvergenceSummary { private final long nodes; private final long down; private final long upgradingOs; + private final long upgradingFirmware; private final long needPlatformUpgrade; private final long upgradingPlatform; private final long needReboot; @@ -22,11 +23,12 @@ public class ConvergenceSummary { private final long services; private final long needNewConfig; - public ConvergenceSummary(long nodes, long down, long upgradingOs, long needPlatformUpgrade, long upgradingPlatform, + public ConvergenceSummary(long nodes, long down, long upgradingOs, long upgradingFirmware, long needPlatformUpgrade, long upgradingPlatform, long needReboot, long rebooting, long needRestart, long restarting, long services, long needNewConfig) { this.nodes = nodes; this.down = down; this.upgradingOs = upgradingOs; + this.upgradingFirmware = upgradingFirmware; this.needPlatformUpgrade = needPlatformUpgrade; this.upgradingPlatform = upgradingPlatform; this.needReboot = needReboot; @@ -52,6 +54,11 @@ public class ConvergenceSummary { return upgradingOs; } + /** Number of nodes down for firmware upgrade. */ + public long upgradingFirmware() { + return upgradingFirmware; + } + /** Number of nodes in need of a platform upgrade. */ public long needPlatformUpgrade() { return needPlatformUpgrade; @@ -110,6 +117,7 @@ public class ConvergenceSummary { return nodes == that.nodes && down == that.down && upgradingOs == that.upgradingOs && + upgradingFirmware == that.upgradingFirmware && needPlatformUpgrade == that.needPlatformUpgrade && upgradingPlatform == that.upgradingPlatform && needReboot == that.needReboot && @@ -122,7 +130,7 @@ public class ConvergenceSummary { @Override public int hashCode() { - return Objects.hash(nodes, down, upgradingOs, needPlatformUpgrade, upgradingPlatform, needReboot, rebooting, needRestart, restarting, services, needNewConfig); + return Objects.hash(nodes, down, upgradingOs, upgradingFirmware, needPlatformUpgrade, upgradingPlatform, needReboot, rebooting, needRestart, restarting, services, needNewConfig); } } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 220925c1da7..100b66289bb 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -41,6 +41,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; +import com.yahoo.vespa.hosted.controller.maintenance.JobRunner; import com.yahoo.yolean.Exceptions; import javax.security.auth.x500.X500Principal; @@ -115,7 +116,7 @@ public class InternalStepRunner implements StepRunner { static final Duration endpointTimeout = Duration.ofMinutes(15); static final Duration testerTimeout = Duration.ofMinutes(30); - static final Duration installationTimeout = Duration.ofMinutes(150); + static final Duration installationTimeout = Duration.ofMinutes(60); static final Duration certificateTimeout = Duration.ofMinutes(300); private final Controller controller; @@ -346,13 +347,35 @@ public class InternalStepRunner implements StepRunner { return Optional.of(error); } } - controller.jobController().locked(id, lockedRun -> lockedRun.withSummary(summary)); - if (timedOut(id, deployment.get(), installationTimeout)) { + boolean failed = false; + + NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(installationTimeout)); + if ( ! suspendedTooLong.isEmpty()) { + logger.log(INFO, "Some nodes have been suspended for more than " + installationTimeout.toMinutes() + " minutes."); + failed = true; + } + + if (run.noNodesDownSince() + .map(since -> since.isBefore(controller.clock().instant().minus(installationTimeout))) + .orElse(false)) { + if (summary.needPlatformUpgrade() > 0 || summary.needReboot() > 0 || summary.needRestart() > 0) + logger.log(INFO, "No nodes allowed to suspend to progress installation for " + installationTimeout.toMinutes() + " minutes."); + else + logger.log(INFO, "Nodes not able to start with new application package."); + failed = true; + } + + Duration timeout = JobRunner.jobTimeout.minusHours(1); // Time out before job dies. + if (timedOut(id, deployment.get(), timeout)) { + logger.log(INFO, "Installation failed to complete within " + timeout.toHours() + "hours!"); + failed = true; + } + + if (failed) { logger.log(nodeList.asList().stream() .flatMap(node -> nodeDetails(node, true)) .collect(toList())); - logger.log(INFO, "Installation failed to complete within " + installationTimeout.toMinutes() + " minutes!"); return Optional.of(installationFailed); } @@ -360,6 +383,12 @@ public class InternalStepRunner implements StepRunner { logger.log(nodeList.allowedDown().asList().stream() .flatMap(node -> nodeDetails(node, false)) .collect(toList())); + + controller.jobController().locked(id, lockedRun -> { + Instant noNodesDownSince = summary.down() == 0 ? lockedRun.noNodesDownSince().orElse(controller.clock().instant()) : null; + return lockedRun.noNodesDownSince(noNodesDownSince).withSummary(summary); + }); + return Optional.empty(); } @@ -454,20 +483,23 @@ public class InternalStepRunner implements StepRunner { private Stream<String> nodeDetails(NodeWithServices node, boolean printAllServices) { return Stream.concat(Stream.of(node.node().hostname() + ": " + humanize(node.node().serviceState()), - "--- platform " + node.node().wantedVersion() + (node.node().currentVersion().equals(node.node().wantedVersion()) - ? "" - : " <-- " + (node.node().currentVersion().isEmpty() ? "not booted" : node.node().currentVersion())) + - (node.node().wantedOsVersion().isAfter(node.node().currentOsVersion()) && node.node().serviceState() == Node.ServiceState.allowedDown + "--- platform " + node.node().wantedVersion() + (node.needsPlatformUpgrade() + ? " <-- " + (node.node().currentVersion().isEmpty() ? "not booted" : node.node().currentVersion()) + : "") + + (node.needsOsUpgrade() && node.isAllowedDown() ? ", upgrading OS (" + node.node().wantedOsVersion() + " <-- " + node.node().currentOsVersion() + ")" : "") + - (node.node().wantedRestartGeneration() > node.node().restartGeneration() + (node.needsFirmwareUpgrade() && node.isAllowedDown() + ? ", upgrading firmware" + : "") + + (node.needsRestart() ? ", restart pending (" + node.node().wantedRestartGeneration() + " <-- " + node.node().restartGeneration() + ")" : "") + - (node.node().wantedRebootGeneration() > node.node().rebootGeneration() + (node.needsReboot() ? ", reboot pending (" + node.node().wantedRebootGeneration() + " <-- " + node.node().rebootGeneration() + ")" : "")), node.services().stream() - .filter(service -> printAllServices || node.wantedConfigGeneration() > service.currentGeneration()) + .filter(service -> printAllServices || node.needsNewConfig()) .map(service -> "--- " + service.type() + " on port " + service.port() + (service.currentGeneration() == -1 ? " has not started " : " has config generation " + service.currentGeneration() + ", wanted is " + node.wantedConfigGeneration()))); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java index 0feadebd6d2..d3533fc5200 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java @@ -2,19 +2,15 @@ package com.yahoo.vespa.hosted.controller.deployment; import com.yahoo.collections.AbstractFilteringList; -import com.yahoo.config.provision.HostName; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence; +import java.time.Instant; import java.util.Collection; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import java.util.function.Predicate; import java.util.stream.Collectors; import static java.util.stream.Collectors.groupingBy; -import static java.util.stream.Collectors.toList; public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> { @@ -41,38 +37,48 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> } /** The nodes on an outdated OS. */ - public NodeList upgradingOs() { - return matching(node -> node.parent().wantedOsVersion().isAfter(node.parent().currentOsVersion())); + public NodeList needsOsUpgrade() { + return matching(NodeWithServices::needsOsUpgrade); + } + + /** The nodes with outdated firmware. */ + public NodeList needsFirmwareUpgrade() { + return matching(NodeWithServices::needsFirmwareUpgrade); } /** The nodes whose parent is down. */ public NodeList withParentDown() { - return matching(node -> node.parent().serviceState() == Node.ServiceState.allowedDown); + return matching(NodeWithServices::hasParentDown); } /** The nodes on an outdated platform. */ - public NodeList upgradingPlatform() { - return matching(node -> node.node().wantedVersion().isAfter(node.node().currentVersion())); + public NodeList needsPlatformUpgrade() { + return matching(NodeWithServices::needsPlatformUpgrade); } /** The nodes in need of a reboot. */ - public NodeList rebooting() { - return matching(node -> node.node().wantedRebootGeneration() > node.node().rebootGeneration()); + public NodeList needsReboot() { + return matching(NodeWithServices::needsReboot); } /** The nodes in need of a restart. */ - public NodeList restarting() { - return matching(node -> node.node().wantedRestartGeneration() > node.node().restartGeneration()); + public NodeList needsRestart() { + return matching(NodeWithServices::needsRestart); } /** The nodes currently allowed to be down. */ public NodeList allowedDown() { - return matching(node -> node.node().serviceState() == Node.ServiceState.allowedDown); + return matching(node -> node.isAllowedDown() || node.isNewlyProvisioned()); + } + + /** The nodes which have been suspended since before the given instant. */ + public NodeList suspendedSince(Instant instant) { + return matching(node -> node.isSuspendedSince(instant)); } /** The nodes with services on outdated config generation. */ - public NodeList upgradingApplication() { - return matching(node -> node.services().stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration())); + public NodeList needsNewConfig() { + return matching(NodeWithServices::needsNewConfig); } /** Returns a summary of the convergence status of the nodes in this list. */ @@ -80,13 +86,14 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> NodeList allowedDown = allowedDown(); return new ConvergenceSummary(size(), allowedDown.size(), - withParentDown().upgradingOs().size(), - upgradingPlatform().size(), - allowedDown.upgradingPlatform().size(), - rebooting().size(), - allowedDown.rebooting().size(), - restarting().size(), - allowedDown.restarting().size(), + withParentDown().needsOsUpgrade().size(), + withParentDown().needsFirmwareUpgrade().size(), + needsPlatformUpgrade().size(), + allowedDown.needsPlatformUpgrade().size(), + needsReboot().size(), + allowedDown.needsReboot().size(), + needsRestart().size(), + allowedDown.needsRestart().size(), asList().stream().mapToLong(node -> node.services().size()).sum(), asList().stream().mapToLong(node -> node.services().stream().filter(service -> wantedConfigGeneration > service.currentGeneration()).count()).sum()); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java index c6bdd42ffb0..80c1fe0f40b 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java @@ -1,8 +1,10 @@ package com.yahoo.vespa.hosted.controller.deployment; +import com.yahoo.component.Version; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence; +import java.time.Instant; import java.util.List; import static java.util.Objects.requireNonNull; @@ -33,4 +35,48 @@ public class NodeWithServices { public long wantedConfigGeneration() { return wantedConfigGeneration; } public List<ServiceConvergence.Status> services() { return services; } + public boolean needsOsUpgrade() { + return parent.wantedOsVersion().isAfter(parent.currentOsVersion()); + } + + public boolean needsFirmwareUpgrade(){ + return parent.wantedFirmwareCheck() + .map(wanted -> parent.currentFirmwareCheck() + .map(wanted::isAfter) + .orElse(true)) + .orElse(false); + } + + public boolean hasParentDown() { + return parent.serviceState() == Node.ServiceState.allowedDown; + } + + public boolean needsPlatformUpgrade() { + return node.wantedVersion().isAfter(node.currentVersion()); + } + + public boolean needsReboot() { + return node.wantedRebootGeneration() > node.rebootGeneration(); + } + + public boolean needsRestart() { + return node.wantedRestartGeneration() > node.restartGeneration(); + } + + public boolean isAllowedDown() { + return node.serviceState() == Node.ServiceState.allowedDown; + } + + public boolean isNewlyProvisioned() { + return node.currentVersion().equals(Version.emptyVersion); + } + + public boolean isSuspendedSince(Instant instant) { + return node.suspendedSince().map(instant::isAfter).orElse(false); + } + + public boolean needsNewConfig() { + return services.stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration()); + } + } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index a3ca20fb8b4..8cd57fa7d3a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -119,7 +119,7 @@ public class Run { public Run noNodesDownSince(Instant noNodesDownSince) { requireActive(); return new Run(id, steps, versions, start, end, status, lastTestRecord, lastVespaLogTimestamp, - Optional.of(noNodesDownSince), convergenceSummary, testerCertificate); + Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate); } public Run withSummary(ConvergenceSummary convergenceSummary) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index 83173fc32a7..23e3149ec1e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -27,7 +27,7 @@ import java.util.logging.Logger; */ public class JobRunner extends Maintainer { - static final Duration jobTimeout = Duration.ofDays(1); + public static final Duration jobTimeout = Duration.ofDays(1).plusHours(1); private static final Logger log = Logger.getLogger(JobRunner.class.getName()); private final JobController jobs; diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java index 22129d8bb06..a4b0df31883 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java @@ -180,8 +180,8 @@ class RunSerializer { if ( ! summaryArray.valid()) return Optional.empty(); - if (summaryArray.entries() != 11) - throw new IllegalArgumentException("Convergence summary must have 11 entries"); + if (summaryArray.entries() != 12) + throw new IllegalArgumentException("Convergence summary must have 12 entries"); return Optional.of(new ConvergenceSummary(summaryArray.entry(0).asLong(), summaryArray.entry(1).asLong(), @@ -193,7 +193,8 @@ class RunSerializer { summaryArray.entry(7).asLong(), summaryArray.entry(8).asLong(), summaryArray.entry(9).asLong(), - summaryArray.entry(10).asLong())); + summaryArray.entry(10).asLong(), + summaryArray.entry(11).asLong())); } Slime toSlime(Iterable<Run> runs) { @@ -261,6 +262,7 @@ class RunSerializer { summaryArray.addLong(summary.nodes()); summaryArray.addLong(summary.down()); summaryArray.addLong(summary.upgradingOs()); + summaryArray.addLong(summary.upgradingFirmware()); summaryArray.addLong(summary.needPlatformUpgrade()); summaryArray.addLong(summary.upgradingPlatform()); summaryArray.addLong(summary.needReboot()); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java index 2fd64bb6ba7..91a0455db11 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java @@ -476,6 +476,7 @@ class JobControllerApiHandlerHelper { summaryObject.setLong("needRestart", summary.needRestart()); summaryObject.setLong("restarting", summary.restarting()); summaryObject.setLong("upgradingOs", summary.upgradingOs()); + summaryObject.setLong("upgradingFirmware", summary.upgradingFirmware()); summaryObject.setLong("services", summary.services()); summaryObject.setLong("needNewConfig", summary.needNewConfig()); } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java index f84bd86194a..ff66ab38d32 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java @@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbi import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId; import com.yahoo.vespa.hosted.controller.api.integration.LogEntry; import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException; +import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType; import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId; import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud; @@ -191,6 +192,53 @@ public class InternalStepRunnerTest { } @Test + public void timesOutWithoutInstallationProgress() { + tester.controllerTester().upgradeSystem(new Version("7.1")); + tester.controllerTester().computeVersionStatus(); + tester.upgrader().maintain(); + app.newRun(JobType.systemTest); + + // Node is down too long in system test, and no nodes go down in staging. + tester.runner().run(); + tester.setEndpoints(app.testerId().id(), JobType.systemTest.zone(system())); + tester.configServer().setVersion(app.testerId().id(), JobType.systemTest.zone(system()), tester.controller().systemVersion()); + tester.configServer().convergeServices(app.testerId().id(), JobType.systemTest.zone(system())); + tester.setEndpoints(app.instanceId(), JobType.systemTest.zone(system())); + tester.setEndpoints(app.testerId().id(), JobType.stagingTest.zone(system())); + tester.configServer().setVersion(app.testerId().id(), JobType.stagingTest.zone(system()), tester.controller().systemVersion()); + tester.configServer().convergeServices(app.testerId().id(), JobType.stagingTest.zone(system())); + tester.setEndpoints(app.instanceId(), JobType.stagingTest.zone(system())); + tester.runner().run(); + assertEquals(succeeded, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installTester)); + assertEquals(succeeded, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installTester)); + + Node systemTestNode = tester.configServer().nodeRepository().list(JobType.systemTest.zone(system()), + app.instanceId()).iterator().next(); + tester.clock().advance(InternalStepRunner.installationTimeout.minus(Duration.ofSeconds(1))); + tester.configServer().nodeRepository().putByHostname(JobType.systemTest.zone(system()), + new Node.Builder(systemTestNode) + .serviceState(Node.ServiceState.allowedDown) + .suspendedSince(tester.clock().instant()) + .build()); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal)); + + tester.clock().advance(Duration.ofSeconds(2)); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); + assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal)); + + tester.clock().advance(InternalStepRunner.installationTimeout.minus(Duration.ofSeconds(3))); + tester.runner().run(); + assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); + + tester.clock().advance(Duration.ofSeconds(2)); + tester.runner().run(); + assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); + } + + @Test public void startingTestsFailsIfDeploymentExpires() { app.newRun(JobType.systemTest); tester.runner().run(); @@ -211,19 +259,6 @@ public class InternalStepRunnerTest { } @Test - public void startTestsFailsIfDeploymentExpires() { - app.newRun(JobType.systemTest); - tester.runner().run(); - tester.configServer().convergeServices(app.instanceId(), JobType.systemTest.zone(system())); - tester.configServer().convergeServices(app.testerId().id(), JobType.systemTest.zone(system())); - tester.runner().run(); - - tester.applications().deactivate(app.instanceId(), JobType.systemTest.zone(system())); - tester.runner().run(); - assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.startTests)); - } - - @Test public void alternativeEndpointsAreDetected() { app.newRun(JobType.systemTest); tester.runner().run();; diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java index 3839e2103cd..e5757604caf 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java @@ -100,7 +100,7 @@ public class RunSerializerTest { "badb17"), 122), run.versions().sourceApplication().get()); - assertEquals(Optional.of(new ConvergenceSummary(1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89)), + assertEquals(Optional.of(new ConvergenceSummary(1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144)), run.convergenceSummary()); assertEquals(X509CertificateUtils.fromPem("-----BEGIN CERTIFICATE-----\n" + "MIIBEzCBu6ADAgECAgEBMAoGCCqGSM49BAMEMBQxEjAQBgNVBAMTCW15c2Vydmlj\n" + diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json index a66b9d3e955..a7e5d249a9d 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json @@ -8,7 +8,7 @@ "lastTestRecord": 3, "lastVespaLogTimestamp": 1196676930000432, "noNodesDownSince": 321321321321, - "convergenceSummary": [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89], + "convergenceSummary": [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144], "testerCertificate": "-----BEGIN CERTIFICATE-----\nMIIBEzCBu6ADAgECAgEBMAoGCCqGSM49BAMEMBQxEjAQBgNVBAMTCW15c2Vydmlj\nZTAeFw0xOTA5MDYwNzM3MDZaFw0xOTA5MDcwNzM3MDZaMBQxEjAQBgNVBAMTCW15\nc2VydmljZTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABM0JhD8fV2DlAkjQOGX3\nY50ryMBr3g2+v/uFiRoxJ1muuSOWYrW7HCQIGuzc04fa0QwtaX/voAZKCV51t6jF\n0fwwCgYIKoZIzj0EAwQDRwAwRAIgVbQ3Co1H4X0gmRrtXSyTU0HgBQu9PXHMmX20\n5MyyPSoCIBltOcmaPfdN03L3zqbqZ6PgUBWsvAHgiBzL3hrtJ+iy\n-----END CERTIFICATE-----", "steps": { "deployInitialReal": "unfinished", diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json index fac963fd5eb..e1c2310ce7e 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json @@ -68,6 +68,7 @@ "needRestart": 0, "restarting": 0, "upgradingOs": 0, + "upgradingFirmware": 0, "services": 1, "needNewConfig": 1 } diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json index 5559ac952a2..273887c26c4 100644 --- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json +++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json @@ -152,6 +152,7 @@ "needRestart": 0, "restarting": 0, "upgradingOs": 0, + "upgradingFirmware": 0, "services": 1, "needNewConfig": 1 } |