summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Marius Venstad <jonmv@users.noreply.github.com> 2020-01-25 21:44:05 +0100
committer GitHub <noreply@github.com> 2020-01-25 21:44:05 +0100
commit 3bf181256ef24570f14c977ec37d23b754a7df53 (patch)
tree e275fdbdaad2163bc8ff86245024b0271b35d946
parent 4691c2159b80e9ba7dbdc2a0ccdda60cc9510790 (diff)
parent 91d5b1c4b828b8c8a85c40ed0a507533e7c2aa47 (diff)
Merge pull request #11940 from vespa-engine/jvenstad/adaptive-deployment-job-timeout
Jvenstad/adaptive deployment job timeout
-rw-r--r--controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java52
-rw-r--r--controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java4
-rw-r--r--controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java32
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java12
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java54
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java55
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java46
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java8
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java61
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java2
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json2
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json1
-rw-r--r--controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json1
16 files changed, 273 insertions, 62 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
index 3877119e8b0..d8103c864df 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TenantName;
+import java.time.Instant;
import java.util.Objects;
import java.util.Optional;
@@ -30,6 +31,9 @@ public class Node {
private final Version currentOsVersion;
private final Version wantedOsVersion;
private final ServiceState serviceState;
+ private final Optional<Instant> suspendedSince;
+ private final Optional<Instant> currentFirmwareCheck;
+ private final Optional<Instant> wantedFirmwareCheck;
private final long restartGeneration;
private final long wantedRestartGeneration;
private final long rebootGeneration;
@@ -43,8 +47,9 @@ public class Node {
private final Optional<TenantName> reservedTo;
public Node(HostName hostname, Optional<HostName> parentHostname, State state, NodeType type, NodeResources resources, Optional<ApplicationId> owner,
- Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion, ServiceState serviceState,
- long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration,
+ Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion,
+ Optional<Instant> currentFirmwareCheck, Optional<Instant> wantedFirmwareCheck, ServiceState serviceState,
+ Optional<Instant> suspendedSince, long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration,
int cost, String flavor, String clusterId, ClusterType clusterType, boolean wantToRetire, boolean wantToDeprovision,
Optional<TenantName> reservedTo) {
this.hostname = hostname;
@@ -57,7 +62,10 @@ public class Node {
this.wantedVersion = wantedVersion;
this.currentOsVersion = currentOsVersion;
this.wantedOsVersion = wantedOsVersion;
+ this.currentFirmwareCheck = currentFirmwareCheck;
+ this.wantedFirmwareCheck = wantedFirmwareCheck;
this.serviceState = serviceState;
+ this.suspendedSince = suspendedSince;
this.restartGeneration = restartGeneration;
this.wantedRestartGeneration = wantedRestartGeneration;
this.rebootGeneration = rebootGeneration;
@@ -109,10 +117,22 @@ public class Node {
return wantedOsVersion;
}
+ public Optional<Instant> currentFirmwareCheck() { // Instant of this node's current firmware check; empty if none was given.
+ return currentFirmwareCheck;
+ }
+
+ public Optional<Instant> wantedFirmwareCheck() { // Instant of this node's wanted firmware check; empty if none was given.
+ return wantedFirmwareCheck;
+ }
+
public ServiceState serviceState() { // The service state this node was constructed with.
return serviceState;
}
+ public Optional<Instant> suspendedSince() { // Instant since which this node has been suspended; empty if none was given.
+ return suspendedSince;
+ }
+
public long restartGeneration() { // The restart generation this node was constructed with.
return restartGeneration;
}
@@ -208,7 +228,10 @@ public class Node {
private Version wantedVersion;
private Version currentOsVersion;
private Version wantedOsVersion;
+ private Optional<Instant> currentFirmwareCheck = Optional.empty();
+ private Optional<Instant> wantedFirmwareCheck = Optional.empty();
private ServiceState serviceState;
+ private Optional<Instant> suspendedSince = Optional.empty();
private long restartGeneration;
private long wantedRestartGeneration;
private long rebootGeneration;
@@ -234,7 +257,10 @@ public class Node {
this.wantedVersion = node.wantedVersion;
this.currentOsVersion = node.currentOsVersion;
this.wantedOsVersion = node.wantedOsVersion;
+ this.currentFirmwareCheck = node.currentFirmwareCheck;
+ this.wantedFirmwareCheck = node.wantedFirmwareCheck;
this.serviceState = node.serviceState;
+ this.suspendedSince = node.suspendedSince;
this.restartGeneration = node.restartGeneration;
this.wantedRestartGeneration = node.wantedRestartGeneration;
this.rebootGeneration = node.rebootGeneration;
@@ -298,11 +324,26 @@ public class Node {
return this;
}
+ public Builder currentFirmwareCheck(Instant currentFirmwareCheck) { // Sets the current firmware check; null is accepted.
+ this.currentFirmwareCheck = Optional.ofNullable(currentFirmwareCheck); // null is stored as an empty Optional
+ return this; // returns this builder, for chaining
+ }
+
+ public Builder wantedFirmwareCheck(Instant wantedFirmwareCheck) { // Sets the wanted firmware check; null is accepted.
+ this.wantedFirmwareCheck = Optional.ofNullable(wantedFirmwareCheck); // null is stored as an empty Optional
+ return this;
+ }
+
public Builder serviceState(ServiceState serviceState) { // Sets the service state, stored exactly as given.
this.serviceState = serviceState;
return this;
}
+ public Builder suspendedSince(Instant suspendedSince) { // Sets the suspension time; null is accepted.
+ this.suspendedSince = Optional.ofNullable(suspendedSince); // null is stored as an empty Optional
+ return this;
+ }
+
public Builder restartGeneration(long restartGeneration) {
this.restartGeneration = restartGeneration;
return this;
@@ -359,9 +400,10 @@ public class Node {
}
public Node build() { // Creates a Node from the builder's fields, passed in the Node constructor's parameter order.
- return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion, currentOsVersion,
- wantedOsVersion, serviceState, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration,
- cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo);
+ return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion,
+ currentOsVersion, wantedOsVersion, currentFirmwareCheck, wantedFirmwareCheck, serviceState, // firmware checks precede serviceState
+ suspendedSince, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, // suspendedSince follows serviceState
+ cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo);
}
}
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
index 58122b04712..22c373e97ee 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
@@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb
import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode;
import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState;
+import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
@@ -107,7 +108,10 @@ public interface NodeRepository {
versionFrom(node.getWantedVespaVersion()),
versionFrom(node.getCurrentOsVersion()),
versionFrom(node.getWantedOsVersion()),
+ Optional.ofNullable(node.getCurrentFirmwareCheck()).map(Instant::ofEpochMilli),
+ Optional.ofNullable(node.getWantedFirmwareCheck()).map(Instant::ofEpochMilli),
fromBoolean(node.getAllowedToBeDown()),
+ Optional.ofNullable(node.suspendedSince()).map(Instant::ofEpochMilli),
toInt(node.getCurrentRestartGeneration()),
toInt(node.getRestartGeneration()),
toInt(node.getCurrentRebootGeneration()),
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
index d9957d77ff3..2abf40be527 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
@@ -6,8 +6,10 @@ import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
+import java.time.Instant;
import java.util.Arrays;
import java.util.Map;
+import java.util.Optional;
import java.util.Set;
@JsonIgnoreProperties(ignoreUnknown = true)
@@ -54,6 +56,10 @@ public class NodeRepositoryNode {
private String currentOsVersion;
@JsonProperty("wantedOsVersion")
private String wantedOsVersion;
+ @JsonProperty("currentFirmwareCheck")
+ private Long currentFirmwareCheck;
+ @JsonProperty("wantedFirmwareCheck")
+ private Long wantedFirmwareCheck;
@JsonProperty("failCount")
private Integer failCount;
@JsonProperty("environment")
@@ -76,6 +82,8 @@ public class NodeRepositoryNode {
private NodeHistory[] history;
@JsonProperty("allowedToBeDown")
private Boolean allowedToBeDown;
+ @JsonProperty("suspendedSince")
+ private Long suspendedSince;
@JsonProperty("reports")
private Map<String, JsonNode> reports;
@JsonProperty("modelName")
@@ -309,6 +317,14 @@ public class NodeRepositoryNode {
return allowedToBeDown;
}
+ public Long suspendedSince() { // Raw "suspendedSince" value; may be null. NodeRepository converts it with Instant.ofEpochMilli.
+ return suspendedSince;
+ }
+
+ public void setSuspendedSince(long suspendedSinceMillis) { // NOTE(review): takes a primitive long, unlike the Long-typed firmware check setters, so it cannot set null.
+ this.suspendedSince = suspendedSinceMillis; // boxed into the Long field
+ }
+
public String getCurrentOsVersion() { // Raw "currentOsVersion" string.
return currentOsVersion;
}
@@ -325,6 +341,22 @@ public class NodeRepositoryNode {
this.wantedOsVersion = wantedOsVersion;
}
+ public Long getCurrentFirmwareCheck() { // Raw "currentFirmwareCheck" value; may be null. NodeRepository converts it with Instant.ofEpochMilli.
+ return currentFirmwareCheck;
+ }
+
+ public void setCurrentFirmwareCheck(Long currentFirmwareCheck) { // Stores the value as given, including null.
+ this.currentFirmwareCheck = currentFirmwareCheck;
+ }
+
+ public Long getWantedFirmwareCheck() { // Raw "wantedFirmwareCheck" value; may be null. NodeRepository converts it with Instant.ofEpochMilli.
+ return wantedFirmwareCheck;
+ }
+
+ public void setWantedFirmwareCheck(Long wantedFirmwareCheck) { // Stores the value as given, including null.
+ this.wantedFirmwareCheck = wantedFirmwareCheck;
+ }
+
public Map<String, JsonNode> getReports() { // The "reports" map, returned without copying.
return reports;
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java
index 789e6ab79af..d874a8042f2 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/ConvergenceSummary.java
@@ -13,6 +13,7 @@ public class ConvergenceSummary {
private final long nodes;
private final long down;
private final long upgradingOs;
+ private final long upgradingFirmware;
private final long needPlatformUpgrade;
private final long upgradingPlatform;
private final long needReboot;
@@ -22,11 +23,12 @@ public class ConvergenceSummary {
private final long services;
private final long needNewConfig;
- public ConvergenceSummary(long nodes, long down, long upgradingOs, long needPlatformUpgrade, long upgradingPlatform,
+ public ConvergenceSummary(long nodes, long down, long upgradingOs, long upgradingFirmware, long needPlatformUpgrade, long upgradingPlatform,
long needReboot, long rebooting, long needRestart, long restarting, long services, long needNewConfig) {
this.nodes = nodes;
this.down = down;
this.upgradingOs = upgradingOs;
+ this.upgradingFirmware = upgradingFirmware;
this.needPlatformUpgrade = needPlatformUpgrade;
this.upgradingPlatform = upgradingPlatform;
this.needReboot = needReboot;
@@ -52,6 +54,11 @@ public class ConvergenceSummary {
return upgradingOs;
}
+ /** Number of nodes down for firmware upgrade; NodeList counts these as nodes whose parent is allowed down and needs a firmware upgrade. */
+ public long upgradingFirmware() {
+ return upgradingFirmware;
+ }
+
/** Number of nodes in need of a platform upgrade. */
public long needPlatformUpgrade() {
return needPlatformUpgrade;
@@ -110,6 +117,7 @@ public class ConvergenceSummary {
return nodes == that.nodes &&
down == that.down &&
upgradingOs == that.upgradingOs &&
+ upgradingFirmware == that.upgradingFirmware &&
needPlatformUpgrade == that.needPlatformUpgrade &&
upgradingPlatform == that.upgradingPlatform &&
needReboot == that.needReboot &&
@@ -122,7 +130,7 @@ public class ConvergenceSummary {
@Override
public int hashCode() { // Hashes the same fields the equals comparison in this class checks, now including upgradingFirmware.
- return Objects.hash(nodes, down, upgradingOs, needPlatformUpgrade, upgradingPlatform, needReboot, rebooting, needRestart, restarting, services, needNewConfig);
+ return Objects.hash(nodes, down, upgradingOs, upgradingFirmware, needPlatformUpgrade, upgradingPlatform, needReboot, rebooting, needRestart, restarting, services, needNewConfig);
}
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 220925c1da7..100b66289bb 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -41,6 +41,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
+import com.yahoo.vespa.hosted.controller.maintenance.JobRunner;
import com.yahoo.yolean.Exceptions;
import javax.security.auth.x500.X500Principal;
@@ -115,7 +116,7 @@ public class InternalStepRunner implements StepRunner {
static final Duration endpointTimeout = Duration.ofMinutes(15);
static final Duration testerTimeout = Duration.ofMinutes(30);
- static final Duration installationTimeout = Duration.ofMinutes(150);
+ static final Duration installationTimeout = Duration.ofMinutes(60);
static final Duration certificateTimeout = Duration.ofMinutes(300);
private final Controller controller;
@@ -346,13 +347,35 @@ public class InternalStepRunner implements StepRunner {
return Optional.of(error);
}
}
- controller.jobController().locked(id, lockedRun -> lockedRun.withSummary(summary));
- if (timedOut(id, deployment.get(), installationTimeout)) {
+ boolean failed = false;
+
+ NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(installationTimeout));
+ if ( ! suspendedTooLong.isEmpty()) {
+ logger.log(INFO, "Some nodes have been suspended for more than " + installationTimeout.toMinutes() + " minutes.");
+ failed = true;
+ }
+
+ if (run.noNodesDownSince()
+ .map(since -> since.isBefore(controller.clock().instant().minus(installationTimeout)))
+ .orElse(false)) {
+ if (summary.needPlatformUpgrade() > 0 || summary.needReboot() > 0 || summary.needRestart() > 0)
+ logger.log(INFO, "No nodes allowed to suspend to progress installation for " + installationTimeout.toMinutes() + " minutes.");
+ else
+ logger.log(INFO, "Nodes not able to start with new application package.");
+ failed = true;
+ }
+
+ Duration timeout = JobRunner.jobTimeout.minusHours(1); // Time out before job dies.
+ if (timedOut(id, deployment.get(), timeout)) {
+ logger.log(INFO, "Installation failed to complete within " + timeout.toHours() + "hours!");
+ failed = true;
+ }
+
+ if (failed) {
logger.log(nodeList.asList().stream()
.flatMap(node -> nodeDetails(node, true))
.collect(toList()));
- logger.log(INFO, "Installation failed to complete within " + installationTimeout.toMinutes() + " minutes!");
return Optional.of(installationFailed);
}
@@ -360,6 +383,12 @@ public class InternalStepRunner implements StepRunner {
logger.log(nodeList.allowedDown().asList().stream()
.flatMap(node -> nodeDetails(node, false))
.collect(toList()));
+
+ controller.jobController().locked(id, lockedRun -> {
+ Instant noNodesDownSince = summary.down() == 0 ? lockedRun.noNodesDownSince().orElse(controller.clock().instant()) : null;
+ return lockedRun.noNodesDownSince(noNodesDownSince).withSummary(summary);
+ });
+
return Optional.empty();
}
@@ -454,20 +483,23 @@ public class InternalStepRunner implements StepRunner {
private Stream<String> nodeDetails(NodeWithServices node, boolean printAllServices) {
return Stream.concat(Stream.of(node.node().hostname() + ": " + humanize(node.node().serviceState()),
- "--- platform " + node.node().wantedVersion() + (node.node().currentVersion().equals(node.node().wantedVersion())
- ? ""
- : " <-- " + (node.node().currentVersion().isEmpty() ? "not booted" : node.node().currentVersion())) +
- (node.node().wantedOsVersion().isAfter(node.node().currentOsVersion()) && node.node().serviceState() == Node.ServiceState.allowedDown
+ "--- platform " + node.node().wantedVersion() + (node.needsPlatformUpgrade()
+ ? " <-- " + (node.node().currentVersion().isEmpty() ? "not booted" : node.node().currentVersion())
+ : "") +
+ (node.needsOsUpgrade() && node.isAllowedDown()
? ", upgrading OS (" + node.node().wantedOsVersion() + " <-- " + node.node().currentOsVersion() + ")"
: "") +
- (node.node().wantedRestartGeneration() > node.node().restartGeneration()
+ (node.needsFirmwareUpgrade() && node.isAllowedDown()
+ ? ", upgrading firmware"
+ : "") +
+ (node.needsRestart()
? ", restart pending (" + node.node().wantedRestartGeneration() + " <-- " + node.node().restartGeneration() + ")"
: "") +
- (node.node().wantedRebootGeneration() > node.node().rebootGeneration()
+ (node.needsReboot()
? ", reboot pending (" + node.node().wantedRebootGeneration() + " <-- " + node.node().rebootGeneration() + ")"
: "")),
node.services().stream()
- .filter(service -> printAllServices || node.wantedConfigGeneration() > service.currentGeneration())
+ .filter(service -> printAllServices || node.needsNewConfig())
.map(service -> "--- " + service.type() + " on port " + service.port() + (service.currentGeneration() == -1
? " has not started "
: " has config generation " + service.currentGeneration() + ", wanted is " + node.wantedConfigGeneration())));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
index 0feadebd6d2..d3533fc5200 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
@@ -2,19 +2,15 @@ package com.yahoo.vespa.hosted.controller.deployment;
import com.yahoo.collections.AbstractFilteringList;
-import com.yahoo.config.provision.HostName;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence;
+import java.time.Instant;
import java.util.Collection;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import java.util.function.Predicate;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.groupingBy;
-import static java.util.stream.Collectors.toList;
public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> {
@@ -41,38 +37,48 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList>
}
/** The nodes whose parent has a wanted OS version which is after its current OS version. */
- public NodeList upgradingOs() {
- return matching(node -> node.parent().wantedOsVersion().isAfter(node.parent().currentOsVersion()));
+ public NodeList needsOsUpgrade() {
+ return matching(NodeWithServices::needsOsUpgrade);
+ }
+
+ /** The nodes whose parent has a wanted firmware check which is after its current one, or which has no current one. */
+ public NodeList needsFirmwareUpgrade() {
+ return matching(NodeWithServices::needsFirmwareUpgrade);
}
/** The nodes whose parent is in service state allowedDown. */
public NodeList withParentDown() {
- return matching(node -> node.parent().serviceState() == Node.ServiceState.allowedDown);
+ return matching(NodeWithServices::hasParentDown);
}
/** The nodes whose wanted platform version is after their current version. */
- public NodeList upgradingPlatform() {
- return matching(node -> node.node().wantedVersion().isAfter(node.node().currentVersion()));
+ public NodeList needsPlatformUpgrade() {
+ return matching(NodeWithServices::needsPlatformUpgrade);
}
/** The nodes in need of a reboot, i.e., whose wanted reboot generation is greater than their current one. */
- public NodeList rebooting() {
- return matching(node -> node.node().wantedRebootGeneration() > node.node().rebootGeneration());
+ public NodeList needsReboot() {
+ return matching(NodeWithServices::needsReboot);
}
/** The nodes in need of a restart, i.e., whose wanted restart generation is greater than their current one. */
- public NodeList restarting() {
- return matching(node -> node.node().wantedRestartGeneration() > node.node().restartGeneration());
+ public NodeList needsRestart() {
+ return matching(NodeWithServices::needsRestart);
}
/** The nodes currently allowed to be down; with the added line this also includes newly provisioned nodes. */
public NodeList allowedDown() {
- return matching(node -> node.node().serviceState() == Node.ServiceState.allowedDown);
+ return matching(node -> node.isAllowedDown() || node.isNewlyProvisioned()); // newly provisioned: current version is the empty version
+ }
+
+ /** The nodes which have been suspended since before the given instant; nodes with no suspension time never match. */
+ public NodeList suspendedSince(Instant instant) {
+ return matching(node -> node.isSuspendedSince(instant));
}
/** The nodes with at least one service whose current config generation is lower than the wanted config generation. */
- public NodeList upgradingApplication() {
- return matching(node -> node.services().stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration()));
+ public NodeList needsNewConfig() {
+ return matching(NodeWithServices::needsNewConfig);
}
/** Returns a summary of the convergence status of the nodes in this list. */
@@ -80,13 +86,14 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList>
NodeList allowedDown = allowedDown();
return new ConvergenceSummary(size(),
allowedDown.size(),
- withParentDown().upgradingOs().size(),
- upgradingPlatform().size(),
- allowedDown.upgradingPlatform().size(),
- rebooting().size(),
- allowedDown.rebooting().size(),
- restarting().size(),
- allowedDown.restarting().size(),
+ withParentDown().needsOsUpgrade().size(),
+ withParentDown().needsFirmwareUpgrade().size(),
+ needsPlatformUpgrade().size(),
+ allowedDown.needsPlatformUpgrade().size(),
+ needsReboot().size(),
+ allowedDown.needsReboot().size(),
+ needsRestart().size(),
+ allowedDown.needsRestart().size(),
asList().stream().mapToLong(node -> node.services().size()).sum(),
asList().stream().mapToLong(node -> node.services().stream().filter(service -> wantedConfigGeneration > service.currentGeneration()).count()).sum());
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
index c6bdd42ffb0..80c1fe0f40b 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeWithServices.java
@@ -1,8 +1,10 @@
package com.yahoo.vespa.hosted.controller.deployment;
+import com.yahoo.component.Version;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence;
+import java.time.Instant;
import java.util.List;
import static java.util.Objects.requireNonNull;
@@ -33,4 +35,48 @@ public class NodeWithServices {
public long wantedConfigGeneration() { return wantedConfigGeneration; } // The config generation the services are compared against.
public List<ServiceConvergence.Status> services() { return services; } // The service statuses held for this node.
+ public boolean needsOsUpgrade() { // True when the parent's wanted OS version is after its current OS version.
+ return parent.wantedOsVersion().isAfter(parent.currentOsVersion()); // reads the parent, not this node
+ }
+
+ public boolean needsFirmwareUpgrade(){ // True when the parent has a wanted firmware check that is not yet met by its current one.
+ return parent.wantedFirmwareCheck()
+ .map(wanted -> parent.currentFirmwareCheck()
+ .map(wanted::isAfter) // both present: needed only if wanted is strictly after current
+ .orElse(true)) // wanted present, no current check: needed
+ .orElse(false); // no wanted check: not needed
+ }
+
+ public boolean hasParentDown() { // True when the parent's service state is allowedDown.
+ return parent.serviceState() == Node.ServiceState.allowedDown;
+ }
+
+ public boolean needsPlatformUpgrade() { // True when this node's wanted version is after its current version.
+ return node.wantedVersion().isAfter(node.currentVersion());
+ }
+
+ public boolean needsReboot() { // True when the wanted reboot generation is greater than the current reboot generation.
+ return node.wantedRebootGeneration() > node.rebootGeneration();
+ }
+
+ public boolean needsRestart() { // True when the wanted restart generation is greater than the current restart generation.
+ return node.wantedRestartGeneration() > node.restartGeneration();
+ }
+
+ public boolean isAllowedDown() { // True when this node's own service state is allowedDown.
+ return node.serviceState() == Node.ServiceState.allowedDown;
+ }
+
+ public boolean isNewlyProvisioned() { // True when this node's current version equals the empty version.
+ return node.currentVersion().equals(Version.emptyVersion);
+ }
+
+ public boolean isSuspendedSince(Instant instant) { // True when this node has a suspension time strictly before the given instant.
+ return node.suspendedSince().map(instant::isAfter).orElse(false); // no suspension time: false
+ }
+
+ public boolean needsNewConfig() { // True when any service has a current config generation lower than the wanted one.
+ return services.stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration());
+ }
+
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
index a3ca20fb8b4..8cd57fa7d3a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
@@ -119,7 +119,7 @@ public class Run {
public Run noNodesDownSince(Instant noNodesDownSince) { // Returns a new Run with the given time and all other fields carried over.
requireActive(); // presumably rejects runs that are no longer active; definition not visible here
return new Run(id, steps, versions, start, end, status, lastTestRecord, lastVespaLogTimestamp,
- Optional.of(noNodesDownSince), convergenceSummary, testerCertificate);
+ Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate); // null is now accepted and stored as empty
}
public Run withSummary(ConvergenceSummary convergenceSummary) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
index 83173fc32a7..23e3149ec1e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
@@ -27,7 +27,7 @@ import java.util.logging.Logger;
*/
public class JobRunner extends Maintainer {
- static final Duration jobTimeout = Duration.ofDays(1);
+ public static final Duration jobTimeout = Duration.ofDays(1).plusHours(1);
private static final Logger log = Logger.getLogger(JobRunner.class.getName());
private final JobController jobs;
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
index 22129d8bb06..a4b0df31883 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializer.java
@@ -180,8 +180,8 @@ class RunSerializer {
if ( ! summaryArray.valid())
return Optional.empty();
- if (summaryArray.entries() != 11)
- throw new IllegalArgumentException("Convergence summary must have 11 entries");
+ if (summaryArray.entries() != 12)
+ throw new IllegalArgumentException("Convergence summary must have 12 entries");
return Optional.of(new ConvergenceSummary(summaryArray.entry(0).asLong(),
summaryArray.entry(1).asLong(),
@@ -193,7 +193,8 @@ class RunSerializer {
summaryArray.entry(7).asLong(),
summaryArray.entry(8).asLong(),
summaryArray.entry(9).asLong(),
- summaryArray.entry(10).asLong()));
+ summaryArray.entry(10).asLong(),
+ summaryArray.entry(11).asLong()));
}
Slime toSlime(Iterable<Run> runs) {
@@ -261,6 +262,7 @@ class RunSerializer {
summaryArray.addLong(summary.nodes());
summaryArray.addLong(summary.down());
summaryArray.addLong(summary.upgradingOs());
+ summaryArray.addLong(summary.upgradingFirmware());
summaryArray.addLong(summary.needPlatformUpgrade());
summaryArray.addLong(summary.upgradingPlatform());
summaryArray.addLong(summary.needReboot());
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
index 2fd64bb6ba7..91a0455db11 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/restapi/application/JobControllerApiHandlerHelper.java
@@ -476,6 +476,7 @@ class JobControllerApiHandlerHelper {
summaryObject.setLong("needRestart", summary.needRestart());
summaryObject.setLong("restarting", summary.restarting());
summaryObject.setLong("upgradingOs", summary.upgradingOs());
+ summaryObject.setLong("upgradingFirmware", summary.upgradingFirmware());
summaryObject.setLong("services", summary.services());
summaryObject.setLong("needNewConfig", summary.needNewConfig());
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
index f84bd86194a..ff66ab38d32 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunnerTest.java
@@ -19,6 +19,7 @@ import com.yahoo.vespa.hosted.controller.api.application.v4.model.configserverbi
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.integration.LogEntry;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException;
+import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud;
@@ -191,6 +192,53 @@ public class InternalStepRunnerTest {
}
@Test
+ public void timesOutWithoutInstallationProgress() { // Checks when the real-application install steps are failed relative to InternalStepRunner.installationTimeout.
+ tester.controllerTester().upgradeSystem(new Version("7.1"));
+ tester.controllerTester().computeVersionStatus();
+ tester.upgrader().maintain();
+ app.newRun(JobType.systemTest);
+
+ // Node is down too long in system test, and no nodes go down in staging.
+ tester.runner().run();
+ tester.setEndpoints(app.testerId().id(), JobType.systemTest.zone(system()));
+ tester.configServer().setVersion(app.testerId().id(), JobType.systemTest.zone(system()), tester.controller().systemVersion());
+ tester.configServer().convergeServices(app.testerId().id(), JobType.systemTest.zone(system())); // Only the tester's services are converged here, not the real application's.
+ tester.setEndpoints(app.instanceId(), JobType.systemTest.zone(system()));
+ tester.setEndpoints(app.testerId().id(), JobType.stagingTest.zone(system()));
+ tester.configServer().setVersion(app.testerId().id(), JobType.stagingTest.zone(system()), tester.controller().systemVersion());
+ tester.configServer().convergeServices(app.testerId().id(), JobType.stagingTest.zone(system())); // Same for staging: tester only.
+ tester.setEndpoints(app.instanceId(), JobType.stagingTest.zone(system()));
+ tester.runner().run();
+ assertEquals(succeeded, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installTester));
+ assertEquals(succeeded, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installTester));
+
+ Node systemTestNode = tester.configServer().nodeRepository().list(JobType.systemTest.zone(system()),
+ app.instanceId()).iterator().next();
+ tester.clock().advance(InternalStepRunner.installationTimeout.minus(Duration.ofSeconds(1))); // Total clock advance so far: installationTimeout - 1 s.
+ tester.configServer().nodeRepository().putByHostname(JobType.systemTest.zone(system()), // Replace the system test node with a copy that is allowed down and suspended as of the current clock instant.
+ new Node.Builder(systemTestNode)
+ .serviceState(Node.ServiceState.allowedDown)
+ .suspendedSince(tester.clock().instant())
+ .build());
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal));
+
+ tester.clock().advance(Duration.ofSeconds(2)); // Total clock advance: installationTimeout + 1 s; 2 s since the node was suspended.
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal)); // NOTE(review): presumably the suspended node extends this job's deadline — confirm against InternalStepRunner.
+ assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.stagingTest).get().stepStatuses().get(Step.installInitialReal)); // Staging had no node taken down, and is expected to fail here.
+
+ tester.clock().advance(InternalStepRunner.installationTimeout.minus(Duration.ofSeconds(3))); // Time since suspension: installationTimeout - 1 s.
+ tester.runner().run();
+ assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
+
+ tester.clock().advance(Duration.ofSeconds(2)); // Time since suspension: installationTimeout + 1 s.
+ tester.runner().run();
+ assertEquals(failed, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.installReal));
+ }
+
+ @Test
public void startingTestsFailsIfDeploymentExpires() {
app.newRun(JobType.systemTest);
tester.runner().run();
@@ -211,19 +259,6 @@ public class InternalStepRunnerTest {
}
@Test
- public void startTestsFailsIfDeploymentExpires() {
- app.newRun(JobType.systemTest);
- tester.runner().run();
- tester.configServer().convergeServices(app.instanceId(), JobType.systemTest.zone(system()));
- tester.configServer().convergeServices(app.testerId().id(), JobType.systemTest.zone(system()));
- tester.runner().run();
-
- tester.applications().deactivate(app.instanceId(), JobType.systemTest.zone(system()));
- tester.runner().run();
- assertEquals(unfinished, tester.jobs().last(app.instanceId(), JobType.systemTest).get().stepStatuses().get(Step.startTests));
- }
-
- @Test
public void alternativeEndpointsAreDetected() {
app.newRun(JobType.systemTest);
tester.runner().run();;
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java
index 3839e2103cd..e5757604caf 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/RunSerializerTest.java
@@ -100,7 +100,7 @@ public class RunSerializerTest {
"badb17"),
122),
run.versions().sourceApplication().get());
- assertEquals(Optional.of(new ConvergenceSummary(1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89)),
+ assertEquals(Optional.of(new ConvergenceSummary(1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144)),
run.convergenceSummary());
assertEquals(X509CertificateUtils.fromPem("-----BEGIN CERTIFICATE-----\n" +
"MIIBEzCBu6ADAgECAgEBMAoGCCqGSM49BAMEMBQxEjAQBgNVBAMTCW15c2Vydmlj\n" +
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
index a66b9d3e955..a7e5d249a9d 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/persistence/testdata/run-status.json
@@ -8,7 +8,7 @@
"lastTestRecord": 3,
"lastVespaLogTimestamp": 1196676930000432,
"noNodesDownSince": 321321321321,
- "convergenceSummary": [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89],
+ "convergenceSummary": [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144],
"testerCertificate": "-----BEGIN CERTIFICATE-----\nMIIBEzCBu6ADAgECAgEBMAoGCCqGSM49BAMEMBQxEjAQBgNVBAMTCW15c2Vydmlj\nZTAeFw0xOTA5MDYwNzM3MDZaFw0xOTA5MDcwNzM3MDZaMBQxEjAQBgNVBAMTCW15\nc2VydmljZTBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABM0JhD8fV2DlAkjQOGX3\nY50ryMBr3g2+v/uFiRoxJ1muuSOWYrW7HCQIGuzc04fa0QwtaX/voAZKCV51t6jF\n0fwwCgYIKoZIzj0EAwQDRwAwRAIgVbQ3Co1H4X0gmRrtXSyTU0HgBQu9PXHMmX20\n5MyyPSoCIBltOcmaPfdN03L3zqbqZ6PgUBWsvAHgiBzL3hrtJ+iy\n-----END CERTIFICATE-----",
"steps": {
"deployInitialReal": "unfinished",
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json
index fac963fd5eb..e1c2310ce7e 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/dev-us-east-1-log-first-part.json
@@ -68,6 +68,7 @@
"needRestart": 0,
"restarting": 0,
"upgradingOs": 0,
+ "upgradingFirmware": 0,
"services": 1,
"needNewConfig": 1
}
diff --git a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json
index 5559ac952a2..273887c26c4 100644
--- a/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json
+++ b/controller-server/src/test/java/com/yahoo/vespa/hosted/controller/restapi/application/responses/staging-test-log.json
@@ -152,6 +152,7 @@
"needRestart": 0,
"restarting": 0,
"upgradingOs": 0,
+ "upgradingFirmware": 0,
"services": 1,
"needNewConfig": 1
}