summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Marius Venstad <venstad@gmail.com> 2020-01-24 17:47:56 +0100
committer Jon Marius Venstad <venstad@gmail.com> 2020-01-24 17:47:56 +0100
commit d844eccc4630693396fd39c33697746b22bdbfbb (patch)
tree 7351117ad4090238b64cac33f51913c66ab2d48a
parent 867865729efe99dc2f5d9425056059d26061a7fc (diff)
Time out when no nodes are suspended, or the same node is suspended, for too long
-rw-r--r-- controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java | 18
-rw-r--r-- controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java | 2
-rw-r--r-- controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java | 12
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java | 34
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java | 6
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java | 2
-rw-r--r-- controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java | 2
7 files changed, 68 insertions, 8 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
index 3877119e8b0..684dc3cbc25 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TenantName;
+import java.time.Instant;
import java.util.Objects;
import java.util.Optional;
@@ -30,6 +31,7 @@ public class Node {
private final Version currentOsVersion;
private final Version wantedOsVersion;
private final ServiceState serviceState;
+ private final Optional<Instant> suspendedSince;
private final long restartGeneration;
private final long wantedRestartGeneration;
private final long rebootGeneration;
@@ -44,7 +46,7 @@ public class Node {
public Node(HostName hostname, Optional<HostName> parentHostname, State state, NodeType type, NodeResources resources, Optional<ApplicationId> owner,
Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion, ServiceState serviceState,
- long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration,
+ Optional<Instant> suspendedSince, long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration,
int cost, String flavor, String clusterId, ClusterType clusterType, boolean wantToRetire, boolean wantToDeprovision,
Optional<TenantName> reservedTo) {
this.hostname = hostname;
@@ -58,6 +60,7 @@ public class Node {
this.currentOsVersion = currentOsVersion;
this.wantedOsVersion = wantedOsVersion;
this.serviceState = serviceState;
+ this.suspendedSince = suspendedSince;
this.restartGeneration = restartGeneration;
this.wantedRestartGeneration = wantedRestartGeneration;
this.rebootGeneration = rebootGeneration;
@@ -113,6 +116,10 @@ public class Node {
return serviceState;
}
+ public Optional<Instant> suspendedSince() { // The instant since which this node has been suspended, as given at construction; empty when none was supplied.
+ return suspendedSince;
+ }
+
public long restartGeneration() {
return restartGeneration;
}
@@ -209,6 +216,7 @@ public class Node {
private Version currentOsVersion;
private Version wantedOsVersion;
private ServiceState serviceState;
+ private Optional<Instant> suspendedSince = Optional.empty();
private long restartGeneration;
private long wantedRestartGeneration;
private long rebootGeneration;
@@ -235,6 +243,7 @@ public class Node {
this.currentOsVersion = node.currentOsVersion;
this.wantedOsVersion = node.wantedOsVersion;
this.serviceState = node.serviceState;
+ this.suspendedSince = node.suspendedSince;
this.restartGeneration = node.restartGeneration;
this.wantedRestartGeneration = node.wantedRestartGeneration;
this.rebootGeneration = node.rebootGeneration;
@@ -303,6 +312,11 @@ public class Node {
return this;
}
+ public Builder suspendedSince(Instant suspendedSince) { // Sets the suspension start for the node being built; returns this builder for chaining.
+ this.suspendedSince = Optional.ofNullable(suspendedSince); // null is accepted and stored as Optional.empty()
+ return this;
+ }
+
public Builder restartGeneration(long restartGeneration) {
this.restartGeneration = restartGeneration;
return this;
@@ -360,7 +374,7 @@ public class Node {
public Node build() {
return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion, currentOsVersion,
- wantedOsVersion, serviceState, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration,
+ wantedOsVersion, serviceState, suspendedSince, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, // suspendedSince follows serviceState, matching the constructor's parameter order
cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo);
}
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
index 58122b04712..43d8d1c5a6e 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java
@@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb
import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode;
import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState;
+import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
@@ -108,6 +109,7 @@ public interface NodeRepository {
versionFrom(node.getCurrentOsVersion()),
versionFrom(node.getWantedOsVersion()),
fromBoolean(node.getAllowedToBeDown()),
+ Optional.ofNullable(node.suspendedSince()).map(Instant::ofEpochMilli),
toInt(node.getCurrentRestartGeneration()),
toInt(node.getRestartGeneration()),
toInt(node.getCurrentRebootGeneration()),
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
index d9957d77ff3..985b7e3a339 100644
--- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
+++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java
@@ -6,8 +6,10 @@ import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
+import java.time.Instant;
import java.util.Arrays;
import java.util.Map;
+import java.util.Optional;
import java.util.Set;
@JsonIgnoreProperties(ignoreUnknown = true)
@@ -76,6 +78,8 @@ public class NodeRepositoryNode {
private NodeHistory[] history;
@JsonProperty("allowedToBeDown")
private Boolean allowedToBeDown;
+ @JsonProperty("suspendedSince")
+ private Long suspendedSince;
@JsonProperty("reports")
private Map<String, JsonNode> reports;
@JsonProperty("modelName")
@@ -309,6 +313,14 @@ public class NodeRepositoryNode {
return allowedToBeDown;
}
+ public Long suspendedSince() { // Value of the "suspendedSince" JSON property; a boxed Long, so null when the property was never set. Callers treat it as epoch milliseconds.
+ return suspendedSince;
+ }
+
+ public void setSuspendedSince(long suspendedSinceMillis) { // Takes a primitive long, so this setter cannot reset the field to null.
+ this.suspendedSince = suspendedSinceMillis;
+ }
+
public String getCurrentOsVersion() {
return currentOsVersion;
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
index 220925c1da7..223ffa93c99 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java
@@ -41,6 +41,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
+import com.yahoo.vespa.hosted.controller.maintenance.JobRunner;
import com.yahoo.yolean.Exceptions;
import javax.security.auth.x500.X500Principal;
@@ -115,7 +116,7 @@ public class InternalStepRunner implements StepRunner {
static final Duration endpointTimeout = Duration.ofMinutes(15);
static final Duration testerTimeout = Duration.ofMinutes(30);
- static final Duration installationTimeout = Duration.ofMinutes(150);
+ static final Duration installationTimeout = Duration.ofMinutes(60);
static final Duration certificateTimeout = Duration.ofMinutes(300);
private final Controller controller;
@@ -346,13 +347,32 @@ public class InternalStepRunner implements StepRunner {
return Optional.of(error);
}
}
- controller.jobController().locked(id, lockedRun -> lockedRun.withSummary(summary));
- if (timedOut(id, deployment.get(), installationTimeout)) {
+ boolean failed = false;
+
+ NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(installationTimeout));
+ if ( ! suspendedTooLong.isEmpty()) {
+ logger.log(INFO, "Some nodes have been suspended for more than " + installationTimeout.toMinutes() + " minutes.");
+ failed = true;
+ }
+
+ if (run.noNodesDownSince()
+ .map(since -> since.isBefore(controller.clock().instant().minus(installationTimeout)))
+ .orElse(false)) {
+ logger.log(INFO, "No nodes allowed to suspend to progress installation for " + installationTimeout.toMinutes() + " minutes.");
+ failed = true;
+ }
+
+ Duration timeout = JobRunner.jobTimeout.minusHours(1); // Time out before job dies.
+ if (timedOut(id, deployment.get(), timeout)) {
+ logger.log(INFO, "Installation failed to complete within " + timeout.toHours() + "hours!");
+ failed = true;
+ }
+
+ if (failed) {
logger.log(nodeList.asList().stream()
.flatMap(node -> nodeDetails(node, true))
.collect(toList()));
- logger.log(INFO, "Installation failed to complete within " + installationTimeout.toMinutes() + " minutes!");
return Optional.of(installationFailed);
}
@@ -360,6 +380,12 @@ public class InternalStepRunner implements StepRunner {
logger.log(nodeList.allowedDown().asList().stream()
.flatMap(node -> nodeDetails(node, false))
.collect(toList()));
+
+ controller.jobController().locked(id, lockedRun -> {
+ Instant noNodesDownSince = summary.down() == 0 ? lockedRun.noNodesDownSince().orElse(controller.clock().instant()) : null;
+ return lockedRun.noNodesDownSince(noNodesDownSince).withSummary(summary);
+ });
+
return Optional.empty();
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
index 0feadebd6d2..0e337126d5c 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.HostName;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence;
+import java.time.Instant;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
@@ -70,6 +71,11 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList>
return matching(node -> node.node().serviceState() == Node.ServiceState.allowedDown);
}
+ /** The nodes which have been suspended since before the given instant, i.e., whose suspension started strictly earlier than it. */
+ public NodeList suspendedSince(Instant instant) {
+ // instant.isAfter(since) holds exactly when the node's suspension began before the given instant.
+ // (The previous instant::isBefore selected the opposite set: nodes suspended after the instant.)
+ // Nodes without a suspension time never match.
+ return matching(node -> node.node().suspendedSince().map(instant::isAfter).orElse(false));
+ }
+
/** The nodes with services on outdated config generation. */
public NodeList upgradingApplication() {
return matching(node -> node.services().stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration()));
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
index a3ca20fb8b4..8cd57fa7d3a 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java
@@ -119,7 +119,7 @@ public class Run {
public Run noNodesDownSince(Instant noNodesDownSince) {
requireActive();
return new Run(id, steps, versions, start, end, status, lastTestRecord, lastVespaLogTimestamp,
- Optional.of(noNodesDownSince), convergenceSummary, testerCertificate);
+ Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate); // null is allowed and yields an empty Optional, i.e., clears the recorded instant
}
public Run withSummary(ConvergenceSummary convergenceSummary) {
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
index 83173fc32a7..23e3149ec1e 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java
@@ -27,7 +27,7 @@ import java.util.logging.Logger;
*/
public class JobRunner extends Maintainer {
- static final Duration jobTimeout = Duration.ofDays(1);
+ public static final Duration jobTimeout = Duration.ofDays(1).plusHours(1);
private static final Logger log = Logger.getLogger(JobRunner.class.getName());
private final JobController jobs;