diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2020-01-24 17:47:56 +0100 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2020-01-24 17:47:56 +0100 |
commit | d844eccc4630693396fd39c33697746b22bdbfbb (patch) | |
tree | 7351117ad4090238b64cac33f51913c66ab2d48a | |
parent | 867865729efe99dc2f5d9425056059d26061a7fc (diff) |
Time out when no nodes are suspended, or the same node is suspended, for too long
7 files changed, 68 insertions, 8 deletions
diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java index 3877119e8b0..684dc3cbc25 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/Node.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.config.provision.TenantName; +import java.time.Instant; import java.util.Objects; import java.util.Optional; @@ -30,6 +31,7 @@ public class Node { private final Version currentOsVersion; private final Version wantedOsVersion; private final ServiceState serviceState; + private final Optional<Instant> suspendedSince; private final long restartGeneration; private final long wantedRestartGeneration; private final long rebootGeneration; @@ -44,7 +46,7 @@ public class Node { public Node(HostName hostname, Optional<HostName> parentHostname, State state, NodeType type, NodeResources resources, Optional<ApplicationId> owner, Version currentVersion, Version wantedVersion, Version currentOsVersion, Version wantedOsVersion, ServiceState serviceState, - long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration, + Optional<Instant> suspendedSince, long restartGeneration, long wantedRestartGeneration, long rebootGeneration, long wantedRebootGeneration, int cost, String flavor, String clusterId, ClusterType clusterType, boolean wantToRetire, boolean wantToDeprovision, Optional<TenantName> reservedTo) { this.hostname = hostname; @@ -58,6 +60,7 @@ public class Node { this.currentOsVersion = currentOsVersion; this.wantedOsVersion = wantedOsVersion; this.serviceState = serviceState; + this.suspendedSince = suspendedSince; this.restartGeneration = 
restartGeneration; this.wantedRestartGeneration = wantedRestartGeneration; this.rebootGeneration = rebootGeneration; @@ -113,6 +116,10 @@ public class Node { return serviceState; } + public Optional<Instant> suspendedSince() { + return suspendedSince; + } + public long restartGeneration() { return restartGeneration; } @@ -209,6 +216,7 @@ public class Node { private Version currentOsVersion; private Version wantedOsVersion; private ServiceState serviceState; + private Optional<Instant> suspendedSince = Optional.empty(); private long restartGeneration; private long wantedRestartGeneration; private long rebootGeneration; @@ -235,6 +243,7 @@ public class Node { this.currentOsVersion = node.currentOsVersion; this.wantedOsVersion = node.wantedOsVersion; this.serviceState = node.serviceState; + this.suspendedSince = node.suspendedSince; this.restartGeneration = node.restartGeneration; this.wantedRestartGeneration = node.wantedRestartGeneration; this.rebootGeneration = node.rebootGeneration; @@ -303,6 +312,11 @@ public class Node { return this; } + public Builder suspendedSince(Instant suspendedSince) { + this.suspendedSince = Optional.ofNullable(suspendedSince); + return this; + } + public Builder restartGeneration(long restartGeneration) { this.restartGeneration = restartGeneration; return this; @@ -360,7 +374,7 @@ public class Node { public Node build() { return new Node(hostname, parentHostname, state, type, resources, owner, currentVersion, wantedVersion, currentOsVersion, - wantedOsVersion, serviceState, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, + wantedOsVersion, serviceState, suspendedSince, restartGeneration, wantedRestartGeneration, rebootGeneration, wantedRebootGeneration, cost, flavor, clusterId, clusterType, wantToRetire, wantToDeprovision, reservedTo); } diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java 
b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java index 58122b04712..43d8d1c5a6e 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/configserver/NodeRepository.java @@ -13,6 +13,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeMemb import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeRepositoryNode; import com.yahoo.vespa.hosted.controller.api.integration.noderepository.NodeState; +import java.time.Instant; import java.util.Collection; import java.util.List; import java.util.Optional; @@ -108,6 +109,7 @@ public interface NodeRepository { versionFrom(node.getCurrentOsVersion()), versionFrom(node.getWantedOsVersion()), fromBoolean(node.getAllowedToBeDown()), + Optional.ofNullable(node.suspendedSince()).map(Instant::ofEpochMilli), toInt(node.getCurrentRestartGeneration()), toInt(node.getRestartGeneration()), toInt(node.getCurrentRebootGeneration()), diff --git a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java index d9957d77ff3..985b7e3a339 100644 --- a/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java +++ b/controller-api/src/main/java/com/yahoo/vespa/hosted/controller/api/integration/noderepository/NodeRepositoryNode.java @@ -6,8 +6,10 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.JsonNode; +import java.time.Instant; import java.util.Arrays; import java.util.Map; +import java.util.Optional; import java.util.Set; 
@JsonIgnoreProperties(ignoreUnknown = true) @@ -76,6 +78,8 @@ public class NodeRepositoryNode { private NodeHistory[] history; @JsonProperty("allowedToBeDown") private Boolean allowedToBeDown; + @JsonProperty("suspendedSince") + private Long suspendedSince; @JsonProperty("reports") private Map<String, JsonNode> reports; @JsonProperty("modelName") @@ -309,6 +313,14 @@ public class NodeRepositoryNode { return allowedToBeDown; } + public Long suspendedSince() { + return suspendedSince; + } + + public void setSuspendedSince(long suspendedSinceMillis) { + this.suspendedSince = suspendedSinceMillis; + } + public String getCurrentOsVersion() { return currentOsVersion; } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java index 220925c1da7..223ffa93c99 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/InternalStepRunner.java @@ -41,6 +41,7 @@ import com.yahoo.vespa.hosted.controller.api.integration.organization.Deployment import com.yahoo.vespa.hosted.controller.application.ApplicationPackage; import com.yahoo.vespa.hosted.controller.application.Deployment; import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId; +import com.yahoo.vespa.hosted.controller.maintenance.JobRunner; import com.yahoo.yolean.Exceptions; import javax.security.auth.x500.X500Principal; @@ -115,7 +116,7 @@ public class InternalStepRunner implements StepRunner { static final Duration endpointTimeout = Duration.ofMinutes(15); static final Duration testerTimeout = Duration.ofMinutes(30); - static final Duration installationTimeout = Duration.ofMinutes(150); + static final Duration installationTimeout = Duration.ofMinutes(60); static final Duration certificateTimeout = 
Duration.ofMinutes(300); private final Controller controller; @@ -346,13 +347,32 @@ public class InternalStepRunner implements StepRunner { return Optional.of(error); } } - controller.jobController().locked(id, lockedRun -> lockedRun.withSummary(summary)); - if (timedOut(id, deployment.get(), installationTimeout)) { + boolean failed = false; + + NodeList suspendedTooLong = nodeList.suspendedSince(controller.clock().instant().minus(installationTimeout)); + if ( ! suspendedTooLong.isEmpty()) { + logger.log(INFO, "Some nodes have been suspended for more than " + installationTimeout.toMinutes() + " minutes."); + failed = true; + } + + if (run.noNodesDownSince() + .map(since -> since.isBefore(controller.clock().instant().minus(installationTimeout))) + .orElse(false)) { + logger.log(INFO, "No nodes allowed to suspend to progress installation for " + installationTimeout.toMinutes() + " minutes."); + failed = true; + } + + Duration timeout = JobRunner.jobTimeout.minusHours(1); // Time out before job dies. + if (timedOut(id, deployment.get(), timeout)) { + logger.log(INFO, "Installation failed to complete within " + timeout.toHours() + "hours!"); + failed = true; + } + + if (failed) { logger.log(nodeList.asList().stream() .flatMap(node -> nodeDetails(node, true)) .collect(toList())); - logger.log(INFO, "Installation failed to complete within " + installationTimeout.toMinutes() + " minutes!"); return Optional.of(installationFailed); } @@ -360,6 +380,12 @@ public class InternalStepRunner implements StepRunner { logger.log(nodeList.allowedDown().asList().stream() .flatMap(node -> nodeDetails(node, false)) .collect(toList())); + + controller.jobController().locked(id, lockedRun -> { + Instant noNodesDownSince = summary.down() == 0 ? 
lockedRun.noNodesDownSince().orElse(controller.clock().instant()) : null; + return lockedRun.noNodesDownSince(noNodesDownSince).withSummary(summary); + }); + return Optional.empty(); } diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java index 0feadebd6d2..0e337126d5c 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/NodeList.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.HostName; import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node; import com.yahoo.vespa.hosted.controller.api.integration.configserver.ServiceConvergence; +import java.time.Instant; import java.util.Collection; import java.util.HashMap; import java.util.List; @@ -70,6 +71,11 @@ public class NodeList extends AbstractFilteringList<NodeWithServices, NodeList> return matching(node -> node.node().serviceState() == Node.ServiceState.allowedDown); } + /** The nodes whose suspension started strictly before the given instant; nodes with no suspension time are never included. */ + public NodeList suspendedSince(Instant instant) { + return matching(node -> node.node().suspendedSince().map(instant::isAfter).orElse(false)); + } + /** The nodes with services on outdated config generation. 
*/ public NodeList upgradingApplication() { return matching(node -> node.services().stream().anyMatch(service -> wantedConfigGeneration > service.currentGeneration())); diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java index a3ca20fb8b4..8cd57fa7d3a 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/Run.java @@ -119,7 +119,7 @@ public class Run { public Run noNodesDownSince(Instant noNodesDownSince) { requireActive(); return new Run(id, steps, versions, start, end, status, lastTestRecord, lastVespaLogTimestamp, - Optional.of(noNodesDownSince), convergenceSummary, testerCertificate); + Optional.ofNullable(noNodesDownSince), convergenceSummary, testerCertificate); } public Run withSummary(ConvergenceSummary convergenceSummary) { diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java index 83173fc32a7..23e3149ec1e 100644 --- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java +++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/JobRunner.java @@ -27,7 +27,7 @@ import java.util.logging.Logger; */ public class JobRunner extends Maintainer { - static final Duration jobTimeout = Duration.ofDays(1); + public static final Duration jobTimeout = Duration.ofDays(1).plusHours(1); private static final Logger log = Logger.getLogger(JobRunner.class.getName()); private final JobController jobs; |