diff options
Diffstat (limited to 'node-repository/src/main')
7 files changed, 55 insertions, 42 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java index 96c8fe21959..c1a05a3c32d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java @@ -5,10 +5,12 @@ import com.yahoo.concurrent.DaemonThreadFactory; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; +import com.yahoo.config.provision.TransientException; import com.yahoo.log.LogLevel; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; +import com.yahoo.yolean.Exceptions; import java.time.Duration; import java.time.Instant; @@ -88,6 +90,8 @@ public abstract class ApplicationMaintainer extends Maintainer { if ( ! deployment.isPresent()) return; // this will be done at another config server log.log(LogLevel.DEBUG, this.getClass().getSimpleName() + " deploying " + application); deployment.get().activate(); + } catch (TransientException e) { + log.log(LogLevel.INFO, "Failed to redeploy " + application + " with a transient error: " + Exceptions.toMessageString(e)); } catch (RuntimeException e) { log.log(LogLevel.WARNING, "Exception on maintenance redeploy", e); } finally { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java index b7e8395cc92..a7b750c4e46 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java @@ -5,7 +5,9 @@ import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; import com.yahoo.config.provision.HostLivenessTracker; import com.yahoo.config.provision.NodeType; +import com.yahoo.config.provision.TransientException; import com.yahoo.jdisc.Metric; +import com.yahoo.log.LogLevel; import com.yahoo.transaction.Mutex; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.applicationmodel.ServiceInstance; @@ -21,6 +23,7 @@ import com.yahoo.vespa.orchestrator.Orchestrator; import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus; import com.yahoo.vespa.orchestrator.status.HostStatus; import com.yahoo.vespa.service.monitor.ServiceMonitor; +import com.yahoo.yolean.Exceptions; import java.time.Clock; import java.time.Duration; @@ -368,8 +371,11 @@ public class NodeFailer extends Maintainer { try { deployment.get().activate(); return true; - } - catch (RuntimeException e) { + } catch (TransientException e) { + log.log(LogLevel.INFO, "Failed to redeploy " + node.allocation().get().owner() + + " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e)); + return true; + } catch (RuntimeException e) { // The expected reason for deployment to fail here is that there is no capacity available to redeploy. // In that case we should leave the node in the active state to avoid failing additional nodes. nodeRepository().reactivate(node.hostname(), Agent.system, diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java index 52e7a28acc8..b9b1200d473 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java @@ -65,7 +65,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent { nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv().orElse(defaults.throttlePolicy), metric); periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, defaults.redeployMaintainerInterval, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval)); - operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, clock, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval)); + operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval)); reservationExpirer = new ReservationExpirer(nodeRepository, clock, durationFromEnv("reservation_expiry").orElse(defaults.reservationExpiry)); retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, clock, durationFromEnv("retired_interval").orElse(defaults.retiredInterval), durationFromEnv("retired_expiry").orElse(defaults.retiredExpiry)); inactiveExpirer = new InactiveExpirer(nodeRepository, clock, durationFromEnv("inactive_expiry").orElse(defaults.inactiveExpiry)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java index 46571fd0deb..ab7a565688e 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java @@ -7,12 +7,12 @@ import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.Agent; -import com.yahoo.vespa.hosted.provision.node.Allocation; +import com.yahoo.vespa.hosted.provision.node.History; -import java.time.Clock; import java.time.Duration; -import java.time.Instant; import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -28,31 +28,25 @@ import java.util.stream.Collectors; * @author bratseth */ public class OperatorChangeApplicationMaintainer extends ApplicationMaintainer { - - private final Clock clock; - - private Instant previousRun; - OperatorChangeApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Clock clock, Duration interval) { + OperatorChangeApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval) { super(deployer, nodeRepository, interval); - this.clock = clock; - previousRun = clock.instant(); // Changes before this will be caught by the first PeriodicApplicationMaintainer run } @Override protected Set<ApplicationId> applicationsNeedingMaintenance() { - Instant windowEnd = clock.instant(); - Instant windowStart = previousRun; - previousRun = windowEnd; - return nodeRepository().getNodes(NodeType.tenant).stream() - .filter(node -> hasManualStateChangeSince(windowStart, node)) - .flatMap(node -> node.allocation().map(Allocation::owner).stream()) - .collect(Collectors.toCollection(LinkedHashSet::new)); - } - - private boolean hasManualStateChangeSince(Instant instant, Node node) { - return node.history().events().stream() - .anyMatch(event -> event.agent() == Agent.operator && event.at().isAfter(instant)); + Map<ApplicationId, List<Node>> nodesByApplication = nodeRepository().getNodes(NodeType.tenant).stream() + .filter(node -> node.allocation().isPresent()) + .collect(Collectors.groupingBy(node -> node.allocation().get().owner(), Collectors.toList())); + + return nodesByApplication.entrySet().stream() + .filter(entry -> entry.getValue().stream() + .flatMap(node -> node.history().events().stream()) + .filter(event -> event.agent() == Agent.operator) + .map(History.Event::at) + .anyMatch(getLastDeployTime(entry.getKey())::isBefore)) + .map(Map.Entry::getKey) + .collect(Collectors.toCollection(LinkedHashSet::new)); } /** diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java index 174591b0836..6ab85e76ba2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java @@ -9,7 +9,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import java.time.Clock; import java.time.Duration; import java.time.Instant; -import java.util.Collections; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; @@ -40,21 +39,24 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { @Override protected boolean canDeployNow(ApplicationId application) { - // Don't deploy if a regular deploy just happened - return getLastDeployTime(application).isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments)); + return deployer().lastDeployTime(application) + // Don't deploy if a regular deploy just happened + .map(lastDeployTime -> lastDeployTime.isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments))) + // We only know last deploy time for applications that were deployed on this config server, + // the rest will be deployed on another config server + .orElse(false); } // Returns the applications that need to be redeployed by this config server at this point in time. @Override protected Set<ApplicationId> applicationsNeedingMaintenance() { - if (waitInitially()) return Collections.emptySet(); + if (waitInitially()) return Set.of(); // Collect all deployment times before sorting as deployments may happen while we build the set, breaking // the comparable contract. Stale times are fine as the time is rechecked in ApplicationMaintainer#deployWithLock Map<ApplicationId, Instant> deploymentTimes = nodesNeedingMaintenance().stream() .map(node -> node.allocation().get().owner()) .distinct() - .filter(this::shouldBeDeployedOnThisServer) .filter(this::canDeployNow) .collect(Collectors.toMap(Function.identity(), this::getLastDeployTime)); @@ -64,12 +66,6 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer { .collect(Collectors.toCollection(LinkedHashSet::new)); } - // We only know last deploy time for applications that were deployed on this config server, - // the rest will be deployed on another config server - protected boolean shouldBeDeployedOnThisServer(ApplicationId application) { - return deployer().lastDeployTime(application).isPresent(); - } - // TODO: Do not start deploying until some time has gone (ideally only until bootstrap of config server is finished) private boolean waitInitially() { return clock.instant().isBefore(start.plus(minTimeBetweenRedeployments)); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java index c27989cb852..dea0b8c19d0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java @@ -4,12 +4,15 @@ package com.yahoo.vespa.hosted.provision.maintenance; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Deployer; import com.yahoo.config.provision.Deployment; +import com.yahoo.config.provision.TransientException; +import com.yahoo.log.LogLevel; import com.yahoo.vespa.applicationmodel.HostName; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.node.History; import com.yahoo.vespa.orchestrator.OrchestrationException; import com.yahoo.vespa.orchestrator.Orchestrator; +import com.yahoo.yolean.Exceptions; import java.time.Clock; import java.time.Duration; @@ -73,6 +76,9 @@ public class RetiredExpirer extends Maintainer { String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList); + } catch (TransientException e) { + log.log(LogLevel.INFO, "Failed to redeploy " + application + + " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e)); } catch (RuntimeException e) { String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", ")); log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java index 299dc66c547..e628e823025 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java @@ -15,7 +15,6 @@ import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner; import java.time.Clock; import java.time.Duration; import java.time.Instant; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -41,7 +40,7 @@ public class MockDeployer implements Deployer { @Inject @SuppressWarnings("unused") public MockDeployer() { - this(null, Clock.systemUTC(), Collections.emptyMap()); + this(null, Clock.systemUTC(), Map.of()); } /** @@ -53,7 +52,7 @@ public class MockDeployer implements Deployer { Map<ApplicationId, ApplicationContext> applications) { this.provisioner = provisioner; this.clock = clock; - this.applications = applications; + this.applications = new HashMap<>(applications); } public ReentrantLock lock() { @@ -73,8 +72,8 @@ public class MockDeployer implements Deployer { throw new RuntimeException(e); } try { - lastDeployTimes.put(id, clock.instant()); - return Optional.of(new MockDeployment(provisioner, applications.get(id))); + return Optional.ofNullable(applications.get(id)) + .map(application -> new MockDeployment(provisioner, application)); } finally { lock.unlock(); } @@ -90,6 +89,13 @@ public class MockDeployer implements Deployer { return Optional.ofNullable(lastDeployTimes.get(application)); } + public void removeApplication(ApplicationId applicationId) { + new MockDeployment(provisioner, new ApplicationContext(applicationId, List.of())).activate(); + + applications.remove(applicationId); + lastDeployTimes.remove(applicationId); + } + public class MockDeployment implements Deployment { private final NodeRepositoryProvisioner provisioner; @@ -116,6 +122,7 @@ public class MockDeployer implements Deployer { try (NestedTransaction t = new NestedTransaction()) { provisioner.activate(t, application.id(), preparedHosts); t.commit(); + lastDeployTimes.put(application.id, clock.instant()); } } @@ -136,7 +143,7 @@ public class MockDeployer implements Deployer { } public ApplicationContext(ApplicationId id, ClusterSpec cluster, Capacity capacity, int groups) { - this(id, Collections.singletonList(new ClusterContext(id, cluster, capacity, groups))); + this(id, List.of(new ClusterContext(id, cluster, capacity, groups))); } public ApplicationId id() { return id; } |