about | summary | refs | log | tree | commit | diff | stats
path: root/node-repository/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'node-repository/src/main')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java 4
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java 10
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java 2
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java 38
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java 18
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java 6
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java 19
7 files changed, 55 insertions, 42 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
index 96c8fe21959..c1a05a3c32d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/ApplicationMaintainer.java
@@ -5,10 +5,12 @@ import com.yahoo.concurrent.DaemonThreadFactory;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
+import com.yahoo.config.provision.TransientException;
import com.yahoo.log.LogLevel;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.time.Instant;
@@ -88,6 +90,8 @@ public abstract class ApplicationMaintainer extends Maintainer {
if ( ! deployment.isPresent()) return; // this will be done at another config server
log.log(LogLevel.DEBUG, this.getClass().getSimpleName() + " deploying " + application);
deployment.get().activate();
+ } catch (TransientException e) {
+ log.log(LogLevel.INFO, "Failed to redeploy " + application + " with a transient error: " + Exceptions.toMessageString(e));
} catch (RuntimeException e) {
log.log(LogLevel.WARNING, "Exception on maintenance redeploy", e);
} finally {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
index b7e8395cc92..a7b750c4e46 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeFailer.java
@@ -5,7 +5,9 @@ import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.config.provision.NodeType;
+import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
+import com.yahoo.log.LogLevel;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
@@ -21,6 +23,7 @@ import com.yahoo.vespa.orchestrator.Orchestrator;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
+import com.yahoo.yolean.Exceptions;
import java.time.Clock;
import java.time.Duration;
@@ -368,8 +371,11 @@ public class NodeFailer extends Maintainer {
try {
deployment.get().activate();
return true;
- }
- catch (RuntimeException e) {
+ } catch (TransientException e) {
+ log.log(LogLevel.INFO, "Failed to redeploy " + node.allocation().get().owner() +
+ " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e));
+ return true;
+ } catch (RuntimeException e) {
// The expected reason for deployment to fail here is that there is no capacity available to redeploy.
// In that case we should leave the node in the active state to avoid failing additional nodes.
nodeRepository().reactivate(node.hostname(), Agent.system,
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
index 52e7a28acc8..b9b1200d473 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRepositoryMaintenance.java
@@ -65,7 +65,7 @@ public class NodeRepositoryMaintenance extends AbstractComponent {
nodeFailer = new NodeFailer(deployer, hostLivenessTracker, serviceMonitor, nodeRepository, durationFromEnv("fail_grace").orElse(defaults.failGrace), clock, orchestrator, throttlePolicyFromEnv().orElse(defaults.throttlePolicy), metric);
periodicApplicationMaintainer = new PeriodicApplicationMaintainer(deployer, nodeRepository, defaults.redeployMaintainerInterval, durationFromEnv("periodic_redeploy_interval").orElse(defaults.periodicRedeployInterval));
- operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, clock, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval));
+ operatorChangeApplicationMaintainer = new OperatorChangeApplicationMaintainer(deployer, nodeRepository, durationFromEnv("operator_change_redeploy_interval").orElse(defaults.operatorChangeRedeployInterval));
reservationExpirer = new ReservationExpirer(nodeRepository, clock, durationFromEnv("reservation_expiry").orElse(defaults.reservationExpiry));
retiredExpirer = new RetiredExpirer(nodeRepository, orchestrator, deployer, clock, durationFromEnv("retired_interval").orElse(defaults.retiredInterval), durationFromEnv("retired_expiry").orElse(defaults.retiredExpiry));
inactiveExpirer = new InactiveExpirer(nodeRepository, clock, durationFromEnv("inactive_expiry").orElse(defaults.inactiveExpiry));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java
index 46571fd0deb..ab7a565688e 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/OperatorChangeApplicationMaintainer.java
@@ -7,12 +7,12 @@ import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
-import com.yahoo.vespa.hosted.provision.node.Allocation;
+import com.yahoo.vespa.hosted.provision.node.History;
-import java.time.Clock;
import java.time.Duration;
-import java.time.Instant;
import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@@ -28,31 +28,25 @@ import java.util.stream.Collectors;
* @author bratseth
*/
public class OperatorChangeApplicationMaintainer extends ApplicationMaintainer {
-
- private final Clock clock;
-
- private Instant previousRun;
- OperatorChangeApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Clock clock, Duration interval) {
+ OperatorChangeApplicationMaintainer(Deployer deployer, NodeRepository nodeRepository, Duration interval) {
super(deployer, nodeRepository, interval);
- this.clock = clock;
- previousRun = clock.instant(); // Changes before this will be caught by the first PeriodicApplicationMaintainer run
}
@Override
protected Set<ApplicationId> applicationsNeedingMaintenance() {
- Instant windowEnd = clock.instant();
- Instant windowStart = previousRun;
- previousRun = windowEnd;
- return nodeRepository().getNodes(NodeType.tenant).stream()
- .filter(node -> hasManualStateChangeSince(windowStart, node))
- .flatMap(node -> node.allocation().map(Allocation::owner).stream())
- .collect(Collectors.toCollection(LinkedHashSet::new));
- }
-
- private boolean hasManualStateChangeSince(Instant instant, Node node) {
- return node.history().events().stream()
- .anyMatch(event -> event.agent() == Agent.operator && event.at().isAfter(instant));
+ Map<ApplicationId, List<Node>> nodesByApplication = nodeRepository().getNodes(NodeType.tenant).stream()
+ .filter(node -> node.allocation().isPresent())
+ .collect(Collectors.groupingBy(node -> node.allocation().get().owner(), Collectors.toList()));
+
+ return nodesByApplication.entrySet().stream()
+ .filter(entry -> entry.getValue().stream()
+ .flatMap(node -> node.history().events().stream())
+ .filter(event -> event.agent() == Agent.operator)
+ .map(History.Event::at)
+ .anyMatch(getLastDeployTime(entry.getKey())::isBefore))
+ .map(Map.Entry::getKey)
+ .collect(Collectors.toCollection(LinkedHashSet::new));
}
/**
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
index 174591b0836..6ab85e76ba2 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/PeriodicApplicationMaintainer.java
@@ -9,7 +9,6 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
-import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
@@ -40,21 +39,24 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
@Override
protected boolean canDeployNow(ApplicationId application) {
- // Don't deploy if a regular deploy just happened
- return getLastDeployTime(application).isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments));
+ return deployer().lastDeployTime(application)
+ // Don't deploy if a regular deploy just happened
+ .map(lastDeployTime -> lastDeployTime.isBefore(nodeRepository().clock().instant().minus(minTimeBetweenRedeployments)))
+ // We only know last deploy time for applications that were deployed on this config server,
+ // the rest will be deployed on another config server
+ .orElse(false);
}
// Returns the applications that need to be redeployed by this config server at this point in time.
@Override
protected Set<ApplicationId> applicationsNeedingMaintenance() {
- if (waitInitially()) return Collections.emptySet();
+ if (waitInitially()) return Set.of();
// Collect all deployment times before sorting as deployments may happen while we build the set, breaking
// the comparable contract. Stale times are fine as the time is rechecked in ApplicationMaintainer#deployWithLock
Map<ApplicationId, Instant> deploymentTimes = nodesNeedingMaintenance().stream()
.map(node -> node.allocation().get().owner())
.distinct()
- .filter(this::shouldBeDeployedOnThisServer)
.filter(this::canDeployNow)
.collect(Collectors.toMap(Function.identity(), this::getLastDeployTime));
@@ -64,12 +66,6 @@ public class PeriodicApplicationMaintainer extends ApplicationMaintainer {
.collect(Collectors.toCollection(LinkedHashSet::new));
}
- // We only know last deploy time for applications that were deployed on this config server,
- // the rest will be deployed on another config server
- protected boolean shouldBeDeployedOnThisServer(ApplicationId application) {
- return deployer().lastDeployTime(application).isPresent();
- }
-
// TODO: Do not start deploying until some time has gone (ideally only until bootstrap of config server is finished)
private boolean waitInitially() {
return clock.instant().isBefore(start.plus(minTimeBetweenRedeployments));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
index c27989cb852..dea0b8c19d0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/RetiredExpirer.java
@@ -4,12 +4,15 @@ package com.yahoo.vespa.hosted.provision.maintenance;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
+import com.yahoo.config.provision.TransientException;
+import com.yahoo.log.LogLevel;
import com.yahoo.vespa.applicationmodel.HostName;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.OrchestrationException;
import com.yahoo.vespa.orchestrator.Orchestrator;
+import com.yahoo.yolean.Exceptions;
import java.time.Clock;
import java.time.Duration;
@@ -73,6 +76,9 @@ public class RetiredExpirer extends Maintainer {
String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
+ } catch (TransientException e) {
+ log.log(LogLevel.INFO, "Failed to redeploy " + application +
+ " with a transient error, will be retried by application maintainer: " + Exceptions.toMessageString(e));
} catch (RuntimeException e) {
String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
index 299dc66c547..e628e823025 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockDeployer.java
@@ -15,7 +15,6 @@ import com.yahoo.vespa.hosted.provision.provisioning.NodeRepositoryProvisioner;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
-import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -41,7 +40,7 @@ public class MockDeployer implements Deployer {
@Inject
@SuppressWarnings("unused")
public MockDeployer() {
- this(null, Clock.systemUTC(), Collections.emptyMap());
+ this(null, Clock.systemUTC(), Map.of());
}
/**
@@ -53,7 +52,7 @@ public class MockDeployer implements Deployer {
Map<ApplicationId, ApplicationContext> applications) {
this.provisioner = provisioner;
this.clock = clock;
- this.applications = applications;
+ this.applications = new HashMap<>(applications);
}
public ReentrantLock lock() {
@@ -73,8 +72,8 @@ public class MockDeployer implements Deployer {
throw new RuntimeException(e);
}
try {
- lastDeployTimes.put(id, clock.instant());
- return Optional.of(new MockDeployment(provisioner, applications.get(id)));
+ return Optional.ofNullable(applications.get(id))
+ .map(application -> new MockDeployment(provisioner, application));
} finally {
lock.unlock();
}
@@ -90,6 +89,13 @@ public class MockDeployer implements Deployer {
return Optional.ofNullable(lastDeployTimes.get(application));
}
+ public void removeApplication(ApplicationId applicationId) {
+ new MockDeployment(provisioner, new ApplicationContext(applicationId, List.of())).activate();
+
+ applications.remove(applicationId);
+ lastDeployTimes.remove(applicationId);
+ }
+
public class MockDeployment implements Deployment {
private final NodeRepositoryProvisioner provisioner;
@@ -116,6 +122,7 @@ public class MockDeployer implements Deployer {
try (NestedTransaction t = new NestedTransaction()) {
provisioner.activate(t, application.id(), preparedHosts);
t.commit();
+ lastDeployTimes.put(application.id, clock.instant());
}
}
@@ -136,7 +143,7 @@ public class MockDeployer implements Deployer {
}
public ApplicationContext(ApplicationId id, ClusterSpec cluster, Capacity capacity, int groups) {
- this(id, Collections.singletonList(new ClusterContext(id, cluster, capacity, groups)));
+ this(id, List.of(new ClusterContext(id, cluster, capacity, groups)));
}
public ApplicationId id() { return id; }