summaryrefslogtreecommitdiffstats
path: root/configserver/src/main
diff options
context:
space:
mode:
authorjonmv <venstad@gmail.com>2024-01-12 16:12:26 +0100
committerjonmv <venstad@gmail.com>2024-01-12 16:12:26 +0100
commite49f46a6e42d3cddc8dca363d90f27a24c483024 (patch)
tree41554882ac71d3416103d90f4b9d558bf5d97140 /configserver/src/main
parent469d7dc346f3c900953e9cd8110c6fa070ff5e74 (diff)
Add a maintainer for triggering node restarts triggered by deployment
Diffstat (limited to 'configserver/src/main')
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java10
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/application/ApplicationCuratorDatabase.java59
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/application/PendingRestarts.java58
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java57
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java2
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/PendingRestartsMaintainer.java102
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java4
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/provision/HostProvisionerProvider.java2
8 files changed, 240 insertions, 54 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java
index fc60f1225a7..742e92500d9 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java
@@ -56,6 +56,7 @@ import com.yahoo.vespa.config.server.application.ActiveTokenFingerprints;
import com.yahoo.vespa.config.server.application.DefaultClusterReindexingStatusClient;
import com.yahoo.vespa.config.server.application.FileDistributionStatus;
import com.yahoo.vespa.config.server.application.HttpProxy;
+import com.yahoo.vespa.config.server.application.PendingRestarts;
import com.yahoo.vespa.config.server.application.TenantApplications;
import com.yahoo.vespa.config.server.configchange.ConfigChangeActions;
import com.yahoo.vespa.config.server.configchange.RefeedActions;
@@ -1041,6 +1042,15 @@ public class ApplicationRepository implements com.yahoo.config.provision.Deploye
tenant.getApplicationRepo().database().modifyReindexing(id, ApplicationReindexing.empty(), modifications);
}
+ public PendingRestarts getPendingRestarts(ApplicationId id) {
+ return requireDatabase(id).readPendingRestarts(id);
+ }
+
+ public void modifyPendingRestarts(ApplicationId id, UnaryOperator<PendingRestarts> modifications) {
+ if (hostProvisioner.isEmpty()) return;
+ getTenant(id).getApplicationRepo().database().modifyPendingRestarts(id, modifications);
+ }
+
public ConfigserverConfig configserverConfig() {
return configserverConfig;
}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ApplicationCuratorDatabase.java b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ApplicationCuratorDatabase.java
index e17fda76a01..b2dbea569b6 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ApplicationCuratorDatabase.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ApplicationCuratorDatabase.java
@@ -28,6 +28,8 @@ import java.util.concurrent.ExecutorService;
import java.util.function.UnaryOperator;
import static com.yahoo.vespa.curator.transaction.CuratorOperations.setData;
+import static com.yahoo.yolean.Exceptions.uncheck;
+import static java.util.stream.Collectors.toMap;
import static java.util.stream.Collectors.toUnmodifiableMap;
/**
@@ -197,6 +199,25 @@ public class ApplicationCuratorDatabase {
.toList();
}
+ public PendingRestarts readPendingRestarts(ApplicationId id) {
+ return curator.getData(pendingRestartsPath(id))
+ .map(PendingRestartsSerializer::fromBytes)
+ .orElse(PendingRestarts.empty());
+ }
+
+ public void modifyPendingRestarts(ApplicationId id, UnaryOperator<PendingRestarts> modification) {
+ try (Lock lock = curator.lock(restartsLockPath(id), Duration.ofMinutes(1))) {
+ PendingRestarts original = readPendingRestarts(id);
+ PendingRestarts modified = modification.apply(original);
+ if (original != modified) {
+ if (modified.isEmpty())
+ curator.delete(pendingRestartsPath(id));
+ else
+ curator.set(pendingRestartsPath(id), PendingRestartsSerializer.toBytes(modified));
+ }
+ }
+ }
+
public Optional<ApplicationReindexing> readReindexingStatus(ApplicationId id) {
return curator.getData(reindexingDataPath(id))
.map(ReindexingStatusSerializer::fromBytes);
@@ -211,6 +232,10 @@ public class ApplicationCuratorDatabase {
return curator.createDirectoryCache(applicationsPath.getAbsolute(), false, false, zkCacheExecutor);
}
+ private Path restartsLockPath(ApplicationId id) {
+ return locksPath.append(id.serializedForm() + "::restarts");
+ }
+
private Path reindexingLockPath(ApplicationId id) {
return locksPath.append(id.serializedForm()).append("reindexing");
}
@@ -227,6 +252,38 @@ public class ApplicationCuratorDatabase {
return applicationPath(id).append("reindexing");
}
+ private Path pendingRestartsPath(ApplicationId id) {
+ return applicationPath(id).append("restarts");
+ }
+
+ private static class PendingRestartsSerializer {
+
+ private static final String GENERATIONS = "generations";
+ private static final String GENERATION = "generation";
+ private static final String HOSTNAMES = "hostnames";
+
+ private static byte[] toBytes(PendingRestarts pendingRestarts) {
+ Cursor root = new Slime().setObject();
+ Cursor generationsArray = root.setArray(GENERATIONS);
+ pendingRestarts.generationsForRestarts().forEach((generation, hostnames) -> {
+ Cursor generationObject = generationsArray.addObject();
+ generationObject.setLong(GENERATION, generation);
+ hostnames.forEach(generationObject.setArray(HOSTNAMES)::addString);
+ });
+ return uncheck(() -> SlimeUtils.toJsonBytes(root));
+ }
+
+ private static PendingRestarts fromBytes(byte[] data) {
+ Cursor root = SlimeUtils.jsonToSlimeOrThrow(data).get();
+ return new PendingRestarts(SlimeUtils.entriesStream(root.field(GENERATIONS))
+ .collect(toMap(entry -> entry.field(GENERATION).asLong(),
+ entry -> SlimeUtils.entriesStream(entry.field(HOSTNAMES))
+ .map(Inspector::asString)
+ .toList())));
+ }
+
+ }
+
private static class ReindexingStatusSerializer {
private static final String ENABLED = "enabled";
@@ -263,7 +320,7 @@ public class ApplicationCuratorDatabase {
setStatus(statusObject, status);
});
});
- return Exceptions.uncheck(() -> SlimeUtils.toJsonBytes(root));
+ return uncheck(() -> SlimeUtils.toJsonBytes(root));
}
private static void setStatus(Cursor statusObject, Status status) {
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/application/PendingRestarts.java b/configserver/src/main/java/com/yahoo/vespa/config/server/application/PendingRestarts.java
new file mode 100644
index 00000000000..e573effc8a9
--- /dev/null
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/application/PendingRestarts.java
@@ -0,0 +1,58 @@
+package com.yahoo.vespa.config.server.application;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+
+import static java.util.Collections.unmodifiableMap;
+import static java.util.Collections.unmodifiableSet;
+
+/**
+ * @author jonmv
+ */
+public class PendingRestarts {
+
+ private static final PendingRestarts empty = new PendingRestarts(Map.of());
+
+ private final Map<Long, Set<String>> generationsForRestarts;
+
+ public static PendingRestarts empty() {
+ return empty;
+ }
+
+ PendingRestarts(Map<Long, ? extends Collection<String>> generationsForRestarts) {
+ Map<Long, Set<String>> builder = new LinkedHashMap<>();
+ generationsForRestarts.forEach((generation, hostnames) -> builder.put(generation, unmodifiableSet(new LinkedHashSet<>(hostnames))));
+ this.generationsForRestarts = unmodifiableMap(builder);
+ }
+
+ public Map<Long, Set<String>> generationsForRestarts() { return generationsForRestarts; }
+
+ public boolean isEmpty() { return generationsForRestarts.isEmpty(); }
+
+ public PendingRestarts withRestarts(long atGeneration, Collection<String> hostnames) {
+ Map<Long, Set<String>> newRestarts = new LinkedHashMap<>(generationsForRestarts);
+ newRestarts.put(atGeneration, new LinkedHashSet<>(newRestarts.getOrDefault(atGeneration, Set.of())) {{ addAll(hostnames); }});
+ return new PendingRestarts(newRestarts);
+ }
+
+ public PendingRestarts withoutPreviousGenerations(long generation) {
+ Map<Long, Set<String>> newRestarts = new LinkedHashMap<>(generationsForRestarts);
+ newRestarts.keySet().removeIf(g -> g <= generation);
+ return new PendingRestarts(newRestarts);
+ }
+
+ public Set<String> hostnames() {
+ return restartsReadyAt(Long.MAX_VALUE);
+ }
+
+ public Set<String> restartsReadyAt(long generation) {
+ LinkedHashSet<String> ready = new LinkedHashSet<>();
+ generationsForRestarts.forEach((g, hosts) -> { if (g <= generation) ready.addAll(hosts); });
+ return ready;
+ }
+
+}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
index f10bea5da6e..7c7608e5e5c 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
@@ -45,6 +45,7 @@ import java.util.stream.Collectors;
import static com.yahoo.vespa.config.server.application.ConfigConvergenceChecker.ServiceListResponse;
import static com.yahoo.vespa.config.server.session.Session.Status.DELETE;
+import static java.util.stream.Collectors.joining;
import static java.util.stream.Collectors.toSet;
/**
@@ -171,60 +172,14 @@ public class Deployment implements com.yahoo.config.provision.Deployment {
Set<String> nodesToRestart = session.getActivationTriggers().nodeRestarts().stream().map(NodeRestart::hostname).collect(toSet());
if (nodesToRestart.isEmpty()) return;
- // TODO: replace this with a maintainer that waits for active config >= this session's config generation,
- // and let config convergence waiter in controller also check _pending_ restarts, maintained by that.
- // Here, we'll then instead hand these restarts over to that maintainer, with our session id.
- waitForConfigToConverge(applicationId, nodesToRestart);
-
- provisioner.get().restart(applicationId, HostFilter.from(nodesToRestart));
- String restartsMessage = String.format("Scheduled service restart of %d nodes: %s",
- nodesToRestart.size(), nodesToRestart.stream().sorted().collect(Collectors.joining(", ")));
- deployLogger.log(Level.INFO, restartsMessage);
- log.info(String.format("%sScheduled service restart of %d nodes: %s",
- session.logPre(), nodesToRestart.size(), restartsMessage));
+ applicationRepository.modifyPendingRestarts(applicationId, pendingRestarts -> pendingRestarts.withRestarts(session.getSessionId(), nodesToRestart));
+ deployLogger.log(Level.INFO, String.format("Scheduled service restart of %d nodes: %s",
+ nodesToRestart.size(), nodesToRestart.stream().sorted().collect(joining(", "))));
+ log.info(String.format("%sWill schedule service restart of %d nodes after convergence on generation %d: %s",
+ session.logPre(), nodesToRestart.size(), session.getSessionId(), nodesToRestart.stream().sorted().collect(joining(", "))));
this.configChangeActions = configChangeActions.withRestartActions(new RestartActions());
}
- private void waitForConfigToConverge(ApplicationId applicationId, Set<String> hostnames) {
- deployLogger.log(Level.INFO, "Wait for all services to use new config generation before restarting");
- var convergenceChecker = applicationRepository.configConvergenceChecker();
- var app = applicationRepository.getActiveApplication(applicationId);
-
- ServiceListResponse response = null;
- while (timeLeft(applicationId, response)) {
- response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app, hostnames);
- if (response.converged) {
- deployLogger.log(Level.INFO, "Services converged on new config generation " + response.currentGeneration);
- return;
- } else {
- deployLogger.log(Level.INFO, "Services that did not converge on new config generation " +
- response.wantedGeneration + ": " +
- servicesNotConvergedFormatted(response) + ". Will retry");
- try { Thread.sleep(5_000); } catch (InterruptedException e) { /* ignore */ }
- }
- }
- }
-
- private boolean timeLeft(ApplicationId applicationId, ServiceListResponse response) {
- try {
- params.get().getTimeoutBudget().assertNotTimedOut(
- () -> "Timeout exceeded while waiting for config convergence for " + applicationId +
- ", wanted generation " + response.wantedGeneration + ", these services had another generation: " +
- servicesNotConvergedFormatted(response));
- } catch (UncheckedTimeoutException e) {
- throw new ConfigNotConvergedException(e);
- }
- return true;
- }
-
- private String servicesNotConvergedFormatted(ServiceListResponse response) {
- return response.services().stream()
- .filter(service -> service.currentGeneration != response.wantedGeneration)
- .map(service -> service.serviceInfo.getHostName() + ":" + service.serviceInfo.getServiceName() +
- " on generation " + service.currentGeneration)
- .collect(Collectors.joining(", "));
- }
-
private void storeReindexing(ApplicationId applicationId) {
applicationRepository.modifyReindexing(applicationId, reindexing -> {
for (Reindexing entry : session.getActivationTriggers().reindexings())
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java
index dbd30f72c24..a0e48f3066a 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ConfigServerMaintenance.java
@@ -49,6 +49,8 @@ public class ConfigServerMaintenance {
maintainers.add(new SessionsMaintainer(applicationRepository, curator, Duration.ofSeconds(30)));
maintainers.add(new ReindexingMaintainer(applicationRepository, curator,
Duration.ofMinutes(3), convergenceChecker, Clock.systemUTC()));
+ if (applicationRepository.configserverConfig().hostedVespa())
+ maintainers.add(new PendingRestartsMaintainer(applicationRepository, curator, Clock.systemUTC(), Duration.ofSeconds(30)));
}
public void shutdown() {
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/PendingRestartsMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/PendingRestartsMaintainer.java
new file mode 100644
index 00000000000..e228e0edcb6
--- /dev/null
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/PendingRestartsMaintainer.java
@@ -0,0 +1,102 @@
+package com.yahoo.vespa.config.server.maintenance;
+
+import com.yahoo.config.provision.ApplicationId;
+import com.yahoo.config.provision.HostFilter;
+import com.yahoo.vespa.config.server.ApplicationRepository;
+import com.yahoo.vespa.config.server.application.Application;
+import com.yahoo.vespa.config.server.application.ApplicationCuratorDatabase;
+import com.yahoo.vespa.config.server.application.ConfigConvergenceChecker.ServiceListResponse;
+import com.yahoo.vespa.config.server.application.PendingRestarts;
+import com.yahoo.vespa.config.server.tenant.Tenant;
+import com.yahoo.vespa.curator.Curator;
+import com.yahoo.yolean.Exceptions;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiConsumer;
+import java.util.function.Function;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * @author jonmv
+ */
+public class PendingRestartsMaintainer extends ConfigServerMaintainer {
+
+ private final Clock clock;
+
+ public PendingRestartsMaintainer(ApplicationRepository applicationRepository, Curator curator, Clock clock, Duration interval) {
+ super(applicationRepository, curator, applicationRepository.flagSource(), clock, interval, true);
+ this.clock = clock;
+ }
+
+ @Override
+ protected double maintain() {
+ AtomicInteger attempts = new AtomicInteger(0);
+ AtomicInteger failures = new AtomicInteger(0);
+ for (Tenant tenant : applicationRepository.tenantRepository().getAllTenants()) {
+ ApplicationCuratorDatabase database = tenant.getApplicationRepo().database();
+ for (ApplicationId id : database.activeApplications())
+ applicationRepository.getActiveApplicationSet(id)
+ .map(application -> application.getForVersionOrLatest(Optional.empty(), clock.instant()))
+ .ifPresent(application -> {
+ try {
+ attempts.incrementAndGet();
+ applicationRepository.modifyPendingRestarts(id, restarts ->
+ triggerPendingRestarts(restartingHosts -> convergenceOf(application, restartingHosts),
+ this::restart,
+ id,
+ restarts,
+ log));
+ }
+ catch (RuntimeException e) {
+ log.log(Level.INFO, "Failed to update reindexing status for " + id + ": " + Exceptions.toMessageString(e));
+ failures.incrementAndGet();
+ }
+ });
+ }
+ return asSuccessFactorDeviation(attempts.get(), failures.get());
+ }
+
+ private ServiceListResponse convergenceOf(Application application, Set<String> restartingHosts) {
+ return applicationRepository.configConvergenceChecker().checkConvergenceUnlessDeferringChangesUntilRestart(application, restartingHosts);
+ }
+
+ private void restart(ApplicationId id, Set<String> nodesToRestart) {
+ applicationRepository.restart(id, HostFilter.from(nodesToRestart));
+ }
+
+ static PendingRestarts triggerPendingRestarts(Function<Set<String>, ServiceListResponse> convergenceChecker,
+ BiConsumer<ApplicationId, Set<String>> restarter,
+ ApplicationId id,
+ PendingRestarts restarts,
+ Logger log) {
+ Set<String> restartingHosts = restarts.hostnames();
+ if (restartingHosts.isEmpty()) return restarts;
+
+ ServiceListResponse convergence = convergenceChecker.apply(restartingHosts);
+ long lowestGeneration = convergence.currentGeneration;
+ Set<String> nodesToRestart = restarts.restartsReadyAt(lowestGeneration);
+ if (nodesToRestart.isEmpty()) {
+ log.info(String.format("Cannot yet restart nodes of %s, as some services are still on generation %d:\n\t%s",
+ id.toFullString(),
+ lowestGeneration,
+ convergence.services().stream()
+ .filter(service -> service.currentGeneration == lowestGeneration)
+ .map(service -> service.serviceInfo.getHostName() + ":" + service.serviceInfo.getServiceName())
+ .collect(Collectors.joining("\n\t"))));
+ return restarts;
+ }
+
+ restarter.accept(id, nodesToRestart);
+ log.info(String.format("Scheduled restart of %d nodes after observing generation %d: %s",
+ nodesToRestart.size(), lowestGeneration, nodesToRestart.stream().sorted().collect(Collectors.joining(", "))));
+
+ return restarts.withoutPreviousGenerations(lowestGeneration);
+ }
+
+}
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
index 8171d63ae37..0dc6cc004be 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/maintenance/ReindexingMaintainer.java
@@ -25,6 +25,8 @@ import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
+import static java.util.Comparator.naturalOrder;
+
/**
* Watches pending reindexing, and sets these to ready when config convergence is observed.
* Also removes data for clusters or document types which no longer exist.
@@ -84,7 +86,7 @@ public class ReindexingMaintainer extends ConfigServerMaintainer {
return () -> {
if (oldest.get() == 0)
oldest.set(convergence.getServiceConfigGenerations(application, timeout).values().stream()
- .min(Comparator.naturalOrder())
+ .min(naturalOrder())
.orElse(-1L));
return oldest.get();
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/provision/HostProvisionerProvider.java b/configserver/src/main/java/com/yahoo/vespa/config/server/provision/HostProvisionerProvider.java
index 8f733c1dd08..7c93004b557 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/provision/HostProvisionerProvider.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/provision/HostProvisionerProvider.java
@@ -10,7 +10,7 @@ import java.util.Optional;
/**
* This class is necessary to support both having and not having a host provisioner. We inject
- * a component registry here, which then enables us to check whether or not we have a provisioner available.
+ * a component registry here, which then enables us to check whether we have a provisioner available.
* We only have a provisioner if we are running in hosted mode.
*
* @author Ulf Lilleengen