aboutsummaryrefslogtreecommitdiffstats
path: root/configserver
diff options
context:
space:
mode:
authorHarald Musum <musum@yahooinc.com>2023-05-03 09:37:20 +0200
committerHarald Musum <musum@yahooinc.com>2023-05-03 09:37:20 +0200
commit37f2c509e4f9e33109d93737c5662f27337b3557 (patch)
treeadc81a30e7d7e3fa274ee08904a7af905e1bb651 /configserver
parentd0ec31ec3d0de3e4bc9e283cee079fb30eb349aa (diff)
Only check convergence before restart for services on hosts that should be restarted
When checking config convergence for app always check all services as before
Diffstat (limited to 'configserver')
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java33
-rw-r--r--configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java8
-rw-r--r--configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java5
-rw-r--r--configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java45
4 files changed, 51 insertions, 40 deletions
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java
index 132d3d913e7..040f230a40e 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/application/ConfigConvergenceChecker.java
@@ -86,19 +86,21 @@ public class ConfigConvergenceChecker extends AbstractComponent {
/** Fetches the active config generation for all services in the given application. */
public Map<ServiceInfo, Long> getServiceConfigGenerations(Application application, Duration timeoutPerService) {
- return getServiceConfigGenerations(application, timeoutPerService, true);
+ return getServiceConfigGenerations(application, timeoutPerService, new HostsToCheck(Set.of()));
}
/**
* Fetches the active config generation for all services in the given application. Will not check services
- * which defer config changes until restart if checkAll is false.
+ * which defer config changes until restart if checkAll is false. hostsToCheck are names to check, or empty to check all.
*/
- private Map<ServiceInfo, Long> getServiceConfigGenerations(Application application, Duration timeoutPerService, boolean checkAll) {
+ private Map<ServiceInfo, Long> getServiceConfigGenerations(Application application,
+ Duration timeoutPerService,
+ HostsToCheck hostsToCheck) {
List<ServiceInfo> servicesToCheck = new ArrayList<>();
application.getModel().getHosts()
.forEach(host -> host.getServices().stream()
.filter(service -> serviceTypesToCheck.contains(service.getServiceType()))
- .filter(serviceInfo -> shouldCheckService(checkAll, application, serviceInfo))
+ .filter(serviceInfo -> shouldCheckService(hostsToCheck, application, serviceInfo))
.forEach(service -> getStatePort(service).ifPresent(port -> servicesToCheck.add(service))));
log.log(Level.FINE, "Services to check for config convergence: " + servicesToCheck);
@@ -107,20 +109,20 @@ public class ConfigConvergenceChecker extends AbstractComponent {
/** Checks all services in given application. Returns the minimum current generation of all services */
public ServiceListResponse checkConvergenceForAllServices(Application application, Duration timeoutPerService) {
- return checkConvergence(application, timeoutPerService, true);
+ return checkConvergence(application, timeoutPerService, new HostsToCheck(Set.of()));
}
/**
* Checks services except those which defer config changes until restart in the given application.
* Returns the minimum current generation of those services.
*/
- public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application) {
+ public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application, Set<String> hostnames) {
Duration timeoutPerService = Duration.ofSeconds(10);
- return checkConvergence(application, timeoutPerService, false);
+ return checkConvergence(application, timeoutPerService, new HostsToCheck(hostnames));
}
- private ServiceListResponse checkConvergence(Application application, Duration timeoutPerService, boolean checkAll) {
- Map<ServiceInfo, Long> currentGenerations = getServiceConfigGenerations(application, timeoutPerService, checkAll);
+ private ServiceListResponse checkConvergence(Application application, Duration timeoutPerService, HostsToCheck hostsToCheck) {
+ Map<ServiceInfo, Long> currentGenerations = getServiceConfigGenerations(application, timeoutPerService, hostsToCheck);
long currentGeneration = currentGenerations.values().stream().mapToLong(Long::longValue).min().orElse(-1);
return new ServiceListResponse(currentGenerations, application.getApplicationGeneration(), currentGeneration);
}
@@ -142,8 +144,9 @@ public class ConfigConvergenceChecker extends AbstractComponent {
}
}
- private boolean shouldCheckService(boolean checkServicesWithDeferChangesUntilRestart, Application application, ServiceInfo serviceInfo) {
- if (checkServicesWithDeferChangesUntilRestart) return true;
+ private boolean shouldCheckService(HostsToCheck hostsToCheck, Application application, ServiceInfo serviceInfo) {
+ if (hostsToCheck.checkAll()) return true;
+ if ( ! hostsToCheck.check(serviceInfo.getHostName())) return false;
if (isNotContainer(serviceInfo)) return true;
return serviceIsInClusterWhichShouldBeChecked(application, serviceInfo);
}
@@ -307,6 +310,14 @@ public class ConfigConvergenceChecker extends AbstractComponent {
.build();
}
+ private record HostsToCheck(Set<String> hostnames) {
+
+ public boolean checkAll() { return hostnames.isEmpty(); }
+
+ public boolean check(String hostname) { return checkAll() || hostnames.contains(hostname); }
+
+ }
+
public static class ServiceResponse {
public enum Status { ok, notFound, hostNotFound, error }
diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
index e135bc7f0e2..cab3e89c606 100644
--- a/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
+++ b/configserver/src/main/java/com/yahoo/vespa/config/server/deploy/Deployment.java
@@ -173,9 +173,9 @@ public class Deployment implements com.yahoo.config.provision.Deployment {
RestartActions restartActions = configChangeActions.getRestartActions().useForInternalRestart(internalRedeploy);
if (restartActions.isEmpty()) return;
- waitForConfigToConverge(applicationId);
-
Set<String> hostnames = restartActions.hostnames();
+ waitForConfigToConverge(applicationId, hostnames);
+
provisioner.get().restart(applicationId, HostFilter.from(hostnames));
deployLogger.log(Level.INFO, String.format("Scheduled service restart of %d nodes: %s",
hostnames.size(), hostnames.stream().sorted().collect(Collectors.joining(", "))));
@@ -184,14 +184,14 @@ public class Deployment implements com.yahoo.config.provision.Deployment {
this.configChangeActions = configChangeActions.withRestartActions(new RestartActions());
}
- private void waitForConfigToConverge(ApplicationId applicationId) {
+ private void waitForConfigToConverge(ApplicationId applicationId, Set<String> hostnames) {
deployLogger.log(Level.INFO, "Wait for all services to use new config generation before restarting");
var convergenceChecker = applicationRepository.configConvergenceChecker();
var app = applicationRepository.getActiveApplication(applicationId);
ServiceListResponse response = null;
while (timeLeft(applicationId, response)) {
- response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app);
+ response = convergenceChecker.checkConvergenceUnlessDeferringChangesUntilRestart(app, hostnames);
if (response.converged) {
deployLogger.log(Level.INFO, "Services converged on new config generation " + response.currentGeneration);
return;
diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java
index f335e978235..ccbdcde2c2e 100644
--- a/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java
+++ b/configserver/src/test/java/com/yahoo/vespa/config/server/MockConfigConvergenceChecker.java
@@ -8,8 +8,7 @@ import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.function.Function;
-import java.util.stream.Collectors;
+import java.util.Set;
public class MockConfigConvergenceChecker extends ConfigConvergenceChecker {
@@ -43,7 +42,7 @@ public class MockConfigConvergenceChecker extends ConfigConvergenceChecker {
}
@Override
- public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application) {
+ public ServiceListResponse checkConvergenceUnlessDeferringChangesUntilRestart(Application application, Set<String> hostnames) {
iteration++;
if (servicesThatFailFirstIteration.isEmpty() || iteration > 1)
return new ServiceListResponse(Map.of(), wantedGeneration, wantedGeneration);
diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java
index 58e76d5dc0a..c1da4e0502f 100644
--- a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java
+++ b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/HostedDeployTest.java
@@ -43,6 +43,7 @@ import java.nio.file.Files;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -52,6 +53,7 @@ import java.util.stream.IntStream;
import static com.yahoo.vespa.config.server.deploy.DeployTester.CountingModelFactory;
import static com.yahoo.vespa.config.server.deploy.DeployTester.createFailingModelFactory;
import static com.yahoo.vespa.config.server.deploy.DeployTester.createHostedModelFactory;
+import static com.yahoo.yolean.Exceptions.findCause;
import static com.yahoo.yolean.Exceptions.uncheck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -446,7 +448,7 @@ public class HostedDeployTest {
@Test
public void testThatConfigChangeActionsAreCollectedFromAllModels() {
List<Host> hosts = createHosts(9, "6.1.0", "6.2.0");
- List<ServiceInfo> services = createServices();
+ List<ServiceInfo> services = createServices(1);
List<ModelFactory> modelFactories = List.of(
new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"),
@@ -463,13 +465,12 @@ public class HostedDeployTest {
@Test
public void testConfigConvergenceBeforeRestart() {
List<Host> hosts = createHosts(9, "6.1.0", "6.2.0");
- List<ServiceInfo> services = createServices();
+ List<ServiceInfo> services = createServices(1);
+ List<ServiceInfo> twoServices = createServices(2);
List<ModelFactory> modelFactories = List.of(
- new ConfigChangeActionsModelFactory(Version.fromString("6.1.0"),
- new VespaRestartAction(ClusterSpec.Id.from("test"), "change", services)),
new ConfigChangeActionsModelFactory(Version.fromString("6.2.0"),
- new VespaRestartAction(ClusterSpec.Id.from("test"), "other change", services)));
+ new VespaRestartAction(ClusterSpec.Id.from("test"), "change", services)));
DeployTester tester = createTester(hosts,
modelFactories,
@@ -479,9 +480,10 @@ public class HostedDeployTest {
var result = tester.deployApp("src/test/apps/hosted/", "6.2.0");
DeployHandlerLogger deployLogger = result.deployLogger();
- System.out.println(SlimeUtils.toJson(deployLogger.slime().get()));
+ assertLogContainsMessage(deployLogger, "Scheduled service restart of 1 nodes: hostName0");
assertLogContainsMessage(deployLogger, "Wait for all services to use new config generation before restarting");
- assertLogContainsMessage(deployLogger, "Services that did not converge on new config generation 2: hostName:serviceName on generation 1, hostName2:serviceName2 on generation 1. Will retry");
+ // Should only check convergence on 1 of the nodes
+ assertLogContainsMessage(deployLogger, "Services that did not converge on new config generation 2: hostName0:serviceName0 on generation 1. Will retry");
assertLogContainsMessage(deployLogger, "Services converged on new config generation 2");
}
@@ -495,7 +497,7 @@ public class HostedDeployTest {
@Test
public void testThatAllowedConfigChangeActionsAreActedUpon() {
List<Host> hosts = createHosts(9, "6.1.0");
- List<ServiceInfo> services = createServices();
+ List<ServiceInfo> services = createServices(1);
ManualClock clock = new ManualClock(Instant.EPOCH);
List<ModelFactory> modelFactories = List.of(
@@ -517,7 +519,7 @@ public class HostedDeployTest {
assertEquals(9, tester.getAllocatedHostsOf(tester.applicationId()).getHosts().size());
assertTrue(prepareResult.configChangeActions().getRestartActions().isEmpty()); // Handled by deployment.
assertEquals(Optional.of(ApplicationReindexing.empty()
- .withPending("cluster", "music", prepareResult.sessionId())),
+ .withPending("cluster0", "music", prepareResult.sessionId())),
tester.tenant().getApplicationRepo().database().readReindexingStatus(tester.applicationId()));
}
@@ -589,19 +591,18 @@ public class HostedDeployTest {
.build();
}
- private static List<ServiceInfo> createServices() {
- return List.of(new ServiceInfo("serviceName",
- "serviceType",
- null,
- Map.of("clustername", "cluster"),
- "configId",
- "hostName"),
- new ServiceInfo("serviceName2",
- "serviceType2",
- null,
- Map.of("clustername", "cluster"),
- "configId",
- "hostName2"));
+ private static List<ServiceInfo> createServices(int serviceCount) {
+ List<ServiceInfo> services = new ArrayList<>();
+
+ IntStream.range(0, serviceCount)
+ .forEach(i -> services.add(new ServiceInfo("serviceName" + i,
+ i == 0 ? "searchnode" : "container",
+ null,
+ Map.of("clustername", "cluster" + i),
+ "configId" + i,
+ "hostName" + i)));
+
+ return services;
}
private static class ConfigChangeActionsModelFactory extends TestModelFactory {