diff options
author | Harald Musum <musum@oath.com> | 2018-06-22 09:50:47 +0200 |
---|---|---|
committer | Harald Musum <musum@oath.com> | 2018-06-22 09:50:47 +0200 |
commit | 77df3ad739ba6eead6d5b8417c2974c1a00d566d (patch) | |
tree | 996f2e33c6e53cd4f7edb7ecf244b56968c6dd76 | |
parent | ea9a1eea042617764dfe474cf161e422d1123338 (diff) |
Retry redeployment of applications if that fails
Retry redeployment of applications if they fail when bootstrapping,
do System.exit if it takes longer than a configured amount of time
6 files changed, 63 insertions, 20 deletions
diff --git a/configdefinitions/src/vespa/configserver.def b/configdefinitions/src/vespa/configserver.def index bf4c9599f4a..228a5c6fb4f 100644 --- a/configdefinitions/src/vespa/configserver.def +++ b/configdefinitions/src/vespa/configserver.def @@ -58,3 +58,6 @@ nodeAdminInContainer bool default=true maintainerIntervalMinutes int default=30 # TODO: Default set to a high value (1 year) => maintainer will not run, change when maintainer verified out in prod tenantsMaintainerIntervalMinutes int default=525600 + +# How long bootstrapping can take before giving up (in seconds) +maxDurationOfBootstrap long default=7200 diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java index 821162353d6..ab7702e26d1 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java +++ b/configserver/src/main/java/com/yahoo/vespa/config/server/ApplicationRepository.java @@ -110,16 +110,16 @@ public class ApplicationRepository implements com.yahoo.config.provision.Deploye public ApplicationRepository(TenantRepository tenantRepository, Provisioner hostProvisioner, Clock clock) { - this(tenantRepository, new ConfigConvergenceChecker(), hostProvisioner, clock); + this(tenantRepository, hostProvisioner, clock, new ConfigserverConfig(new ConfigserverConfig.Builder())); } + // For testing public ApplicationRepository(TenantRepository tenantRepository, - ConfigConvergenceChecker convergenceChecker, Provisioner hostProvisioner, - Clock clock) { - this(tenantRepository, Optional.of(hostProvisioner), - convergenceChecker, new HttpProxy(new SimpleHttpFetcher()), - new ConfigserverConfig(new ConfigserverConfig.Builder()), clock, new FileDistributionStatus()); + Clock clock, + ConfigserverConfig configserverConfig) { + this(tenantRepository, Optional.of(hostProvisioner), new ConfigConvergenceChecker(), new HttpProxy(new SimpleHttpFetcher()), 
+ configserverConfig, clock, new FileDistributionStatus()); } private ApplicationRepository(TenantRepository tenantRepository, @@ -492,6 +492,10 @@ public class ApplicationRepository implements com.yahoo.config.provision.Deploye return getLocalSession(tenant, sessionId).getMetaData(); } + ConfigserverConfig configserverConfig() { + return configserverConfig; + } + private void validateThatLocalSessionIsNotActive(Tenant tenant, long sessionId) { LocalSession session = getLocalSession(tenant, sessionId); if (Session.Status.ACTIVATE.equals(session.getStatus())) { @@ -557,24 +561,42 @@ public class ApplicationRepository implements com.yahoo.config.provision.Deploye } } - void redeployAllApplications() throws InterruptedException { + boolean redeployAllApplications(Duration maxDuration) throws InterruptedException { + Instant end = Instant.now().plus(maxDuration); + Set<ApplicationId> applicationIds = listApplications(); + do { + applicationIds = redeployApplications(applicationIds); + } while ( ! applicationIds.isEmpty() && Instant.now().isBefore(end)); + + if ( ! 
applicationIds.isEmpty()) { + log.log(LogLevel.ERROR, "Redeploying applications not finished after " + maxDuration + + ", exiting, applications that failed redeployment: " + applicationIds); + return false; + } + return true; + } + + // Returns the set of applications that failed to redeploy + private Set<ApplicationId> redeployApplications(Set<ApplicationId> applicationIds) throws InterruptedException { ExecutorService executor = Executors.newFixedThreadPool(configserverConfig.numParallelTenantLoaders(), new DaemonThreadFactory("redeploy apps")); // Keep track of deployment per application Map<ApplicationId, Future<?>> futures = new HashMap<>(); - tenantRepository.getAllTenants() - .forEach(tenant -> listApplicationIds(tenant) - .forEach(appId -> deployFromLocalActive(appId).ifPresent( - deployment -> futures.put(appId,executor.submit(deployment::activate))))); + Set<ApplicationId> failedDeployments = new HashSet<>(); + applicationIds.forEach(appId -> deployFromLocalActive(appId).ifPresent( + deployment -> futures.put(appId, executor.submit(deployment::activate)))); for (Map.Entry<ApplicationId, Future<?>> f : futures.entrySet()) { try { f.getValue().get(); } catch (ExecutionException e) { - throw new RuntimeException("Redeploying of " + f.getKey() + " failed", e); + ApplicationId app = f.getKey(); + log.log(LogLevel.WARNING, "Redeploying " + app + " failed, will retry"); + failedDeployments.add(app); } } executor.shutdown(); executor.awaitTermination(365, TimeUnit.DAYS); // Timeout should never happen + return failedDeployments; } private LocalSession getExistingSession(Tenant tenant, ApplicationId applicationId) { diff --git a/configserver/src/main/java/com/yahoo/vespa/config/server/ConfigServerBootstrap.java b/configserver/src/main/java/com/yahoo/vespa/config/server/ConfigServerBootstrap.java index 916fde97e35..6e6f60b29dd 100644 --- a/configserver/src/main/java/com/yahoo/vespa/config/server/ConfigServerBootstrap.java +++ 
b/configserver/src/main/java/com/yahoo/vespa/config/server/ConfigServerBootstrap.java @@ -31,12 +31,17 @@ public class ConfigServerBootstrap extends AbstractComponent implements Runnable private static final ExecutorService rpcServerExecutor = Executors.newSingleThreadExecutor(new DaemonThreadFactory("config server RPC server")); private static final String vipStatusClusterIdentifier = "configserver"; + enum MainThread {START, DO_NOT_START} + enum RedeployingApplicationsFails {EXIT_JVM, CONTINUE} + private final ApplicationRepository applicationRepository; private final RpcServer server; private final Thread serverThread; private final VersionState versionState; private final StateMonitor stateMonitor; private final VipStatus vipStatus; + private final Duration maxDurationOfRedeployment; + private final RedeployingApplicationsFails exitIfRedeployingApplicationsFails; // The tenants object is injected so that all initial requests handlers are // added to the rpc server before it starts answering rpc requests. 
@@ -44,20 +49,23 @@ public class ConfigServerBootstrap extends AbstractComponent implements Runnable @Inject public ConfigServerBootstrap(ApplicationRepository applicationRepository, RpcServer server, VersionState versionState, StateMonitor stateMonitor, VipStatus vipStatus) { - this(applicationRepository, server, versionState, stateMonitor, vipStatus, true); + this(applicationRepository, server, versionState, stateMonitor, vipStatus, MainThread.START, RedeployingApplicationsFails.EXIT_JVM); } // For testing only ConfigServerBootstrap(ApplicationRepository applicationRepository, RpcServer server, VersionState versionState, - StateMonitor stateMonitor, VipStatus vipStatus, boolean startMainThread) { + StateMonitor stateMonitor, VipStatus vipStatus, MainThread mainThread, + RedeployingApplicationsFails exitIfRedeployingApplicationsFails) { this.applicationRepository = applicationRepository; this.server = server; this.versionState = versionState; this.stateMonitor = stateMonitor; this.serverThread = new Thread(this, "configserver main"); this.vipStatus = vipStatus; + this.maxDurationOfRedeployment = Duration.ofSeconds(applicationRepository.configserverConfig().maxDurationOfBootstrap()); + this.exitIfRedeployingApplicationsFails = exitIfRedeployingApplicationsFails; initializing(); // Initially take server out of rotation - if (startMainThread) + if (mainThread == MainThread.START) start(); } @@ -80,11 +88,15 @@ public class ConfigServerBootstrap extends AbstractComponent implements Runnable log.log(LogLevel.INFO, "Configserver upgrading from " + versionState.storedVersion() + " to " + versionState.currentVersion() + ". Redeploying all applications"); try { - applicationRepository.redeployAllApplications(); + if ( ! 
applicationRepository.redeployAllApplications(maxDurationOfRedeployment)) { + redeployingApplicationsFailed(); + return; // Status will not be set to 'up' since we return here + } versionState.saveNewVersion(); log.log(LogLevel.INFO, "All applications redeployed successfully"); } catch (Exception e) { log.log(LogLevel.ERROR, "Redeployment of applications failed", e); + redeployingApplicationsFailed(); return; // Status will not be set to 'up' since we return here } } @@ -144,5 +156,9 @@ public class ConfigServerBootstrap extends AbstractComponent implements Runnable log.log(LogLevel.INFO, "RPC server started"); } + private void redeployingApplicationsFailed() { + if (exitIfRedeployingApplicationsFails == RedeployingApplicationsFails.EXIT_JVM) System.exit(1); + } + } diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/ConfigServerBootstrapTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/ConfigServerBootstrapTest.java index 992d46d3115..082c3058598 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/ConfigServerBootstrapTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/ConfigServerBootstrapTest.java @@ -76,7 +76,9 @@ public class ConfigServerBootstrapTest { RpcServer rpcServer = createRpcServer(configserverConfig); VipStatus vipStatus = new VipStatus(); ConfigServerBootstrap bootstrap = new ConfigServerBootstrap(tester.applicationRepository(), rpcServer, versionState, - createStateMonitor(), vipStatus, false /* do not call run method */); + createStateMonitor(), vipStatus, + ConfigServerBootstrap.MainThread.DO_NOT_START, + ConfigServerBootstrap.RedeployingApplicationsFails.CONTINUE); assertFalse(vipStatus.isInRotation()); // Call method directly, to be sure that it is finished redeploying all applications and we can check status bootstrap.run(); @@ -112,7 +114,8 @@ public class ConfigServerBootstrapTest { .configServerDBDir(temporaryFolder.newFolder("serverdb").getAbsolutePath()) 
.configDefinitionsDir(temporaryFolder.newFolder("configdefinitions").getAbsolutePath()) .hostedVespa(true) - .multitenant(true)); + .multitenant(true) + .maxDurationOfBootstrap(1) /* seconds */); } public static class MockRpc extends com.yahoo.vespa.config.server.rpc.MockRpc { diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/DeployTester.java b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/DeployTester.java index ce53451ae2e..a4f5679aa39 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/DeployTester.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/deploy/DeployTester.java @@ -112,7 +112,7 @@ public class DeployTester { catch (Exception e) { throw new IllegalArgumentException(e); } - applicationRepository = new ApplicationRepository(tenantRepository, new ProvisionerAdapter(provisioner), clock); + applicationRepository = new ApplicationRepository(tenantRepository, new ProvisionerAdapter(provisioner), clock, configserverConfig); } public Tenant tenant() { diff --git a/configserver/src/test/java/com/yahoo/vespa/config/server/http/v2/ApplicationHandlerTest.java b/configserver/src/test/java/com/yahoo/vespa/config/server/http/v2/ApplicationHandlerTest.java index d8c5e33ca65..6680907e1c0 100644 --- a/configserver/src/test/java/com/yahoo/vespa/config/server/http/v2/ApplicationHandlerTest.java +++ b/configserver/src/test/java/com/yahoo/vespa/config/server/http/v2/ApplicationHandlerTest.java @@ -69,7 +69,6 @@ public class ApplicationHandlerTest { tenantRepository.addTenant(TenantBuilder.create(componentRegistry, foobar)); provisioner = new SessionHandlerTest.MockProvisioner(); applicationRepository = new ApplicationRepository(tenantRepository, - new ConfigConvergenceChecker(stateApiFactory), provisioner, Clock.systemUTC()); listApplicationsHandler = new ListApplicationsHandler(ListApplicationsHandler.testOnlyContext(), tenantRepository, |