// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.deployment;

import com.google.common.collect.ImmutableSet;
import com.yahoo.component.Version;
import com.yahoo.config.application.api.DeploymentSpec;
import com.yahoo.config.application.api.Notifications;
import com.yahoo.config.application.api.Notifications.When;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.AthenzDomain;
import com.yahoo.config.provision.AthenzService;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.security.KeyAlgorithm;
import com.yahoo.security.KeyUtils;
import com.yahoo.security.SignatureAlgorithm;
import com.yahoo.security.X509CertificateBuilder;
import com.yahoo.security.X509CertificateUtils;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.ActivateResult;
import com.yahoo.vespa.hosted.controller.api.application.v4.model.DeployOptions;
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.identifiers.Hostname;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.ConfigServerException;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.Node;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.PrepareResponse;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterId;
import com.yahoo.vespa.hosted.controller.api.integration.organization.DeploymentFailureMails;
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
import com.yahoo.yolean.Exceptions;

import javax.security.auth.x500.X500Principal;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.UncheckedIOException;
import java.math.BigInteger;
import java.net.URI;
import java.security.KeyPair;
import java.security.cert.CertificateExpiredException;
import java.security.cert.CertificateNotYetValidException;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import static com.yahoo.config.application.api.Notifications.Role.author;
import static com.yahoo.config.application.api.Notifications.When.failing;
import static com.yahoo.config.application.api.Notifications.When.failingCommit;
import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.active;
import static com.yahoo.vespa.hosted.controller.api.integration.configserver.Node.State.reserved;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.aborted;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.deploymentFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.installationFailed;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.outOfCapacity;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.running;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.testFailure;
import static com.yahoo.vespa.hosted.controller.deployment.Step.installTester;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.logging.Level.INFO;
import static java.util.logging.Level.WARNING;

/**
 * Runs steps of a deployment job against its provided controller.
 *
 * A dual-purpose logger is set up for each step run here:
 *   1. all messages are logged to a buffer which is stored in an external log storage at the end of execution, and
 *   2. all messages are also logged through the usual logging framework; by default, any messages of level
 *      {@code Level.INFO} or higher end up in the Vespa log, and all messages may be sent there by means of log-control.
 *
 * @author jonmv
 */
public class InternalStepRunner implements StepRunner {

    private static final Logger logger = Logger.getLogger(InternalStepRunner.class.getName());

    private static final NodeResources DEFAULT_TESTER_RESOURCES =
            new NodeResources(1, 4, 50, 0.3, NodeResources.DiskSpeed.any);

    // Must match exactly the advertised resources of an AWS instance type. Also consider that the container
    // will have ~1.8 GB less memory than equivalent resources in AWS (VESPA-16259).
    private static final NodeResources DEFAULT_TESTER_RESOURCES_AWS =
            new NodeResources(2, 8, 50, 0.3, NodeResources.DiskSpeed.any);

    /** How long to wait for endpoints to show up, once nodes and services have converged. */
    static final Duration endpointTimeout = Duration.ofMinutes(15);

    /** How long the installation of the tester container may take in total. */
    static final Duration testerTimeout = Duration.ofMinutes(30);

    /** How long the installation of the real deployment may take in total. */
    static final Duration installationTimeout = Duration.ofMinutes(150);

    /** Validity period of the certificate generated for the tester. */
    static final Duration certificateTimeout = Duration.ofMinutes(300);

    private final Controller controller;
    private final TestConfigSerializer testConfigSerializer;
    private final DeploymentFailureMails mails;

    public InternalStepRunner(Controller controller) {
        this.controller = controller;
        this.testConfigSerializer = new TestConfigSerializer(controller.system());
        this.mails = new DeploymentFailureMails(controller.zoneRegistry());
    }

    /**
     * Runs the given step of the given run, dispatching to the method implementing that step.
     *
     * @return the status resulting from the step, or empty when the step did not complete and is expected
     *         to be attempted again (the cases logged as "will retry", "not yet complete", and so on)
     */
    @Override
    public Optional<RunStatus> run(LockedStep step, RunId id) {
        DualLogger logger = new DualLogger(id, step.get());
        try {
            switch (step.get()) {
                case deployTester: return deployTester(id, logger);
                case deployInitialReal: return deployInitialReal(id, logger);
                case installInitialReal: return installInitialReal(id, logger);
                case deployReal: return deployReal(id, logger);
                case installTester: return installTester(id, logger);
                case installReal: return installReal(id, logger);
                case startStagingSetup: return startTests(id, true, logger);
                case endStagingSetup: return endTests(id, logger);
                case startTests: return startTests(id, false, logger);
                case endTests: return endTests(id, logger);
                case copyVespaLogs: return copyVespaLogs(id, logger);
                case deactivateReal: return deactivateReal(id, logger);
                case deactivateTester: return deactivateTester(id, logger);
                case report: return report(id, logger);
                default: throw new AssertionError("Unknown step '" + step + "'!");
            }
        }
        catch (UncheckedIOException e) {
            // IO trouble is treated as transient: log it (stack trace only internally) and leave the step unfinished.
            logger.logWithInternalException(INFO, "IO exception running " + id + ": " + Exceptions.toMessageString(e), e);
            return Optional.empty();
        }
        catch (RuntimeException e) {
            logger.log(WARNING, "Unexpected exception running " + id, e);
            if (JobProfile.of(id.type()).alwaysRun().contains(step.get())) {
                logger.log("Will keep trying, as this is a cleanup step.");
                return Optional.empty();
            }
            return Optional.of(error);
        }
    }

    /** Deploys the source versions of the run, falling back to the target versions where no source is set. */
    private Optional<RunStatus> deployInitialReal(RunId id, DualLogger logger) {
        Versions versions = controller.jobController().run(id).get().versions();
        logger.log("Deploying platform version " +
                   versions.sourcePlatform().orElse(versions.targetPlatform()) +
                   " and application version " +
                   versions.sourceApplication().orElse(versions.targetApplication()).id() + " ...");
        return deployReal(id, true, versions, logger);
    }

    /** Deploys the target versions of the run. */
    private Optional<RunStatus> deployReal(RunId id, DualLogger logger) {
        Versions versions = controller.jobController().run(id).get().versions();
        logger.log("Deploying platform version " + versions.targetPlatform() +
                   " and application version " + versions.targetApplication().id() + " ...");
        return deployReal(id, false, versions, logger);
    }

    /**
     * Deploys the real application. For manually deployed environments the package is read from the
     * application store and the target platform is set explicitly; otherwise both are left empty.
     */
    private Optional<RunStatus> deployReal(RunId id, boolean setTheStage, Versions versions, DualLogger logger) {
        Optional<ApplicationPackage> applicationPackage = id.type().environment().isManuallyDeployed()
                ? Optional.of(new ApplicationPackage(controller.applications().applicationStore()
                                                               .getDev(id.application(), id.type().zone(controller.system()))))
                : Optional.empty();

        Optional<Version> vespaVersion = id.type().environment().isManuallyDeployed()
                ? Optional.of(versions.targetPlatform())
                : Optional.empty();

        return deploy(id.application(),
                      id.type(),
                      () -> controller.applications().deploy(id.application(),
                                                             id.type().zone(controller.system()),
                                                             applicationPackage,
                                                             new DeployOptions(false,
                                                                               vespaVersion,
                                                                               false,
                                                                               setTheStage)),
                      logger);
    }

    /** Deploys the tester container, on the current system version. */
    private Optional<RunStatus> deployTester(RunId id, DualLogger logger) {
        Version platform = controller.systemVersion();
        logger.log("Deploying the tester container on platform " + platform + " ...");
        return deploy(id.tester().id(),
                      id.type(),
                      () -> controller.applications().deployTester(id.tester(),
                                                                   testerPackage(id),
                                                                   id.type().zone(controller.system()),
                                                                   new DeployOptions(true,
                                                                                     Optional.of(platform),
                                                                                     false,
                                                                                     false)),
                      logger);
    }

    /**
     * Runs the given deployment and interprets its outcome:
     * disallowed re-feed actions fail the deployment, restart actions trigger restarts of the affected hosts,
     * and config server errors are mapped to a retry (empty), a failure status, or are re-thrown.
     */
    private Optional<RunStatus> deploy(ApplicationId id, JobType type, Supplier<ActivateResult> deployment, DualLogger logger) {
        try {
            PrepareResponse prepareResponse = deployment.get().prepareResponse();
            if ( ! prepareResponse.configChangeActions.refeedActions.stream().allMatch(action -> action.allowed)) {
                List<String> messages = new ArrayList<>();
                messages.add("Deploy failed due to non-compatible changes that require re-feed.");
                messages.add("Your options are:");
                messages.add("1. Revert the incompatible changes.");
                messages.add("2. If you think it is safe in your case, you can override this validation, see");
                messages.add(" http://docs.vespa.ai/documentation/reference/validation-overrides.html");
                messages.add("3. Deploy as a new application under a different name.");
                messages.add("Illegal actions:");
                prepareResponse.configChangeActions.refeedActions.stream()
                                                                 .filter(action -> ! action.allowed)
                                                                 .flatMap(action -> action.messages.stream())
                                                                 .forEach(messages::add);
                messages.add("Details:");
                prepareResponse.log.stream()
                                   .map(entry -> entry.message)
                                   .forEach(messages::add);
                logger.log(messages);
                return Optional.of(deploymentFailed);
            }

            if (prepareResponse.configChangeActions.restartActions.isEmpty())
                logger.log("No services requiring restart.");
            else
                prepareResponse.configChangeActions.restartActions.stream()
                                                                  .flatMap(action -> action.services.stream())
                                                                  .map(service -> service.hostName)
                                                                  .sorted().distinct()
                                                                  .map(Hostname::new)
                                                                  .forEach(hostname -> {
                                                                      controller.applications().restart(new DeploymentId(id, type.zone(controller.system())), Optional.of(hostname));
                                                                      logger.log("Restarting services on host " + hostname.id() + ".");
                                                                  });
            logger.log("Deployment successful.");
            if (prepareResponse.message != null)
                logger.log(prepareResponse.message);
            return Optional.of(running);
        }
        catch (ConfigServerException e) {
            switch (e.getErrorCode()) {
                case ACTIVATION_CONFLICT:
                case APPLICATION_LOCK_FAILURE:
                case CERTIFICATE_NOT_READY:
                    logger.log("Deployment failed with possibly transient error " + e.getErrorCode() +
                               ", will retry: " + e.getMessage());
                    return Optional.empty();
                case LOAD_BALANCER_NOT_READY:
                case PARENT_HOST_NOT_READY:
                    logger.log(e.getServerMessage());
                    return Optional.empty();
                case OUT_OF_CAPACITY:
                    logger.log(e.getServerMessage());
                    return Optional.of(outOfCapacity);
                case INVALID_APPLICATION_PACKAGE:
                case BAD_REQUEST:
                    logger.log(e.getMessage());
                    return Optional.of(deploymentFailed);
            }

            // Any other error code is unexpected here; let the caller deal with it.
            throw e;
        }
    }

    /** Checks installation of the initial (source) versions. */
    private Optional<RunStatus> installInitialReal(RunId id, DualLogger logger) {
        return installReal(id, true, logger);
    }

    /** Checks installation of the target versions. */
    private Optional<RunStatus> installReal(RunId id, DualLogger logger) {
        return installReal(id, false, logger);
    }

    /**
     * Checks whether the real deployment is fully installed: nodes and services converged, endpoints available,
     * and containers up. Returns empty while still waiting, and a failure status once a timeout has passed
     * or the deployment is gone.
     */
    private Optional<RunStatus> installReal(RunId id, boolean setTheStage, DualLogger logger) {
        Optional<Deployment> deployment = deployment(id.application(), id.type());
        if ( ! deployment.isPresent()) {
            logger.log(INFO, "Deployment expired before installation was successful.");
            return Optional.of(installationFailed);
        }

        Versions versions = controller.jobController().run(id).get().versions();
        Version platform = setTheStage ? versions.sourcePlatform().orElse(versions.targetPlatform()) : versions.targetPlatform();
        ApplicationVersion application = setTheStage ? versions.sourceApplication().orElse(versions.targetApplication()) : versions.targetApplication();
        logger.log("Checking installation of " + platform + " and " + application.id() + " ...");

        if (   nodesConverged(id.application(), id.type(), platform, logger)
            && servicesConverged(id.application(), id.type(), platform, logger)) {
            if (endpointsAvailable(id.application(), id.type().zone(controller.system()), logger)) {
                if (containersAreUp(id.application(), id.type().zone(controller.system()), logger)) {
                    logger.log("Installation succeeded!");
                    return Optional.of(running);
                }
            }
            else if (timedOut(id, deployment.get(), endpointTimeout)) {
                logger.log(WARNING, "Endpoints failed to show up within " + endpointTimeout.toMinutes() + " minutes!");
                return Optional.of(error);
            }
        }

        if (timedOut(id, deployment.get(), installationTimeout)) {
            logger.log(INFO, "Installation failed to complete within " + installationTimeout.toMinutes() + " minutes!");
            return Optional.of(installationFailed);
        }

        logger.log("Installation not yet complete.");
        return Optional.empty();
    }

    /**
     * Checks whether the tester container is fully installed, with the same convergence checks as for the
     * real deployment. Timeouts are measured from the start time of the {@code installTester} step.
     */
    private Optional<RunStatus> installTester(RunId id, DualLogger logger) {
        Run run = controller.jobController().run(id).get();
        Version platform = controller.systemVersion();
        logger.log("Checking installation of tester container ...");
        if (   nodesConverged(id.tester().id(), id.type(), platform, logger)
            && servicesConverged(id.tester().id(), id.type(), platform, logger)) {
            if (endpointsAvailable(id.tester().id(), id.type().zone(controller.system()), logger)) {
                if (containersAreUp(id.tester().id(), id.type().zone(controller.system()), logger)) {
                    logger.log("Tester container successfully installed!");
                    return Optional.of(running);
                }
            }
            else if (run.stepInfo(installTester).get().startTime().get().plus(endpointTimeout).isBefore(controller.clock().instant())) {
                logger.log(WARNING, "Tester failed to show up within " + endpointTimeout.toMinutes() + " minutes!");
                return Optional.of(error);
            }
        }

        // The overall deadline is testerTimeout, as reported in the message below. Checking against
        // endpointTimeout here would fail the step after 15 minutes while claiming 30 had passed.
        if (run.stepInfo(installTester).get().startTime().get().plus(testerTimeout).isBefore(controller.clock().instant())) {
            logger.log(WARNING, "Installation of tester failed to complete within " + testerTimeout.toMinutes() + " minutes!");
            return Optional.of(error);
        }

        logger.log("Installation of tester not yet complete.");
        return Optional.empty();
    }

    /** Returns true iff all containers in the deployment give 100 consecutive 200 OK responses on /status.html. */
    private boolean containersAreUp(ApplicationId id, ZoneId zoneId, DualLogger logger) {
        var endpoints = controller.applications().clusterEndpoints(Set.of(new DeploymentId(id, zoneId)));
        if ( ! endpoints.containsKey(zoneId))
            return false;

        for (URI endpoint : endpoints.get(zoneId).values()) {
            boolean ready = id.instance().isTester() ? controller.jobController().cloud().testerReady(endpoint)
                                                     : controller.jobController().cloud().ready(endpoint);
            if (!ready) {
                logger.log("Failed to get 100 consecutive OKs from " + endpoint);
                return false;
            }
        }

        return true;
    }

    /** Returns whether cluster endpoints are listed for the deployment in the given zone, and all of them exist according to the tester cloud. */
    private boolean endpointsAvailable(ApplicationId id, ZoneId zone, DualLogger logger) {
        logger.log("Attempting to find deployment endpoints ...");
        var endpoints = controller.applications().clusterEndpoints(Set.of(new DeploymentId(id, zone)));
        if ( ! endpoints.containsKey(zone)) {
            logger.log("Endpoints not yet ready.");
            return false;
        }
        for (var endpoint : endpoints.get(zone).values())
            if ( ! controller.jobController().cloud().exists(endpoint)) {
                logger.log(INFO, "DNS lookup yielded no IP address for '" + endpoint + "'.");
                return false;
            }

        logEndpoints(endpoints, logger);
        return true;
    }

    /** Logs the given endpoints, grouped by zone, with one line per cluster. */
    private void logEndpoints(Map<ZoneId, Map<ClusterSpec.Id, URI>> endpoints, DualLogger logger) {
        List<String> messages = new ArrayList<>();
        messages.add("Found endpoints:");
        endpoints.forEach((zone, uris) -> {
            messages.add("- " + zone);
            uris.forEach((cluster, uri) -> messages.add(" |-- " + uri + " (" + cluster + ")"));
        });
        logger.log(messages);
    }

    /**
     * Logs the status of all active and reserved nodes of the deployment, and returns whether every one of them
     * is on the target version with no pending restart or reboot.
     */
    private boolean nodesConverged(ApplicationId id, JobType type, Version target, DualLogger logger) {
        List<Node> nodes = controller.serviceRegistry().configServer().nodeRepository().list(type.zone(controller.system()), id, ImmutableSet.of(active, reserved));
        List<String> statuses = nodes.stream()
                .map(node -> String.format("%70s: %-16s%-25s%-32s%s",
                                           node.hostname(),
                                           node.serviceState(),
                                           node.wantedVersion() + (node.currentVersion().equals(node.wantedVersion()) ? "" : " <-- " + node.currentVersion()),
                                           node.restartGeneration() >= node.wantedRestartGeneration() ? ""
                                                   : "restart pending (" + node.wantedRestartGeneration() + " <-- " + node.restartGeneration() + ")",
                                           node.rebootGeneration() >= node.wantedRebootGeneration() ? ""
                                                   : "reboot pending (" + node.wantedRebootGeneration() + " <-- " + node.rebootGeneration() + ")"))
                .collect(Collectors.toList());
        logger.log(statuses);

        return nodes.stream().allMatch(node ->    node.currentVersion().equals(target)
                                               && node.restartGeneration() >= node.wantedRestartGeneration()
                                               && node.rebootGeneration() >= node.wantedRebootGeneration());
    }

    /**
     * Logs the services which are not yet on the wanted config generation, and returns whether the config server
     * reports the deployment as converged. Returns false when no convergence status is available.
     */
    private boolean servicesConverged(ApplicationId id, JobType type, Version platform, DualLogger logger) {
        var convergence = controller.serviceRegistry().configServer().serviceConvergence(new DeploymentId(id, type.zone(controller.system())),
                                                                                         Optional.of(platform));
        if (convergence.isEmpty()) {
            logger.log("Config status not currently available -- will retry.");
            return false;
        }
        logger.log("Wanted config generation is " + convergence.get().wantedGeneration());
        List<String> statuses = convergence.get().services().stream()
                .filter(serviceStatus -> serviceStatus.currentGeneration() != convergence.get().wantedGeneration())
                .map(serviceStatus -> String.format("%70s: %11s on port %4d has config generation %s",
                                                    serviceStatus.host().value(),
                                                    serviceStatus.type(),
                                                    serviceStatus.port(),
                                                    serviceStatus.currentGeneration() == -1 ? "not started!" : Long.toString(serviceStatus.currentGeneration())))
                .collect(Collectors.toList());
        logger.log(statuses);
        if (statuses.isEmpty())
            logger.log("All services on wanted config generation.");

        return convergence.get().converged();
    }

    /**
     * Starts the tests (or the staging setup, when {@code isSetup}) on the tester container, handing it the
     * endpoints of the deployment under test and of all production deployments of the instance.
     */
    private Optional<RunStatus> startTests(RunId id, boolean isSetup, DualLogger logger) {
        Optional<Deployment> deployment = deployment(id.application(), id.type());
        if (deployment.isEmpty()) {
            logger.log(INFO, "Deployment expired before tests could start.");
            return Optional.of(error);
        }

        var deployments = controller.applications().requireInstance(id.application())
                                    .productionDeployments().keySet().stream()
                                    .map(zone -> new DeploymentId(id.application(), zone))
                                    .collect(Collectors.toSet());
        deployments.add(new DeploymentId(id.application(), id.type().zone(controller.system())));

        logger.log("Attempting to find endpoints ...");
        var endpoints = controller.applications().clusterEndpoints(deployments);
        if ( ! endpoints.containsKey(id.type().zone(controller.system()))) {
            logger.log(WARNING, "Endpoints for the deployment to test vanished again, while it was still active!");
            return Optional.of(error);
        }
        logEndpoints(endpoints, logger);

        Optional<URI> testerEndpoint = controller.jobController().testerEndpoint(id);
        if (testerEndpoint.isEmpty()) {
            logger.log(WARNING, "Endpoints for the tester container vanished again, while it was still active!");
            return Optional.of(error);
        }

        if ( ! controller.jobController().cloud().testerReady(testerEndpoint.get())) {
            logger.log(WARNING, "Tester container went bad!");
            return Optional.of(error);
        }

        logger.log("Starting tests ...");
        controller.jobController().cloud().startTests(testerEndpoint.get(),
                                                      TesterCloud.Suite.of(id.type(), isSetup),
                                                      testConfigSerializer.configJson(id.application(),
                                                                                      id.type(),
                                                                                      true,
                                                                                      endpoints,
                                                                                      controller.applications().contentClustersByZone(deployments)));
        return Optional.of(running);
    }

    /**
     * Polls the tester for the status of the tests, after updating the stored test log.
     * Aborts if the deployment is gone or the tester certificate is no longer valid.
     */
    private Optional<RunStatus> endTests(RunId id, DualLogger logger) {
        if ( ! deployment(id.application(), id.type()).isPresent()) {
            logger.log(INFO, "Deployment expired before tests could complete.");
            return Optional.of(aborted);
        }

        Optional<X509Certificate> testerCertificate = controller.jobController().run(id).get().testerCertificate();
        if (testerCertificate.isPresent()) {
            try {
                testerCertificate.get().checkValidity(Date.from(controller.clock().instant()));
            }
            catch (CertificateExpiredException | CertificateNotYetValidException e) {
                logger.log(INFO, "Tester certificate expired before tests could complete.");
                return Optional.of(aborted);
            }
        }

        Optional<URI> testerEndpoint = controller.jobController().testerEndpoint(id);
        if ( ! testerEndpoint.isPresent()) {
            logger.log("Endpoints for tester not found -- trying again later.");
            return Optional.empty();
        }

        controller.jobController().updateTestLog(id);

        TesterCloud.Status testStatus = controller.jobController().cloud().getStatus(testerEndpoint.get());
        switch (testStatus) {
            case NOT_STARTED:
                throw new IllegalStateException("Tester reports tests not started, even though they should have!");
            case RUNNING:
                return Optional.empty();
            case FAILURE:
                logger.log("Tests failed.");
                return Optional.of(testFailure);
            case ERROR:
                logger.log(INFO, "Tester failed running its tests!");
                return Optional.of(error);
            case SUCCESS:
                logger.log("Tests completed successfully.");
                return Optional.of(running);
            default:
                throw new IllegalStateException("Unknown status '" + testStatus + "'!");
        }
    }

    /** Updates the stored Vespa log for the run, if the deployment still exists. */
    private Optional<RunStatus> copyVespaLogs(RunId id, DualLogger logger) {
        if (deployment(id.application(), id.type()).isPresent())
            try {
                controller.jobController().updateVespaLog(id);
            }
            catch (Exception e) {
                logger.log(INFO, "Failure getting vespa logs for " + id, e);
                return Optional.of(error);
            }
        return Optional.of(running);
    }

    /** Deactivates the real deployment in the zone of the job, retrying on failure. */
    private Optional<RunStatus> deactivateReal(RunId id, DualLogger logger) {
        try {
            return retrying(10, () -> {
                logger.log("Deactivating deployment of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
                controller.applications().deactivate(id.application(), id.type().zone(controller.system()));
                return running;
            });
        }
        catch (RuntimeException e) {
            logger.log(WARNING, "Failed deleting application " + id.application(), e);
            return Optional.of(error);
        }
    }

    /** Deactivates the tester deployment in the zone of the job, retrying on failure. */
    private Optional<RunStatus> deactivateTester(RunId id, DualLogger logger) {
        try {
            return retrying(10, () -> {
                logger.log("Deactivating tester of " + id.application() + " in " + id.type().zone(controller.system()) + " ...");
                controller.jobController().deactivateTester(id.tester(), id.type());
                return running;
            });
        }
        catch (RuntimeException e) {
            logger.log(WARNING, "Failed deleting tester of " + id.application(), e);
            return Optional.of(error);
        }
    }

    /**
     * Runs the given task until it succeeds, making at most {@code retries + 1} attempts.
     *
     * @return the result of the first successful attempt
     * @throws RuntimeException the exception of the first failed attempt, with later ones added as suppressed,
     *                          if all attempts fail
     */
    private static Optional<RunStatus> retrying(int retries, Supplier<RunStatus> task) {
        RuntimeException exception = null;
        do {
            try {
                return Optional.of(task.get());
            }
            catch (RuntimeException e) {
                if (exception == null)
                    exception = e;
                else
                    exception.addSuppressed(e);
            }
        } while (--retries >= 0);
        throw exception;
    }

    /** Sends a failure notification for the run, if it is active and has failed. */
    private Optional<RunStatus> report(RunId id, DualLogger logger) {
        try {
            controller.jobController().active(id).ifPresent(run -> {
                if (run.hasFailed())
                    sendNotification(run, logger);
            });
        }
        catch (IllegalStateException e) {
            logger.log(INFO, "Job '" + id.type() + "' no longer supposed to run?", e);
            return Optional.of(error);
        }
        return Optional.of(running);
    }

    /** Sends a mail with a notification of a failed run, if one should be sent. */
    private void sendNotification(Run run, DualLogger logger) {
        Application application = controller.applications().requireApplication(TenantAndApplicationId.from(run.id().application()));
        Notifications notifications = application.deploymentSpec().requireInstance(run.id().application().instance()).notifications();
        // The failure counts as caused by a new commit when the instance's current application change is the version under test.
        boolean newCommit = application.require(run.id().application().instance()).change().application()
                                       .map(run.versions().targetApplication()::equals)
                                       .orElse(false);
        When when = newCommit ? failingCommit : failing;

        List<String> recipients = new ArrayList<>(notifications.emailAddressesFor(when));
        if (notifications.emailRolesFor(when).contains(author))
            run.versions().targetApplication().authorEmail().ifPresent(recipients::add);

        if (recipients.isEmpty())
            return;

        try {
            if (run.status() == outOfCapacity && run.id().type().isProduction())
                controller.serviceRegistry().mailer().send(mails.outOfCapacity(run.id(), recipients));
            if (run.status() == deploymentFailed)
                controller.serviceRegistry().mailer().send(mails.deploymentFailure(run.id(), recipients));
            if (run.status() == installationFailed)
                controller.serviceRegistry().mailer().send(mails.installationFailure(run.id(), recipients));
            if (run.status() == testFailure)
                controller.serviceRegistry().mailer().send(mails.testFailure(run.id(), recipients));
            if (run.status() == error)
                controller.serviceRegistry().mailer().send(mails.systemError(run.id(), recipients));
        }
        catch (RuntimeException e) {
            // Failing to send mail must not fail the report step itself.
            logger.log(INFO, "Exception trying to send mail for " + run.id(), e);
        }
    }

    /** Returns the deployment of the real application in the zone of the given job, if it exists. */
    private Optional<Deployment> deployment(ApplicationId id, JobType type) {
        return Optional.ofNullable(application(id).deployments().get(type.zone(controller.system())));
    }

    /** Returns the real application with the given id. */
    private Instance application(ApplicationId id) {
        controller.applications().lockApplicationOrThrow(TenantAndApplicationId.from(id), __ -> { }); // Memory fence.
        return controller.applications().requireInstance(id);
    }

    /**
     * Returns whether the time since deployment is more than the zone deployment expiry, or the given timeout.
     *
     * We time out the job before the deployment expires, for zones where deployments are not persistent,
     * to be able to collect the Vespa log from the deployment. Thus, the lower of the zone's deployment expiry,
     * and the given default installation timeout, minus one minute, is used as a timeout threshold.
     */
    private boolean timedOut(RunId id, Deployment deployment, Duration defaultTimeout) {
        // TODO jonmv: This is a workaround for new deployment writes not yet being visible in spite of Curator locking.
        // TODO Investigate what's going on here, and remove this workaround.
        Run run = controller.jobController().run(id).get();
        if (run.start().isAfter(deployment.at()))
            return false;

        Duration timeout = controller.zoneRegistry().getDeploymentTimeToLive(deployment.zone())
                                     .filter(zoneTimeout -> zoneTimeout.compareTo(defaultTimeout) < 0)
                                     .orElse(defaultTimeout);
        return deployment.at().isBefore(controller.clock().instant().minus(timeout.minus(Duration.ofMinutes(1))));
    }

    /** Returns the application package for the tester application, assembled from a generated config, fat-jar and services.xml. */
    private ApplicationPackage testerPackage(RunId id) {
        ApplicationVersion version = controller.jobController().run(id).get().versions().targetApplication();
        DeploymentSpec spec = controller.applications().requireApplication(TenantAndApplicationId.from(id.application())).deploymentSpec();

        ZoneId zone = id.type().zone(controller.system());
        boolean useTesterCertificate = controller.system().isPublic() && id.type().environment().isTest();

        byte[] servicesXml = servicesXml(controller.zoneRegistry().accessControlDomain(),
                                         ! controller.system().isPublic(),
                                         useTesterCertificate,
                                         testerFlavorFor(id, spec)
                                                 .map(NodeResources::fromLegacyName)
                                                 .orElse(zone.region().value().contains("aws-") ?
                                                         DEFAULT_TESTER_RESOURCES_AWS : DEFAULT_TESTER_RESOURCES));
        byte[] testPackage = controller.applications().applicationStore().getTester(id.application().tenant(), id.application().application(), version);
        byte[] deploymentXml = deploymentXml(id.tester(),
                                             spec.athenzDomain(),
                                             spec.requireInstance(id.application().instance()).athenzService(zone.environment(), zone.region()));

        try (ZipBuilder zipBuilder = new ZipBuilder(testPackage.length + servicesXml.length + 1000)) {
            zipBuilder.add(testPackage);
            zipBuilder.add("services.xml", servicesXml);
            zipBuilder.add("deployment.xml", deploymentXml);
            if (useTesterCertificate)
                appendAndStoreCertificate(zipBuilder, id);

            // Closed explicitly before the bytes are read out; presumably this finalises the archive -- TODO confirm.
            zipBuilder.close();
            return new ApplicationPackage(zipBuilder.toByteArray());
        }
    }

    /**
     * Generates a key pair and a certificate valid for {@link #certificateTimeout} for the tester of the given run,
     * stores the certificate with the job controller, and adds the private key and certificate to the given zip.
     */
    private void appendAndStoreCertificate(ZipBuilder zipBuilder, RunId id) {
        KeyPair keyPair = KeyUtils.generateKeypair(KeyAlgorithm.RSA, 2048);
        X500Principal subject = new X500Principal("CN=" + id.tester().id().toFullString() + "." + id.type() + "." + id.number());
        X509Certificate certificate = X509CertificateBuilder.fromKeypair(keyPair,
                                                                         subject,
                                                                         controller.clock().instant(),
                                                                         controller.clock().instant().plus(certificateTimeout),
                                                                         SignatureAlgorithm.SHA512_WITH_RSA,
                                                                         BigInteger.valueOf(1))
                                                            .build();
        controller.jobController().storeTesterCertificate(id, certificate);
        zipBuilder.add("artifacts/key", KeyUtils.toPem(keyPair.getPrivate()).getBytes(UTF_8));
        zipBuilder.add("artifacts/cert", X509CertificateUtils.toPem(certificate).getBytes(UTF_8));
    }

    /** Returns the tester flavor of the first zone of the first deployment spec step concerning the job's environment, if any. */
    private static Optional<String> testerFlavorFor(RunId id, DeploymentSpec spec) {
        for (DeploymentSpec.Step step : spec.steps())
            if (step.concerns(id.type().environment()))
                return step.zones().get(0).testerFlavor();

        return Optional.empty();
    }

    /** Returns the generated services.xml content for the tester application. */
    static byte[] servicesXml(AthenzDomain domain, boolean systemUsesAthenz, boolean useTesterCertificate,
                              NodeResources resources) {
        int jdiscMemoryGb = 2; // 2Gb memory for tester application (excessive?).
        // NOTE(review): jdiscMemoryPct is not referenced by the text below as found in this copy; see the note further down.
        int jdiscMemoryPct = (int) Math.ceil(100 * jdiscMemoryGb / resources.memoryGb());

        // Of the remaining memory, split 50/50 between Surefire running the tests and the rest
        int testMemoryMb = (int) (1024 * (resources.memoryGb() - jdiscMemoryGb) / 2);

        // NOTE(review): the format string below, and the XML markup inside the literals that follow, appear to have
        // been stripped from this copy of the file (five format arguments, zero specifiers; element text without tags).
        // The literals are reproduced exactly as found -- restore them from version control before shipping.
        String resourceString = String.format(Locale.ENGLISH,
                                              "",
                                              resources.vcpu(), resources.memoryGb(), resources.diskGb(), resources.diskSpeed().name(), resources.storageType().name());

        AthenzDomain idDomain = ("vespa.vespa.cd".equals(domain.value()) ? AthenzDomain.from("vespa.vespa") : domain);
        String servicesXml =
                "\n" +
                "\n" +
                " \n" +
                "\n" +
                " \n" +
                " \n" +
                " artifacts\n" +
                " " + testMemoryMb + "\n" +
                " " + systemUsesAthenz + "\n" +
                " " + useTesterCertificate + "\n" +
                " \n" +
                " \n" +
                "\n" +
                " \n" +
                " http://*/tester/v1/*\n" +
                " \n" +
                "\n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " true\n" +
                " \n" +
                " /status.html\n" +
                " /state/v1/config\n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " /var/lib/sia/keys/" + idDomain.value() + ".tenant.key.pem\n" +
                " /var/lib/sia/certs/" + idDomain.value() + ".tenant.cert.pem\n" +
                " /opt/yahoo/share/ssl/certs/athenz_certificate_bundle.pem\n" +
                " want\n" +
                " \n" +
                " \n" +
                " \n" +
                (systemUsesAthenz ?
                " \n" + // Set up dummy access control to pass validation :/
                " \n" +
                " http://*/tester/v1/*\n" +
                " \n" +
                " \n"
                : "") +
                " \n" +
                " \n" +
                " \n" +
                " TOKEN_ONLY\n" +
                " Yahoo-Role-Auth\n" +
                " \n" +
                " \n" +
                " \n" +
                " " + domain.value() + ":tester-application\n" +
                " deploy\n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                " \n" +
                "\n" +
                " \n" +
                " " + resourceString + "\n" +
                " \n" +
                " \n" +
                "\n";

        return servicesXml.getBytes(UTF_8);
    }

    /** Returns a dummy deployment xml which sets up the service identity for the tester, if present. */
    private static byte[] deploymentXml(TesterId id, Optional<AthenzDomain> athenzDomain, Optional<AthenzService> athenzService) {
        // NOTE(review): the markup around the attributes below appears to have been stripped from this copy of the
        // file, which also left the expression syntactically broken and the 'id' parameter unused. Only the
        // 'athenzDomain.map(domain -> ...)' head has been restored, by analogy with the athenzService line;
        // the surrounding literals are kept as found and must be restored from version control.
        String deploymentSpec =
                "\n" +
                " " +
                athenzDomain.map(domain -> "athenz-domain=\"" + domain.value() + "\" ").orElse("") +
                athenzService.map(service -> "athenz-service=\"" + service.value() + "\" ").orElse("") + ">" +
                " " +
                "";
        return deploymentSpec.getBytes(UTF_8);
    }

    /** Logger which logs to a {@link JobController}, as well as to the parent class' {@link Logger}. */
    private class DualLogger {

        private final RunId id;
        private final Step step;

        private DualLogger(RunId id, Step step) {
            this.id = id;
            this.step = step;
        }

        /** Logs the given messages to the job controller only, at level INFO. */
        private void log(String... messages) {
            log(List.of(messages));
        }

        /** Logs the given messages to the job controller only, at level INFO. */
        private void log(List<String> messages) {
            controller.jobController().log(id, step, INFO, messages);
        }

        /** Logs the given message at the given level, to both the class logger and the job controller. */
        private void log(Level level, String message) {
            log(level, message, null);
        }

        // Print stack trace in our logs, but don't expose it to end users
        private void logWithInternalException(Level level, String message, Throwable thrown) {
            logger.log(level, id + " at " + step + ": " + message, thrown);
            controller.jobController().log(id, step, level, message);
        }

        /**
         * Logs the given message at the given level, to both the class logger and the job controller.
         * When a throwable is given, its stack trace is appended to the message sent to the job controller.
         */
        private void log(Level level, String message, Throwable thrown) {
            logger.log(level, id + " at " + step + ": " + message, thrown);

            if (thrown != null) {
                ByteArrayOutputStream traceBuffer = new ByteArrayOutputStream();
                thrown.printStackTrace(new PrintStream(traceBuffer));
                message += "\n" + traceBuffer;
            }
            controller.jobController().log(id, step, level, message);
        }

    }

}