summaryrefslogtreecommitdiffstats
path: root/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/deployment/JobController.java
blob: dfa72967aaeafe467d1975ed30b81097f2ebbcd0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.deployment;

import com.google.common.collect.ImmutableMap;
import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.io.IOUtils;
import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.LockedApplication;
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.integration.LogEntry;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.NotFoundException;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.RunId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.SourceRevision;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterCloud;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.TesterId;
import com.yahoo.vespa.hosted.controller.application.ApplicationPackage;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.DeploymentJobs;
import com.yahoo.vespa.hosted.controller.application.JobStatus;
import com.yahoo.vespa.hosted.controller.persistence.BufferedLogStore;
import com.yahoo.vespa.hosted.controller.persistence.CuratorDb;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.net.URI;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.UnaryOperator;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static com.google.common.collect.ImmutableList.copyOf;
import static com.yahoo.vespa.hosted.controller.deployment.RunStatus.error;
import static com.yahoo.vespa.hosted.controller.deployment.Step.copyVespaLogs;
import static com.yahoo.vespa.hosted.controller.deployment.Step.deactivateTester;
import static com.yahoo.vespa.hosted.controller.deployment.Step.endTests;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.logging.Level.INFO;
import static java.util.stream.Collectors.toList;

/**
 * A singleton owned by the controller, which contains the state and methods for controlling deployment jobs.
 *
 * Keys are the {@link ApplicationId} of the real application, for which the deployment job is run, the
 * {@link JobType} to run, and the strictly increasing run number of this combination.
 * The deployment jobs run tests using regular applications, but these tester application IDs are not to be used elsewhere.
 *
 * Jobs consist of sets of {@link Step}s, defined in {@link JobProfile}s.
 * Each run is represented by a {@link Run}, which holds the status of each step of the run, as well as
 * some other meta data.
 *
 * @author jonmv
 */
public class JobController {

    /** Runs whose number is this much lower than, or more, the number of a finishing run are removed from history. */
    private static final int historyLength = 256;
    /** Runs which started longer ago than this are removed from history when a later run finishes. */
    private static final Duration maxHistoryAge = Duration.ofDays(60);

    private final Controller controller;
    private final CuratorDb curator;
    private final BufferedLogStore logs;
    private final TesterCloud cloud;
    private final Badges badges;

    /** Callback given the last run after a direct deployment is started, or a run is aborted; does nothing until set. */
    private final AtomicReference<Consumer<Run>> runner = new AtomicReference<>(__ -> { });

    /**
     * Creates a job controller which uses the curator, service registry and zone registry of the given controller.
     *
     * @param controller the controller which owns this
     */
    public JobController(Controller controller) {
        CuratorDb db = controller.curator();
        this.controller = controller;
        this.curator = db;
        this.logs = new BufferedLogStore(db, controller.serviceRegistry().runDataStore());
        this.cloud = controller.serviceRegistry().testerCloud();
        this.badges = new Badges(controller.zoneRegistry().badgeUrl());
    }

    /** Returns the tester cloud this job controller was set up with. */
    public TesterCloud cloud() {
        return cloud;
    }

    /** Returns the history length limit used when pruning historic runs. */
    public int historyLength() {
        return historyLength;
    }

    /** Replaces the callback which is given runs that are started by direct deployment, or aborted. */
    public void setRunner(Consumer<Run> runner) {
        this.runner.set(runner);
    }

    /**
     * Rewrites all job data with the newest format.
     *
     * For each job of each application, the historic runs are read and written back unchanged by the
     * locking helper, and the last run, if any, is read and written back explicitly.
     */
    public void updateStorage() {
        applications().forEach(id ->
            jobs(id).forEach(type ->
                // The historic runs are left untouched here, and are written back as they were read.
                locked(id, type, history -> curator.readLastRun(id, type).ifPresent(curator::writeLastRun))));
    }

    /**
     * Returns all entries currently logged for the given run.
     *
     * This is the same as asking for the entries after the id threshold {@code -1}.
     */
    public Optional<RunLog> details(RunId id) {
        long threshold = -1;
        return details(id, threshold);
    }

    /**
     * Returns the logged entries for the given run, which are after the given id threshold,
     * or empty if no run with the given id is known.
     *
     * The log of an active run is read from the active log; otherwise, the finished log is read.
     * The job lock is held while reading.
     */
    public Optional<RunLog> details(RunId id, long after) {
        try (Lock lock = curator.lock(id.application(), id.type())) {
            boolean known = runs(id.application(), id.type()).containsKey(id);
            if ( ! known)
                return Optional.empty();

            if (active(id).isPresent())
                return Optional.of(logs.readActive(id.application(), id.type(), after));

            return logs.readFinished(id, after);
        }
    }

    /**
     * Stores the given log entries for the given run and step.
     *
     * Entries are only stored while the run is still active, and the run itself is written back unchanged.
     */
    public void log(RunId id, Step step, List<LogEntry> entries) {
        locked(id, run -> {
            logs.append(id.application(), id.type(), step, entries);
            return run; // Only the log changes; the run is stored as it was.
        });
    }

    /**
     * Stores the given log messages for the given run and step.
     *
     * Each message becomes a log entry with id 0, the current controller time, and the type matching the given level.
     */
    public void log(RunId id, Step step, Level level, List<String> messages) {
        List<LogEntry> entries = new ArrayList<>(messages.size());
        for (String message : messages)
            entries.add(new LogEntry(0, controller.clock().millis(), LogEntry.typeOf(level), message));

        log(id, step, entries);
    }

    /** Stores the given, single log message for the given run and step, at the given level. */
    public void log(RunId id, Step step, Level level, String message) {
        List<String> messages = Collections.singletonList(message);
        log(id, step, level, messages);
    }

    /**
     * Fetches any new Vespa log entries, and records the timestamp of the last of these, for continuation.
     *
     * Logs are fetched from the later of the last recorded log timestamp and the time of the deployment.
     * Nothing is done if the run is not active, if there is no deployment in the zone of the job,
     * or if no new entries are found.
     */
    public void updateVespaLog(RunId id) {
        locked(id, run -> {
            ZoneId zone = id.type().zone(controller.system());
            Deployment deployment = controller.applications().require(id.application())
                                              .deployments().get(zone);
            if (deployment == null)
                return run;

            long lastLogged = run.lastVespaLogTimestamp().toEpochMilli();
            long deployedAt = deployment.at().toEpochMilli();
            long from = Math.max(lastLogged, deployedAt);
            List<LogEntry> log = LogEntry.parseVespaLog(controller.serviceRegistry().configServer()
                                                                  .getLogs(new DeploymentId(id.application(), zone),
                                                                           Map.of("from", Long.toString(from))));
            if (log.isEmpty())
                return run;

            logs.append(id.application(), id.type(), copyVespaLogs, log);
            LogEntry newest = log.get(log.size() - 1);
            return run.with(Instant.ofEpochMilli(newest.at()));
        });
    }

    /**
     * Fetches any new test log entries, and records the id of the last of these, for continuation.
     *
     * Nothing is done unless the run is active and ready to end its tests, a tester endpoint is found,
     * and the tester has entries after the last one already recorded.
     */
    public void updateTestLog(RunId id) {
        locked(id, run -> {
            boolean testsRunning = run.readySteps().contains(endTests);
            if ( ! testsRunning)
                return run;

            Optional<URI> testerEndpoint = testerEndpoint(id);
            if (testerEndpoint.isEmpty())
                return run;

            List<LogEntry> entries = cloud.getLog(testerEndpoint.get(), run.lastTestLogEntry());
            if (entries.isEmpty())
                return run;

            logs.append(id.application(), id.type(), endTests, entries);
            long lastEntryId = entries.stream().mapToLong(LogEntry::id).max().getAsLong();
            return run.with(lastEntryId);
        });
    }

    /**
     * Stores the given certificate as the tester certificate for this run, or throws if it's already set.
     *
     * The certificate is only stored if the run is still active.
     */
    public void storeTesterCertificate(RunId id, X509Certificate testerCertificate) {
        locked(id, run -> {
            return run.with(testerCertificate);
        });
    }

    /** Returns an immutable list of the ids of all applications which are registered as deployed internally. */
    public List<ApplicationId> applications() {
        List<ApplicationId> registered = new ArrayList<>();
        for (Application application : controller.applications().asList())
            if (application.deploymentJobs().deployedInternally())
                registered.add(application.id());

        return copyOf(registered);
    }

    /** Returns an immutable list of all job types which have a last run stored for the given application. */
    public List<JobType> jobs(ApplicationId id) {
        List<JobType> withRuns = new ArrayList<>();
        for (JobType type : JobType.values())
            if (last(id, type).isPresent())
                withRuns.add(type);

        return copyOf(withRuns);
    }

    /**
     * Returns an immutable map of all known runs for the given application and job type.
     *
     * This is the historic runs, with the last run, if any, added or replacing the entry with the same id.
     */
    public Map<RunId, Run> runs(ApplicationId id, JobType type) {
        SortedMap<RunId, Run> known = curator.readHistoricRuns(id, type);
        Optional<Run> lastRun = last(id, type);
        if (lastRun.isPresent())
            known.put(lastRun.get().id(), lastRun.get());

        return ImmutableMap.copyOf(known);
    }

    /** Returns the run with the given id, among all known runs of its application and job type, if it exists. */
    public Optional<Run> run(RunId id) {
        for (Run candidate : runs(id.application(), id.type()).values())
            if (candidate.id().equals(id))
                return Optional.of(candidate);

        return Optional.empty();
    }

    /**
     * Returns the last run of the given type, for the given application, if one has been run.
     *
     * This is read directly from the curator on each call.
     */
    public Optional<Run> last(ApplicationId id, JobType type) {
        Optional<Run> lastRun = curator.readLastRun(id, type);
        return lastRun;
    }

    /**
     * Returns the run with the given id, provided it is still active.
     *
     * A run is active when it is the last run of its job, and has not yet ended.
     */
    public Optional<Run> active(RunId id) {
        Optional<Run> lastRun = last(id.application(), id.type());
        return lastRun.filter(run -> ! run.hasEnded() && run.id().equals(id));
    }

    /** Returns an immutable list of the last run of each job of each registered application, which has not ended. */
    public List<Run> active() {
        List<Run> unfinished = new ArrayList<>();
        for (ApplicationId id : applications())
            for (JobType type : JobType.values())
                last(id, type).filter(run -> ! run.hasEnded())
                              .ifPresent(unfinished::add);

        return copyOf(unfinished);
    }

    /**
     * Changes the status of the given step, for the given run, provided it is still active.
     *
     * The modified run is stored as the last run of its job.
     */
    public void update(RunId id, RunStatus status, LockedStep step) {
        locked(id, run -> {
            return run.with(status, step);
        });
    }

    /**
     * Changes the status of the given run to inactive, and stores it as a historic run.
     *
     * Historic runs which are too old, either by run number or by start time, are removed from
     * the history together with their logs, before the logs of the finished run are flushed.
     * The run which is being finished is never removed. Nothing is done if the run is not active.
     */
    public void finish(RunId id) {
        locked(id, run -> { // Store the modified run after it has been written to history, in case the latter fails.
            Run finishedRun = run.finished(controller.clock().instant());
            locked(id.application(), id.type(), runs -> {
                runs.put(run.id(), finishedRun);
                long last = id.number();
                Instant oldestKept = controller.clock().instant().minus(maxHistoryAge);
                var oldEntries = runs.entrySet().iterator();
                // Guard with hasNext(): an unguarded next() throws when every entry qualifies for removal.
                while (oldEntries.hasNext()) {
                    var old = oldEntries.next();
                    boolean expired =    old.getKey().number() <= last - historyLength
                                      || old.getValue().start().isBefore(oldestKept);
                    // Stop at the first run to keep, and never prune the run which was just finished.
                    if ( ! expired || old.getKey().equals(run.id()))
                        break;

                    logs.delete(old.getKey());
                    oldEntries.remove();
                }
            });
            logs.flush(id);
            return finishedRun;
        });
    }

    /**
     * Marks the given run as aborted; no further normal steps will run, but run-always steps will try to succeed.
     *
     * Nothing is done if the run is no longer active.
     */
    public void abort(RunId id) {
        locked(id, run -> {
            return run.aborted();
        });
    }

    /**
     * Accepts and stores a new application package and test jar pair under a generated application version key.
     *
     * The application is registered first, if it is not already deployed internally. Both packages are then
     * stored, old packages are pruned, the application is stored with updated config, and the deployment
     * trigger is notified of the submission.
     *
     * @return the application version generated for the submission
     */
    public ApplicationVersion submit(ApplicationId id, SourceRevision revision, String authorEmail, long projectId,
                                     ApplicationPackage applicationPackage, byte[] testPackageBytes) {
        AtomicReference<ApplicationVersion> version = new AtomicReference<>();
        controller.applications().lockOrThrow(id, application -> {
            boolean alreadyRegistered = application.get().deploymentJobs().deployedInternally();
            if ( ! alreadyRegistered)
                application = registered(application);

            long build = nextBuild(id);
            // Compile version and build time are only included when the package provides both.
            boolean hasBuildMetaData =    applicationPackage.compileVersion().isPresent()
                                       && applicationPackage.buildTime().isPresent();
            ApplicationVersion submitted = hasBuildMetaData
                                           ? ApplicationVersion.from(revision, build, authorEmail,
                                                                     applicationPackage.compileVersion().get(),
                                                                     applicationPackage.buildTime().get())
                                           : ApplicationVersion.from(revision, build, authorEmail);
            version.set(submitted);

            controller.applications().applicationStore().put(id, submitted, applicationPackage.zippedContent());
            controller.applications().applicationStore().put(TesterId.of(id), submitted, testPackageBytes);

            prunePackages(id);
            controller.applications().storeWithUpdatedConfig(application, applicationPackage);

            controller.applications().deploymentTrigger().notifyOfCompletion(DeploymentJobs.JobReport.ofSubmission(id, projectId, submitted));
        });
        return version.get();
    }

    /**
     * Registers the given application, copying necessary application packages, and returns the modified version.
     *
     * The package of each distinct application version in production is copied from the artifact
     * repository to the application store. The returned application has no platform or application
     * change, and is marked as built internally.
     */
    private LockedApplication registered(LockedApplication application) {
        // TODO jvenstad: Remove when there are no more SDv3 pipelines.
        // Copy all current packages to the new application store
        var deployedVersions = application.get().productionDeployments().values().stream()
                                          .map(Deployment::applicationVersion)
                                          .distinct()
                                          .collect(toList());
        for (var appVersion : deployedVersions) {
            byte[] content = controller.applications().artifacts().getApplicationPackage(application.get().id(), appVersion.id());
            controller.applications().applicationStore().put(application.get().id(), appVersion, content);
        }

        // Make sure any ongoing upgrade is cancelled, since future jobs will require the tester artifact.
        var withoutChange = application.get().change().withoutPlatform().withoutApplication();
        return application.withChange(withoutChange)
                          .withBuiltInternally(true);
    }

    /**
     * Orders a run of the given type, or throws an IllegalStateException if that job type is already running.
     *
     * The new run gets the number following that of the last run, or 1 if there is no last run.
     * Nothing is done if the application does not exist.
     *
     * @throws IllegalArgumentException if the target application version is unknown for an environment which
     *                                  is not manually deployed, or if the application is not built internally
     * @throws IllegalStateException if the job is already running
     */
    public void start(ApplicationId id, JobType type, Versions versions) {
        boolean requiresKnownApplication = ! type.environment().isManuallyDeployed();
        if (requiresKnownApplication && versions.targetApplication().isUnknown())
            throw new IllegalArgumentException("Target application must be a valid reference.");

        controller.applications().lockIfPresent(id, application -> {
            if ( ! application.get().deploymentJobs().deployedInternally())
                throw new IllegalArgumentException(id + " is not built here!");

            locked(id, type, history -> {
                Optional<Run> last = last(id, type);
                boolean running = last.isPresent() && active(last.get().id()).isPresent();
                if (running)
                    throw new IllegalStateException("Can not start " + type + " for " + id + "; it is already running!");

                long number = last.isPresent() ? last.get().id().number() + 1 : 1;
                Run initial = Run.initial(new RunId(id, type, number), versions, controller.clock().instant());
                curator.writeLastRun(initial);
            });
        });
    }

    /**
     * Stores the given package and starts a deployment of it, after aborting any such ongoing deployment.
     *
     * The job type is validated before anything else, so a rejected request leaves no trace.
     * The application is then registered, if it is not already deployed internally, any unfinished
     * run of the job is aborted and waited for, and finally the package is stored, a new run is started
     * on the given platform, or the system version if none is given, and the runner is given that run.
     *
     * @throws IllegalArgumentException if the environment of the given job type is not manually deployed
     */
    public void deploy(ApplicationId id, JobType type, Optional<Version> platform, ApplicationPackage applicationPackage) {
        // Validate first: registering the application below modifies and stores it.
        if ( ! type.environment().isManuallyDeployed())
            throw new IllegalArgumentException("Direct deployments are only allowed to manually deployed environments.");

        controller.applications().lockOrThrow(id, application -> {
            if ( ! application.get().deploymentJobs().deployedInternally())
                controller.applications().store(registered(application));
        });

        last(id, type).filter(run -> ! run.hasEnded()).ifPresent(run -> abortAndWait(run.id()));
        locked(id, type, __ -> {
            controller.applications().applicationStore().putDev(id, type.zone(controller.system()), applicationPackage.zippedContent());
            start(id, type, new Versions(platform.orElse(controller.systemVersion()),
                                         ApplicationVersion.unknown,
                                         Optional.empty(),
                                         Optional.empty()));

            runner.get().accept(last(id, type).get());
        });
    }

    /**
     * Aborts a run and waits for it to complete.
     *
     * The runner is given the last run of the job after it is aborted, and the last run is then polled
     * every 100 ms until it has ended. If interrupted while waiting, the interrupt flag is restored
     * and a RuntimeException wrapping the interruption is thrown.
     */
    private void abortAndWait(RunId id) {
        abort(id);
        runner.get().accept(last(id.application(), id.type()).get());

        for (;;) {
            Run current = last(id.application(), id.type()).get();
            if (current.hasEnded())
                return;

            try {
                Thread.sleep(100);
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Unregisters the given application and makes all associated data eligible for garbage collection.
     *
     * The application is stored as no longer built internally, and the last run of each of its jobs is aborted.
     * Nothing is done if the application does not exist.
     */
    public void unregister(ApplicationId id) {
        controller.applications().lockIfPresent(id, application -> {
            controller.applications().store(application.withBuiltInternally(false));
            for (JobType type : jobs(id)) {
                Optional<Run> lastRun = last(id, type);
                if (lastRun.isPresent())
                    abort(lastRun.get().id());
            }
        });
    }

    /**
     * Deletes run data and tester deployments for applications which are unknown, or no longer built internally.
     *
     * For each job of each such application, the tester is deactivated, and run data and logs are deleted,
     * while holding both the lock of the tester deactivation step and the job lock. The remaining run data
     * of the application is deleted only if this succeeded for all of its jobs.
     */
    public void collectGarbage() {
        Set<ApplicationId> applicationsToBuild = new HashSet<>(applications());
        List<ApplicationId> garbage = curator.applicationsWithJobs().stream()
                                             .filter(id -> ! applicationsToBuild.contains(id))
                                             .collect(toList());
        for (ApplicationId id : garbage) {
            try {
                TesterId tester = TesterId.of(id);
                for (JobType type : jobs(id))
                    locked(id, type, deactivateTester, step -> {
                        try (Lock jobLock = curator.lock(id, type)) {
                            deactivateTester(tester, type);
                            curator.deleteRunData(id, type);
                            logs.delete(id);
                        }
                    });
            }
            catch (TimeoutException e) {
                continue; // Don't remove the data if we couldn't clean up all resources.
            }
            curator.deleteRunData(id);
        }
    }

    /**
     * Deactivates the deployment of the given tester in the zone of the given job type.
     *
     * A tester deployment which the config server reports as not found is silently accepted.
     */
    public void deactivateTester(TesterId id, JobType type) {
        try {
            ZoneId zone = type.zone(controller.system());
            DeploymentId testerDeployment = new DeploymentId(id.id(), zone);
            controller.serviceRegistry().configServer().deactivate(testerDeployment);
        }
        catch (NotFoundException ignored) {
            // Already gone -- great!
        }
    }

    /**
     * Returns a URI which points at a badge showing historic status of given length for the given job type for the given application.
     *
     * The badge is given the last known run, or the run before it when the last one has not ended and
     * there is more than one run, together with up to {@code historyLength} of the most recent runs.
     */
    public URI historicBadge(ApplicationId id, JobType type, int historyLength) {
        List<Run> runs = new ArrayList<>(runs(id, type).values());
        int count = runs.size();
        Run lastCompleted = null;
        if (count > 0) {
            lastCompleted = runs.get(count - 1);
            if (count > 1 && ! lastCompleted.hasEnded())
                lastCompleted = runs.get(count - 2);
        }

        int firstShown = Math.max(0, count - historyLength);
        List<Run> shown = runs.subList(firstShown, count);
        return badges.historic(id, Optional.ofNullable(lastCompleted), shown);
    }

    /**
     * Returns a URI which points at a badge showing current status for all jobs for the given application.
     *
     * The jobs are those of the deployment spec of the application, and only jobs with a last run are included.
     */
    public URI overviewBadge(ApplicationId id) {
        DeploymentSteps steps = new DeploymentSteps(controller.applications().require(id).deploymentSpec(), controller::system);
        List<Run> lastRuns = steps.jobs().stream()
                                  .map(type -> last(id, type))
                                  .filter(Optional::isPresent)
                                  .map(Optional::get)
                                  .collect(toList());
        return badges.overview(id, lastRuns);
    }

    /**
     * Returns a URI of the tester endpoint retrieved from the routing generator, provided it matches an expected form.
     *
     * Any deployment endpoint of the tester is preferred; failing that, the endpoint of any routing policy
     * of the tester is used. In public systems the endpoint is rewritten to use its host with port 443.
     */
    Optional<URI> testerEndpoint(RunId id) {
        DeploymentId testerId = new DeploymentId(id.tester().id(), id.type().zone(controller.system()));
        Optional<URI> endpoint = controller.applications().getDeploymentEndpoints(testerId)
                                           .stream().findAny();
        if (endpoint.isEmpty())
            endpoint = controller.applications().routingPolicies().get(testerId).stream()
                                 .findAny()
                                 .map(policy -> policy.endpointIn(controller.system()).url());

        // TODO jvenstad: Remove ugly thing when public deployments have a valid web certificate.
        return endpoint.map(uri -> controller.system().isPublic() ? URI.create("http://" + uri.getHost() + ":443/") : uri);
    }

    /** Returns a set containing the zone of the deployment tested in the given run, and all production zones for the application. */
    public Set<ZoneId> testedZoneAndProductionZones(ApplicationId id, JobType type) {
        Set<ZoneId> zones = new HashSet<>();
        zones.add(type.zone(controller.system()));
        zones.addAll(controller.applications().require(id).productionDeployments().keySet());
        return zones;
    }

    // TODO jvenstad: Find a more appropriate way of doing this, at least when this is the only build service.
    /** Returns one more than the id of the last completed run of the component job, or 1 if there is none. */
    private long nextBuild(ApplicationId id) {
        long lastBuild = controller.applications().require(id).deploymentJobs()
                                   .statusOf(JobType.component)
                                   .flatMap(status -> status.lastCompleted())
                                   .map(completed -> completed.id())
                                   .orElse(0L);
        return lastBuild + 1;
    }

    /**
     * Prunes the stored application and tester packages of the given application, relative to the
     * application version with the lowest build number among its production deployments.
     *
     * Nothing is done if the application does not exist, or has no production deployments.
     */
    private void prunePackages(ApplicationId id) {
        controller.applications().lockIfPresent(id, application -> {
            var oldestDeployed = application.get().productionDeployments().values().stream()
                                            .map(Deployment::applicationVersion)
                                            .min(Comparator.comparingLong(version -> version.buildNumber().getAsLong()));
            if (oldestDeployed.isPresent()) {
                controller.applications().applicationStore().prune(id, oldestDeployed.get());
                controller.applications().applicationStore().prune(TesterId.of(id), oldestDeployed.get());
            }
        });
    }

    /**
     * Locks all runs and modifies the list of historic runs for the given application and job type.
     *
     * The historic runs are read under the job lock, passed to the given modifications, and then written
     * back, whether or not they were changed.
     */
    private void locked(ApplicationId id, JobType type, Consumer<SortedMap<RunId, Run>> modifications) {
        try (Lock lock = curator.lock(id, type)) {
            SortedMap<RunId, Run> history = curator.readHistoricRuns(id, type);
            modifications.accept(history);
            curator.writeHistoricRuns(id, type, history.values());
        }
    }

    /**
     * Locks and modifies the run with the given id, provided it is still active.
     *
     * The result of the given modifications is written as the last run of the job, under the job lock.
     * The modifications are not applied, and nothing is written, if the run is not active.
     */
    public void locked(RunId id, UnaryOperator<Run> modifications) {
        try (Lock lock = curator.lock(id.application(), id.type())) {
            Optional<Run> active = active(id);
            if (active.isPresent())
                curator.writeLastRun(modifications.apply(active.get()));
        }
    }

    /**
     * Locks the given step and checks none of its prerequisites are running, then performs the given actions.
     *
     * The lock of each prerequisite is taken and released right away, while the lock of the given step is
     * held; the action is then given the held step lock.
     *
     * @throws TimeoutException presumably when a step lock can not be acquired in time; verify against CuratorDb
     */
    public void locked(ApplicationId id, JobType type, Step step, Consumer<LockedStep> action) throws TimeoutException {
        try (Lock stepLock = curator.lock(id, type, step)) {
            // Check that no prerequisite is still running, by acquiring and releasing each of their locks.
            for (Step prerequisite : step.prerequisites()) {
                try (Lock ignored = curator.lock(id, type, prerequisite)) {
                    // Nothing to do while holding the lock.
                }
            }

            action.accept(new LockedStep(stepLock, step));
        }
    }

}