summary refs log tree commit diff stats
path: root/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/OsUpgradeScheduler.java
blob: 9dd90163683d1634f977c184a555a1753d93a735 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;

import com.yahoo.component.Version;
import com.yahoo.config.provision.CloudName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ArtifactRepository;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.OsRelease;
import com.yahoo.vespa.hosted.controller.versions.OsVersion;
import com.yahoo.vespa.hosted.controller.versions.OsVersionTarget;
import com.yahoo.yolean.Exceptions;

import java.time.DayOfWeek;
import java.time.Duration;
import java.time.Instant;
import java.time.LocalDate;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.Objects;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Automatically schedule upgrades to the next OS version.
 *
 * @author mpolden
 */
public class OsUpgradeScheduler extends ControllerMaintainer {

    private static final Logger LOG = Logger.getLogger(OsUpgradeScheduler.class.getName());

    /**
     * Creates a scheduler for the given controller.
     *
     * @param controller the controller, handed on to the {@code ControllerMaintainer} constructor
     * @param interval   the maintenance interval, handed on to the {@code ControllerMaintainer} constructor
     */
    public OsUpgradeScheduler(Controller controller, Duration interval) {
        super(controller, interval);
    }

    /**
     * Triggers an OS upgrade in each cloud that has a wanted change which is due at the current time.
     * An upgrade rejected with {@link IllegalArgumentException} is logged and counted as a failure.
     *
     * @return the result of {@code asSuccessFactorDeviation} for the number of attempted and failed upgrades
     */
    @Override
    protected double maintain() {
        Instant now = controller().clock().instant();
        int attempted = 0;
        int failed = 0;
        for (var cloud : controller().clouds()) {
            // Keep only a change that exists and whose earliest scheduling time has been reached
            Optional<Change> due = changeIn(cloud, now).filter(change -> change.scheduleAt(now));
            if (due.isEmpty()) continue;
            attempted++;
            try {
                controller().os().upgradeTo(due.get().osVersion().version(), cloud, false, false);
            } catch (IllegalArgumentException e) {
                failed++;
                String reason = Exceptions.toMessageString(e);
                LOG.log(Level.WARNING, "Failed to schedule OS upgrade: " + reason + ". Retrying in " + interval());
            }
        }
        return asSuccessFactorDeviation(attempted, failed);
    }

    /**
     * Returns the wanted change for cloud at given instant, if any.
     *
     * No change is returned when the cloud has no OS version target, while the cloud is in the middle of a major
     * version upgrade, or when the candidate change is not certified.
     */
    public Optional<Change> changeIn(CloudName cloud, Instant instant) {
        Optional<OsVersionTarget> target = controller().os().target(cloud);
        // Without a target there is nothing to upgrade from, and further upgrades are skipped until a major
        // version upgrade is complete
        if (target.isEmpty() || upgradingToNewMajor(cloud)) return Optional.empty();

        return releaseIn(cloud).change(target.get().version(), instant)
                               .filter(this::certified);
    }

    /** Returns whether the OS version of given change is certified, logging a warning when it is not */
    private boolean certified(Change change) {
        if (controller().os().certified(change.osVersion())) return true;

        LOG.log(Level.WARNING, "Want to schedule " + change + ", but this change is not certified for " +
                               "the current system version");
        return false;
    }

    /** Returns whether the known, non-empty OS versions in cloud span more than one major version */
    private boolean upgradingToNewMajor(CloudName cloud) {
        long majorVersions = controller().os().status().versionsIn(cloud).stream()
                                         .filter(version -> !version.isEmpty()) // Ignore empty/unknown versions
                                         .map(Version::getMajor)
                                         .distinct()
                                         .count();
        return majorVersions >= 2;
    }

    /**
     * Returns the release scheme to use for cloud: calendar-versioned when the cloud has at least one
     * dynamically provisioned zone, otherwise tagged.
     */
    private Release releaseIn(CloudName cloud) {
        boolean hasDynamicallyProvisionedZones = !controller().zoneRegistry().zones().all()
                                                              .dynamicallyProvisioned().in(cloud)
                                                              .zones().isEmpty();
        if (hasDynamicallyProvisionedZones) {
            return new CalendarVersionedRelease(controller().system(), cloud);
        }
        return new TaggedRelease(controller().system(), cloud, controller().serviceRegistry().artifactRepository());
    }

    /**
     * Returns whether an upgrade may be triggered at given instant, evaluated in UTC.
     *
     * @param instant the instant to check
     * @param isCd    whether the earlier start hour of CD systems applies
     */
    private static boolean canTriggerAt(Instant instant, boolean isCd) {
        ZonedDateTime utc = instant.atZone(ZoneOffset.UTC);
        // Only Monday-Thursday qualify
        if (utc.getDayOfWeek().getValue() >= DayOfWeek.FRIDAY.getValue()) return false;

        // The window opens at 07:00 (02:00 in CD systems) and closes after 12:59 UTC
        int firstHour = isCd ? 2 : 7;
        int hour = utc.getHour();
        return firstHour <= hour && hour < 13;
    }

    /**
     * Returns the earliest time, at or after instant, an upgrade can be scheduled.
     *
     * The given instant is returned unchanged when it already qualifies. Otherwise the search advances to the
     * start of each following hour until one qualifies.
     */
    private static Instant schedulingInstant(Instant instant, SystemName system) {
        boolean cd = system.isCd();
        Instant candidate = instant;
        while (!canTriggerAt(candidate, cd)) {
            Instant startOfHour = candidate.truncatedTo(ChronoUnit.HOURS);
            candidate = startOfHour.plus(ChronoUnit.HOURS.getDuration());
        }
        return candidate;
    }

    /**
     * Returns the remaining cool-down period relative to releaseAge.
     *
     * @param cooldown   the full cool-down period
     * @param releaseAge the age of the release; may be negative
     * @return cooldown minus releaseAge, or {@link Duration#ZERO} when releaseAge is at least cooldown
     */
    private static Duration remainingCooldownOf(Duration cooldown, Duration releaseAge) {
        if (releaseAge.compareTo(cooldown) >= 0) return Duration.ZERO; // Cool-down has fully elapsed
        return cooldown.minus(releaseAge);
    }

    /** A release scheme which decides the next OS version to change to, and when that change can be scheduled */
    private interface Release {

        /**
         * The pending change for this release at given instant, if any
         *
         * @param currentVersion the version the change is computed relative to
         * @param instant        the instant the change is computed at
         * @return the pending change, or empty if this has none
         */
        Optional<Change> change(Version currentVersion, Instant instant);

    }

    /** An OS version to change to, together with the earliest instant that change can be scheduled */
    public record Change(OsVersion osVersion, Instant scheduleAt) {

        /** Rejects null components */
        public Change {
            Objects.requireNonNull(osVersion);
            Objects.requireNonNull(scheduleAt);
        }

        /** Returns whether given instant is at or after the earliest scheduling time of this */
        public boolean scheduleAt(Instant instant) {
            return instant.compareTo(scheduleAt) >= 0;
        }

    }

    /** OS release looked up in the artifact repository by the tag this system tracks */
    private record TaggedRelease(SystemName system, CloudName cloud, ArtifactRepository artifactRepository) implements Release {

        /** Rejects null components */
        public TaggedRelease {
            Objects.requireNonNull(system);
            Objects.requireNonNull(cloud);
            Objects.requireNonNull(artifactRepository);
        }

        /**
         * Returns a change to the tagged release within the major version of currentVersion, or empty if that
         * release is not newer than currentVersion.
         */
        @Override
        public Optional<Change> change(Version currentVersion, Instant instant) {
            OsRelease candidate = artifactRepository.osRelease(currentVersion.getMajor(), tag());
            if (candidate.version().isAfter(currentVersion)) {
                // Wait out whatever is left of the cool-down, then find the first allowed scheduling time
                Duration remainingCooldown = remainingCooldownOf(cooldown(), candidate.age(instant));
                Instant earliest = schedulingInstant(instant.plus(remainingCooldown), system);
                OsVersion target = new OsVersion(candidate.version(), cloud);
                return Optional.of(new Change(target, earliest));
            }
            return Optional.empty();
        }

        /** Returns the release tag tracked by this system */
        private OsRelease.Tag tag() {
            if (system.isCd()) {
                return OsRelease.Tag.latest;
            }
            return OsRelease.Tag.stable;
        }

        /** The cool-down period that must pass before a release can be used */
        private Duration cooldown() {
            if (system.isCd()) {
                return Duration.ofDays(1);
            }
            return Duration.ZERO;
        }

    }

    /**
     * OS release based on calendar-versioning.
     *
     * Versions keep the major, minor and micro components of the current version and carry the release date,
     * formatted as {@code yyyyMMdd}, as qualifier.
     */
    record CalendarVersionedRelease(SystemName system, CloudName cloud) implements Release {

        /** A fixed point in time which the release schedule is calculated from */
        private static final Instant START_OF_SCHEDULE = LocalDate.of(2022, 1, 1)
                                                                  .atStartOfDay()
                                                                  .toInstant(ZoneOffset.UTC);

        /** The approximate time that should elapse between versions */
        private static final Duration SCHEDULING_STEP = Duration.ofDays(60);

        /** The day of week new releases are published */
        private static final DayOfWeek RELEASE_DAY = DayOfWeek.TUESDAY;

        /** How far into release day we should wait before triggering. This is to give the new release some time to propagate */
        private static final Duration COOLDOWN = Duration.ofHours(6);

        /** Rejects null components */
        public CalendarVersionedRelease {
            Objects.requireNonNull(system);
            // Validate cloud here as well, so a null is rejected at construction instead of surfacing when a
            // change is built
            Objects.requireNonNull(cloud);
        }

        /**
         * Returns the change to the first calendar version that is newer than currentVersion, which may be a
         * version that is not yet released at given instant. This never returns empty.
         */
        @Override
        public Optional<Change> change(Version currentVersion, Instant instant) {
            CalendarVersion version = findVersion(instant, currentVersion);
            Instant predictedInstant = instant;
            // The version at instant may already be in use: look ahead one day at a time until a newer one appears
            while (!version.version().isAfter(currentVersion)) {
                predictedInstant = predictedInstant.plus(Duration.ofDays(1));
                version = findVersion(predictedInstant, currentVersion);
            }
            // For a version dated in the future the age is negative, which makes the remaining cool-down reach
            // COOLDOWN past the start of its release date
            Duration cooldown = remainingCooldownOf(COOLDOWN, version.age(instant));
            Instant schedulingInstant = schedulingInstant(instant.plus(cooldown), system);
            return Optional.of(new Change(new OsVersion(version.version(), cloud), schedulingInstant));
        }

        /** Find the most recent version available according to the scheduling step, relative to now */
        static CalendarVersion findVersion(Instant now, Version currentVersion) {
            // Advance from the start of the schedule in whole steps, stopping at the last step not after now
            Instant candidate = START_OF_SCHEDULE;
            while (!candidate.plus(SCHEDULING_STEP).isAfter(now)) {
                candidate = candidate.plus(SCHEDULING_STEP);
            }
            // Move back to the release day on or before that step
            LocalDate date = LocalDate.ofInstant(candidate, ZoneOffset.UTC);
            while (date.getDayOfWeek() != RELEASE_DAY) {
                date = date.minusDays(1);
            }
            return CalendarVersion.from(date, currentVersion);
        }

        /** A version and the date it is released */
        record CalendarVersion(Version version, LocalDate date) {

            private static final DateTimeFormatter CALENDAR_VERSION_PATTERN = DateTimeFormatter.ofPattern("yyyyMMdd");

            /** Returns the version for given date: the components of currentVersion with the formatted date as qualifier */
            private static CalendarVersion from(LocalDate date, Version currentVersion) {
                String qualifier = date.format(CALENDAR_VERSION_PATTERN);
                return new CalendarVersion(new Version(currentVersion.getMajor(),
                                                       currentVersion.getMinor(),
                                                       currentVersion.getMicro(),
                                                       qualifier),
                                           date);
            }

            /** Returns the age of this at given instant, measured from the start of its date in UTC. Negative if the date is in the future */
            private Duration age(Instant instant) {
                return Duration.between(date.atStartOfDay().toInstant(ZoneOffset.UTC), instant);
            }

        }

    }

}