aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/os/OsVersionsTest.java
blob: 6a41e766ace8bb014d5c241bb6f264a03fe58ab5 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.os;

import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.test.ManualClock;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.OsVersion;
import com.yahoo.vespa.hosted.provision.node.Status;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import org.junit.Test;

import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * @author mpolden
 */
public class OsVersionsTest {

    // Test harness; every test reaches the node repository through tester.nodeRepository()
    private final ProvisioningTester tester = new ProvisioningTester.Builder().build();
    // Application ID handed to tester.prepareAndActivateInfraApplication when nodes are provisioned
    private final ApplicationId infraApplication = ApplicationId.from("hosted-vespa", "infra", "default");

    /** Walks a host OS target through being set, raised, resumed, halted, downgraded and finally removed. */
    @Test
    public void upgrade() {
        OsVersions versions = new OsVersions(tester.nodeRepository(),
                                             new DelegatingUpgrader(tester.nodeRepository(), Integer.MAX_VALUE));
        provisionInfraApplication(10);
        Supplier<List<Node>> hostNodes = () -> tester.nodeRepository().getNodes(NodeType.host);

        // Setting the first target leaves the wanted version of every host untouched
        assertTrue("No versions set", versions.readChange().targets().isEmpty());
        Version version1 = Version.fromString("7.1");
        versions.setTarget(NodeType.host, version1, Optional.empty(), false);
        assertEquals(version1, versions.targetFor(NodeType.host).get());
        assertTrue("Per-node wanted OS version remains unset",
                   hostNodes.get().stream()
                            .map(host -> host.status().osVersion().wanted())
                            .allMatch(Optional::isEmpty));

        // The first host ends up on a later version without the upgrader being involved
        Node hostOnLaterVersion = hostNodes.get().get(0);
        setCurrentVersion(List.of(hostOnLaterVersion), Version.fromString("8.1"));

        // The target is raised
        Version version2 = Version.fromString("7.2");
        versions.setTarget(NodeType.host, version2, Optional.empty(), false);
        assertEquals(version2, versions.targetFor(NodeType.host).get());

        // Resuming sets a wanted version on every host except the one already on a later version
        versions.resumeUpgradeOf(NodeType.host, true);
        List<Node> allHosts = hostNodes.get();
        boolean othersHaveWantedVersion = allHosts.stream()
                                                  .filter(host -> !host.equals(hostOnLaterVersion))
                                                  .map(host -> host.status().osVersion().wanted())
                                                  .allMatch(Optional::isPresent);
        assertTrue("Wanted version is set", othersHaveWantedVersion);
        Node firstHost = allHosts.get(0);
        assertTrue("Wanted version is not set for host on later version",
                   firstHost.status().osVersion().wanted().isEmpty());

        // Halting leaves no host with a wanted version
        versions.resumeUpgradeOf(NodeType.host, false);
        assertTrue("Wanted version is unset",
                   hostNodes.get().stream()
                            .map(host -> host.status().osVersion().wanted())
                            .allMatch(Optional::isEmpty));

        // Lowering the target is rejected ...
        try {
            versions.setTarget(NodeType.host, version1, Optional.empty(), false);
            fail("Expected exception");
        } catch (IllegalArgumentException ignored) {}

        // ... unless it is forced
        versions.setTarget(NodeType.host, version1, Optional.empty(), true);
        assertEquals(version1, versions.targetFor(NodeType.host).get());

        // Removing the target leaves neither a target nor any wanted version behind
        versions.removeTarget(NodeType.host);
        assertFalse(versions.targetFor(NodeType.host).isPresent());
        assertTrue(hostNodes.get().stream()
                            .map(host -> host.status().osVersion().wanted())
                            .allMatch(Optional::isEmpty));
    }

    /**
     * Checks that each resume sets a wanted version on exactly {@code maxActiveUpgrades} active hosts, that
     * those are the hosts with the lowest current OS version, and that deprovisioned hosts are left out.
     */
    @Test
    public void max_active_upgrades() {
        int totalNodes = 20;
        int maxActiveUpgrades = 5;
        var versions = new OsVersions(tester.nodeRepository(), new DelegatingUpgrader(tester.nodeRepository(), maxActiveUpgrades));
        provisionInfraApplication(totalNodes);
        Supplier<NodeList> hostNodes = () -> tester.nodeRepository().list().state(Node.State.active).nodeType(NodeType.host);

        // 5 nodes have no version. The other 15 are spread across different versions
        var hostNodesList = hostNodes.get().asList();
        for (int i = totalNodes - maxActiveUpgrades - 1; i >= 0; i--) {
            setCurrentVersion(List.of(hostNodesList.get(i)), new Version(7, 0, i));
        }

        // Deprovisioned hosts are not considered
        for (var host : tester.makeReadyNodes(10, "default", NodeType.host)) {
            tester.nodeRepository().fail(host.hostname(), Agent.system, OsVersions.class.getSimpleName());
            tester.nodeRepository().removeRecursively(host.hostname());
        }
        assertEquals(10, tester.nodeRepository().getNodes(Node.State.deprovisioned).size());

        // Set target
        var version1 = Version.fromString("7.1");
        versions.setTarget(NodeType.host, version1, Optional.empty(), false);
        assertEquals(version1, versions.targetFor(NodeType.host).get());

        // Activate target, one batch of maxActiveUpgrades hosts per iteration
        for (int i = 0; i < totalNodes; i += maxActiveUpgrades) {
            versions.resumeUpgradeOf(NodeType.host, true);
            var nodes = hostNodes.get();
            var nodesUpgrading = nodes.changingOsVersion();
            assertEquals("Target is changed for a subset of nodes", maxActiveUpgrades, nodesUpgrading.size());
            assertEquals("Wanted version is set for nodes upgrading", version1,
                         minVersion(nodesUpgrading, OsVersion::wanted));
            // Hosts without a current version sort first, as they compare as Version.emptyVersion
            var nodesOnLowestVersion = nodes.asList().stream()
                                            .sorted(Comparator.comparing(node -> node.status().osVersion().current().orElse(Version.emptyVersion)))
                                            .limit(maxActiveUpgrades)
                                            .collect(Collectors.toList());
            // Expected value goes first so that a failure message reports the two lists the right way around
            assertEquals("Nodes on lowest version are told to upgrade",
                         nodesOnLowestVersion, nodesUpgrading.asList());
            completeUpgradeOf(nodesUpgrading.asList());
        }

        // Activating again after all nodes have upgraded does nothing
        versions.resumeUpgradeOf(NodeType.host, true);
        assertEquals("All nodes upgraded", version1, minVersion(hostNodes.get(), OsVersion::current));
    }

    /** Checks that no host keeps an older wanted version once a newer target is set and the upgrade resumed. */
    @Test
    public void newer_upgrade_aborts_upgrade_to_stale_version() {
        OsVersions versions = new OsVersions(tester.nodeRepository(),
                                             new DelegatingUpgrader(tester.nodeRepository(), Integer.MAX_VALUE));
        provisionInfraApplication(10);
        Supplier<NodeList> hostNodes = () -> tester.nodeRepository().list().nodeType(NodeType.host);

        // The first five hosts already want an older version
        Version staleVersion = Version.fromString("7.1");
        List<Node> hostsOnStaleTarget = hostNodes.get().asList().subList(0, 5);
        setWantedVersion(hostsOnStaleTarget, staleVersion);

        // A newer target is set and the upgrade is resumed
        Version newestVersion = Version.fromString("7.2");
        versions.setTarget(NodeType.host, newestVersion, Optional.empty(), false);
        versions.resumeUpgradeOf(NodeType.host, true);

        // The lowest wanted version found on any host is now the newest target
        Version lowestWanted = minVersion(hostNodes.get(), OsVersion::wanted);
        assertEquals(newestVersion, lowestWanted);
    }

    /**
     * Upgrades hosts using {@code RetiringUpgrader} and a total time budget. Checks that another host is marked
     * for deprovisioning only after a per-host share of the budget has passed, and that the time of the last
     * retirement is carried over to the next target.
     */
    @Test
    public void upgrade_by_retiring() {
        OsVersions versions = new OsVersions(tester.nodeRepository(), new RetiringUpgrader(tester.nodeRepository()));
        ManualClock clock = (ManualClock) tester.nodeRepository().clock();
        int hostCount = 10;

        // Every provisioned host gets two child nodes
        List<Node> hosts = provisionInfraApplication(hostCount);
        NodeResources childResources = new NodeResources(2, 4, 8, 1);
        hosts.forEach(host -> tester.makeReadyVirtualDockerNodes(2, childResources, host.hostname()));
        Supplier<NodeList> hostNodes = () -> tester.nodeRepository().list()
                                                   .nodeType(NodeType.host)
                                                   .not().state(Node.State.deprovisioned);

        // Set the target with a budget and start upgrading
        Version version1 = Version.fromString("7.1");
        Duration totalBudget = Duration.ofHours(12);
        Duration nodeBudget = totalBudget.dividedBy(hostCount);
        versions.setTarget(NodeType.host, version1, Optional.of(totalBudget), false);
        versions.resumeUpgradeOf(NodeType.host, true);

        // Exactly one host is deprovisioning
        assertEquals(1, hostNodes.get().deprovisioning().size());

        // Resuming before the clock has moved changes nothing; both children of the host are retiring
        versions.resumeUpgradeOf(NodeType.host, true);
        NodeList nodesDeprovisioning = hostNodes.get().deprovisioning();
        assertEquals(1, nodesDeprovisioning.size());
        Node firstDeprovisioning = nodesDeprovisioning.asList().get(0);
        assertEquals(2, retiringChildrenOf(firstDeprovisioning).size());

        // Once one per-host budget has passed, a second host is retired
        clock.advance(nodeBudget);
        versions.resumeUpgradeOf(NodeType.host, true);
        assertEquals(2, hostNodes.get().deprovisioning().size());

        // Both hosts finish their upgrade by being reprovisioned
        completeUpgradeOf(hostNodes.get().deprovisioning().asList());
        assertEquals(2, hostNodes.get().onOsVersion(version1).size());

        // The rest of the hosts are upgraded one per budget interval
        int hostsLeft = hostCount - 2;
        for (int upgraded = 0; upgraded < hostsLeft; upgraded++) {
            clock.advance(nodeBudget);
            versions.resumeUpgradeOf(NodeType.host, true);
            nodesDeprovisioning = hostNodes.get().deprovisioning();
            assertEquals(1, nodesDeprovisioning.size());
            Node deprovisioningHost = nodesDeprovisioning.asList().get(0);
            assertEquals(2, retiringChildrenOf(deprovisioningHost).size());
            completeUpgradeOf(nodesDeprovisioning.asList());
        }

        // Every host is on the new version, none is deprovisioning, and the old hosts are deprovisioned
        assertEquals(hostCount, hostNodes.get().onOsVersion(version1).not().deprovisioning().size());
        assertEquals(hostCount, tester.nodeRepository().list().state(Node.State.deprovisioned).size());
        var lastRetiredAt = clock.instant().truncatedTo(ChronoUnit.MILLIS);

        // Resuming when everything is upgraded retires nothing
        versions.resumeUpgradeOf(NodeType.host, true);
        assertEquals(0, hostNodes.get().deprovisioning().size());

        // Setting a new target a day later keeps the time of the last retirement
        clock.advance(Duration.ofDays(1));
        Version version2 = Version.fromString("7.2");
        versions.setTarget(NodeType.host, version2, Optional.of(totalBudget), false);
        assertEquals(lastRetiredAt, versions.readChange().targets().get(NodeType.host).lastRetiredAt().get());
    }

    /** Checks that with a zero budget, resuming once per config host leaves all of them deprovisioning. */
    @Test
    public void upgrade_by_retiring_everything_at_once() {
        OsVersions versions = new OsVersions(tester.nodeRepository(), new RetiringUpgrader(tester.nodeRepository()));
        int hostCount = 3;
        provisionInfraApplication(hostCount, NodeType.confighost);
        Supplier<NodeList> hostNodes = () -> tester.nodeRepository().list()
                                                   .nodeType(NodeType.confighost)
                                                   .not().state(Node.State.deprovisioned);

        // Set the target with a zero budget, then resume once per host
        Version version1 = Version.fromString("7.1");
        versions.setTarget(NodeType.confighost, version1, Optional.of(Duration.ZERO), false);
        for (int resumed = 0; resumed < hostCount; resumed++) {
            versions.resumeUpgradeOf(NodeType.confighost, true);
        }

        // Every host is deprovisioning
        NodeList deprovisioningHosts = hostNodes.get().deprovisioning();
        assertEquals(hostCount, deprovisioningHosts.size());

        // Reprovisioning the hosts completes the upgrade
        completeUpgradeOf(hostNodes.get().deprovisioning().asList(), NodeType.confighost);
        assertEquals(hostCount, hostNodes.get().onOsVersion(version1).size());
    }

    /** Returns the children of the given parent which have their want-to-retire flag set. */
    private NodeList retiringChildrenOf(Node parent) {
        NodeList children = tester.nodeRepository().list().childrenOf(parent);
        return children.matching(child -> child.status().wantToRetire());
    }

    /** Provisions {@code nodeCount} nodes of type {@link NodeType#host} by delegating to the two-argument overload. */
    private List<Node> provisionInfraApplication(int nodeCount) {
        return provisionInfraApplication(nodeCount, NodeType.host);
    }

    /**
     * Makes {@code nodeCount} ready nodes of the given type, activates the infrastructure application for that
     * type, and returns the nodes as they are read back from the node repository afterwards.
     */
    private List<Node> provisionInfraApplication(int nodeCount, NodeType nodeType) {
        List<Node> readyNodes = tester.makeReadyNodes(nodeCount, "default", nodeType);
        tester.prepareAndActivateInfraApplication(infraApplication, nodeType);
        // Look each node up again by hostname; a node which is not found is left out of the result
        return readyNodes.stream()
                         .map(node -> tester.nodeRepository().getNode(node.hostname()))
                         .flatMap(Optional::stream)
                         .collect(Collectors.toList());
    }

    /**
     * Returns the lowest version {@code versionField} yields across the given nodes, ignoring nodes for which
     * it yields nothing, or {@link Version#emptyVersion} if no node has a version.
     */
    private Version minVersion(NodeList nodes, Function<OsVersion, Optional<Version>> versionField) {
        return nodes.asList().stream()
                    .map(node -> versionField.apply(node.status().osVersion()))
                    .flatMap(Optional::stream)
                    .min(Comparator.naturalOrder())
                    .orElse(Version.emptyVersion);
    }

    /** Writes the given version as the wanted OS version of each of the given nodes. */
    private void setWantedVersion(List<Node> nodes, Version wantedVersion) {
        writeNode(nodes, node -> {
            OsVersion withWanted = node.status().osVersion().withWanted(Optional.of(wantedVersion));
            return node.with(node.status().withOsVersion(withWanted));
        });
    }

    /** Writes the given version as the current OS version of each of the given nodes. */
    private void setCurrentVersion(List<Node> nodes, Version currentVersion) {
        writeNode(nodes, node -> {
            OsVersion withCurrent = node.status().osVersion().withCurrent(Optional.of(currentVersion));
            return node.with(node.status().withOsVersion(withCurrent));
        });
    }

    /**
     * Applies {@code updateFunc} to each of the given nodes and writes the result to the node repository.
     * Each node is locked and then read again by hostname, so the function receives the stored node.
     */
    private void writeNode(List<Node> nodes, UnaryOperator<Node> updateFunc) {
        for (Node given : nodes) {
            try (var lock = tester.nodeRepository().lock(given)) {
                Node stored = tester.nodeRepository().getNode(given.hostname()).orElseThrow();
                Node updated = updateFunc.apply(stored);
                tester.nodeRepository().write(updated, lock);
            }
        }
    }

    /** Completes the upgrade of the given nodes as {@link NodeType#host} by delegating to the two-argument overload. */
    private void completeUpgradeOf(List<Node> nodes) {
        completeUpgradeOf(nodes, NodeType.host);
    }

    /**
     * Completes the upgrade of the given nodes by writing their wanted OS version as their current one.
     * A node which wants to deprovision is first parked and removed, and a newly provisioned node of the
     * given type receives the version in its place.
     */
    private void completeUpgradeOf(List<Node> nodes, NodeType nodeType) {
        writeNode(nodes, node -> {
            Optional<Version> targetVersion = node.status().osVersion().wanted();
            Node upgraded = node;
            if (node.status().wantToDeprovision()) {
                // Replace the stale host with a freshly provisioned one
                String reason = OsVersionsTest.class.getSimpleName();
                tester.nodeRepository().park(node.hostname(), false, Agent.system, reason);
                tester.nodeRepository().removeRecursively(node.hostname());
                upgraded = provisionInfraApplication(1, nodeType).get(0);
            }
            OsVersion onTargetVersion = upgraded.status().osVersion().withCurrent(targetVersion);
            return upgraded.with(upgraded.status().withOsVersion(onTargetVersion));
        });
    }

}