aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/NodeRebooterTest.java
blob: 2477e737759b9e99b213fc7c7db18d81629951c3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.component.Version;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
import org.junit.Test;

import java.time.Duration;
import java.time.Instant;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;

/**
 * Tests the reboot scheduling done by {@link NodeRebooter} on ready hosts.
 *
 * The tests advance a test clock in steps of the configured reboot interval, run the maintainer,
 * and then check the current reboot generation of the hosts. A completed OS upgrade is expected
 * to count as a reboot.
 *
 * @author bratseth
 */
public class NodeRebooterTest {

    /**
     * Walks 15 ready hosts through several reboot intervals and verifies when their
     * current reboot generation is (and is not) expected to increase.
     */
    @Test
    public void testRebootScheduling() {
        Duration rebootInterval = Duration.ofDays(30);
        InMemoryFlagSource flagSource = new InMemoryFlagSource();
        ProvisioningTester tester = createTester(rebootInterval, flagSource);

        makeReadyHosts(15, tester);
        NodeRepository nodeRepository = tester.nodeRepository();
        NodeRebooter rebooter = new NodeRebooter(nodeRepository, flagSource, new TestMetric());

        assertReadyHosts(15, nodeRepository, 0L);

        // No reboots within 0x-1x reboot interval
        tester.clock().advance(rebootInterval);
        rebooter.maintain();
        simulateReboot(nodeRepository);
        assertReadyHosts(15, nodeRepository, 0L);

        // All nodes/hosts reboots within 1x-2x reboot interval
        tester.clock().advance(rebootInterval);
        rebooter.maintain();
        simulateReboot(nodeRepository);
        assertReadyHosts(15, nodeRepository, 1L);

        // OS upgrade just before reboots would have been scheduled again
        tester.clock().advance(rebootInterval);
        scheduleOsUpgrade(nodeRepository);
        simulateOsUpgrade(nodeRepository);
        rebooter.maintain();
        simulateReboot(nodeRepository);
        assertReadyHosts(15, nodeRepository, 1L);

        // OS upgrade counts as reboot, so within 0x-1x there is no reboots
        tester.clock().advance(rebootInterval);
        rebooter.maintain();
        // Complete any reboot the maintainer may have scheduled: without this step a wrongly
        // scheduled reboot would only raise the wanted generation and the assertion on the
        // current generation below could never fail
        simulateReboot(nodeRepository);
        assertReadyHosts(15, nodeRepository, 1L);

        // OS upgrade counts as reboot, but within 1x-2x reboots are scheduled again
        tester.clock().advance(rebootInterval);
        rebooter.maintain();
        simulateReboot(nodeRepository);
        assertReadyHosts(15, nodeRepository, 2L);
    }

    /**
     * Verifies that both hosts are eventually rebooted when the clock is only slightly past
     * the first reboot interval, by running the maintainer repeatedly until that happens.
     */
    @Test(timeout = 30_000) // Avoid looping forever if assertions don't hold
    public void testRebootScheduledEvenWithSmallProbability() {
        Duration rebootInterval = Duration.ofDays(30);
        InMemoryFlagSource flagSource = new InMemoryFlagSource();
        ProvisioningTester tester = createTester(rebootInterval, flagSource);

        makeReadyHosts(2, tester);
        NodeRepository nodeRepository = tester.nodeRepository();
        NodeRebooter rebooter = new NodeRebooter(nodeRepository, flagSource, new TestMetric());

        assertReadyHosts(2, nodeRepository, 0L);

        // No reboots within 0x-1x reboot interval
        tester.clock().advance(rebootInterval);
        rebooter.maintain();
        simulateReboot(nodeRepository);
        assertReadyHosts(2, nodeRepository, 0L);

        // Advancing just a little bit into the 1x-2x interval, there is a >0 probability of
        // rebooting a host. Run until all have been scheduled.
        tester.clock().advance(Duration.ofMinutes(25));
        int rebootedHosts;
        do {
            rebooter.maintain();
            simulateReboot(nodeRepository);
            rebootedHosts = withCurrentRebootGeneration(1L, readyHosts(nodeRepository).asList()).size();
        } while (rebootedHosts != 2); // The test timeout bounds this loop
    }

    /** Asserts that exactly expectedCount ready hosts have the given current reboot generation */
    private void assertReadyHosts(int expectedCount, NodeRepository nodeRepository, long generation) {
        assertEquals(expectedCount, withCurrentRebootGeneration(generation, readyHosts(nodeRepository).asList()).size());
    }

    /** Returns all nodes of type host which are in state ready */
    private static NodeList readyHosts(NodeRepository nodeRepository) {
        return nodeRepository.nodes().list(Node.State.ready).nodeType(NodeType.host);
    }

    /** Creates the given number of ready hosts and sets their current OS version to 7.0 */
    private void makeReadyHosts(int count, ProvisioningTester tester) {
        tester.makeReadyNodes(count, new NodeResources(64, 256, 1000, 10), NodeType.host, 10);
        // Set initial OS version
        Version initialOsVersion = Version.fromString("7.0");
        tester.patchNodes(node -> node.type().isHost(),
                          node -> node.with(node.status().withOsVersion(
                                  node.status().osVersion().withCurrent(Optional.of(initialOsVersion)))));
    }

    /** Set current reboot generation to the wanted reboot generation whenever it is larger (i.e record a reboot) */
    private void simulateReboot(NodeRepository nodeRepository) {
        for (Node node : nodeRepository.nodes().list(Node.State.ready, Node.State.active)) {
            long wanted = node.status().reboot().wanted();
            if (wanted > node.status().reboot().current()) {
                nodeRepository.nodes().write(node.withCurrentRebootGeneration(wanted, nodeRepository.clock().instant()),
                                             () -> {});
            }
        }
    }

    /** Schedule OS upgrade for all host nodes */
    private void scheduleOsUpgrade(NodeRepository nodeRepository) {
        nodeRepository.osVersions().setTarget(NodeType.host, Version.fromString("7.1"), false);
    }

    /**
     * Simulate completion of an OS upgrade: every ready or active node whose current OS version
     * is older than the target version for hosts gets the target recorded as its current version.
     * Does nothing if no target version is set.
     */
    private void simulateOsUpgrade(NodeRepository nodeRepository) {
        var wantedOsVersion = nodeRepository.osVersions().targetFor(NodeType.host);
        if (wantedOsVersion.isEmpty()) return;
        Version target = wantedOsVersion.get();
        for (Node node : nodeRepository.nodes().list(Node.State.ready, Node.State.active)) {
            // Nodes without a current OS version are treated as being on the empty version
            Version current = node.status().osVersion().current().orElse(Version.emptyVersion);
            if (target.isAfter(current)) {
                nodeRepository.nodes().write(node.withCurrentOsVersion(target, nodeRepository.clock().instant()),
                                             () -> {});
            }
        }
    }

    /** Returns the subset of the given nodes which have the given current reboot generation */
    private List<Node> withCurrentRebootGeneration(long generation, List<Node> nodes) {
        return nodes.stream().filter(n -> n.status().reboot().current() == generation).toList();
    }

    /**
     * Creates a tester whose flag source has the reboot interval flag set to the given interval
     * (in whole days), with the clock at a fixed instant and a three-server ZooKeeper
     * connection spec on the mock curator.
     */
    private static ProvisioningTester createTester(Duration rebootInterval, InMemoryFlagSource flagSource) {
        flagSource = flagSource.withIntFlag(PermanentFlags.REBOOT_INTERVAL_IN_DAYS.id(), (int) rebootInterval.toDays());
        ProvisioningTester tester = new ProvisioningTester.Builder().flagSource(flagSource).build();
        tester.clock().setInstant(Instant.ofEpochMilli(1605522619000L)); // Use a fixed random seed
        ((MockCurator) tester.getCurator()).setZooKeeperEnsembleConnectionSpec("zk1.host:1,zk2.host:2,zk3.host:3");
        return tester;
    }

}