aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/test/java/com/yahoo/vespa/clustercontroller/core/SlobrokTest.java
blob: 65dd13ab8a692223c5be14626b3cc542df84c02c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;

import com.yahoo.jrt.slobrok.server.Slobrok;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import java.util.logging.Level;
import java.util.logging.Logger;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

@ExtendWith(CleanupZookeeperLogsOnSuccess.class)
public class SlobrokTest extends FleetControllerTest {

    private static final Logger log = Logger.getLogger(SlobrokTest.class.getName());

    private final FakeTimer timer = new FakeTimer();

    @Test
    void testSingleSlobrokRestart() throws Exception {
        FleetControllerOptions.Builder builder = defaultOptions()
                .setNodeStateRequestTimeoutMS(60 * 60 * 1000)
                .setMaxSlobrokDisconnectGracePeriod(60 * 60 * 1000);
        setUpFleetController(timer, builder);
        setUpVdsNodes(timer);
        waitForStableSystem();

        FleetController fleetController = fleetController();
        int version = fleetController.getSystemState().getVersion();
        int slobrokPort = slobrok.port();

        // Test that we survive some slobrok instability without changing system state.
        for (int j = 0; j < 4; ++j) {
            log.log(Level.INFO, "Mirror updateForDistributor count is " + fleetController.getSlobrokMirrorUpdates());
            log.log(Level.INFO, "STOPPING SLOBROK SERVER (" + (j + 1) + "/4)");
            slobrok.stop();
            for (int i = 0; i < 10; ++i) {
                // Force one node to at least notice that the slobrok server is gone
                if (i == 5) {
                    log.log(Level.INFO, "Forcing one node to initate a resend: " + nodes.get(3));
                    nodes.get(3).replyToPendingNodeStateRequests();
                }
                waitForCompleteCycle();
                timer.advanceTime(100);
            }
            log.log(Level.INFO, "STARTING SLOBROK SERVER AGAIN (" + (j + 1) + "/4)");
            slobrok = new Slobrok(slobrokPort);
            // May take up to 30 seconds for slobrok clients to re-register. Trigger retry.
            for (DummyVdsNode node : nodes) {
                node.disconnectSlobrok();
                node.registerSlobrok();
            }
            //fleetController.setFreshSlobrokMirror();
            waitForCompleteCycle();
            fleetController.waitForNodesInSlobrok(10, 10, timeout());

            log.log(Level.INFO, "Waiting for cluster to be up and available again");
            for (int i = 0; i < timeout().toMillis(); i += 10) {
                if (clusterAvailable()) break;
                timer.advanceTime(1000);
                waitForCompleteCycle();
                try {
                    Thread.sleep(10);
                } catch (InterruptedException e) { /* ignore */
                }
            }
            assertClusterAvailable();
        }

        assertEquals(version, fleetController.getSystemState().getVersion(), "Cluster state was affected, although it should not have been.");
    }

    @Test
    void testNodeTooLongOutOfSlobrok() throws Exception {
        FleetControllerOptions.Builder builder = defaultOptions()
                .setMaxSlobrokDisconnectGracePeriod(60 * 1000)
                .setNodeStateRequestTimeoutMS(10000 * 60 * 1000);
        setUpFleetController(timer, builder);
        setUpVdsNodes(timer);
        waitForStableSystem();

        int version = fleetController().getSystemState().getVersion();
        nodes.get(0).disconnectSlobrok();
        log.log(Level.INFO, "DISCONNECTED NODE FROM SLOBROK. SHOULD BE IN COOLDOWN PERIOD");
        fleetController().waitForNodesInSlobrok(9, 10, timeout());
        synchronized(timer) {
            nodes.get(0).sendGetNodeStateReply(0);
        }

        // Give system a little time to possible faultily removing node not in slobrok
        timer.advanceTime(1000);
        try {
            Thread.sleep(10);
        } catch (InterruptedException e) { /* ignore */
        }
        assertEquals(version, fleetController().getSystemState().getVersion());
        log.log(Level.INFO, "JUMPING TIME. NODE SHOULD BE MARKED DOWN");
        // At this point the fleetcontroller might not have noticed that the node is out of slobrok yet.
        // Thus we keep advancing time another minute such that it should get down.
        timer.advanceTime(builder.nodeStateRequestTimeoutMS() + builder.maxSlobrokDisconnectGracePeriod());
        waitForState("version:\\d+ distributor:10 .0.s:d storage:10");
    }

    private boolean clusterAvailable() {
        boolean ok = true;
        ContentCluster cluster = fleetController().getCluster();
        for (NodeInfo info : cluster.getNodeInfos()) {
            if (info.getConnectionAttemptCount() > 0) ok = false;
            if (info.getLatestNodeStateRequestTime() == null) ok = false;
        }
        return ok;
    }
    private void assertClusterAvailable() {
        ContentCluster cluster = fleetController().getCluster();
        for (NodeInfo info : cluster.getNodeInfos()) {
            assertEquals(0, info.getConnectionAttemptCount(), "Node " + info + " connection attempts.");
            assertTrue(info.getLatestNodeStateRequestTime() != 0, "Node " + info + " has no last request time.");
        }
    }
}