node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/NodeHealthTracker.java
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ApplicationLockException;
import com.yahoo.config.provision.HostLivenessTracker;
import com.yahoo.jdisc.Metric;
import com.yahoo.lang.MutableInteger;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.applicationmodel.ServiceInstance;
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.service.monitor.ServiceMonitor;
import com.yahoo.yolean.Exceptions;

import java.time.Duration;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.stream.Collectors;

import static java.util.stream.Collectors.counting;

/**
 * Checks whether nodes are responding and updates their status accordingly:
 * records liveness events for ready nodes, and keeps the down/up state of
 * active nodes in sync with what the service monitor observes.
 *
 * @author bratseth
 */
public class NodeHealthTracker extends NodeRepositoryMaintainer {

    /** Provides information about the status of ready hosts */
    private final HostLivenessTracker hostLivenessTracker;

    /** Provides (more accurate) information about the status of active hosts */
    private final ServiceMonitor serviceMonitor;

    public NodeHealthTracker(HostLivenessTracker hostLivenessTracker,
                             ServiceMonitor serviceMonitor, NodeRepository nodeRepository,
                             Duration interval, Metric metric) {
        super(nodeRepository, interval, metric);
        this.hostLivenessTracker = hostLivenessTracker;
        this.serviceMonitor = serviceMonitor;
    }

    @Override
    protected double maintain() {
        return (updateReadyNodeLivenessEvents() + updateActiveNodeDownState()) / 2;
    }
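
    // Note: the success factor averages the two sub-tasks. Assuming the
    // inherited asSuccessFactor(attempts, failures) evaluates to
    // (attempts - failures) / attempts, and to 1.0 when nothing was attempted,
    // a run where 10 down-state updates were attempted and 2 failed to get the
    // application lock reports (1.0 + 0.8) / 2 = 0.9.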

    private double updateReadyNodeLivenessEvents() {
        // Update nodes' last-request events through ZooKeeper to collect the requests made to all config servers.
        // We do this here ("lazily") to avoid writing to ZooKeeper on every config request.
        try (Mutex lock = nodeRepository().nodes().lockUnallocated()) {
            for (Node node : nodeRepository().nodes().list(Node.State.ready)) {
                Optional<Instant> lastLocalRequest = hostLivenessTracker.lastRequestFrom(node.hostname());
                if (lastLocalRequest.isEmpty()) continue;

                if (!node.history().hasEventAfter(History.Event.Type.requested, lastLocalRequest.get())) {
                    History updatedHistory = node.history()
                                                 .with(new History.Event(History.Event.Type.requested, Agent.NodeHealthTracker, lastLocalRequest.get()));
                    nodeRepository().nodes().write(node.with(updatedHistory), lock);
                }
            }
        }
        return 1.0;
    }
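
    // Illustration, assuming History keeps at most one event per type: if this
    // config server saw a request from a ready host at 12:00 while the stored
    // "requested" event (written by another config server) is from 11:55, the
    // hasEventAfter check in the method above fails and a new event is written,
    // advancing the shared record to the latest request seen by any server.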

    /**
     * If the node is down (see {@link #allDown}) and has no "down" history record, one is added.
     * Otherwise, any existing "down" history record is removed.
     */
    private double updateActiveNodeDownState() {
        var attempts = new MutableInteger(0);
        var failures = new MutableInteger(0);
        NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
        serviceMonitor.getServiceModelSnapshot().getServiceInstancesByHostName().forEach((hostname, serviceInstances) -> {
            Optional<Node> node = activeNodes.node(hostname.toString());
            if (node.isEmpty()) return;

            // Already correct record, nothing to do
            boolean isDown = allDown(serviceInstances);
            if (isDown == node.get().isDown()) return;

            // Lock and update status
            ApplicationId owner = node.get().allocation().get().owner();
            try (var lock = nodeRepository().nodes().lock(owner)) {
                node = getNode(hostname.toString(), owner, lock); // Re-get inside lock
                if (node.isEmpty()) return; // Node disappeared or changed allocation
                attempts.add(1);
                if (isDown) {
                    recordAsDown(node.get(), lock);
                } else {
                    clearDownRecord(node.get(), lock);
                }
            } catch (ApplicationLockException e) {
                // Fine, carry on with other nodes. We'll try updating this one in the next run
                log.log(Level.WARNING, "Could not lock " + owner + ": " + Exceptions.toMessageString(e));
                failures.add(1);
            }
        });
        return asSuccessFactor(attempts.get(), failures.get());
    }

    /**
     * Returns true if the node is considered bad: all monitored services are down.
     * If a node remains bad for a long time, the NodeFailer will try to fail it.
     */
    static boolean allDown(List<ServiceInstance> services) {
        Map<ServiceStatus, Long> countsByStatus = services.stream()
                                                          .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, counting()));

        return countsByStatus.getOrDefault(ServiceStatus.UP, 0L) <= 0L &&
               countsByStatus.getOrDefault(ServiceStatus.DOWN, 0L) > 0L;
    }
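
    // Examples, assuming ServiceStatus values include UP, DOWN and NOT_CHECKED:
    //   [DOWN, DOWN]        -> true  (no service up, at least one known down)
    //   [DOWN, NOT_CHECKED] -> true  (still no service positively up)
    //   [UP, DOWN]          -> false (one service is up)
    //   [NOT_CHECKED]       -> false (nothing has been observed down)
    //   []                  -> false (no DOWN observation)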

    /** Get node by given hostname and application. The applicationLock must be held when calling this */
    private Optional<Node> getNode(String hostname, ApplicationId application, @SuppressWarnings("unused") Mutex applicationLock) {
        return nodeRepository().nodes().node(hostname, Node.State.active)
                               .filter(node -> node.allocation().isPresent())
                               .filter(node -> node.allocation().get().owner().equals(application));
    }
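
    // Re-reading inside the lock guards against races: between the unlocked
    // snapshot read in updateActiveNodeDownState() and acquiring the
    // application lock, the node may have been deactivated or reallocated to
    // another application, in which case this returns empty and the caller
    // skips the update.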

    /** Record a node as down if not already recorded */
    private void recordAsDown(Node node, Mutex lock) {
        if (node.history().event(History.Event.Type.down).isPresent()) return; // already down; don't change the down timestamp
        nodeRepository().nodes().write(node.downAt(clock().instant(), Agent.NodeHealthTracker), lock);
    }

    /** Clear down record for node, if any */
    private void clearDownRecord(Node node, Mutex lock) {
        if (node.history().event(History.Event.Type.down).isEmpty()) return;
        nodeRepository().nodes().write(node.up(), lock);
    }

}
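
// A minimal sketch of how this maintainer might be driven from a test in the
// same package (maintain() is protected); the collaborator names below are
// illustrative assumptions, not this module's actual test fixtures:
//
//   NodeHealthTracker tracker = new NodeHealthTracker(
//           livenessTracker,          // a HostLivenessTracker stub
//           serviceMonitor,           // a ServiceMonitor stub
//           nodeRepository,           // a NodeRepository test instance
//           Duration.ofSeconds(30),   // maintenance interval (arbitrary)
//           metric);                  // a Metric stub
//   double successFactor = tracker.maintain(); // 1.0 means every update succeeded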