aboutsummaryrefslogtreecommitdiffstats
path: root/container-core/src/main/java/com/yahoo/container/protect/Watchdog.java
blob: ff4252425878a9ae7865f5fa0f37bb76369d7f73 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.container.protect;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TimerTask;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.yahoo.concurrent.ThreadLocalDirectory;
import com.yahoo.log.LogLevel;
import com.yahoo.protect.Process;

/**
 * Watchdog for a frozen process, too many timeouts, etc.
 *
 * <p>Every invocation of {@link #run()} performs two checks:
 * <ul>
 *   <li>a freeze check, which reacts when five seconds or more have passed since the previous
 *       invocation, and</li>
 *   <li>a query timeout check, performed at most once every ten seconds, which aggregates the
 *       registered timeout statistics and updates the breakdown flag exposed by
 *       {@link #isBreakdown()}.</li>
 * </ul>
 *
 * <p>Elapsed time is measured with the monotonic {@link System#nanoTime()} clock, so adjustments
 * of the wall clock can neither trigger nor suppress the checks.
 *
 * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
 * @deprecated this is not in use and will be removed in the next major release
 */
// TODO: Remove on Vespa 7
@Deprecated
class Watchdog extends TimerTask {

    /** Name of the system property which, when set to any value, disables the freeze check. */
    public static final String FREEZEDETECTOR_DISABLE = "vespa.freezedetector.disable";

    /** Longest accepted gap between two invocations before the process is regarded as frozen (5 s). */
    private static final long FREEZE_LIMIT_NANOS = 5_000L * 1_000_000L;

    /** Minimum time between two query timeout checks (10 s). */
    private static final long QPS_CHECK_INTERVAL_NANOS = 10_000L * 1_000_000L;

    /** Number of seconds covered by one query timeout check; scales {@code minimalQps} to a query count. */
    private static final long QPS_CHECK_INTERVAL_SECONDS = 10L;

    Logger log = Logger.getLogger(Watchdog.class.getName());

    // System.nanoTime() may return any value, including 0, so an explicit flag rather than a
    // sentinel timestamp marks whether run() has been invoked before.
    private boolean hasRun = false;
    private long lastRunNanos = 0L;
    private long lastQpsCheckNanos = 0L;
    // Local copy to avoid ever _reading_ the volatile version
    private boolean breakdownCopy = false;
    private volatile boolean breakdown;
    // The fraction of queries which must time out to view the QRS as being
    // in breakdown
    private final double timeoutThreshold;
    // The minimal QPS to care about timeoutThreshold
    private final int minimalQps;
    private final boolean disableSevereBreakdownCheck;
    private final List<ThreadLocalDirectory<TimeoutRate, Boolean>> timeoutRegistry = new ArrayList<>();
    private final boolean shutdownIfFrozen;

    /**
     * Creates a watchdog.
     *
     * @param timeoutThreshold the fraction of queries which must time out for the container to be
     *                         regarded as being in breakdown
     * @param minimalQps       the query rate which must be exceeded before {@code timeoutThreshold}
     *                         is taken into account
     * @param shutdownIfFrozen whether a detected freeze should shut the container down instead of
     *                         only being logged
     */
    Watchdog(double timeoutThreshold, int minimalQps, boolean shutdownIfFrozen) {
        this.timeoutThreshold = timeoutThreshold;
        this.minimalQps = minimalQps;
        // Only the presence of the property matters, its value is never inspected.
        this.disableSevereBreakdownCheck = System.getProperty(FREEZEDETECTOR_DISABLE) != null;
        this.shutdownIfFrozen = shutdownIfFrozen;
    }

    /**
     * Runs the freeze check and the query timeout check. The very first invocation only records
     * the current time as the baseline for later invocations.
     */
    @Override
    public void run() {
        // Monotonic clock: a wall clock step must not be mistaken for a frozen process.
        long now = System.nanoTime();
        if (hasRun) {
            severeBreakdown(now);
            queryTimeouts(now);
        } else {
            lastQpsCheckNanos = now;
            hasRun = true;
        }
        lastRunNanos = now;
    }

    /**
     * Reacts if five seconds or more have passed since the previous invocation: logs a full
     * thread dump, and then either hands over to {@code Process.logAndDie} or logs an error,
     * depending on {@code shutdownIfFrozen}.
     *
     * @param now the current {@link System#nanoTime()} reading
     */
    private void severeBreakdown(final long now) {
        if (disableSevereBreakdownCheck) {
            return;
        }
        if (now - lastRunNanos < FREEZE_LIMIT_NANOS) {
            return;
        }

        threadStackMessage();

        if (shutdownIfFrozen) {
            Process.logAndDie("Watchdog timer meant to run ten times per second"
                    + " not run for five seconds or more."
                    + " Assuming severe failure or overloaded node, shutting down container.");
        } else {
            log.log(LogLevel.ERROR,
                    "A watchdog meant to run 10 times a second has not been invoked for 5 seconds."
                            + " This usually means this machine is swapping or otherwise severely overloaded.");
        }
    }

    /** Logs a full thread dump, framed by a start and an end message. */
    private void threadStackMessage() {
        log.log(LogLevel.INFO, "System seems unresponsive, performing full thread dump for diagnostics.");
        threadDump();
        log.log(LogLevel.INFO, "End of diagnostic thread dump.");
    }

    /**
     * Logs one line per live thread, containing the thread name followed by its stack frames
     * separated by single spaces. This is best effort: a failure is logged and otherwise ignored.
     */
    private void threadDump() {
        try {
            Map<Thread, StackTraceElement[]> allStackTraces = Thread.getAllStackTraces();
            for (Map.Entry<Thread, StackTraceElement[]> e : allStackTraces.entrySet()) {
                StringBuilder forOneThread = new StringBuilder();
                forOneThread.append("Stack for thread: ").append(e.getKey().getName()).append(": ");
                int initLen = forOneThread.length();
                for (StackTraceElement s : e.getValue()) {
                    // Separator before every frame except the first.
                    if (forOneThread.length() > initLen) {
                        forOneThread.append(" ");
                    }
                    forOneThread.append(s.toString());
                }
                log.log(LogLevel.INFO, forOneThread.toString());
            }
        } catch (RuntimeException e) {
            // The dump is purely diagnostic, so never let it disturb the watchdog itself,
            // but leave a trace of why the dump is missing or incomplete.
            log.log(Level.WARNING, "Could not complete diagnostic thread dump.", e);
        }
    }

    /**
     * Merges the statistics of all registered timeout directories and updates the breakdown
     * state. Does nothing unless at least ten seconds have passed since the previous check.
     *
     * @param now the current {@link System#nanoTime()} reading
     */
    private void queryTimeouts(final long now) {
        // only check query timeout every 10s
        if (now - lastQpsCheckNanos < QPS_CHECK_INTERVAL_NANOS) {
            return;
        }
        lastQpsCheckNanos = now;

        final TimeoutRate globalState = new TimeoutRate();
        synchronized (timeoutRegistry) {
            for (ThreadLocalDirectory<TimeoutRate, Boolean> timeouts : timeoutRegistry) {
                for (final TimeoutRate threadState : timeouts.fetch()) {
                    globalState.merge(threadState);
                }
            }
        }

        // minimalQps is a per second rate while the statistics cover the check interval.
        // Multiply as long to avoid int overflow for large configured values.
        final long minimalQueries = QPS_CHECK_INTERVAL_SECONDS * minimalQps;
        if (globalState.timeoutFraction() > timeoutThreshold && globalState.getTotal() > minimalQueries) {
            setBreakdown(true);
            log.log(Level.WARNING, "Too many queries timed out. Assuming container is in breakdown.");
        } else {
            if (!breakdown()) {
                return;
            }
            setBreakdown(false);
            log.log(Level.WARNING, "Fewer queries timed out. Assuming container is no longer in breakdown.");
        }
    }

    /** Sets both the volatile breakdown flag and its non-volatile local copy. */
    private void setBreakdown(final boolean state) {
        breakdown = state;
        breakdownCopy = state;
    }

    /** Returns the breakdown state from the non-volatile local copy. */
    private boolean breakdown() {
        return breakdownCopy;
    }

    /** Returns whether the most recent query timeout check regarded the container as being in breakdown. */
    boolean isBreakdown() {
        return breakdown;
    }

    /**
     * Registers a directory of timeout statistics to include in the query timeout check.
     *
     * @param t the directory to register
     */
    void addTimeouts(ThreadLocalDirectory<TimeoutRate, Boolean> t) {
        synchronized (timeoutRegistry) {
            timeoutRegistry.add(t);
        }
    }

    /**
     * Removes a directory of timeout statistics from the query timeout check.
     *
     * @param timeouts the directory to remove
     */
    void removeTimeouts(ThreadLocalDirectory<TimeoutRate, Boolean> timeouts) {
        synchronized (timeoutRegistry) {
            timeoutRegistry.remove(timeouts);
        }
    }

}