summaryrefslogtreecommitdiffstats
path: root/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/maintenance/StorageMaintainer.java
blob: 16d0d823ac2c50a658cfd77a31b208facabe6d51 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.maintenance;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.yahoo.config.provision.NodeType;
import java.util.logging.Level;
import com.yahoo.vespa.hosted.dockerapi.Container;
import com.yahoo.vespa.hosted.dockerapi.ContainerName;
import com.yahoo.vespa.hosted.node.admin.component.TaskContext;
import com.yahoo.vespa.hosted.node.admin.maintenance.coredump.CoredumpHandler;
import com.yahoo.vespa.hosted.node.admin.maintenance.disk.CoredumpCleanupRule;
import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanup;
import com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule;
import com.yahoo.vespa.hosted.node.admin.maintenance.disk.LinearCleanupRule;
import com.yahoo.vespa.hosted.node.admin.nodeadmin.ConvergenceException;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.task.util.file.FileFinder;
import com.yahoo.vespa.hosted.node.admin.task.util.file.DiskSize;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixPath;
import com.yahoo.vespa.hosted.node.admin.task.util.process.Terminal;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.logging.Logger;

import static com.yahoo.vespa.hosted.node.admin.maintenance.disk.DiskCleanupRule.Priority;
import static com.yahoo.yolean.Exceptions.uncheck;

/**
 * @author freva
 */
public class StorageMaintainer {
    private static final Logger logger = Logger.getLogger(StorageMaintainer.class.getName());
    private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter
            .ofPattern("yyyyMMddHHmmss").withZone(ZoneOffset.UTC);

    private final Terminal terminal;
    private final CoredumpHandler coredumpHandler;
    private final Path archiveContainerStoragePath;
    private final DiskCleanup diskCleanup;
    private final Clock clock;

    // We cache disk usage to avoid doing expensive disk operations so often
    private final Cache<ContainerName, DiskSize> diskUsage = CacheBuilder.newBuilder()
            .maximumSize(100)
            .expireAfterWrite(5, TimeUnit.MINUTES)
            .build();

    public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath) {
        this(terminal, coredumpHandler, archiveContainerStoragePath, new DiskCleanup(), Clock.systemUTC());
    }

    public StorageMaintainer(Terminal terminal, CoredumpHandler coredumpHandler, Path archiveContainerStoragePath, DiskCleanup diskCleanup, Clock clock) {
        this.terminal = terminal;
        this.coredumpHandler = coredumpHandler;
        this.archiveContainerStoragePath = archiveContainerStoragePath;
        this.diskCleanup = diskCleanup;
        this.clock = clock;
    }

    public Optional<DiskSize> diskUsageFor(NodeAgentContext context) {
        try {
            DiskSize cachedDiskUsage = diskUsage.getIfPresent(context.containerName());
            if (cachedDiskUsage != null) return Optional.of(cachedDiskUsage);

            DiskSize diskUsageBytes = getDiskUsed(context, context.pathOnHostFromPathInNode("/"));
            diskUsage.put(context.containerName(), diskUsageBytes);
            return Optional.of(diskUsageBytes);
        } catch (Exception e) {
            context.log(logger, LogLevel.WARNING, "Failed to get disk usage", e);
            return Optional.empty();
        }
    }

    DiskSize getDiskUsed(TaskContext context, Path path) {
        if (!Files.exists(path)) return DiskSize.ZERO;

        String output = terminal.newCommandLine(context)
                .add("du", "-xsk", path.toString())
                .setTimeout(Duration.ofSeconds(60))
                .executeSilently()
                .getOutput();

        String[] results = output.split("\t");
        if (results.length != 2) {
            throw new ConvergenceException("Result from disk usage command not as expected: " + output);
        }

        return DiskSize.of(Long.parseLong(results[0]), DiskSize.Unit.kiB);
    }

    public boolean cleanDiskIfFull(NodeAgentContext context) {
        double totalBytes = context.node().diskSize().bytes();
        // Delete enough bytes to get below 80% disk usage, but only if we are already using more than 90% disk
        long bytesToRemove = diskUsageFor(context)
                .map(diskUsage -> (long) (diskUsage.bytes() - 0.8 * totalBytes))
                .filter(bytes -> bytes > totalBytes * 0.1)
                .orElse(0L);

        if (bytesToRemove > 0 && diskCleanup.cleanup(context, createCleanupRules(context), bytesToRemove)) {
            diskUsage.invalidate(context.containerName());
            return true;
        }
        return false;
    }

    private List<DiskCleanupRule> createCleanupRules(NodeAgentContext context) {
        Instant start = clock.instant();
        double oneMonthSeconds = Duration.ofDays(30).getSeconds();
        Function<Instant, Double> monthNormalizer = instant -> Duration.between(instant, start).getSeconds() / oneMonthSeconds;
        Function<String, Path> pathOnHostUnderContainerVespaHome = path ->
                context.pathOnHostFromPathInNode(context.pathInNodeUnderVespaHome(path));
        List<DiskCleanupRule> rules = new ArrayList<>();

        rules.add(CoredumpCleanupRule.forContainer(pathOnHostUnderContainerVespaHome.apply("var/crash")));

        if (context.node().membership().map(m -> m.type().isContainer()).orElse(false))
            rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/qrs")).list(),
                    fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST));

        if (context.nodeType() == NodeType.tenant && context.node().membership().map(m -> m.type().isAdmin()).orElse(false))
            rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/vespa/logarchive")).list(),
                    fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.HIGHEST));

        if (context.nodeType() == NodeType.proxy)
            rules.add(new LinearCleanupRule(() -> FileFinder.files(pathOnHostUnderContainerVespaHome.apply("logs/nginx")).list(),
                    fa -> monthNormalizer.apply(fa.lastModifiedTime()), Priority.LOWEST, Priority.MEDIUM));

        return rules;
    }

    /** Checks if container has any new coredumps, reports and archives them if so */
    public void handleCoreDumpsForContainer(NodeAgentContext context, Optional<Container> container) {
        coredumpHandler.converge(context, () -> getCoredumpNodeAttributes(context, container));
    }

    private Map<String, Object> getCoredumpNodeAttributes(NodeAgentContext context, Optional<Container> container) {
        Map<String, String> attributes = new HashMap<>();
        attributes.put("hostname", context.node().hostname());
        attributes.put("region", context.zone().getRegionName().value());
        attributes.put("environment", context.zone().getEnvironment().value());
        attributes.put("flavor", context.node().flavor());
        attributes.put("kernel_version", System.getProperty("os.version"));
        attributes.put("cpu_microcode_version", getMicrocodeVersion());

        container.map(c -> c.image).ifPresent(image -> attributes.put("docker_image", image.asString()));
        context.node().parentHostname().ifPresent(parent -> attributes.put("parent_hostname", parent));
        context.node().currentVespaVersion().ifPresent(version -> attributes.put("vespa_version", version.toFullString()));
        context.node().owner().ifPresent(owner -> {
            attributes.put("tenant", owner.tenant().value());
            attributes.put("application", owner.application().value());
            attributes.put("instance", owner.instance().value());
        });
        return Collections.unmodifiableMap(attributes);
    }

    /**
     * Prepares the container-storage for the next container by deleting/archiving all the data of the current container.
     * Removes old files, reports coredumps and archives container data, runs when container enters state "dirty"
     */
    public void archiveNodeStorage(NodeAgentContext context) {
        Path logsDirInContainer = context.pathInNodeUnderVespaHome("logs");
        Path containerLogsInArchiveDir = archiveContainerStoragePath
                .resolve(context.containerName().asString() + "_" + DATE_TIME_FORMATTER.format(clock.instant()) + logsDirInContainer);
        UnixPath containerLogsOnHost = new UnixPath(context.pathOnHostFromPathInNode(logsDirInContainer));

        if (containerLogsOnHost.exists()) {
            new UnixPath(containerLogsInArchiveDir).createParents();
            containerLogsOnHost.moveIfExists(containerLogsInArchiveDir);
        }
        new UnixPath(context.pathOnHostFromPathInNode("/")).deleteRecursively();
    }

    private String getMicrocodeVersion() {
        String output = uncheck(() -> Files.readAllLines(Paths.get("/proc/cpuinfo")).stream()
                .filter(line -> line.startsWith("microcode"))
                .findFirst()
                .orElseThrow(() -> new ConvergenceException("No microcode information found in /proc/cpuinfo")));

        String[] results = output.split(":");
        if (results.length != 2) {
            throw new ConvergenceException("Result from detect microcode command not as expected: " + output);
        }

        return results[1].trim();
    }
}