aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/HostFlavorUpgrader.java
blob: 83a710cb5a9c22d2c45c87ae454ea8fb4b8cf83f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.NodeAllocationException;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.provisioning.HostProvisioner;

import java.time.Duration;
import java.util.Objects;
import java.util.Optional;
import java.util.Random;
import java.util.function.Predicate;
import java.util.logging.Level;

/**
 * Maintainer that tries to move one host per run from an older flavor generation to a newer one.
 *
 * A candidate host is flagged for flavor upgrade and the application owning one of its children is redeployed,
 * so that a replacement host on a newer generation gets provisioned while the old host is retired.
 *
 * When the cloud provider has no capacity for the newer generation, the flag is removed again and the next
 * candidate host (if any) is tried instead.
 *
 * After a host has been flagged, {@link HostResumeProvisioner} is responsible for completing provisioning of
 * its replacement.
 *
 * @author mpolden
 */
public class HostFlavorUpgrader extends NodeRepositoryMaintainer {

    private final HostProvisioner hostProvisioner;
    private final Random random;
    private final Deployer deployer;
    private final Metric metric;

    public HostFlavorUpgrader(NodeRepository nodeRepository, Duration interval, Metric metric, Deployer deployer, HostProvisioner hostProvisioner) {
        super(nodeRepository, interval, metric);
        this.hostProvisioner = Objects.requireNonNull(hostProvisioner);
        this.deployer = Objects.requireNonNull(deployer);
        this.metric = Objects.requireNonNull(metric);
        // Seeded from the node repository clock
        this.random = new Random(nodeRepository.clock().millis());
    }

    /**
     * Runs one maintenance pass.
     *
     * @return 0.0 when the node repository is not working, 1.0 in every other case (including when the zone is
     *         skipped because it is not relevant or not stable)
     */
    @Override
    protected double maintain() {
        // Zones with static capacity are not relevant, and test deployments are short-lived
        boolean staticCapacity = !nodeRepository().zone().cloud().dynamicProvisioning();
        if (staticCapacity || nodeRepository().zone().environment().isTest()) return 1.0;
        if (!nodeRepository().nodes().isWorking()) return 0.0;

        NodeList allNodes = nodeRepository().nodes().list();
        return NodeMover.zoneIsStable(allNodes) ? upgradeHostFlavor(allNodes) : 1.0;
    }

    /**
     * Walks the active tenant nodes in random order and stops at the first one whose host was successfully
     * flagged for upgrade and redeployed.
     *
     * @return always 1.0
     */
    private double upgradeHostFlavor(NodeList allNodes) {
        // Random order, so that a host which repeatedly fails to upgrade does not block all the others
        NodeList candidates = allNodes.nodeType(NodeType.tenant)
                                      .state(Node.State.active)
                                      .shuffle(random);
        for (Node child : candidates) {
            Optional<Node> parent = allNodes.parentOf(child);
            if (parent.isEmpty()) continue;

            Node host = parent.get();
            Allocation allocation = child.allocation().get();
            if (!isUpgradable(host, child, allocation)) continue;
            if (redeployWithUpgrade(host, allocation)) return 1.0; // At most one host is upgraded per run
        }
        return 1.0;
    }

    /** Returns whether the provisioner can upgrade the flavor of given host, and no upgrade is already requested */
    private boolean isUpgradable(Node host, Node child, Allocation allocation) {
        Predicate<NodeResources> withinRealLimits =
                resources -> nodeRepository().nodeResourceLimits()
                                             .isWithinRealLimits(resources, allocation.owner(), allocation.membership().cluster());
        return hostProvisioner.canUpgradeFlavor(host, child, withinRealLimits)
               && !host.status().wantToUpgradeFlavor(); // Flag already set means an upgrade is in progress
    }

    /**
     * Flags given host for flavor upgrade and redeploys the application owning the allocation. The flag is
     * removed again if the deployment was valid but the redeployment did not complete.
     *
     * @return true if the redeployment completed, false if the deployment was not valid or node allocation failed
     */
    private boolean redeployWithUpgrade(Node host, Allocation allocation) {
        boolean valid = false;
        boolean activated = false;
        try (MaintenanceDeployment deployment = new MaintenanceDeployment(allocation.owner(), deployer, metric, nodeRepository(), true)) {
            valid = deployment.isValid();
            if (!valid) return false;

            log.log(Level.INFO, () -> "Redeploying " + allocation.owner() + " to upgrade flavor (" +
                                      host.flavor().name() + ") of " + host);
            upgradeFlavor(host, true);
            deployment.activate();
            activated = true;
            return true;
        } catch (NodeAllocationException ignored) {
            // Expected when there is no capacity for the upgrade; the caller moves on to the next host
            return false;
        } finally {
            // Runs after the deployment has been closed: undo the flag unless the redeployment went through
            if (valid && !activated) {
                upgradeFlavor(host, false);
            }
        }
    }

    /** Sets (upgrade = true) or clears (upgrade = false) the flavor upgrade request on given host */
    private void upgradeFlavor(Node host, boolean upgrade) {
        var now = nodeRepository().clock().instant();
        nodeRepository().nodes().upgradeFlavor(host.hostname(), Agent.HostFlavorUpgrader, now, upgrade);
    }

}