blob: 8c891339e2911838d296e63720e2f9a248a2df48 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.integration.configserver.NodeRepository;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import java.time.Duration;
import java.util.logging.Level;
/**
 * Maintainer which computes, for every production deployment of every application instance
 * - the fraction of the instance's total production query traffic the deployment currently receives
 * - the largest fraction it can possibly receive, assuming traffic is evenly distributed over regions
 *   and max one region is down at any time. (We can let deployment.xml override these assumptions later).
 *
 * These two numbers are sent to a config server of each region where they are ultimately
 * consumed by autoscaling.
 *
 * The traffic metrics read here are collected by DeploymentMetricsMaintainer.
 *
 * @author bratseth
 */
public class TrafficShareUpdater extends ControllerMaintainer {

    private final ApplicationController applications;
    private final NodeRepository nodeRepository;

    public TrafficShareUpdater(Controller controller, Duration duration) {
        super(controller, duration);
        this.applications = controller.applications();
        this.nodeRepository = controller.serviceRegistry().configServer().nodeRepository();
    }

    /**
     * Walks all applications, their instances and their production deployments, and updates
     * the traffic shares of each such deployment in turn.
     *
     * @return the value of asSuccessFactor(attempts, failures) for this run,
     *         or 1.0 if the run is cut short because this maintainer is shutting down
     */
    @Override
    protected double maintain() {
        int attempted = 0;
        int failed = 0;
        Exception mostRecentFailure = null;

        for (var application : applications.asList()) {
            for (var instance : application.instances().values()) {
                for (var deployment : instance.deployments().values()) {
                    if (isProduction(deployment)) {
                        if (shuttingDown()) return 1.0;

                        attempted++;
                        try {
                            updateTrafficFraction(instance, deployment);
                        }
                        catch (Exception e) {
                            // Some failures due to locked applications are expected and benign
                            failed++;
                            mostRecentFailure = e;
                        }
                    }
                }
            }
        }

        double successFactor = asSuccessFactor(attempted, failed);
        // Only warn when the success factor is exactly zero; the last exception seen is attached
        if (successFactor == 0) {
            log.log(Level.WARNING, "Could not update traffic share on any applications", mostRecentFailure);
        }
        return successFactor;
    }

    /** Returns whether the given deployment is in a zone whose environment is production. */
    private static boolean isProduction(Deployment deployment) {
        return deployment.zone().environment().isProduction();
    }

    /**
     * Computes the current and max read share of the given deployment, relative to all
     * production deployments of the given instance, and patches them into the node repository
     * for the deployment's zone.
     */
    private void updateTrafficFraction(Instance instance, Deployment deployment) {
        double zoneQps = deployment.metrics().queriesPerSecond();
        double instanceQps = instance.deployments().values().stream()
                                     .filter(TrafficShareUpdater::isProduction)
                                     .mapToDouble(d -> d.metrics().queriesPerSecond())
                                     .sum();
        long productionRegions = instance.deployments().values().stream()
                                         .filter(TrafficShareUpdater::isProduction)
                                         .count();

        // With no traffic at all the share is defined as 0 rather than dividing by zero
        double currentReadShare;
        if (instanceQps == 0) {
            currentReadShare = 0;
        }
        else {
            currentReadShare = zoneQps / instanceQps;
        }

        // Even distribution over the regions remaining when one region is down
        double evenReadShare;
        if (productionRegions < 2) {
            evenReadShare = 1.0;
        }
        else {
            evenReadShare = 1.0 / ( productionRegions - 1.0);
        }

        // The current share can exceed the even share because the assumption of equal traffic
        // distribution can be incorrect; in that case the current share is used as the max
        double maxReadShare = currentReadShare > evenReadShare ? currentReadShare : evenReadShare;

        nodeRepository.patchApplication(deployment.zone(), instance.id(), currentReadShare, maxReadShare);
    }

}
|