summaryrefslogtreecommitdiffstats
path: root/container-search/src/main/java/com/yahoo/search/dispatch/TopKEstimator.java
blob: d48e337b0c1dccc16318d83f582ed1874dc3173a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.dispatch;

import org.apache.commons.math3.distribution.TDistribution;

/**
 * Use StudentT distribution and estimate how many hits you need from each partition
 * to to get the globally top-k documents with the desired probability
 * @author baldersheim
 */
public class TopKEstimator {
    private final TDistribution studentT;
    private final double defaultP;
    private final boolean estimate;
    private final double skewFactor;

    private static boolean needEstimate(double p) {
        return (0.0 < p) && (p < 1.0);
    }
    TopKEstimator(double freedom, double defaultProbability) {
        this(freedom, defaultProbability, 0.0);
    }
    public TopKEstimator(double freedom, double defaultProbability, double skewFactor) {
        this.studentT = new TDistribution(null, freedom);
        defaultP = defaultProbability;
        estimate = needEstimate(defaultP);
        this.skewFactor = skewFactor;
    }
    double estimateExactK(double k, double n, double p) {
        double p_max = (1 + skewFactor)/n;
        n = Math.max(1, 1/p_max);
        double variance = k * 1/n * (1 - 1/n);
        double p_inverse = 1 - (1 - p)/n;
        return k/n + studentT.inverseCumulativeProbability(p_inverse) * Math.sqrt(variance);
    }
    double estimateExactK(double k, double n) {
        return estimateExactK(k, n, defaultP);
    }
    public int estimateK(int k, int n) {
        return (estimate && n > 1)
                ? Math.min(k, (int)Math.ceil(estimateExactK(k, n, defaultP)))
                : k;
    }
    public int estimateK(int k, int n, double p) {
        return (needEstimate(p) && (n > 1))
                ? Math.min(k, (int)Math.ceil(estimateExactK(k, n, p)))
                : k;
    }
}