aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/test/java/com/yahoo/searchlib/aggregation/hll/HyperLogLogEstimatorTest.java
blob: 47ffe4432d1665c046be54c7c6e53bc4e3282273 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.searchlib.aggregation.hll;

import net.jpountz.xxhash.XXHash32;
import net.jpountz.xxhash.XXHashFactory;
import org.junit.Test;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Random;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class HyperLogLogEstimatorTest {

    private XXHash32 hashGenerator = XXHashFactory.safeInstance().hash32();

    @Test
    public void requireThatEstimateInRangeForSmallValueSetUsingNormalSketch() {
        testEstimateUsingNormalSketch(15, 1337);
    }

    @Test
    public void requireThatEstimateInRangeForLargeValueSetUsingNormalSketch() {
        testEstimateUsingNormalSketch(1_000_000, 1337);
    }

    @Test
    public void requireThatEstimateIsReasonableForFullNormalSketch() {
        HyperLogLogEstimator estimator = new HyperLogLogEstimator(10);
        NormalSketch sketch = new NormalSketch(10);
        // Fill sketch with 23 - highest possible zero prefix for precision 10.
        Arrays.fill(sketch.data(), (byte) 23);
        long estimate = estimator.estimateCount(sketch);
        assertTrue(estimate > 6_000_000_000l);
    }

    @Test
    public void requireThatEstimateIsCorrectForSparseSketch() {
        SparseSketch sketch = new SparseSketch();
        HyperLogLogEstimator estimator = new HyperLogLogEstimator(10);
        long estimate = estimator.estimateCount(sketch);
        assertEquals(0, estimate);

        // Check that estimate is correct for every possible sketch size up to threshold
        for (int i = 1; i <= HyperLogLog.SPARSE_SKETCH_CONVERSION_THRESHOLD; i++) {
            sketch.aggregate(i);
            estimate = estimator.estimateCount(sketch);
            assertEquals(i, estimate);
        }
    }

    private void testEstimateUsingNormalSketch(int nValues, int seed) {
        for (int precision = 4; precision <= 16; precision++) {
            HyperLogLogEstimator estimator = new HyperLogLogEstimator(precision);

            long uniqueCount = new Random(seed)
                    .ints(nValues)
                    .map(this::makeHash)
                    .distinct()
                    .count();

            Iterable<Integer> hashValues = () ->
                    new Random(seed)
                        .ints(nValues)
                        .map(this::makeHash)
                        .iterator();

            NormalSketch sketch = new NormalSketch(precision);
            sketch.aggregate(hashValues);
            long estimate = estimator.estimateCount(sketch);
            double standardError = standardErrorForPrecision(precision);
            assertTrue(estimate > uniqueCount * (1 - standardError) * 0.9);
            assertTrue(estimate < uniqueCount * (1 + standardError) * 1.1);
        }
    }

    private static double standardErrorForPrecision(int precision) {
        return 1.04 / Math.sqrt(1 << precision); // HLL standard error
    }


    private int makeHash(int value) {
        final int seed = 42424242;
        byte[] bytes = ByteBuffer.allocate(4).putInt(value).array();
        return hashGenerator.hash(bytes, 0, 4, seed);
    }
}