blob: 47ffe4432d1665c046be54c7c6e53bc4e3282273 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.searchlib.aggregation.hll;
import net.jpountz.xxhash.XXHash32;
import net.jpountz.xxhash.XXHashFactory;
import org.junit.Test;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Random;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class HyperLogLogEstimatorTest {
private XXHash32 hashGenerator = XXHashFactory.safeInstance().hash32();
@Test
public void requireThatEstimateInRangeForSmallValueSetUsingNormalSketch() {
testEstimateUsingNormalSketch(15, 1337);
}
@Test
public void requireThatEstimateInRangeForLargeValueSetUsingNormalSketch() {
testEstimateUsingNormalSketch(1_000_000, 1337);
}
@Test
public void requireThatEstimateIsReasonableForFullNormalSketch() {
HyperLogLogEstimator estimator = new HyperLogLogEstimator(10);
NormalSketch sketch = new NormalSketch(10);
// Fill sketch with 23 - highest possible zero prefix for precision 10.
Arrays.fill(sketch.data(), (byte) 23);
long estimate = estimator.estimateCount(sketch);
assertTrue(estimate > 6_000_000_000l);
}
@Test
public void requireThatEstimateIsCorrectForSparseSketch() {
SparseSketch sketch = new SparseSketch();
HyperLogLogEstimator estimator = new HyperLogLogEstimator(10);
long estimate = estimator.estimateCount(sketch);
assertEquals(0, estimate);
// Check that estimate is correct for every possible sketch size up to threshold
for (int i = 1; i <= HyperLogLog.SPARSE_SKETCH_CONVERSION_THRESHOLD; i++) {
sketch.aggregate(i);
estimate = estimator.estimateCount(sketch);
assertEquals(i, estimate);
}
}
private void testEstimateUsingNormalSketch(int nValues, int seed) {
for (int precision = 4; precision <= 16; precision++) {
HyperLogLogEstimator estimator = new HyperLogLogEstimator(precision);
long uniqueCount = new Random(seed)
.ints(nValues)
.map(this::makeHash)
.distinct()
.count();
Iterable<Integer> hashValues = () ->
new Random(seed)
.ints(nValues)
.map(this::makeHash)
.iterator();
NormalSketch sketch = new NormalSketch(precision);
sketch.aggregate(hashValues);
long estimate = estimator.estimateCount(sketch);
double standardError = standardErrorForPrecision(precision);
assertTrue(estimate > uniqueCount * (1 - standardError) * 0.9);
assertTrue(estimate < uniqueCount * (1 + standardError) * 1.1);
}
}
private static double standardErrorForPrecision(int precision) {
return 1.04 / Math.sqrt(1 << precision); // HLL standard error
}
private int makeHash(int value) {
final int seed = 42424242;
byte[] bytes = ByteBuffer.allocate(4).putInt(value).array();
return hashGenerator.hash(bytes, 0, 4, seed);
}
}
|