aboutsummaryrefslogtreecommitdiffstats
path: root/storage/src/vespa/storage/storageutil/bloomfilter.h
blob: 6f7d8e0fdc85d7e662fbc513f0653fb19a0b066b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <stdio.h>
#include <inttypes.h>

/**
 * Bloom filter backed by an array of 32-bit words.
 *
 * A key is reduced to two seed hashes (one per multiplier constant below),
 * from which `_hashes` bit positions in the range [0, _size) are derived.
 * The key is reported as present only if every one of those bits is set.
 *
 * The constructor and destructor are defined outside this header.
 * NOTE(review): `_mine` presumably records whether the filter allocated
 * `_buf` itself (no caller-supplied buffer) and must release it - confirm
 * against the constructor/destructor definitions.
 */
class BloomFilter
{
public:
    /**
     * @param size   presumably the number of addressable bits (stored in
     *               _size) - confirm against the constructor definition
     * @param hashes presumably the number of bit positions probed per key
     *               (stored in _hashes) - confirm as above
     * @param buf    optional caller-supplied word buffer; defaults to NULL
     */
    BloomFilter(int size, int hashes, uint32_t *buf = NULL);
    ~BloomFilter();

    /**
     * Look up a key, optionally inserting it.
     *
     * Three key forms are accepted: an array of `len` 32-bit words, a byte
     * buffer of `len` chars, or a NUL-terminated string.
     *
     * @param add when true, every probed bit is set as a side effect
     * @return true if all probed bits were already set before the call
     */
    bool check(const uint32_t *data, int len, bool add);
    bool check(const char *data, int len, bool add);
    bool check(const char *data, bool add);

private:
    // Copying is disabled: both operations are private and no definition is
    // visible in this header.
    BloomFilter(const BloomFilter &);
    BloomFilter& operator=(const BloomFilter &);

    int _size;        // modulus applied to every hash / bit position
    int _hashes;      // number of bit positions probed per key
    uint32_t *_buf;   // bit array; bit p lives in word p >> 5, bit p & 31
    bool _mine;       // see NOTE(review) in the class comment

    // Multipliers for the two seed hashes computed from each key.
    static const uint32_t MULT1 = 1500450271;
    static const uint32_t MULT2 = 2860486313U;

    // Multiplicative hashes of the three key forms, reduced modulo `max`.
    uint32_t hash(const uint32_t *data, int len, uint32_t multiplier, uint32_t max);
    uint32_t hash(const char *data, int len, uint32_t multiplier, uint32_t max);
    uint32_t hash(const char *data, uint32_t multiplier, uint32_t max);

    // Probes (and optionally sets) the bit positions derived from two seeds.
    bool check(uint32_t hash1, uint32_t hash2, bool add);
    // Tests one bit; sets it first-time when `add` is true.
    bool isSet(uint32_t pos, bool add);
};

/**
 * Multiplicative hash over an array of 32-bit words.
 *
 * Starting from 1, each word is folded in as
 * `val = (multiplier * val + word) % max`; the multiply/add is plain
 * unsigned 32-bit arithmetic and therefore wraps modulo 2^32 before the
 * reduction.
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data       words to hash; must hold at least `len` elements
 * @param len        number of words; values <= 0 hash nothing and return 1
 * @param multiplier per-step multiplier
 * @param max        modulus; must be non-zero (division by zero otherwise)
 * @return hash value in [0, max) when len > 0, otherwise 1
 */
inline uint32_t
BloomFilter::hash(const uint32_t *data, int len, uint32_t multiplier, uint32_t max)
{
    uint32_t val = 1;
    for (int i = 0; i < len; i++) {
        val = (multiplier * val + data[i]) % max;
    }
    return val;
}

/**
 * Multiplicative hash over a byte buffer of explicit length.
 *
 * Starting from 1, each char is folded in as
 * `val = (multiplier * val + c) % max`, using wrapping unsigned 32-bit
 * arithmetic before the reduction.
 *
 * NOTE: `data[i]` is a plain `char`. Where `char` is signed, bytes >= 0x80
 * are sign-extended before being added, so the hash of non-ASCII data
 * depends on the platform's char signedness. This is kept as-is because
 * changing it would change the hash values produced.
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data       bytes to hash; must hold at least `len` elements
 * @param len        number of bytes; values <= 0 hash nothing and return 1
 * @param multiplier per-step multiplier
 * @param max        modulus; must be non-zero (division by zero otherwise)
 * @return hash value in [0, max) when len > 0, otherwise 1
 */
inline uint32_t
BloomFilter::hash(const char *data, int len, uint32_t multiplier, uint32_t max)
{
    uint32_t val = 1;
    for (int i = 0; i < len; i++) {
        val = (multiplier * val + data[i]) % max;
    }
    return val;
}

/**
 * Multiplicative hash over a NUL-terminated string.
 *
 * Starting from 1, each char up to (not including) the terminating NUL is
 * folded in as `val = (multiplier * val + c) % max`, using wrapping
 * unsigned 32-bit arithmetic before the reduction.
 *
 * NOTE: `data[i]` is a plain `char`. Where `char` is signed, bytes >= 0x80
 * are sign-extended before being added, so the hash of non-ASCII data
 * depends on the platform's char signedness. This is kept as-is because
 * changing it would change the hash values produced.
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data       NUL-terminated string; must not be a null pointer
 * @param multiplier per-step multiplier
 * @param max        modulus; must be non-zero (division by zero otherwise)
 * @return hash value in [0, max) for a non-empty string, otherwise 1
 */
inline uint32_t
BloomFilter::hash(const char *data, uint32_t multiplier, uint32_t max)
{
    uint32_t val = 1;
    for (int i = 0; data[i]; i++) {
        val = (multiplier * val + data[i]) % max;
    }
    return val;
}


/**
 * Look up (and optionally insert) a key given as an array of 32-bit words.
 *
 * Two seed hashes are computed over the key, one per multiplier constant,
 * both reduced modulo `_size`, and handed to the seed-based check().
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data key words; must hold at least `len` elements
 * @param len  number of words in the key
 * @param add  when true, every probed bit is set as a side effect
 * @return true if all probed bits were already set before the call
 */
inline bool
BloomFilter::check(const uint32_t *data, int len, bool add)
{
    uint32_t hash1 = hash(data, len, MULT1, _size);
    uint32_t hash2 = hash(data, len, MULT2, _size);
    return check(hash1, hash2, add);
}

/**
 * Look up (and optionally insert) a key given as a byte buffer of explicit
 * length.
 *
 * Two seed hashes are computed over the key, one per multiplier constant,
 * both reduced modulo `_size`, and handed to the seed-based check().
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data key bytes; must hold at least `len` elements
 * @param len  number of bytes in the key
 * @param add  when true, every probed bit is set as a side effect
 * @return true if all probed bits were already set before the call
 */
inline bool
BloomFilter::check(const char *data, int len, bool add)
{
    uint32_t hash1 = hash(data, len, MULT1, _size);
    uint32_t hash2 = hash(data, len, MULT2, _size);
    return check(hash1, hash2, add);
}
/**
 * Look up (and optionally insert) a key given as a NUL-terminated string.
 *
 * Two seed hashes are computed over the key, one per multiplier constant,
 * both reduced modulo `_size`, and handed to the seed-based check().
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param data NUL-terminated key; must not be a null pointer
 * @param add  when true, every probed bit is set as a side effect
 * @return true if all probed bits were already set before the call
 */
inline bool
BloomFilter::check(const char *data, bool add)
{
    uint32_t hash1 = hash(data, MULT1, _size);
    uint32_t hash2 = hash(data, MULT2, _size);
    return check(hash1, hash2, add);
}

bool
BloomFilter::check(uint32_t hash1, uint32_t hash2, bool add)
{
    bool found = true;
    for (int i = 0; i < _hashes; i++) {
        hash1 = (hash1 + hash2) % _size;
        hash2 = (hash2 + i) % _size;
        if (!isSet(hash1, add)) {
            if (!add) {
                return false;
            }
            found = false;
        }
    }
    return found;
}

/**
 * Test a single bit of the filter, optionally setting it.
 *
 * Bit `pos` lives in word `pos >> 5` of `_buf`, at bit index `pos & 31`.
 *
 * The mask is built from an unsigned 32-bit one: the previous `1 << 31`
 * shifted a signed int into its sign bit whenever `pos & 31 == 31`.
 *
 * Declared inline because the definition lives in a header; without it,
 * including the header from more than one translation unit yields
 * multiple-definition link errors.
 *
 * @param pos bit position; `_buf` must contain word `pos >> 5`
 * @param add when true and the bit is clear, set it before returning
 * @return true if the bit was already set on entry, false otherwise
 */
inline bool
BloomFilter::isSet(uint32_t pos, bool add)
{
    uint32_t &word = _buf[pos >> 5];
    const uint32_t mask = static_cast<uint32_t>(1) << (pos & 31);
    if ((word & mask) != 0) {
        return true;
    }
    if (add) {
        word |= mask;
    }
    return false;
}