aboutsummaryrefslogtreecommitdiffstats
path: root/document/src/vespa/document/base/globalid.h
blob: c3757de4afd20fd83b07f4c9ab511a291ad932ec (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * \class document::GlobalId
 * \ingroup base
 *
 * \brief Representation of a global ID.
 *
 * The global ID, is a hash of the document ID, used where we need to
 * distinguish between documents, but where storing a variable length string is
 * not practical. VESPA search currently assumes that there will be no hash
 * collisions, and if one should occur, the latest document will be the only
 * one present in the indexes. VESPA document storage handles global ID
 * collisions, but optimize code for the instances where it doesn't happen.
 *
 * It's a 96 bit MD5 checksum, so the chances of a collision is very very small.
 *
 * This class should not inherit from anything, as clients may use it's memory
 * representation directly to save it.
 *
 * The interface for creating or modifying these objects is not user friendly.
 * You should create it by calling DocumentId::getGlobalId() and you should not
 * need to modify it, unless by standard copy constructor or assignment
 * operator.
 */
#pragma once

#include <cstdint>
#include <vespa/vespalib/stllike/string.h>
#include <vespa/document/bucket/bucketid.h>

namespace document {

class GlobalId {
public:
    /**
     * The number of bytes used to represent a global id.
     */
    static const unsigned int LENGTH = 12;

    /** Hash function that can be used to put global ids in hash set/maps. */
    struct hash {
        size_t operator () (const GlobalId & g) const {
            return g._gid._bucketId._gid;
        }
    };

private:
    struct BucketIdS {
        uint32_t _location;
        uint64_t _gid;
    } __attribute__((packed));
    union {
        unsigned char _buffer[LENGTH];
        BucketIdS     _bucketId;
        uint32_t      _nums[LENGTH/sizeof(uint32_t)];
    } _gid;

public:
    /**
     * Defines a comparator object that can be used for sorting global ids based on bucket order. An std::map
     * using this comparator to order gid keys can use
     * map.lower_bound(GlobalId::calculateFirstInBucket(bucket)) and
     * map.upper_bound(GlobalId::calculateLastInBucket(bucket)) to traverse only those gids that belong to the
     * given bucket.
     */
    struct BucketOrderCmp {
        bool operator()(const GlobalId &lhs, const GlobalId &rhs) const {
            const uint32_t * __restrict__ a = lhs._gid._nums;
            const uint32_t * __restrict__ b = rhs._gid._nums;
            if (a[0] != b[0]) {
                return bitswap(a[0]) < bitswap(b[0]);
            }
            if (a[2] != b[2]) {
                return bitswap(a[2]) < bitswap(b[2]);
            }
            return __builtin_bswap32(a[1]) < __builtin_bswap32(b[1]);
        }
        static uint32_t bitswap(uint32_t value) {
            value = ((value & 0x55555555) << 1) | ((value & 0xaaaaaaaa) >> 1);
            value = ((value & 0x33333333) << 2) | ((value & 0xcccccccc) >> 2);
            value = ((value & 0x0f0f0f0f) << 4) | ((value & 0xf0f0f0f0) >> 4);
            return __builtin_bswap32(value);
        }
        // Return most significant 32 bits of gid key
        static uint32_t gid_key32(const GlobalId &gid) {
            return bitswap(gid._gid._nums[0]);
        }

        //These 2 compare methods are exposed only for testing
        static int compareRaw(unsigned char a, unsigned char b) {
            return a - b;
        }
        static int compare(unsigned char a, unsigned char b) {
            return compareRaw(document::reverseBitTable[a], document::reverseBitTable[b]);
        }
    };

    /**
     * Constructs a new global id with all 0 bits.
     */
    GlobalId() noexcept { set("\0\0\0\0\0\0\0\0\0\0\0\0"); }



    /**
     * Constructs a new global id with initial content. This copies the first LENGTH bytes from the given
     * address into the internal buffer.
     *
     * @param gid The address to the data to copy.
     */
    explicit GlobalId(const void *gid) noexcept { set(gid); }

    GlobalId(const GlobalId &rhs) noexcept = default;

    /**
     * Implements the assignment operator.
     *
     * @param other The global id whose value to copy to this.
     * @return This.
     */
    GlobalId& operator=(const GlobalId& other) noexcept { memcpy(_gid._buffer, other._gid._buffer, sizeof(_gid._buffer)); return *this; }

    /**
     * Implements the equality operator.
     *
     * @param other The global id to compare to.
     * @return True if this equals the other, false otherwise.
     */
    bool operator==(const GlobalId& other) const noexcept { return (memcmp(_gid._buffer, other._gid._buffer, sizeof(_gid._buffer)) == 0); }

    /**
     * Implements the inequality operator.
     *
     * @param other The global id to compare to.
     * @return True if this does NOT equal the other, false otherwise.
     */
    bool operator!=(const GlobalId& other) const noexcept { return (memcmp(_gid._buffer, other._gid._buffer, sizeof(_gid._buffer)) != 0); }

    /**
     * Implements the less-than operator. If you intend to map global ids in such a way that they can
     * efficiently be looked up based on corresponding bucket ids, you should use the {@link BucketOrderCmp}
     * for ordering your collection.
     *
     * @param other The global id to compare to.
     * @return True if comparing the bits of this to the other yields a negative result.
     */
    bool operator<(const GlobalId& other) const noexcept { return (memcmp(_gid._buffer, other._gid._buffer, sizeof(_gid._buffer)) < 0); }

    /**
     * Copies the first LENGTH bytes from the given address into the internal byte array.
     *
     * @param id The bytes to set.
     */
    void set(const void *id) noexcept { memcpy(_gid._buffer, id, sizeof(_gid._buffer)); }

    /**
     * Returns the raw byte array that constitutes this global id.
     */
    const unsigned char *get() const { return _gid._buffer; }

    /**
     * If a GID has been generated from a document ID with a location (n=, g=),
     * the returned value will be deterministic based on the location, and two
     * different document IDs with the same location will return the same value.
     * If not, the value will not have any usable semantics but will still be
     * deterministic in the sense that two identical document IDs will generate
     * the same returned value.
     */
    uint32_t getLocationSpecificBits() const noexcept {
        return _gid._bucketId._location;
    }

    /**
     * Returns a string representation of this global id.
     */
    vespalib::string toString() const;

    /**
     * Parse the source string to generate a global id object.  The source is expected to contain exactly what
     * toString() creates from a global identifier.
     *
     * @param str The string to parse.
     * @throws vespalib::IllegalArgumentException Thrown if input is not in GID format.
     */
    static GlobalId parse(vespalib::stringref str);

    /**
     * Returns the most specified bucket id to which this global id belongs.
     *
     * This function should not be used as it puts too strict limitations on
     * what the bucket identifier can be.. Simplifying implementation to not
     * use bucket id internals. Function should be removed before we can change
     * bucket id.
     *
     * @return The bucket id.
     */
    BucketId convertToBucketId() const {
        uint64_t location(_gid._bucketId._location);
        uint64_t gid(_gid._bucketId._gid);
        return BucketId(58, (gid & 0xFFFFFFFF00000000ull)
                            | (location & 0xFFFFFFFF));
    }

    /**
     * Returns whether or not this global id is contained in the given bucket.
     *
     * @param bucket The bucket to check.
     * @return True, if this gid is contained in the bucket.
     */
    bool containedInBucket(const BucketId &bucket) const;

    /**
     * Given a list of global identifiers sorted in bucket order (see {@link BucketOrderCmp}), this function
     * returns the global id that is the smallest that can exist in the given bucket.
     *
     * @param bucket The bucket id whose lowest gid to find.
     * @return The first global id of the bucket.
     */
    static GlobalId calculateFirstInBucket(const BucketId &bucket);

    /**
     * Given a list of global identifiers sorted in bucket order (see {@link BucketOrderCmp}), this function
     * returns the global id that is the largest that can exist in the given bucket.
     *
     * @param bucket The bucket id whose largest gid to find.
     * @return The last global id of the bucket.
     */
    static GlobalId calculateLastInBucket(const BucketId &bucket);
};

vespalib::asciistream & operator << (vespalib::asciistream & os, const GlobalId & gid);
std::ostream& operator<<(std::ostream& out, const GlobalId& gid);

} // document