aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
blob: 602faeb4dc0ee721dc860c53bf714a5617bb01a8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include "private_helpers.hpp"

namespace vespalib::hwaccelrated::avx {

namespace {

inline bool validAlignment(const void * p, const size_t align) noexcept {
    return (reinterpret_cast<uint64_t>(p) & (align-1)) == 0;
}

template <typename T, typename V>
T sumT(const V & v) noexcept {
    T sum(0);
    for (size_t i(0); i < (sizeof(V)/sizeof(T)); i++) {
        sum += v[i];
    }
    return sum;
}

template <typename T, size_t C>
T sumR(const T * v) noexcept {
    if (C == 1) {
        return v[0];
    } else if (C == 2) {
        return v[0] + v[1];
    } else {
        return sumR<T, C/2>(v) + sumR<T, C/2>(v+C/2);
    }
}

template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB, size_t VectorsPerChunk>
static T computeDotProduct(const T * af, const T * bf, size_t sz) noexcept __attribute__((noinline));

template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB, size_t VectorsPerChunk>
T computeDotProduct(const T * af, const T * bf, size_t sz) noexcept
{
    constexpr const size_t ChunkSize = VLEN*VectorsPerChunk/sizeof(T);
    typedef T V __attribute__ ((vector_size (VLEN)));
    typedef T A __attribute__ ((vector_size (VLEN), aligned(AlignA)));
    typedef T B __attribute__ ((vector_size (VLEN), aligned(AlignB)));
    V partial[VectorsPerChunk];
    memset(partial, 0, sizeof(partial));
    const A * a = reinterpret_cast<const A *>(af);
    const B * b = reinterpret_cast<const B *>(bf);

    const size_t numChunks(sz/ChunkSize);
    for (size_t i(0); i < numChunks; i++) {
        for (size_t j(0); j < VectorsPerChunk; j++) {
            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
        }
    }
    T sum(0);
    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
        sum += af[i] * bf[i];
    }
    partial[0] = sumR<V, VectorsPerChunk>(partial);

    return sum + sumT<T, V>(partial[0]);
}

}

template <typename T, size_t VLEN, size_t VectorsPerChunk=4>
VESPA_DLL_LOCAL T dotProductSelectAlignment(const T * af, const T * bf, size_t sz) noexcept;

template <typename T, size_t VLEN, size_t VectorsPerChunk>
T dotProductSelectAlignment(const T * af, const T * bf, size_t sz) noexcept
{
    if (validAlignment(af, VLEN)) {
        if (validAlignment(bf, VLEN)) {
            return computeDotProduct<T, VLEN, VLEN, VLEN, VectorsPerChunk>(af, bf, sz);
        } else {
            return computeDotProduct<T, VLEN, VLEN, 1, VectorsPerChunk>(af, bf, sz);
        }
    } else {
        if (validAlignment(bf, VLEN)) {
            return computeDotProduct<T, VLEN, 1, VLEN, VectorsPerChunk>(af, bf, sz);
        } else {
            return computeDotProduct<T, VLEN, 1, 1, VectorsPerChunk>(af, bf, sz);
        }
    }
}

template <typename T, unsigned VLEN, unsigned AlignA, unsigned AlignB>
double
euclideanDistanceT(const T * af, const T * bf, size_t sz) noexcept
{
    constexpr unsigned VectorsPerChunk = 4;
    constexpr unsigned ChunkSize = VLEN*VectorsPerChunk/sizeof(T);
    typedef T V __attribute__ ((vector_size (VLEN)));
    typedef T A __attribute__ ((vector_size (VLEN), aligned(AlignA)));
    typedef T B __attribute__ ((vector_size (VLEN), aligned(AlignB)));
    V partial[VectorsPerChunk];
    memset(partial, 0, sizeof(partial));
    const A * a = reinterpret_cast<const A *>(af);
    const B * b = reinterpret_cast<const B *>(bf);

    const size_t numChunks(sz/ChunkSize);
    for (size_t i(0); i < numChunks; i++) {
        for (size_t j(0); j < VectorsPerChunk; j++) {
            partial[j] += (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]) * (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]);
        }
    }
    double sum(0);
    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
        sum += (af[i] - bf[i]) * (af[i] - bf[i]);
    }
    partial[0] = sumR<V, VectorsPerChunk>(partial);

    return sum + sumT<T, V>(partial[0]);
}

template <typename T, unsigned VLEN>
double euclideanDistanceSelectAlignment(const T * af, const T * bf, size_t sz) noexcept
{
    constexpr unsigned ALIGN = 32;
    if (validAlignment(af, ALIGN)) {
        if (validAlignment(bf, ALIGN)) {
            return euclideanDistanceT<T, VLEN, ALIGN, ALIGN>(af, bf, sz);
        } else {
            return euclideanDistanceT<T, VLEN, ALIGN, 1>(af, bf, sz);
        }
    } else {
        if (validAlignment(bf, ALIGN)) {
            return euclideanDistanceT<T, VLEN, 1, ALIGN>(af, bf, sz);
        } else {
            return euclideanDistanceT<T, VLEN, 1, 1>(af, bf, sz);
        }
    }
}

}