From 4ff929491b2966e91cefdb473470a628488205c8 Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Tue, 11 Oct 2016 21:19:38 +0000
Subject: Differentiate between avx, avx2 and avx512f

---
 .../src/vespa/vespalib/hwaccelrated/CMakeLists.txt |   4 +-
 .../src/vespa/vespalib/hwaccelrated/avx.cpp        |  88 +--------------
 .../src/vespa/vespalib/hwaccelrated/avx.h          |  14 +--
 .../src/vespa/vespalib/hwaccelrated/avx2.cpp       |  86 +-------------
 .../src/vespa/vespalib/hwaccelrated/avx2.h         |  13 +--
 .../src/vespa/vespalib/hwaccelrated/avx512.cpp     |  23 ++++
 .../src/vespa/vespalib/hwaccelrated/avx512.h       |  22 ++++
 .../src/vespa/vespalib/hwaccelrated/avxprivate.hpp | 123 +++++++++++++++++++++
 .../vespa/vespalib/hwaccelrated/iaccelrated.cpp    |  10 +-
 .../src/vespa/vespalib/hwaccelrated/sse2.h         |   4 +-
 10 files changed, 194 insertions(+), 193 deletions(-)
 create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
 create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
 create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp

(limited to 'staging_vespalib')
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
index 2495c106277..1c80add3d8e 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
@@ -6,7 +6,9 @@ vespa_add_library(staging_vespalib_vespalib_hwaccelrated OBJECT
     sse2.cpp
     avx.cpp
     avx2.cpp
+    avx512.cpp
     DEPENDS
 )
 set_source_files_properties(avx.cpp PROPERTIES COMPILE_FLAGS -march=sandybridge)
-set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=broadwell)
+set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=haswell)
+set_source_files_properties(avx512.cpp PROPERTIES COMPILE_FLAGS -march=skylake) # TODO: switch to -march=skylake-avx512 once the assembler supports it; plain skylake does not enable AVX-512 instructions.
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
index d9e0cc41882..ec5064bf647 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
@@ -1,104 +1,22 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-// Copyright (C) 2003 Fast Search & Transfer ASA
-// Copyright (C) 2003 Overture Services Norway AS
 
 #include <vespa/vespalib/hwaccelrated/avx.h>
+#include <vespa/vespalib/hwaccelrated/avxprivate.hpp>
 
 namespace vespalib {
 
 namespace hwaccelrated {
 
-namespace {
-
-bool validAlignment32(const void * p) {
-    return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
-}
-
-template <typename T>
-class TypeSpecifics { };
-
-template <>
-struct TypeSpecifics<float> {
-    static constexpr const size_t V_SZ = 32;
-    typedef float V __attribute__ ((vector_size (V_SZ)));
-    static constexpr const size_t VectorsPerChunk = 4;
-    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-    static float sum(V v) {
-        return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
-    }
-};
-
-template <>
-struct TypeSpecifics<double> {
-    static constexpr const size_t V_SZ = 32;
-    typedef double V __attribute__ ((vector_size (V_SZ)));
-    static constexpr const size_t VectorsPerChunk = 4;
-    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
-    static float sum(V v) {
-        return v[0] + v[1] + v[2] + v[3];
-    }
-};
-
-}
-
-template <typename T, unsigned AlignA, unsigned AlignB>
-T
-AvxAccelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
-{
-    using TT = TypeSpecifics<T>;
-    constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
-    constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
-    typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
-    typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
-    typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
-    const A * a = reinterpret_cast<const A *>(af);
-    const B * b = reinterpret_cast<const B *>(bf);
-
-    const size_t numChunks(sz/ChunkSize);
-    for (size_t i(0); i < numChunks; i++) {
-        for (size_t j(0); j < VectorsPerChunk; j++) {
-            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
-        }
-    }
-    T sum(0);
-    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
-        sum += af[i] * bf[i];
-    }
-    for (size_t i(1); i < VectorsPerChunk; i++) {
-        partial[0] += partial[i];
-    }
-    return sum + TT::sum(partial[0]);
-}
-
-template <typename T>
-T
-AvxAccelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
-{
-    if (validAlignment32(af)) {
-        if (validAlignment32(bf)) {
-            return computeDotProduct<T, 32, 32>(af, bf, sz);
-        } else {
-            return computeDotProduct<T, 32, 1>(af, bf, sz);
-        }
-    } else {
-        if (validAlignment32(bf)) {
-            return computeDotProduct<T, 1, 32>(af, bf, sz);
-        } else {
-            return computeDotProduct<T, 1, 1>(af, bf, sz);
-        }
-    }
-}
-
 float
 AvxAccelrator::dotProduct(const float * af, const float * bf, size_t sz) const
 {
-    return dotProductSelectAlignment(af, bf, sz);
+    return avx::dotProductSelectAlignment<float, 32>(af, bf, sz);
 }
 
 double
 AvxAccelrator::dotProduct(const double * af, const double * bf, size_t sz) const
 {
-    return dotProductSelectAlignment(af, bf, sz);
+    return avx::dotProductSelectAlignment<double, 32>(af, bf, sz);
 }
 
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
index ed833713c2d..4b391c163ac 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
@@ -1,29 +1,21 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-// Copyright (C) 2003 Fast Search & Transfer ASA
-// Copyright (C) 2003 Overture Services Norway AS
 
 #pragma once
 
 #include <vespa/vespalib/hwaccelrated/sse2.h>
-#include <vespa/fastos/dynamiclibrary.h>
 
 namespace vespalib {
 
 namespace hwaccelrated {
 
 /**
- * Generic cpu agnostic implementation.
+ * Avx implementation using 256-bit vectors.
  */
 class AvxAccelrator : public Sse2Accelrator
 {
 public:
-    virtual float dotProduct(const float * a, const float * b, size_t sz) const;
-    virtual double dotProduct(const double * a, const double * b, size_t sz) const;
-private:
-    template <typename T>
-    VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
-    template <typename T, unsigned AlignA, unsigned AlignB>
-    VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
+    float dotProduct(const float * a, const float * b, size_t sz) const override;
+    double dotProduct(const double * a, const double * b, size_t sz) const override;
 };
 
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index 867b39ec326..f87738e3a6c 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -1,102 +1,22 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include <vespa/vespalib/hwaccelrated/avx2.h>
+#include <vespa/vespalib/hwaccelrated/avxprivate.hpp>
 
 namespace vespalib {
 
 namespace hwaccelrated {
 
-namespace {
-
-bool validAlignment32(const void * p) {
-    return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
-}
-
-template <typename T>
-class TypeSpecifics { };
-
-template <>
-struct TypeSpecifics<float> {
-    static constexpr const size_t V_SZ = 32;
-    typedef float V __attribute__ ((vector_size (V_SZ)));
-    static constexpr const size_t VectorsPerChunk = 4;
-    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-    static float sum(V v) {
-        return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
-    }
-};
-
-template <>
-struct TypeSpecifics<double> {
-    static constexpr const size_t V_SZ = 32;
-    typedef double V __attribute__ ((vector_size (V_SZ)));
-    static constexpr const size_t VectorsPerChunk = 4;
-    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
-    static float sum(V v) {
-        return v[0] + v[1] + v[2] + v[3];
-    }
-};
-
-}
-
-template <typename T, unsigned AlignA, unsigned AlignB>
-T
-Avx2Accelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
-{
-    using TT = TypeSpecifics<T>;
-    constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
-    constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
-    typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
-    typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
-    typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
-    const A * a = reinterpret_cast<const A *>(af);
-    const B * b = reinterpret_cast<const B *>(bf);
-
-    const size_t numChunks(sz/ChunkSize);
-    for (size_t i(0); i < numChunks; i++) {
-        for (size_t j(0); j < VectorsPerChunk; j++) {
-            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
-        }
-    }
-    T sum(0);
-    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
-        sum += af[i] * bf[i];
-    }
-    for (size_t i(1); i < VectorsPerChunk; i++) {
-        partial[0] += partial[i];
-    }
-    return sum + TT::sum(partial[0]);
-}
-
-template <typename T>
-T
-Avx2Accelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
-{
-    if (validAlignment32(af)) {
-        if (validAlignment32(bf)) {
-            return computeDotProduct<T, 32, 32>(af, bf, sz);
-        } else {
-            return computeDotProduct<T, 32, 1>(af, bf, sz);
-        }
-    } else {
-        if (validAlignment32(bf)) {
-            return computeDotProduct<T, 1, 32>(af, bf, sz);
-        } else {
-            return computeDotProduct<T, 1, 1>(af, bf, sz);
-        }
-    }
-}
-
 float
 Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
 {
-    return dotProductSelectAlignment(af, bf, sz);
+    return avx::dotProductSelectAlignment<float, 32>(af, bf, sz);
 }
 
 double
 Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
 {
-    return dotProductSelectAlignment(af, bf, sz);
+    return avx::dotProductSelectAlignment<double, 32>(af, bf, sz);
 }
 
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index e434bfc84e6..56d3a8ac65e 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -1,6 +1,4 @@
 // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-// Copyright (C) 2003 Fast Search & Transfer ASA
-// Copyright (C) 2003 Overture Services Norway AS
 
 #pragma once
 
@@ -11,18 +9,13 @@ namespace vespalib {
 namespace hwaccelrated {
 
 /**
- * Generic cpu agnostic implementation.
+ * Avx2 implementation.
  */
 class Avx2Accelrator : public AvxAccelrator
 {
 public:
-    virtual float dotProduct(const float * a, const float * b, size_t sz) const;
-    virtual double dotProduct(const double * a, const double * b, size_t sz) const;
-private:
-    template <typename T>
-    VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
-    template <typename T, unsigned AlignA, unsigned AlignB>
-    VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
+    float dotProduct(const float * a, const float * b, size_t sz) const override;
+    double dotProduct(const double * a, const double * b, size_t sz) const override;
 };
 
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
new file mode 100644
index 00000000000..9f7a6dcda3e
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/hwaccelrated/avx512.h>
+#include <vespa/vespalib/hwaccelrated/avxprivate.hpp>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+float
+Avx512Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
+{
+    return avx::dotProductSelectAlignment<float, 64>(af, bf, sz);
+}
+
+double
+Avx512Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
+{
+    return avx::dotProductSelectAlignment<double, 64>(af, bf, sz);
+}
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
new file mode 100644
index 00000000000..5d7028c30ba
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
@@ -0,0 +1,22 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/hwaccelrated/avx2.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+/**
+ * Avx-512 implementation.
+ */
+class Avx512Accelrator : public Avx2Accelrator
+{
+public:
+    float dotProduct(const float * a, const float * b, size_t sz) const override;
+    double dotProduct(const double * a, const double * b, size_t sz) const override;
+};
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
new file mode 100644
index 00000000000..5491fe1eef7
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
@@ -0,0 +1,123 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/fastos/dynamiclibrary.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+namespace avx {
+
+namespace {
+
+inline bool validAlignment(const void * p, const size_t align) {
+    return (reinterpret_cast<uint64_t>(p) & (align-1)) == 0;
+}
+
+template <typename T, typename V>
+T sumT(const V & v) {
+    T sum(0);
+    for (size_t i(0); i < (sizeof(V)/sizeof(T)); i++) {
+        sum += v[i];
+    }
+    return sum;
+}
+
+template <typename T, size_t VLEN>
+class TypeSpecifics { };
+
+template <>
+struct TypeSpecifics<float, 32u> {
+    static constexpr const size_t V_SZ = 32u;
+    typedef float V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static float sum(const V & v) { return sumT<float, V>(v); }
+};
+
+template <>
+struct TypeSpecifics<double, 32u> {
+    static constexpr const size_t V_SZ = 32u;
+    typedef double V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
+    static double sum(const V & v) { return sumT<double, V>(v); }
+};
+
+template <>
+struct TypeSpecifics<float, 64u> {
+    static constexpr const size_t V_SZ = 64u;
+    typedef float V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static float sum(const V & v) { return sumT<float, V>(v); }
+};
+
+template <>
+struct TypeSpecifics<double, 64u> {
+    static constexpr const size_t V_SZ = 64u;
+    typedef double V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static double sum(const V & v) { return sumT<double, V>(v); }
+};
+
+template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB>
+static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
+
+template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB>
+T computeDotProduct(const T * af, const T * bf, size_t sz)
+{
+    using TT = TypeSpecifics<T, VLEN>;
+    constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
+    constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
+    typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
+    typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
+    typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
+    const A * a = reinterpret_cast<const A *>(af);
+    const B * b = reinterpret_cast<const B *>(bf);
+
+    const size_t numChunks(sz/ChunkSize);
+    for (size_t i(0); i < numChunks; i++) {
+        for (size_t j(0); j < VectorsPerChunk; j++) {
+            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
+        }
+    }
+    T sum(0);
+    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
+        sum += af[i] * bf[i];
+    }
+    for (size_t i(1); i < VectorsPerChunk; i++) {
+        partial[0] += partial[i];
+    }
+    return sum + TT::sum(partial[0]);
+}
+
+}
+
+template <typename T, size_t VLEN>
+VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
+
+template <typename T, size_t VLEN>
+T dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
+{
+    if (validAlignment(af, VLEN)) {
+        if (validAlignment(bf, VLEN)) {
+            return computeDotProduct<T, VLEN, VLEN, VLEN>(af, bf, sz);
+        } else {
+            return computeDotProduct<T, VLEN, VLEN, 1>(af, bf, sz);
+        }
+    } else {
+        if (validAlignment(bf, VLEN)) {
+            return computeDotProduct<T, VLEN, 1, VLEN>(af, bf, sz);
+        } else {
+            return computeDotProduct<T, VLEN, 1, 1>(af, bf, sz);
+        }
+    }
+}
+
+}
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index 769d40fd0ec..aede024f5af 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -7,6 +7,7 @@
 #include <vespa/vespalib/hwaccelrated/sse2.h>
 #include <vespa/vespalib/hwaccelrated/avx.h>
 #include <vespa/vespalib/hwaccelrated/avx2.h>
+#include <vespa/vespalib/hwaccelrated/avx512.h>
 #include <assert.h>
 
 namespace vespalib {
@@ -41,6 +42,11 @@ public:
     virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx2Accelrator()); }
 };
 
+class Avx512Factory :public Factory{
+public:
+    virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx512Accelrator()); }
+};
+
 template<typename T>
 void verifyAccelrator(const IAccelrated & accel)
 {
@@ -95,7 +101,9 @@ Selector::Selector() :
     _factory(new GenericFactory())
 {
     __builtin_cpu_init ();
-    if (__builtin_cpu_supports("avx2")) {
+    if (__builtin_cpu_supports("avx512f")) {
+        _factory.reset(new Avx512Factory());
+    } else if (__builtin_cpu_supports("avx2")) {
         _factory.reset(new Avx2Factory());
     } else if (__builtin_cpu_supports("avx")) {
         _factory.reset(new AvxFactory());
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
index 86fbc41a486..a7c39581997 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
@@ -16,8 +16,8 @@ namespace hwaccelrated {
 class Sse2Accelrator : public GenericAccelrator
 {
 public:
-    virtual float dotProduct(const float * a, const float * b, size_t sz) const;
-    virtual double dotProduct(const double * a, const double * b, size_t sz) const;
+    float dotProduct(const float * a, const float * b, size_t sz) const override;
+    double dotProduct(const double * a, const double * b, size_t sz) const override;
 };
 
 }
-- 
cgit v1.2.3