use openblas for matrix multiplication

author: Håvard Pettersen <havardpe@oath.com> 2020-01-24 13:30:31 +0000
committer: Håvard Pettersen <havardpe@oath.com> 2020-01-29 12:01:46 +0000
commit: 7ee101e3880de1366ea7441481c1eb37a5fa24c8 (patch)
tree: eb64d69311f21314fba2143547cdec51dbf9aa94 /eval
parent: ff2010ffd3c15c5518dbc9276cbc7b24cde77879 (diff)
3 files changed, 52 insertions, 33 deletions
diff --git a/eval/src/vespa/eval/CMakeLists.txt b/eval/src/vespa/eval/CMakeLists.txt
index 90972de7c80..c28643e605e 100644
--- a/eval/src/vespa/eval/CMakeLists.txt
+++ b/eval/src/vespa/eval/CMakeLists.txt
@@ -14,3 +14,6 @@ vespa_add_library(vespaeval
     DEPENDS
     ${VESPA_LLVM_LIB}
 )
+
+set(BLA_VENDOR OpenBLAS)
+vespa_add_target_package_dependency(vespaeval BLAS)
diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
index 7ba186b622a..ce0d4230c23 100644
--- a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
+++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
@@ -10,6 +10,8 @@
 #include <vespa/eval/tensor/tensor.h>
 #include <assert.h>
 
+#include <openblas/cblas.h>
+
 namespace vespalib::tensor {
 
 using eval::ValueType;
@@ -21,29 +23,8 @@ using namespace eval::operation;
 
 namespace {
 
-template <typename LCT, typename RCT>
-struct HWSupport {
-    static double call(hwaccelrated::IAccelrated *, const LCT *lhs, const RCT *rhs, size_t len) {
-        double result = 0.0;
-        for (size_t i = 0; i < len; ++i) {
-            result += (lhs[i] * rhs[i]);
-        }
-        return result;
-    }
-};
-template <> struct HWSupport<float, float> {
-    static double call(hwaccelrated::IAccelrated *hw, const float *lhs, const float *rhs, size_t len) {
-        return hw->dotProduct(lhs, rhs, len);
-    }
-};
-template <> struct HWSupport<double, double> {
-    static double call(hwaccelrated::IAccelrated *hw, const double *lhs, const double *rhs, size_t len) {
-        return hw->dotProduct(lhs, rhs, len);
-    }
-};
-
 template <typename LCT, typename RCT, bool lhs_common_inner, bool rhs_common_inner>
-double sparse_dot_product(const LCT *lhs, const RCT *rhs, size_t lhs_size, size_t common_size, size_t rhs_size) {
+double my_dot_product(const LCT *lhs, const RCT *rhs, size_t lhs_size, size_t common_size, size_t rhs_size) {
     double result = 0.0;
     for (size_t i = 0; i < common_size; ++i) {
         result += ((*lhs) * (*rhs));
@@ -65,11 +46,7 @@ void my_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) {
     for (size_t i = 0; i < self.lhs_size; ++i) {
         const RCT *rhs = rhs_cells.cbegin();
         for (size_t j = 0; j < self.rhs_size; ++j) {
-            if (lhs_common_inner && rhs_common_inner) {
-                *dst++ = HWSupport<LCT,RCT>::call(self.hw.get(), lhs, rhs, self.common_size);
-            } else {
-                *dst++ = sparse_dot_product<LCT,RCT,lhs_common_inner,rhs_common_inner>(lhs, rhs, self.lhs_size, self.common_size, self.rhs_size);
-            }
+            *dst++ = my_dot_product<LCT,RCT,lhs_common_inner,rhs_common_inner>(lhs, rhs, self.lhs_size, self.common_size, self.rhs_size);
             rhs += (rhs_common_inner ? self.common_size : 1);
         }
         lhs += (lhs_common_inner ? self.common_size : 1);
@@ -78,19 +55,61 @@ void my_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) {
 }
 
 template <bool lhs_common_inner, bool rhs_common_inner>
+void my_cblas_double_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) {
+    const DenseMatMulFunction::Self &self = *((const DenseMatMulFunction::Self *)(param));
+    auto lhs_cells = DenseTensorView::typify_cells<double>(state.peek(1));
+    auto rhs_cells = DenseTensorView::typify_cells<double>(state.peek(0));
+    auto dst_cells = state.stash.create_array<double>(self.lhs_size * self.rhs_size);
+    cblas_dgemm(CblasRowMajor, lhs_common_inner ? CblasNoTrans : CblasTrans, rhs_common_inner ? CblasTrans : CblasNoTrans,
+                self.lhs_size, self.rhs_size, self.common_size, 1.0,
+                lhs_cells.cbegin(), lhs_common_inner ? self.common_size : self.lhs_size,
+                rhs_cells.cbegin(), rhs_common_inner ? self.common_size : self.rhs_size,
+                0.0, dst_cells.begin(), self.rhs_size);
+    state.pop_pop_push(state.stash.create<DenseTensorView>(self.result_type, TypedCells(dst_cells)));
+}
+
+template <bool lhs_common_inner, bool rhs_common_inner>
+void my_cblas_float_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) {
+    const DenseMatMulFunction::Self &self = *((const DenseMatMulFunction::Self *)(param));
+    auto lhs_cells = DenseTensorView::typify_cells<float>(state.peek(1));
+    auto rhs_cells = DenseTensorView::typify_cells<float>(state.peek(0));
+    auto dst_cells = state.stash.create_array<float>(self.lhs_size * self.rhs_size);
+    cblas_sgemm(CblasRowMajor, lhs_common_inner ? CblasNoTrans : CblasTrans, rhs_common_inner ? CblasTrans : CblasNoTrans,
+                self.lhs_size, self.rhs_size, self.common_size, 1.0,
+                lhs_cells.cbegin(), lhs_common_inner ? self.common_size : self.lhs_size,
+                rhs_cells.cbegin(), rhs_common_inner ? self.common_size : self.rhs_size,
+                0.0, dst_cells.begin(), self.rhs_size);
+    state.pop_pop_push(state.stash.create<DenseTensorView>(self.result_type, TypedCells(dst_cells)));
+}
+
+template <bool lhs_common_inner, bool rhs_common_inner>
 struct MyMatMulOp {
     template <typename LCT, typename RCT>
     static auto get_fun() { return my_matmul_op<LCT,RCT,lhs_common_inner,rhs_common_inner>; }
 };
 
+template <bool lhs_common_inner, bool rhs_common_inner>
+eval::InterpretedFunction::op_function my_select3(CellType lct, CellType rct)
+{
+    if (lct == rct) {
+        if (lct == ValueType::CellType::DOUBLE) {
+            return my_cblas_double_matmul_op<lhs_common_inner,rhs_common_inner>;
+        }
+        if (lct == ValueType::CellType::FLOAT) {
+            return my_cblas_float_matmul_op<lhs_common_inner,rhs_common_inner>;
+        }
+    }
+    return select_2<MyMatMulOp<lhs_common_inner,rhs_common_inner>>(lct, rct);
+}
+
 template <bool lhs_common_inner>
 eval::InterpretedFunction::op_function my_select2(CellType lct, CellType rct,
                                                   bool rhs_common_inner)
 {
     if (rhs_common_inner) {
-        return select_2<MyMatMulOp<lhs_common_inner,true>>(lct, rct);
+        return my_select3<lhs_common_inner,true>(lct, rct);
     } else {
-        return select_2<MyMatMulOp<lhs_common_inner,false>>(lct, rct);
+        return my_select3<lhs_common_inner,false>(lct, rct);
     }
 }
 
@@ -152,8 +171,7 @@ DenseMatMulFunction::Self::Self(const eval::ValueType &result_type_in,
     : result_type(result_type_in),
       lhs_size(lhs_size_in),
       common_size(common_size_in),
-      rhs_size(rhs_size_in),
-      hw(hwaccelrated::IAccelrated::getAccelrator())
+      rhs_size(rhs_size_in)
 {
 }
 
diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
index 276a455bda4..f0b6d8b6c19 100644
--- a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
+++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
@@ -4,7 +4,6 @@
 
 #include <vespa/eval/eval/tensor_function.h>
 #include "dense_tensor_view.h"
-#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
 
 namespace vespalib::tensor {
 
@@ -20,7 +19,6 @@ public:
         size_t lhs_size;
         size_t common_size;
         size_t rhs_size;
-        hwaccelrated::IAccelrated::UP hw;
         Self(const eval::ValueType &result_type_in,
              size_t lhs_size_in, size_t common_size_in, size_t rhs_size_in);
         ~Self();
author	Håvard Pettersen <havardpe@oath.com>	2020-01-24 13:30:31 +0000
committer	Håvard Pettersen <havardpe@oath.com>	2020-01-29 12:01:46 +0000
commit	7ee101e3880de1366ea7441481c1eb37a5fa24c8 (patch)
tree	eb64d69311f21314fba2143547cdec51dbf9aa94 /eval
parent	ff2010ffd3c15c5518dbc9276cbc7b24cde77879 (diff)