From a30dfd15685c34fb65f7c8b5c0fb7bd3fd92e541 Mon Sep 17 00:00:00 2001
From: Håvard Pettersen <havardpe@yahooinc.com>
Date: Mon, 14 Aug 2023 12:35:45 +0000
Subject: use common inlined code for low-level dot products

---
 eval/src/vespa/eval/eval/inline_operation.h        | 28 +++++++++++++++
 .../eval/instruction/best_similarity_function.cpp  |  5 ++-
 .../instruction/dense_dot_product_function.cpp     | 40 +++-------------------
 .../eval/instruction/mixed_112_dot_product.cpp     | 15 +++-----
 .../instruction/mixed_inner_product_function.cpp   | 31 ++---------------
 .../instruction/sum_max_dot_product_function.cpp   |  6 ++--
 6 files changed, 44 insertions(+), 81 deletions(-)
diff --git a/eval/src/vespa/eval/eval/inline_operation.h b/eval/src/vespa/eval/eval/inline_operation.h
index 9b862b59e37..910fa9cffaa 100644
--- a/eval/src/vespa/eval/eval/inline_operation.h
+++ b/eval/src/vespa/eval/eval/inline_operation.h
@@ -4,6 +4,7 @@
 
 #include "operation.h"
 #include <vespa/vespalib/util/typify.h>
+#include <cblas.h>
 #include <cmath>
 
 namespace vespalib::eval::operation {
@@ -148,4 +149,31 @@ void apply_op2_vec_vec(D *dst, const A *a, const B *b, size_t n, OP2 &&f) {
 
 //-----------------------------------------------------------------------------
 
+template <typename LCT, typename RCT>
+struct DotProduct { // generic fallback used when no specialization below matches the cell type pair
+    static double apply(const LCT * lhs, const RCT * rhs, size_t count) { // reads lhs[0..count) and rhs[0..count)
+        double result = 0.0;
+        for (size_t i = 0; i < count; ++i) {
+            result += lhs[i] * rhs[i]; // element-wise product, accumulated in index order into a double sum
+        }
+        return result;
+    }
+};
+
+template <>
+struct DotProduct<float,float> { // float/float specialization: delegates to cblas_sdot
+    static double apply(const float * lhs, const float * rhs, size_t count) { // returns double like the generic template
+        return cblas_sdot(count, lhs, 1, rhs, 1); // float result widens exactly, so callers scaling it keep double math
+    }
+};
+
+template <>
+struct DotProduct<double,double> { // double/double specialization: delegates to cblas_ddot
+    static double apply(const double * lhs, const double * rhs, size_t count) { // reads count elements from each input
+        return cblas_ddot(count, lhs, 1, rhs, 1); // the two 1s are the element increments for lhs and rhs
+    }
+};
+
+//-----------------------------------------------------------------------------
+
 }
diff --git a/eval/src/vespa/eval/instruction/best_similarity_function.cpp b/eval/src/vespa/eval/instruction/best_similarity_function.cpp
index 964f27a4564..415a08d0d93 100644
--- a/eval/src/vespa/eval/instruction/best_similarity_function.cpp
+++ b/eval/src/vespa/eval/instruction/best_similarity_function.cpp
@@ -1,10 +1,9 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "best_similarity_function.h"
-#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/inline_operation.h>
 #include <vespa/eval/eval/value.h>
 #include <vespa/vespalib/util/binary_hamming_distance.h>
-#include <cblas.h>
 
 namespace vespalib::eval {
 
@@ -22,7 +21,7 @@ struct BestSimParam {
 
 struct UseDotProduct {
     static float calc(const float *pri, const float *sec, size_t size) {
-        return cblas_sdot(size, pri, 1, sec, 1);
+        return DotProduct<float,float>::apply(pri, sec, size); // shared helper; its float/float specialization wraps cblas_sdot
     }
 };
 
diff --git a/eval/src/vespa/eval/instruction/dense_dot_product_function.cpp b/eval/src/vespa/eval/instruction/dense_dot_product_function.cpp
index a2048707685..de9e029f377 100644
--- a/eval/src/vespa/eval/instruction/dense_dot_product_function.cpp
+++ b/eval/src/vespa/eval/instruction/dense_dot_product_function.cpp
@@ -1,9 +1,8 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "dense_dot_product_function.h"
-#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/inline_operation.h>
 #include <vespa/eval/eval/value.h>
-#include <cblas.h>
 
 namespace vespalib::eval {
 
@@ -16,26 +15,7 @@ template <typename LCT, typename RCT>
 void my_dot_product_op(InterpretedFunction::State &state, uint64_t) {
     auto lhs_cells = state.peek(1).cells().typify<LCT>();
     auto rhs_cells = state.peek(0).cells().typify<RCT>();
-    double result = 0.0;
-    const LCT *lhs = lhs_cells.cbegin();
-    const RCT *rhs = rhs_cells.cbegin();
-    for (size_t i = 0; i < lhs_cells.size(); ++i) {
-        result += ((*lhs++) * (*rhs++));
-    }
-    state.pop_pop_push(state.stash.create<DoubleValue>(result));
-}
-
-void my_cblas_double_dot_product_op(InterpretedFunction::State &state, uint64_t) {
-    auto lhs_cells = state.peek(1).cells().typify<double>();
-    auto rhs_cells = state.peek(0).cells().typify<double>();
-    double result = cblas_ddot(lhs_cells.size(), lhs_cells.cbegin(), 1, rhs_cells.cbegin(), 1);
-    state.pop_pop_push(state.stash.create<DoubleValue>(result));
-}
-
-void my_cblas_float_dot_product_op(InterpretedFunction::State &state, uint64_t) {
-    auto lhs_cells = state.peek(1).cells().typify<float>();
-    auto rhs_cells = state.peek(0).cells().typify<float>();
-    double result = cblas_sdot(lhs_cells.size(), lhs_cells.cbegin(), 1, rhs_cells.cbegin(), 1);
+    double result = DotProduct<LCT,RCT>::apply(lhs_cells.cbegin(), rhs_cells.cbegin(), lhs_cells.size()); // element count comes from lhs only, as in the removed loop
     state.pop_pop_push(state.stash.create<DoubleValue>(result));
 }
 
@@ -44,19 +24,6 @@ struct MyDotProductOp {
     static auto invoke() { return my_dot_product_op<LCT,RCT>; }
 };
 
-InterpretedFunction::op_function my_select(CellType lct, CellType rct) {
-    if (lct == rct) {
-        if (lct == CellType::DOUBLE) {
-            return my_cblas_double_dot_product_op;
-        }
-        if (lct == CellType::FLOAT) {
-            return my_cblas_float_dot_product_op;
-        }
-    }
-    using MyTypify = TypifyCellType;
-    return typify_invoke<2,MyTypify,MyDotProductOp>(lct, rct);
-}
-
 } // namespace <unnamed>
 
 DenseDotProductFunction::DenseDotProductFunction(const TensorFunction &lhs_in,
@@ -68,7 +35,8 @@ DenseDotProductFunction::DenseDotProductFunction(const TensorFunction &lhs_in,
 InterpretedFunction::Instruction
 DenseDotProductFunction::compile_self(const ValueBuilderFactory &, Stash &) const
 {
-    auto op = my_select(lhs().result_type().cell_type(), rhs().result_type().cell_type());
+    auto op = typify_invoke<2,TypifyCellType,MyDotProductOp>(lhs().result_type().cell_type(), // my_select is gone: DotProduct specializations now pick the cblas paths
+                                                             rhs().result_type().cell_type());
     return InterpretedFunction::Instruction(op);
 }
 
diff --git a/eval/src/vespa/eval/instruction/mixed_112_dot_product.cpp b/eval/src/vespa/eval/instruction/mixed_112_dot_product.cpp
index 8bfa4b07980..47e1dbb58ed 100644
--- a/eval/src/vespa/eval/instruction/mixed_112_dot_product.cpp
+++ b/eval/src/vespa/eval/instruction/mixed_112_dot_product.cpp
@@ -5,7 +5,6 @@
 #include <vespa/vespalib/util/typify.h>
 #include <vespa/vespalib/util/require.h>
 #include <vespa/eval/eval/visit_stuff.h>
-#include <cblas.h>
 #include <algorithm>
 #include <optional>
 
@@ -17,14 +16,6 @@ using namespace instruction;
 
 namespace {
 
-template <typename CT> double my_dot_product(const CT * lhs, const CT * rhs, size_t count);
-template <> double my_dot_product<double>(const double * lhs, const double * rhs, size_t count) {
-    return cblas_ddot(count, lhs, 1, rhs, 1);
-}
-template <> double my_dot_product<float>(const float * lhs, const float * rhs, size_t count) {
-    return cblas_sdot(count, lhs, 1, rhs, 1);
-}
-
 template <typename T, size_t N>
 ConstArrayRef<const T *> as_ccar(std::array<T *, N> &array) {
     return {array.data(), array.size()};
@@ -54,10 +45,11 @@ double my_mixed_112_dot_product_fallback(const Value::Index &a_idx, const Value:
     auto outer = a_idx.create_view({});
     auto model = c_idx.create_view({&single_dim[0], 1});
     outer->lookup({});
+    using dot_product = DotProduct<CT,CT>;
     while (outer->next_result(as_car(c_addr_ref[0]), a_space)) {
         model->lookup(as_ccar(c_addr_ref));
         if (model->next_result({}, c_space)) {
-            result += my_dot_product<CT>(b_cells, c_cells + (c_space * dense_size), dense_size) * a_cells[a_space];
+            result += dot_product::apply(b_cells, c_cells + (c_space * dense_size), dense_size) * a_cells[a_space];
         }
     }
     return result;
@@ -70,11 +62,12 @@ double my_fast_mixed_112_dot_product(const FastAddrMap *a_map, const FastAddrMap
 {
     double result = 0.0;
     const auto &a_labels = a_map->labels();
+    using dot_product = DotProduct<CT,CT>;
     for (size_t a_space = 0; a_space < a_labels.size(); ++a_space) {
         if (a_cells[a_space] != 0.0) { // handle pseudo-sparse input
             auto c_space = c_map->lookup_singledim(a_labels[a_space]);
             if (c_space != FastAddrMap::npos()) {
-                result += my_dot_product<CT>(b_cells, c_cells + (c_space * dense_size), dense_size) * a_cells[a_space];
+                result += dot_product::apply(b_cells, c_cells + (c_space * dense_size), dense_size) * a_cells[a_space];
             }
         }
     }
diff --git a/eval/src/vespa/eval/instruction/mixed_inner_product_function.cpp b/eval/src/vespa/eval/instruction/mixed_inner_product_function.cpp
index 248f909fcf5..5880a90a2cd 100644
--- a/eval/src/vespa/eval/instruction/mixed_inner_product_function.cpp
+++ b/eval/src/vespa/eval/instruction/mixed_inner_product_function.cpp
@@ -1,9 +1,8 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "mixed_inner_product_function.h"
-#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/inline_operation.h>
 #include <vespa/eval/eval/value.h>
-#include <cblas.h>
 
 namespace vespalib::eval {
 
@@ -12,31 +11,6 @@ using namespace operation;
 
 namespace {
 
-template <typename LCT, typename RCT>
-struct MyDotProduct {
-    static double apply(const LCT * lhs, const RCT * rhs, size_t count) {
-        double result = 0.0;
-        for (size_t i = 0; i < count; ++i) {
-            result += lhs[i] * rhs[i];
-        }
-        return result;
-    }
-};
-
-template <>
-struct MyDotProduct<double,double> {
-    static double apply(const double * lhs, const double * rhs, size_t count) {
-        return cblas_ddot(count, lhs, 1, rhs, 1);
-    }
-};
-
-template <>
-struct MyDotProduct<float,float> {
-    static float apply(const float * lhs, const float * rhs, size_t count) {
-        return cblas_sdot(count, lhs, 1, rhs, 1);
-    }
-};
-
 struct MixedInnerProductParam {
     ValueType res_type;
     size_t vector_size;
@@ -66,8 +40,9 @@ void my_mixed_inner_product_op(InterpretedFunction::State &state, uint64_t param
     ArrayRef<OCT> out_cells = state.stash.create_uninitialized_array<OCT>(num_output_cells);
     const MCT *m_cp = m_cells.begin();
     const VCT *v_cp = v_cells.begin();
+    using dot_product = DotProduct<MCT,VCT>;
     for (OCT &out : out_cells) {
-        out = MyDotProduct<MCT,VCT>::apply(m_cp, v_cp, param.vector_size);
+        out = dot_product::apply(m_cp, v_cp, param.vector_size);
         m_cp += param.vector_size;
     }
     assert(m_cp == m_cells.end());
diff --git a/eval/src/vespa/eval/instruction/sum_max_dot_product_function.cpp b/eval/src/vespa/eval/instruction/sum_max_dot_product_function.cpp
index a76eaa38925..41017bc3687 100644
--- a/eval/src/vespa/eval/instruction/sum_max_dot_product_function.cpp
+++ b/eval/src/vespa/eval/instruction/sum_max_dot_product_function.cpp
@@ -1,9 +1,8 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "sum_max_dot_product_function.h"
-#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/inline_operation.h>
 #include <vespa/eval/eval/value.h>
-#include <cblas.h>
 
 namespace vespalib::eval {
 
@@ -16,11 +15,12 @@ void my_sum_max_dot_product_op(InterpretedFunction::State &state, uint64_t dp_si
     double result = 0.0;
     auto query_cells = state.peek(1).cells().typify<float>();
     auto document_cells = state.peek(0).cells().typify<float>();
+    using dot_product = DotProduct<float,float>;
     if ((query_cells.size() > 0) && (document_cells.size() > 0)) {
         for (const float *query = query_cells.begin(); query < query_cells.end(); query += dp_size) {
             float max_dp = aggr::Max<float>::null_value();
             for (const float *document = document_cells.begin(); document < document_cells.end(); document += dp_size) {
-                max_dp = aggr::Max<float>::combine(max_dp, cblas_sdot(dp_size, query, 1, document, 1));
+                max_dp = aggr::Max<float>::combine(max_dp, dot_product::apply(query, document, dp_size));
             }
             result += max_dp;
         }
-- 
cgit v1.2.3