multi-matmul

author: Håvard Pettersen <havardpe@oath.com> 2020-04-27 14:07:26 +0000
committer: Håvard Pettersen <havardpe@oath.com> 2020-05-04 08:13:56 +0000
commit: a38091735759c16e86cab1353db1f439a3cd35c4 (patch)
tree: de913055842b7e54461867360bde42d15fb43b25
parent: e3f135fc9ccad48719e462397915c6efeaf450b6 (diff)
11 files changed, 522 insertions, 24 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index bb13638cf1d..9cd5f396144 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -35,7 +35,8 @@ vespa_define_module(
     src/tests/tensor/dense_generic_join
     src/tests/tensor/dense_inplace_join_function
     src/tests/tensor/dense_inplace_map_function
-    src/tests/tensor/dense_matmul_function    
+    src/tests/tensor/dense_matmul_function
+    src/tests/tensor/dense_multi_matmul_function
     src/tests/tensor/dense_remove_dimension_optimizer
     src/tests/tensor/dense_replace_type_function
     src/tests/tensor/dense_tensor_create_function
diff --git a/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
index 5d7c0be704e..a571837b8e9 100644
--- a/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
+++ b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
@@ -23,30 +23,14 @@ using namespace vespalib::eval::tensor_function;
 
 const TensorEngine &prod_engine = DefaultTensorEngine::ref();
 
-void add_matrix(EvalFixture::ParamRepo &repo, const char *d1, size_t s1, const char *d2, size_t s2) {
-    for (bool float_cells: {false, true}) {
-        auto name = make_string("%s%zu%s%zu%s", d1, s1, d2, s2, float_cells ? "f" : "");
-        auto type_str = make_string("tensor%s(%s[%zu],%s[%zu])", float_cells ? "<float>" : "", d1, s1, d2, s2);
-        TensorSpec matrix(type_str);
-        for (size_t i = 0; i < s1; ++i) {
-            for (size_t j = 0; j < s2; ++j) {
-                double value = (i + s1 + s2) * 3.0 + (j + s2) * 7.0;
-                matrix.add({{d1, i}, {d2, j}}, value);
-            }
-        }
-        repo.add(name, matrix);
-    }
-}
-
 EvalFixture::ParamRepo make_params() {
-    EvalFixture::ParamRepo repo;
-    add_matrix(repo, "a", 2, "d", 3); // inner/inner
-    add_matrix(repo, "a", 2, "b", 5); // inner/outer
-    add_matrix(repo, "b", 5, "c", 2); // outer/outer
-    add_matrix(repo, "a", 2, "c", 3); // not matching
-    //-----------------------------------------------
-    add_matrix(repo, "b", 5, "d", 3); // fixed param
-    return repo;
+    return EvalFixture::ParamRepo()
+        .add_matrix("a", 2, "d", 3)  // inner/inner
+        .add_matrix("a", 2, "b", 5)  // inner/outer
+        .add_matrix("b", 5, "c", 2)  // outer/outer
+        .add_matrix("a", 2, "c", 3)  // not matching
+        //------------------------------------------
+        .add_matrix("b", 5, "d", 3); // fixed param
 }
 EvalFixture::ParamRepo param_repo = make_params();
 
diff --git a/eval/src/tests/tensor/dense_multi_matmul_function/CMakeLists.txt b/eval/src/tests/tensor/dense_multi_matmul_function/CMakeLists.txt
new file mode 100644
index 00000000000..1619f42c897
--- /dev/null
+++ b/eval/src/tests/tensor/dense_multi_matmul_function/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_dense_multi_matmul_function_test_app TEST
+    SOURCES
+    dense_multi_matmul_function_test.cpp
+    DEPENDS
+    vespaeval
+)
+vespa_add_test(NAME eval_dense_multi_matmul_function_test_app COMMAND eval_dense_multi_matmul_function_test_app)
diff --git a/eval/src/tests/tensor/dense_multi_matmul_function/dense_multi_matmul_function_test.cpp b/eval/src/tests/tensor/dense_multi_matmul_function/dense_multi_matmul_function_test.cpp
new file mode 100644
index 00000000000..c0823248538
--- /dev/null
+++ b/eval/src/tests/tensor/dense_multi_matmul_function/dense_multi_matmul_function_test.cpp
@@ -0,0 +1,155 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/eval/eval/tensor_function.h>
+#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/simple_tensor.h>
+#include <vespa/eval/eval/simple_tensor_engine.h>
+#include <vespa/eval/tensor/default_tensor_engine.h>
+#include <vespa/eval/tensor/dense/dense_multi_matmul_function.h>
+#include <vespa/eval/tensor/dense/dense_tensor.h>
+#include <vespa/eval/tensor/dense/dense_tensor_view.h>
+#include <vespa/eval/eval/test/tensor_model.hpp>
+#include <vespa/eval/eval/test/eval_fixture.h>
+
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/util/stash.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+using namespace vespalib::eval::test;
+using namespace vespalib::tensor;
+using namespace vespalib::eval::tensor_function;
+
+const TensorEngine &prod_engine = DefaultTensorEngine::ref(); // engine handed to every EvalFixture in this file
+
+EvalFixture::ParamRepo make_params() { // each add_dense call registers a tensor named after its dimensions (e.g. A2B1C3a2d3) plus a float-cell twin with an 'f' suffix
+    return EvalFixture::ParamRepo()
+        .add_dense({{"A", 2}, {"B", 1}, {"C", 3}, {"a", 2}, {"d", 3}})  // inner/inner
+        .add_dense({{"B", 1}, {"C", 3}, {"a", 2}, {"d", 3}})            // inner/inner, missing A
+        .add_dense({{"A", 1}, {"a", 2}, {"d", 3}})                      // inner/inner, single mat
+        .add_dense({{"A", 2}, {"D", 3}, {"a", 2}, {"b", 1}, {"c", 3}})  // inner/inner, inverted
+        .add_dense({{"A", 2}, {"B", 1}, {"C", 3}, {"a", 2}, {"b", 5}})  // inner/outer
+        .add_dense({{"A", 2}, {"B", 1}, {"C", 3}, {"b", 5}, {"c", 2}})  // outer/outer
+        .add_dense({{"A", 2}, {"B", 1}, {"C", 3}, {"a", 2}, {"c", 3}})  // not matching
+        //----------------------------------------------------------------------------------------
+        .add_dense({{"A", 2}, {"B", 1}, {"C", 3}, {"b", 5}, {"d", 3}})  // fixed param
+        .add_dense({{"B", 1}, {"C", 3}, {"b", 5}, {"d", 3}})            // fixed param, missing A
+        .add_dense({{"A", 1}, {"b", 5}, {"d", 3}})                      // fixed param, single mat
+        .add_dense({{"B", 5}, {"D", 3}, {"a", 2}, {"b", 1}, {"c", 3}}); // fixed param, inverted
+}
+EvalFixture::ParamRepo param_repo = make_params(); // built once and shared by all test cases below
+
+void verify_optimized(const vespalib::string &expr, // checks that expr evaluates correctly AND is rewritten into exactly one DenseMultiMatMulFunction
+                      size_t lhs_size, size_t common_size, size_t rhs_size, size_t matmul_cnt, // expected geometry of the optimized node
+                      bool lhs_inner, bool rhs_inner) // expected values of lhs_common_inner() / rhs_common_inner()
+{
+    EvalFixture slow_fixture(prod_engine, expr, param_repo, false); // NOTE(review): 4th argument presumably disables optimization - confirm against EvalFixture
+    EvalFixture fixture(prod_engine, expr, param_repo, true); // same expression with the 4th argument set to true
+    EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo)); // result must agree with EvalFixture::ref
+    EXPECT_EQUAL(fixture.result(), slow_fixture.result()); // ... and with the 'false' fixture
+    auto info = fixture.find_all<DenseMultiMatMulFunction>();
+    ASSERT_EQUAL(info.size(), 1u); // hard assert: info[0] is dereferenced below
+    EXPECT_TRUE(info[0]->result_is_mutable());
+    EXPECT_EQUAL(info[0]->lhs_size(), lhs_size);
+    EXPECT_EQUAL(info[0]->common_size(), common_size);
+    EXPECT_EQUAL(info[0]->rhs_size(), rhs_size);
+    EXPECT_EQUAL(info[0]->matmul_cnt(), matmul_cnt);
+    EXPECT_EQUAL(info[0]->lhs_common_inner(), lhs_inner);
+    EXPECT_EQUAL(info[0]->rhs_common_inner(), rhs_inner);
+}
+
+void verify_not_optimized(const vespalib::string &expr) { // checks that expr still evaluates correctly but contains no DenseMultiMatMulFunction
+    EvalFixture slow_fixture(prod_engine, expr, param_repo, false); // NOTE(review): 4th argument presumably disables optimization - confirm against EvalFixture
+    EvalFixture fixture(prod_engine, expr, param_repo, true);
+    EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo)); // result must agree with EvalFixture::ref
+    EXPECT_EQUAL(fixture.result(), slow_fixture.result());
+    auto info = fixture.find_all<DenseMultiMatMulFunction>();
+    EXPECT_TRUE(info.empty()); // the multi-matmul rewrite must not have been applied
+}
+
+TEST("require that multi matmul can be optimized") { // baseline case: common dimension d is innermost on both sides
+    TEST_DO(verify_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3,sum,d)", 2, 3, 5, 6, true, true)); // lhs=a[2], common=d[3], rhs=b[5], 2*1*3=6 matrix products
+}
+
+TEST("require that single multi matmul can be optimized") { // prefix A[1] gives a single matrix product
+    TEST_DO(verify_optimized("reduce(A1a2d3*A1b5d3,sum,d)", 2, 3, 5, 1, true, true)); // matmul_cnt is 1
+}
+
+TEST("require that multi matmul with lambda can be optimized") { // join written as an explicit lambda instead of '*'
+    TEST_DO(verify_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(x*y)),sum,d)", 2, 3, 5, 6, true, true));
+    TEST_DO(verify_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(y*x)),sum,d)", 2, 3, 5, 6, true, true)); // operand order inside the lambda swapped
+}
+
+TEST("require that expressions similar to multi matmul are not optimized") { // near misses that must be left alone
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3,sum,a)")); // a is not among the last two dimensions of the second input
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3,sum,b)")); // b is not among the last two dimensions of the first input
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3,prod,d)")); // aggregator is not sum
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3,sum)")); // no single reduce dimension is named
+    TEST_DO(verify_not_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(x+y)),sum,d)")); // join is not a multiplication
+    TEST_DO(verify_not_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(x*x)),sum,d)")); // lambda ignores y
+    TEST_DO(verify_not_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(y*y)),sum,d)")); // lambda ignores x
+    TEST_DO(verify_not_optimized("reduce(join(A2B1C3a2d3,A2B1C3b5d3,f(x,y)(x*y*1)),sum,d)")); // lambda is more than a plain x*y
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2c3*A2B1C3b5d3,sum,d)")); // d is missing from the first input
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2c3*A2B1C3b5d3,sum,c)")); // c is missing from the second input
+}
+
+TEST("require that multi matmul must have matching cell type") { // 'f'-suffixed params are the float-cell variants
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3f*A2B1C3b5d3,sum,d)")); // float * double
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*A2B1C3b5d3f,sum,d)")); // double * float
+}
+
+TEST("require that multi matmul must have matching dimension prefix") { // inputs differ in number of dimensions
+    TEST_DO(verify_not_optimized("reduce(B1C3a2d3*A2B1C3b5d3,sum,d)")); // first input lacks A
+    TEST_DO(verify_not_optimized("reduce(A2B1C3a2d3*B1C3b5d3,sum,d)")); // second input lacks A
+}
+
+TEST("require that multi matmul must have inner nesting of matmul dimensions") { // reduce dimension D is not one of the two innermost dimensions
+    TEST_DO(verify_not_optimized("reduce(A2D3a2b1c3*B5D3a2b1c3,sum,D)"));
+    TEST_DO(verify_not_optimized("reduce(B5D3a2b1c3*A2D3a2b1c3,sum,D)")); // same pair, operands swapped
+}
+
+TEST("require that multi matmul function can be debug dumped") { // smoke test: as_string() on the optimized node must not crash
+    EvalFixture fixture(prod_engine, "reduce(A2B1C3a2d3*A2B1C3b5d3,sum,d)", param_repo, true);
+    auto info = fixture.find_all<DenseMultiMatMulFunction>();
+    ASSERT_EQUAL(info.size(), 1u); // hard assert: info[0] is dereferenced below
+    fprintf(stderr, "%s\n", info[0]->as_string().c_str()); // output is only printed, not checked
+}
+
+vespalib::string make_expr(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common, // builds "reduce(<a>*<b>,sum,<common>)"
+                           bool float_cells) // when true, both parameter names get the 'f' suffix
+{
+    return make_string("reduce(%s%s*%s%s,sum,%s)", a.c_str(), float_cells ? "f" : "", b.c_str(), float_cells ? "f" : "", common.c_str());
+}
+
+void verify_optimized_multi(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common, // runs verify_optimized for both cell types and both operand orders
+                            size_t lhs_size, size_t common_size, size_t rhs_size, size_t matmul_cnt,
+                            bool lhs_inner, bool rhs_inner)
+{
+    for (bool float_cells: {false, true}) { // double cells first, then float cells
+        {
+            auto expr = make_expr(a, b, common, float_cells); // a * b
+            TEST_STATE(expr.c_str()); // NOTE(review): presumably tags failures with the expression - confirm against test_kit
+            TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, matmul_cnt, lhs_inner, rhs_inner));
+        }
+        {
+            auto expr = make_expr(b, a, common, float_cells); // b * a: the same expectations are passed, so the result must not depend on operand order
+            TEST_STATE(expr.c_str());
+            TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, matmul_cnt, lhs_inner, rhs_inner));
+        }
+    }
+}
+
+TEST("require that multi matmul inner/inner works correctly") { // common dimension d is last in both inputs
+    TEST_DO(verify_optimized_multi("A2B1C3a2d3", "A2B1C3b5d3", "d", 2, 3, 5, 6, true, true));
+}
+
+TEST("require that multi matmul inner/outer works correctly") { // common dimension b is last in a2b5 and second-to-last in b5d3
+    TEST_DO(verify_optimized_multi("A2B1C3a2b5", "A2B1C3b5d3", "b", 2, 5, 3, 6, true, false));
+}
+
+TEST("require that multi matmul outer/outer works correctly") { // common dimension b is second-to-last in both inputs
+    TEST_DO(verify_optimized_multi("A2B1C3b5c2", "A2B1C3b5d3", "b", 2, 5, 3, 6, false, false));
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); } // test program entry point
diff --git a/eval/src/vespa/eval/eval/test/eval_fixture.cpp b/eval/src/vespa/eval/eval/test/eval_fixture.cpp
index 325fb208319..7ce05ccea8f 100644
--- a/eval/src/vespa/eval/eval/test/eval_fixture.cpp
+++ b/eval/src/vespa/eval/eval/test/eval_fixture.cpp
@@ -3,6 +3,9 @@
 #include <vespa/vespalib/testkit/test_kit.h>
 #include "eval_fixture.h"
 #include <vespa/eval/eval/make_tensor_function.h>
+#include <vespa/vespalib/util/stringfmt.h>
+
+using vespalib::make_string_short::fmt;
 
 namespace vespalib::eval::test {
 
@@ -96,8 +99,71 @@ std::vector<Value::CREF> get_refs(const std::vector<Value::UP> &values) {
     return result;
 }
 
+void add_cell_values(TensorSpec &spec, TensorSpec::Address &addr, // recursively enumerates every index combination of dims and adds one cell per combination
+                     const std::vector<std::pair<vespalib::string, size_t> > &dims, // (dimension name, size) pairs; dims[idx..] are still to be enumerated
+                     size_t idx, size_t &seq) // seq is the next cell value; shared by reference so values keep increasing across the recursion
+{
+    if (idx < dims.size()) {
+        for (size_t i = 0; i < dims[idx].second; ++i) {
+            addr.insert_or_assign(dims[idx].first, TensorSpec::Label(i)); // single call inserts the label on first visit and overwrites it afterwards
+            add_cell_values(spec, addr, dims, idx + 1, seq);
+        }
+    } else {
+        spec.add(addr, seq++); // address is complete: emit the cell and advance the sequence
+    }
+}
+
+TensorSpec make_dense(const vespalib::string &type, // type spec string used to construct the TensorSpec
+                      const std::vector<std::pair<vespalib::string, size_t> > &dims, // (dimension name, size) pairs to enumerate
+                      size_t seed) // value of the first cell; later cells get seed+1, seed+2, ...
+{
+    TensorSpec spec(type);
+    TensorSpec::Address addr; // scratch address reused for every cell
+    size_t seq = seed;
+    add_cell_values(spec, addr, dims, 0, seq); // the last dimension in dims varies fastest
+    return spec;
+}
+
 } // namespace vespalib::eval::test
 
+ParamRepo &
+EvalFixture::ParamRepo::add_vector(const char *d1, size_t s1, size_t seed) // convenience wrapper: 1-dimensional add_dense
+{
+    return add_dense({{d1, s1}}, seed);
+}
+
+ParamRepo &
+EvalFixture::ParamRepo::add_matrix(const char *d1, size_t s1, const char *d2, size_t s2, size_t seed) // convenience wrapper: 2-dimensional add_dense
+{
+    return add_dense({{d1, s1}, {d2, s2}}, seed);
+}
+
+ParamRepo &
+EvalFixture::ParamRepo::add_cube(const char *d1, size_t s1, const char *d2, size_t s2, const char *d3, size_t s3, size_t seed) // convenience wrapper: 3-dimensional add_dense
+{
+    return add_dense({{d1, s1}, {d2, s2}, {d3, s3}}, seed);
+}
+
+ParamRepo &
+EvalFixture::ParamRepo::add_dense(const std::vector<std::pair<vespalib::string, size_t> > &dims, size_t seed) // adds a double-cell and a float-cell tensor whose names are derived from dims
+{
+    vespalib::string prev; // previous dimension name, used for the ordering check
+    vespalib::string name; // becomes e.g. "a2d3" for {{"a",2},{"d",3}}
+    vespalib::string type; // becomes e.g. "a[2],d[3]"
+    for (const auto &dim: dims) {
+        if (!prev.empty()) {
+            ASSERT_LESS(prev, dim.first); // dimension names must be given in strictly increasing order
+            type += ",";
+        }
+        name += fmt("%s%zu", dim.first.c_str(), dim.second);
+        type += fmt("%s[%zu]", dim.first.c_str(), dim.second);
+        prev = dim.first;
+    }
+    add(name, make_dense(fmt("tensor(%s)", type.c_str()), dims, seed)); // default cell type, registered under the plain name
+    add(name + "f", make_dense(fmt("tensor<float>(%s)", type.c_str()), dims, seed)); // float cells with the same values, registered under name + "f"
+    return *this; // allows chaining
+}
+
 void
 EvalFixture::detect_param_tampering(const ParamRepo &param_repo, bool allow_mutable) const
 {
diff --git a/eval/src/vespa/eval/eval/test/eval_fixture.h b/eval/src/vespa/eval/eval/test/eval_fixture.h
index 1d39dc52cba..f11fb8ebf22 100644
--- a/eval/src/vespa/eval/eval/test/eval_fixture.h
+++ b/eval/src/vespa/eval/eval/test/eval_fixture.h
@@ -37,6 +37,10 @@ public:
         ParamRepo &add_mutable(const vespalib::string &name, const TensorSpec &value) {
             return add(name, value, true);
         }
+        ParamRepo &add_vector(const char *d1, size_t s1, size_t seed = 1); // 1-d dense tensor; forwards to add_dense
+        ParamRepo &add_matrix(const char *d1, size_t s1, const char *d2, size_t s2, size_t seed = 1); // 2-d dense tensor; forwards to add_dense
+        ParamRepo &add_cube(const char *d1, size_t s1, const char *d2, size_t s2, const char *d3, size_t s3, size_t seed = 1); // 3-d dense tensor; forwards to add_dense
+        ParamRepo &add_dense(const std::vector<std::pair<vespalib::string, size_t> > &dims, size_t seed = 1); // adds double and float ('f'-suffixed name) tensors with cells numbered seed, seed+1, ...
         ~ParamRepo() {}
     };
 
diff --git a/eval/src/vespa/eval/eval/test/tensor_model.hpp b/eval/src/vespa/eval/eval/test/tensor_model.hpp
index 2466701df62..42f0dc7e996 100644
--- a/eval/src/vespa/eval/eval/test/tensor_model.hpp
+++ b/eval/src/vespa/eval/eval/test/tensor_model.hpp
@@ -1,5 +1,7 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
+#pragma once
+
 #include <vespa/vespalib/testkit/test_kit.h>
 #include <vespa/eval/eval/tensor_spec.h>
 #include <vespa/eval/eval/value_type.h>
diff --git a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
index b16241fe5e5..0cadbd64372 100644
--- a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
+++ b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
@@ -11,6 +11,7 @@
 #include "dense/dense_dot_product_function.h"
 #include "dense/dense_xw_product_function.h"
 #include "dense/dense_matmul_function.h"
+#include "dense/dense_multi_matmul_function.h"
 #include "dense/dense_fast_rename_optimizer.h"
 #include "dense/dense_add_dimension_optimizer.h"
 #include "dense/dense_remove_dimension_optimizer.h"
@@ -273,6 +274,7 @@ DefaultTensorEngine::optimize(const TensorFunction &expr, Stash &stash) const
         child.set(DenseDotProductFunction::optimize(child.get(), stash));
         child.set(DenseXWProductFunction::optimize(child.get(), stash));
         child.set(DenseMatMulFunction::optimize(child.get(), stash));
+        child.set(DenseMultiMatMulFunction::optimize(child.get(), stash));
         child.set(DenseFastRenameOptimizer::optimize(child.get(), stash));
         child.set(DenseAddDimensionOptimizer::optimize(child.get(), stash));
         child.set(DenseRemoveDimensionOptimizer::optimize(child.get(), stash));
diff --git a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
index 1b9b51d6ad2..7019749e123 100644
--- a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
+++ b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
@@ -11,6 +11,7 @@ vespa_add_library(eval_tensor_dense OBJECT
     dense_lambda_peek_function.cpp
     dense_lambda_peek_optimizer.cpp
     dense_matmul_function.cpp
+    dense_multi_matmul_function.cpp
     dense_remove_dimension_optimizer.cpp
     dense_replace_type_function.cpp
     dense_tensor.cpp
diff --git a/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.cpp
new file mode 100644
index 00000000000..73942f7f044
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.cpp
@@ -0,0 +1,223 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dense_multi_matmul_function.h"
+#include "dense_tensor_view.h"
+#include <vespa/vespalib/objects/objectvisitor.h>
+#include <vespa/eval/eval/value.h>
+#include <vespa/eval/eval/operation.h>
+#include <cassert>
+
+#include <cblas.h>
+
+namespace vespalib::tensor {
+
+using eval::ValueType;
+using eval::TensorFunction;
+using eval::InterpretedFunction;
+using eval::TensorEngine;
+using eval::as;
+using eval::Aggr;
+using namespace eval::tensor_function;
+using namespace eval::operation;
+
+namespace {
+
+void my_cblas_double_multi_matmul_op(InterpretedFunction::State &state, uint64_t param) { // double-cell kernel: one cblas_dgemm call per matrix pair
+    using CT = double;
+    const DenseMultiMatMulFunction &self = *((const DenseMultiMatMulFunction *)(param)); // param carries the address of the function object (see compile_self)
+    size_t lhs_block_size = self.lhs_size() * self.common_size(); // cells per lhs matrix
+    size_t rhs_block_size = self.rhs_size() * self.common_size(); // cells per rhs matrix
+    size_t dst_block_size = self.lhs_size() * self.rhs_size(); // cells per result matrix
+    size_t num_blocks = self.matmul_cnt();
+    const CT *lhs = DenseTensorView::typify_cells<CT>(state.peek(1)).cbegin(); // lhs cells come from stack slot 1
+    const CT *rhs = DenseTensorView::typify_cells<CT>(state.peek(0)).cbegin(); // rhs cells come from stack slot 0
+    auto dst_cells = state.stash.create_array<CT>(dst_block_size * num_blocks); // result storage is allocated from the state's stash
+    CT *dst = dst_cells.begin();
+    for (size_t i = 0; i < num_blocks; ++i, lhs += lhs_block_size, rhs += rhs_block_size, dst += dst_block_size) { // all three cursors advance by one matrix per iteration
+        cblas_dgemm(CblasRowMajor, self.lhs_common_inner() ? CblasNoTrans : CblasTrans, self.rhs_common_inner() ? CblasTrans : CblasNoTrans, // lhs is transposed when common is its outer dim, rhs when common is its inner dim
+                    self.lhs_size(), self.rhs_size(), self.common_size(), 1.0, // M, N, K, alpha
+                    lhs, self.lhs_common_inner() ? self.common_size() : self.lhs_size(), // A, lda = row length of lhs as stored
+                    rhs, self.rhs_common_inner() ? self.common_size() : self.rhs_size(), // B, ldb = row length of rhs as stored
+                    0.0, dst, self.rhs_size()); // beta = 0 so dst is overwritten, C, ldc
+    }
+    state.pop_pop_push(state.stash.create<DenseTensorView>(self.result_type(), TypedCells(dst_cells))); // NOTE(review): presumably replaces the two operands on the stack with the result - confirm against State
+}
+
+void my_cblas_float_multi_matmul_op(InterpretedFunction::State &state, uint64_t param) { // float-cell kernel: one cblas_sgemm call per matrix pair
+    using CT = float;
+    const DenseMultiMatMulFunction &self = *((const DenseMultiMatMulFunction *)(param)); // param carries the address of the function object (see compile_self)
+    size_t lhs_block_size = self.lhs_size() * self.common_size(); // cells per lhs matrix
+    size_t rhs_block_size = self.rhs_size() * self.common_size(); // cells per rhs matrix
+    size_t dst_block_size = self.lhs_size() * self.rhs_size(); // cells per result matrix
+    size_t num_blocks = self.matmul_cnt();
+    const CT *lhs = DenseTensorView::typify_cells<CT>(state.peek(1)).cbegin(); // lhs cells come from stack slot 1
+    const CT *rhs = DenseTensorView::typify_cells<CT>(state.peek(0)).cbegin(); // rhs cells come from stack slot 0
+    auto dst_cells = state.stash.create_array<CT>(dst_block_size * num_blocks); // result storage is allocated from the state's stash
+    CT *dst = dst_cells.begin();
+    for (size_t i = 0; i < num_blocks; ++i, lhs += lhs_block_size, rhs += rhs_block_size, dst += dst_block_size) { // all three cursors advance by one matrix per iteration
+        cblas_sgemm(CblasRowMajor, self.lhs_common_inner() ? CblasNoTrans : CblasTrans, self.rhs_common_inner() ? CblasTrans : CblasNoTrans, // lhs is transposed when common is its outer dim, rhs when common is its inner dim
+                    self.lhs_size(), self.rhs_size(), self.common_size(), 1.0, // M, N, K, alpha (double literal, converted to float)
+                    lhs, self.lhs_common_inner() ? self.common_size() : self.lhs_size(), // A, lda = row length of lhs as stored
+                    rhs, self.rhs_common_inner() ? self.common_size() : self.rhs_size(), // B, ldb = row length of rhs as stored
+                    0.0, dst, self.rhs_size()); // beta = 0 so dst is overwritten, C, ldc
+    }
+    state.pop_pop_push(state.stash.create<DenseTensorView>(self.result_type(), TypedCells(dst_cells))); // NOTE(review): presumably replaces the two operands on the stack with the result - confirm against State
+}
+
+InterpretedFunction::op_function my_select(CellType cell_type) { // picks the kernel matching the cell type
+    if (cell_type == ValueType::CellType::DOUBLE) {
+        return my_cblas_double_multi_matmul_op;
+    }
+    if (cell_type == ValueType::CellType::FLOAT) {
+        return my_cblas_float_multi_matmul_op;
+    }
+    abort(); // any other cell type terminates the process; check_input_type only admits FLOAT and DOUBLE
+}
+
+struct CommonDim { // locates the reduced (common) dimension among the two innermost dimensions of a type
+    bool valid; // true when dim is one of the last two dimensions
+    bool inner; // true when dim is the very last dimension
+    CommonDim(const ValueType &type, const vespalib::string &dim)
+        : valid(true), inner(false)
+    {
+        size_t size = type.dimensions().size(); // indexing below needs at least 2 dimensions; is_multi_matmul checks this via check_input_type first
+        if (type.dimensions()[size - 1].name == dim) {
+            inner = true;
+        } else if (type.dimensions()[size - 2].name != dim) {
+            valid = false; // dim is neither of the two innermost dimensions
+        }
+    }
+    const ValueType::Dimension &get(const ValueType &type) const { // the common dimension itself
+        size_t size = type.dimensions().size();
+        return type.dimensions()[size - (inner ? 1 : 2)];
+    }
+    const ValueType::Dimension &get(const TensorFunction &expr) const {
+        return get(expr.result_type());
+    }
+    const ValueType::Dimension &inv(const ValueType &type) const { // the other one of the two innermost dimensions
+        size_t size = type.dimensions().size();
+        return type.dimensions()[size - (inner ? 2 : 1)];
+    }
+    const ValueType::Dimension &inv(const TensorFunction &expr) const {
+        return inv(expr.result_type());
+    }
+};
+
+// Currently, non-matmul dimensions are required to be identical. This
+// restriction is added to reduce complexity and might be removed in
+// the future if/when a relevant use-case arises.
+struct DimPrefix { // compares all dimensions except the last two of a and b
+    bool valid; // true when a and b have the same dimension count and equal prefix dimensions
+    size_t size; // product of the prefix dimension sizes; only meaningful when valid
+    DimPrefix(const ValueType &a, const ValueType &b)
+        : valid(true), size(1)
+    {
+        if (a.dimensions().size() == b.dimensions().size()) {
+            for (size_t i = 0; i < (a.dimensions().size() - 2); ++i) { // relies on at least 2 dimensions; is_multi_matmul checks this via check_input_type first
+                if (a.dimensions()[i] == b.dimensions()[i]) {
+                    size *= a.dimensions()[i].size;
+                } else {
+                    valid = false; // the loop continues, so size may hold a partial product
+                }
+            }
+        } else {
+            valid = false;
+        }
+    }
+};
+
+bool check_input_type(const ValueType &type) { // an input must be dense, have at least 2 dimensions and use float or double cells
+    return (type.is_dense() &&
+            (type.dimensions().size() >= 2) && // CommonDim and DimPrefix index the last two dimensions
+            ((type.cell_type() == CellType::FLOAT) || (type.cell_type() == CellType::DOUBLE))); // the cell types my_select has kernels for
+}
+
+bool is_multi_matmul(const ValueType &a, const ValueType &b, const vespalib::string &reduce_dim) { // decides whether reduce(a*b,sum,reduce_dim) has the multi-matmul shape
+    if (check_input_type(a) && check_input_type(b) && (a.cell_type() == b.cell_type())) { // both inputs usable and of the same cell type
+        CommonDim cd_a(a, reduce_dim);
+        CommonDim cd_b(b, reduce_dim);
+        DimPrefix prefix(a, b);
+        return (cd_a.valid && cd_b.valid && prefix.valid &&
+                (b.dimension_index(cd_a.inv(a).name) == ValueType::Dimension::npos) && // a's non-common matrix dimension must not occur in b
+                (a.dimension_index(cd_b.inv(b).name) == ValueType::Dimension::npos)); // b's non-common matrix dimension must not occur in a
+    }
+    return false;
+}
+
+const TensorFunction &create_multi_matmul(const TensorFunction &a, const TensorFunction &b, // builds the optimized node; expects is_multi_matmul to hold for the inputs (optimize checks it first)
+                                          const vespalib::string &reduce_dim, const ValueType &result_type, Stash &stash) // the node is created in stash
+{
+    CommonDim cd_a(a.result_type(), reduce_dim);
+    CommonDim cd_b(b.result_type(), reduce_dim);
+    DimPrefix prefix(a.result_type(), b.result_type());
+    size_t a_size = cd_a.inv(a).size; // size of a's non-common matrix dimension
+    size_t b_size = cd_b.inv(b).size; // size of b's non-common matrix dimension
+    size_t common_size = cd_a.get(a).size;
+    bool a_is_lhs = (cd_a.inv(a).name < cd_b.inv(b).name); // the input whose non-common dimension name compares lower becomes lhs
+    if (a_is_lhs) {
+        return stash.create<DenseMultiMatMulFunction>(result_type, a, b, a_size, common_size, b_size, prefix.size, cd_a.inner, cd_b.inner);
+    } else {
+        return stash.create<DenseMultiMatMulFunction>(result_type, b, a, b_size, common_size, a_size, prefix.size, cd_b.inner, cd_a.inner); // operands and their attributes swapped
+    }
+}
+
+} // namespace vespalib::tensor::<unnamed>
+
+DenseMultiMatMulFunction::DenseMultiMatMulFunction(const ValueType &result_type, // stores the matmul geometry; children and result type go to the Op2 base
+                                                   const TensorFunction &lhs_in,
+                                                   const TensorFunction &rhs_in,
+                                                   size_t lhs_size, // size of lhs' non-common matrix dimension
+                                                   size_t common_size, // size of the reduced dimension
+                                                   size_t rhs_size, // size of rhs' non-common matrix dimension
+                                                   size_t matmul_cnt, // number of independent matrix products
+                                                   bool lhs_common_inner, // true when the common dimension is lhs' last dimension
+                                                   bool rhs_common_inner) // true when the common dimension is rhs' last dimension
+    : Super(result_type, lhs_in, rhs_in),
+      _lhs_size(lhs_size),
+      _common_size(common_size),
+      _rhs_size(rhs_size),
+      _matmul_cnt(matmul_cnt),
+      _lhs_common_inner(lhs_common_inner),
+      _rhs_common_inner(rhs_common_inner)
+{
+}
+
+DenseMultiMatMulFunction::~DenseMultiMatMulFunction() = default;
+
+InterpretedFunction::Instruction
+DenseMultiMatMulFunction::compile_self(const TensorEngine &, Stash &) const // emits the kernel instruction; neither argument is used
+{
+    auto op = my_select(lhs().result_type().cell_type()); // kernel chosen from lhs' cell type
+    return InterpretedFunction::Instruction(op, (uint64_t)(this)); // this pointer is passed as the instruction parameter and cast back inside the kernel
+}
+
+void
+DenseMultiMatMulFunction::visit_self(vespalib::ObjectVisitor &visitor) const // reports the matmul geometry to the visitor after the base class fields
+{
+    Super::visit_self(visitor);
+    visitor.visitInt("lhs_size", _lhs_size);
+    visitor.visitInt("common_size", _common_size);
+    visitor.visitInt("rhs_size", _rhs_size);
+    visitor.visitInt("matmul_cnt", _matmul_cnt);
+    visitor.visitBool("lhs_common_inner", _lhs_common_inner);
+    visitor.visitBool("rhs_common_inner", _rhs_common_inner);
+}
+
+const TensorFunction &
+DenseMultiMatMulFunction::optimize(const TensorFunction &expr, Stash &stash) // returns a DenseMultiMatMulFunction when expr matches the pattern, otherwise expr itself
+{
+    auto reduce = as<Reduce>(expr);
+    if (reduce && (reduce->aggr() == Aggr::SUM) && (reduce->dimensions().size() == 1)) { // sum over exactly one dimension
+        auto join = as<Join>(reduce->child());
+        if (join && (join->function() == Mul::f)) { // the reduced child must be a join using multiplication
+            const TensorFunction &a = join->lhs();
+            const TensorFunction &b = join->rhs();
+            if (is_multi_matmul(a.result_type(), b.result_type(), reduce->dimensions()[0])) {
+                return create_multi_matmul(a, b, reduce->dimensions()[0], expr.result_type(), stash);
+            }
+        }
+    }
+    return expr; // no match: expression is returned unchanged
+}
+
+} // namespace vespalib::tensor
diff --git a/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.h b/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.h
new file mode 100644
index 00000000000..f80ca307a59
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/dense/dense_multi_matmul_function.h
@@ -0,0 +1,52 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/tensor_function.h>
+#include "dense_tensor_view.h"
+
+namespace vespalib::tensor {
+
+/**
+ * Tensor function for multiple dense matrix multiplications: the two
+ * innermost dimensions of each input form a matrix and the identical
+ * leading dimensions enumerate the independent products (matmul_cnt).
+ **/
+class DenseMultiMatMulFunction : public eval::tensor_function::Op2
+{
+    using Super = eval::tensor_function::Op2;
+private:
+    size_t _lhs_size; // size of lhs' non-common matrix dimension
+    size_t _common_size; // size of the reduced (common) dimension
+    size_t _rhs_size; // size of rhs' non-common matrix dimension
+    size_t _matmul_cnt; // number of matrix products performed per evaluation
+    bool   _lhs_common_inner; // true when the common dimension is lhs' last dimension
+    bool   _rhs_common_inner; // true when the common dimension is rhs' last dimension
+
+public:
+    DenseMultiMatMulFunction(const eval::ValueType &result_type,
+                             const eval::TensorFunction &lhs_in,
+                             const eval::TensorFunction &rhs_in,
+                             size_t lhs_size,
+                             size_t common_size,
+                             size_t rhs_size,
+                             size_t matmul_cnt,
+                             bool lhs_common_inner,
+                             bool rhs_common_inner);
+    ~DenseMultiMatMulFunction() override;
+
+    bool result_is_mutable() const override { return true; } // the kernels write the result into a freshly created array
+
+    size_t lhs_size() const { return _lhs_size; }
+    size_t common_size() const { return _common_size; }
+    size_t rhs_size() const { return _rhs_size; }
+    size_t matmul_cnt() const { return _matmul_cnt; }
+    bool lhs_common_inner() const { return _lhs_common_inner; }
+    bool rhs_common_inner() const { return _rhs_common_inner; }
+
+    eval::InterpretedFunction::Instruction compile_self(const eval::TensorEngine &engine, Stash &stash) const override; // selects the float or double kernel
+    void visit_self(vespalib::ObjectVisitor &visitor) const override; // dumps the geometry fields
+    static const eval::TensorFunction &optimize(const eval::TensorFunction &expr, Stash &stash); // rewrites a matching reduce(join(a,b,mul),sum,dim) expression, else returns expr
+};
+
+} // namespace vespalib::tensor
author	Håvard Pettersen <havardpe@oath.com>	2020-04-27 14:07:26 +0000
committer	Håvard Pettersen <havardpe@oath.com>	2020-05-04 08:13:56 +0000
commit	a38091735759c16e86cab1353db1f439a3cd35c4 (patch)
tree	de913055842b7e54461867360bde42d15fb43b25
parent	e3f135fc9ccad48719e462397915c6efeaf450b6 (diff)