From e32c2fa3e9ce2656d3f0086d90b012e02c6c8535 Mon Sep 17 00:00:00 2001
From: Håvard Pettersen <havardpe@oath.com>
Date: Fri, 17 Jan 2020 11:14:05 +0000
Subject: matrix multiplication

---
 eval/CMakeLists.txt                                |   1 +
 .../tensor/dense_matmul_function/CMakeLists.txt    |   8 +
 .../dense_matmul_function_test.cpp                 | 147 ++++++++++++++
 .../vespa/eval/tensor/default_tensor_engine.cpp    |   2 +
 eval/src/vespa/eval/tensor/dense/CMakeLists.txt    |   1 +
 .../eval/tensor/dense/dense_matmul_function.cpp    | 218 +++++++++++++++++++++
 .../eval/tensor/dense/dense_matmul_function.h      |  60 ++++++
 7 files changed, 437 insertions(+)
 create mode 100644 eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt
 create mode 100644 eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
 create mode 100644 eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
 create mode 100644 eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index b1212650a3f..febb254c53e 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -33,6 +33,7 @@ vespa_define_module(
     src/tests/tensor/dense_generic_join
     src/tests/tensor/dense_inplace_join_function
     src/tests/tensor/dense_inplace_map_function
+    src/tests/tensor/dense_matmul_function    
     src/tests/tensor/dense_remove_dimension_optimizer
     src/tests/tensor/dense_replace_type_function
     src/tests/tensor/dense_tensor_create_function
diff --git a/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt b/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt
new file mode 100644
index 00000000000..7234e8b9e69
--- /dev/null
+++ b/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_dense_matmul_function_test_app TEST
+    SOURCES
+    dense_matmul_function_test.cpp
+    DEPENDS
+    vespaeval
+)
+vespa_add_test(NAME eval_dense_matmul_function_test_app COMMAND eval_dense_matmul_function_test_app)
diff --git a/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
new file mode 100644
index 00000000000..5d7c0be704e
--- /dev/null
+++ b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp
@@ -0,0 +1,147 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/eval/eval/tensor_function.h>
+#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/eval/simple_tensor.h>
+#include <vespa/eval/eval/simple_tensor_engine.h>
+#include <vespa/eval/tensor/default_tensor_engine.h>
+#include <vespa/eval/tensor/dense/dense_matmul_function.h>
+#include <vespa/eval/tensor/dense/dense_tensor.h>
+#include <vespa/eval/tensor/dense/dense_tensor_view.h>
+#include <vespa/eval/eval/test/tensor_model.hpp>
+#include <vespa/eval/eval/test/eval_fixture.h>
+
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/util/stash.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+using namespace vespalib::eval::test;
+using namespace vespalib::tensor;
+using namespace vespalib::eval::tensor_function;
+
+const TensorEngine &prod_engine = DefaultTensorEngine::ref();
+
+void add_matrix(EvalFixture::ParamRepo &repo, const char *d1, size_t s1, const char *d2, size_t s2) {
+    for (bool float_cells: {false, true}) {
+        auto name = make_string("%s%zu%s%zu%s", d1, s1, d2, s2, float_cells ? "f" : "");
+        auto type_str = make_string("tensor%s(%s[%zu],%s[%zu])", float_cells ? "<float>" : "", d1, s1, d2, s2);
+        TensorSpec matrix(type_str);
+        for (size_t i = 0; i < s1; ++i) {
+            for (size_t j = 0; j < s2; ++j) {
+                double value = (i + s1 + s2) * 3.0 + (j + s2) * 7.0;
+                matrix.add({{d1, i}, {d2, j}}, value);
+            }
+        }
+        repo.add(name, matrix);
+    }
+}
+
+EvalFixture::ParamRepo make_params() {
+    EvalFixture::ParamRepo repo;
+    add_matrix(repo, "a", 2, "d", 3); // inner/inner
+    add_matrix(repo, "a", 2, "b", 5); // inner/outer
+    add_matrix(repo, "b", 5, "c", 2); // outer/outer
+    add_matrix(repo, "a", 2, "c", 3); // not matching
+    //-----------------------------------------------
+    add_matrix(repo, "b", 5, "d", 3); // fixed param
+    return repo;
+}
+EvalFixture::ParamRepo param_repo = make_params();
+
+void verify_optimized(const vespalib::string &expr,
+                      size_t lhs_size, size_t common_size, size_t rhs_size,
+                      bool lhs_inner, bool rhs_inner)
+{
+    EvalFixture slow_fixture(prod_engine, expr, param_repo, false);
+    EvalFixture fixture(prod_engine, expr, param_repo, true);
+    EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo));
+    EXPECT_EQUAL(fixture.result(), slow_fixture.result());
+    auto info = fixture.find_all<DenseMatMulFunction>();
+    ASSERT_EQUAL(info.size(), 1u);
+    EXPECT_TRUE(info[0]->result_is_mutable());
+    EXPECT_EQUAL(info[0]->lhs_size(), lhs_size);
+    EXPECT_EQUAL(info[0]->common_size(), common_size);
+    EXPECT_EQUAL(info[0]->rhs_size(), rhs_size);
+    EXPECT_EQUAL(info[0]->lhs_common_inner(), lhs_inner);
+    EXPECT_EQUAL(info[0]->rhs_common_inner(), rhs_inner);
+}
+
+void verify_not_optimized(const vespalib::string &expr) {
+    EvalFixture slow_fixture(prod_engine, expr, param_repo, false);
+    EvalFixture fixture(prod_engine, expr, param_repo, true);
+    EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo));
+    EXPECT_EQUAL(fixture.result(), slow_fixture.result());
+    auto info = fixture.find_all<DenseMatMulFunction>();
+    EXPECT_TRUE(info.empty());
+}
+
+TEST("require that matmul can be optimized") {
+    TEST_DO(verify_optimized("reduce(a2d3*b5d3,sum,d)", 2, 3, 5, true, true));
+}
+
+TEST("require that matmul with lambda can be optimized") {
+    TEST_DO(verify_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*y)),sum,d)", 2, 3, 5, true, true));
+    TEST_DO(verify_optimized("reduce(join(a2d3,b5d3,f(x,y)(y*x)),sum,d)", 2, 3, 5, true, true));
+}
+
+TEST("require that expressions similar to matmul are not optimized") {
+    TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum,a)"));
+    TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum,b)"));
+    TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,prod,d)"));
+    TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum)"));
+    TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x+y)),sum,d)"));
+    TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*x)),sum,d)"));
+    TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(y*y)),sum,d)"));
+    TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*y*1)),sum,d)"));
+    TEST_DO(verify_not_optimized("reduce(a2c3*b5d3,sum,d)"));
+    TEST_DO(verify_not_optimized("reduce(a2c3*b5d3,sum,c)"));
+}
+
+TEST("require that xw product can be debug dumped") {
+    EvalFixture fixture(prod_engine, "reduce(a2d3*b5d3,sum,d)", param_repo, true);
+    auto info = fixture.find_all<DenseMatMulFunction>();
+    ASSERT_EQUAL(info.size(), 1u);
+    fprintf(stderr, "%s\n", info[0]->as_string().c_str());
+}
+
+vespalib::string make_expr(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common,
+                           bool float_a, bool float_b)
+{
+    return make_string("reduce(%s%s*%s%s,sum,%s)", a.c_str(), float_a ? "f" : "", b.c_str(), float_b ? "f" : "", common.c_str());
+}
+
+void verify_optimized_multi(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common,
+                            size_t lhs_size, size_t common_size, size_t rhs_size,
+                            bool lhs_inner, bool rhs_inner)
+{
+    for (bool float_a: {false, true}) {
+        for (bool float_b: {false, true}) {
+            {
+                auto expr = make_expr(a, b, common, float_a, float_b);
+                TEST_STATE(expr.c_str());
+                TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, lhs_inner, rhs_inner));
+            }
+            {
+                auto expr = make_expr(b, a, common, float_b, float_a);
+                TEST_STATE(expr.c_str());
+                TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, lhs_inner, rhs_inner));
+            }
+        }
+    }
+}
+
+TEST("require that matmul inner/inner works correctly") {
+    TEST_DO(verify_optimized_multi("a2d3", "b5d3", "d", 2, 3, 5, true, true));
+}
+
+TEST("require that matmul inner/outer works correctly") {
+    TEST_DO(verify_optimized_multi("a2b5", "b5d3", "b", 2, 5, 3, true, false));
+}
+
+TEST("require that matmul outer/outer works correctly") {
+    TEST_DO(verify_optimized_multi("b5c2", "b5d3", "b", 2, 5, 3, false, false));
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
index b4449309812..a9e1ad84eb7 100644
--- a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
+++ b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp
@@ -10,6 +10,7 @@
 #include "dense/typed_dense_tensor_builder.h"
 #include "dense/dense_dot_product_function.h"
 #include "dense/dense_xw_product_function.h"
+#include "dense/dense_matmul_function.h"
 #include "dense/dense_fast_rename_optimizer.h"
 #include "dense/dense_add_dimension_optimizer.h"
 #include "dense/dense_remove_dimension_optimizer.h"
@@ -269,6 +270,7 @@ DefaultTensorEngine::optimize(const TensorFunction &expr, Stash &stash) const
         child.set(DenseTensorPeekFunction::optimize(child.get(), stash));
         child.set(DenseDotProductFunction::optimize(child.get(), stash));
         child.set(DenseXWProductFunction::optimize(child.get(), stash));
+        child.set(DenseMatMulFunction::optimize(child.get(), stash));
         child.set(DenseFastRenameOptimizer::optimize(child.get(), stash));
         child.set(DenseAddDimensionOptimizer::optimize(child.get(), stash));
         child.set(DenseRemoveDimensionOptimizer::optimize(child.get(), stash));
diff --git a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
index 2ad54d48ab3..635a49cb4a9 100644
--- a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
+++ b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(eval_tensor_dense OBJECT
     dense_fast_rename_optimizer.cpp
     dense_inplace_join_function.cpp
     dense_inplace_map_function.cpp
+    dense_matmul_function.cpp
     dense_remove_dimension_optimizer.cpp
     dense_replace_type_function.cpp
     dense_tensor.cpp
diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
new file mode 100644
index 00000000000..7ba186b622a
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp
@@ -0,0 +1,218 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dense_matmul_function.h"
+#include "dense_tensor_view.h"
+#include <vespa/vespalib/objects/objectvisitor.h>
+#include <vespa/vespalib/util/overload.h>
+#include <vespa/vespalib/util/visit_ranges.h>
+#include <vespa/eval/eval/value.h>
+#include <vespa/eval/eval/operation.h>
+#include <vespa/eval/tensor/tensor.h>
+#include <assert.h>
+
+namespace vespalib::tensor {
+
+using eval::ValueType;
+using eval::TensorFunction;
+using eval::as;
+using eval::Aggr;
+using namespace eval::tensor_function;
+using namespace eval::operation;
+
+namespace {
+
+template <typename LCT, typename RCT>
+struct HWSupport {
+    static double call(hwaccelrated::IAccelrated *, const LCT *lhs, const RCT *rhs, size_t len) {
+        double result = 0.0;
+        for (size_t i = 0; i < len; ++i) {
+            result += (lhs[i] * rhs[i]);
+        }
+        return result;
+    }
+};
+template <> struct HWSupport<float, float> {
+    static double call(hwaccelrated::IAccelrated *hw, const float *lhs, const float *rhs, size_t len) {
+        return hw->dotProduct(lhs, rhs, len);
+    }
+};
+template <> struct HWSupport<double, double> {
+    static double call(hwaccelrated::IAccelrated *hw, const double *lhs, const double *rhs, size_t len) {
+        return hw->dotProduct(lhs, rhs, len);
+    }
+};
+
+template <typename LCT, typename RCT, bool lhs_common_inner, bool rhs_common_inner>
+double sparse_dot_product(const LCT *lhs, const RCT *rhs, size_t lhs_size, size_t common_size, size_t rhs_size) {
+    double result = 0.0;
+    for (size_t i = 0; i < common_size; ++i) {
+        result += ((*lhs) * (*rhs));
+        lhs += (lhs_common_inner ? 1 : lhs_size);
+        rhs += (rhs_common_inner ? 1 : rhs_size);
+    }
+    return result;
+}
+
+template <typename LCT, typename RCT, bool lhs_common_inner, bool rhs_common_inner>
+void my_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) {
+    const DenseMatMulFunction::Self &self = *((const DenseMatMulFunction::Self *)(param));
+    using OCT = typename eval::UnifyCellTypes<LCT,RCT>::type;
+    auto lhs_cells = DenseTensorView::typify_cells<LCT>(state.peek(1));
+    auto rhs_cells = DenseTensorView::typify_cells<RCT>(state.peek(0));
+    auto dst_cells = state.stash.create_array<OCT>(self.lhs_size * self.rhs_size);
+    OCT *dst = dst_cells.begin();
+    const LCT *lhs = lhs_cells.cbegin();
+    for (size_t i = 0; i < self.lhs_size; ++i) {
+        const RCT *rhs = rhs_cells.cbegin();
+        for (size_t j = 0; j < self.rhs_size; ++j) {
+            if (lhs_common_inner && rhs_common_inner) {
+                *dst++ = HWSupport<LCT,RCT>::call(self.hw.get(), lhs, rhs, self.common_size);
+            } else {
+                *dst++ = sparse_dot_product<LCT,RCT,lhs_common_inner,rhs_common_inner>(lhs, rhs, self.lhs_size, self.common_size, self.rhs_size);
+            }
+            rhs += (rhs_common_inner ? self.common_size : 1);
+        }
+        lhs += (lhs_common_inner ? self.common_size : 1);
+    }
+    state.pop_pop_push(state.stash.create<DenseTensorView>(self.result_type, TypedCells(dst_cells)));
+}
+
+template <bool lhs_common_inner, bool rhs_common_inner>
+struct MyMatMulOp {
+    template <typename LCT, typename RCT>
+    static auto get_fun() { return my_matmul_op<LCT,RCT,lhs_common_inner,rhs_common_inner>; }
+};
+
+template <bool lhs_common_inner>
+eval::InterpretedFunction::op_function my_select2(CellType lct, CellType rct,
+                                                  bool rhs_common_inner)
+{
+    if (rhs_common_inner) {
+        return select_2<MyMatMulOp<lhs_common_inner,true>>(lct, rct);
+    } else {
+        return select_2<MyMatMulOp<lhs_common_inner,false>>(lct, rct);
+    }
+}
+
+eval::InterpretedFunction::op_function my_select(CellType lct, CellType rct,
+                                                 bool lhs_common_inner, bool rhs_common_inner)
+{
+    if (lhs_common_inner) {
+        return my_select2<true>(lct, rct, rhs_common_inner);
+    } else {
+        return my_select2<false>(lct, rct, rhs_common_inner);
+    }
+}
+
+bool is_matrix(const ValueType &type) {
+    return (type.is_dense() && (type.dimensions().size() == 2));
+}
+
+bool is_matmul(const eval::ValueType &a, const eval::ValueType &b,
+               const vespalib::string &reduce_dim, const eval::ValueType &result_type)
+{
+    size_t npos = ValueType::Dimension::npos;
+    return (is_matrix(a) && is_matrix(b) && is_matrix(result_type) &&
+            (a.dimension_index(reduce_dim) != npos) &&
+            (b.dimension_index(reduce_dim) != npos));
+}
+
+const ValueType::Dimension &dim(const TensorFunction &expr, size_t idx) {
+    return expr.result_type().dimensions()[idx];
+}
+
+size_t inv(size_t idx) { return (1 - idx); }
+
+const TensorFunction &create_matmul(const TensorFunction &a, const TensorFunction &b,
+                                    const vespalib::string &reduce_dim, const ValueType &result_type, Stash &stash) {
+    size_t a_idx = a.result_type().dimension_index(reduce_dim);
+    size_t b_idx = b.result_type().dimension_index(reduce_dim);
+    assert(a_idx != ValueType::Dimension::npos);
+    assert(b_idx != ValueType::Dimension::npos);
+    assert(dim(a, a_idx).size == dim(b, b_idx).size);
+    bool a_common_inner = (a_idx == 1);
+    bool b_common_inner = (b_idx == 1);
+    size_t a_size = dim(a, inv(a_idx)).size;
+    size_t b_size = dim(b, inv(b_idx)).size;
+    size_t common_size = dim(a, a_idx).size;
+    bool a_is_lhs = (dim(a, inv(a_idx)).name < dim(b, inv(b_idx)).name);
+    if (a_is_lhs) {
+        return stash.create<DenseMatMulFunction>(result_type, a, b, a_size, common_size, b_size, a_common_inner, b_common_inner);
+    } else {
+        return stash.create<DenseMatMulFunction>(result_type, b, a, b_size, common_size, a_size, b_common_inner, a_common_inner);
+    }
+}
+
+} // namespace vespalib::tensor::<unnamed>
+
+DenseMatMulFunction::Self::Self(const eval::ValueType &result_type_in,
+                                size_t lhs_size_in,
+                                size_t common_size_in,
+                                size_t rhs_size_in)
+    : result_type(result_type_in),
+      lhs_size(lhs_size_in),
+      common_size(common_size_in),
+      rhs_size(rhs_size_in),
+      hw(hwaccelrated::IAccelrated::getAccelrator())
+{
+}
+
+DenseMatMulFunction::Self::~Self() = default;
+
+DenseMatMulFunction::DenseMatMulFunction(const eval::ValueType &result_type,
+                                         const eval::TensorFunction &lhs_in,
+                                         const eval::TensorFunction &rhs_in,
+                                         size_t lhs_size,
+                                         size_t common_size,
+                                         size_t rhs_size,
+                                         bool lhs_common_inner,
+                                         bool rhs_common_inner)
+    : Super(result_type, lhs_in, rhs_in),
+      _lhs_size(lhs_size),
+      _common_size(common_size),
+      _rhs_size(rhs_size),
+      _lhs_common_inner(lhs_common_inner),
+      _rhs_common_inner(rhs_common_inner)
+{
+}
+
+DenseMatMulFunction::~DenseMatMulFunction() = default;
+
+eval::InterpretedFunction::Instruction
+DenseMatMulFunction::compile_self(Stash &stash) const
+{
+    Self &self = stash.create<Self>(result_type(), _lhs_size, _common_size, _rhs_size);
+    auto op = my_select(lhs().result_type().cell_type(), rhs().result_type().cell_type(),
+                        _lhs_common_inner, _rhs_common_inner);
+    return eval::InterpretedFunction::Instruction(op, (uint64_t)(&self));
+}
+
+void
+DenseMatMulFunction::visit_self(vespalib::ObjectVisitor &visitor) const
+{
+    Super::visit_self(visitor);
+    visitor.visitInt("lhs_size", _lhs_size);
+    visitor.visitInt("common_size", _common_size);
+    visitor.visitInt("rhs_size", _rhs_size);
+    visitor.visitBool("lhs_common_inner", _lhs_common_inner);
+    visitor.visitBool("rhs_common_inner", _rhs_common_inner);
+}
+
+const TensorFunction &
+DenseMatMulFunction::optimize(const eval::TensorFunction &expr, Stash &stash)
+{
+    auto reduce = as<Reduce>(expr);
+    if (reduce && (reduce->aggr() == Aggr::SUM) && (reduce->dimensions().size() == 1)) {
+        auto join = as<Join>(reduce->child());
+        if (join && (join->function() == Mul::f)) {
+            const TensorFunction &a = join->lhs();
+            const TensorFunction &b = join->rhs();
+            if (is_matmul(a.result_type(), b.result_type(), reduce->dimensions()[0], expr.result_type())) {
+                return create_matmul(a, b, reduce->dimensions()[0], expr.result_type(), stash);
+            }
+        }
+    }
+    return expr;
+}
+
+} // namespace vespalib::tensor
diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
new file mode 100644
index 00000000000..276a455bda4
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h
@@ -0,0 +1,60 @@
+// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/tensor_function.h>
+#include "dense_tensor_view.h"
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+
+namespace vespalib::tensor {
+
+/**
+ * Tensor function for dense matrix multiplication.
+ **/
+class DenseMatMulFunction : public eval::tensor_function::Op2
+{
+    using Super = eval::tensor_function::Op2;
+public:
+    struct Self {
+        eval::ValueType result_type;
+        size_t lhs_size;
+        size_t common_size;
+        size_t rhs_size;
+        hwaccelrated::IAccelrated::UP hw;
+        Self(const eval::ValueType &result_type_in,
+             size_t lhs_size_in, size_t common_size_in, size_t rhs_size_in);
+        ~Self();
+    };
+
+private:
+    size_t _lhs_size;
+    size_t _common_size;
+    size_t _rhs_size;
+    bool   _lhs_common_inner;
+    bool   _rhs_common_inner;
+
+public:
+    DenseMatMulFunction(const eval::ValueType &result_type,
+                        const eval::TensorFunction &lhs_in,
+                        const eval::TensorFunction &rhs_in,
+                        size_t lhs_size,
+                        size_t common_size,
+                        size_t rhs_size,
+                        bool lhs_common_inner,
+                        bool rhs_common_inner);
+    ~DenseMatMulFunction();
+
+    bool result_is_mutable() const override { return true; }
+
+    size_t lhs_size() const { return _lhs_size; }
+    size_t common_size() const { return _common_size; }
+    size_t rhs_size() const { return _rhs_size; }
+    bool lhs_common_inner() const { return _lhs_common_inner; }
+    bool rhs_common_inner() const { return _rhs_common_inner; }
+
+    eval::InterpretedFunction::Instruction compile_self(Stash &stash) const override;
+    void visit_self(vespalib::ObjectVisitor &visitor) const override;
+    static const eval::TensorFunction &optimize(const eval::TensorFunction &expr, Stash &stash);
+};
+
+} // namespace vespalib::tensor
-- 
cgit v1.2.3