From e32c2fa3e9ce2656d3f0086d90b012e02c6c8535 Mon Sep 17 00:00:00 2001 From: HÃ¥vard Pettersen Date: Fri, 17 Jan 2020 11:14:05 +0000 Subject: matrix multiplication --- eval/CMakeLists.txt | 1 + .../tensor/dense_matmul_function/CMakeLists.txt | 8 + .../dense_matmul_function_test.cpp | 147 ++++++++++++++ .../vespa/eval/tensor/default_tensor_engine.cpp | 2 + eval/src/vespa/eval/tensor/dense/CMakeLists.txt | 1 + .../eval/tensor/dense/dense_matmul_function.cpp | 218 +++++++++++++++++++++ .../eval/tensor/dense/dense_matmul_function.h | 60 ++++++ 7 files changed, 437 insertions(+) create mode 100644 eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt create mode 100644 eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp create mode 100644 eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp create mode 100644 eval/src/vespa/eval/tensor/dense/dense_matmul_function.h diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index b1212650a3f..febb254c53e 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -33,6 +33,7 @@ vespa_define_module( src/tests/tensor/dense_generic_join src/tests/tensor/dense_inplace_join_function src/tests/tensor/dense_inplace_map_function + src/tests/tensor/dense_matmul_function src/tests/tensor/dense_remove_dimension_optimizer src/tests/tensor/dense_replace_type_function src/tests/tensor/dense_tensor_create_function diff --git a/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt b/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt new file mode 100644 index 00000000000..7234e8b9e69 --- /dev/null +++ b/eval/src/tests/tensor/dense_matmul_function/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_dense_matmul_function_test_app TEST + SOURCES + dense_matmul_function_test.cpp + DEPENDS + vespaeval +) +vespa_add_test(NAME eval_dense_matmul_function_test_app COMMAND eval_dense_matmul_function_test_app) diff --git a/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp new file mode 100644 index 00000000000..5d7c0be704e --- /dev/null +++ b/eval/src/tests/tensor/dense_matmul_function/dense_matmul_function_test.cpp @@ -0,0 +1,147 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace vespalib; +using namespace vespalib::eval; +using namespace vespalib::eval::test; +using namespace vespalib::tensor; +using namespace vespalib::eval::tensor_function; + +const TensorEngine &prod_engine = DefaultTensorEngine::ref(); + +void add_matrix(EvalFixture::ParamRepo &repo, const char *d1, size_t s1, const char *d2, size_t s2) { + for (bool float_cells: {false, true}) { + auto name = make_string("%s%zu%s%zu%s", d1, s1, d2, s2, float_cells ? "f" : ""); + auto type_str = make_string("tensor%s(%s[%zu],%s[%zu])", float_cells ? "" : "", d1, s1, d2, s2); + TensorSpec matrix(type_str); + for (size_t i = 0; i < s1; ++i) { + for (size_t j = 0; j < s2; ++j) { + double value = (i + s1 + s2) * 3.0 + (j + s2) * 7.0; + matrix.add({{d1, i}, {d2, j}}, value); + } + } + repo.add(name, matrix); + } +} + +EvalFixture::ParamRepo make_params() { + EvalFixture::ParamRepo repo; + add_matrix(repo, "a", 2, "d", 3); // inner/inner + add_matrix(repo, "a", 2, "b", 5); // inner/outer + add_matrix(repo, "b", 5, "c", 2); // outer/outer + add_matrix(repo, "a", 2, "c", 3); // not matching + //----------------------------------------------- + add_matrix(repo, "b", 5, "d", 3); // fixed param + return repo; +} +EvalFixture::ParamRepo param_repo = make_params(); + +void verify_optimized(const vespalib::string &expr, + size_t lhs_size, size_t common_size, size_t rhs_size, + bool lhs_inner, bool rhs_inner) +{ + EvalFixture slow_fixture(prod_engine, expr, param_repo, false); + EvalFixture fixture(prod_engine, expr, param_repo, true); + EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo)); + EXPECT_EQUAL(fixture.result(), slow_fixture.result()); + auto info = fixture.find_all(); + ASSERT_EQUAL(info.size(), 1u); + EXPECT_TRUE(info[0]->result_is_mutable()); + EXPECT_EQUAL(info[0]->lhs_size(), lhs_size); + EXPECT_EQUAL(info[0]->common_size(), common_size); + EXPECT_EQUAL(info[0]->rhs_size(), rhs_size); + EXPECT_EQUAL(info[0]->lhs_common_inner(), lhs_inner); + EXPECT_EQUAL(info[0]->rhs_common_inner(), rhs_inner); +} + +void verify_not_optimized(const vespalib::string &expr) { + EvalFixture slow_fixture(prod_engine, expr, param_repo, false); + EvalFixture fixture(prod_engine, expr, param_repo, true); + EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo)); + EXPECT_EQUAL(fixture.result(), slow_fixture.result()); + auto info = fixture.find_all(); + EXPECT_TRUE(info.empty()); +} + +TEST("require that matmul can be optimized") { + TEST_DO(verify_optimized("reduce(a2d3*b5d3,sum,d)", 2, 3, 5, true, true)); +} + +TEST("require that matmul with lambda can be optimized") { + TEST_DO(verify_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*y)),sum,d)", 2, 3, 5, true, true)); + TEST_DO(verify_optimized("reduce(join(a2d3,b5d3,f(x,y)(y*x)),sum,d)", 2, 3, 5, true, true)); +} + +TEST("require that expressions similar to matmul are not optimized") { + TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum,a)")); + TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum,b)")); + TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,prod,d)")); + TEST_DO(verify_not_optimized("reduce(a2d3*b5d3,sum)")); + TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x+y)),sum,d)")); + TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*x)),sum,d)")); + TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(y*y)),sum,d)")); + TEST_DO(verify_not_optimized("reduce(join(a2d3,b5d3,f(x,y)(x*y*1)),sum,d)")); + TEST_DO(verify_not_optimized("reduce(a2c3*b5d3,sum,d)")); + TEST_DO(verify_not_optimized("reduce(a2c3*b5d3,sum,c)")); +} + +TEST("require that xw product can be debug dumped") { + EvalFixture fixture(prod_engine, "reduce(a2d3*b5d3,sum,d)", param_repo, true); + auto info = fixture.find_all(); + ASSERT_EQUAL(info.size(), 1u); + fprintf(stderr, "%s\n", info[0]->as_string().c_str()); +} + +vespalib::string make_expr(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common, + bool float_a, bool float_b) +{ + return make_string("reduce(%s%s*%s%s,sum,%s)", a.c_str(), float_a ? "f" : "", b.c_str(), float_b ? "f" : "", common.c_str()); +} + +void verify_optimized_multi(const vespalib::string &a, const vespalib::string &b, const vespalib::string &common, + size_t lhs_size, size_t common_size, size_t rhs_size, + bool lhs_inner, bool rhs_inner) +{ + for (bool float_a: {false, true}) { + for (bool float_b: {false, true}) { + { + auto expr = make_expr(a, b, common, float_a, float_b); + TEST_STATE(expr.c_str()); + TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, lhs_inner, rhs_inner)); + } + { + auto expr = make_expr(b, a, common, float_b, float_a); + TEST_STATE(expr.c_str()); + TEST_DO(verify_optimized(expr, lhs_size, common_size, rhs_size, lhs_inner, rhs_inner)); + } + } + } +} + +TEST("require that matmul inner/inner works correctly") { + TEST_DO(verify_optimized_multi("a2d3", "b5d3", "d", 2, 3, 5, true, true)); +} + +TEST("require that matmul inner/outer works correctly") { + TEST_DO(verify_optimized_multi("a2b5", "b5d3", "b", 2, 5, 3, true, false)); +} + +TEST("require that matmul outer/outer works correctly") { + TEST_DO(verify_optimized_multi("b5c2", "b5d3", "b", 2, 5, 3, false, false)); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp index b4449309812..a9e1ad84eb7 100644 --- a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp +++ b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp @@ -10,6 +10,7 @@ #include "dense/typed_dense_tensor_builder.h" #include "dense/dense_dot_product_function.h" #include "dense/dense_xw_product_function.h" +#include "dense/dense_matmul_function.h" #include "dense/dense_fast_rename_optimizer.h" #include "dense/dense_add_dimension_optimizer.h" #include "dense/dense_remove_dimension_optimizer.h" @@ -269,6 +270,7 @@ DefaultTensorEngine::optimize(const TensorFunction &expr, Stash &stash) const child.set(DenseTensorPeekFunction::optimize(child.get(), stash)); child.set(DenseDotProductFunction::optimize(child.get(), stash)); child.set(DenseXWProductFunction::optimize(child.get(), stash)); + child.set(DenseMatMulFunction::optimize(child.get(), stash)); child.set(DenseFastRenameOptimizer::optimize(child.get(), stash)); child.set(DenseAddDimensionOptimizer::optimize(child.get(), stash)); child.set(DenseRemoveDimensionOptimizer::optimize(child.get(), stash)); diff --git a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt index 2ad54d48ab3..635a49cb4a9 100644 --- a/eval/src/vespa/eval/tensor/dense/CMakeLists.txt +++ b/eval/src/vespa/eval/tensor/dense/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(eval_tensor_dense OBJECT dense_fast_rename_optimizer.cpp dense_inplace_join_function.cpp dense_inplace_map_function.cpp + dense_matmul_function.cpp dense_remove_dimension_optimizer.cpp dense_replace_type_function.cpp dense_tensor.cpp diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp new file mode 100644 index 00000000000..7ba186b622a --- /dev/null +++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.cpp @@ -0,0 +1,218 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "dense_matmul_function.h" +#include "dense_tensor_view.h" +#include +#include +#include +#include +#include +#include +#include + +namespace vespalib::tensor { + +using eval::ValueType; +using eval::TensorFunction; +using eval::as; +using eval::Aggr; +using namespace eval::tensor_function; +using namespace eval::operation; + +namespace { + +template +struct HWSupport { + static double call(hwaccelrated::IAccelrated *, const LCT *lhs, const RCT *rhs, size_t len) { + double result = 0.0; + for (size_t i = 0; i < len; ++i) { + result += (lhs[i] * rhs[i]); + } + return result; + } +}; +template <> struct HWSupport { + static double call(hwaccelrated::IAccelrated *hw, const float *lhs, const float *rhs, size_t len) { + return hw->dotProduct(lhs, rhs, len); + } +}; +template <> struct HWSupport { + static double call(hwaccelrated::IAccelrated *hw, const double *lhs, const double *rhs, size_t len) { + return hw->dotProduct(lhs, rhs, len); + } +}; + +template +double sparse_dot_product(const LCT *lhs, const RCT *rhs, size_t lhs_size, size_t common_size, size_t rhs_size) { + double result = 0.0; + for (size_t i = 0; i < common_size; ++i) { + result += ((*lhs) * (*rhs)); + lhs += (lhs_common_inner ? 1 : lhs_size); + rhs += (rhs_common_inner ? 1 : rhs_size); + } + return result; +} + +template +void my_matmul_op(eval::InterpretedFunction::State &state, uint64_t param) { + const DenseMatMulFunction::Self &self = *((const DenseMatMulFunction::Self *)(param)); + using OCT = typename eval::UnifyCellTypes::type; + auto lhs_cells = DenseTensorView::typify_cells(state.peek(1)); + auto rhs_cells = DenseTensorView::typify_cells(state.peek(0)); + auto dst_cells = state.stash.create_array(self.lhs_size * self.rhs_size); + OCT *dst = dst_cells.begin(); + const LCT *lhs = lhs_cells.cbegin(); + for (size_t i = 0; i < self.lhs_size; ++i) { + const RCT *rhs = rhs_cells.cbegin(); + for (size_t j = 0; j < self.rhs_size; ++j) { + if (lhs_common_inner && rhs_common_inner) { + *dst++ = HWSupport::call(self.hw.get(), lhs, rhs, self.common_size); + } else { + *dst++ = sparse_dot_product(lhs, rhs, self.lhs_size, self.common_size, self.rhs_size); + } + rhs += (rhs_common_inner ? self.common_size : 1); + } + lhs += (lhs_common_inner ? self.common_size : 1); + } + state.pop_pop_push(state.stash.create(self.result_type, TypedCells(dst_cells))); +} + +template +struct MyMatMulOp { + template + static auto get_fun() { return my_matmul_op; } +}; + +template +eval::InterpretedFunction::op_function my_select2(CellType lct, CellType rct, + bool rhs_common_inner) +{ + if (rhs_common_inner) { + return select_2>(lct, rct); + } else { + return select_2>(lct, rct); + } +} + +eval::InterpretedFunction::op_function my_select(CellType lct, CellType rct, + bool lhs_common_inner, bool rhs_common_inner) +{ + if (lhs_common_inner) { + return my_select2(lct, rct, rhs_common_inner); + } else { + return my_select2(lct, rct, rhs_common_inner); + } +} + +bool is_matrix(const ValueType &type) { + return (type.is_dense() && (type.dimensions().size() == 2)); +} + +bool is_matmul(const eval::ValueType &a, const eval::ValueType &b, + const vespalib::string &reduce_dim, const eval::ValueType &result_type) +{ + size_t npos = ValueType::Dimension::npos; + return (is_matrix(a) && is_matrix(b) && is_matrix(result_type) && + (a.dimension_index(reduce_dim) != npos) && + (b.dimension_index(reduce_dim) != npos)); +} + +const ValueType::Dimension &dim(const TensorFunction &expr, size_t idx) { + return expr.result_type().dimensions()[idx]; +} + +size_t inv(size_t idx) { return (1 - idx); } + +const TensorFunction &create_matmul(const TensorFunction &a, const TensorFunction &b, + const vespalib::string &reduce_dim, const ValueType &result_type, Stash &stash) { + size_t a_idx = a.result_type().dimension_index(reduce_dim); + size_t b_idx = b.result_type().dimension_index(reduce_dim); + assert(a_idx != ValueType::Dimension::npos); + assert(b_idx != ValueType::Dimension::npos); + assert(dim(a, a_idx).size == dim(b, b_idx).size); + bool a_common_inner = (a_idx == 1); + bool b_common_inner = (b_idx == 1); + size_t a_size = dim(a, inv(a_idx)).size; + size_t b_size = dim(b, inv(b_idx)).size; + size_t common_size = dim(a, a_idx).size; + bool a_is_lhs = (dim(a, inv(a_idx)).name < dim(b, inv(b_idx)).name); + if (a_is_lhs) { + return stash.create(result_type, a, b, a_size, common_size, b_size, a_common_inner, b_common_inner); + } else { + return stash.create(result_type, b, a, b_size, common_size, a_size, b_common_inner, a_common_inner); + } +} + +} // namespace vespalib::tensor:: + +DenseMatMulFunction::Self::Self(const eval::ValueType &result_type_in, + size_t lhs_size_in, + size_t common_size_in, + size_t rhs_size_in) + : result_type(result_type_in), + lhs_size(lhs_size_in), + common_size(common_size_in), + rhs_size(rhs_size_in), + hw(hwaccelrated::IAccelrated::getAccelrator()) +{ +} + +DenseMatMulFunction::Self::~Self() = default; + +DenseMatMulFunction::DenseMatMulFunction(const eval::ValueType &result_type, + const eval::TensorFunction &lhs_in, + const eval::TensorFunction &rhs_in, + size_t lhs_size, + size_t common_size, + size_t rhs_size, + bool lhs_common_inner, + bool rhs_common_inner) + : Super(result_type, lhs_in, rhs_in), + _lhs_size(lhs_size), + _common_size(common_size), + _rhs_size(rhs_size), + _lhs_common_inner(lhs_common_inner), + _rhs_common_inner(rhs_common_inner) +{ +} + +DenseMatMulFunction::~DenseMatMulFunction() = default; + +eval::InterpretedFunction::Instruction +DenseMatMulFunction::compile_self(Stash &stash) const +{ + Self &self = stash.create(result_type(), _lhs_size, _common_size, _rhs_size); + auto op = my_select(lhs().result_type().cell_type(), rhs().result_type().cell_type(), + _lhs_common_inner, _rhs_common_inner); + return eval::InterpretedFunction::Instruction(op, (uint64_t)(&self)); +} + +void +DenseMatMulFunction::visit_self(vespalib::ObjectVisitor &visitor) const +{ + Super::visit_self(visitor); + visitor.visitInt("lhs_size", _lhs_size); + visitor.visitInt("common_size", _common_size); + visitor.visitInt("rhs_size", _rhs_size); + visitor.visitBool("lhs_common_inner", _lhs_common_inner); + visitor.visitBool("rhs_common_inner", _rhs_common_inner); +} + +const TensorFunction & +DenseMatMulFunction::optimize(const eval::TensorFunction &expr, Stash &stash) +{ + auto reduce = as(expr); + if (reduce && (reduce->aggr() == Aggr::SUM) && (reduce->dimensions().size() == 1)) { + auto join = as(reduce->child()); + if (join && (join->function() == Mul::f)) { + const TensorFunction &a = join->lhs(); + const TensorFunction &b = join->rhs(); + if (is_matmul(a.result_type(), b.result_type(), reduce->dimensions()[0], expr.result_type())) { + return create_matmul(a, b, reduce->dimensions()[0], expr.result_type(), stash); + } + } + } + return expr; +} + +} // namespace vespalib::tensor diff --git a/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h new file mode 100644 index 00000000000..276a455bda4 --- /dev/null +++ b/eval/src/vespa/eval/tensor/dense/dense_matmul_function.h @@ -0,0 +1,60 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include "dense_tensor_view.h" +#include + +namespace vespalib::tensor { + +/** + * Tensor function for dense matrix multiplication. + **/ +class DenseMatMulFunction : public eval::tensor_function::Op2 +{ + using Super = eval::tensor_function::Op2; +public: + struct Self { + eval::ValueType result_type; + size_t lhs_size; + size_t common_size; + size_t rhs_size; + hwaccelrated::IAccelrated::UP hw; + Self(const eval::ValueType &result_type_in, + size_t lhs_size_in, size_t common_size_in, size_t rhs_size_in); + ~Self(); + }; + +private: + size_t _lhs_size; + size_t _common_size; + size_t _rhs_size; + bool _lhs_common_inner; + bool _rhs_common_inner; + +public: + DenseMatMulFunction(const eval::ValueType &result_type, + const eval::TensorFunction &lhs_in, + const eval::TensorFunction &rhs_in, + size_t lhs_size, + size_t common_size, + size_t rhs_size, + bool lhs_common_inner, + bool rhs_common_inner); + ~DenseMatMulFunction(); + + bool result_is_mutable() const override { return true; } + + size_t lhs_size() const { return _lhs_size; } + size_t common_size() const { return _common_size; } + size_t rhs_size() const { return _rhs_size; } + bool lhs_common_inner() const { return _lhs_common_inner; } + bool rhs_common_inner() const { return _rhs_common_inner; } + + eval::InterpretedFunction::Instruction compile_self(Stash &stash) const override; + void visit_self(vespalib::ObjectVisitor &visitor) const override; + static const eval::TensorFunction &optimize(const eval::TensorFunction &expr, Stash &stash); +}; + +} // namespace vespalib::tensor -- cgit v1.2.3