diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2020-11-20 08:14:16 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-20 08:14:16 +0100 |
commit | b7f88b27a8e2183981a8bc8966f181e816e0dc56 (patch) | |
tree | 7037e0571054ad364d662c356513d5411734f49b /eval | |
parent | 9f3970a99e29f29885b23a9373ce595d1f9ed687 (diff) | |
parent | aef0081a38061e9fa69b32a3351ae14c357734eb (diff) |
Merge pull request #15396 from vespa-engine/havardpe/split-reduce-operations-and-combine-adjacent-dimensions
combine dimensions and split reduce operations
Diffstat (limited to 'eval')
7 files changed, 275 insertions, 84 deletions
diff --git a/eval/src/tests/eval/aggr/aggr_test.cpp b/eval/src/tests/eval/aggr/aggr_test.cpp index db000840f59..9045df68305 100644 --- a/eval/src/tests/eval/aggr/aggr_test.cpp +++ b/eval/src/tests/eval/aggr/aggr_test.cpp @@ -29,6 +29,16 @@ TEST("require that aggr::is_simple works as expected") { EXPECT_TRUE (aggr::is_simple(Aggr::MIN)); } +TEST("require that aggr::is_ident works as expected") { + EXPECT_TRUE (aggr::is_ident(Aggr::AVG)); + EXPECT_FALSE(aggr::is_ident(Aggr::COUNT)); + EXPECT_TRUE (aggr::is_ident(Aggr::PROD)); + EXPECT_TRUE (aggr::is_ident(Aggr::SUM)); + EXPECT_TRUE (aggr::is_ident(Aggr::MAX)); + EXPECT_TRUE (aggr::is_ident(Aggr::MEDIAN)); + EXPECT_TRUE (aggr::is_ident(Aggr::MIN)); +} + TEST("require that aggr::is_complex works as expected") { EXPECT_FALSE(aggr::is_complex(Aggr::AVG)); EXPECT_FALSE(aggr::is_complex(Aggr::COUNT)); diff --git a/eval/src/tests/tensor/dense_single_reduce_function/dense_single_reduce_function_test.cpp b/eval/src/tests/tensor/dense_single_reduce_function/dense_single_reduce_function_test.cpp index 949c5277e18..d3495befe7e 100644 --- a/eval/src/tests/tensor/dense_single_reduce_function/dense_single_reduce_function_test.cpp +++ b/eval/src/tests/tensor/dense_single_reduce_function/dense_single_reduce_function_test.cpp @@ -26,6 +26,7 @@ const TensorEngine &prod_engine = DefaultTensorEngine::ref(); EvalFixture::ParamRepo make_params() { return EvalFixture::ParamRepo() .add_dense({{"a", 2}, {"b", 3}, {"c", 4}, {"d", 5}}) + .add_dense({{"a", 9}, {"b", 9}, {"c", 9}, {"d", 9}}) .add_cube("a", 2, "b", 1, "c", 1) .add_cube("a", 1, "b", 2, "c", 1) .add_cube("a", 1, "b", 1, "c", 2) @@ -36,17 +37,35 @@ EvalFixture::ParamRepo make_params() { } EvalFixture::ParamRepo param_repo = make_params(); -void verify_optimized(const vespalib::string &expr, size_t dim_idx, Aggr aggr) -{ +struct ReduceSpec { + size_t outer_size; + size_t reduce_size; + size_t inner_size; + Aggr aggr; +}; + +void verify_optimized_impl(const vespalib::string &expr, const std::vector<ReduceSpec> &spec_list) { EvalFixture slow_fixture(prod_engine, expr, param_repo, false); EvalFixture fixture(prod_engine, expr, param_repo, true); EXPECT_EQUAL(fixture.result(), EvalFixture::ref(expr, param_repo)); EXPECT_EQUAL(fixture.result(), slow_fixture.result()); auto info = fixture.find_all<DenseSingleReduceFunction>(); - ASSERT_EQUAL(info.size(), 1u); - EXPECT_TRUE(info[0]->result_is_mutable()); - EXPECT_EQUAL(info[0]->dim_idx(), dim_idx); - EXPECT_EQUAL(int(info[0]->aggr()), int(aggr)); + ASSERT_EQUAL(info.size(), spec_list.size()); + for (size_t i = 0; i < spec_list.size(); ++i) { + EXPECT_TRUE(info[i]->result_is_mutable()); + EXPECT_EQUAL(info[i]->outer_size(), spec_list[i].outer_size); + EXPECT_EQUAL(info[i]->reduce_size(), spec_list[i].reduce_size); + EXPECT_EQUAL(info[i]->inner_size(), spec_list[i].inner_size); + EXPECT_EQUAL(int(info[i]->aggr()), int(spec_list[i].aggr)); + } +} + +void verify_optimized(const vespalib::string &expr, const ReduceSpec &spec) { + verify_optimized_impl(expr, {spec}); +} + +void verify_optimized(const vespalib::string &expr, const ReduceSpec &spec1, const ReduceSpec &spec2) { + verify_optimized_impl(expr, {spec1, spec2}); } void verify_not_optimized(const vespalib::string &expr) { @@ -58,11 +77,6 @@ void verify_not_optimized(const vespalib::string &expr) { EXPECT_TRUE(info.empty()); } -TEST("require that multi-dimensional reduce is not optimized") { - TEST_DO(verify_not_optimized("reduce(a2b3c4d5,sum,a,b)")); - TEST_DO(verify_not_optimized("reduce(a2b3c4d5,sum,c,d)")); -} - TEST("require that reduce to scalar is not optimized") { TEST_DO(verify_not_optimized("reduce(a10,sum,a)")); TEST_DO(verify_not_optimized("reduce(a10,sum)")); @@ -79,45 +93,83 @@ TEST("require that mixed reduce is not optimized") { TEST_DO(verify_not_optimized("reduce(xyz_mixed,sum,z)")); } -// NB: these are shadowed by the remove dimension optimization -TEST("require that reducing self-aggregating trivial dimensions is not optimized") { +TEST("require that reducing trivial dimensions is not optimized") { TEST_DO(verify_not_optimized("reduce(a1b1c1,avg,c)")); + TEST_DO(verify_not_optimized("reduce(a1b1c1,count,c)")); TEST_DO(verify_not_optimized("reduce(a1b1c1,prod,c)")); TEST_DO(verify_not_optimized("reduce(a1b1c1,sum,c)")); TEST_DO(verify_not_optimized("reduce(a1b1c1,max,c)")); + TEST_DO(verify_not_optimized("reduce(a1b1c1,median,c)")); TEST_DO(verify_not_optimized("reduce(a1b1c1,min,c)")); } -TEST("require that reducing trivial dimension with COUNT is 'optimized'") { - TEST_DO(verify_optimized("reduce(a1b1c1,count,a)", 0, Aggr::COUNT)); - TEST_DO(verify_optimized("reduce(a1b1c1,count,b)", 1, Aggr::COUNT)); - TEST_DO(verify_optimized("reduce(a1b1c1,count,c)", 2, Aggr::COUNT)); +TEST("require that atleast_8 dense single reduce works") { + TEST_DO(verify_optimized("reduce(a9b9c9d9,avg,a)", {1, 9, 729, Aggr::AVG})); + TEST_DO(verify_optimized("reduce(a9b9c9d9,avg,b)", {9, 9, 81, Aggr::AVG})); + TEST_DO(verify_optimized("reduce(a9b9c9d9,avg,c)", {81, 9, 9, Aggr::AVG})); + TEST_DO(verify_optimized("reduce(a9b9c9d9,avg,d)", {729, 9, 1, Aggr::AVG})); + TEST_DO(verify_optimized("reduce(a9b9c9d9,sum,c,d)", {81, 81, 1, Aggr::SUM})); +} + +TEST("require that simple aggregators can be decomposed into multiple reduce operations") { + TEST_DO(verify_optimized("reduce(a2b3c4d5,sum,a,c)", {3, 4, 5, Aggr::SUM}, {1, 2, 60, Aggr::SUM})); + TEST_DO(verify_optimized("reduce(a2b3c4d5,min,a,c)", {3, 4, 5, Aggr::MIN}, {1, 2, 60, Aggr::MIN})); + TEST_DO(verify_optimized("reduce(a2b3c4d5,max,a,c)", {3, 4, 5, Aggr::MAX}, {1, 2, 60, Aggr::MAX})); +} + +TEST("require that reduce dimensions can be listed in reverse order") { + TEST_DO(verify_optimized("reduce(a2b3c4d5,sum,c,a)", {3, 4, 5, Aggr::SUM}, {1, 2, 60, Aggr::SUM})); + TEST_DO(verify_optimized("reduce(a2b3c4d5,min,c,a)", {3, 4, 5, Aggr::MIN}, {1, 2, 60, Aggr::MIN})); + TEST_DO(verify_optimized("reduce(a2b3c4d5,max,c,a)", {3, 4, 5, Aggr::MAX}, {1, 2, 60, Aggr::MAX})); +} + +TEST("require that non-simple aggregators cannot be decomposed into multiple reduce operations") { + TEST_DO(verify_not_optimized("reduce(a2b3c4d5,avg,a,c)")); + TEST_DO(verify_not_optimized("reduce(a2b3c4d5,count,a,c)")); + TEST_DO(verify_not_optimized("reduce(a2b3c4d5,median,a,c)")); } vespalib::string make_expr(const vespalib::string &arg, const vespalib::string &dim, bool float_cells, Aggr aggr) { return make_string("reduce(%s%s,%s,%s)", arg.c_str(), float_cells ? "f" : "", AggrNames::name_of(aggr)->c_str(), dim.c_str()); } -void verify_optimized_multi(const vespalib::string &arg, const vespalib::string &dim, size_t dim_idx) { +void verify_optimized_multi(const vespalib::string &arg, const vespalib::string &dim, size_t outer_size, size_t reduce_size, size_t inner_size) { for (bool float_cells: {false, true}) { for (Aggr aggr: Aggregator::list()) { - auto expr = make_expr(arg, dim, float_cells, aggr); - TEST_DO(verify_optimized(expr, dim_idx, aggr)); + if (aggr != Aggr::PROD) { + auto expr = make_expr(arg, dim, float_cells, aggr); + TEST_DO(verify_optimized(expr, {outer_size, reduce_size, inner_size, aggr})); + } } } } TEST("require that normal dense single reduce works") { - TEST_DO(verify_optimized_multi("a2b3c4d5", "a", 0)); - TEST_DO(verify_optimized_multi("a2b3c4d5", "b", 1)); - TEST_DO(verify_optimized_multi("a2b3c4d5", "c", 2)); - TEST_DO(verify_optimized_multi("a2b3c4d5", "d", 3)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "a", 1, 2, 60)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "b", 2, 3, 20)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "c", 6, 4, 5)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "d", 24, 5, 1)); +} + +TEST("require that dimension-combined dense single reduce works") { + TEST_DO(verify_optimized_multi("a2b3c4d5", "a,b", 1, 6, 20)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "b,c", 2, 12, 5)); + TEST_DO(verify_optimized_multi("a2b3c4d5", "c,d", 6, 20, 1)); } TEST("require that minimal dense single reduce works") { - TEST_DO(verify_optimized_multi("a2b1c1", "a", 0)); - TEST_DO(verify_optimized_multi("a1b2c1", "b", 1)); - TEST_DO(verify_optimized_multi("a1b1c2", "c", 2)); + TEST_DO(verify_optimized_multi("a2b1c1", "a", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b2c1", "b", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b1c2", "c", 1, 2, 1)); +} + +TEST("require that trivial dimensions can be trivially reduced") { + TEST_DO(verify_optimized_multi("a2b1c1", "a,b", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a2b1c1", "a,c", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b2c1", "b,a", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b2c1", "b,c", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b1c2", "c,a", 1, 2, 1)); + TEST_DO(verify_optimized_multi("a1b1c2", "c,b", 1, 2, 1)); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp index 8887f2cb6aa..816923bb87c 100644 --- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -122,6 +122,32 @@ MyPeekSpec verbatim_peek() { return MyPeekSpec(false); } //----------------------------------------------------------------------------- +struct MultiOpParam { + std::vector<Instruction> list; +}; + +void my_multi_instruction_op(InterpretedFunction::State &state, uint64_t param_in) { + const auto ¶m = *(MultiOpParam*)(param_in); + for (const auto &item: param.list) { + item.perform(state); + } +} + +void collect_op1_chain(const TensorFunction &node, const EngineOrFactory &engine, Stash &stash, std::vector<Instruction> &list) { + if (auto op1 = as<tensor_function::Op1>(node)) { + collect_op1_chain(op1->child(), engine, stash, list); + list.push_back(node.compile_self(engine, stash)); + } +} + +Instruction compile_op1_chain(const TensorFunction &node, const EngineOrFactory &engine, Stash &stash) { + auto ¶m = stash.create<MultiOpParam>(); + collect_op1_chain(node, engine, stash, param.list); + return {my_multi_instruction_op,(uint64_t)(¶m)}; +} + +//----------------------------------------------------------------------------- + struct Impl { size_t order; vespalib::string name; @@ -145,7 +171,10 @@ struct Impl { const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); const auto &node = optimize ? optimize_tensor_function(engine, reduce_node, stash) : reduce_node; - return node.compile_self(engine, stash); + // since reduce might be optimized into multiple chained + // instructions, we need some extra magic to package these + // instructions into a single compound instruction. + return compile_op1_chain(node, engine, stash); } Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction diff --git a/eval/src/vespa/eval/eval/aggr.h b/eval/src/vespa/eval/eval/aggr.h index 2173205dc82..e69b1071e61 100644 --- a/eval/src/vespa/eval/eval/aggr.h +++ b/eval/src/vespa/eval/eval/aggr.h @@ -71,6 +71,16 @@ constexpr bool is_simple(Aggr aggr) { (aggr == Aggr::MIN)); } +// will a single value reduce to itself? +constexpr bool is_ident(Aggr aggr) { + return ((aggr == Aggr::AVG) || + (aggr == Aggr::PROD) || + (aggr == Aggr::SUM) || + (aggr == Aggr::MAX) || + (aggr == Aggr::MEDIAN) || + (aggr == Aggr::MIN)); +} + // should we avoid doing clever stuff with this aggregator? constexpr bool is_complex(Aggr aggr) { return (aggr == Aggr::MEDIAN); diff --git a/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp b/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp index 0cecd588317..a48527e83f5 100644 --- a/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp +++ b/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp @@ -14,15 +14,6 @@ using namespace eval::tensor_function; namespace { -bool is_ident_aggr(Aggr aggr) { - return ((aggr == Aggr::AVG) || - (aggr == Aggr::PROD) || - (aggr == Aggr::SUM) || - (aggr == Aggr::MAX) || - (aggr == Aggr::MEDIAN) || - (aggr == Aggr::MIN)); -} - bool is_trivial_dim_list(const ValueType &type, const std::vector<vespalib::string> &dim_list) { size_t npos = ValueType::Dimension::npos; for (const vespalib::string &dim: dim_list) { @@ -43,7 +34,7 @@ DenseRemoveDimensionOptimizer::optimize(const eval::TensorFunction &expr, Stash const TensorFunction &child = reduce->child(); if (expr.result_type().is_dense() && child.result_type().is_dense() && - is_ident_aggr(reduce->aggr()) && + eval::aggr::is_ident(reduce->aggr()) && is_trivial_dim_list(child.result_type(), reduce->dimensions())) { assert(expr.result_type().cell_type() == child.result_type().cell_type()); diff --git a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp index 2d8afa638ce..5f688657645 100644 --- a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp +++ b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp @@ -4,6 +4,7 @@ #include "dense_tensor_view.h" #include <vespa/vespalib/util/typify.h> #include <vespa/eval/eval/value.h> +#include <cassert> namespace vespalib::tensor { @@ -25,27 +26,16 @@ namespace { struct Params { const ValueType &result_type; size_t outer_size; - size_t dim_size; + size_t reduce_size; size_t inner_size; - Params(const ValueType &result_type_in, const ValueType &child_type, size_t dim_idx) - : result_type(result_type_in), outer_size(1), dim_size(1), inner_size(1) - { - for (size_t i = 0; i < child_type.dimensions().size(); ++i) { - if (i < dim_idx) { - outer_size *= child_type.dimensions()[i].size; - } else if (i == dim_idx) { - dim_size *= child_type.dimensions()[i].size; - } else { - inner_size *= child_type.dimensions()[i].size; - } - } - } + Params(const ValueType &result_type_in, size_t outer_size_in, size_t reduce_size_in, size_t inner_size_in) + : result_type(result_type_in), outer_size(outer_size_in), reduce_size(reduce_size_in), inner_size(inner_size_in) {} }; template <typename CT, typename AGGR> -CT reduce_cells(const CT *src, size_t dim_size, size_t stride) { +CT reduce_cells(const CT *src, size_t reduce_size, size_t stride) { AGGR aggr(*src); - for (size_t i = 1; i < dim_size; ++i) { + for (size_t i = 1; i < reduce_size; ++i) { src += stride; aggr.sample(*src); } @@ -88,17 +78,17 @@ auto reduce_cells_atleast_8(const CT *src, size_t n, size_t stride) { template <typename CT, typename AGGR, bool atleast_8, bool is_inner> void trace_reduce_impl(const Params ¶ms, const CT *src, CT *dst) { constexpr bool aggr_is_complex = is_complex(AGGR::enum_value()); - const size_t block_size = (params.dim_size * params.inner_size); + const size_t block_size = (params.reduce_size * params.inner_size); for (size_t outer = 0; outer < params.outer_size; ++outer) { for (size_t inner = 0; inner < params.inner_size; ++inner) { if (atleast_8 && !aggr_is_complex) { if (is_inner) { - *dst++ = reduce_cells_atleast_8<CT, AGGR>(src + inner, params.dim_size); + *dst++ = reduce_cells_atleast_8<CT, AGGR>(src + inner, params.reduce_size); } else { - *dst++ = reduce_cells_atleast_8<CT, AGGR>(src + inner, params.dim_size, params.inner_size); + *dst++ = reduce_cells_atleast_8<CT, AGGR>(src + inner, params.reduce_size, params.inner_size); } } else { - *dst++ = reduce_cells<CT, AGGR>(src + inner, params.dim_size, params.inner_size); + *dst++ = reduce_cells<CT, AGGR>(src + inner, params.reduce_size, params.inner_size); } } src += block_size; @@ -112,7 +102,7 @@ void fold_reduce_impl(const Params ¶ms, const CT *src, CT *dst) { for (size_t inner = 0; inner < params.inner_size; ++inner) { *dst++ = *src++; } - for (size_t dim = 1; dim < params.dim_size; ++dim) { + for (size_t dim = 1; dim < params.reduce_size; ++dim) { dst = saved_dst; for (size_t inner = 0; inner < params.inner_size; ++inner) { *dst = AGGR::combine(*dst, *src++); @@ -147,14 +137,99 @@ struct MyGetFun { using MyTypify = TypifyValue<TypifyCellType,TypifyAggr,TypifyBool>; +std::pair<std::vector<vespalib::string>,ValueType> sort_and_drop_trivial(const std::vector<vespalib::string> &list_in, const ValueType &type_in) { + std::vector<vespalib::string> dropped; + std::vector<vespalib::string> list_out; + for (const auto &dim_name: list_in) { + auto dim_idx = type_in.dimension_index(dim_name); + assert(dim_idx != ValueType::Dimension::npos); + const auto &dim = type_in.dimensions()[dim_idx]; + assert(dim.is_indexed()); + if (dim.is_trivial()) { + dropped.push_back(dim_name); + } else { + list_out.push_back(dim_name); + } + } + std::sort(list_out.begin(), list_out.end()); + ValueType type_out = dropped.empty() ? type_in : type_in.reduce(dropped); + assert(!type_out.is_error()); + return {list_out, type_out}; +} + +template <typename T> struct VectorLookupLoop { + const std::vector<T> &list; + size_t index; + VectorLookupLoop(const std::vector<T> &list_in) : list(list_in), index(0) {} + bool valid() const { return (index < list.size()); } + void next() { ++index; } + const T &get() const { return list[index]; } +}; + +DenseSingleReduceSpec extract_next(const eval::ValueType &type, eval::Aggr aggr, + std::vector<vespalib::string> &todo) +{ + size_t outer_size = 1; + size_t reduce_size = 1; + size_t inner_size = 1; + auto dims = type.nontrivial_indexed_dimensions(); + std::vector<vespalib::string> do_now; + std::vector<vespalib::string> do_later; + auto a = VectorLookupLoop(dims); + auto b = VectorLookupLoop(todo); + while (a.valid() && b.valid() && (a.get().name < b.get())) { + outer_size *= a.get().size; + a.next(); + } + while (a.valid() && b.valid() && (a.get().name == b.get())) { + reduce_size *= a.get().size; + do_now.push_back(b.get()); + a.next(); + b.next(); + } + while (a.valid()) { + inner_size *= a.get().size; + a.next(); + } + while (b.valid()) { + do_later.push_back(b.get()); + b.next(); + } + todo = do_later; + assert(!do_now.empty()); + return {type.reduce(do_now), outer_size, reduce_size, inner_size, aggr}; +} + } // namespace vespalib::tensor::<unnamed> -DenseSingleReduceFunction::DenseSingleReduceFunction(const ValueType &result_type, - const TensorFunction &child, - size_t dim_idx, Aggr aggr) - : Op1(result_type, child), - _dim_idx(dim_idx), - _aggr(aggr) +std::vector<DenseSingleReduceSpec> +make_dense_single_reduce_list(const eval::ValueType &type, eval::Aggr aggr, + const std::vector<vespalib::string> &reduce_dims) +{ + auto res_type = type.reduce(reduce_dims); + if (reduce_dims.empty() || !type.is_dense() || !res_type.is_dense()) { + return {}; + } + std::vector<DenseSingleReduceSpec> list; + auto [todo, curr_type] = sort_and_drop_trivial(reduce_dims, type); + while (!todo.empty()) { + list.push_back(extract_next(curr_type, aggr, todo)); + curr_type = list.back().result_type; + } + assert(curr_type == res_type); + if ((list.size() > 1) && !eval::aggr::is_simple(aggr)) { + return {}; + } + return list; +} + +DenseSingleReduceFunction::DenseSingleReduceFunction(const DenseSingleReduceSpec &spec, + const TensorFunction &child) + : Op1(spec.result_type, child), + _outer_size(spec.outer_size), + _reduce_size(spec.reduce_size), + _inner_size(spec.inner_size), + _aggr(spec.aggr) { } @@ -163,25 +238,25 @@ DenseSingleReduceFunction::~DenseSingleReduceFunction() = default; InterpretedFunction::Instruction DenseSingleReduceFunction::compile_self(eval::EngineOrFactory, Stash &stash) const { - auto ¶ms = stash.create<Params>(result_type(), child().result_type(), _dim_idx); auto op = typify_invoke<4,MyTypify,MyGetFun>(result_type().cell_type(), _aggr, - (params.dim_size >= 8), - (params.inner_size == 1)); + (_reduce_size >= 8), (_inner_size == 1)); + auto ¶ms = stash.create<Params>(result_type(), _outer_size, _reduce_size, _inner_size); return InterpretedFunction::Instruction(op, wrap_param<Params>(params)); } const TensorFunction & DenseSingleReduceFunction::optimize(const TensorFunction &expr, Stash &stash) { - auto reduce = as<Reduce>(expr); - if (reduce && (reduce->dimensions().size() == 1) && - reduce->child().result_type().is_dense() && - expr.result_type().is_dense()) - { - size_t dim_idx = reduce->child().result_type().dimension_index(reduce->dimensions()[0]); - assert(dim_idx != ValueType::Dimension::npos); - assert(expr.result_type().cell_type() == reduce->child().result_type().cell_type()); - return stash.create<DenseSingleReduceFunction>(expr.result_type(), reduce->child(), dim_idx, reduce->aggr()); + if (auto reduce = as<Reduce>(expr)) { + const auto &child = reduce->child(); + auto spec_list = make_dense_single_reduce_list(child.result_type(), reduce->aggr(), reduce->dimensions()); + if (!spec_list.empty()) { + const auto *prev = &child; + for (const auto &spec: spec_list) { + prev = &stash.create<DenseSingleReduceFunction>(spec, *prev); + } + return *prev; + } } return expr; } diff --git a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.h b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.h index 7f9313df600..f2db3155290 100644 --- a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.h +++ b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.h @@ -6,22 +6,46 @@ namespace vespalib::tensor { +struct DenseSingleReduceSpec { + eval::ValueType result_type; + size_t outer_size; + size_t reduce_size; + size_t inner_size; + eval::Aggr aggr; +}; + +/** + * Decompose the specified reduce operation into a sequence of single + * dense reduce operations. Returns an empty list if decomposition + * fails. + **/ +std::vector<DenseSingleReduceSpec> +make_dense_single_reduce_list(const eval::ValueType &type, eval::Aggr aggr, + const std::vector<vespalib::string> &reduce_dims); + /** - * Tensor function reducing a single dimension of a dense - * tensor where the result is also a dense tensor. + * Tensor function reducing a single dimension of a dense tensor where + * the result is also a dense tensor. The optimize function may create + * multiple tensor functions to compose a multi-stage reduce + * operation. Adjacent reduced dimensions will be handled is if they + * were a single dimension. Trivial dimensions will be trivially + * reduced along with any other dimension. **/ class DenseSingleReduceFunction : public eval::tensor_function::Op1 { private: - size_t _dim_idx; + size_t _outer_size; + size_t _reduce_size; + size_t _inner_size; eval::Aggr _aggr; public: - DenseSingleReduceFunction(const eval::ValueType &result_type, - const eval::TensorFunction &child, - size_t dim_idx, eval::Aggr aggr); + DenseSingleReduceFunction(const DenseSingleReduceSpec &spec, + const eval::TensorFunction &child); ~DenseSingleReduceFunction() override; - size_t dim_idx() const { return _dim_idx; } + size_t outer_size() const { return _outer_size; } + size_t reduce_size() const { return _reduce_size; } + size_t inner_size() const { return _inner_size; } eval::Aggr aggr() const { return _aggr; } bool result_is_mutable() const override { return true; } eval::InterpretedFunction::Instruction compile_self(eval::EngineOrFactory engine, Stash &stash) const override; |