author     Arne H Juul <arnej27959@users.noreply.github.com>  2020-10-22 22:22:24 +0200
committer  GitHub <noreply@github.com>  2020-10-22 22:22:24 +0200
commit     b6e7689c75e280e27cd397f41a949fd98635d3a3 (patch)
tree       9a702f7c82c815c5017c9dd1afac6f3a80ef4c8b /eval
parent     d01edb5dff3a2c63a47109d387a079abd7e0bfe7 (diff)
parent     db9120c61f1f5974aed00f06a8f6fa56809e0e68 (diff)
Merge pull request #15012 from vespa-engine/havardpe/better-dense-plan-for-generic-reduce
improve generic dense reduce with more robust cell ordering
Diffstat (limited to 'eval')
-rw-r--r--  eval/src/tests/eval/aggr/aggr_test.cpp                                  28
-rw-r--r--  eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp       14
-rw-r--r--  eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp   56
-rw-r--r--  eval/src/vespa/eval/eval/aggr.cpp                                         4
-rw-r--r--  eval/src/vespa/eval/eval/aggr.h                                          68
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.cpp                       98
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.h                         14
-rw-r--r--  eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp        9
8 files changed, 178 insertions, 113 deletions
diff --git a/eval/src/tests/eval/aggr/aggr_test.cpp b/eval/src/tests/eval/aggr/aggr_test.cpp
index b1da9704bfc..603fdb508e2 100644
--- a/eval/src/tests/eval/aggr/aggr_test.cpp
+++ b/eval/src/tests/eval/aggr/aggr_test.cpp
@@ -5,6 +5,7 @@
using vespalib::Stash;
using namespace vespalib::eval;
+using namespace vespalib::eval::aggr;
TEST("require that aggregator list returns appropriate entries") {
auto list = Aggregator::list();
@@ -83,4 +84,31 @@ TEST("require that MIN aggregator works as expected") {
aggr.next(200.0), EXPECT_EQUAL(aggr.result(), 100.0);
}
+template <template <typename T> typename A>
+float aggr_merge(const std::vector<float> &a, const std::vector<float> &b) {
+ A<float> aggr0;
+ A<float> aggr1;
+ A<float> aggr2;
+ A<float> aggr3;
+ for (float v: a) {
+ aggr1.sample(v);
+ }
+ for (float v: b) {
+ aggr2.sample(v);
+ }
+ aggr0.merge(aggr1);
+ aggr2.merge(aggr3);
+ aggr0.merge(aggr2);
+ return aggr0.result();
+}
+
+TEST("require that aggregator merge works") {
+ EXPECT_EQUAL(aggr_merge<Avg>({1,2},{3,4}), 2.5);
+ EXPECT_EQUAL(aggr_merge<Count>({1,2},{3,4}), 4.0);
+ EXPECT_EQUAL(aggr_merge<Prod>({1,2},{3,4}), 24.0);
+ EXPECT_EQUAL(aggr_merge<Sum>({1,2},{3,4}), 10.0);
+ EXPECT_EQUAL(aggr_merge<Max>({1,2},{3,4}), 4.0);
+ EXPECT_EQUAL(aggr_merge<Min>({1,2},{3,4}), 1.0);
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
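
The new aggr_merge test drives the merge() step provided by the reworked aggregators in aggr.h (further down in this diff). For illustration, a minimal standalone sketch of the same split/sample/merge pattern, using a simplified Avg defined here rather than the vespalib class:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Simplified stand-in for vespalib::eval::aggr::Avg: the default constructor
    // is the identity element, sample() folds in one value, merge() combines
    // partial states.
    struct MyAvg {
        double sum = 0.0;
        size_t cnt = 0;
        void sample(double v) { sum += v; ++cnt; }
        void merge(const MyAvg &rhs) { sum += rhs.sum; cnt += rhs.cnt; }
        double result() const { return sum / cnt; }
    };

    int main() {
        std::vector<double> a{1, 2};
        std::vector<double> b{3, 4};
        MyAvg left, right;
        for (double v : a) left.sample(v);
        for (double v : b) right.sample(v);
        left.merge(right);              // partial results combine
        assert(left.result() == 2.5);   // same value as aggr_merge<Avg>({1,2},{3,4})
    }
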
diff --git a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp
index cc36c139c99..d894d273f02 100644
--- a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp
+++ b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp
@@ -76,16 +76,14 @@ TensorSpec perform_generic_reduce(const TensorSpec &a, const std::vector<vespali
TEST(GenericReduceTest, dense_reduce_plan_can_be_created) {
auto type = ValueType::from_spec("tensor(a[2],aa{},b[2],bb[1],c[2],cc{},d[2],dd[1],e[2],ee{},f[2])");
auto plan = DenseReducePlan(type, type.reduce({"a", "d", "e"}));
- std::vector<size_t> expect_keep_loop = {4,2};
- std::vector<size_t> expect_keep_stride = {8,1};
- std::vector<size_t> expect_reduce_loop = {2,4};
- std::vector<size_t> expect_reduce_stride = {32,2};
+ std::vector<size_t> expect_loop_cnt = {2,4,4,2};
+ std::vector<size_t> expect_in_stride = {32,2,8,1};
+ std::vector<size_t> expect_out_stride = {0,0,2,1};
EXPECT_EQ(plan.in_size, 64);
EXPECT_EQ(plan.out_size, 8);
- EXPECT_EQ(plan.keep_loop, expect_keep_loop);
- EXPECT_EQ(plan.keep_stride, expect_keep_stride);
- EXPECT_EQ(plan.reduce_loop, expect_reduce_loop);
- EXPECT_EQ(plan.reduce_stride, expect_reduce_stride);
+ EXPECT_EQ(plan.loop_cnt, expect_loop_cnt);
+ EXPECT_EQ(plan.in_stride, expect_in_stride);
+ EXPECT_EQ(plan.out_stride, expect_out_stride);
}
TEST(GenericReduceTest, sparse_reduce_plan_can_be_created) {
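
The expected vectors can be read off from the new DenseReducePlan construction (see generic_reduce.cpp below). The nontrivial indexed dimensions of the input type are a[2], b[2], c[2], d[2], e[2], f[2]; reducing {a, d, e} keeps {b, c, f}. Adjacent dimensions with the same keep/reduce status are fused, giving four loops: (a)=2 reduce, (b,c)=4 keep, (d,e)=4 reduce, (f)=2 keep. Computing strides right-to-left gives input strides 32, 8, 2, 1 (in_size 64); the keep loops get output strides 2 and 1 (out_size 8) and the reduce loops get 0. The final pass bubbles reduce loops (out_stride 0) ahead of keep loops, producing loop_cnt {2,4,4,2}, in_stride {32,2,8,1}, out_stride {0,0,2,1}, which is exactly what the test asserts.
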
diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
index 1c892a0a3a5..0299dc3ebba 100644
--- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
+++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
@@ -127,35 +127,40 @@ struct Impl {
vespalib::string name;
vespalib::string short_name;
EngineOrFactory engine;
- Impl(size_t order_in, const vespalib::string &name_in, const vespalib::string &short_name_in, EngineOrFactory engine_in)
- : order(order_in), name(name_in), short_name(short_name_in), engine(engine_in) {}
+ bool optimize;
+ Impl(size_t order_in, const vespalib::string &name_in, const vespalib::string &short_name_in, EngineOrFactory engine_in, bool optimize_in)
+ : order(order_in), name(name_in), short_name(short_name_in), engine(engine_in), optimize(optimize_in) {}
Value::UP create_value(const TensorSpec &spec) const { return engine.from_spec(spec); }
TensorSpec create_spec(const Value &value) const { return engine.to_spec(value); }
Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
const auto &rhs_node = tensor_function::inject(rhs, 1, stash);
- const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash);
- return join_node.compile_self(engine, stash);
+ const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash);
+ const auto &node = optimize ? engine.optimize(join_node, stash) : join_node;
+ return node.compile_self(engine, stash);
}
Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
- const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash);
- return reduce_node.compile_self(engine, stash);
+ const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash);
+ const auto &node = optimize ? engine.optimize(reduce_node, stash) : reduce_node;
+ return node.compile_self(engine, stash);
}
Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash);
- return rename_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(rename_node, stash) : rename_node;
+ return node.compile_self(engine, stash);
}
Instruction create_merge(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
const auto &rhs_node = tensor_function::inject(rhs, 1, stash);
const auto &merge_node = tensor_function::merge(lhs_node, rhs_node, function, stash);
- return merge_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(merge_node, stash) : merge_node;
+ return node.compile_self(engine, stash);
}
Instruction create_concat(const ValueType &lhs, const ValueType &rhs, const std::string &dimension, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
@@ -163,12 +168,15 @@ struct Impl {
const auto &rhs_node = tensor_function::inject(rhs, 1, stash);
const auto &concat_node = tensor_function::concat(lhs_node, rhs_node, dimension, stash);
return concat_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(concat_node, stash) : concat_node;
+ return node.compile_self(engine, stash);
}
Instruction create_map(const ValueType &lhs, operation::op1_t function, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
const auto &map_node = tensor_function::map(lhs_node, function, stash);
- return map_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(map_node, stash) : map_node;
+ return node.compile_self(engine, stash);
}
Instruction create_tensor_create(const ValueType &proto_type, const TensorSpec &proto, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
@@ -178,7 +186,8 @@ struct Impl {
spec.emplace(cell.first, my_double);
}
const auto &create_tensor_node = tensor_function::create(proto_type, spec, stash);
- return create_tensor_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(create_tensor_node, stash) : create_tensor_node;
+ return node.compile_self(engine, stash);
}
Instruction create_tensor_lambda(const ValueType &type, const Function &function, const ValueType &p0_type, Stash &stash) const {
std::vector<ValueType> arg_types(type.dimensions().size(), ValueType::double_type());
@@ -186,7 +195,8 @@ struct Impl {
NodeTypes types(function, arg_types);
EXPECT_EQ(types.errors(), std::vector<vespalib::string>());
const auto &tensor_lambda_node = tensor_function::lambda(type, {0}, function, std::move(types), stash);
- return tensor_lambda_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(tensor_lambda_node, stash) : tensor_lambda_node;
+ return node.compile_self(engine, stash);
}
Instruction create_tensor_peek(const ValueType &type, const MyPeekSpec &my_spec, Stash &stash) const {
// create a complete tensor function, but only compile the relevant instruction
@@ -209,18 +219,19 @@ struct Impl {
}
}
const auto &peek_node = tensor_function::peek(my_param, spec, stash);
- return peek_node.compile_self(engine, stash);
+ const auto &node = optimize ? engine.optimize(peek_node, stash) : peek_node;
+ return node.compile_self(engine, stash);
}
};
//-----------------------------------------------------------------------------
-Impl simple_tensor_engine_impl(5, " SimpleTensorEngine", " SimpleT", SimpleTensorEngine::ref());
-Impl default_tensor_engine_impl(1, "DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref());
-Impl simple_value_impl(2, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get());
-Impl fast_value_impl(0, " FastValue", "NEW PROD", FastValueBuilderFactory::get());
-Impl packed_mixed_tensor_impl(4, " PackedMixedTensor", " Packed", PackedMixedTensorBuilderFactory::get());
-Impl default_tensor_value_impl(3, " DefaultValue", "DefaultV", DefaultValueBuilderFactory::get());
+Impl default_tensor_engine_impl(1, "DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref(), false);
+Impl simple_value_impl(3, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get(), false);
+Impl fast_value_impl(0, " FastValue", "NEW PROD", FastValueBuilderFactory::get(), false);
+Impl optimized_fast_value_impl(2, "Optimized FastValue", "Optimize", FastValueBuilderFactory::get(), true);
+Impl packed_mixed_tensor_impl(5, " PackedMixedTensor", " Packed", PackedMixedTensorBuilderFactory::get(), false);
+Impl default_tensor_value_impl(4, " DefaultValue", "DefaultV", DefaultValueBuilderFactory::get(), false);
vespalib::string short_header("--------");
constexpr double budget = 5.0;
@@ -228,10 +239,10 @@ constexpr double best_limit = 0.95; // everything within 95% of best performance
constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod
constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod
-std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl,
- default_tensor_engine_impl,
+std::vector<CREF<Impl>> impl_list = {default_tensor_engine_impl,
simple_value_impl,
fast_value_impl,
+ optimized_fast_value_impl,
packed_mixed_tensor_impl,
default_tensor_value_impl};
@@ -756,6 +767,11 @@ TEST(MixedJoin, no_overlap) {
//-----------------------------------------------------------------------------
+TEST(ReduceBench, number_reduce) {
+ auto lhs = make_spec(1.0);
+ benchmark_reduce("number reduce", lhs, Aggr::SUM, {});
+}
+
TEST(ReduceBench, dense_reduce) {
auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0);
benchmark_reduce("dense reduce inner", lhs, Aggr::SUM, {"c"});
diff --git a/eval/src/vespa/eval/eval/aggr.cpp b/eval/src/vespa/eval/eval/aggr.cpp
index 8efb0ec9fe7..e731c7a1f09 100644
--- a/eval/src/vespa/eval/eval/aggr.cpp
+++ b/eval/src/vespa/eval/eval/aggr.cpp
@@ -14,8 +14,8 @@ namespace {
template <typename T>
struct Wrapper : Aggregator {
T aggr;
- virtual void first(double value) final override { aggr.first(value); }
- virtual void next(double value) final override { aggr.next(value); }
+ virtual void first(double value) final override { aggr = T{value}; }
+ virtual void next(double value) final override { aggr.sample(value); }
virtual double result() const final override { return aggr.result(); }
};
diff --git a/eval/src/vespa/eval/eval/aggr.h b/eval/src/vespa/eval/eval/aggr.h
index a67d8f77a15..050287d183c 100644
--- a/eval/src/vespa/eval/eval/aggr.h
+++ b/eval/src/vespa/eval/eval/aggr.h
@@ -60,63 +60,75 @@ namespace aggr {
template <typename T> class Avg {
private:
- T _sum = 0.0;
- size_t _cnt = 0;
+ T _sum;
+ size_t _cnt;
public:
- void first(T value) {
- _sum = value;
- _cnt = 1;
- }
- void next(T value) {
+ constexpr Avg() : _sum{0}, _cnt{0} {}
+ constexpr Avg(T value) : _sum{value}, _cnt{1} {}
+ constexpr void sample(T value) {
_sum += value;
++_cnt;
}
- T result() const { return (_sum / _cnt); }
+ constexpr void merge(const Avg &rhs) {
+ _sum += rhs._sum;
+ _cnt += rhs._cnt;
+ };
+ constexpr T result() const { return (_sum / _cnt); }
};
template <typename T> class Count {
private:
- size_t _cnt = 0;
+ size_t _cnt;
public:
- void first(T) { _cnt = 1; }
- void next(T) { ++_cnt; }
- T result() const { return _cnt; }
+ constexpr Count() : _cnt{0} {}
+ constexpr Count(T) : _cnt{1} {}
+ constexpr void sample(T) { ++_cnt; }
+ constexpr void merge(const Count &rhs) { _cnt += rhs._cnt; }
+ constexpr T result() const { return _cnt; }
};
template <typename T> class Prod {
private:
- T _prod = 1.0;
+ T _prod;
public:
- void first(T value) { _prod = value; }
- void next(T value) { _prod *= value; }
- T result() const { return _prod; }
+ constexpr Prod() : _prod{1} {}
+ constexpr Prod(T value) : _prod{value} {}
+ constexpr void sample(T value) { _prod *= value; }
+ constexpr void merge(const Prod &rhs) { _prod *= rhs._prod; }
+ constexpr T result() const { return _prod; }
};
template <typename T> class Sum {
private:
- T _sum = 0.0;
+ T _sum;
public:
- void first(T value) { _sum = value; }
- void next(T value) { _sum += value; }
- T result() const { return _sum; }
+ constexpr Sum() : _sum{0} {}
+ constexpr Sum(T value) : _sum{value} {}
+ constexpr void sample(T value) { _sum += value; }
+ constexpr void merge(const Sum &rhs) { _sum += rhs._sum; }
+ constexpr T result() const { return _sum; }
};
template <typename T> class Max {
private:
- T _max = -std::numeric_limits<T>::infinity();
+ T _max;
public:
- void first(T value) { _max = value; }
- void next(T value) { _max = std::max(_max, value); }
- T result() const { return _max; }
+ constexpr Max() : _max{-std::numeric_limits<T>::infinity()} {}
+ constexpr Max(T value) : _max{value} {}
+ constexpr void sample(T value) { _max = std::max(_max, value); }
+ constexpr void merge(const Max &rhs) { _max = std::max(_max, rhs._max); }
+ constexpr T result() const { return _max; }
};
template <typename T> class Min {
private:
- T _min = std::numeric_limits<T>::infinity();
+ T _min;
public:
- void first(T value) { _min = value; }
- void next(T value) { _min = std::min(_min, value); }
- T result() const { return _min; }
+ constexpr Min() : _min{std::numeric_limits<T>::infinity()} {}
+ constexpr Min(T value) : _min{value} {}
+ constexpr void sample(T value) { _min = std::min(_min, value); }
+ constexpr void merge(const Min &rhs) { _min = std::min(_min, rhs._min); }
+ constexpr T result() const { return _min; }
};
} // namespave vespalib::eval::aggr
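
All operations on these classes are now constexpr, and the old first()/next() pair is replaced by a value constructor plus sample(). A standalone sketch, using a simplified stand-in for Max written here (not the vespalib header), showing that the seeded constructor and merge() compose and can even be evaluated at compile time:

    #include <algorithm>
    #include <limits>

    template <typename T> struct MyMax {                // stand-in for aggr::Max<T>
        T _max;
        constexpr MyMax() : _max{-std::numeric_limits<T>::infinity()} {}  // identity
        constexpr MyMax(T value) : _max{value} {}       // seeded, replaces first()
        constexpr void sample(T value) { _max = std::max(_max, value); }
        constexpr void merge(const MyMax &rhs) { _max = std::max(_max, rhs._max); }
        constexpr T result() const { return _max; }
    };

    constexpr double max_of_three(double a, double b, double c) {
        MyMax<double> m(a);          // seeded with the first sample
        m.sample(b);
        MyMax<double> rest(c);
        m.merge(rest);               // combine partial states
        return m.result();
    }
    static_assert(max_of_three(1.0, 5.0, 3.0) == 5.0, "sample/merge agree with max");

    int main() {}
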
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp
index 43e11ced49b..ca9b9f0cd52 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.cpp
+++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp
@@ -6,6 +6,8 @@
#include <vespa/eval/eval/array_array_map.h>
#include <vespa/vespalib/util/stash.h>
#include <vespa/vespalib/util/typify.h>
+#include <vespa/vespalib/util/overload.h>
+#include <vespa/vespalib/util/visit_ranges.h>
#include <cassert>
using namespace vespalib::eval::tensor_function;
@@ -77,13 +79,9 @@ generic_reduce(const Value &value, const ReduceParam &param) {
ConstArrayRef<vespalib::stringref*> keep_addr(sparse.keep_address);
while (full_view->next_result(sparse.fetch_address, sparse.subspace)) {
auto [tag, ignore] = map.lookup_or_add_entry(keep_addr);
- AGGR *dst = nullptr;
- auto next = [&](size_t idx) { (dst++)->next(cells[idx]); };
- auto fill_aggrs = [&,tag = tag](size_t idx) {
- dst = map.get_values(tag).begin();
- param.dense_plan.execute_keep(idx, next);
- };
- param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), fill_aggrs);
+ AGGR *dst = map.get_values(tag).begin();
+ auto sample = [&](size_t src_idx, size_t dst_idx) { dst[dst_idx].sample(cells[src_idx]); };
+ param.dense_plan.execute(sparse.subspace * param.dense_plan.in_size, sample);
}
auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size());
map.each_entry([&](const auto &keys, const auto &values)
@@ -116,12 +114,21 @@ template <typename ICT, typename AGGR>
void my_full_reduce_op(State &state, uint64_t) {
auto cells = state.peek(0).cells().typify<ICT>();
if (cells.size() > 0) {
- AGGR aggr;
- aggr.first(cells[0]);
- for (size_t i = 1; i < cells.size(); ++i) {
- aggr.next(cells[i]);
+ AGGR aggr[4];
+ size_t i = 0;
+ for (; (i + 3) < cells.size(); i += 4) {
+ aggr[0].sample(cells[i+0]);
+ aggr[1].sample(cells[i+1]);
+ aggr[2].sample(cells[i+2]);
+ aggr[3].sample(cells[i+3]);
}
- state.pop_push(state.stash.create<DoubleValue>(aggr.result()));
+ for (; i < cells.size(); ++i) {
+ aggr[0].sample(cells[i]);
+ }
+ aggr[0].merge(aggr[1]);
+ aggr[0].merge(aggr[2]);
+ aggr[0].merge(aggr[3]);
+ state.pop_push(state.stash.create<DoubleValue>(aggr[0].result()));
} else {
state.pop_push(state.stash.create<DoubleValue>(0.0));
}
@@ -154,39 +161,48 @@ struct PerformGenericReduce {
DenseReducePlan::DenseReducePlan(const ValueType &type, const ValueType &res_type)
: in_size(1),
out_size(1),
- keep_loop(),
- keep_stride(),
- reduce_loop(),
- reduce_stride()
+ loop_cnt(),
+ in_stride(),
+ out_stride()
{
- std::vector<bool> keep;
- std::vector<size_t> size;
- for (const auto &dim: type.nontrivial_indexed_dimensions()) {
- keep.push_back(res_type.dimension_index(dim.name) != ValueType::Dimension::npos);
- size.push_back(dim.size);
- }
- std::vector<size_t> stride(size.size(), 0);
- for (size_t i = stride.size(); i-- > 0; ) {
- stride[i] = in_size;
- in_size *= size[i];
- if (keep[i]) {
- out_size *= size[i];
- }
- }
- int prev_case = 2;
- for (size_t i = 0; i < size.size(); ++i) {
- int my_case = keep[i] ? 1 : 0;
- auto &my_loop = keep[i] ? keep_loop : reduce_loop;
- auto &my_stride = keep[i] ? keep_stride : reduce_stride;
+ enum class Case { NONE, KEEP, REDUCE };
+ Case prev_case = Case::NONE;
+ auto update_plan = [&](Case my_case, size_t my_size) {
if (my_case == prev_case) {
- assert(!my_loop.empty());
- my_loop.back() *= size[i];
- my_stride.back() = stride[i];
+ assert(!loop_cnt.empty());
+ loop_cnt.back() *= my_size;
} else {
- my_loop.push_back(size[i]);
- my_stride.push_back(stride[i]);
+ loop_cnt.push_back(my_size);
+ in_stride.push_back(1);
+ out_stride.push_back((my_case == Case::KEEP) ? 1 : 0);
+ prev_case = my_case;
+ }
+ };
+ auto visitor = overload
+ {
+ [&](visit_ranges_either, const auto &a) { update_plan(Case::REDUCE, a.size); },
+ [&](visit_ranges_both, const auto &a, const auto &) { update_plan(Case::KEEP, a.size); }
+ };
+ auto in_dims = type.nontrivial_indexed_dimensions();
+ auto out_dims = res_type.nontrivial_indexed_dimensions();
+ visit_ranges(visitor, in_dims.begin(), in_dims.end(), out_dims.begin(), out_dims.end(),
+ [](const auto &a, const auto &b){ return (a.name < b.name); });
+ for (size_t i = loop_cnt.size(); i-- > 0; ) {
+ in_stride[i] = in_size;
+ in_size *= loop_cnt[i];
+ if (out_stride[i] != 0) {
+ out_stride[i] = out_size;
+ out_size *= loop_cnt[i];
+ }
+ }
+ for (size_t i = 1; i < loop_cnt.size(); ++i) {
+ for (size_t j = i; j > 0; --j) {
+ if ((out_stride[j] == 0) && (out_stride[j - 1] > 0)) {
+ std::swap(loop_cnt[j], loop_cnt[j - 1]);
+ std::swap(in_stride[j], in_stride[j - 1]);
+ std::swap(out_stride[j], out_stride[j - 1]);
+ }
}
- prev_case = my_case;
}
}
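
The rewritten my_full_reduce_op above replaces one serial accumulator with four interleaved accumulators that are merged at the end, which shortens the per-element dependency chain and gives the compiler room to vectorize. A standalone sketch of the same pattern on a plain float vector, using a local Sum-like accumulator rather than the vespalib one:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct SumAcc {                       // minimal stand-in for aggr::Sum<float>
        float _sum = 0.0f;
        void sample(float v) { _sum += v; }
        void merge(const SumAcc &rhs) { _sum += rhs._sum; }
        float result() const { return _sum; }
    };

    float full_reduce(const std::vector<float> &cells) {
        if (cells.empty()) return 0.0f;            // mirrors the empty-input branch
        SumAcc aggr[4];
        size_t i = 0;
        for (; (i + 3) < cells.size(); i += 4) {   // main loop: 4 independent chains
            aggr[0].sample(cells[i + 0]);
            aggr[1].sample(cells[i + 1]);
            aggr[2].sample(cells[i + 2]);
            aggr[3].sample(cells[i + 3]);
        }
        for (; i < cells.size(); ++i) {            // tail: fold leftovers into chain 0
            aggr[0].sample(cells[i]);
        }
        aggr[0].merge(aggr[1]);
        aggr[0].merge(aggr[2]);
        aggr[0].merge(aggr[3]);
        return aggr[0].result();
    }

    int main() {
        std::vector<float> v{1, 2, 3, 4, 5, 6, 7};
        assert(full_reduce(v) == 28.0f);
    }
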
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h
index 2edceb291b1..cacc0f4cfd3 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.h
+++ b/eval/src/vespa/eval/instruction/generic_reduce.h
@@ -17,17 +17,13 @@ namespace vespalib::eval::instruction {
struct DenseReducePlan {
size_t in_size;
size_t out_size;
- std::vector<size_t> keep_loop;
- std::vector<size_t> keep_stride;
- std::vector<size_t> reduce_loop;
- std::vector<size_t> reduce_stride;
+ std::vector<size_t> loop_cnt;
+ std::vector<size_t> in_stride;
+ std::vector<size_t> out_stride;
DenseReducePlan(const ValueType &type, const ValueType &res_type);
~DenseReducePlan();
- template <typename F> void execute_keep(size_t offset, const F &f) const {
- run_nested_loop(offset, keep_loop, keep_stride, f);
- }
- template <typename F> void execute_reduce(size_t offset, const F &f) const {
- run_nested_loop(offset, reduce_loop, reduce_stride, f);
+ template <typename F> void execute(size_t in_idx, const F &f) const {
+ run_nested_loop(in_idx, 0, loop_cnt, in_stride, out_stride, f);
}
};
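
The three parallel vectors describe one fused set of loops over the input cells: each loop advances the input index by in_stride and the output index by out_stride (0 for reduced dimensions). The committed code delegates the traversal to vespalib's run_nested_loop; the recursive sketch below is a simplified approximation of what execute() does, exercised with the plan vectors from the generic_reduce test above:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Approximate rendition of DenseReducePlan::execute: walk the fused loops,
    // advancing the input index by in_stride and the output index by out_stride,
    // and invoke f(src_idx, dst_idx) for every input cell.
    template <typename F>
    void execute_plan(const std::vector<size_t> &loop_cnt,
                      const std::vector<size_t> &in_stride,
                      const std::vector<size_t> &out_stride,
                      size_t level, size_t src, size_t dst, const F &f)
    {
        if (level == loop_cnt.size()) {
            f(src, dst);
            return;
        }
        for (size_t i = 0; i < loop_cnt[level]; ++i) {
            execute_plan(loop_cnt, in_stride, out_stride, level + 1,
                         src + i * in_stride[level], dst + i * out_stride[level], f);
        }
    }

    int main() {
        std::vector<size_t> loop_cnt{2, 4, 4, 2};
        std::vector<size_t> in_stride{32, 2, 8, 1};
        std::vector<size_t> out_stride{0, 0, 2, 1};
        std::vector<int> hits(8, 0);
        execute_plan(loop_cnt, in_stride, out_stride, 0, 0, 0,
                     [&](size_t, size_t dst_idx) { ++hits[dst_idx]; });
        for (int h : hits) assert(h == 8);   // 64 input cells fold into 8 outputs
    }
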
diff --git a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp
index c0bc3f26fe9..c0fc29d2800 100644
--- a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp
+++ b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp
@@ -43,11 +43,11 @@ struct Params {
};
template <typename CT, typename AGGR>
-CT reduce_cells(const CT *src, size_t dim_size, size_t stride, AGGR &aggr) {
- aggr.first(*src);
+CT reduce_cells(const CT *src, size_t dim_size, size_t stride) {
+ AGGR aggr(*src);
for (size_t i = 1; i < dim_size; ++i) {
src += stride;
- aggr.next(*src);
+ aggr.sample(*src);
}
return aggr.result();
}
@@ -57,12 +57,11 @@ void my_single_reduce_op(InterpretedFunction::State &state, uint64_t param) {
const auto &params = unwrap_param<Params>(param);
const CT *src = state.peek(0).cells().typify<CT>().cbegin();
auto dst_cells = state.stash.create_array<CT>(params.outer_size * params.inner_size);
- AGGR aggr;
CT *dst = dst_cells.begin();
const size_t block_size = (params.dim_size * params.inner_size);
for (size_t outer = 0; outer < params.outer_size; ++outer) {
for (size_t inner = 0; inner < params.inner_size; ++inner) {
- *dst++ = reduce_cells<CT, AGGR>(src + inner, params.dim_size, params.inner_size, aggr);
+ *dst++ = reduce_cells<CT, AGGR>(src + inner, params.dim_size, params.inner_size);
}
src += block_size;
}
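
With the aggregator now seeded through its value constructor, reduce_cells builds a fresh aggregator per output cell instead of resetting a shared instance via first(). A standalone sketch of the same strided loop with concrete numbers, using a simplified Min-style accumulator defined here rather than the vespalib class:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct MinAcc {                               // stand-in for aggr::Min<float>
        float _min;
        MinAcc(float value) : _min(value) {}      // seeded with the first cell
        void sample(float v) { _min = (v < _min) ? v : _min; }
        float result() const { return _min; }
    };

    // Reduce one strided run of dim_size cells, as reduce_cells above does.
    template <typename AGGR, typename CT>
    CT reduce_run(const CT *src, size_t dim_size, size_t stride) {
        AGGR aggr(*src);                          // fresh aggregator per output cell
        for (size_t i = 1; i < dim_size; ++i) {
            src += stride;
            aggr.sample(*src);
        }
        return aggr.result();
    }

    int main() {
        // 2x3 row-major block; reduce the outer dimension, so stride == 3.
        std::vector<float> cells{4, 2, 9,
                                 1, 7, 5};
        assert(reduce_run<MinAcc>(&cells[0], 2, 3) == 1.0f);
        assert(reduce_run<MinAcc>(&cells[1], 2, 3) == 2.0f);
        assert(reduce_run<MinAcc>(&cells[2], 2, 3) == 5.0f);
    }
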