summaryrefslogtreecommitdiffstats
path: root/eval
diff options
context:
space:
mode:
authorArne H Juul <arnej27959@users.noreply.github.com>2023-08-28 15:28:01 +0200
committerGitHub <noreply@github.com>2023-08-28 15:28:01 +0200
commitc0e937263e09b4723f3313665fa2480046a069c8 (patch)
tree80f6a7ed84ebe83ab83fdcb1bc69db10ab8cd0ae /eval
parentc5d373464251e232c99ba70be72b851b2b5bda7f (diff)
parent387a1206484925aee3de22988655ce5c588bc3ab (diff)
Merge pull request #28156 from vespa-engine/havardpe/dense-join-reduce-plan
dense join reduce plan
Diffstat (limited to 'eval')
-rw-r--r--eval/CMakeLists.txt1
-rw-r--r--eval/src/tests/eval/nested_loop/nested_loop_test.cpp80
-rw-r--r--eval/src/tests/eval/value_type/value_type_test.cpp14
-rw-r--r--eval/src/tests/instruction/dense_join_reduce_plan/CMakeLists.txt9
-rw-r--r--eval/src/tests/instruction/dense_join_reduce_plan/dense_join_reduce_plan_test.cpp101
-rw-r--r--eval/src/vespa/eval/eval/nested_loop.h37
-rw-r--r--eval/src/vespa/eval/eval/value_type.cpp31
-rw-r--r--eval/src/vespa/eval/eval/value_type.h1
-rw-r--r--eval/src/vespa/eval/instruction/CMakeLists.txt1
-rw-r--r--eval/src/vespa/eval/instruction/dense_join_reduce_plan.cpp95
-rw-r--r--eval/src/vespa/eval/instruction/dense_join_reduce_plan.h27
11 files changed, 372 insertions, 25 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index e149727d94f..9b6bcc990fc 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -50,6 +50,7 @@ vespa_define_module(
src/tests/instruction/dense_dot_product_function
src/tests/instruction/dense_hamming_distance
src/tests/instruction/dense_inplace_join_function
+ src/tests/instruction/dense_join_reduce_plan
src/tests/instruction/dense_matmul_function
src/tests/instruction/dense_multi_matmul_function
src/tests/instruction/dense_replace_type_function
diff --git a/eval/src/tests/eval/nested_loop/nested_loop_test.cpp b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp
index caa76816599..5c5d4b219c5 100644
--- a/eval/src/tests/eval/nested_loop/nested_loop_test.cpp
+++ b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp
@@ -5,7 +5,7 @@
using namespace vespalib::eval;
-std::vector<size_t> run_single(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) {
+std::vector<size_t> run_loop(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) {
std::vector<size_t> result;
auto capture = [&](size_t idx_out) { result.push_back(idx_out); };
assert(loop.size() == stride.size());
@@ -13,8 +13,8 @@ std::vector<size_t> run_single(size_t idx_in, const std::vector<size_t> &loop, c
return result;
}
-std::vector<std::pair<size_t,size_t>> run_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
- const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
+std::vector<std::pair<size_t,size_t>> run_two_loops(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
{
std::vector<std::pair<size_t,size_t>> result;
auto capture = [&](size_t idx1_out, size_t idx2_out) { result.emplace_back(idx1_out, idx2_out); };
@@ -24,37 +24,77 @@ std::vector<std::pair<size_t,size_t>> run_double(size_t idx1_in, size_t idx2_in,
return result;
}
-void verify_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
- const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
+void add_entry(std::vector<std::vector<size_t>> &result, std::vector<size_t> value) {
+ result.push_back(std::move(value));
+}
+
+std::vector<std::vector<size_t>> run_three_loops(size_t idx1_in, size_t idx2_in, size_t idx3_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2, const std::vector<size_t> &stride3)
{
- auto res1 = run_single(idx1_in, loop, stride1);
- auto res2 = run_single(idx2_in, loop, stride2);
+ std::vector<std::vector<size_t>> result;
+ auto capture = [&](size_t idx1_out, size_t idx2_out, size_t idx3_out) { add_entry(result, {idx1_out, idx2_out, idx3_out}); };
+ assert(loop.size() == stride1.size());
+ assert(loop.size() == stride2.size());
+ assert(loop.size() == stride3.size());
+ run_nested_loop(idx1_in, idx2_in, idx3_in, loop, stride1, stride2, stride3, capture);
+ return result;
+}
+
+void verify_two(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
+{
+ auto res1 = run_loop(idx1_in, loop, stride1);
+ auto res2 = run_loop(idx2_in, loop, stride2);
ASSERT_EQ(res1.size(), res2.size());
std::vector<std::pair<size_t,size_t>> expect;
for (size_t i = 0; i < res1.size(); ++i) {
expect.emplace_back(res1[i], res2[i]);
}
- auto actual = run_double(idx1_in, idx2_in, loop, stride1, stride2);
+ auto actual = run_two_loops(idx1_in, idx2_in, loop, stride1, stride2);
+ EXPECT_EQ(actual, expect);
+}
+
+void verify_three(size_t idx1_in, size_t idx2_in, size_t idx3_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2, const std::vector<size_t> &stride3)
+{
+ auto res1 = run_loop(idx1_in, loop, stride1);
+ auto res2 = run_loop(idx2_in, loop, stride2);
+ auto res3 = run_loop(idx3_in, loop, stride3);
+ ASSERT_EQ(res1.size(), res2.size());
+ ASSERT_EQ(res1.size(), res3.size());
+ std::vector<std::vector<size_t>> expect;
+ for (size_t i = 0; i < res1.size(); ++i) {
+ add_entry(expect, {res1[i], res2[i], res3[i]});
+ }
+ auto actual = run_three_loops(idx1_in, idx2_in, idx3_in, loop, stride1, stride2, stride3);
EXPECT_EQ(actual, expect);
}
std::vector<size_t> v(std::vector<size_t> vec) { return vec; }
-TEST(NestedLoopTest, single_nested_loop_can_be_executed) {
- EXPECT_EQ(v({123}), run_single(123, {}, {}));
- EXPECT_EQ(v({10,11}), run_single(10, {2}, {1}));
- EXPECT_EQ(v({100,110,101,111}), run_single(100, {2,2}, {1,10}));
- EXPECT_EQ(v({100,110,100,110,101,111,101,111}), run_single(100, {2,2,2}, {1,0,10}));
+TEST(NestedLoopTest, nested_loop_can_be_executed) {
+ EXPECT_EQ(v({123}), run_loop(123, {}, {}));
+ EXPECT_EQ(v({10,11}), run_loop(10, {2}, {1}));
+ EXPECT_EQ(v({100,110,101,111}), run_loop(100, {2,2}, {1,10}));
+ EXPECT_EQ(v({100,110,100,110,101,111,101,111}), run_loop(100, {2,2,2}, {1,0,10}));
EXPECT_EQ(v({100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),
- run_single(100, {2,2,2,2}, {8,4,2,1}));
+ run_loop(100, {2,2,2,2}, {8,4,2,1}));
+}
+
+TEST(NestedLoopTest, two_parallel_nested_loops_can_be_executed) {
+ verify_two(10, 20, {}, {}, {});
+ verify_two(10, 20, {3}, {5}, {7});
+ verify_two(10, 20, {3,3}, {2,3}, {7,5});
+ verify_two(10, 20, {3,3,2}, {2,0,3}, {0,7,5});
+ verify_two(10, 20, {2,3,2,3}, {7,2,1,3}, {3,7,5,1});
}
-TEST(NestedLoopTest, double_nested_loop_can_be_executed) {
- verify_double(10, 20, {}, {}, {});
- verify_double(10, 20, {3}, {5}, {7});
- verify_double(10, 20, {3,3}, {2,3}, {7,5});
- verify_double(10, 20, {3,3,2}, {2,0,3}, {0,7,5});
- verify_double(10, 20, {2,3,2,3}, {7,2,1,3}, {3,7,5,1});
+TEST(NestedLoopTest, three_parallel_nested_loops_can_be_executed) {
+ verify_three(10, 20, 30, {}, {}, {}, {});
+ verify_three(10, 20, 30, {3}, {5}, {7}, {3});
+ verify_three(10, 20, 30, {3,3}, {2,3}, {7,5}, {5, 3});
+ verify_three(10, 20, 30, {3,3,2}, {2,0,3}, {0,7,5}, {5, 3, 0});
+ verify_three(10, 20, 30, {2,3,2,3}, {7,2,1,3}, {3,7,5,1}, {1,5,7,3});
}
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/eval/value_type/value_type_test.cpp b/eval/src/tests/eval/value_type/value_type_test.cpp
index feb6fb5c368..2e505a9d16d 100644
--- a/eval/src/tests/eval/value_type/value_type_test.cpp
+++ b/eval/src/tests/eval/value_type/value_type_test.cpp
@@ -363,6 +363,20 @@ TEST("require that dimension index can be obtained") {
EXPECT_EQUAL(type("tensor(y[10],x{},z[5])").dimension_index("w"), ValueType::Dimension::npos);
}
+TEST("require that dimension stride can be calculated") {
+ EXPECT_EQUAL(type("error").stride_of("x"), 0u);
+ EXPECT_EQUAL(type("double").stride_of("x"), 0u);
+ EXPECT_EQUAL(type("tensor()").stride_of("x"), 0u);
+ EXPECT_EQUAL(type("tensor(x{})").stride_of("x"), 0u);
+ EXPECT_EQUAL(type("tensor(x[10])").stride_of("x"), 1u);
+ EXPECT_EQUAL(type("tensor(x[10])").stride_of("y"), 0u);
+ EXPECT_EQUAL(type("tensor(x[10],y[5])").stride_of("x"), 5u);
+ EXPECT_EQUAL(type("tensor(x[10],y[5],z[3])").stride_of("x"), 15u);
+ EXPECT_EQUAL(type("tensor(x[10],y[5],z[3])").stride_of("y"), 3u);
+ EXPECT_EQUAL(type("tensor(x[10],y[5],z[3])").stride_of("z"), 1u);
+ EXPECT_EQUAL(type("tensor(x[10],y{},z[3])").stride_of("x"), 3u);
+}
+
void verify_predicates(const ValueType &type,
bool expect_error, bool expect_double, bool expect_tensor,
bool expect_sparse, bool expect_dense, bool expect_mixed)
diff --git a/eval/src/tests/instruction/dense_join_reduce_plan/CMakeLists.txt b/eval/src/tests/instruction/dense_join_reduce_plan/CMakeLists.txt
new file mode 100644
index 00000000000..9de33bcf8a0
--- /dev/null
+++ b/eval/src/tests/instruction/dense_join_reduce_plan/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_dense_join_reduce_plan_test_app TEST
+ SOURCES
+ dense_join_reduce_plan_test.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
+vespa_add_test(NAME eval_dense_join_reduce_plan_test_app COMMAND eval_dense_join_reduce_plan_test_app)
diff --git a/eval/src/tests/instruction/dense_join_reduce_plan/dense_join_reduce_plan_test.cpp b/eval/src/tests/instruction/dense_join_reduce_plan/dense_join_reduce_plan_test.cpp
new file mode 100644
index 00000000000..9851e209ba5
--- /dev/null
+++ b/eval/src/tests/instruction/dense_join_reduce_plan/dense_join_reduce_plan_test.cpp
@@ -0,0 +1,101 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/instruction/dense_join_reduce_plan.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+using namespace vespalib::eval::instruction;
+
+ValueType type(const vespalib::string &type_spec) {
+ return ValueType::from_spec(type_spec);
+}
+
+TEST(DenseJoinReducePlanTest, make_trivial_plan) {
+ auto plan = DenseJoinReducePlan(type("double"), type("double"), type("double"));
+ EXPECT_TRUE(plan.distinct_result());
+ EXPECT_EQ(plan.lhs_size, 1);
+ EXPECT_EQ(plan.rhs_size, 1);
+ EXPECT_EQ(plan.res_size, 1);
+ EXPECT_TRUE(plan.loop_cnt.empty());
+ EXPECT_TRUE(plan.lhs_stride.empty());
+ EXPECT_TRUE(plan.rhs_stride.empty());
+ EXPECT_TRUE(plan.res_stride.empty());
+}
+
+TEST(DenseJoinReducePlanTest, execute_trivial_plan) {
+ auto plan = DenseJoinReducePlan(type("double"), type("double"), type("double"));
+ size_t res = 0;
+ auto join_reduce = [&](size_t a_idx, size_t b_idx, size_t c_idx) {
+ res += (12 + a_idx + b_idx + c_idx);
+ };
+ plan.execute(5, 10, 15, join_reduce);
+ EXPECT_EQ(res, 42);
+}
+
+TEST(DenseJoinReducePlanTest, make_simple_plan) {
+ auto plan = DenseJoinReducePlan(type("tensor(a[2])"), type("tensor(b[3])"), type("tensor(a[2])"));
+ SmallVector<size_t> expect_loop = {2,3};
+ SmallVector<size_t> expect_lhs_stride = {1,0};
+ SmallVector<size_t> expect_rhs_stride = {0,1};
+ SmallVector<size_t> expect_res_stride = {1,0};
+ EXPECT_FALSE(plan.distinct_result());
+ EXPECT_EQ(plan.lhs_size, 2);
+ EXPECT_EQ(plan.rhs_size, 3);
+ EXPECT_EQ(plan.res_size, 2);
+ EXPECT_EQ(plan.loop_cnt, expect_loop);
+ EXPECT_EQ(plan.lhs_stride, expect_lhs_stride);
+ EXPECT_EQ(plan.rhs_stride, expect_rhs_stride);
+ EXPECT_EQ(plan.res_stride, expect_res_stride);
+}
+
+TEST(DenseJoinReducePlanTest, execute_simple_plan) {
+ auto plan = DenseJoinReducePlan(type("tensor(a[2])"), type("tensor(b[3])"), type("tensor(a[2])"));
+ std::vector<int> a({1, 2});
+ std::vector<int> b({3, 4, 5});
+ std::vector<int> c(2, 0);
+ std::vector<int> expect = {12, 24};
+ ASSERT_EQ(plan.res_size, 2);
+ auto join_reduce = [&](size_t a_idx, size_t b_idx, size_t c_idx) { c[c_idx] += (a[a_idx] * b[b_idx]); };
+ plan.execute(0, 0, 0, join_reduce);
+ EXPECT_EQ(c, expect);
+}
+
+TEST(DenseJoinReducePlanTest, make_distinct_plan) {
+ auto plan = DenseJoinReducePlan(type("tensor(a[2])"),
+ type("tensor(b[3])"),
+ type("tensor(a[2],b[3])"));
+ SmallVector<size_t> expect_loop = {2,3};
+ SmallVector<size_t> expect_lhs_stride = {1,0};
+ SmallVector<size_t> expect_rhs_stride = {0,1};
+ SmallVector<size_t> expect_res_stride = {3,1};
+ EXPECT_TRUE(plan.distinct_result());
+ EXPECT_EQ(plan.lhs_size, 2);
+ EXPECT_EQ(plan.rhs_size, 3);
+ EXPECT_EQ(plan.res_size, 6);
+ EXPECT_EQ(plan.loop_cnt, expect_loop);
+ EXPECT_EQ(plan.lhs_stride, expect_lhs_stride);
+ EXPECT_EQ(plan.rhs_stride, expect_rhs_stride);
+ EXPECT_EQ(plan.res_stride, expect_res_stride);
+}
+
+TEST(DenseJoinReducePlanTest, make_complex_plan) {
+ auto lhs = type("tensor(a{},b[6],c[5],e[3],f[2],g{})");
+ auto rhs = type("tensor(a{},b[6],c[5],d[4],h{})");
+ auto res = type("tensor(a{},b[6],c[5],d[4],e[3])");
+ auto plan = DenseJoinReducePlan(lhs, rhs, res);
+ SmallVector<size_t> expect_loop = {30,4,3,2};
+ SmallVector<size_t> expect_lhs_stride = {6,0,2,1};
+ SmallVector<size_t> expect_rhs_stride = {4,1,0,0};
+ SmallVector<size_t> expect_res_stride = {12,3,1,0};
+ EXPECT_FALSE(plan.distinct_result());
+ EXPECT_EQ(plan.lhs_size, 180);
+ EXPECT_EQ(plan.rhs_size, 120);
+ EXPECT_EQ(plan.res_size, 360);
+ EXPECT_EQ(plan.loop_cnt, expect_loop);
+ EXPECT_EQ(plan.lhs_stride, expect_lhs_stride);
+ EXPECT_EQ(plan.rhs_stride, expect_rhs_stride);
+ EXPECT_EQ(plan.res_stride, expect_res_stride);
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/vespa/eval/eval/nested_loop.h b/eval/src/vespa/eval/eval/nested_loop.h
index 92e84f5c052..695b212dd90 100644
--- a/eval/src/vespa/eval/eval/nested_loop.h
+++ b/eval/src/vespa/eval/eval/nested_loop.h
@@ -65,6 +65,28 @@ template <typename F> void execute_many(size_t idx1, size_t idx2, const size_t *
//-----------------------------------------------------------------------------
+template <typename F, size_t N> void execute_few(size_t idx1, size_t idx2, size_t idx3, const size_t *loop, const size_t *stride1, const size_t *stride2, const size_t *stride3, const F &f) {
+ if constexpr (N == 0) {
+ f(idx1, idx2, idx3);
+ } else {
+ for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2, idx3 += *stride3) {
+ execute_few<F, N - 1>(idx1, idx2, idx3, loop + 1, stride1 + 1, stride2 + 1, stride3 + 1, f);
+ }
+ }
+}
+
+template <typename F> void execute_many(size_t idx1, size_t idx2, size_t idx3, const size_t *loop, const size_t *stride1, const size_t *stride2, const size_t *stride3, size_t levels, const F &f) {
+ for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2, idx3 += *stride3) {
+ if ((levels - 1) == 3) {
+ execute_few<F, 3>(idx1, idx2, idx3, loop + 1, stride1 + 1, stride2 + 1, stride3 + 1, f);
+ } else {
+ execute_many<F>(idx1, idx2, idx3, loop + 1, stride1 + 1, stride2 + 1, stride3 + 1, levels - 1, f);
+ }
+ }
+}
+
+//-----------------------------------------------------------------------------
+
} // implementation details
// Run a nested loop and pass indexes to 'f'
@@ -95,4 +117,19 @@ void run_nested_loop(size_t idx1, size_t idx2, const V &loop, const V &stride1,
}
}
+// Run three nested loops in parallel and all three indexes to
+// 'f'. Note that 'loop' is shared, which means that only individual
+// strides may differ between the three loops.
+template <typename F, typename V>
+void run_nested_loop(size_t idx1, size_t idx2, size_t idx3, const V &loop, const V &stride1, const V &stride2, const V &stride3, const F &f) {
+ size_t levels = loop.size();
+ switch(levels) {
+ case 0: return f(idx1, idx2, idx3);
+ case 1: return nested_loop::execute_few<F, 1>(idx1, idx2, idx3, &loop[0], &stride1[0], &stride2[0], &stride3[0], f);
+ case 2: return nested_loop::execute_few<F, 2>(idx1, idx2, idx3, &loop[0], &stride1[0], &stride2[0], &stride3[0], f);
+ case 3: return nested_loop::execute_few<F, 3>(idx1, idx2, idx3, &loop[0], &stride1[0], &stride2[0], &stride3[0], f);
+ default: return nested_loop::execute_many<F>(idx1, idx2, idx3, &loop[0], &stride1[0], &stride2[0], &stride3[0], levels, f);
+ }
+}
+
} // namespace
diff --git a/eval/src/vespa/eval/eval/value_type.cpp b/eval/src/vespa/eval/eval/value_type.cpp
index 7d088b22e06..ae0ea1a0cd6 100644
--- a/eval/src/vespa/eval/eval/value_type.cpp
+++ b/eval/src/vespa/eval/eval/value_type.cpp
@@ -155,7 +155,8 @@ ValueType::error_if(bool has_error, ValueType else_type)
ValueType::~ValueType() = default;
bool
-ValueType::is_double() const {
+ValueType::is_double() const
+{
if (!_error && _dimensions.empty()) {
assert(_cell_type == CellType::DOUBLE);
return true;
@@ -240,7 +241,8 @@ ValueType::dense_subspace_size() const
}
std::vector<ValueType::Dimension>
-ValueType::nontrivial_indexed_dimensions() const {
+ValueType::nontrivial_indexed_dimensions() const
+{
std::vector<ValueType::Dimension> result;
for (const auto &dim: dimensions()) {
if (dim.is_indexed() && !dim.is_trivial()) {
@@ -251,7 +253,8 @@ ValueType::nontrivial_indexed_dimensions() const {
}
std::vector<ValueType::Dimension>
-ValueType::indexed_dimensions() const {
+ValueType::indexed_dimensions() const
+{
std::vector<ValueType::Dimension> result;
for (const auto &dim: dimensions()) {
if (dim.is_indexed()) {
@@ -262,7 +265,8 @@ ValueType::indexed_dimensions() const {
}
std::vector<ValueType::Dimension>
-ValueType::mapped_dimensions() const {
+ValueType::mapped_dimensions() const
+{
std::vector<ValueType::Dimension> result;
for (const auto &dim: dimensions()) {
if (dim.is_mapped()) {
@@ -273,10 +277,27 @@ ValueType::mapped_dimensions() const {
}
size_t
-ValueType::dimension_index(const vespalib::string &name) const {
+ValueType::dimension_index(const vespalib::string &name) const
+{
return my_dimension_index(_dimensions, name);
}
+size_t
+ValueType::stride_of(const vespalib::string &name) const
+{
+ size_t stride = 0;
+ for (const auto &dim: dimensions()) {
+ if (dim.is_indexed()) {
+ if (stride == 0 && dim.name == name) {
+ stride = 1;
+ } else {
+ stride *= dim.size;
+ }
+ }
+ }
+ return stride;
+}
+
std::vector<vespalib::string>
ValueType::dimension_names() const
{
diff --git a/eval/src/vespa/eval/eval/value_type.h b/eval/src/vespa/eval/eval/value_type.h
index ddbdcfb4a5f..3bdfcbdd980 100644
--- a/eval/src/vespa/eval/eval/value_type.h
+++ b/eval/src/vespa/eval/eval/value_type.h
@@ -69,6 +69,7 @@ public:
std::vector<Dimension> indexed_dimensions() const;
std::vector<Dimension> mapped_dimensions() const;
size_t dimension_index(const vespalib::string &name) const;
+ size_t stride_of(const vespalib::string &name) const;
bool has_dimension(const vespalib::string &name) const {
return (dimension_index(name) != Dimension::npos);
}
diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt
index 487539aeff1..5fea5052eba 100644
--- a/eval/src/vespa/eval/instruction/CMakeLists.txt
+++ b/eval/src/vespa/eval/instruction/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(eval_instruction OBJECT
dense_cell_range_function.cpp
dense_dot_product_function.cpp
dense_hamming_distance.cpp
+ dense_join_reduce_plan.cpp
dense_lambda_peek_function.cpp
dense_lambda_peek_optimizer.cpp
dense_matmul_function.cpp
diff --git a/eval/src/vespa/eval/instruction/dense_join_reduce_plan.cpp b/eval/src/vespa/eval/instruction/dense_join_reduce_plan.cpp
new file mode 100644
index 00000000000..20b7d3364a8
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/dense_join_reduce_plan.cpp
@@ -0,0 +1,95 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "dense_join_reduce_plan.h"
+#include <vespa/vespalib/util/overload.h>
+#include <vespa/vespalib/util/visit_ranges.h>
+#include <cassert>
+
+namespace vespalib::eval::instruction {
+
+namespace {
+
+using Dim = ValueType::Dimension;
+using Dims = std::vector<ValueType::Dimension>;
+
+void visit(auto &v, const Dims &a, const Dims &b) {
+ visit_ranges(v, a.begin(), a.end(), b.begin(), b.end(),
+ [](const auto &x, const auto &y){ return (x.name < y.name); });
+}
+
+Dims merge(const Dims &first, const Dims &second) {
+ Dims result;
+ auto visitor = overload {
+ [&result](visit_ranges_either, const Dim &dim) { result.push_back(dim); },
+ [&result](visit_ranges_both, const Dim &dim, const Dim &) { result.push_back(dim); }
+ };
+ visit(visitor, first, second);
+ return result;
+}
+
+size_t count_only_in_second(const Dims &first, const Dims &second) {
+ size_t result = 0;
+ auto visitor = overload {
+ [](visit_ranges_first, const Dim &) {},
+ [&result](visit_ranges_second, const Dim &) { ++result; },
+ [](visit_ranges_both, const Dim &, const Dim &) {}
+ };
+ visit(visitor, first, second);
+ return result;
+}
+
+struct Strides {
+ size_t lhs;
+ size_t rhs;
+ size_t res;
+ Strides() noexcept : lhs(0), rhs(0), res(0) {}
+ Strides(size_t lhs_in, size_t rhs_in, size_t res_in) noexcept
+ : lhs(lhs_in), rhs(rhs_in), res(res_in) {}
+ bool can_combine_with(const Strides &prev) const noexcept {
+ return ((lhs > 0) == (prev.lhs > 0)) &&
+ ((rhs > 0) == (prev.rhs > 0)) &&
+ ((res > 0) == (prev.res > 0));
+ }
+};
+
+} // <unnamed>
+
+DenseJoinReducePlan::DenseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res)
+ : lhs_size(lhs.dense_subspace_size()), rhs_size(rhs.dense_subspace_size()), res_size(res.dense_subspace_size()),
+ loop_cnt(), lhs_stride(), rhs_stride(), res_stride()
+{
+ auto dims = merge(lhs.nontrivial_indexed_dimensions(), rhs.nontrivial_indexed_dimensions());
+ assert(count_only_in_second(dims, res.nontrivial_indexed_dimensions()) == 0);
+ Strides prev_strides;
+ for (const auto &dim: dims) {
+ Strides strides(lhs.stride_of(dim.name), rhs.stride_of(dim.name), res.stride_of(dim.name));
+ if (strides.can_combine_with(prev_strides)) {
+ assert(!loop_cnt.empty());
+ loop_cnt.back() *= dim.size;
+ lhs_stride.back() = strides.lhs;
+ rhs_stride.back() = strides.rhs;
+ res_stride.back() = strides.res;
+ } else {
+ loop_cnt.push_back(dim.size);
+ lhs_stride.push_back(strides.lhs);
+ rhs_stride.push_back(strides.rhs);
+ res_stride.push_back(strides.res);
+ }
+ prev_strides = strides;
+ }
+}
+
+DenseJoinReducePlan::~DenseJoinReducePlan() = default;
+
+bool
+DenseJoinReducePlan::distinct_result() const
+{
+ for (size_t stride: res_stride) {
+ if (stride == 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace
diff --git a/eval/src/vespa/eval/instruction/dense_join_reduce_plan.h b/eval/src/vespa/eval/instruction/dense_join_reduce_plan.h
new file mode 100644
index 00000000000..8f9d5218630
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/dense_join_reduce_plan.h
@@ -0,0 +1,27 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/value_type.h>
+#include <vespa/eval/eval/nested_loop.h>
+#include <vespa/vespalib/util/small_vector.h>
+
+namespace vespalib::eval::instruction {
+
+struct DenseJoinReducePlan {
+ size_t lhs_size;
+ size_t rhs_size;
+ size_t res_size;
+ SmallVector<size_t> loop_cnt;
+ SmallVector<size_t> lhs_stride;
+ SmallVector<size_t> rhs_stride;
+ SmallVector<size_t> res_stride;
+ DenseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res);
+ ~DenseJoinReducePlan();
+ template <typename F> void execute(size_t lhs, size_t rhs, size_t res, const F &f) const {
+ run_nested_loop(lhs, rhs, res, loop_cnt, lhs_stride, rhs_stride, res_stride, f);
+ }
+ bool distinct_result() const;
+};
+
+} // namespace