author    Håvard Pettersen <havardpe@yahooinc.com>  2023-08-25 09:30:20 +0000
committer Håvard Pettersen <havardpe@yahooinc.com>  2023-08-29 13:56:08 +0000
commit    82ef4f4dfde0a5dc0ec722b54d66b70012fa2966 (patch)
tree      d3a8370f33c7c0f50a019cb2af726ec94224a953 /eval
parent    eb72c809f9ef74d8e300f21321486e8fe4f6b527 (diff)
added universal dot product
note that the optimization is not yet active in production
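
For orientation, a hedged illustration of the expression family the new instruction targets: any sum-reduction applied directly to an elementwise product, over any mix of dense, sparse, and mixed dimensions. The concrete expressions below are illustrative assumptions, not taken from the patch; Function::parse is the regular eval API.

// minimal sketch: expressions of the shape reduce(a*b, sum, ...) are
// "dot product compatible" in the sense used by this commit
#include <vespa/eval/eval/function.h>

int main() {
    using vespalib::eval::Function;
    auto full    = Function::parse("reduce(a*b,sum)");     // reduce every dimension
    auto partial = Function::parse("reduce(a*b,sum,y)");   // reduce a single dimension
    auto multi   = Function::parse("reduce(a*b,sum,x,y)"); // reduce several dimensions
    return (full->has_error() || partial->has_error() || multi->has_error()) ? 1 : 0;
}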
Diffstat (limited to 'eval')
-rw-r--r--  eval/CMakeLists.txt                                                             |   1
-rw-r--r--  eval/src/tests/instruction/universal_dot_product/CMakeLists.txt                 |   9
-rw-r--r--  eval/src/tests/instruction/universal_dot_product/universal_dot_product_test.cpp |  89
-rw-r--r--  eval/src/vespa/eval/eval/optimize_tensor_function.cpp                           |   2
-rw-r--r--  eval/src/vespa/eval/instruction/CMakeLists.txt                                  |   1
-rw-r--r--  eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp                     |   5
-rw-r--r--  eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h                       |   3
-rw-r--r--  eval/src/vespa/eval/instruction/universal_dot_product.cpp                       | 119
-rw-r--r--  eval/src/vespa/eval/instruction/universal_dot_product.h                         |  22
9 files changed, 250 insertions(+), 1 deletion(-)
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index 9b6bcc990fc..34f1cc6788d 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -89,6 +89,7 @@ vespa_define_module(
src/tests/instruction/sparse_no_overlap_join_function
src/tests/instruction/sparse_singledim_lookup
src/tests/instruction/sum_max_dot_product_function
+ src/tests/instruction/universal_dot_product
src/tests/instruction/unpack_bits_function
src/tests/instruction/vector_from_doubles_function
src/tests/streamed/value
diff --git a/eval/src/tests/instruction/universal_dot_product/CMakeLists.txt b/eval/src/tests/instruction/universal_dot_product/CMakeLists.txt
new file mode 100644
index 00000000000..19023b48d04
--- /dev/null
+++ b/eval/src/tests/instruction/universal_dot_product/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_universal_dot_product_test_app TEST
+ SOURCES
+ universal_dot_product_test.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
+vespa_add_test(NAME eval_universal_dot_product_test_app COMMAND eval_universal_dot_product_test_app)
diff --git a/eval/src/tests/instruction/universal_dot_product/universal_dot_product_test.cpp b/eval/src/tests/instruction/universal_dot_product/universal_dot_product_test.cpp
new file mode 100644
index 00000000000..3f60ad69b86
--- /dev/null
+++ b/eval/src/tests/instruction/universal_dot_product/universal_dot_product_test.cpp
@@ -0,0 +1,89 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/fast_value.h>
+#include <vespa/eval/eval/value_codec.h>
+#include <vespa/eval/eval/interpreted_function.h>
+#include <vespa/eval/eval/tensor_function.h>
+#include <vespa/eval/instruction/universal_dot_product.h>
+#include <vespa/eval/eval/test/reference_operations.h>
+#include <vespa/eval/eval/test/gen_spec.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+using namespace vespalib::eval::test;
+
+using vespalib::make_string_short::fmt;
+
+const ValueBuilderFactory &prod_factory = FastValueBuilderFactory::get();
+
+GenSpec::seq_t N_16ths = [] (size_t i) noexcept { return (i + 33.0) / 16.0; };
+
+GenSpec G() { return GenSpec().seq(N_16ths); }
+
+const std::vector<GenSpec> layouts = {
+ G(), G(),
+ G().idx("x", 5), G().idx("x", 5),
+ G().idx("x", 5), G().idx("y", 5),
+ G().idx("x", 5), G().idx("x", 5).idx("y", 5),
+ G().idx("y", 3), G().idx("x", 2).idx("z", 3),
+ G().idx("x", 3).idx("y", 5), G().idx("y", 5).idx("z", 7),
+ G().map("x", {"a","b","c"}), G().map("x", {"a","b","c"}),
+ G().map("x", {"a","b","c"}), G().map("x", {"a","b"}),
+ G().map("x", {"a","b","c"}), G().map("y", {"foo","bar","baz"}),
+ G().map("x", {"a","b","c"}), G().map("x", {"a","b","c"}).map("y", {"foo","bar","baz"}),
+ G().map("x", {"a","b"}).map("y", {"foo","bar","baz"}), G().map("x", {"a","b","c"}).map("y", {"foo","bar"}),
+ G().map("x", {"a","b"}).map("y", {"foo","bar","baz"}), G().map("y", {"foo","bar"}).map("z", {"i","j","k","l"}),
+ G().idx("x", 3).map("y", {"foo", "bar"}), G().map("y", {"foo", "bar"}).idx("z", 7),
+ G().map("x", {"a","b","c"}).idx("y", 5), G().idx("y", 5).map("z", {"i","j","k","l"})
+};
+
+const std::vector<std::vector<vespalib::string>> reductions = {
+ {}, {"x"}, {"y"}, {"z"}, {"x", "y"}, {"x", "z"}, {"y", "z"}
+};
+
+TensorSpec perform_dot_product(const TensorSpec &a, const TensorSpec &b, const std::vector<vespalib::string> &dims)
+{
+ Stash stash;
+ auto lhs = value_from_spec(a, prod_factory);
+ auto rhs = value_from_spec(b, prod_factory);
+ auto res_type = ValueType::join(lhs->type(), rhs->type()).reduce(dims);
+ EXPECT_FALSE(res_type.is_error());
+ UniversalDotProduct dot_product(res_type,
+ tensor_function::inject(lhs->type(), 0, stash),
+ tensor_function::inject(rhs->type(), 1, stash));
+ auto my_op = dot_product.compile_self(prod_factory, stash);
+ InterpretedFunction::EvalSingle single(prod_factory, my_op);
+ return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs,*rhs})));
+}
+
+TEST(UniversalDotProductTest, generic_dot_product_works_for_various_cases) {
+ size_t test_cases = 0;
+ ASSERT_TRUE((layouts.size() % 2) == 0);
+ for (size_t i = 0; i < layouts.size(); i += 2) {
+ const auto &l = layouts[i];
+ const auto &r = layouts[i+1];
+ for (CellType lct : CellTypeUtils::list_types()) {
+ auto lhs = l.cpy().cells(lct);
+ if (lhs.bad_scalar()) continue;
+ for (CellType rct : CellTypeUtils::list_types()) {
+ auto rhs = r.cpy().cells(rct);
+ if (rhs.bad_scalar()) continue;
+ for (const std::vector<vespalib::string> &dims: reductions) {
+ if (ValueType::join(lhs.type(), rhs.type()).reduce(dims).is_error()) continue;
+ ++test_cases;
+ SCOPED_TRACE(fmt("\n===\nLHS: %s\nRHS: %s\n===\n", lhs.gen().to_string().c_str(), rhs.gen().to_string().c_str()));
+ auto expect = ReferenceOperations::reduce(ReferenceOperations::join(lhs, rhs, operation::Mul::f), Aggr::SUM, dims);
+ auto actual = perform_dot_product(lhs, rhs, dims);
+ // fprintf(stderr, "\n===\nLHS: %s\nRHS: %s\n===\nRESULT: %s\n===\n", lhs.gen().to_string().c_str(), rhs.gen().to_string().c_str(), actual.to_string().c_str());
+ EXPECT_EQ(actual, expect);
+ }
+ }
+ }
+ }
+ EXPECT_GT(test_cases, 500);
+ fprintf(stderr, "total test cases run: %zu\n", test_cases);
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
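
For a single concrete case, a hedged usage sketch of the perform_dot_product helper above, assuming the same test fixtures (GenSpec converts implicitly to TensorSpec in the eval test utilities, as the test loop itself relies on):

// sketch: dot product of tensor(x[3],y[5]) and tensor(y[5],z[7]), reducing y;
// the expected result has type tensor(x[3],z[7])
auto lhs = GenSpec().idx("x", 3).idx("y", 5);
auto rhs = GenSpec().idx("y", 5).idx("z", 7);
auto expect = ReferenceOperations::reduce(
        ReferenceOperations::join(lhs, rhs, operation::Mul::f), Aggr::SUM, {"y"});
auto actual = perform_dot_product(lhs, rhs, {"y"});
EXPECT_EQ(actual, expect);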
diff --git a/eval/src/vespa/eval/eval/optimize_tensor_function.cpp b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
index 1d0be47a309..3d9152d6b80 100644
--- a/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
+++ b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
@@ -37,6 +37,7 @@
#include <vespa/eval/instruction/mixed_l2_distance.h>
#include <vespa/eval/instruction/simple_join_count.h>
#include <vespa/eval/instruction/mapped_lookup.h>
+#include <vespa/eval/instruction/universal_dot_product.h>
#include <vespa/log/log.h>
LOG_SETUP(".eval.eval.optimize_tensor_function");
@@ -88,6 +89,7 @@ const TensorFunction &optimize_for_factory(const ValueBuilderFactory &, const Te
child.set(DenseHammingDistance::optimize(child.get(), stash));
child.set(SimpleJoinCount::optimize(child.get(), stash));
child.set(MappedLookup::optimize(child.get(), stash));
+ // child.set(UniversalDotProduct::optimize(child.get(), stash));
});
run_optimize_pass(root, [&stash](const Child &child)
{
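
Note that the registration for the new optimization is wired in but deliberately left commented out, matching the commit message ("not yet active in production"): the include and the pass hook are in place, so enabling it later is a one-line change.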
diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt
index 5fea5052eba..006a363a64f 100644
--- a/eval/src/vespa/eval/instruction/CMakeLists.txt
+++ b/eval/src/vespa/eval/instruction/CMakeLists.txt
@@ -49,6 +49,7 @@ vespa_add_library(eval_instruction OBJECT
sparse_no_overlap_join_function.cpp
sparse_singledim_lookup.cpp
sum_max_dot_product_function.cpp
+ universal_dot_product.cpp
unpack_bits_function.cpp
vector_from_doubles_function.cpp
)
diff --git a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp
index 806e24dd7df..00499e7f997 100644
--- a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp
+++ b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp
@@ -153,7 +153,7 @@ est_fun select_estimate(const std::vector<Est> &est_list) {
} // <unnamed>
SparseJoinReducePlan::SparseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res)
- : _in_lhs(), _in_rhs(), _in_res(), _estimate()
+ : _in_lhs(), _in_rhs(), _in_res(), _res_dims(0), _estimate()
{
auto dims = merge(lhs.mapped_dimensions(), rhs.mapped_dimensions());
assert(count_only_in_second(dims, res.mapped_dimensions()) == 0);
@@ -162,6 +162,9 @@ SparseJoinReducePlan::SparseJoinReducePlan(const ValueType &lhs, const ValueType
_in_lhs.push_back(lhs.has_dimension(dim.name));
_in_rhs.push_back(rhs.has_dimension(dim.name));
_in_res.push_back(res.has_dimension(dim.name));
+ if (_in_res.back()) {
+ ++_res_dims;
+ }
update_est_list(est_list, _in_lhs.back(), _in_rhs.back(), _in_res.back());
}
_estimate = select_estimate(est_list);
diff --git a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h
index 8864c56887a..c93bf46e2dc 100644
--- a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h
+++ b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h
@@ -21,11 +21,14 @@ private:
BitList _in_lhs;
BitList _in_rhs;
BitList _in_res;
+ size_t _res_dims;
est_fun_t _estimate;
public:
SparseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res);
~SparseJoinReducePlan();
+ size_t res_dims() const { return _res_dims; }
+ bool distinct_result() const { return (_res_dims == _in_res.size()); }
size_t estimate_result_size(const Value::Index &lhs, const Value::Index &rhs) const {
return _estimate(lhs.size(), rhs.size());
}
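
A hedged worked example of the two new accessors (the tensor types are assumptions chosen for illustration; ValueType::from_spec is the regular eval API):

// merged mapped dimensions are {x,y,z}; y is reduced away,
// so only two of the three survive into the result
SparseJoinReducePlan plan(ValueType::from_spec("tensor(x{},y{})"),   // lhs
                          ValueType::from_spec("tensor(y{},z{})"),   // rhs
                          ValueType::from_spec("tensor(x{},z{})"));  // res
// plan.res_dims() == 2
// plan.distinct_result() == false: since y was reduced, several joined
// address pairs may land on the same result address; keeping all merged
// dimensions (res == tensor(x{},y{},z{})) would make it true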
diff --git a/eval/src/vespa/eval/instruction/universal_dot_product.cpp b/eval/src/vespa/eval/instruction/universal_dot_product.cpp
new file mode 100644
index 00000000000..79a94d862bf
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/universal_dot_product.cpp
@@ -0,0 +1,119 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "universal_dot_product.h"
+#include "sparse_join_reduce_plan.h"
+#include "dense_join_reduce_plan.h"
+#include <vespa/eval/eval/inline_operation.h>
+#include <vespa/eval/eval/fast_value.hpp>
+
+namespace vespalib::eval {
+
+using namespace tensor_function;
+using namespace instruction;
+using namespace operation;
+
+namespace {
+
+struct UniversalDotProductParam {
+ ValueType res_type;
+ SparseJoinReducePlan sparse_plan;
+ DenseJoinReducePlan dense_plan;
+ size_t vector_size;
+
+ UniversalDotProductParam(const ValueType &res_type_in,
+ const ValueType &lhs_type,
+ const ValueType &rhs_type)
+ : res_type(res_type_in),
+ sparse_plan(lhs_type, rhs_type, res_type),
+ dense_plan(lhs_type, rhs_type, res_type),
+ vector_size(1)
+ {
+ if (!dense_plan.loop_cnt.empty() &&
+ dense_plan.lhs_stride.back() == 1 &&
+ dense_plan.rhs_stride.back() == 1 &&
+ dense_plan.res_stride.back() == 0)
+ {
+ vector_size = dense_plan.loop_cnt.back();
+ dense_plan.loop_cnt.pop_back();
+ dense_plan.lhs_stride.pop_back();
+ dense_plan.rhs_stride.pop_back();
+ dense_plan.res_stride.pop_back();
+ }
+ }
+};
+
+template <typename LCT, typename RCT, typename OCT>
+void my_universal_dot_product_op(InterpretedFunction::State &state, uint64_t param_in) {
+ using dot_product = DotProduct<LCT,RCT>;
+ const auto &param = unwrap_param<UniversalDotProductParam>(param_in);
+ const auto &lhs = state.peek(1);
+ const auto &rhs = state.peek(0);
+ const auto &lhs_index = lhs.index();
+ const auto &rhs_index = rhs.index();
+ const auto lhs_cells = lhs.cells().typify<LCT>();
+ const auto rhs_cells = rhs.cells().typify<RCT>();
+ auto &stored_result = state.stash.create<std::unique_ptr<FastValue<OCT,true>>>(
+ std::make_unique<FastValue<OCT,true>>(param.res_type, param.sparse_plan.res_dims(), param.dense_plan.res_size,
+ param.sparse_plan.estimate_result_size(lhs_index, rhs_index)));
+ auto &result = *(stored_result.get());
+ ArrayRef<OCT> dst;
+ auto dense_fun = [&](size_t lhs_idx, size_t rhs_idx, size_t dst_idx) {
+ dst[dst_idx] += dot_product::apply(&lhs_cells[lhs_idx], &rhs_cells[rhs_idx], param.vector_size);
+ };
+ auto sparse_fun = [&](size_t lhs_subspace, size_t rhs_subspace, ConstArrayRef<string_id> res_addr) {
+ bool first;
+ std::tie(dst, first) = result.insert_subspace(res_addr);
+ if (first) {
+ std::fill(dst.begin(), dst.end(), OCT{});
+ }
+ param.dense_plan.execute(lhs_subspace * param.dense_plan.lhs_size,
+ rhs_subspace * param.dense_plan.rhs_size,
+ 0, dense_fun);
+ };
+ param.sparse_plan.execute(lhs_index, rhs_index, sparse_fun);
+ state.pop_pop_push(result);
+}
+
+struct SelectUniversalDotProduct {
+ template <typename LCM, typename RCM, typename SCALAR> static auto invoke(const UniversalDotProductParam &) {
+ constexpr CellMeta ocm = CellMeta::join(LCM::value, RCM::value).reduce(SCALAR::value);
+ using LCT = CellValueType<LCM::value.cell_type>;
+ using RCT = CellValueType<RCM::value.cell_type>;
+ using OCT = CellValueType<ocm.cell_type>;
+ return my_universal_dot_product_op<LCT,RCT,OCT>;
+ }
+};
+
+} // namespace <unnamed>
+
+UniversalDotProduct::UniversalDotProduct(const ValueType &res_type_in,
+ const TensorFunction &lhs_in,
+ const TensorFunction &rhs_in)
+ : tensor_function::Op2(res_type_in, lhs_in, rhs_in)
+{
+}
+
+InterpretedFunction::Instruction
+UniversalDotProduct::compile_self(const ValueBuilderFactory &, Stash &stash) const
+{
+ auto &param = stash.create<UniversalDotProductParam>(result_type(), lhs().result_type(), rhs().result_type());
+ using MyTypify = TypifyValue<TypifyCellMeta,TypifyBool>;
+ auto op = typify_invoke<3,MyTypify,SelectUniversalDotProduct>(lhs().result_type().cell_meta(),
+ rhs().result_type().cell_meta(),
+ result_type().cell_meta().is_scalar,
+ param);
+ return InterpretedFunction::Instruction(op, wrap_param<UniversalDotProductParam>(param));
+}
+
+const TensorFunction &
+UniversalDotProduct::optimize(const TensorFunction &expr, Stash &stash)
+{
+ if (auto reduce = as<Reduce>(expr); reduce && (reduce->aggr() == Aggr::SUM)) {
+ if (auto join = as<Join>(reduce->child()); join && (join->function() == Mul::f)) {
+ return stash.create<UniversalDotProduct>(expr.result_type(), join->lhs(), join->rhs());
+ }
+ }
+ return expr;
+}
+
+} // namespace
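
One detail of UniversalDotProductParam worth spelling out: the constructor peels off the innermost dense loop whenever both inputs step through it contiguously while the result stands still, turning that loop into a single vectorized dot product. A hedged sketch with assumed types (DenseJoinReducePlan and its public loop/stride vectors are used exactly as in the code above):

// assumed case: lhs = tensor(x[3],y[5]), rhs = tensor(y[5]), reduce y
auto lhs = ValueType::from_spec("tensor(x[3],y[5])");
auto rhs = ValueType::from_spec("tensor(y[5])");
auto res = ValueType::from_spec("tensor(x[3])");
DenseJoinReducePlan plan(lhs, rhs, res);
// innermost loop is y: loop_cnt.back() == 5, lhs_stride.back() == 1,
// rhs_stride.back() == 1, res_stride.back() == 0, so the param constructor
// pops that loop and sets vector_size = 5; each remaining x iteration then
// calls DotProduct::apply on 5 contiguous cells instead of accumulating
// element by element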
diff --git a/eval/src/vespa/eval/instruction/universal_dot_product.h b/eval/src/vespa/eval/instruction/universal_dot_product.h
new file mode 100644
index 00000000000..ac5aa157f17
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/universal_dot_product.h
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/tensor_function.h>
+
+namespace vespalib::eval {
+
+/**
+ * Tensor function performing dot product compatible operations
+ * (join:mul, reduce:sum) on values of arbitrary complexity.
+ **/
+class UniversalDotProduct : public tensor_function::Op2
+{
+public:
+ UniversalDotProduct(const ValueType &res_type, const TensorFunction &lhs, const TensorFunction &rhs);
+ InterpretedFunction::Instruction compile_self(const ValueBuilderFactory &factory, Stash &stash) const override;
+ bool result_is_mutable() const override { return true; }
+ static const TensorFunction &optimize(const TensorFunction &expr, Stash &stash);
+};
+
+} // namespace
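
Finally, a hedged sketch of driving the optimize entry point by hand, mirroring how the commented-out production hook would invoke it (the tensor_function helpers are the same ones the test uses; the concrete types are assumptions):

#include <vespa/eval/eval/operation.h>
#include <vespa/eval/eval/tensor_function.h>
#include <vespa/eval/instruction/universal_dot_product.h>

using namespace vespalib;
using namespace vespalib::eval;

void sketch(Stash &stash) {
    // build reduce(a*b, sum, y) by hand over mixed tensors (assumed types)
    const auto &a = tensor_function::inject(ValueType::from_spec("tensor(x{},y[5])"), 0, stash);
    const auto &b = tensor_function::inject(ValueType::from_spec("tensor(y[5],z{})"), 1, stash);
    const auto &mul = tensor_function::join(a, b, operation::Mul::f, stash);
    const auto &sum_y = tensor_function::reduce(mul, Aggr::SUM, {"y"}, stash);
    // matches Reduce(SUM) directly over Join(Mul), so the two-step
    // join+reduce is replaced by a single UniversalDotProduct node
    const TensorFunction &optimized = UniversalDotProduct::optimize(sum_y, stash);
    (void) optimized;
}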