diff options
Diffstat (limited to 'eval')
4 files changed, 330 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index 66ee9916c83..33bd098975a 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -56,6 +56,7 @@ vespa_define_module( src/tests/tensor/direct_dense_tensor_builder src/tests/tensor/direct_sparse_tensor_builder src/tests/tensor/index_lookup_table + src/tests/tensor/instruction_benchmark src/tests/tensor/onnx_wrapper src/tests/tensor/packed_mappings src/tests/tensor/tensor_add_operation diff --git a/eval/src/tests/tensor/instruction_benchmark/.gitignore b/eval/src/tests/tensor/instruction_benchmark/.gitignore new file mode 100644 index 00000000000..31b087883e0 --- /dev/null +++ b/eval/src/tests/tensor/instruction_benchmark/.gitignore @@ -0,0 +1 @@ +/eval_instruction_benchmark_app diff --git a/eval/src/tests/tensor/instruction_benchmark/CMakeLists.txt b/eval/src/tests/tensor/instruction_benchmark/CMakeLists.txt new file mode 100644 index 00000000000..d2384eaf129 --- /dev/null +++ b/eval/src/tests/tensor/instruction_benchmark/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_instruction_benchmark_app TEST + SOURCES + instruction_benchmark.cpp + DEPENDS + vespaeval + GTest::GTest +) diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp new file mode 100644 index 00000000000..8bb227a7e85 --- /dev/null +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -0,0 +1,320 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +// Microbenchmark exploring performance differences between +// interpreted function instructions. + +// This benchmark was initially written to measure the difference in +// performance between (old) instructions using the TensorEngine +// immediate API and (new) instructions using the Value API +// directly. Note that all previous optimizations for dense tensors +// are trivially transformed to use the Value API, and thus only the +// generic cases need to be compared. Specifically; we want to make +// sure join performance for sparse tensors with full dimensional +// overlap does not suffer too much. Also, we want to showcase an +// improvement in generic dense join and possibly also in sparse join +// with partial dimensional overlap. Benchmarks are done using float +// cells since this is what gives best overall performance in +// production. Also, we use the multiply operation since it is the +// most optimized operations across all implementations. When +// benchmarking different implementations against each other, a smoke +// test is performed by verifying that all implementations produce the +// same result. + +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/interpreted_function.h> +#include <vespa/eval/instruction/generic_join.h> +#include <vespa/eval/eval/simple_tensor_engine.h> +#include <vespa/eval/eval/tensor_spec.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/eval/eval/operation.h> +#include <vespa/eval/eval/tensor_function.h> +#include <vespa/eval/tensor/default_tensor_engine.h> +#include <vespa/eval/tensor/mixed/packed_mixed_tensor_builder_factory.h> +#include <vespa/vespalib/util/benchmark_timer.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/util/stash.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <optional> + +using namespace vespalib; +using namespace vespalib::eval; +using namespace vespalib::tensor; +using namespace vespalib::eval::instruction; +using vespalib::make_string_short::fmt; + +using Instruction = InterpretedFunction::Instruction; +using EvalSingle = InterpretedFunction::EvalSingle; + +template <typename T> using CREF = std::reference_wrapper<const T>; + +//----------------------------------------------------------------------------- + +struct Impl { + virtual const vespalib::string &name() const = 0; + virtual Value::UP create_value(const TensorSpec &spec) const = 0; + virtual TensorSpec create_spec(const Value &value) const = 0; + virtual Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const = 0; + virtual const TensorEngine &engine() const { return SimpleTensorEngine::ref(); } // engine used by EvalSingle + virtual ~Impl() {} +}; + +struct ValueImpl : Impl { + vespalib::string my_name; + const ValueBuilderFactory &my_factory; + ValueImpl(const vespalib::string &name_in, const ValueBuilderFactory &factory) + : my_name(name_in), my_factory(factory) {} + const vespalib::string &name() const override { return my_name; } + Value::UP create_value(const TensorSpec &spec) const override { return value_from_spec(spec, my_factory); } + TensorSpec create_spec(const Value &value) const override { return spec_from_value(value); } + Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { + return GenericJoin::make_instruction(lhs, rhs, function, my_factory, stash); + } +}; + +struct EngineImpl : Impl { + vespalib::string my_name; + const TensorEngine &my_engine; + EngineImpl(const vespalib::string &name_in, const TensorEngine &engine_in) + : my_name(name_in), my_engine(engine_in) {} + const vespalib::string &name() const override { return my_name; } + Value::UP create_value(const TensorSpec &spec) const override { return my_engine.from_spec(spec); } + TensorSpec create_spec(const Value &value) const override { return my_engine.to_spec(value); } + Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { + // create a complete tensor function joining two parameters, but only compile the join instruction itself + const auto &lhs_node = tensor_function::inject(lhs, 0, stash); + const auto &rhs_node = tensor_function::inject(rhs, 1, stash); + const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); + return join_node.compile_self(my_engine, stash); + } + const TensorEngine &engine() const override { return my_engine; } +}; + +//----------------------------------------------------------------------------- + +EngineImpl simple_tensor_engine_impl(" [SimpleTensorEngine]", SimpleTensorEngine::ref()); +EngineImpl default_tensor_engine_impl("[DefaultTensorEngine]", DefaultTensorEngine::ref()); +ValueImpl simple_value_impl(" [SimpleValue]", SimpleValueBuilderFactory::get()); +ValueImpl packed_mixed_tensor_impl(" [PackedMixedTensor]", PackedMixedTensorBuilderFactory::get()); + +double budget = 5.0; +std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl, + default_tensor_engine_impl, + simple_value_impl, + packed_mixed_tensor_impl}; + +//----------------------------------------------------------------------------- + +struct EvalOp { + using UP = std::unique_ptr<EvalOp>; + const Impl &impl; + std::vector<Value::UP> values; + std::vector<Value::CREF> stack; + EvalSingle single; + EvalOp(const EvalOp &) = delete; + EvalOp &operator=(const EvalOp &) = delete; + EvalOp(Instruction op, const std::vector<CREF<TensorSpec>> &stack_spec, const Impl &impl_in) + : impl(impl_in), values(), stack(), single(impl.engine(), op) + { + for (const TensorSpec &spec: stack_spec) { + values.push_back(impl.create_value(spec)); + } + for (const auto &value: values) { + stack.push_back(*value.get()); + } + } + TensorSpec result() { return impl.create_spec(single.eval(stack)); } + double estimate_cost_us() { + auto actual = [&](){ single.eval(stack); }; + return BenchmarkTimer::benchmark(actual, budget) * 1000.0 * 1000.0; + } +}; + +//----------------------------------------------------------------------------- + +void benchmark(const vespalib::string &desc, const std::vector<EvalOp::UP> &list) { + fprintf(stderr, "--------------------------------------------------------\n"); + fprintf(stderr, "Benchmark Case: [%s]\n", desc.c_str()); + std::optional<TensorSpec> expect = std::nullopt; + for (const auto &eval: list) { + if (expect.has_value()) { + ASSERT_EQ(eval->result(), expect.value()); + } else { + expect = eval->result(); + } + } + for (const auto &eval: list) { + fprintf(stderr, " %s: %10.3f us\n", eval->impl.name().c_str(), eval->estimate_cost_us()); + } + fprintf(stderr, "--------------------------------------------------------\n"); +} + +//----------------------------------------------------------------------------- + +void benchmark_join(const vespalib::string &desc, const TensorSpec &lhs, + const TensorSpec &rhs, operation::op2_t function) +{ + Stash stash; + ValueType lhs_type = ValueType::from_spec(lhs.type()); + ValueType rhs_type = ValueType::from_spec(rhs.type()); + ValueType res_type = ValueType::join(lhs_type, rhs_type); + ASSERT_FALSE(lhs_type.is_error()); + ASSERT_FALSE(rhs_type.is_error()); + ASSERT_FALSE(res_type.is_error()); + std::vector<EvalOp::UP> list; + for (const Impl &impl: impl_list) { + auto op = impl.create_join(lhs_type, rhs_type, function, stash); + std::vector<CREF<TensorSpec>> stack_spec({lhs, rhs}); + list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl)); + } + benchmark(desc, list); +} + +//----------------------------------------------------------------------------- + +struct D { + vespalib::string name; + bool mapped; + size_t size; + size_t stride; + static D map(const vespalib::string &name_in, size_t size_in, size_t stride_in) { return D{name_in, true, size_in, stride_in}; } + static D idx(const vespalib::string &name_in, size_t size_in) { return D{name_in, false, size_in, 1}; } + operator ValueType::Dimension() const { + if (mapped) { + return ValueType::Dimension(name); + } else { + return ValueType::Dimension(name, size); + } + } + std::pair<vespalib::string,TensorSpec::Label> operator()(size_t idx) const { + if (mapped) { + return std::make_pair(name, TensorSpec::Label(fmt("label_%zu", idx))); + } else { + return std::make_pair(name, TensorSpec::Label(idx)); + } + } +}; + +TensorSpec make_vector(const D &d1, double seq) { + auto type = ValueType::tensor_type({d1}, ValueType::CellType::FLOAT); + TensorSpec spec(type.to_spec()); + for (size_t i = 0, idx1 = 0; i < d1.size; ++i, idx1 += d1.stride, seq += 1.0) { + spec.add({d1(idx1)}, seq); + } + return spec; +} + +TensorSpec make_cube(const D &d1, const D &d2, const D &d3, double seq) { + auto type = ValueType::tensor_type({d1, d2, d3}, ValueType::CellType::FLOAT); + TensorSpec spec(type.to_spec()); + for (size_t i = 0, idx1 = 0; i < d1.size; ++i, idx1 += d1.stride) { + for (size_t j = 0, idx2 = 0; j < d2.size; ++j, idx2 += d2.stride) { + for (size_t k = 0, idx3 = 0; k < d3.size; ++k, idx3 += d3.stride, seq += 1.0) { + spec.add({d1(idx1), d2(idx2), d3(idx3)}, seq); + } + } + } + return spec; +} + +//----------------------------------------------------------------------------- + +TEST(MakeInputTest, print_some_test_input) { + auto sparse = make_vector(D::map("x", 5, 3), 1.0); + auto dense = make_vector(D::idx("x", 5), 10.0); + auto mixed = make_cube(D::map("x", 3, 7), D::idx("y", 2), D::idx("z", 2), 100.0); + fprintf(stderr, "--------------------------------------------------------\n"); + fprintf(stderr, "sparse vector: %s\n", sparse.to_string().c_str()); + fprintf(stderr, "dense vector: %s\n", dense.to_string().c_str()); + fprintf(stderr, "mixed cube: %s\n", mixed.to_string().c_str()); + fprintf(stderr, "--------------------------------------------------------\n"); +} + +//----------------------------------------------------------------------------- + +TEST(NumberJoin, plain_op2) { + auto lhs = TensorSpec("double").add({}, 2.0); + auto rhs = TensorSpec("double").add({}, 3.0); + benchmark_join("simple numbers multiply", lhs, rhs, operation::Mul::f); +} + +//----------------------------------------------------------------------------- + +TEST(DenseJoin, small_vectors) { + auto lhs = make_vector(D::idx("x", 10), 1.0); + auto rhs = make_vector(D::idx("x", 10), 2.0); + benchmark_join("small dense vector multiply", lhs, rhs, operation::Mul::f); +} + +TEST(DenseJoin, full_overlap) { + auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0); + auto rhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 2.0); + benchmark_join("dense full overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(DenseJoin, partial_overlap) { + auto lhs = make_cube(D::idx("a", 8), D::idx("c", 8), D::idx("d", 8), 1.0); + auto rhs = make_cube(D::idx("b", 8), D::idx("c", 8), D::idx("d", 8), 2.0); + benchmark_join("dense partial overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(DenseJoin, no_overlap) { + auto lhs = make_cube(D::idx("a", 4), D::idx("e", 4), D::idx("f", 4), 1.0); + auto rhs = make_cube(D::idx("b", 4), D::idx("c", 4), D::idx("d", 4), 2.0); + benchmark_join("dense no overlap multiply", lhs, rhs, operation::Mul::f); +} + +//----------------------------------------------------------------------------- + +TEST(SparseJoin, small_vectors) { + auto lhs = make_vector(D::map("x", 10, 1), 1.0); + auto rhs = make_vector(D::map("x", 10, 2), 2.0); + benchmark_join("small sparse vector multiply", lhs, rhs, operation::Mul::f); +} + +TEST(SparseJoin, full_overlap) { + auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::map("c", 16, 1), 1.0); + auto rhs = make_cube(D::map("a", 16, 2), D::map("b", 16, 2), D::map("c", 16, 2), 2.0); + benchmark_join("sparse full overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(SparseJoin, full_overlap_big_vs_small) { + auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::map("c", 16, 1), 1.0); + auto rhs = make_cube(D::map("a", 2, 1), D::map("b", 2, 1), D::map("c", 2, 1), 2.0); + benchmark_join("sparse full overlap big vs small multiply", lhs, rhs, operation::Mul::f); +} + +TEST(SparseJoin, partial_overlap) { + auto lhs = make_cube(D::map("a", 8, 1), D::map("c", 8, 1), D::map("d", 8, 1), 1.0); + auto rhs = make_cube(D::map("b", 8, 2), D::map("c", 8, 2), D::map("d", 8, 2), 2.0); + benchmark_join("sparse partial overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(SparseJoin, no_overlap) { + auto lhs = make_cube(D::map("a", 4, 1), D::map("e", 4, 1), D::map("f", 4, 1), 1.0); + auto rhs = make_cube(D::map("b", 4, 1), D::map("c", 4, 1), D::map("d", 4, 1), 2.0); + benchmark_join("sparse no overlap multiply", lhs, rhs, operation::Mul::f); +} + +//----------------------------------------------------------------------------- + +TEST(MixedJoin, full_overlap) { + auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::idx("c", 16), 1.0); + auto rhs = make_cube(D::map("a", 16, 2), D::map("b", 16, 2), D::idx("c", 16), 2.0); + benchmark_join("mixed full overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(MixedJoin, partial_sparse_overlap) { + auto lhs = make_cube(D::map("a", 8, 1), D::map("c", 8, 1), D::idx("d", 8), 1.0); + auto rhs = make_cube(D::map("b", 8, 2), D::map("c", 8, 2), D::idx("d", 8), 2.0); + benchmark_join("mixed partial sparse overlap multiply", lhs, rhs, operation::Mul::f); +} + +TEST(MixedJoin, no_overlap) { + auto lhs = make_cube(D::map("a", 4, 1), D::map("e", 4, 1), D::idx("f", 4), 1.0); + auto rhs = make_cube(D::map("b", 4, 1), D::map("c", 4, 1), D::idx("d", 4), 2.0); + benchmark_join("mixed no overlap multiply", lhs, rhs, operation::Mul::f); +} + +//----------------------------------------------------------------------------- + +GTEST_MAIN_RUN_ALL_TESTS() |