From d6718f2dc8c08b191e2af3003da59bff8a558401 Mon Sep 17 00:00:00 2001 From: Håvard Pettersen Date: Tue, 3 Nov 2020 14:36:04 +0000 Subject: untangle factory-based optimization pipeline from DefaultTensorEngine --- .../eval/tensor_function/tensor_function_test.cpp | 47 ++++------- .../instruction_benchmark.cpp | 45 +++++----- eval/src/vespa/eval/eval/CMakeLists.txt | 1 + eval/src/vespa/eval/eval/engine_or_factory.cpp | 11 --- eval/src/vespa/eval/eval/engine_or_factory.h | 1 - eval/src/vespa/eval/eval/interpreted_function.cpp | 3 +- .../vespa/eval/eval/optimize_tensor_function.cpp | 95 ++++++++++++++++++++++ .../src/vespa/eval/eval/optimize_tensor_function.h | 15 ++++ eval/src/vespa/eval/eval/test/eval_fixture.cpp | 3 +- .../vespa/eval/tensor/default_tensor_engine.cpp | 2 - 10 files changed, 153 insertions(+), 70 deletions(-) create mode 100644 eval/src/vespa/eval/eval/optimize_tensor_function.cpp create mode 100644 eval/src/vespa/eval/eval/optimize_tensor_function.h (limited to 'eval') diff --git a/eval/src/tests/eval/tensor_function/tensor_function_test.cpp b/eval/src/tests/eval/tensor_function/tensor_function_test.cpp index 5c33cdacc44..9441061d6e1 100644 --- a/eval/src/tests/eval/tensor_function/tensor_function_test.cpp +++ b/eval/src/tests/eval/tensor_function/tensor_function_test.cpp @@ -38,9 +38,6 @@ struct EvalCtx { ictx = std::make_unique(*ifun); return ifun->eval(*ictx, SimpleObjectParams(params)); } - const TensorFunction &compile(const TensorFunction &expr) { - return engine.optimize(expr, stash); - } Value::UP make_double(double value) { return engine.from_spec(TensorSpec("double").add({}, value)); } @@ -196,8 +193,7 @@ TEST("require that const_value works") { const auto &fun = const_value(*my_const, ctx.stash); EXPECT_TRUE(!fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require 
that tensor injection works") { @@ -207,8 +203,7 @@ TEST("require that tensor injection works") { const auto &fun = inject(ValueType::from_spec("tensor(x[2],y[2])"), a_id, ctx.stash); EXPECT_TRUE(!fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that partial tensor reduction works") { @@ -218,8 +213,7 @@ TEST("require that partial tensor reduction works") { const auto &fun = reduce(inject(ValueType::from_spec("tensor(x[3],y[2])"), a_id, ctx.stash), Aggr::SUM, {"y"}, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that full tensor reduction works") { @@ -228,8 +222,7 @@ TEST("require that full tensor reduction works") { const auto &fun = reduce(inject(ValueType::from_spec("tensor(x[3],y[2])"), a_id, ctx.stash), Aggr::SUM, {}, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(ValueType::from_spec("double"), fun.result_type()); - const auto &prog = ctx.compile(fun); - const Value &result = ctx.eval(prog); + const Value &result = ctx.eval(fun); EXPECT_TRUE(result.is_double()); EXPECT_EQUAL(21.0, result.as_double()); } @@ -241,8 +234,7 @@ TEST("require that tensor map works") { const auto &fun = map(inject(ValueType::from_spec("tensor(x{},y{})"), a_id, ctx.stash), operation::Neg::f, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that tensor join works") { @@ -255,8 +247,7 @@ TEST("require that tensor join works") { operation::Mul::f, ctx.stash); 
EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that tensor merge works") { @@ -269,8 +260,7 @@ TEST("require that tensor merge works") { operation::Add::f, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that tensor concat works") { @@ -283,8 +273,7 @@ TEST("require that tensor concat works") { "y", ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that tensor create works") { @@ -305,8 +294,7 @@ TEST("require that tensor create works") { ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that single value tensor peek works") { @@ -328,8 +316,7 @@ TEST("require that single value tensor peek works") { ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that tensor subspace tensor peek works") { @@ -340,8 +327,7 @@ TEST("require that tensor subspace tensor peek works") { const auto &fun = peek(t, {{"x", "bar"}}, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, 
ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that automatic string conversion tensor peek works") { @@ -353,8 +339,7 @@ TEST("require that automatic string conversion tensor peek works") { const auto &fun = peek(t, {{"x", a}}, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_TRUE(fun.result_type().is_double()); - const auto &prog = ctx.compile(fun); - const Value &result = ctx.eval(prog); + const Value &result = ctx.eval(fun); EXPECT_TRUE(result.is_double()); EXPECT_EQUAL(2.0, result.as_double()); } @@ -367,8 +352,7 @@ TEST("require that tensor rename works") { {"x"}, {"z"}, ctx.stash); EXPECT_TRUE(fun.result_is_mutable()); EXPECT_EQUAL(expect->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect, ctx.eval(prog))); + TEST_DO(verify_equal(*expect, ctx.eval(fun))); } TEST("require that if_node works") { @@ -383,10 +367,9 @@ TEST("require that if_node works") { inject(ValueType::from_spec("tensor(x[2])"), c_id, ctx.stash), ctx.stash); EXPECT_TRUE(!fun.result_is_mutable()); EXPECT_EQUAL(expect_true->type(), fun.result_type()); - const auto &prog = ctx.compile(fun); - TEST_DO(verify_equal(*expect_true, ctx.eval(prog))); + TEST_DO(verify_equal(*expect_true, ctx.eval(fun))); ctx.replace_tensor(a_id, ctx.make_false()); - TEST_DO(verify_equal(*expect_false, ctx.eval(prog))); + TEST_DO(verify_equal(*expect_false, ctx.eval(fun))); } TEST("require that if_node result is mutable only when both children produce mutable results") { diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp index 7182d66f8aa..bcd021b05fb 100644 --- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -136,21 +137,21 @@ struct Impl { 
const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); - const auto &node = optimize ? engine.optimize(join_node, stash) : join_node; + const auto &node = optimize ? optimize_tensor_function(engine, join_node, stash) : join_node; return node.compile_self(engine, stash); } Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector &dims, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); - const auto &node = optimize ? engine.optimize(reduce_node, stash) : reduce_node; + const auto &node = optimize ? optimize_tensor_function(engine, reduce_node, stash) : reduce_node; return node.compile_self(engine, stash); } Instruction create_rename(const ValueType &lhs, const std::vector &from, const std::vector &to, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash); - const auto &node = optimize ? engine.optimize(rename_node, stash) : rename_node; + const auto &node = optimize ? optimize_tensor_function(engine, rename_node, stash) : rename_node; return node.compile_self(engine, stash); } Instruction create_merge(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const { @@ -158,7 +159,7 @@ struct Impl { const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &merge_node = tensor_function::merge(lhs_node, rhs_node, function, stash); - const auto &node = optimize ? 
engine.optimize(merge_node, stash) : merge_node; + const auto &node = optimize ? optimize_tensor_function(engine, merge_node, stash) : merge_node; return node.compile_self(engine, stash); } Instruction create_concat(const ValueType &lhs, const ValueType &rhs, const std::string &dimension, Stash &stash) const { @@ -167,14 +168,14 @@ struct Impl { const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &concat_node = tensor_function::concat(lhs_node, rhs_node, dimension, stash); return concat_node.compile_self(engine, stash); - const auto &node = optimize ? engine.optimize(concat_node, stash) : concat_node; + const auto &node = optimize ? optimize_tensor_function(engine, concat_node, stash) : concat_node; return node.compile_self(engine, stash); } Instruction create_map(const ValueType &lhs, operation::op1_t function, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &map_node = tensor_function::map(lhs_node, function, stash); - const auto &node = optimize ? engine.optimize(map_node, stash) : map_node; + const auto &node = optimize ? optimize_tensor_function(engine, map_node, stash) : map_node; return node.compile_self(engine, stash); } Instruction create_tensor_create(const ValueType &proto_type, const TensorSpec &proto, Stash &stash) const { @@ -185,7 +186,7 @@ struct Impl { spec.emplace(cell.first, my_double); } const auto &create_tensor_node = tensor_function::create(proto_type, spec, stash); - const auto &node = optimize ? engine.optimize(create_tensor_node, stash) : create_tensor_node; + const auto &node = optimize ? 
optimize_tensor_function(engine, create_tensor_node, stash) : create_tensor_node; return node.compile_self(engine, stash); } Instruction create_tensor_lambda(const ValueType &type, const Function &function, const ValueType &p0_type, Stash &stash) const { @@ -194,7 +195,7 @@ struct Impl { NodeTypes types(function, arg_types); EXPECT_EQ(types.errors(), std::vector()); const auto &tensor_lambda_node = tensor_function::lambda(type, {0}, function, std::move(types), stash); - const auto &node = optimize ? engine.optimize(tensor_lambda_node, stash) : tensor_lambda_node; + const auto &node = optimize ? optimize_tensor_function(engine, tensor_lambda_node, stash) : tensor_lambda_node; return node.compile_self(engine, stash); } Instruction create_tensor_peek(const ValueType &type, const MyPeekSpec &my_spec, Stash &stash) const { @@ -218,30 +219,30 @@ struct Impl { } } const auto &peek_node = tensor_function::peek(my_param, spec, stash); - const auto &node = optimize ? engine.optimize(peek_node, stash) : peek_node; + const auto &node = optimize ? 
optimize_tensor_function(engine, peek_node, stash) : peek_node; return node.compile_self(engine, stash); } }; //----------------------------------------------------------------------------- -Impl default_tensor_engine_impl(1, "DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref(), false); -Impl simple_value_impl(3, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get(), false); -Impl fast_value_impl(0, " FastValue", "NEW PROD", FastValueBuilderFactory::get(), false); -Impl optimized_fast_value_impl(2, "Optimized FastValue", "Optimize", FastValueBuilderFactory::get(), true); -Impl default_tensor_value_impl(4, " DefaultValue", "DefaultV", DefaultValueBuilderFactory::get(), false); -vespalib::string short_header("--------"); +Impl optimized_fast_value_impl(0, " Optimized FastValue", "NEW PROD", FastValueBuilderFactory::get(), true); +Impl optimized_default_tensor_engine_impl(1, "Optimized DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref(), true); +Impl fast_value_impl(2, " FastValue", " FastV", FastValueBuilderFactory::get(), false); +Impl default_tensor_engine_impl(3, " DefaultTensorEngine", "DefaultT", DefaultTensorEngine::ref(), false); +Impl simple_value_impl(4, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get(), false); +vespalib::string short_header("--------"); constexpr double budget = 5.0; constexpr double best_limit = 0.95; // everything within 95% of best performance gets a star -constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod +constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod -std::vector> impl_list = {default_tensor_engine_impl, - simple_value_impl, - fast_value_impl, +std::vector> impl_list = {simple_value_impl, optimized_fast_value_impl, - default_tensor_value_impl}; + optimized_default_tensor_engine_impl, + fast_value_impl, + 
default_tensor_engine_impl}; //----------------------------------------------------------------------------- @@ -982,8 +983,8 @@ int main(int argc, char **argv) { const std::string run_only_prod_option = "--limit-implementations"; if ((argc > 1) && (argv[1] == run_only_prod_option )) { impl_list.clear(); - impl_list.push_back(fast_value_impl); - impl_list.push_back(default_tensor_engine_impl); + impl_list.push_back(optimized_fast_value_impl); + impl_list.push_back(optimized_default_tensor_engine_impl); ++argv; --argc; } diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt index 6c1f99265a7..d27de8e3d21 100644 --- a/eval/src/vespa/eval/eval/CMakeLists.txt +++ b/eval/src/vespa/eval/eval/CMakeLists.txt @@ -22,6 +22,7 @@ vespa_add_library(eval_eval OBJECT node_types.cpp operation.cpp operator_nodes.cpp + optimize_tensor_function.cpp param_usage.cpp simple_tensor.cpp simple_tensor_engine.cpp diff --git a/eval/src/vespa/eval/eval/engine_or_factory.cpp b/eval/src/vespa/eval/eval/engine_or_factory.cpp index e4f710be625..4a95a57e10e 100644 --- a/eval/src/vespa/eval/eval/engine_or_factory.cpp +++ b/eval/src/vespa/eval/eval/engine_or_factory.cpp @@ -36,17 +36,6 @@ EngineOrFactory::get_shared(EngineOrFactory hint) return shared; } -const TensorFunction & -EngineOrFactory::optimize(const TensorFunction &expr, Stash &stash) const { - if (is_engine()) { - return engine().optimize(expr, stash); - } else if (&factory() == &FastValueBuilderFactory::get()) { - return tensor::DefaultTensorEngine::ref().optimize(expr, stash); - } else { - return expr; - } -} - TensorSpec EngineOrFactory::to_spec(const Value &value) const { diff --git a/eval/src/vespa/eval/eval/engine_or_factory.h b/eval/src/vespa/eval/eval/engine_or_factory.h index 4784356ae8d..e1f7c503bcd 100644 --- a/eval/src/vespa/eval/eval/engine_or_factory.h +++ b/eval/src/vespa/eval/eval/engine_or_factory.h @@ -42,7 +42,6 @@ public: const TensorEngine &engine() const { return 
*std::get(_value); } const ValueBuilderFactory &factory() const { return *std::get(_value); } // functions that can be called with either engine or factory - const TensorFunction &optimize(const TensorFunction &expr, Stash &stash) const; TensorSpec to_spec(const Value &value) const; std::unique_ptr from_spec(const TensorSpec &spec) const; void encode(const Value &value, nbostream &output) const; diff --git a/eval/src/vespa/eval/eval/interpreted_function.cpp b/eval/src/vespa/eval/eval/interpreted_function.cpp index 2b0e915d69a..1016b929574 100644 --- a/eval/src/vespa/eval/eval/interpreted_function.cpp +++ b/eval/src/vespa/eval/eval/interpreted_function.cpp @@ -6,6 +6,7 @@ #include "tensor_nodes.h" #include "tensor_engine.h" #include "make_tensor_function.h" +#include "optimize_tensor_function.h" #include "compile_tensor_function.h" #include "simple_tensor_engine.h" #include @@ -73,7 +74,7 @@ InterpretedFunction::InterpretedFunction(EngineOrFactory engine, const nodes::No _tensor_engine(engine) { const TensorFunction &plain_fun = make_tensor_function(engine, root, types, _stash); - const TensorFunction &optimized = engine.optimize(plain_fun, _stash); + const TensorFunction &optimized = optimize_tensor_function(engine, plain_fun, _stash); _program = compile_tensor_function(engine, optimized, _stash); } diff --git a/eval/src/vespa/eval/eval/optimize_tensor_function.cpp b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp new file mode 100644 index 00000000000..83f806178e8 --- /dev/null +++ b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp @@ -0,0 +1,95 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "optimize_tensor_function.h" +#include "tensor_function.h" +#include "tensor_engine.h" +#include "simple_value.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +LOG_SETUP(".eval.eval.optimize_tensor_function"); + +namespace vespalib::eval { + +namespace { + +using namespace vespalib::tensor; + +const TensorFunction &optimize_for_factory(const ValueBuilderFactory &factory, const TensorFunction &expr, Stash &stash) { + if (&factory == &SimpleValueBuilderFactory::get()) { + // never optimize simple value evaluation + return expr; + } + using Child = TensorFunction::Child; + Child root(expr); + { + std::vector nodes({root}); + for (size_t i = 0; i < nodes.size(); ++i) { + nodes[i].get().get().push_children(nodes); + } + while (!nodes.empty()) { + const Child &child = nodes.back().get(); + child.set(DenseDotProductFunction::optimize(child.get(), stash)); + child.set(DenseXWProductFunction::optimize(child.get(), stash)); + child.set(DenseMatMulFunction::optimize(child.get(), stash)); + child.set(DenseMultiMatMulFunction::optimize(child.get(), stash)); + nodes.pop_back(); + } + } + { + std::vector nodes({root}); + for (size_t i = 0; i < nodes.size(); ++i) { + nodes[i].get().get().push_children(nodes); + } + while (!nodes.empty()) { + const Child &child = nodes.back().get(); + child.set(DenseSimpleExpandFunction::optimize(child.get(), stash)); + child.set(DenseAddDimensionOptimizer::optimize(child.get(), stash)); + child.set(DenseRemoveDimensionOptimizer::optimize(child.get(), stash)); + child.set(VectorFromDoublesFunction::optimize(child.get(), stash)); + child.set(DenseTensorCreateFunction::optimize(child.get(), stash)); + child.set(DenseTensorPeekFunction::optimize(child.get(), stash)); + child.set(DenseLambdaPeekOptimizer::optimize(child.get(), stash)); + child.set(DenseLambdaFunction::optimize(child.get(), 
stash)); + child.set(DenseFastRenameOptimizer::optimize(child.get(), stash)); + child.set(DensePowAsMapOptimizer::optimize(child.get(), stash)); + child.set(DenseSimpleMapFunction::optimize(child.get(), stash)); + child.set(DenseSimpleJoinFunction::optimize(child.get(), stash)); + child.set(DenseNumberJoinFunction::optimize(child.get(), stash)); + child.set(DenseSingleReduceFunction::optimize(child.get(), stash)); + nodes.pop_back(); + } + } + return root.get(); +} + +} // namespace vespalib::eval:: + +const TensorFunction &optimize_tensor_function(EngineOrFactory engine, const TensorFunction &function, Stash &stash) { + LOG(debug, "tensor function before optimization:\n%s\n", function.as_string().c_str()); + const TensorFunction &optimized = (engine.is_engine()) + ? engine.engine().optimize(function, stash) + : optimize_for_factory(engine.factory(), function, stash); + LOG(debug, "tensor function after optimization:\n%s\n", optimized.as_string().c_str()); + return optimized; +} + +} // namespace vespalib::eval diff --git a/eval/src/vespa/eval/eval/optimize_tensor_function.h b/eval/src/vespa/eval/eval/optimize_tensor_function.h new file mode 100644 index 00000000000..bc2bc10cca6 --- /dev/null +++ b/eval/src/vespa/eval/eval/optimize_tensor_function.h @@ -0,0 +1,15 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include "engine_or_factory.h" + +namespace vespalib { class Stash; } + +namespace vespalib::eval { + +struct TensorFunction; + +const TensorFunction &optimize_tensor_function(EngineOrFactory engine, const TensorFunction &function, Stash &stash); + +} // namespace vespalib::eval diff --git a/eval/src/vespa/eval/eval/test/eval_fixture.cpp b/eval/src/vespa/eval/eval/test/eval_fixture.cpp index a353f3a9ae2..b7655a6ee2f 100644 --- a/eval/src/vespa/eval/eval/test/eval_fixture.cpp +++ b/eval/src/vespa/eval/eval/test/eval_fixture.cpp @@ -3,6 +3,7 @@ #include #include "eval_fixture.h" #include +#include #include using vespalib::make_string_short::fmt; @@ -203,7 +204,7 @@ EvalFixture::EvalFixture(EngineOrFactory engine, _mutable_set(get_mutable(*_function, param_repo)), _plain_tensor_function(make_tensor_function(_engine, _function->root(), _node_types, _stash)), _patched_tensor_function(maybe_patch(allow_mutable, _plain_tensor_function, _mutable_set, _stash)), - _tensor_function(optimized ? _engine.optimize(_patched_tensor_function, _stash) : _patched_tensor_function), + _tensor_function(optimized ? 
optimize_tensor_function(engine, _patched_tensor_function, _stash) : _patched_tensor_function), _ifun(_engine, _tensor_function), _ictx(_ifun), _param_values(make_params(_engine, *_function, param_repo)), diff --git a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp index d44e822792b..b50092c88b5 100644 --- a/eval/src/vespa/eval/tensor/default_tensor_engine.cpp +++ b/eval/src/vespa/eval/tensor/default_tensor_engine.cpp @@ -277,7 +277,6 @@ DefaultTensorEngine::optimize(const TensorFunction &expr, Stash &stash) const { using Child = TensorFunction::Child; Child root(expr); - LOG(debug, "tensor function before optimization:\n%s\n", root.get().as_string().c_str()); { std::vector nodes({root}); for (size_t i = 0; i < nodes.size(); ++i) { @@ -316,7 +315,6 @@ DefaultTensorEngine::optimize(const TensorFunction &expr, Stash &stash) const nodes.pop_back(); } } - LOG(debug, "tensor function after optimization:\n%s\n", root.get().as_string().c_str()); return root.get(); } -- cgit v1.2.3