From 65d64069597882cf273d72fe872ee1fe6bd9de23 Mon Sep 17 00:00:00 2001 From: HÃ¥vard Pettersen Date: Tue, 24 Sep 2019 13:44:28 +0000 Subject: faster gbdt forest evaluation This is a draft implementation of gbdt forest evaluation doing feature-at-a-time rather than tree-at-a-time. --- eval/src/tests/eval/gbdt/gbdt_test.cpp | 33 ++++- eval/src/tests/eval/gbdt/model.cpp | 17 ++- eval/src/vespa/eval/eval/CMakeLists.txt | 1 + eval/src/vespa/eval/eval/fast_forest.cpp | 232 +++++++++++++++++++++++++++++++ eval/src/vespa/eval/eval/fast_forest.h | 141 +++++++++++++++++++ 5 files changed, 417 insertions(+), 7 deletions(-) create mode 100644 eval/src/vespa/eval/eval/fast_forest.cpp create mode 100644 eval/src/vespa/eval/eval/fast_forest.h (limited to 'eval') diff --git a/eval/src/tests/eval/gbdt/gbdt_test.cpp b/eval/src/tests/eval/gbdt/gbdt_test.cpp index 865b01b861b..14fa4510f4d 100644 --- a/eval/src/tests/eval/gbdt/gbdt_test.cpp +++ b/eval/src/tests/eval/gbdt/gbdt_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -295,6 +296,24 @@ TEST("require that models with too large sets are rejected by general vm optimiz EXPECT_TRUE(!Optimize::apply_chain(general_vm_chain, stats, trees).valid()); } +TEST("require that FastForest model evaluation works") { + Function function = Function::parse("if((a<2),1.0,if((b<2),if((c<2),2.0,3.0),4.0))+" + "if(!(c>=1),10.0,if((a<1),if((b<1),20.0,30.0),40.0))"); + CompiledFunction compiled(function, PassParams::ARRAY, Optimize::none); + auto f = compiled.get_function(); + EXPECT_TRUE(compiled.get_forests().empty()); + auto forest = FastForest::try_convert(function); + ASSERT_TRUE(forest); + FastForest::Context ctx(*forest); + std::vector p1({0.5, 0.5, 0.5}); // all true: 1.0 + 10.0 + std::vector p2({2.5, 2.5, 2.5}); // all false: 4.0 + 40.0 + std::vector pn(3, std::numeric_limits::quiet_NaN()); // default: 4.0 + 10.0 + EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0])); + EXPECT_EQUAL(forest->eval(ctx, [&p2](size_t i){return p2[i];}), f(&p2[0])); + EXPECT_EQUAL(forest->eval(ctx, [&pn](size_t i){return pn[i];}), f(&pn[0])); + EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0])); +} + //----------------------------------------------------------------------------- double eval_compiled(const CompiledFunction &cfun, std::vector ¶ms) { @@ -311,11 +330,13 @@ double eval_compiled(const CompiledFunction &cfun, std::vector ¶ms) TEST("require that forests evaluate to approximately the same for all evaluation options") { for (PassParams pass_params: {PassParams::ARRAY, PassParams::LAZY}) { for (size_t tree_size: std::vector({20})) { - for (size_t num_trees: std::vector({10, 60})) { + for (size_t num_trees: std::vector({60})) { for (size_t less_percent: std::vector({100, 80})) { for (size_t invert_percent: std::vector({0, 50})) { vespalib::string expression = Model().less_percent(less_percent).invert_percent(invert_percent).make_forest(num_trees, tree_size); Function function = Function::parse(expression); + auto forest = FastForest::try_convert(function); + EXPECT_EQUAL(bool(forest), bool(less_percent == 100)); CompiledFunction none(function, pass_params, Optimize::none); CompiledFunction deinline(function, pass_params, DeinlineForest::optimize_chain); CompiledFunction vm_forest(function, pass_params, VMForest::optimize_chain); @@ -325,10 +346,20 @@ TEST("require that forests evaluate to approximately the same for all evaluation ASSERT_EQUAL(1u, vm_forest.get_forests().size()); EXPECT_TRUE(dynamic_cast(vm_forest.get_forests()[0].get()) != nullptr); std::vector inputs(function.num_params(), 0.5); + std::vector inputs_nan(function.num_params(), std::numeric_limits::quiet_NaN()); double expected = eval_double(function, inputs); + double expected_nan = eval_double(function, inputs_nan); EXPECT_EQUAL(expected, eval_compiled(none, inputs)); EXPECT_EQUAL(expected, eval_compiled(deinline, inputs)); EXPECT_EQUAL(expected, eval_compiled(vm_forest, inputs)); + EXPECT_EQUAL(expected_nan, eval_compiled(none, inputs_nan)); + EXPECT_EQUAL(expected_nan, eval_compiled(deinline, inputs_nan)); + EXPECT_EQUAL(expected_nan, eval_compiled(vm_forest, inputs_nan)); + if (forest) { + FastForest::Context ctx(*forest); + EXPECT_EQUAL(expected, forest->eval(ctx, [&inputs](size_t i){return inputs[i];})); + EXPECT_EQUAL(expected_nan, forest->eval(ctx, [&inputs_nan](size_t i){return inputs_nan[i];})); + } } } } diff --git a/eval/src/tests/eval/gbdt/model.cpp b/eval/src/tests/eval/gbdt/model.cpp index e531b327e89..ae1c9bea437 100644 --- a/eval/src/tests/eval/gbdt/model.cpp +++ b/eval/src/tests/eval/gbdt/model.cpp @@ -21,9 +21,14 @@ private: return dist(_gen); } - double get_real(double min, double max) { - std::uniform_real_distribution dist(min, max); - return dist(_gen); + double get_real() { + std::uniform_real_distribution dist(0.0, 1.0); + double result = dist(_gen); + // avoid different decisions based on using float vs. double split values + while (float(result) == 0.5) { + result = dist(_gen); + } + return result; } std::string make_feature_name() { @@ -45,11 +50,11 @@ private: if (get_int(1,100) > _invert_percent) { return make_string("(%s<%g)", make_feature_name().c_str(), - get_real(0.0, 1.0)); + get_real()); } else { return make_string("(!(%s>=%g))", make_feature_name().c_str(), - get_real(0.0, 1.0)); + get_real()); } } } @@ -70,7 +75,7 @@ public: std::string make_tree(size_t size) { assert(size > 0); if (size == 1) { - return make_string("%g", get_real(0.0, 1.0)); + return make_string("%g", get_real()); } size_t pivot = get_int(1, size - 1); return make_string("if(%s,%s,%s)", diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt index 1baa1b3b4bf..23aa504f31e 100644 --- a/eval/src/vespa/eval/eval/CMakeLists.txt +++ b/eval/src/vespa/eval/eval/CMakeLists.txt @@ -6,6 +6,7 @@ vespa_add_library(eval_eval OBJECT call_nodes.cpp compile_tensor_function.cpp delete_node.cpp + fast_forest.cpp function.cpp gbdt.cpp interpreted_function.cpp diff --git a/eval/src/vespa/eval/eval/fast_forest.cpp b/eval/src/vespa/eval/eval/fast_forest.cpp new file mode 100644 index 00000000000..927c5355e6a --- /dev/null +++ b/eval/src/vespa/eval/eval/fast_forest.cpp @@ -0,0 +1,232 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "fast_forest.h" +#include "gbdt.h" +#include +#include +#include +#include +#include +#include + +namespace vespalib::eval::gbdt { + +namespace { + +struct BitRange { + uint32_t first; + uint32_t last; + BitRange(uint32_t bit) : first(bit), last(bit) {} + BitRange(uint32_t a, uint32_t b) : first(a), last(b) {} + static BitRange join(const BitRange &a, const BitRange &b) { + assert((a.last + 1) == b.first); + return BitRange(a.first, b.last); + } + ~BitRange() = default; +}; + +struct CmpNode { + float value; + uint32_t tree_id; + BitRange false_mask; + bool false_is_default; + CmpNode(float v, uint32_t t, BitRange m, bool f_def) + : value(v), tree_id(t), false_mask(m), false_is_default(f_def) {} + bool operator<(const CmpNode &rhs) const { + return (value < rhs.value); + } + ~CmpNode() = default; +}; + +struct State { + using Leafs = std::vector; + using CmpNodes = std::vector; + std::vector cmp_nodes; + std::vector leafs; + BitRange encode_node(uint32_t tree_id, const nodes::Node &node); + State(size_t num_params, const std::vector &trees); + ~State() = default; +}; + +BitRange +State::encode_node(uint32_t tree_id, const nodes::Node &node) +{ + auto if_node = nodes::as(node); + if (if_node) { + BitRange true_leafs = encode_node(tree_id, if_node->true_expr()); + BitRange false_leafs = encode_node(tree_id, if_node->false_expr()); + auto less = nodes::as(if_node->cond()); + auto inverted = nodes::as(if_node->cond()); + if (less) { + auto symbol = nodes::as(less->lhs()); + assert(symbol); + assert(less->rhs().is_const()); + size_t feature = symbol->id(); + assert(feature < cmp_nodes.size()); + cmp_nodes[feature].emplace_back(less->rhs().get_const_value(), tree_id, true_leafs, true); + } else { + assert(inverted); + auto ge = nodes::as(inverted->child()); + assert(ge); + auto symbol = nodes::as(ge->lhs()); + assert(symbol); + assert(ge->rhs().is_const()); + size_t feature = symbol->id(); + assert(feature < cmp_nodes.size()); + cmp_nodes[feature].emplace_back(ge->rhs().get_const_value(), tree_id, true_leafs, false); + } + return BitRange::join(true_leafs, false_leafs); + } else { + assert(node.is_const()); + BitRange leaf_range(leafs[tree_id].size()); + leafs[tree_id].push_back(node.get_const_value()); + return leaf_range; + } +} + +State::State(size_t num_params, const std::vector &trees) + : cmp_nodes(num_params), + leafs(trees.size()) +{ + for (uint32_t tree_id = 0; tree_id < trees.size(); ++tree_id) { + BitRange leaf_range = encode_node(tree_id, *trees[tree_id]); + assert(leaf_range.first == 0); + assert((leaf_range.last + 1) == leafs[tree_id].size()); + } + for (CmpNodes &cmp_range: cmp_nodes) { + assert(!cmp_range.empty()); + std::sort(cmp_range.begin(), cmp_range.end()); + } +} + +} + +struct FastForestBuilder { + + static FastForest::MaskType get_mask_type(uint32_t idx1, uint32_t idx2) { + assert(idx1 <= idx2); + if (idx1 == idx2) { + return FastForest::MaskType::ONE; + } else if ((idx1 + 1) == idx2) { + return FastForest::MaskType::TWO; + } else { + return FastForest::MaskType::MANY; + } + } + + static FastForest::Mask make_mask(const CmpNode &cmp_node) { + BitRange range = cmp_node.false_mask; + assert(range.last < (8 * 256)); + assert(range.first <= range.last); + uint32_t idx1 = (range.first / 8); + uint32_t idx2 = (range.last / 8); + uint8_t bits1 = 0; + uint8_t bits2 = 0; + for (uint32_t i = 0; i < 8; ++i) { + uint32_t bit1 = (idx1 * 8) + i; + if ((bit1 < range.first) || (bit1 > range.last)) { + bits1 |= (1 << i); + } + uint32_t bit2 = (idx2 * 8) + i; + if ((bit2 < range.first) || (bit2 > range.last)) { + bits2 |= (1 << i); + } + } + assert(cmp_node.tree_id < (256 * 256)); + return FastForest::Mask(cmp_node.tree_id, get_mask_type(idx1, idx2), cmp_node.false_is_default, + idx1, bits1, idx2, bits2); + } + + static void build(State &state, FastForest &ff) { + for (const auto &cmp_nodes: state.cmp_nodes) { + ff._feature_sizes.push_back(cmp_nodes.size()); + for (const CmpNode &cmp_node: cmp_nodes) { + ff._values.push_back(cmp_node.value); + ff._masks.push_back(make_mask(cmp_node)); + } + } + for (const auto &leafs: state.leafs) { + ff._tree_sizes.push_back(leafs.size()); + for (float leaf: leafs) { + ff._leafs.push_back(leaf); + } + } + } +}; + +FastForest::Context::Context(const FastForest &ff) + : _forest(&ff), + _bytes_per_tree((ff.max_leafs() + 7) / 8), + _bits(_bytes_per_tree * ff.num_trees()) {} + +FastForest::Context::~Context() = default; + +FastForest::FastForest() = default; +FastForest::~FastForest() = default; + +size_t +FastForest::num_params() const +{ + return _feature_sizes.size(); +} + +size_t +FastForest::num_trees() const +{ + return _tree_sizes.size(); +} + +size_t +FastForest::max_leafs() const +{ + size_t res = 0; + size_t sum = 0; + for (size_t sz: _tree_sizes) { + res = std::max(res, sz); + sum += sz; + } + assert(res <= (8 * 256)); + assert(sum == _leafs.size()); + return res; +} + +FastForest::UP +FastForest::try_convert(const Function &fun) +{ + const auto &root = fun.root(); + if (!root.is_forest()) { + // must be only forest + return FastForest::UP(); + } + auto trees = gbdt::extract_trees(root); + if (trees.size() > (256 * 256)) { + // too many trees + return FastForest::UP(); + } + gbdt::ForestStats stats(trees); + if (stats.total_in_checks > 0) { + // set membership not supported + return FastForest::UP(); + } + if (stats.tree_sizes.back().size > (8 * 256)) { + // too many leaf nodes per tree + return FastForest::UP(); + } + State state(fun.num_params(), trees); + FastForest::UP res = FastForest::UP(new FastForest()); + FastForestBuilder::build(state, *res); + assert(fun.num_params() == res->num_params()); + assert(trees.size() == res->num_trees()); + return res; +} + +double +FastForest::estimate_cost_us(const std::vector ¶ms, double budget) const +{ + Context ctx(*this); + auto get_param = [¶ms](size_t i)->float{ return params[i]; }; + auto self_eval = [&](){ this->eval(ctx, get_param); }; + return BenchmarkTimer::benchmark(self_eval, budget) * 1000.0 * 1000.0; +} + +} diff --git a/eval/src/vespa/eval/eval/fast_forest.h b/eval/src/vespa/eval/eval/fast_forest.h new file mode 100644 index 00000000000..df3be2c4aef --- /dev/null +++ b/eval/src/vespa/eval/eval/fast_forest.h @@ -0,0 +1,141 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "function.h" +#include +#include +#include +#include + +namespace vespalib::eval::gbdt { + +/** + * Use modern optimization strategies to improve evaluation + * performance of GBDT forests. + * + * This model evaluation supports up to 65536 trees with up to 2048 + * leaf nodes in each tree. Comparisons must be on the form 'feature < + * const' or '!(feature >= const)'. The inverted form is used to + * signal that the true branch should be selected when the feature + * values is missing (NaN). + **/ +class FastForest +{ +public: + class Context { + friend class FastForest; + private: + const FastForest *_forest; + size_t _bytes_per_tree; + std::vector _bits; + public: + explicit Context(const FastForest &ff); + ~Context(); + }; + +private: + friend struct FastForestBuilder; + + enum class MaskType : uint8_t { ONE, TWO, MANY }; + + struct Mask { + uint16_t tree_id; + MaskType type; + uint8_t false_is_default; + uint8_t first_idx; + uint8_t first_bits; + uint8_t last_idx; + uint8_t last_bits; + Mask(uint16_t tree, MaskType mt, bool def_false, + uint8_t idx1, uint8_t bits1, uint8_t idx2, uint8_t bits2) + : tree_id(tree), type(mt), false_is_default(def_false), + first_idx(idx1), first_bits(bits1), last_idx(idx2), last_bits(bits2) {} + }; + + std::vector _feature_sizes; + std::vector _values; + std::vector _masks; + std::vector _tree_sizes; + std::vector _leafs; + + FastForest(); + + static void apply_mask(uint8_t *bits, size_t bytes_per_tree, const Mask &mask) { + uint8_t *dst = (bits + (mask.tree_id * bytes_per_tree) + mask.first_idx); + *dst &= mask.first_bits; + if (__builtin_expect(mask.type != MaskType::ONE, false)) { + if (__builtin_expect(mask.type == MaskType::MANY, false)) { + size_t n = (mask.last_idx - mask.first_idx - 1); + while (n-- > 0) { + *++dst = 0x00; + } + } + *++dst &= mask.last_bits; + } + } + + static float find_leaf(const uint8_t *bits, const float *leafs) { + while (__builtin_expect(*bits == 0, true)) { + ++bits; + leafs += 8; + } + return leafs[vespalib::Optimized::lsbIdx(uint32_t(*bits))]; + } + +public: + ~FastForest(); + size_t num_params() const; + size_t num_trees() const; + size_t max_leafs() const; + using UP = std::unique_ptr; + static UP try_convert(const Function &fun); + + template + double eval(Context &ctx, F &&f) const { + assert(ctx._forest == this); + size_t bytes_per_tree = ctx._bytes_per_tree; + uint8_t *bits = &ctx._bits[0]; + const float *value_ptr = &_values[0]; + const Mask *mask_ptr = &_masks[0]; + memset(bits, 0xff, ctx._bits.size()); + for (size_t f_idx = 0; f_idx < _feature_sizes.size(); ++f_idx) { + size_t feature_size = _feature_sizes[f_idx]; + float feature_value = f(f_idx); // get param + if (__builtin_expect(std::isnan(feature_value), false)) { + // handle 'missing' input feature + for (size_t i = 0; i < feature_size; ++i) { + const Mask &mask = mask_ptr[i]; + if (mask.false_is_default) { + apply_mask(bits, bytes_per_tree, mask); + } + } + } else { + for (size_t i = 0; i < feature_size; ++i) { + if (__builtin_expect(feature_value < value_ptr[i], false)) { + break; + } else { + apply_mask(bits, bytes_per_tree, mask_ptr[i]); + } + } + } + value_ptr += feature_size; + mask_ptr += feature_size; + } + assert(value_ptr == &*_values.end()); + assert(mask_ptr == &*_masks.end()); + const float *leafs = &_leafs[0]; + double result = 0.0; + for (size_t tree_size: _tree_sizes) { + result += find_leaf(bits, leafs); + bits += bytes_per_tree; + leafs += tree_size; + } + assert(bits == &*ctx._bits.end()); + assert(leafs == &*_leafs.end()); + return result; + } + double estimate_cost_us(const std::vector ¶ms, double budget = 5.0) const; +}; + +} -- cgit v1.2.3