diff options
author | Håvard Pettersen <havardpe@oath.com> | 2019-10-17 11:27:23 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2019-11-01 13:02:31 +0000 |
commit | 3018ff30938c797d47b8c9dbd55e57d3f037aa10 (patch) | |
tree | 6ac16ef5bb4e0b25136fba4ade9e56746cf81ddb /eval | |
parent | bd372b2cb06a89c5427523be0080324883a0602b (diff) |
fast forest refactoring and experimentation
Diffstat (limited to 'eval')
-rw-r--r-- | eval/src/tests/eval/gbdt/.gitignore | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/gbdt/CMakeLists.txt | 6 | ||||
-rw-r--r-- | eval/src/tests/eval/gbdt/fast_forest_bench.cpp | 56 | ||||
-rw-r--r-- | eval/src/tests/eval/gbdt/gbdt_test.cpp | 78 | ||||
-rw-r--r-- | eval/src/tests/eval/gbdt/model.cpp | 14 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/fast_forest.cpp | 606 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/fast_forest.h | 129 |
7 files changed, 670 insertions, 220 deletions
diff --git a/eval/src/tests/eval/gbdt/.gitignore b/eval/src/tests/eval/gbdt/.gitignore index d0ee762745c..952736e3543 100644 --- a/eval/src/tests/eval/gbdt/.gitignore +++ b/eval/src/tests/eval/gbdt/.gitignore @@ -1 +1,2 @@ /eval_gbdt_benchmark_app +/eval_fast_forest_bench_app diff --git a/eval/src/tests/eval/gbdt/CMakeLists.txt b/eval/src/tests/eval/gbdt/CMakeLists.txt index edbe56e3143..874a2d7bd02 100644 --- a/eval/src/tests/eval/gbdt/CMakeLists.txt +++ b/eval/src/tests/eval/gbdt/CMakeLists.txt @@ -13,3 +13,9 @@ vespa_add_executable(eval_gbdt_benchmark_app vespaeval ) vespa_add_test(NAME eval_gbdt_benchmark_app COMMAND eval_gbdt_benchmark_app BENCHMARK) +vespa_add_executable(eval_fast_forest_bench_app + SOURCES + fast_forest_bench.cpp + DEPENDS + vespaeval +) diff --git a/eval/src/tests/eval/gbdt/fast_forest_bench.cpp b/eval/src/tests/eval/gbdt/fast_forest_bench.cpp new file mode 100644 index 00000000000..76a56bec50c --- /dev/null +++ b/eval/src/tests/eval/gbdt/fast_forest_bench.cpp @@ -0,0 +1,56 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/function.h> +#include <vespa/eval/eval/fast_forest.h> +#include <vespa/eval/eval/vm_forest.h> +#include <vespa/eval/eval/llvm/compiled_function.h> +#include "model.cpp" + +using namespace vespalib::eval; +using namespace vespalib::eval::gbdt; + +template <typename T> +void estimate_cost(size_t num_params, const char *label, const T &impl) { + std::vector<double> inputs_min(num_params, 0.25); + std::vector<double> inputs_med(num_params, 0.50); + std::vector<double> inputs_max(num_params, 0.75); + std::vector<double> inputs_nan(num_params, std::numeric_limits<double>::quiet_NaN()); + double us_min = impl.estimate_cost_us(inputs_min, 5.0); + double us_med = impl.estimate_cost_us(inputs_med, 5.0); + double us_max = impl.estimate_cost_us(inputs_max, 5.0); + double us_nan = impl.estimate_cost_us(inputs_nan, 5.0); + fprintf(stderr, "[%12s] (per 100 eval): [low values] %6.3f ms, [medium values] %6.3f ms, [high values] %6.3f ms, [nan values] %6.3f ms\n", + label, (us_min / 10.0), (us_med / 10.0), (us_max / 10.0), (us_nan / 10.0)); +} + +void run_fast_forest_bench() { + for (size_t tree_size: std::vector<size_t>({8,16,32,64,128,256})) { + for (size_t num_trees: std::vector<size_t>({100, 500, 2500, 5000, 10000})) { + for (size_t max_features: std::vector<size_t>({200})) { + for (size_t less_percent: std::vector<size_t>({100})) { + for (size_t invert_percent: std::vector<size_t>({50})) { + fprintf(stderr, "\n=== features: %zu, num leafs: %zu, num trees: %zu\n", max_features, tree_size, num_trees); + vespalib::string expression = Model().max_features(max_features).less_percent(less_percent).invert_percent(invert_percent).make_forest(num_trees, tree_size); + Function function = Function::parse(expression); + for (size_t min_bits = std::max(size_t(8), tree_size); true; min_bits *= 2) { + auto forest = FastForest::try_convert(function, min_bits, 64); + if (forest) { + estimate_cost(function.num_params(), forest->impl_name().c_str(), *forest); + } + if (min_bits > 64) { + break; + } + } + estimate_cost(function.num_params(), "vm forest", CompiledFunction(function, PassParams::ARRAY, VMForest::optimize_chain)); + } + } + } + } + } + fprintf(stderr, "\n"); +} + +int main(int, char **) { + run_fast_forest_bench(); + return 0; +} diff --git a/eval/src/tests/eval/gbdt/gbdt_test.cpp b/eval/src/tests/eval/gbdt/gbdt_test.cpp index 14fa4510f4d..adb3d22847a 100644 --- a/eval/src/tests/eval/gbdt/gbdt_test.cpp +++ b/eval/src/tests/eval/gbdt/gbdt_test.cpp @@ -17,6 +17,14 @@ using namespace vespalib::eval::gbdt; //----------------------------------------------------------------------------- +bool is_little_endian() { + uint32_t value = 0; + uint8_t bytes[4] = {0, 1, 2, 3}; + static_assert(sizeof(bytes) == sizeof(value)); + memcpy(&value, bytes, sizeof(bytes)); + return (value == 0x03020100); +} + double eval_double(const Function &function, const std::vector<double> ¶ms) { InterpretedFunction ifun(SimpleTensorEngine::ref(), function, NodeTypes()); InterpretedFunction::Context ctx(ifun); @@ -26,6 +34,22 @@ double eval_double(const Function &function, const std::vector<double> ¶ms) double my_resolve(void *ctx, size_t idx) { return ((double*)ctx)[idx]; } +double eval_compiled(const CompiledFunction &cfun, std::vector<double> ¶ms) { + ASSERT_EQUAL(params.size(), cfun.num_params()); + if (cfun.pass_params() == PassParams::ARRAY) { + return cfun.get_function()(¶ms[0]); + } + if (cfun.pass_params() == PassParams::LAZY) { + return cfun.get_lazy_function()(my_resolve, ¶ms[0]); + } + return 31212.0; +} + +double eval_ff(const FastForest &ff, FastForest::Context &ctx, const std::vector<double> ¶ms) { + std::vector<float> my_params(params.begin(), params.end()); + return ff.eval(ctx, &my_params[0]); +} + //----------------------------------------------------------------------------- TEST("require that tree stats can be calculated") { @@ -304,29 +328,18 @@ TEST("require that FastForest model evaluation works") { EXPECT_TRUE(compiled.get_forests().empty()); auto forest = FastForest::try_convert(function); ASSERT_TRUE(forest); - FastForest::Context ctx(*forest); + auto ctx = forest->create_context(); std::vector<double> p1({0.5, 0.5, 0.5}); // all true: 1.0 + 10.0 std::vector<double> p2({2.5, 2.5, 2.5}); // all false: 4.0 + 40.0 std::vector<double> pn(3, std::numeric_limits<double>::quiet_NaN()); // default: 4.0 + 10.0 - EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0])); - EXPECT_EQUAL(forest->eval(ctx, [&p2](size_t i){return p2[i];}), f(&p2[0])); - EXPECT_EQUAL(forest->eval(ctx, [&pn](size_t i){return pn[i];}), f(&pn[0])); - EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0])); + EXPECT_EQUAL(eval_ff(*forest, *ctx, p1), f(&p1[0])); + EXPECT_EQUAL(eval_ff(*forest, *ctx, p2), f(&p2[0])); + EXPECT_EQUAL(eval_ff(*forest, *ctx, pn), f(&pn[0])); + EXPECT_EQUAL(eval_ff(*forest, *ctx, p1), f(&p1[0])); } //----------------------------------------------------------------------------- -double eval_compiled(const CompiledFunction &cfun, std::vector<double> ¶ms) { - ASSERT_EQUAL(params.size(), cfun.num_params()); - if (cfun.pass_params() == PassParams::ARRAY) { - return cfun.get_function()(¶ms[0]); - } - if (cfun.pass_params() == PassParams::LAZY) { - return cfun.get_lazy_function()(my_resolve, ¶ms[0]); - } - return 31212.0; -} - TEST("require that forests evaluate to approximately the same for all evaluation options") { for (PassParams pass_params: {PassParams::ARRAY, PassParams::LAZY}) { for (size_t tree_size: std::vector<size_t>({20})) { @@ -356,9 +369,36 @@ TEST("require that forests evaluate to approximately the same for all evaluation EXPECT_EQUAL(expected_nan, eval_compiled(deinline, inputs_nan)); EXPECT_EQUAL(expected_nan, eval_compiled(vm_forest, inputs_nan)); if (forest) { - FastForest::Context ctx(*forest); - EXPECT_EQUAL(expected, forest->eval(ctx, [&inputs](size_t i){return inputs[i];})); - EXPECT_EQUAL(expected_nan, forest->eval(ctx, [&inputs_nan](size_t i){return inputs_nan[i];})); + auto ctx = forest->create_context(); + EXPECT_EQUAL(expected, eval_ff(*forest, *ctx, inputs)); + EXPECT_EQUAL(expected_nan, eval_ff(*forest, *ctx, inputs_nan)); + } + } + } + } + } + } +} + +TEST("require that fast forest evaluation is correct for all tree size categories") { + for (size_t tree_size: std::vector<size_t>({7,15,30,61,127})) { + for (size_t num_trees: std::vector<size_t>({127})) { + for (size_t num_features: std::vector<size_t>({35})) { + for (size_t less_percent: std::vector<size_t>({100})) { + for (size_t invert_percent: std::vector<size_t>({50})) { + vespalib::string expression = Model().max_features(num_features).less_percent(less_percent).invert_percent(invert_percent).make_forest(num_trees, tree_size); + Function function = Function::parse(expression); + auto forest = FastForest::try_convert(function); + if ((tree_size <= 64) || is_little_endian()) { + ASSERT_TRUE(forest); + TEST_STATE(forest->impl_name().c_str()); + std::vector<double> inputs(function.num_params(), 0.5); + std::vector<double> inputs_nan(function.num_params(), std::numeric_limits<double>::quiet_NaN()); + double expected = eval_double(function, inputs); + double expected_nan = eval_double(function, inputs_nan); + auto ctx = forest->create_context(); + EXPECT_EQUAL(expected, eval_ff(*forest, *ctx, inputs)); + EXPECT_EQUAL(expected_nan, eval_ff(*forest, *ctx, inputs_nan)); } } } diff --git a/eval/src/tests/eval/gbdt/model.cpp b/eval/src/tests/eval/gbdt/model.cpp index ae1c9bea437..8f0d87a4020 100644 --- a/eval/src/tests/eval/gbdt/model.cpp +++ b/eval/src/tests/eval/gbdt/model.cpp @@ -13,6 +13,7 @@ class Model { private: std::mt19937 _gen; + size_t _max_features; size_t _less_percent; size_t _invert_percent; @@ -32,9 +33,9 @@ private: } std::string make_feature_name() { - size_t max_feature = 2; - while ((max_feature < 1024) && (get_int(0, 99) < 55)) { - max_feature *= 2; + size_t max_feature = 7; + while ((max_feature < _max_features) && (get_int(0, 99) < 55)) { + max_feature = std::min(max_feature * 2, _max_features); } return make_string("feature_%zu", get_int(1, max_feature)); } @@ -60,7 +61,12 @@ private: } public: - explicit Model(size_t seed = 5489u) : _gen(seed), _less_percent(80), _invert_percent(0) {} + explicit Model(size_t seed = 5489u) : _gen(seed), _max_features(1024), _less_percent(80), _invert_percent(0) {} + + Model &max_features(size_t value) { + _max_features = value; + return *this; + } Model &less_percent(size_t value) { _less_percent = value; diff --git a/eval/src/vespa/eval/eval/fast_forest.cpp b/eval/src/vespa/eval/eval/fast_forest.cpp index 927c5355e6a..be58c78d834 100644 --- a/eval/src/vespa/eval/eval/fast_forest.cpp +++ b/eval/src/vespa/eval/eval/fast_forest.cpp @@ -8,16 +8,38 @@ #include <vespa/vespalib/util/benchmark_timer.h> #include <algorithm> #include <cassert> +#include <arpa/inet.h> namespace vespalib::eval::gbdt { namespace { +//----------------------------------------------------------------------------- +// internal concepts used during model creation +//----------------------------------------------------------------------------- + +constexpr size_t bits_per_byte = 8; + +bool is_little_endian() { + uint32_t value = 0; + uint8_t bytes[4] = {0, 1, 2, 3}; + static_assert(sizeof(bytes) == sizeof(value)); + memcpy(&value, bytes, sizeof(bytes)); + return (value == 0x03020100); +} + struct BitRange { uint32_t first; uint32_t last; BitRange(uint32_t bit) : first(bit), last(bit) {} BitRange(uint32_t a, uint32_t b) : first(a), last(b) {} + template <typename T> + size_t covered_words() const { + assert(first <= last); + uint32_t v1 = (first / (bits_per_byte * sizeof(T))); + uint32_t v2 = (last / (bits_per_byte * sizeof(T))); + return ((v2 - v1) + 1); + } static BitRange join(const BitRange &a, const BitRange &b) { assert((a.last + 1) == b.first); return BitRange(a.first, b.last); @@ -43,8 +65,11 @@ struct State { using CmpNodes = std::vector<CmpNode>; std::vector<CmpNodes> cmp_nodes; std::vector<Leafs> leafs; + size_t max_leafs; BitRange encode_node(uint32_t tree_id, const nodes::Node &node); State(size_t num_params, const std::vector<const nodes::Node *> &trees); + size_t num_params() const { return cmp_nodes.size(); } + size_t num_trees() const { return leafs.size(); } ~State() = default; }; @@ -86,12 +111,14 @@ State::encode_node(uint32_t tree_id, const nodes::Node &node) State::State(size_t num_params, const std::vector<const nodes::Node *> &trees) : cmp_nodes(num_params), - leafs(trees.size()) + leafs(trees.size()), + max_leafs(0) { for (uint32_t tree_id = 0; tree_id < trees.size(); ++tree_id) { BitRange leaf_range = encode_node(tree_id, *trees[tree_id]); assert(leaf_range.first == 0); assert((leaf_range.last + 1) == leafs[tree_id].size()); + max_leafs = std::max(max_leafs, leafs[tree_id].size()); } for (CmpNodes &cmp_range: cmp_nodes) { assert(!cmp_range.empty()); @@ -99,134 +126,547 @@ State::State(size_t num_params, const std::vector<const nodes::Node *> &trees) } } +template <typename T> size_t get_lsb(T value) { return vespalib::Optimized::lsbIdx(value); } +template <> size_t get_lsb<uint8_t>(uint8_t value) { return vespalib::Optimized::lsbIdx(uint32_t(value)); } +template <> size_t get_lsb<uint16_t>(uint16_t value) { return vespalib::Optimized::lsbIdx(uint32_t(value)); } + +//----------------------------------------------------------------------------- +// implementation using single value mask per tree +//----------------------------------------------------------------------------- + +template <typename T> vespalib::string fixed_impl_name(); +template <> vespalib::string fixed_impl_name<uint8_t>() { return "ff-fixed<8>"; } +template <> vespalib::string fixed_impl_name<uint16_t>() { return "ff-fixed<16>"; } +template <> vespalib::string fixed_impl_name<uint32_t>() { return "ff-fixed<32>"; } +template <> vespalib::string fixed_impl_name<uint64_t>() { return "ff-fixed<64>"; } + +template <typename T> +constexpr size_t max_leafs() { return (sizeof(T) * bits_per_byte); } + +template <typename T> +struct FixedContext : FastForest::Context { + std::vector<T> masks; + FixedContext(size_t num_trees) : masks(num_trees) {} +}; + +template <typename T> +struct FixedForest : FastForest { + + static T make_mask(const CmpNode &cmp_node) { + BitRange range = cmp_node.false_mask; + size_t num_bits = (sizeof(T) * bits_per_byte); + assert(range.last < num_bits); + assert(range.first <= range.last); + T mask = 0; + for (uint32_t i = 0; i < num_bits; ++i) { + if ((i < range.first) || (i > range.last)) { + mask |= (T(1) << i); + } + } + return mask; + } + + struct Mask { + float value; + uint32_t tree; + T bits; + Mask(float v, uint32_t t, T b) + : value(v), tree(t), bits(b) {} + }; + + struct DMask { + uint32_t tree; + T bits; + DMask(uint32_t t, T b) + : tree(t), bits(b) {} + }; + + std::vector<uint32_t> _mask_sizes; + std::vector<Mask> _masks; + std::vector<uint32_t> _default_offsets; + std::vector<DMask> _default_masks; + std::vector<float> _padded_leafs; + uint32_t _num_trees; + uint32_t _max_leafs; + + FixedForest(const State &state); + static FastForest::UP try_build(const State &state, size_t min_fixed, size_t max_fixed); + + void init_state(T *ctx_masks) const; + static void apply_masks(T *ctx_masks, const Mask *pos, const Mask *end, float limit); + static void apply_masks(T *ctx_masks, const DMask *pos, const DMask *end); + double get_result(const T *ctx_masks) const; + + vespalib::string impl_name() const override { return fixed_impl_name<T>(); } + Context::UP create_context() const override; + double eval(Context &context, const float *params) const override; +}; + +template <typename T> +FixedForest<T>::FixedForest(const State &state) + : _mask_sizes(), + _masks(), + _default_offsets(), + _default_masks(), + _padded_leafs(), + _num_trees(state.num_trees()), + _max_leafs(state.max_leafs) +{ + for (const auto &cmp_nodes: state.cmp_nodes) { + _mask_sizes.emplace_back(cmp_nodes.size()); + _default_offsets.push_back(_default_masks.size()); + for (const CmpNode &cmp_node: cmp_nodes) { + _masks.emplace_back(cmp_node.value, cmp_node.tree_id, make_mask(cmp_node)); + if (cmp_node.false_is_default) { + _default_masks.emplace_back(cmp_node.tree_id, make_mask(cmp_node)); + } + } + } + _default_offsets.push_back(_default_masks.size()); + for (const auto &leafs: state.leafs) { + for (float leaf: leafs) { + _padded_leafs.push_back(leaf); + } + size_t padding = (_max_leafs - leafs.size()); + while (padding-- > 0) { + _padded_leafs.push_back(0.0); + } + } + assert(_padded_leafs.size() == (_num_trees * _max_leafs)); +} + +template <typename T> +FastForest::UP +FixedForest<T>::try_build(const State &state, size_t min_fixed, size_t max_fixed) +{ + if ((max_leafs<T>() >= min_fixed) && + (max_leafs<T>() <= max_fixed) && + (state.max_leafs <= max_leafs<T>())) + { + return std::make_unique<FixedForest<T>>(state); + } + return FastForest::UP(); +} + +template <typename T> +void +FixedForest<T>::init_state(T *ctx_masks) const +{ + memset(ctx_masks, 0xff, _num_trees * sizeof(T)); +} + +template <typename T> +void +FixedForest<T>::apply_masks(T *ctx_masks, const Mask *pos, const Mask *end, float limit) +{ + for (; ((pos+3) < end) && !(limit < pos[3].value); pos += 4) { + ctx_masks[pos[0].tree] &= pos[0].bits; + ctx_masks[pos[1].tree] &= pos[1].bits; + ctx_masks[pos[2].tree] &= pos[2].bits; + ctx_masks[pos[3].tree] &= pos[3].bits; + } + for (; (pos < end) && !(limit < pos->value); ++pos) { + ctx_masks[pos->tree] &= pos->bits; + } +} + +template <typename T> +void +FixedForest<T>::apply_masks(T *ctx_masks, const DMask *pos, const DMask *end) +{ + for (; ((pos+3) < end); pos += 4) { + ctx_masks[pos[0].tree] &= pos[0].bits; + ctx_masks[pos[1].tree] &= pos[1].bits; + ctx_masks[pos[2].tree] &= pos[2].bits; + ctx_masks[pos[3].tree] &= pos[3].bits; + } + for (; (pos < end); ++pos) { + ctx_masks[pos->tree] &= pos->bits; + } +} + +template <typename T> +double +FixedForest<T>::get_result(const T *ctx_masks) const +{ + double result1 = 0.0; + double result2 = 0.0; + const T *ctx_end = (ctx_masks + _num_trees); + const float *leafs = &_padded_leafs[0]; + size_t leaf_cnt = _max_leafs; + for (; (ctx_masks + 3) < ctx_end; ctx_masks += 4, leafs += (leaf_cnt * 4)) { + result1 += leafs[(0 * leaf_cnt) + get_lsb(ctx_masks[0])]; + result2 += leafs[(1 * leaf_cnt) + get_lsb(ctx_masks[1])]; + result1 += leafs[(2 * leaf_cnt) + get_lsb(ctx_masks[2])]; + result2 += leafs[(3 * leaf_cnt) + get_lsb(ctx_masks[3])]; + } + for (; ctx_masks < ctx_end; ++ctx_masks, leafs += leaf_cnt) { + result1 += leafs[get_lsb(*ctx_masks)]; + } + return (result1 + result2); } -struct FastForestBuilder { +template <typename T> +FastForest::Context::UP +FixedForest<T>::create_context() const +{ + return std::make_unique<FixedContext<T>>(_num_trees); +} - static FastForest::MaskType get_mask_type(uint32_t idx1, uint32_t idx2) { - assert(idx1 <= idx2); - if (idx1 == idx2) { - return FastForest::MaskType::ONE; - } else if ((idx1 + 1) == idx2) { - return FastForest::MaskType::TWO; +template <typename T> +double +FixedForest<T>::eval(Context &context, const float *params) const +{ + T *ctx_masks = &static_cast<FixedContext<T>&>(context).masks[0]; + init_state(ctx_masks); + const Mask *mask_pos = &_masks[0]; + const float *param_pos = params; + for (uint32_t size: _mask_sizes) { + float feature = *param_pos++; + if (!std::isnan(feature)) { + apply_masks(ctx_masks, mask_pos, mask_pos + size, feature); } else { - return FastForest::MaskType::MANY; + apply_masks(ctx_masks, + &_default_masks[_default_offsets[(param_pos-params)-1]], + &_default_masks[_default_offsets[(param_pos-params)]]); } + mask_pos += size; } + return get_result(ctx_masks); +} + +//----------------------------------------------------------------------------- +// implementation using multiple words for each tree +//----------------------------------------------------------------------------- + +struct MultiWordContext : FastForest::Context { + std::vector<uint32_t> words; + MultiWordContext(size_t size) : words(size) {} +}; + +struct MultiWordForest : FastForest { + + constexpr static size_t word_size = sizeof(uint32_t); + constexpr static size_t bits_per_word = (word_size * bits_per_byte); + + struct Sizes { + uint32_t fixed; + uint32_t rle; + Sizes(uint32_t f, uint32_t r) : fixed(f), rle(r) {} + }; + + struct Mask { + float value; + uint32_t offset; + union { + uint32_t bits; + uint8_t rle_mask[3]; + }; + Mask(float v, uint32_t word_offset, uint32_t mask_bits) + : value(v), offset(word_offset), bits(mask_bits) {} + Mask(float v, uint32_t byte_offset, uint8_t first_bits, uint8_t empty_bytes, uint8_t last_bits) + : value(v), offset(byte_offset), rle_mask{first_bits, empty_bytes, last_bits} {} + }; - static FastForest::Mask make_mask(const CmpNode &cmp_node) { + struct DMask { + uint32_t offset; + union { + uint32_t bits; + uint8_t rle_mask[3]; + }; + DMask(uint32_t word_offset, uint32_t mask_bits) + : offset(word_offset), bits(mask_bits) {} + DMask(uint32_t byte_offset, uint8_t first_bits, uint8_t empty_bytes, uint8_t last_bits) + : offset(byte_offset), rle_mask{first_bits, empty_bytes, last_bits} {} + }; + + static Mask make_fixed_mask(const CmpNode &cmp_node, size_t words_per_tree) { BitRange range = cmp_node.false_mask; - assert(range.last < (8 * 256)); - assert(range.first <= range.last); - uint32_t idx1 = (range.first / 8); - uint32_t idx2 = (range.last / 8); + assert(range.covered_words<uint32_t>() == 1); + size_t offset = (range.first / bits_per_word); + uint32_t bits = 0; + for (uint32_t i = 0; i < bits_per_word; ++i) { + uint32_t bit = (offset * bits_per_word) + i; + if ((bit < range.first) || (bit > range.last)) { + bits |= (uint32_t(1) << i); + } + } + offset += (words_per_tree * cmp_node.tree_id); + return Mask(cmp_node.value, offset, bits); + } + + static Mask make_rle_mask(const CmpNode &cmp_node, size_t words_per_tree) { + BitRange range = cmp_node.false_mask; + assert(range.covered_words<uint32_t>() > 1); + uint32_t idx1 = (range.first / bits_per_byte); + uint32_t idx2 = (range.last / bits_per_byte); uint8_t bits1 = 0; uint8_t bits2 = 0; - for (uint32_t i = 0; i < 8; ++i) { - uint32_t bit1 = (idx1 * 8) + i; + for (uint32_t i = 0; i < bits_per_byte; ++i) { + uint32_t bit1 = (idx1 * bits_per_byte) + i; if ((bit1 < range.first) || (bit1 > range.last)) { - bits1 |= (1 << i); + bits1 |= (uint8_t(1) << i); } - uint32_t bit2 = (idx2 * 8) + i; + uint32_t bit2 = (idx2 * bits_per_byte) + i; if ((bit2 < range.first) || (bit2 > range.last)) { - bits2 |= (1 << i); + bits2 |= (uint8_t(1) << i); } } - assert(cmp_node.tree_id < (256 * 256)); - return FastForest::Mask(cmp_node.tree_id, get_mask_type(idx1, idx2), cmp_node.false_is_default, - idx1, bits1, idx2, bits2); + uint32_t offset = (idx1 + (word_size * words_per_tree * cmp_node.tree_id)); + uint32_t empty_cnt = ((idx2 - idx1) - 1); + assert(empty_cnt < 256); + return Mask(cmp_node.value, offset, bits1, empty_cnt, bits2); } - static void build(State &state, FastForest &ff) { - for (const auto &cmp_nodes: state.cmp_nodes) { - ff._feature_sizes.push_back(cmp_nodes.size()); - for (const CmpNode &cmp_node: cmp_nodes) { - ff._values.push_back(cmp_node.value); - ff._masks.push_back(make_mask(cmp_node)); + std::vector<Sizes> _mask_sizes; + std::vector<Mask> _masks; + std::vector<Sizes> _default_offsets; + std::vector<DMask> _default_masks; + std::vector<uint32_t> _tree_offsets; + std::vector<float> _leafs; + uint32_t _words_per_tree; + + MultiWordForest(const State &state); + static FastForest::UP try_build(const State &state); + + void init_state(uint32_t *ctx_words) const; + static void apply_fixed_masks(uint32_t *ctx_words, const Mask *pos, const Mask *end, float limit); + static void apply_rle_masks(unsigned char *ctx_bytes, const Mask *pos, const Mask *end, float limit); + static void apply_fixed_masks(uint32_t *ctx_words, const DMask *pos, const DMask *end); + static void apply_rle_masks(unsigned char *ctx_bytes, const DMask *pos, const DMask *end); + static size_t find_leaf(const uint32_t *ctx_words); + double get_result(const uint32_t *ctx_words) const; + + vespalib::string impl_name() const override { return "ff-multiword"; } + Context::UP create_context() const override; + double eval(Context &context, const float *params) const override; +}; + +MultiWordForest::MultiWordForest(const State &state) + : _mask_sizes(), + _masks(), + _default_offsets(), + _default_masks(), + _tree_offsets(), + _leafs(), + _words_per_tree(BitRange(0, state.max_leafs - 1).covered_words<uint32_t>()) +{ + for (const auto &cmp_nodes: state.cmp_nodes) { + std::vector<CmpNode> fixed; + std::vector<CmpNode> rle; + size_t default_fixed_cnt = 0; + for (const CmpNode &cmp_node: cmp_nodes) { + if (cmp_node.false_mask.covered_words<uint32_t>() == 1) { + fixed.push_back(cmp_node); + if (cmp_node.false_is_default) { + ++default_fixed_cnt; + } + } else { + rle.push_back(cmp_node); } } - for (const auto &leafs: state.leafs) { - ff._tree_sizes.push_back(leafs.size()); - for (float leaf: leafs) { - ff._leafs.push_back(leaf); + _mask_sizes.emplace_back(fixed.size(), rle.size()); + _default_offsets.emplace_back(_default_masks.size(), + _default_masks.size() + default_fixed_cnt); + for (const CmpNode &cmp_node: fixed) { + _masks.push_back(make_fixed_mask(cmp_node, _words_per_tree)); + if (cmp_node.false_is_default) { + _default_masks.emplace_back(_masks.back().offset, + _masks.back().bits); + } + } + assert(_default_masks.size() == _default_offsets.back().rle); + for (const CmpNode &cmp_node: rle) { + _masks.push_back(make_rle_mask(cmp_node, _words_per_tree)); + if (cmp_node.false_is_default) { + _default_masks.emplace_back(_masks.back().offset, + _masks.back().rle_mask[0], + _masks.back().rle_mask[1], + _masks.back().rle_mask[2]); } } } -}; + _default_offsets.emplace_back(_default_masks.size(), _default_masks.size()); + for (const auto &leafs: state.leafs) { + _tree_offsets.push_back(_leafs.size()); + for (float leaf: leafs) { + _leafs.push_back(leaf); + } + } +} -FastForest::Context::Context(const FastForest &ff) - : _forest(&ff), - _bytes_per_tree((ff.max_leafs() + 7) / 8), - _bits(_bytes_per_tree * ff.num_trees()) {} +FastForest::UP +MultiWordForest::try_build(const State &state) +{ + if (is_little_endian()) { + if (state.max_leafs <= (bits_per_byte * 256)) { + return std::make_unique<MultiWordForest>(state); + } + } + return FastForest::UP(); +} -FastForest::Context::~Context() = default; +void +MultiWordForest::init_state(uint32_t *ctx_words) const +{ + memset(ctx_words, 0xff, word_size * _words_per_tree * _tree_offsets.size()); +} -FastForest::FastForest() = default; -FastForest::~FastForest() = default; +void +MultiWordForest::apply_fixed_masks(uint32_t *ctx_words, const Mask *pos, const Mask *end, float limit) +{ + for (; ((pos+3) < end) && !(limit < pos[3].value); pos += 4) { + ctx_words[pos[0].offset] &= pos[0].bits; + ctx_words[pos[1].offset] &= pos[1].bits; + ctx_words[pos[2].offset] &= pos[2].bits; + ctx_words[pos[3].offset] &= pos[3].bits; + } + for (; (pos < end) && !(limit < pos->value); ++pos) { + ctx_words[pos->offset] &= pos->bits; + } +} -size_t -FastForest::num_params() const +void +MultiWordForest::apply_rle_masks(unsigned char *ctx_bytes, const Mask *pos, const Mask *end, float limit) { - return _feature_sizes.size(); + for (; (pos < end) && !(limit < pos->value); ++pos) { + unsigned char *dst = (ctx_bytes + pos->offset); + *dst++ &= pos->rle_mask[0]; + for (size_t e = pos->rle_mask[1]; e-- > 0; ) { + *dst++ = 0; + } + *dst &= pos->rle_mask[2]; + } } -size_t -FastForest::num_trees() const +void +MultiWordForest::apply_fixed_masks(uint32_t *ctx_words, const DMask *pos, const DMask *end) { - return _tree_sizes.size(); + for (; ((pos+3) < end); pos += 4) { + ctx_words[pos[0].offset] &= pos[0].bits; + ctx_words[pos[1].offset] &= pos[1].bits; + ctx_words[pos[2].offset] &= pos[2].bits; + ctx_words[pos[3].offset] &= pos[3].bits; + } + for (; (pos < end); ++pos) { + ctx_words[pos->offset] &= pos->bits; + } } -size_t -FastForest::max_leafs() const +void +MultiWordForest::apply_rle_masks(unsigned char *ctx_bytes, const DMask *pos, const DMask *end) { - size_t res = 0; - size_t sum = 0; - for (size_t sz: _tree_sizes) { - res = std::max(res, sz); - sum += sz; + for (; pos < end; ++pos) { + unsigned char *dst = (ctx_bytes + pos->offset); + *dst++ &= pos->rle_mask[0]; + for (size_t e = pos->rle_mask[1]; e-- > 0; ) { + *dst++ = 0; + } + *dst &= pos->rle_mask[2]; } - assert(res <= (8 * 256)); - assert(sum == _leafs.size()); - return res; } -FastForest::UP -FastForest::try_convert(const Function &fun) +size_t +MultiWordForest::find_leaf(const uint32_t *word) { - const auto &root = fun.root(); - if (!root.is_forest()) { - // must be only forest - return FastForest::UP(); + size_t idx = 0; + for (; *word == 0; ++word) { + idx += bits_per_word; } - auto trees = gbdt::extract_trees(root); - if (trees.size() > (256 * 256)) { - // too many trees - return FastForest::UP(); + return (idx + get_lsb(*word)); +} + +double +MultiWordForest::get_result(const uint32_t *ctx_words) const +{ + double result = 0.0; + const float *leafs = &_leafs[0]; + for (size_t tree_offset: _tree_offsets) { + result += leafs[tree_offset + find_leaf(ctx_words)]; + ctx_words += _words_per_tree; } - gbdt::ForestStats stats(trees); - if (stats.total_in_checks > 0) { - // set membership not supported - return FastForest::UP(); + return result; +} + +FastForest::Context::UP +MultiWordForest::create_context() const +{ + return std::make_unique<MultiWordContext>(_words_per_tree * _tree_offsets.size()); +} + +double +MultiWordForest::eval(Context &context, const float *params) const +{ + uint32_t *ctx_words = &static_cast<MultiWordContext&>(context).words[0]; + init_state(ctx_words); + const Mask *mask_pos = &_masks[0]; + const float *param_pos = params; + for (const Sizes &size: _mask_sizes) { + float feature = *param_pos++; + if (!std::isnan(feature)) { + apply_fixed_masks(ctx_words, mask_pos, mask_pos + size.fixed, feature); + apply_rle_masks(reinterpret_cast<unsigned char *>(ctx_words), + mask_pos + size.fixed, mask_pos + size.fixed + size.rle, feature); + } else { + apply_fixed_masks(ctx_words, + &_default_masks[_default_offsets[(param_pos-params)-1].fixed], + &_default_masks[_default_offsets[(param_pos-params)-1].rle]); + apply_rle_masks(reinterpret_cast<unsigned char *>(ctx_words), + &_default_masks[_default_offsets[(param_pos-params)-1].rle], + &_default_masks[_default_offsets[(param_pos-params)].fixed]); + } + mask_pos += (size.fixed + size.rle); } - if (stats.tree_sizes.back().size > (8 * 256)) { - // too many leaf nodes per tree - return FastForest::UP(); + return get_result(ctx_words); +} + +} + +//----------------------------------------------------------------------------- +// outer shell unifying the different implementations +//----------------------------------------------------------------------------- + +FastForest::Context::Context() = default; +FastForest::Context::~Context() = default; + +FastForest::FastForest() = default; +FastForest::~FastForest() = default; + +FastForest::UP +FastForest::try_convert(const Function &fun, size_t min_fixed, size_t max_fixed) +{ + const auto &root = fun.root(); + if (root.is_forest()) { + auto trees = gbdt::extract_trees(root); + gbdt::ForestStats stats(trees); + if (stats.total_in_checks == 0) { + State state(fun.num_params(), trees); + if (auto forest = FixedForest<uint8_t>::try_build(state, min_fixed, max_fixed)) { + return forest; + } + if (auto forest = FixedForest<uint16_t>::try_build(state, min_fixed, max_fixed)) { + return forest; + } + if (auto forest = FixedForest<uint32_t>::try_build(state, min_fixed, max_fixed)) { + return forest; + } + if (auto forest = FixedForest<uint64_t>::try_build(state, min_fixed, max_fixed)) { + return forest; + } + if (auto forest = MultiWordForest::try_build(state)) { + return forest; + } + } } - State state(fun.num_params(), trees); - FastForest::UP res = FastForest::UP(new FastForest()); - FastForestBuilder::build(state, *res); - assert(fun.num_params() == res->num_params()); - assert(trees.size() == res->num_trees()); - return res; + return FastForest::UP(); } double FastForest::estimate_cost_us(const std::vector<double> ¶ms, double budget) const { - Context ctx(*this); - auto get_param = [¶ms](size_t i)->float{ return params[i]; }; - auto self_eval = [&](){ this->eval(ctx, get_param); }; - return BenchmarkTimer::benchmark(self_eval, budget) * 1000.0 * 1000.0; + auto ctx = create_context(); + std::vector<float> my_params(params.begin(), params.end()); + return BenchmarkTimer::benchmark([&](){ eval(*ctx, &my_params[0]); }, budget) * 1000.0 * 1000.0; } } diff --git a/eval/src/vespa/eval/eval/fast_forest.h b/eval/src/vespa/eval/eval/fast_forest.h index df3be2c4aef..27171322454 100644 --- a/eval/src/vespa/eval/eval/fast_forest.h +++ b/eval/src/vespa/eval/eval/fast_forest.h @@ -14,127 +14,28 @@ namespace vespalib::eval::gbdt { * Use modern optimization strategies to improve evaluation * performance of GBDT forests. * - * This model evaluation supports up to 65536 trees with up to 2048 - * leaf nodes in each tree. Comparisons must be on the form 'feature < - * const' or '!(feature >= const)'. The inverted form is used to - * signal that the true branch should be selected when the feature - * values is missing (NaN). + * Comparisons must be on the form 'feature < const' or '!(feature >= + * const)'. The inverted form is used to signal that the true branch + * should be selected when the feature value is missing (NaN). **/ class FastForest { +protected: + FastForest(); public: + virtual ~FastForest(); + using UP = std::unique_ptr<FastForest>; class Context { - friend class FastForest; - private: - const FastForest *_forest; - size_t _bytes_per_tree; - std::vector<uint8_t> _bits; + protected: + Context(); public: - explicit Context(const FastForest &ff); - ~Context(); - }; - -private: - friend struct FastForestBuilder; - - enum class MaskType : uint8_t { ONE, TWO, MANY }; - - struct Mask { - uint16_t tree_id; - MaskType type; - uint8_t false_is_default; - uint8_t first_idx; - uint8_t first_bits; - uint8_t last_idx; - uint8_t last_bits; - Mask(uint16_t tree, MaskType mt, bool def_false, - uint8_t idx1, uint8_t bits1, uint8_t idx2, uint8_t bits2) - : tree_id(tree), type(mt), false_is_default(def_false), - first_idx(idx1), first_bits(bits1), last_idx(idx2), last_bits(bits2) {} + virtual ~Context(); + using UP = std::unique_ptr<Context>; }; - - std::vector<uint32_t> _feature_sizes; - std::vector<float> _values; - std::vector<Mask> _masks; - std::vector<uint32_t> _tree_sizes; - std::vector<float> _leafs; - - FastForest(); - - static void apply_mask(uint8_t *bits, size_t bytes_per_tree, const Mask &mask) { - uint8_t *dst = (bits + (mask.tree_id * bytes_per_tree) + mask.first_idx); - *dst &= mask.first_bits; - if (__builtin_expect(mask.type != MaskType::ONE, false)) { - if (__builtin_expect(mask.type == MaskType::MANY, false)) { - size_t n = (mask.last_idx - mask.first_idx - 1); - while (n-- > 0) { - *++dst = 0x00; - } - } - *++dst &= mask.last_bits; - } - } - - static float find_leaf(const uint8_t *bits, const float *leafs) { - while (__builtin_expect(*bits == 0, true)) { - ++bits; - leafs += 8; - } - return leafs[vespalib::Optimized::lsbIdx(uint32_t(*bits))]; - } - -public: - ~FastForest(); - size_t num_params() const; - size_t num_trees() const; - size_t max_leafs() const; - using UP = std::unique_ptr<FastForest>; - static UP try_convert(const Function &fun); - - template <typename F> - double eval(Context &ctx, F &&f) const { - assert(ctx._forest == this); - size_t bytes_per_tree = ctx._bytes_per_tree; - uint8_t *bits = &ctx._bits[0]; - const float *value_ptr = &_values[0]; - const Mask *mask_ptr = &_masks[0]; - memset(bits, 0xff, ctx._bits.size()); - for (size_t f_idx = 0; f_idx < _feature_sizes.size(); ++f_idx) { - size_t feature_size = _feature_sizes[f_idx]; - float feature_value = f(f_idx); // get param - if (__builtin_expect(std::isnan(feature_value), false)) { - // handle 'missing' input feature - for (size_t i = 0; i < feature_size; ++i) { - const Mask &mask = mask_ptr[i]; - if (mask.false_is_default) { - apply_mask(bits, bytes_per_tree, mask); - } - } - } else { - for (size_t i = 0; i < feature_size; ++i) { - if (__builtin_expect(feature_value < value_ptr[i], false)) { - break; - } else { - apply_mask(bits, bytes_per_tree, mask_ptr[i]); - } - } - } - value_ptr += feature_size; - mask_ptr += feature_size; - } - assert(value_ptr == &*_values.end()); - assert(mask_ptr == &*_masks.end()); - const float *leafs = &_leafs[0]; - double result = 0.0; - for (size_t tree_size: _tree_sizes) { - result += find_leaf(bits, leafs); - bits += bytes_per_tree; - leafs += tree_size; - } - assert(bits == &*ctx._bits.end()); - assert(leafs == &*_leafs.end()); - return result; - } + static UP try_convert(const Function &fun, size_t min_fixed = 8, size_t max_fixed = 64); + virtual vespalib::string impl_name() const = 0; + virtual Context::UP create_context() const = 0; + virtual double eval(Context &context, const float *params) const = 0; double estimate_cost_us(const std::vector<double> ¶ms, double budget = 5.0) const; }; |