aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2019-10-17 11:27:23 +0000
committerHåvard Pettersen <havardpe@oath.com>2019-11-01 13:02:31 +0000
commit3018ff30938c797d47b8c9dbd55e57d3f037aa10 (patch)
tree6ac16ef5bb4e0b25136fba4ade9e56746cf81ddb
parentbd372b2cb06a89c5427523be0080324883a0602b (diff)
fast forest refactoring and experimentation
-rw-r--r--eval/src/tests/eval/gbdt/.gitignore1
-rw-r--r--eval/src/tests/eval/gbdt/CMakeLists.txt6
-rw-r--r--eval/src/tests/eval/gbdt/fast_forest_bench.cpp56
-rw-r--r--eval/src/tests/eval/gbdt/gbdt_test.cpp78
-rw-r--r--eval/src/tests/eval/gbdt/model.cpp14
-rw-r--r--eval/src/vespa/eval/eval/fast_forest.cpp606
-rw-r--r--eval/src/vespa/eval/eval/fast_forest.h129
-rw-r--r--searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp27
8 files changed, 689 insertions, 228 deletions
diff --git a/eval/src/tests/eval/gbdt/.gitignore b/eval/src/tests/eval/gbdt/.gitignore
index d0ee762745c..952736e3543 100644
--- a/eval/src/tests/eval/gbdt/.gitignore
+++ b/eval/src/tests/eval/gbdt/.gitignore
@@ -1 +1,2 @@
/eval_gbdt_benchmark_app
+/eval_fast_forest_bench_app
diff --git a/eval/src/tests/eval/gbdt/CMakeLists.txt b/eval/src/tests/eval/gbdt/CMakeLists.txt
index edbe56e3143..874a2d7bd02 100644
--- a/eval/src/tests/eval/gbdt/CMakeLists.txt
+++ b/eval/src/tests/eval/gbdt/CMakeLists.txt
@@ -13,3 +13,9 @@ vespa_add_executable(eval_gbdt_benchmark_app
vespaeval
)
vespa_add_test(NAME eval_gbdt_benchmark_app COMMAND eval_gbdt_benchmark_app BENCHMARK)
+vespa_add_executable(eval_fast_forest_bench_app
+ SOURCES
+ fast_forest_bench.cpp
+ DEPENDS
+ vespaeval
+)
diff --git a/eval/src/tests/eval/gbdt/fast_forest_bench.cpp b/eval/src/tests/eval/gbdt/fast_forest_bench.cpp
new file mode 100644
index 00000000000..76a56bec50c
--- /dev/null
+++ b/eval/src/tests/eval/gbdt/fast_forest_bench.cpp
@@ -0,0 +1,56 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/function.h>
+#include <vespa/eval/eval/fast_forest.h>
+#include <vespa/eval/eval/vm_forest.h>
+#include <vespa/eval/eval/llvm/compiled_function.h>
+#include "model.cpp"
+
+using namespace vespalib::eval;
+using namespace vespalib::eval::gbdt;
+
+template <typename T>
+void estimate_cost(size_t num_params, const char *label, const T &impl) {
+ std::vector<double> inputs_min(num_params, 0.25);
+ std::vector<double> inputs_med(num_params, 0.50);
+ std::vector<double> inputs_max(num_params, 0.75);
+ std::vector<double> inputs_nan(num_params, std::numeric_limits<double>::quiet_NaN());
+ double us_min = impl.estimate_cost_us(inputs_min, 5.0);
+ double us_med = impl.estimate_cost_us(inputs_med, 5.0);
+ double us_max = impl.estimate_cost_us(inputs_max, 5.0);
+ double us_nan = impl.estimate_cost_us(inputs_nan, 5.0);
+ fprintf(stderr, "[%12s] (per 100 eval): [low values] %6.3f ms, [medium values] %6.3f ms, [high values] %6.3f ms, [nan values] %6.3f ms\n",
+ label, (us_min / 10.0), (us_med / 10.0), (us_max / 10.0), (us_nan / 10.0));
+}
+
+void run_fast_forest_bench() {
+ for (size_t tree_size: std::vector<size_t>({8,16,32,64,128,256})) {
+ for (size_t num_trees: std::vector<size_t>({100, 500, 2500, 5000, 10000})) {
+ for (size_t max_features: std::vector<size_t>({200})) {
+ for (size_t less_percent: std::vector<size_t>({100})) {
+ for (size_t invert_percent: std::vector<size_t>({50})) {
+ fprintf(stderr, "\n=== features: %zu, num leafs: %zu, num trees: %zu\n", max_features, tree_size, num_trees);
+ vespalib::string expression = Model().max_features(max_features).less_percent(less_percent).invert_percent(invert_percent).make_forest(num_trees, tree_size);
+ Function function = Function::parse(expression);
+ for (size_t min_bits = std::max(size_t(8), tree_size); true; min_bits *= 2) {
+ auto forest = FastForest::try_convert(function, min_bits, 64);
+ if (forest) {
+ estimate_cost(function.num_params(), forest->impl_name().c_str(), *forest);
+ }
+ if (min_bits > 64) {
+ break;
+ }
+ }
+ estimate_cost(function.num_params(), "vm forest", CompiledFunction(function, PassParams::ARRAY, VMForest::optimize_chain));
+ }
+ }
+ }
+ }
+ }
+ fprintf(stderr, "\n");
+}
+
+int main(int, char **) {
+ run_fast_forest_bench();
+ return 0;
+}
diff --git a/eval/src/tests/eval/gbdt/gbdt_test.cpp b/eval/src/tests/eval/gbdt/gbdt_test.cpp
index 14fa4510f4d..adb3d22847a 100644
--- a/eval/src/tests/eval/gbdt/gbdt_test.cpp
+++ b/eval/src/tests/eval/gbdt/gbdt_test.cpp
@@ -17,6 +17,14 @@ using namespace vespalib::eval::gbdt;
//-----------------------------------------------------------------------------
+bool is_little_endian() {
+ uint32_t value = 0;
+ uint8_t bytes[4] = {0, 1, 2, 3};
+ static_assert(sizeof(bytes) == sizeof(value));
+ memcpy(&value, bytes, sizeof(bytes));
+ return (value == 0x03020100);
+}
+
double eval_double(const Function &function, const std::vector<double> &params) {
InterpretedFunction ifun(SimpleTensorEngine::ref(), function, NodeTypes());
InterpretedFunction::Context ctx(ifun);
@@ -26,6 +34,22 @@ double eval_double(const Function &function, const std::vector<double> &params)
double my_resolve(void *ctx, size_t idx) { return ((double*)ctx)[idx]; }
+double eval_compiled(const CompiledFunction &cfun, std::vector<double> &params) {
+ ASSERT_EQUAL(params.size(), cfun.num_params());
+ if (cfun.pass_params() == PassParams::ARRAY) {
+ return cfun.get_function()(&params[0]);
+ }
+ if (cfun.pass_params() == PassParams::LAZY) {
+ return cfun.get_lazy_function()(my_resolve, &params[0]);
+ }
+ return 31212.0;
+}
+
+double eval_ff(const FastForest &ff, FastForest::Context &ctx, const std::vector<double> &params) {
+ std::vector<float> my_params(params.begin(), params.end());
+ return ff.eval(ctx, &my_params[0]);
+}
+
//-----------------------------------------------------------------------------
TEST("require that tree stats can be calculated") {
@@ -304,29 +328,18 @@ TEST("require that FastForest model evaluation works") {
EXPECT_TRUE(compiled.get_forests().empty());
auto forest = FastForest::try_convert(function);
ASSERT_TRUE(forest);
- FastForest::Context ctx(*forest);
+ auto ctx = forest->create_context();
std::vector<double> p1({0.5, 0.5, 0.5}); // all true: 1.0 + 10.0
std::vector<double> p2({2.5, 2.5, 2.5}); // all false: 4.0 + 40.0
std::vector<double> pn(3, std::numeric_limits<double>::quiet_NaN()); // default: 4.0 + 10.0
- EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0]));
- EXPECT_EQUAL(forest->eval(ctx, [&p2](size_t i){return p2[i];}), f(&p2[0]));
- EXPECT_EQUAL(forest->eval(ctx, [&pn](size_t i){return pn[i];}), f(&pn[0]));
- EXPECT_EQUAL(forest->eval(ctx, [&p1](size_t i){return p1[i];}), f(&p1[0]));
+ EXPECT_EQUAL(eval_ff(*forest, *ctx, p1), f(&p1[0]));
+ EXPECT_EQUAL(eval_ff(*forest, *ctx, p2), f(&p2[0]));
+ EXPECT_EQUAL(eval_ff(*forest, *ctx, pn), f(&pn[0]));
+ EXPECT_EQUAL(eval_ff(*forest, *ctx, p1), f(&p1[0]));
}
//-----------------------------------------------------------------------------
-double eval_compiled(const CompiledFunction &cfun, std::vector<double> &params) {
- ASSERT_EQUAL(params.size(), cfun.num_params());
- if (cfun.pass_params() == PassParams::ARRAY) {
- return cfun.get_function()(&params[0]);
- }
- if (cfun.pass_params() == PassParams::LAZY) {
- return cfun.get_lazy_function()(my_resolve, &params[0]);
- }
- return 31212.0;
-}
-
TEST("require that forests evaluate to approximately the same for all evaluation options") {
for (PassParams pass_params: {PassParams::ARRAY, PassParams::LAZY}) {
for (size_t tree_size: std::vector<size_t>({20})) {
@@ -356,9 +369,36 @@ TEST("require that forests evaluate to approximately the same for all evaluation
EXPECT_EQUAL(expected_nan, eval_compiled(deinline, inputs_nan));
EXPECT_EQUAL(expected_nan, eval_compiled(vm_forest, inputs_nan));
if (forest) {
- FastForest::Context ctx(*forest);
- EXPECT_EQUAL(expected, forest->eval(ctx, [&inputs](size_t i){return inputs[i];}));
- EXPECT_EQUAL(expected_nan, forest->eval(ctx, [&inputs_nan](size_t i){return inputs_nan[i];}));
+ auto ctx = forest->create_context();
+ EXPECT_EQUAL(expected, eval_ff(*forest, *ctx, inputs));
+ EXPECT_EQUAL(expected_nan, eval_ff(*forest, *ctx, inputs_nan));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST("require that fast forest evaluation is correct for all tree size categories") {
+ for (size_t tree_size: std::vector<size_t>({7,15,30,61,127})) {
+ for (size_t num_trees: std::vector<size_t>({127})) {
+ for (size_t num_features: std::vector<size_t>({35})) {
+ for (size_t less_percent: std::vector<size_t>({100})) {
+ for (size_t invert_percent: std::vector<size_t>({50})) {
+ vespalib::string expression = Model().max_features(num_features).less_percent(less_percent).invert_percent(invert_percent).make_forest(num_trees, tree_size);
+ Function function = Function::parse(expression);
+ auto forest = FastForest::try_convert(function);
+ if ((tree_size <= 64) || is_little_endian()) {
+ ASSERT_TRUE(forest);
+ TEST_STATE(forest->impl_name().c_str());
+ std::vector<double> inputs(function.num_params(), 0.5);
+ std::vector<double> inputs_nan(function.num_params(), std::numeric_limits<double>::quiet_NaN());
+ double expected = eval_double(function, inputs);
+ double expected_nan = eval_double(function, inputs_nan);
+ auto ctx = forest->create_context();
+ EXPECT_EQUAL(expected, eval_ff(*forest, *ctx, inputs));
+ EXPECT_EQUAL(expected_nan, eval_ff(*forest, *ctx, inputs_nan));
}
}
}
diff --git a/eval/src/tests/eval/gbdt/model.cpp b/eval/src/tests/eval/gbdt/model.cpp
index ae1c9bea437..8f0d87a4020 100644
--- a/eval/src/tests/eval/gbdt/model.cpp
+++ b/eval/src/tests/eval/gbdt/model.cpp
@@ -13,6 +13,7 @@ class Model
{
private:
std::mt19937 _gen;
+ size_t _max_features;
size_t _less_percent;
size_t _invert_percent;
@@ -32,9 +33,9 @@ private:
}
std::string make_feature_name() {
- size_t max_feature = 2;
- while ((max_feature < 1024) && (get_int(0, 99) < 55)) {
- max_feature *= 2;
+ size_t max_feature = 7;
+ while ((max_feature < _max_features) && (get_int(0, 99) < 55)) {
+ max_feature = std::min(max_feature * 2, _max_features);
}
return make_string("feature_%zu", get_int(1, max_feature));
}
@@ -60,7 +61,12 @@ private:
}
public:
- explicit Model(size_t seed = 5489u) : _gen(seed), _less_percent(80), _invert_percent(0) {}
+ explicit Model(size_t seed = 5489u) : _gen(seed), _max_features(1024), _less_percent(80), _invert_percent(0) {}
+
+ Model &max_features(size_t value) {
+ _max_features = value;
+ return *this;
+ }
Model &less_percent(size_t value) {
_less_percent = value;
diff --git a/eval/src/vespa/eval/eval/fast_forest.cpp b/eval/src/vespa/eval/eval/fast_forest.cpp
index 927c5355e6a..be58c78d834 100644
--- a/eval/src/vespa/eval/eval/fast_forest.cpp
+++ b/eval/src/vespa/eval/eval/fast_forest.cpp
@@ -8,16 +8,38 @@
#include <vespa/vespalib/util/benchmark_timer.h>
#include <algorithm>
#include <cassert>
+#include <arpa/inet.h>
namespace vespalib::eval::gbdt {
namespace {
+//-----------------------------------------------------------------------------
+// internal concepts used during model creation
+//-----------------------------------------------------------------------------
+
+constexpr size_t bits_per_byte = 8;
+
+bool is_little_endian() {
+ uint32_t value = 0;
+ uint8_t bytes[4] = {0, 1, 2, 3};
+ static_assert(sizeof(bytes) == sizeof(value));
+ memcpy(&value, bytes, sizeof(bytes));
+ return (value == 0x03020100);
+}
+
struct BitRange {
uint32_t first;
uint32_t last;
BitRange(uint32_t bit) : first(bit), last(bit) {}
BitRange(uint32_t a, uint32_t b) : first(a), last(b) {}
+ template <typename T>
+ size_t covered_words() const {
+ assert(first <= last);
+ uint32_t v1 = (first / (bits_per_byte * sizeof(T)));
+ uint32_t v2 = (last / (bits_per_byte * sizeof(T)));
+ return ((v2 - v1) + 1);
+ }
static BitRange join(const BitRange &a, const BitRange &b) {
assert((a.last + 1) == b.first);
return BitRange(a.first, b.last);
@@ -43,8 +65,11 @@ struct State {
using CmpNodes = std::vector<CmpNode>;
std::vector<CmpNodes> cmp_nodes;
std::vector<Leafs> leafs;
+ size_t max_leafs;
BitRange encode_node(uint32_t tree_id, const nodes::Node &node);
State(size_t num_params, const std::vector<const nodes::Node *> &trees);
+ size_t num_params() const { return cmp_nodes.size(); }
+ size_t num_trees() const { return leafs.size(); }
~State() = default;
};
@@ -86,12 +111,14 @@ State::encode_node(uint32_t tree_id, const nodes::Node &node)
State::State(size_t num_params, const std::vector<const nodes::Node *> &trees)
: cmp_nodes(num_params),
- leafs(trees.size())
+ leafs(trees.size()),
+ max_leafs(0)
{
for (uint32_t tree_id = 0; tree_id < trees.size(); ++tree_id) {
BitRange leaf_range = encode_node(tree_id, *trees[tree_id]);
assert(leaf_range.first == 0);
assert((leaf_range.last + 1) == leafs[tree_id].size());
+ max_leafs = std::max(max_leafs, leafs[tree_id].size());
}
for (CmpNodes &cmp_range: cmp_nodes) {
assert(!cmp_range.empty());
@@ -99,134 +126,547 @@ State::State(size_t num_params, const std::vector<const nodes::Node *> &trees)
}
}
+template <typename T> size_t get_lsb(T value) { return vespalib::Optimized::lsbIdx(value); }
+template <> size_t get_lsb<uint8_t>(uint8_t value) { return vespalib::Optimized::lsbIdx(uint32_t(value)); }
+template <> size_t get_lsb<uint16_t>(uint16_t value) { return vespalib::Optimized::lsbIdx(uint32_t(value)); }
+
+//-----------------------------------------------------------------------------
+// implementation using single value mask per tree
+//-----------------------------------------------------------------------------
+
+template <typename T> vespalib::string fixed_impl_name();
+template <> vespalib::string fixed_impl_name<uint8_t>() { return "ff-fixed<8>"; }
+template <> vespalib::string fixed_impl_name<uint16_t>() { return "ff-fixed<16>"; }
+template <> vespalib::string fixed_impl_name<uint32_t>() { return "ff-fixed<32>"; }
+template <> vespalib::string fixed_impl_name<uint64_t>() { return "ff-fixed<64>"; }
+
+template <typename T>
+constexpr size_t max_leafs() { return (sizeof(T) * bits_per_byte); }
+
+template <typename T>
+struct FixedContext : FastForest::Context {
+ std::vector<T> masks;
+ FixedContext(size_t num_trees) : masks(num_trees) {}
+};
+
+template <typename T>
+struct FixedForest : FastForest {
+
+ static T make_mask(const CmpNode &cmp_node) {
+ BitRange range = cmp_node.false_mask;
+ size_t num_bits = (sizeof(T) * bits_per_byte);
+ assert(range.last < num_bits);
+ assert(range.first <= range.last);
+ T mask = 0;
+ for (uint32_t i = 0; i < num_bits; ++i) {
+ if ((i < range.first) || (i > range.last)) {
+ mask |= (T(1) << i);
+ }
+ }
+ return mask;
+ }
+
+ struct Mask {
+ float value;
+ uint32_t tree;
+ T bits;
+ Mask(float v, uint32_t t, T b)
+ : value(v), tree(t), bits(b) {}
+ };
+
+ struct DMask {
+ uint32_t tree;
+ T bits;
+ DMask(uint32_t t, T b)
+ : tree(t), bits(b) {}
+ };
+
+ std::vector<uint32_t> _mask_sizes;
+ std::vector<Mask> _masks;
+ std::vector<uint32_t> _default_offsets;
+ std::vector<DMask> _default_masks;
+ std::vector<float> _padded_leafs;
+ uint32_t _num_trees;
+ uint32_t _max_leafs;
+
+ FixedForest(const State &state);
+ static FastForest::UP try_build(const State &state, size_t min_fixed, size_t max_fixed);
+
+ void init_state(T *ctx_masks) const;
+ static void apply_masks(T *ctx_masks, const Mask *pos, const Mask *end, float limit);
+ static void apply_masks(T *ctx_masks, const DMask *pos, const DMask *end);
+ double get_result(const T *ctx_masks) const;
+
+ vespalib::string impl_name() const override { return fixed_impl_name<T>(); }
+ Context::UP create_context() const override;
+ double eval(Context &context, const float *params) const override;
+};
+
+template <typename T>
+FixedForest<T>::FixedForest(const State &state)
+ : _mask_sizes(),
+ _masks(),
+ _default_offsets(),
+ _default_masks(),
+ _padded_leafs(),
+ _num_trees(state.num_trees()),
+ _max_leafs(state.max_leafs)
+{
+ for (const auto &cmp_nodes: state.cmp_nodes) {
+ _mask_sizes.emplace_back(cmp_nodes.size());
+ _default_offsets.push_back(_default_masks.size());
+ for (const CmpNode &cmp_node: cmp_nodes) {
+ _masks.emplace_back(cmp_node.value, cmp_node.tree_id, make_mask(cmp_node));
+ if (cmp_node.false_is_default) {
+ _default_masks.emplace_back(cmp_node.tree_id, make_mask(cmp_node));
+ }
+ }
+ }
+ _default_offsets.push_back(_default_masks.size());
+ for (const auto &leafs: state.leafs) {
+ for (float leaf: leafs) {
+ _padded_leafs.push_back(leaf);
+ }
+ size_t padding = (_max_leafs - leafs.size());
+ while (padding-- > 0) {
+ _padded_leafs.push_back(0.0);
+ }
+ }
+ assert(_padded_leafs.size() == (_num_trees * _max_leafs));
+}
+
+template <typename T>
+FastForest::UP
+FixedForest<T>::try_build(const State &state, size_t min_fixed, size_t max_fixed)
+{
+ if ((max_leafs<T>() >= min_fixed) &&
+ (max_leafs<T>() <= max_fixed) &&
+ (state.max_leafs <= max_leafs<T>()))
+ {
+ return std::make_unique<FixedForest<T>>(state);
+ }
+ return FastForest::UP();
+}
+
+template <typename T>
+void
+FixedForest<T>::init_state(T *ctx_masks) const
+{
+ memset(ctx_masks, 0xff, _num_trees * sizeof(T));
+}
+
+template <typename T>
+void
+FixedForest<T>::apply_masks(T *ctx_masks, const Mask *pos, const Mask *end, float limit)
+{
+ for (; ((pos+3) < end) && !(limit < pos[3].value); pos += 4) {
+ ctx_masks[pos[0].tree] &= pos[0].bits;
+ ctx_masks[pos[1].tree] &= pos[1].bits;
+ ctx_masks[pos[2].tree] &= pos[2].bits;
+ ctx_masks[pos[3].tree] &= pos[3].bits;
+ }
+ for (; (pos < end) && !(limit < pos->value); ++pos) {
+ ctx_masks[pos->tree] &= pos->bits;
+ }
+}
+
+template <typename T>
+void
+FixedForest<T>::apply_masks(T *ctx_masks, const DMask *pos, const DMask *end)
+{
+ for (; ((pos+3) < end); pos += 4) {
+ ctx_masks[pos[0].tree] &= pos[0].bits;
+ ctx_masks[pos[1].tree] &= pos[1].bits;
+ ctx_masks[pos[2].tree] &= pos[2].bits;
+ ctx_masks[pos[3].tree] &= pos[3].bits;
+ }
+ for (; (pos < end); ++pos) {
+ ctx_masks[pos->tree] &= pos->bits;
+ }
+}
+
+template <typename T>
+double
+FixedForest<T>::get_result(const T *ctx_masks) const
+{
+ double result1 = 0.0;
+ double result2 = 0.0;
+ const T *ctx_end = (ctx_masks + _num_trees);
+ const float *leafs = &_padded_leafs[0];
+ size_t leaf_cnt = _max_leafs;
+ for (; (ctx_masks + 3) < ctx_end; ctx_masks += 4, leafs += (leaf_cnt * 4)) {
+ result1 += leafs[(0 * leaf_cnt) + get_lsb(ctx_masks[0])];
+ result2 += leafs[(1 * leaf_cnt) + get_lsb(ctx_masks[1])];
+ result1 += leafs[(2 * leaf_cnt) + get_lsb(ctx_masks[2])];
+ result2 += leafs[(3 * leaf_cnt) + get_lsb(ctx_masks[3])];
+ }
+ for (; ctx_masks < ctx_end; ++ctx_masks, leafs += leaf_cnt) {
+ result1 += leafs[get_lsb(*ctx_masks)];
+ }
+ return (result1 + result2);
}
-struct FastForestBuilder {
+template <typename T>
+FastForest::Context::UP
+FixedForest<T>::create_context() const
+{
+ return std::make_unique<FixedContext<T>>(_num_trees);
+}
- static FastForest::MaskType get_mask_type(uint32_t idx1, uint32_t idx2) {
- assert(idx1 <= idx2);
- if (idx1 == idx2) {
- return FastForest::MaskType::ONE;
- } else if ((idx1 + 1) == idx2) {
- return FastForest::MaskType::TWO;
+template <typename T>
+double
+FixedForest<T>::eval(Context &context, const float *params) const
+{
+ T *ctx_masks = &static_cast<FixedContext<T>&>(context).masks[0];
+ init_state(ctx_masks);
+ const Mask *mask_pos = &_masks[0];
+ const float *param_pos = params;
+ for (uint32_t size: _mask_sizes) {
+ float feature = *param_pos++;
+ if (!std::isnan(feature)) {
+ apply_masks(ctx_masks, mask_pos, mask_pos + size, feature);
} else {
- return FastForest::MaskType::MANY;
+ apply_masks(ctx_masks,
+ &_default_masks[_default_offsets[(param_pos-params)-1]],
+ &_default_masks[_default_offsets[(param_pos-params)]]);
}
+ mask_pos += size;
}
+ return get_result(ctx_masks);
+}
+
+//-----------------------------------------------------------------------------
+// implementation using multiple words for each tree
+//-----------------------------------------------------------------------------
+
+struct MultiWordContext : FastForest::Context {
+ std::vector<uint32_t> words;
+ MultiWordContext(size_t size) : words(size) {}
+};
+
+struct MultiWordForest : FastForest {
+
+ constexpr static size_t word_size = sizeof(uint32_t);
+ constexpr static size_t bits_per_word = (word_size * bits_per_byte);
+
+ struct Sizes {
+ uint32_t fixed;
+ uint32_t rle;
+ Sizes(uint32_t f, uint32_t r) : fixed(f), rle(r) {}
+ };
+
+ struct Mask {
+ float value;
+ uint32_t offset;
+ union {
+ uint32_t bits;
+ uint8_t rle_mask[3];
+ };
+ Mask(float v, uint32_t word_offset, uint32_t mask_bits)
+ : value(v), offset(word_offset), bits(mask_bits) {}
+ Mask(float v, uint32_t byte_offset, uint8_t first_bits, uint8_t empty_bytes, uint8_t last_bits)
+ : value(v), offset(byte_offset), rle_mask{first_bits, empty_bytes, last_bits} {}
+ };
- static FastForest::Mask make_mask(const CmpNode &cmp_node) {
+ struct DMask {
+ uint32_t offset;
+ union {
+ uint32_t bits;
+ uint8_t rle_mask[3];
+ };
+ DMask(uint32_t word_offset, uint32_t mask_bits)
+ : offset(word_offset), bits(mask_bits) {}
+ DMask(uint32_t byte_offset, uint8_t first_bits, uint8_t empty_bytes, uint8_t last_bits)
+ : offset(byte_offset), rle_mask{first_bits, empty_bytes, last_bits} {}
+ };
+
+ static Mask make_fixed_mask(const CmpNode &cmp_node, size_t words_per_tree) {
BitRange range = cmp_node.false_mask;
- assert(range.last < (8 * 256));
- assert(range.first <= range.last);
- uint32_t idx1 = (range.first / 8);
- uint32_t idx2 = (range.last / 8);
+ assert(range.covered_words<uint32_t>() == 1);
+ size_t offset = (range.first / bits_per_word);
+ uint32_t bits = 0;
+ for (uint32_t i = 0; i < bits_per_word; ++i) {
+ uint32_t bit = (offset * bits_per_word) + i;
+ if ((bit < range.first) || (bit > range.last)) {
+ bits |= (uint32_t(1) << i);
+ }
+ }
+ offset += (words_per_tree * cmp_node.tree_id);
+ return Mask(cmp_node.value, offset, bits);
+ }
+
+ static Mask make_rle_mask(const CmpNode &cmp_node, size_t words_per_tree) {
+ BitRange range = cmp_node.false_mask;
+ assert(range.covered_words<uint32_t>() > 1);
+ uint32_t idx1 = (range.first / bits_per_byte);
+ uint32_t idx2 = (range.last / bits_per_byte);
uint8_t bits1 = 0;
uint8_t bits2 = 0;
- for (uint32_t i = 0; i < 8; ++i) {
- uint32_t bit1 = (idx1 * 8) + i;
+ for (uint32_t i = 0; i < bits_per_byte; ++i) {
+ uint32_t bit1 = (idx1 * bits_per_byte) + i;
if ((bit1 < range.first) || (bit1 > range.last)) {
- bits1 |= (1 << i);
+ bits1 |= (uint8_t(1) << i);
}
- uint32_t bit2 = (idx2 * 8) + i;
+ uint32_t bit2 = (idx2 * bits_per_byte) + i;
if ((bit2 < range.first) || (bit2 > range.last)) {
- bits2 |= (1 << i);
+ bits2 |= (uint8_t(1) << i);
}
}
- assert(cmp_node.tree_id < (256 * 256));
- return FastForest::Mask(cmp_node.tree_id, get_mask_type(idx1, idx2), cmp_node.false_is_default,
- idx1, bits1, idx2, bits2);
+ uint32_t offset = (idx1 + (word_size * words_per_tree * cmp_node.tree_id));
+ uint32_t empty_cnt = ((idx2 - idx1) - 1);
+ assert(empty_cnt < 256);
+ return Mask(cmp_node.value, offset, bits1, empty_cnt, bits2);
}
- static void build(State &state, FastForest &ff) {
- for (const auto &cmp_nodes: state.cmp_nodes) {
- ff._feature_sizes.push_back(cmp_nodes.size());
- for (const CmpNode &cmp_node: cmp_nodes) {
- ff._values.push_back(cmp_node.value);
- ff._masks.push_back(make_mask(cmp_node));
+ std::vector<Sizes> _mask_sizes;
+ std::vector<Mask> _masks;
+ std::vector<Sizes> _default_offsets;
+ std::vector<DMask> _default_masks;
+ std::vector<uint32_t> _tree_offsets;
+ std::vector<float> _leafs;
+ uint32_t _words_per_tree;
+
+ MultiWordForest(const State &state);
+ static FastForest::UP try_build(const State &state);
+
+ void init_state(uint32_t *ctx_words) const;
+ static void apply_fixed_masks(uint32_t *ctx_words, const Mask *pos, const Mask *end, float limit);
+ static void apply_rle_masks(unsigned char *ctx_bytes, const Mask *pos, const Mask *end, float limit);
+ static void apply_fixed_masks(uint32_t *ctx_words, const DMask *pos, const DMask *end);
+ static void apply_rle_masks(unsigned char *ctx_bytes, const DMask *pos, const DMask *end);
+ static size_t find_leaf(const uint32_t *ctx_words);
+ double get_result(const uint32_t *ctx_words) const;
+
+ vespalib::string impl_name() const override { return "ff-multiword"; }
+ Context::UP create_context() const override;
+ double eval(Context &context, const float *params) const override;
+};
+
+MultiWordForest::MultiWordForest(const State &state)
+ : _mask_sizes(),
+ _masks(),
+ _default_offsets(),
+ _default_masks(),
+ _tree_offsets(),
+ _leafs(),
+ _words_per_tree(BitRange(0, state.max_leafs - 1).covered_words<uint32_t>())
+{
+ for (const auto &cmp_nodes: state.cmp_nodes) {
+ std::vector<CmpNode> fixed;
+ std::vector<CmpNode> rle;
+ size_t default_fixed_cnt = 0;
+ for (const CmpNode &cmp_node: cmp_nodes) {
+ if (cmp_node.false_mask.covered_words<uint32_t>() == 1) {
+ fixed.push_back(cmp_node);
+ if (cmp_node.false_is_default) {
+ ++default_fixed_cnt;
+ }
+ } else {
+ rle.push_back(cmp_node);
}
}
- for (const auto &leafs: state.leafs) {
- ff._tree_sizes.push_back(leafs.size());
- for (float leaf: leafs) {
- ff._leafs.push_back(leaf);
+ _mask_sizes.emplace_back(fixed.size(), rle.size());
+ _default_offsets.emplace_back(_default_masks.size(),
+ _default_masks.size() + default_fixed_cnt);
+ for (const CmpNode &cmp_node: fixed) {
+ _masks.push_back(make_fixed_mask(cmp_node, _words_per_tree));
+ if (cmp_node.false_is_default) {
+ _default_masks.emplace_back(_masks.back().offset,
+ _masks.back().bits);
+ }
+ }
+ assert(_default_masks.size() == _default_offsets.back().rle);
+ for (const CmpNode &cmp_node: rle) {
+ _masks.push_back(make_rle_mask(cmp_node, _words_per_tree));
+ if (cmp_node.false_is_default) {
+ _default_masks.emplace_back(_masks.back().offset,
+ _masks.back().rle_mask[0],
+ _masks.back().rle_mask[1],
+ _masks.back().rle_mask[2]);
}
}
}
-};
+ _default_offsets.emplace_back(_default_masks.size(), _default_masks.size());
+ for (const auto &leafs: state.leafs) {
+ _tree_offsets.push_back(_leafs.size());
+ for (float leaf: leafs) {
+ _leafs.push_back(leaf);
+ }
+ }
+}
-FastForest::Context::Context(const FastForest &ff)
- : _forest(&ff),
- _bytes_per_tree((ff.max_leafs() + 7) / 8),
- _bits(_bytes_per_tree * ff.num_trees()) {}
+FastForest::UP
+MultiWordForest::try_build(const State &state)
+{
+ if (is_little_endian()) {
+ if (state.max_leafs <= (bits_per_byte * 256)) {
+ return std::make_unique<MultiWordForest>(state);
+ }
+ }
+ return FastForest::UP();
+}
-FastForest::Context::~Context() = default;
+void
+MultiWordForest::init_state(uint32_t *ctx_words) const
+{
+ memset(ctx_words, 0xff, word_size * _words_per_tree * _tree_offsets.size());
+}
-FastForest::FastForest() = default;
-FastForest::~FastForest() = default;
+void
+MultiWordForest::apply_fixed_masks(uint32_t *ctx_words, const Mask *pos, const Mask *end, float limit)
+{
+ for (; ((pos+3) < end) && !(limit < pos[3].value); pos += 4) {
+ ctx_words[pos[0].offset] &= pos[0].bits;
+ ctx_words[pos[1].offset] &= pos[1].bits;
+ ctx_words[pos[2].offset] &= pos[2].bits;
+ ctx_words[pos[3].offset] &= pos[3].bits;
+ }
+ for (; (pos < end) && !(limit < pos->value); ++pos) {
+ ctx_words[pos->offset] &= pos->bits;
+ }
+}
-size_t
-FastForest::num_params() const
+void
+MultiWordForest::apply_rle_masks(unsigned char *ctx_bytes, const Mask *pos, const Mask *end, float limit)
{
- return _feature_sizes.size();
+ for (; (pos < end) && !(limit < pos->value); ++pos) {
+ unsigned char *dst = (ctx_bytes + pos->offset);
+ *dst++ &= pos->rle_mask[0];
+ for (size_t e = pos->rle_mask[1]; e-- > 0; ) {
+ *dst++ = 0;
+ }
+ *dst &= pos->rle_mask[2];
+ }
}
-size_t
-FastForest::num_trees() const
+void
+MultiWordForest::apply_fixed_masks(uint32_t *ctx_words, const DMask *pos, const DMask *end)
{
- return _tree_sizes.size();
+ for (; ((pos+3) < end); pos += 4) {
+ ctx_words[pos[0].offset] &= pos[0].bits;
+ ctx_words[pos[1].offset] &= pos[1].bits;
+ ctx_words[pos[2].offset] &= pos[2].bits;
+ ctx_words[pos[3].offset] &= pos[3].bits;
+ }
+ for (; (pos < end); ++pos) {
+ ctx_words[pos->offset] &= pos->bits;
+ }
}
-size_t
-FastForest::max_leafs() const
+void
+MultiWordForest::apply_rle_masks(unsigned char *ctx_bytes, const DMask *pos, const DMask *end)
{
- size_t res = 0;
- size_t sum = 0;
- for (size_t sz: _tree_sizes) {
- res = std::max(res, sz);
- sum += sz;
+ for (; pos < end; ++pos) {
+ unsigned char *dst = (ctx_bytes + pos->offset);
+ *dst++ &= pos->rle_mask[0];
+ for (size_t e = pos->rle_mask[1]; e-- > 0; ) {
+ *dst++ = 0;
+ }
+ *dst &= pos->rle_mask[2];
}
- assert(res <= (8 * 256));
- assert(sum == _leafs.size());
- return res;
}
-FastForest::UP
-FastForest::try_convert(const Function &fun)
+size_t
+MultiWordForest::find_leaf(const uint32_t *word)
{
- const auto &root = fun.root();
- if (!root.is_forest()) {
- // must be only forest
- return FastForest::UP();
+ size_t idx = 0;
+ for (; *word == 0; ++word) {
+ idx += bits_per_word;
}
- auto trees = gbdt::extract_trees(root);
- if (trees.size() > (256 * 256)) {
- // too many trees
- return FastForest::UP();
+ return (idx + get_lsb(*word));
+}
+
+double
+MultiWordForest::get_result(const uint32_t *ctx_words) const
+{
+ double result = 0.0;
+ const float *leafs = &_leafs[0];
+ for (size_t tree_offset: _tree_offsets) {
+ result += leafs[tree_offset + find_leaf(ctx_words)];
+ ctx_words += _words_per_tree;
}
- gbdt::ForestStats stats(trees);
- if (stats.total_in_checks > 0) {
- // set membership not supported
- return FastForest::UP();
+ return result;
+}
+
+FastForest::Context::UP
+MultiWordForest::create_context() const
+{
+ return std::make_unique<MultiWordContext>(_words_per_tree * _tree_offsets.size());
+}
+
+double
+MultiWordForest::eval(Context &context, const float *params) const
+{
+ uint32_t *ctx_words = &static_cast<MultiWordContext&>(context).words[0];
+ init_state(ctx_words);
+ const Mask *mask_pos = &_masks[0];
+ const float *param_pos = params;
+ for (const Sizes &size: _mask_sizes) {
+ float feature = *param_pos++;
+ if (!std::isnan(feature)) {
+ apply_fixed_masks(ctx_words, mask_pos, mask_pos + size.fixed, feature);
+ apply_rle_masks(reinterpret_cast<unsigned char *>(ctx_words),
+ mask_pos + size.fixed, mask_pos + size.fixed + size.rle, feature);
+ } else {
+ apply_fixed_masks(ctx_words,
+ &_default_masks[_default_offsets[(param_pos-params)-1].fixed],
+ &_default_masks[_default_offsets[(param_pos-params)-1].rle]);
+ apply_rle_masks(reinterpret_cast<unsigned char *>(ctx_words),
+ &_default_masks[_default_offsets[(param_pos-params)-1].rle],
+ &_default_masks[_default_offsets[(param_pos-params)].fixed]);
+ }
+ mask_pos += (size.fixed + size.rle);
}
- if (stats.tree_sizes.back().size > (8 * 256)) {
- // too many leaf nodes per tree
- return FastForest::UP();
+ return get_result(ctx_words);
+}
+
+}
+
+//-----------------------------------------------------------------------------
+// outer shell unifying the different implementations
+//-----------------------------------------------------------------------------
+
+FastForest::Context::Context() = default;
+FastForest::Context::~Context() = default;
+
+FastForest::FastForest() = default;
+FastForest::~FastForest() = default;
+
+FastForest::UP
+FastForest::try_convert(const Function &fun, size_t min_fixed, size_t max_fixed)
+{
+ const auto &root = fun.root();
+ if (root.is_forest()) {
+ auto trees = gbdt::extract_trees(root);
+ gbdt::ForestStats stats(trees);
+ if (stats.total_in_checks == 0) {
+ State state(fun.num_params(), trees);
+ if (auto forest = FixedForest<uint8_t>::try_build(state, min_fixed, max_fixed)) {
+ return forest;
+ }
+ if (auto forest = FixedForest<uint16_t>::try_build(state, min_fixed, max_fixed)) {
+ return forest;
+ }
+ if (auto forest = FixedForest<uint32_t>::try_build(state, min_fixed, max_fixed)) {
+ return forest;
+ }
+ if (auto forest = FixedForest<uint64_t>::try_build(state, min_fixed, max_fixed)) {
+ return forest;
+ }
+ if (auto forest = MultiWordForest::try_build(state)) {
+ return forest;
+ }
+ }
}
- State state(fun.num_params(), trees);
- FastForest::UP res = FastForest::UP(new FastForest());
- FastForestBuilder::build(state, *res);
- assert(fun.num_params() == res->num_params());
- assert(trees.size() == res->num_trees());
- return res;
+ return FastForest::UP();
}
double
FastForest::estimate_cost_us(const std::vector<double> &params, double budget) const
{
- Context ctx(*this);
- auto get_param = [&params](size_t i)->float{ return params[i]; };
- auto self_eval = [&](){ this->eval(ctx, get_param); };
- return BenchmarkTimer::benchmark(self_eval, budget) * 1000.0 * 1000.0;
+ auto ctx = create_context();
+ std::vector<float> my_params(params.begin(), params.end());
+ return BenchmarkTimer::benchmark([&](){ eval(*ctx, &my_params[0]); }, budget) * 1000.0 * 1000.0;
}
}
diff --git a/eval/src/vespa/eval/eval/fast_forest.h b/eval/src/vespa/eval/eval/fast_forest.h
index df3be2c4aef..27171322454 100644
--- a/eval/src/vespa/eval/eval/fast_forest.h
+++ b/eval/src/vespa/eval/eval/fast_forest.h
@@ -14,127 +14,28 @@ namespace vespalib::eval::gbdt {
* Use modern optimization strategies to improve evaluation
* performance of GBDT forests.
*
- * This model evaluation supports up to 65536 trees with up to 2048
- * leaf nodes in each tree. Comparisons must be on the form 'feature <
- * const' or '!(feature >= const)'. The inverted form is used to
- * signal that the true branch should be selected when the feature
- * values is missing (NaN).
+ * Comparisons must be on the form 'feature < const' or '!(feature >=
+ * const)'. The inverted form is used to signal that the true branch
+ * should be selected when the feature value is missing (NaN).
**/
class FastForest
{
+protected:
+ FastForest();
public:
+ virtual ~FastForest();
+ using UP = std::unique_ptr<FastForest>;
class Context {
- friend class FastForest;
- private:
- const FastForest *_forest;
- size_t _bytes_per_tree;
- std::vector<uint8_t> _bits;
+ protected:
+ Context();
public:
- explicit Context(const FastForest &ff);
- ~Context();
- };
-
-private:
- friend struct FastForestBuilder;
-
- enum class MaskType : uint8_t { ONE, TWO, MANY };
-
- struct Mask {
- uint16_t tree_id;
- MaskType type;
- uint8_t false_is_default;
- uint8_t first_idx;
- uint8_t first_bits;
- uint8_t last_idx;
- uint8_t last_bits;
- Mask(uint16_t tree, MaskType mt, bool def_false,
- uint8_t idx1, uint8_t bits1, uint8_t idx2, uint8_t bits2)
- : tree_id(tree), type(mt), false_is_default(def_false),
- first_idx(idx1), first_bits(bits1), last_idx(idx2), last_bits(bits2) {}
+ virtual ~Context();
+ using UP = std::unique_ptr<Context>;
};
-
- std::vector<uint32_t> _feature_sizes;
- std::vector<float> _values;
- std::vector<Mask> _masks;
- std::vector<uint32_t> _tree_sizes;
- std::vector<float> _leafs;
-
- FastForest();
-
- static void apply_mask(uint8_t *bits, size_t bytes_per_tree, const Mask &mask) {
- uint8_t *dst = (bits + (mask.tree_id * bytes_per_tree) + mask.first_idx);
- *dst &= mask.first_bits;
- if (__builtin_expect(mask.type != MaskType::ONE, false)) {
- if (__builtin_expect(mask.type == MaskType::MANY, false)) {
- size_t n = (mask.last_idx - mask.first_idx - 1);
- while (n-- > 0) {
- *++dst = 0x00;
- }
- }
- *++dst &= mask.last_bits;
- }
- }
-
- static float find_leaf(const uint8_t *bits, const float *leafs) {
- while (__builtin_expect(*bits == 0, true)) {
- ++bits;
- leafs += 8;
- }
- return leafs[vespalib::Optimized::lsbIdx(uint32_t(*bits))];
- }
-
-public:
- ~FastForest();
- size_t num_params() const;
- size_t num_trees() const;
- size_t max_leafs() const;
- using UP = std::unique_ptr<FastForest>;
- static UP try_convert(const Function &fun);
-
- template <typename F>
- double eval(Context &ctx, F &&f) const {
- assert(ctx._forest == this);
- size_t bytes_per_tree = ctx._bytes_per_tree;
- uint8_t *bits = &ctx._bits[0];
- const float *value_ptr = &_values[0];
- const Mask *mask_ptr = &_masks[0];
- memset(bits, 0xff, ctx._bits.size());
- for (size_t f_idx = 0; f_idx < _feature_sizes.size(); ++f_idx) {
- size_t feature_size = _feature_sizes[f_idx];
- float feature_value = f(f_idx); // get param
- if (__builtin_expect(std::isnan(feature_value), false)) {
- // handle 'missing' input feature
- for (size_t i = 0; i < feature_size; ++i) {
- const Mask &mask = mask_ptr[i];
- if (mask.false_is_default) {
- apply_mask(bits, bytes_per_tree, mask);
- }
- }
- } else {
- for (size_t i = 0; i < feature_size; ++i) {
- if (__builtin_expect(feature_value < value_ptr[i], false)) {
- break;
- } else {
- apply_mask(bits, bytes_per_tree, mask_ptr[i]);
- }
- }
- }
- value_ptr += feature_size;
- mask_ptr += feature_size;
- }
- assert(value_ptr == &*_values.end());
- assert(mask_ptr == &*_masks.end());
- const float *leafs = &_leafs[0];
- double result = 0.0;
- for (size_t tree_size: _tree_sizes) {
- result += find_leaf(bits, leafs);
- bits += bytes_per_tree;
- leafs += tree_size;
- }
- assert(bits == &*ctx._bits.end());
- assert(leafs == &*_leafs.end());
- return result;
- }
+ static UP try_convert(const Function &fun, size_t min_fixed = 8, size_t max_fixed = 64);
+ virtual vespalib::string impl_name() const = 0;
+ virtual Context::UP create_context() const = 0;
+ virtual double eval(Context &context, const float *params) const = 0;
double estimate_cost_us(const std::vector<double> &params, double budget = 5.0) const;
};
diff --git a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
index a4b2280fa57..0246f96f1df 100644
--- a/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/rankingexpressionfeature.cpp
@@ -50,10 +50,11 @@ class FastForestExecutor : public fef::FeatureExecutor
{
private:
const FastForest &_forest;
- FastForest::Context _ctx;
+ FastForest::Context::UP _ctx;
+ ArrayRef<float> _params;
public:
- FastForestExecutor(const FastForest &forest);
+ FastForestExecutor(ArrayRef<float> param_space, const FastForest &forest);
bool isPure() override { return true; }
void execute(uint32_t docId) override;
};
@@ -128,18 +129,27 @@ public:
//-----------------------------------------------------------------------------
-FastForestExecutor::FastForestExecutor(const FastForest &forest)
+FastForestExecutor::FastForestExecutor(ArrayRef<float> param_space, const FastForest &forest)
: _forest(forest),
- _ctx(_forest)
+ _ctx(_forest.create_context()),
+ _params(param_space)
{
}
void
FastForestExecutor::execute(uint32_t)
{
- const auto &params = inputs();
- double result = _forest.eval(_ctx, [&params](size_t p){ return params.get_number(p); });
- outputs().set_number(0, result);
+ size_t i = 0;
+ for (; (i + 3) < _params.size(); i += 4) {
+ _params[i+0] = inputs().get_number(i+0);
+ _params[i+1] = inputs().get_number(i+1);
+ _params[i+2] = inputs().get_number(i+2);
+ _params[i+3] = inputs().get_number(i+3);
+ }
+ for (; i < _params.size(); ++i) {
+ _params[i] = inputs().get_number(i);
+ }
+ outputs().set_number(0, _forest.eval(*_ctx, &_params[0]));
}
//-----------------------------------------------------------------------------
@@ -342,7 +352,8 @@ RankingExpressionBlueprint::createExecutor(const fef::IQueryEnvironment &env, ve
return stash.create<InterpretedRankingExpressionExecutor>(*_interpreted_function, input_is_object);
}
if (_fast_forest) {
- return stash.create<FastForestExecutor>(*_fast_forest);
+ ArrayRef<float> param_space = stash.create_array<float>(_input_is_object.size(), 0.0);
+ return stash.create<FastForestExecutor>(param_space, *_fast_forest);
}
assert(_compile_token.get() != nullptr); // will be nullptr for VERIFY_SETUP feature motivation
if (_compile_token->get().pass_params() == PassParams::ARRAY) {