diff options
author | Håvard Pettersen <3535158+havardpe@users.noreply.github.com> | 2020-11-02 11:55:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-02 11:55:59 +0100 |
commit | ed55e251ef2db9e3ff6a690d2ca011cf59a838a9 (patch) | |
tree | 2267160783b2e795b91d8c120d3d7bddca169f3a | |
parent | 65d0a37a01a4b1f2420b8f4cae3ea9091b01b189 (diff) | |
parent | 5cf5c2d2f0c7e371364eb9a29c255973c7ccd554 (diff) |
Merge pull request #15119 from vespa-engine/havardpe/simple-median-aggr
added simple median aggregator
-rw-r--r-- | eval/src/apps/tensor_conformance/generate.cpp | 2 | ||||
-rw-r--r-- | eval/src/tests/eval/aggr/aggr_test.cpp | 36 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/aggr.cpp | 16 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/aggr.h | 53 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/test/eval_spec.cpp | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/test/tensor_conformance.cpp | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp | 9 |
7 files changed, 98 insertions, 20 deletions
diff --git a/eval/src/apps/tensor_conformance/generate.cpp b/eval/src/apps/tensor_conformance/generate.cpp index df1c06593cb..ea535758cda 100644 --- a/eval/src/apps/tensor_conformance/generate.cpp +++ b/eval/src/apps/tensor_conformance/generate.cpp @@ -47,6 +47,8 @@ void generate_tensor_reduce(TestBuilder &dst) { generate_reduce(Aggr::PROD, SigmoidF(N()), dst); generate_reduce(Aggr::SUM, N(), dst); generate_reduce(Aggr::MAX, N(), dst); + // add MEDIAN cases when supported in Java + // generate_reduce(Aggr::MEDIAN, N(), dst); generate_reduce(Aggr::MIN, N(), dst); } diff --git a/eval/src/tests/eval/aggr/aggr_test.cpp b/eval/src/tests/eval/aggr/aggr_test.cpp index 603fdb508e2..b3e9c625fd9 100644 --- a/eval/src/tests/eval/aggr/aggr_test.cpp +++ b/eval/src/tests/eval/aggr/aggr_test.cpp @@ -9,13 +9,14 @@ using namespace vespalib::eval::aggr; TEST("require that aggregator list returns appropriate entries") { auto list = Aggregator::list(); - ASSERT_EQUAL(list.size(), 6u); + ASSERT_EQUAL(list.size(), 7u); EXPECT_EQUAL(int(list[0]), int(Aggr::AVG)); EXPECT_EQUAL(int(list[1]), int(Aggr::COUNT)); EXPECT_EQUAL(int(list[2]), int(Aggr::PROD)); EXPECT_EQUAL(int(list[3]), int(Aggr::SUM)); EXPECT_EQUAL(int(list[4]), int(Aggr::MAX)); - EXPECT_EQUAL(int(list[5]), int(Aggr::MIN)); + EXPECT_EQUAL(int(list[5]), int(Aggr::MEDIAN)); + EXPECT_EQUAL(int(list[6]), int(Aggr::MIN)); } TEST("require that AVG aggregator works as expected") { @@ -73,6 +74,31 @@ TEST("require that MAX aggregator works as expected") { aggr.next(200.0), EXPECT_EQUAL(aggr.result(), 200.0); } +TEST("require that MEDIAN aggregator works as expected") { + Stash stash; + Aggregator &aggr = Aggregator::create(Aggr::MEDIAN, stash); + EXPECT_TRUE(std::isnan(aggr.result())); + aggr.first(10.0), EXPECT_EQUAL(aggr.result(), 10.0); + aggr.next(20.0), EXPECT_EQUAL(aggr.result(), 15.0); + aggr.next(7.0), EXPECT_EQUAL(aggr.result(), 10.0); + aggr.next(40.0), EXPECT_EQUAL(aggr.result(), 15.0); + aggr.next(16.0), 
EXPECT_EQUAL(aggr.result(), 16.0); + aggr.first(100.0), EXPECT_EQUAL(aggr.result(), 100.0); + aggr.next(200.0), EXPECT_EQUAL(aggr.result(), 150.0); +} + +TEST("require that MEDIAN aggregator handles NaN values") { + Stash stash; + Aggregator &aggr = Aggregator::create(Aggr::MEDIAN, stash); + double my_nan = std::numeric_limits<double>::quiet_NaN(); + aggr.first(10.0); + EXPECT_EQUAL(aggr.result(), 10.0); + aggr.next(my_nan); + EXPECT_TRUE(std::isnan(aggr.result())); + aggr.next(20.0); + EXPECT_TRUE(std::isnan(aggr.result())); +} + TEST("require that MIN aggregator works as expected") { Stash stash; Aggregator &aggr = Aggregator::create(Aggr::MIN, stash); @@ -103,11 +129,17 @@ float aggr_merge(const std::vector<float> &a, const std::vector<float> &b) { } TEST("require that aggregator merge works") { + float my_nan = std::numeric_limits<float>::quiet_NaN(); EXPECT_EQUAL(aggr_merge<Avg>({1,2},{3,4}), 2.5); EXPECT_EQUAL(aggr_merge<Count>({1,2},{3,4}), 4.0); EXPECT_EQUAL(aggr_merge<Prod>({1,2},{3,4}), 24.0); EXPECT_EQUAL(aggr_merge<Sum>({1,2},{3,4}), 10.0); EXPECT_EQUAL(aggr_merge<Max>({1,2},{3,4}), 4.0); + EXPECT_EQUAL(aggr_merge<Median>({1,2},{3,4}), 2.5); + EXPECT_EQUAL(aggr_merge<Median>({1,2},{3,4,5}), 3); + EXPECT_EQUAL(aggr_merge<Median>({0,1,2},{3,4}), 2); + EXPECT_TRUE(std::isnan(aggr_merge<Median>({1,2,my_nan,3},{4,5}))); + EXPECT_TRUE(std::isnan(aggr_merge<Median>({1,2,3},{4,my_nan,5}))); EXPECT_EQUAL(aggr_merge<Min>({1,2},{3,4}), 1.0); } diff --git a/eval/src/vespa/eval/eval/aggr.cpp b/eval/src/vespa/eval/eval/aggr.cpp index e731c7a1f09..4abd5e41f47 100644 --- a/eval/src/vespa/eval/eval/aggr.cpp +++ b/eval/src/vespa/eval/eval/aggr.cpp @@ -34,12 +34,13 @@ AggrNames::AggrNames() : _name_aggr_map(), _aggr_name_map() { - add(Aggr::AVG, "avg"); - add(Aggr::COUNT, "count"); - add(Aggr::PROD, "prod"); - add(Aggr::SUM, "sum"); - add(Aggr::MAX, "max"); - add(Aggr::MIN, "min"); + add(Aggr::AVG, "avg"); + add(Aggr::COUNT, "count"); + add(Aggr::PROD, "prod"); + 
add(Aggr::SUM, "sum"); + add(Aggr::MAX, "max"); + add(Aggr::MEDIAN, "median"); + add(Aggr::MIN, "min"); } const vespalib::string * @@ -82,7 +83,8 @@ std::vector<Aggr> Aggregator::list() { return std::vector<Aggr>({ Aggr::AVG, Aggr::COUNT, Aggr::PROD, - Aggr::SUM, Aggr::MAX, Aggr::MIN }); + Aggr::SUM, Aggr::MAX, Aggr::MEDIAN, + Aggr::MIN }); } } // namespace vespalib::eval diff --git a/eval/src/vespa/eval/eval/aggr.h b/eval/src/vespa/eval/eval/aggr.h index 050287d183c..f52c029eee5 100644 --- a/eval/src/vespa/eval/eval/aggr.h +++ b/eval/src/vespa/eval/eval/aggr.h @@ -7,6 +7,8 @@ #include <limits> #include <vector> #include <map> +#include <algorithm> +#include <cmath> namespace vespalib { @@ -20,7 +22,7 @@ struct BinaryOperation; * Enumeration of all different aggregators that are allowed to be * used in tensor reduce expressions. **/ -enum class Aggr { AVG, COUNT, PROD, SUM, MAX, MIN }; +enum class Aggr { AVG, COUNT, PROD, SUM, MAX, MEDIAN, MIN }; /** * Utiliy class used to map between aggregator enum value and symbolic @@ -120,6 +122,42 @@ public: constexpr T result() const { return _max; } }; +template <typename T> class Median { +private: + std::vector<T> _seen; +public: + constexpr Median() : _seen() {} + constexpr Median(T value) : _seen({value}) {} + constexpr void sample(T value) { _seen.push_back(value); } + constexpr void merge(const Median &rhs) { + for (T value: rhs._seen) { + _seen.push_back(value); + } + }; + constexpr T result() const { + if (_seen.empty()) { + return std::numeric_limits<T>::quiet_NaN(); + } + std::vector<T> tmp; + tmp.reserve(_seen.size()); + for (T value: _seen) { + if (!std::isnan(value)) { + tmp.push_back(value); + } else { + return std::numeric_limits<T>::quiet_NaN(); + } + } + size_t n = (tmp.size() / 2); + std::nth_element(tmp.begin(), tmp.begin() + n, tmp.end()); + T result = tmp[n]; // the nth element + if ((tmp.size() % 2) == 0) { + result += *std::max_element(tmp.begin(), tmp.begin() + n); + result /= T{2}; + } + return 
result; + } +}; + template <typename T> class Min { private: T _min; @@ -137,12 +175,13 @@ struct TypifyAggr { template <template<typename> typename TT> using Result = TypifyResultSimpleTemplate<TT>; template <typename F> static decltype(auto) resolve(Aggr aggr, F &&f) { switch (aggr) { - case Aggr::AVG: return f(Result<aggr::Avg>()); - case Aggr::COUNT: return f(Result<aggr::Count>()); - case Aggr::PROD: return f(Result<aggr::Prod>()); - case Aggr::SUM: return f(Result<aggr::Sum>()); - case Aggr::MAX: return f(Result<aggr::Max>()); - case Aggr::MIN: return f(Result<aggr::Min>()); + case Aggr::AVG: return f(Result<aggr::Avg>()); + case Aggr::COUNT: return f(Result<aggr::Count>()); + case Aggr::PROD: return f(Result<aggr::Prod>()); + case Aggr::SUM: return f(Result<aggr::Sum>()); + case Aggr::MAX: return f(Result<aggr::Max>()); + case Aggr::MEDIAN: return f(Result<aggr::Median>()); + case Aggr::MIN: return f(Result<aggr::Min>()); } abort(); } diff --git a/eval/src/vespa/eval/eval/test/eval_spec.cpp b/eval/src/vespa/eval/eval/test/eval_spec.cpp index b1dfa6d3c9c..6b80b65df6c 100644 --- a/eval/src/vespa/eval/eval/test/eval_spec.cpp +++ b/eval/src/vespa/eval/eval/test/eval_spec.cpp @@ -173,6 +173,7 @@ EvalSpec::add_tensor_operation_cases() { add_rule({"a", -1.0, 1.0}, "reduce(a,prod)", [](double a){ return a; }); add_rule({"a", -1.0, 1.0}, "reduce(a,sum)", [](double a){ return a; }); add_rule({"a", -1.0, 1.0}, "reduce(a,max)", [](double a){ return a; }); + add_rule({"a", -1.0, 1.0}, "reduce(a,median)", [](double a){ return a; }); add_rule({"a", -1.0, 1.0}, "reduce(a,min)", [](double a){ return a; }); add_expression({"a"}, "rename(a,x,y)"); add_expression({"a"}, "rename(a,(x,y),(y,x))"); diff --git a/eval/src/vespa/eval/eval/test/tensor_conformance.cpp b/eval/src/vespa/eval/eval/test/tensor_conformance.cpp index 0e703e81073..701595920ac 100644 --- a/eval/src/vespa/eval/eval/test/tensor_conformance.cpp +++ b/eval/src/vespa/eval/eval/test/tensor_conformance.cpp @@ -358,6 
+358,7 @@ struct TestContext { TEST_DO(test_reduce_op(Aggr::PROD, SigmoidF(N()))); TEST_DO(test_reduce_op(Aggr::SUM, N())); TEST_DO(test_reduce_op(Aggr::MAX, N())); + TEST_DO(test_reduce_op(Aggr::MEDIAN, N())); TEST_DO(test_reduce_op(Aggr::MIN, N())); } diff --git a/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp b/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp index a64d5edbb37..0cecd588317 100644 --- a/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp +++ b/eval/src/vespa/eval/tensor/dense/dense_remove_dimension_optimizer.cpp @@ -15,10 +15,11 @@ using namespace eval::tensor_function; namespace { bool is_ident_aggr(Aggr aggr) { - return ((aggr == Aggr::AVG) || - (aggr == Aggr::PROD) || - (aggr == Aggr::SUM) || - (aggr == Aggr::MAX) || + return ((aggr == Aggr::AVG) || + (aggr == Aggr::PROD) || + (aggr == Aggr::SUM) || + (aggr == Aggr::MAX) || + (aggr == Aggr::MEDIAN) || (aggr == Aggr::MIN)); } |