diff options
author | Bjørn Christian Seime <bjorncs@yahoo-inc.com> | 2017-05-04 12:34:10 +0000 |
---|---|---|
committer | Bjørn Christian Seime <bjorncs@yahoo-inc.com> | 2017-05-05 12:15:34 +0000 |
commit | e221caec490e206dee6de86d6987934c4fd03c27 (patch) | |
tree | 9a4cc91e7090aed11d9c6bd1730a5117ada1abab /searchlib | |
parent | f58f7592d37f3309eac8c4ec87b64575fac5a9d2 (diff) |
Add aggregator for calculating the population standard deviation
Diffstat (limited to 'searchlib')
-rw-r--r-- | searchlib/src/test/files/testAggregatorResults | bin | 310 -> 364 bytes | |||
-rw-r--r-- | searchlib/src/tests/aggregator/perdocexpr.cpp | 105 | ||||
-rw-r--r-- | searchlib/src/tests/grouping/grouping_serialization_test.cpp | 4 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/aggregation/aggregation.cpp | 77 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/aggregation/aggregation.h | 1 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h | 37 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/common/identifiable.h | 2 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/expression/resultvector.h | 14 |
8 files changed, 224 insertions, 16 deletions
diff --git a/searchlib/src/test/files/testAggregatorResults b/searchlib/src/test/files/testAggregatorResults Binary files differindex 060b8b86bda..839913ee513 100644 --- a/searchlib/src/test/files/testAggregatorResults +++ b/searchlib/src/test/files/testAggregatorResults diff --git a/searchlib/src/tests/aggregator/perdocexpr.cpp b/searchlib/src/tests/aggregator/perdocexpr.cpp index b68370334c5..0d023685186 100644 --- a/searchlib/src/tests/aggregator/perdocexpr.cpp +++ b/searchlib/src/tests/aggregator/perdocexpr.cpp @@ -46,6 +46,24 @@ void testMin(const ResultNode & a, const ResultNode & b) { ASSERT_TRUE(funcR.getResult().cmp(a) == 0); } +ExpressionNode::UP +createVectorFloat(const std::vector<double> & v) { + std::unique_ptr<FloatResultNodeVector> r = MU<FloatResultNodeVector>(); + for (double d : v) { + r->push_back(FloatResultNode(d)); + } + return MU<ConstantNode>(std::move(r)); +} + +ExpressionNode::UP +createVectorInt(const std::vector<double> & v) { + std::unique_ptr<IntegerResultNodeVector> r = MU<IntegerResultNodeVector>(); + for (double d : v) { + r->push_back(Int64ResultNode(static_cast<int64_t>(d))); + } + return MU<ConstantNode>(std::move(r)); +} + TEST("testMin") { testMin(Int64ResultNode(67), Int64ResultNode(68)); testMin(FloatResultNode(67), FloatResultNode(68)); @@ -155,6 +173,75 @@ TEST("require that expression count estimates rank") { EXPECT_EQUAL(3, func.getRank().getInteger()); } +TEST("require that StandardDeviationAggregationResult can be merged") { + StandardDeviationAggregationResult aggr1; + aggr1.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(8))). + aggregate(DocId(42), HitRank(21)); + + StandardDeviationAggregationResult aggr2; + aggr2.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(10))). + aggregate(DocId(43), HitRank(8)); + + aggr1.merge(aggr2); + EXPECT_EQUAL(2u, aggr1.getCount()); + EXPECT_EQUAL(18.0, aggr1.getSum()); + EXPECT_EQUAL(164.0, aggr1.getSumOfSquared()); +} + +TEST("require that StandardDeviationAggregationResult can be serialized") { + StandardDeviationAggregationResult aggr1; + aggr1.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(8))). + aggregate(DocId(42), HitRank(21)); + + nbostream os; + NBOSerializer nos(os); + nos << aggr1; + Identifiable::UP obj = Identifiable::create(nos); + auto *aggr2 = dynamic_cast<StandardDeviationAggregationResult *>(obj.get()); + ASSERT_TRUE(aggr2); + EXPECT_TRUE(os.empty()); + EXPECT_EQUAL(aggr1.getSumOfSquared(), aggr2->getSumOfSquared()); + EXPECT_EQUAL(aggr1.getSum(), aggr2->getSum()); + EXPECT_EQUAL(aggr1.getCount(), aggr2->getCount()); +} + +TEST("require that StandardDeviationAggregationResult rank is the standard deviation of aggregated values") { + StandardDeviationAggregationResult aggr; + aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(101))). + aggregate(DocId(1), HitRank(21)); + aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(13))). + aggregate(DocId(2), HitRank(8)); + aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(15))). + aggregate(DocId(3), HitRank(30)); + EXPECT_APPROX(41.0203, aggr.getRank().getFloat(), 0.01); +} + +TEST("require that StandardDeviationAggregationResult aggregates multiple expressions correctly") { + StandardDeviationAggregationResult aggr; + aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(1.5))). + aggregate(DocId(1), HitRank(21)); + aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(100.25))). + aggregate(DocId(2), HitRank(8)); + aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(30.125))). + aggregate(DocId(3), HitRank(40)); + + EXPECT_EQUAL(3u, aggr.getCount()); + EXPECT_APPROX(131.875, aggr.getSum(), 0.01); + EXPECT_APPROX(10959.8, aggr.getSumOfSquared(), 0.1); + EXPECT_APPROX(41.5, aggr.getRank().getFloat(), 0.1); +} + +TEST("require that StandardDeviationAggregationResult aggregates multi-value expression correctly") { + StandardDeviationAggregationResult aggr; + aggr.setExpression(createVectorFloat(std::vector<double>({1.5, 100.25, 30.125}))). + aggregate(DocId(42), HitRank(21)); + + EXPECT_EQUAL(3u, aggr.getCount()); + EXPECT_APPROX(131.875, aggr.getSum(), 0.01); + EXPECT_APPROX(10959.8, aggr.getSumOfSquared(), 0.1); + EXPECT_APPROX(41.5, aggr.getRank().getFloat(), 0.1); +} + void testAdd(const ResultNode &a, const ResultNode &b, const ResultNode &c) { AddFunctionNode func; func.appendArg(MU<ConstantNode>(ResultNode::UP(a.clone()))) @@ -1006,22 +1093,7 @@ void testModulo(ExpressionNode::UP arg1, ExpressionNode::UP arg2, testArith(add, std::move(arg1), std::move(arg2), intResult, floatResult); } -ExpressionNode::UP -createVectorInt(const std::vector<double> & v) { - std::unique_ptr<IntegerResultNodeVector> r = MU<IntegerResultNodeVector>(); - for (double d : v) { - r->push_back(Int64ResultNode(static_cast<int64_t>(d))); - } - return MU<ConstantNode>(std::move(r)); -} -ExpressionNode::UP -createVectorFloat(const std::vector<double> & v) { - std::unique_ptr<FloatResultNodeVector> r = MU<FloatResultNodeVector>(); - for (double d : v) { - r->push_back(FloatResultNode(d)); - } - return MU<ConstantNode>(std::move(r)); -} + void testArithmeticArguments(NumericFunctionNode &function, const std::vector<double> & arg1, @@ -1516,6 +1588,7 @@ TEST("testStreamingAll") { testStreaming(RawResultNode("Tester RawResultNode streaming", 30)); testStreaming(CountAggregationResult()); testStreaming(ExpressionCountAggregationResult()); + testStreaming(StandardDeviationAggregationResult()); testStreaming(SumAggregationResult()); testStreaming(MinAggregationResult()); testStreaming(MaxAggregationResult()); diff --git a/searchlib/src/tests/grouping/grouping_serialization_test.cpp b/searchlib/src/tests/grouping/grouping_serialization_test.cpp index 8592995915c..734c9095d48 100644 --- a/searchlib/src/tests/grouping/grouping_serialization_test.cpp +++ b/searchlib/src/tests/grouping/grouping_serialization_test.cpp @@ -229,6 +229,10 @@ TEST_F("testAggregatorResults", Fixture("testAggregatorResults")) { expression_count.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(67))) .aggregate(DocId(42), HitRank(21)); f.checkObject(expression_count); + StandardDeviationAggregationResult stddev; + stddev.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(67))) + .aggregate(DocId(42), HitRank(21)); + f.checkObject(stddev); } TEST_F("testHitCollection", Fixture("testHitCollection")) { diff --git a/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp b/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp index 1cfc9804390..72b04b9d5e8 100644 --- a/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp +++ b/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp @@ -37,6 +37,7 @@ IMPLEMENT_AGGREGATIONRESULT(MinAggregationResult, AggregationResult); IMPLEMENT_AGGREGATIONRESULT(AverageAggregationResult, AggregationResult); IMPLEMENT_AGGREGATIONRESULT(XorAggregationResult, AggregationResult); IMPLEMENT_AGGREGATIONRESULT(ExpressionCountAggregationResult, AggregationResult); +IMPLEMENT_AGGREGATIONRESULT(StandardDeviationAggregationResult, AggregationResult); AggregationResult::AggregationResult() : _expressionTree(new ExpressionTree()), @@ -501,6 +502,82 @@ Deserializer &ExpressionCountAggregationResult::onDeserialize( ExpressionCountAggregationResult::ExpressionCountAggregationResult() : AggregationResult(), _hll() { } ExpressionCountAggregationResult::~ExpressionCountAggregationResult() {} +const NumericResultNode& StandardDeviationAggregationResult::getStandardDeviation() const noexcept +{ + if (_count == 0) { + _stdDevScratchPad->set(Int64ResultNode(0)); + } else { + double variance = (_sumOfSquared.getFloat() - _sum.getFloat() * _sum.getFloat() / _count) / _count; + double stddev = std::sqrt(variance); + _stdDevScratchPad->set(FloatResultNode(stddev)); + } + return *_stdDevScratchPad; +} + +void StandardDeviationAggregationResult::onMerge(const AggregationResult &r) { + const StandardDeviationAggregationResult &result = + Identifiable::cast<const StandardDeviationAggregationResult &>(r); + _count += result._count; + _sum.add(result._sum); + _sumOfSquared.add(result._sumOfSquared); +} + +void StandardDeviationAggregationResult::onAggregate(const ResultNode &result) { + if (result.isMultiValue()) { + static_cast<const ResultNodeVector &>(result).flattenSum(_sum); + static_cast<const ResultNodeVector &>(result).flattenSumOfSquared(_sumOfSquared); + _count += static_cast<const ResultNodeVector &>(result).size(); + } else { + _sum.add(result); + FloatResultNode squared(result.getFloat()); + squared.multiply(result); + _sumOfSquared.add(squared); + _count++; + } +} + +void StandardDeviationAggregationResult::onReset() +{ + _count = 0; + _sum.set(0.0); + _sumOfSquared.set(0.0); +} + +static FieldBase _G_sumOfSquaredField("sumOfSquared"); + +Serializer & StandardDeviationAggregationResult::onSerialize(Serializer & os) const +{ + AggregationResult::onSerialize(os); + double sum = _sum.getFloat(); + double sumOfSquared = _sumOfSquared.getFloat(); + return os. + put(_G_countField, _count). + put(_G_sumField, sum). + put(_G_sumOfSquaredField, sumOfSquared); + +} + +Deserializer & StandardDeviationAggregationResult::onDeserialize(Deserializer & is) +{ + AggregationResult::onDeserialize(is); + double sum; + double sumOfSquared; + Deserializer & r = is.get(_G_countField, _count) + .get(_G_sumField, sum) + .get(_G_sumOfSquaredField, sumOfSquared); + _sum.set(sum); + _sumOfSquared.set(sumOfSquared); + return r; +} + +void StandardDeviationAggregationResult::visitMembers(vespalib::ObjectVisitor &visitor) const +{ + AggregationResult::visitMembers(visitor); + visit(visitor, "count", _count); + visit(visitor, "sum", _sum); + visit(visitor, "sumOfSquared", _sumOfSquared); +} + } // namespace aggregation } // namespace search diff --git a/searchlib/src/vespa/searchlib/aggregation/aggregation.h b/searchlib/src/vespa/searchlib/aggregation/aggregation.h index e17ee88c113..ad5d48b35b8 100644 --- a/searchlib/src/vespa/searchlib/aggregation/aggregation.h +++ b/searchlib/src/vespa/searchlib/aggregation/aggregation.h @@ -11,6 +11,7 @@ #include <vespa/searchlib/aggregation/averageaggregationresult.h> #include <vespa/searchlib/aggregation/xoraggregationresult.h> #include <vespa/searchlib/aggregation/hitsaggregationresult.h> +#include <vespa/searchlib/aggregation/standarddeviationaggregationresult.h> #include <vespa/searchlib/aggregation/grouping.h> namespace search { diff --git a/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h b/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h new file mode 100644 index 00000000000..d2889232101 --- /dev/null +++ b/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h @@ -0,0 +1,37 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "aggregationresult.h" +#include <vespa/searchlib/expression/floatresultnode.h> +#include <vespa/searchlib/expression/integerresultnode.h> + + +namespace search::aggregation { + +// Aggregator that calculates the population standard deviation +class StandardDeviationAggregationResult : public AggregationResult +{ +public: + DECLARE_AGGREGATIONRESULT(StandardDeviationAggregationResult); + StandardDeviationAggregationResult() : AggregationResult(), _count(), _sum(), _sumOfSquared(), _stdDevScratchPad() + { + _stdDevScratchPad.reset(new expression::FloatResultNode()); + } + + virtual void visitMembers(vespalib::ObjectVisitor &visitor) const override; + const double getSum() const noexcept { return _sum.getFloat(); } + const double getSumOfSquared() const noexcept { return _sumOfSquared.getFloat(); } + const uint64_t getCount() const noexcept { return _count; } +private: + virtual const ResultNode& onGetRank() const noexcept override { return getStandardDeviation(); } + virtual void onPrepare(const ResultNode&, bool) override { }; + const expression::NumericResultNode& getStandardDeviation() const noexcept; + + uint64_t _count; + expression::FloatResultNode _sum; + expression::FloatResultNode _sumOfSquared; + mutable expression::FloatResultNode::CP _stdDevScratchPad; +}; + +} + diff --git a/searchlib/src/vespa/searchlib/common/identifiable.h b/searchlib/src/vespa/searchlib/common/identifiable.h index a2a2dfdb7bb..5ee131ccd56 100644 --- a/searchlib/src/vespa/searchlib/common/identifiable.h +++ b/searchlib/src/vespa/searchlib/common/identifiable.h @@ -90,6 +90,8 @@ #define CID_search_aggregation_HitsAggregationResult SEARCHLIB_CID(87) #define CID_search_aggregation_ExpressionCountAggregationResult \ SEARCHLIB_CID(88) +#define CID_search_aggregation_StandardDeviationAggregationResult \ + SEARCHLIB_CID(89) #define CID_search_aggregation_Group SEARCHLIB_CID(90) #define CID_search_aggregation_Grouping SEARCHLIB_CID(91) diff --git a/searchlib/src/vespa/searchlib/expression/resultvector.h b/searchlib/src/vespa/searchlib/expression/resultvector.h index 9d6b409449e..dabfd9937c7 100644 --- a/searchlib/src/vespa/searchlib/expression/resultvector.h +++ b/searchlib/src/vespa/searchlib/expression/resultvector.h @@ -44,6 +44,7 @@ public: virtual ResultNode & flattenAnd(ResultNode & r) const { return r; } virtual ResultNode & flattenOr(ResultNode & r) const { return r; } virtual ResultNode & flattenXor(ResultNode & r) const { return r; } + virtual ResultNode & flattenSumOfSquared(ResultNode & r) const { return r; } virtual void min(const ResultNode & b) { (void) b; } virtual void max(const ResultNode & b) { (void) b; } virtual void add(const ResultNode & b) { (void) b; } @@ -285,6 +286,19 @@ public: r.set(v); return r; } + ResultNode & flattenSumOfSquared(ResultNode & r) const override { + B v; + v.set(r); + const std::vector<B> & vec(this->getVector()); + for(size_t i(0), m(vec.size()); i < m; i++) { + B squared; + squared.set(vec[i]); + squared.multiply(vec[i]); + v.add(squared); + } + r.set(v); + return r; + } }; |