summaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorBjørn Christian Seime <bjorncs@yahoo-inc.com>2017-05-04 12:34:10 +0000
committerBjørn Christian Seime <bjorncs@yahoo-inc.com>2017-05-05 12:15:34 +0000
commite221caec490e206dee6de86d6987934c4fd03c27 (patch)
tree9a4cc91e7090aed11d9c6bd1730a5117ada1abab /searchlib/src
parentf58f7592d37f3309eac8c4ec87b64575fac5a9d2 (diff)
Add aggregator for calculating the population standard deviation
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/test/files/testAggregatorResultsbin310 -> 364 bytes
-rw-r--r--searchlib/src/tests/aggregator/perdocexpr.cpp105
-rw-r--r--searchlib/src/tests/grouping/grouping_serialization_test.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/aggregation/aggregation.cpp77
-rw-r--r--searchlib/src/vespa/searchlib/aggregation/aggregation.h1
-rw-r--r--searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h37
-rw-r--r--searchlib/src/vespa/searchlib/common/identifiable.h2
-rw-r--r--searchlib/src/vespa/searchlib/expression/resultvector.h14
8 files changed, 224 insertions, 16 deletions
diff --git a/searchlib/src/test/files/testAggregatorResults b/searchlib/src/test/files/testAggregatorResults
index 060b8b86bda..839913ee513 100644
--- a/searchlib/src/test/files/testAggregatorResults
+++ b/searchlib/src/test/files/testAggregatorResults
Binary files differ
diff --git a/searchlib/src/tests/aggregator/perdocexpr.cpp b/searchlib/src/tests/aggregator/perdocexpr.cpp
index b68370334c5..0d023685186 100644
--- a/searchlib/src/tests/aggregator/perdocexpr.cpp
+++ b/searchlib/src/tests/aggregator/perdocexpr.cpp
@@ -46,6 +46,24 @@ void testMin(const ResultNode & a, const ResultNode & b) {
ASSERT_TRUE(funcR.getResult().cmp(a) == 0);
}
+ExpressionNode::UP
+createVectorFloat(const std::vector<double> & v) {
+ std::unique_ptr<FloatResultNodeVector> r = MU<FloatResultNodeVector>();
+ for (double d : v) {
+ r->push_back(FloatResultNode(d));
+ }
+ return MU<ConstantNode>(std::move(r));
+}
+
+ExpressionNode::UP
+createVectorInt(const std::vector<double> & v) {
+ std::unique_ptr<IntegerResultNodeVector> r = MU<IntegerResultNodeVector>();
+ for (double d : v) {
+ r->push_back(Int64ResultNode(static_cast<int64_t>(d)));
+ }
+ return MU<ConstantNode>(std::move(r));
+}
+
TEST("testMin") {
testMin(Int64ResultNode(67), Int64ResultNode(68));
testMin(FloatResultNode(67), FloatResultNode(68));
@@ -155,6 +173,75 @@ TEST("require that expression count estimates rank") {
EXPECT_EQUAL(3, func.getRank().getInteger());
}
+TEST("require that StandardDeviationAggregationResult can be merged") {
+ StandardDeviationAggregationResult aggr1;
+ aggr1.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(8))).
+ aggregate(DocId(42), HitRank(21));
+
+ StandardDeviationAggregationResult aggr2;
+ aggr2.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(10))).
+ aggregate(DocId(43), HitRank(8));
+
+ aggr1.merge(aggr2);
+ EXPECT_EQUAL(2u, aggr1.getCount());
+ EXPECT_EQUAL(18.0, aggr1.getSum());
+ EXPECT_EQUAL(164.0, aggr1.getSumOfSquared());
+}
+
+TEST("require that StandardDeviationAggregationResult can be serialized") {
+ StandardDeviationAggregationResult aggr1;
+ aggr1.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(8))).
+ aggregate(DocId(42), HitRank(21));
+
+ nbostream os;
+ NBOSerializer nos(os);
+ nos << aggr1;
+ Identifiable::UP obj = Identifiable::create(nos);
+ auto *aggr2 = dynamic_cast<StandardDeviationAggregationResult *>(obj.get());
+ ASSERT_TRUE(aggr2);
+ EXPECT_TRUE(os.empty());
+ EXPECT_EQUAL(aggr1.getSumOfSquared(), aggr2->getSumOfSquared());
+ EXPECT_EQUAL(aggr1.getSum(), aggr2->getSum());
+ EXPECT_EQUAL(aggr1.getCount(), aggr2->getCount());
+}
+
+TEST("require that StandardDeviationAggregationResult rank is the standard deviation of aggregated values") {
+ StandardDeviationAggregationResult aggr;
+ aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(101))).
+ aggregate(DocId(1), HitRank(21));
+ aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(13))).
+ aggregate(DocId(2), HitRank(8));
+ aggr.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(15))).
+ aggregate(DocId(3), HitRank(30));
+ EXPECT_APPROX(41.0203, aggr.getRank().getFloat(), 0.01);
+}
+
+TEST("require that StandardDeviationAggregationResult aggregates multiple expressions correctly") {
+ StandardDeviationAggregationResult aggr;
+ aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(1.5))).
+ aggregate(DocId(1), HitRank(21));
+ aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(100.25))).
+ aggregate(DocId(2), HitRank(8));
+ aggr.setExpression(MU<ConstantNode>(MU<FloatResultNode>(30.125))).
+ aggregate(DocId(3), HitRank(40));
+
+ EXPECT_EQUAL(3u, aggr.getCount());
+ EXPECT_APPROX(131.875, aggr.getSum(), 0.01);
+ EXPECT_APPROX(10959.8, aggr.getSumOfSquared(), 0.1);
+ EXPECT_APPROX(41.5, aggr.getRank().getFloat(), 0.1);
+}
+
+TEST("require that StandardDeviationAggregationResult aggregates multi-value expression correctly") {
+ StandardDeviationAggregationResult aggr;
+ aggr.setExpression(createVectorFloat(std::vector<double>({1.5, 100.25, 30.125}))).
+ aggregate(DocId(42), HitRank(21));
+
+ EXPECT_EQUAL(3u, aggr.getCount());
+ EXPECT_APPROX(131.875, aggr.getSum(), 0.01);
+ EXPECT_APPROX(10959.8, aggr.getSumOfSquared(), 0.1);
+ EXPECT_APPROX(41.5, aggr.getRank().getFloat(), 0.1);
+}
+
void testAdd(const ResultNode &a, const ResultNode &b, const ResultNode &c) {
AddFunctionNode func;
func.appendArg(MU<ConstantNode>(ResultNode::UP(a.clone())))
@@ -1006,22 +1093,7 @@ void testModulo(ExpressionNode::UP arg1, ExpressionNode::UP arg2,
testArith(add, std::move(arg1), std::move(arg2), intResult, floatResult);
}
-ExpressionNode::UP
-createVectorInt(const std::vector<double> & v) {
- std::unique_ptr<IntegerResultNodeVector> r = MU<IntegerResultNodeVector>();
- for (double d : v) {
- r->push_back(Int64ResultNode(static_cast<int64_t>(d)));
- }
- return MU<ConstantNode>(std::move(r));
-}
-ExpressionNode::UP
-createVectorFloat(const std::vector<double> & v) {
- std::unique_ptr<FloatResultNodeVector> r = MU<FloatResultNodeVector>();
- for (double d : v) {
- r->push_back(FloatResultNode(d));
- }
- return MU<ConstantNode>(std::move(r));
-}
+
void testArithmeticArguments(NumericFunctionNode &function,
const std::vector<double> & arg1,
@@ -1516,6 +1588,7 @@ TEST("testStreamingAll") {
testStreaming(RawResultNode("Tester RawResultNode streaming", 30));
testStreaming(CountAggregationResult());
testStreaming(ExpressionCountAggregationResult());
+ testStreaming(StandardDeviationAggregationResult());
testStreaming(SumAggregationResult());
testStreaming(MinAggregationResult());
testStreaming(MaxAggregationResult());
diff --git a/searchlib/src/tests/grouping/grouping_serialization_test.cpp b/searchlib/src/tests/grouping/grouping_serialization_test.cpp
index 8592995915c..734c9095d48 100644
--- a/searchlib/src/tests/grouping/grouping_serialization_test.cpp
+++ b/searchlib/src/tests/grouping/grouping_serialization_test.cpp
@@ -229,6 +229,10 @@ TEST_F("testAggregatorResults", Fixture("testAggregatorResults")) {
expression_count.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(67)))
.aggregate(DocId(42), HitRank(21));
f.checkObject(expression_count);
+ StandardDeviationAggregationResult stddev;
+ stddev.setExpression(MU<ConstantNode>(MU<Int64ResultNode>(67)))
+ .aggregate(DocId(42), HitRank(21));
+ f.checkObject(stddev);
}
TEST_F("testHitCollection", Fixture("testHitCollection")) {
diff --git a/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp b/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp
index 1cfc9804390..72b04b9d5e8 100644
--- a/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp
+++ b/searchlib/src/vespa/searchlib/aggregation/aggregation.cpp
@@ -37,6 +37,7 @@ IMPLEMENT_AGGREGATIONRESULT(MinAggregationResult, AggregationResult);
IMPLEMENT_AGGREGATIONRESULT(AverageAggregationResult, AggregationResult);
IMPLEMENT_AGGREGATIONRESULT(XorAggregationResult, AggregationResult);
IMPLEMENT_AGGREGATIONRESULT(ExpressionCountAggregationResult, AggregationResult);
+IMPLEMENT_AGGREGATIONRESULT(StandardDeviationAggregationResult, AggregationResult);
AggregationResult::AggregationResult() :
_expressionTree(new ExpressionTree()),
@@ -501,6 +502,82 @@ Deserializer &ExpressionCountAggregationResult::onDeserialize(
ExpressionCountAggregationResult::ExpressionCountAggregationResult() : AggregationResult(), _hll() { }
ExpressionCountAggregationResult::~ExpressionCountAggregationResult() {}
+const NumericResultNode& StandardDeviationAggregationResult::getStandardDeviation() const noexcept
+{
+ if (_count == 0) {
+ _stdDevScratchPad->set(Int64ResultNode(0));
+ } else {
+ double variance = (_sumOfSquared.getFloat() - _sum.getFloat() * _sum.getFloat() / _count) / _count;
+ double stddev = std::sqrt(variance);
+ _stdDevScratchPad->set(FloatResultNode(stddev));
+ }
+ return *_stdDevScratchPad;
+}
+
+void StandardDeviationAggregationResult::onMerge(const AggregationResult &r) {
+ const StandardDeviationAggregationResult &result =
+ Identifiable::cast<const StandardDeviationAggregationResult &>(r);
+ _count += result._count;
+ _sum.add(result._sum);
+ _sumOfSquared.add(result._sumOfSquared);
+}
+
+void StandardDeviationAggregationResult::onAggregate(const ResultNode &result) {
+ if (result.isMultiValue()) {
+ static_cast<const ResultNodeVector &>(result).flattenSum(_sum);
+ static_cast<const ResultNodeVector &>(result).flattenSumOfSquared(_sumOfSquared);
+ _count += static_cast<const ResultNodeVector &>(result).size();
+ } else {
+ _sum.add(result);
+ FloatResultNode squared(result.getFloat());
+ squared.multiply(result);
+ _sumOfSquared.add(squared);
+ _count++;
+ }
+}
+
+void StandardDeviationAggregationResult::onReset()
+{
+ _count = 0;
+ _sum.set(0.0);
+ _sumOfSquared.set(0.0);
+}
+
+static FieldBase _G_sumOfSquaredField("sumOfSquared");
+
+Serializer & StandardDeviationAggregationResult::onSerialize(Serializer & os) const
+{
+ AggregationResult::onSerialize(os);
+ double sum = _sum.getFloat();
+ double sumOfSquared = _sumOfSquared.getFloat();
+ return os.
+ put(_G_countField, _count).
+ put(_G_sumField, sum).
+ put(_G_sumOfSquaredField, sumOfSquared);
+
+}
+
+Deserializer & StandardDeviationAggregationResult::onDeserialize(Deserializer & is)
+{
+ AggregationResult::onDeserialize(is);
+ double sum;
+ double sumOfSquared;
+ Deserializer & r = is.get(_G_countField, _count)
+ .get(_G_sumField, sum)
+ .get(_G_sumOfSquaredField, sumOfSquared);
+ _sum.set(sum);
+ _sumOfSquared.set(sumOfSquared);
+ return r;
+}
+
+void StandardDeviationAggregationResult::visitMembers(vespalib::ObjectVisitor &visitor) const
+{
+ AggregationResult::visitMembers(visitor);
+ visit(visitor, "count", _count);
+ visit(visitor, "sum", _sum);
+ visit(visitor, "sumOfSquared", _sumOfSquared);
+}
+
} // namespace aggregation
} // namespace search
diff --git a/searchlib/src/vespa/searchlib/aggregation/aggregation.h b/searchlib/src/vespa/searchlib/aggregation/aggregation.h
index e17ee88c113..ad5d48b35b8 100644
--- a/searchlib/src/vespa/searchlib/aggregation/aggregation.h
+++ b/searchlib/src/vespa/searchlib/aggregation/aggregation.h
@@ -11,6 +11,7 @@
#include <vespa/searchlib/aggregation/averageaggregationresult.h>
#include <vespa/searchlib/aggregation/xoraggregationresult.h>
#include <vespa/searchlib/aggregation/hitsaggregationresult.h>
+#include <vespa/searchlib/aggregation/standarddeviationaggregationresult.h>
#include <vespa/searchlib/aggregation/grouping.h>
namespace search {
diff --git a/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h b/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h
new file mode 100644
index 00000000000..d2889232101
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/aggregation/standarddeviationaggregationresult.h
@@ -0,0 +1,37 @@
+// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "aggregationresult.h"
+#include <vespa/searchlib/expression/floatresultnode.h>
+#include <vespa/searchlib/expression/integerresultnode.h>
+
+
+namespace search::aggregation {
+
+// Aggregator that calculates the population standard deviation
+class StandardDeviationAggregationResult : public AggregationResult
+{
+public:
+ DECLARE_AGGREGATIONRESULT(StandardDeviationAggregationResult);
+ StandardDeviationAggregationResult() : AggregationResult(), _count(), _sum(), _sumOfSquared(), _stdDevScratchPad()
+ {
+ _stdDevScratchPad.reset(new expression::FloatResultNode());
+ }
+
+ virtual void visitMembers(vespalib::ObjectVisitor &visitor) const override;
+ const double getSum() const noexcept { return _sum.getFloat(); }
+ const double getSumOfSquared() const noexcept { return _sumOfSquared.getFloat(); }
+ const uint64_t getCount() const noexcept { return _count; }
+private:
+ virtual const ResultNode& onGetRank() const noexcept override { return getStandardDeviation(); }
+ virtual void onPrepare(const ResultNode&, bool) override { };
+ const expression::NumericResultNode& getStandardDeviation() const noexcept;
+
+ uint64_t _count;
+ expression::FloatResultNode _sum;
+ expression::FloatResultNode _sumOfSquared;
+ mutable expression::FloatResultNode::CP _stdDevScratchPad;
+};
+
+}
+
diff --git a/searchlib/src/vespa/searchlib/common/identifiable.h b/searchlib/src/vespa/searchlib/common/identifiable.h
index a2a2dfdb7bb..5ee131ccd56 100644
--- a/searchlib/src/vespa/searchlib/common/identifiable.h
+++ b/searchlib/src/vespa/searchlib/common/identifiable.h
@@ -90,6 +90,8 @@
#define CID_search_aggregation_HitsAggregationResult SEARCHLIB_CID(87)
#define CID_search_aggregation_ExpressionCountAggregationResult \
SEARCHLIB_CID(88)
+#define CID_search_aggregation_StandardDeviationAggregationResult \
+ SEARCHLIB_CID(89)
#define CID_search_aggregation_Group SEARCHLIB_CID(90)
#define CID_search_aggregation_Grouping SEARCHLIB_CID(91)
diff --git a/searchlib/src/vespa/searchlib/expression/resultvector.h b/searchlib/src/vespa/searchlib/expression/resultvector.h
index 9d6b409449e..dabfd9937c7 100644
--- a/searchlib/src/vespa/searchlib/expression/resultvector.h
+++ b/searchlib/src/vespa/searchlib/expression/resultvector.h
@@ -44,6 +44,7 @@ public:
virtual ResultNode & flattenAnd(ResultNode & r) const { return r; }
virtual ResultNode & flattenOr(ResultNode & r) const { return r; }
virtual ResultNode & flattenXor(ResultNode & r) const { return r; }
+ virtual ResultNode & flattenSumOfSquared(ResultNode & r) const { return r; }
virtual void min(const ResultNode & b) { (void) b; }
virtual void max(const ResultNode & b) { (void) b; }
virtual void add(const ResultNode & b) { (void) b; }
@@ -285,6 +286,19 @@ public:
r.set(v);
return r;
}
+ ResultNode & flattenSumOfSquared(ResultNode & r) const override {
+ B v;
+ v.set(r);
+ const std::vector<B> & vec(this->getVector());
+ for(size_t i(0), m(vec.size()); i < m; i++) {
+ B squared;
+ squared.set(vec[i]);
+ squared.multiply(vec[i]);
+ v.add(squared);
+ }
+ r.set(v);
+ return r;
+ }
};