summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-05-31 12:48:08 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-05-31 13:01:12 +0200
commit 7193310dae6598730c6d27ad2cd4230df35a2445 (patch)
tree 0b5b6724437a5a3db9a2447f27b1d670575184fc /searchlib
parent 996f7da30eed7e604cb89af2af0f6d00fee46be7 (diff)
Add field length calculator, used to calculate average field length.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/CMakeLists.txt1
-rw-r--r--searchlib/src/tests/index/field_length_calculator/CMakeLists.txt9
-rw-r--r--searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp68
-rw-r--r--searchlib/src/vespa/searchlib/index/field_length_calculator.h41
4 files changed, 119 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index bae47872d6c..f032bbe9c30 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -163,6 +163,7 @@ vespa_define_module(
src/tests/hitcollector
src/tests/index/docbuilder
src/tests/index/doctypebuilder
+ src/tests/index/field_length_calculator
src/tests/indexmetainfo
src/tests/ld-library-path
src/tests/memoryindex/compact_words_store
diff --git a/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt
new file mode 100644
index 00000000000..df09d0abaa7
--- /dev/null
+++ b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Test executable built from field_length_calculator_test.cpp, depending on
+# searchlib and gtest.
+vespa_add_executable(searchlib_field_length_calculator_test_app TEST
+ SOURCES
+ field_length_calculator_test.cpp
+ DEPENDS
+ searchlib
+ gtest
+)
+# Register the executable as a test; it is run with no arguments.
+vespa_add_test(NAME searchlib_field_length_calculator_test_app COMMAND searchlib_field_length_calculator_test_app)
diff --git a/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp
new file mode 100644
index 00000000000..f61a7f103fa
--- /dev/null
+++ b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp
@@ -0,0 +1,68 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/index/field_length_calculator.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using search::index::FieldLengthCalculator;
+
+namespace search::index {
+
+namespace {
+
+// Arithmetic average of the arithmetic sequence 1, 2, ..., samples.
+// The addition is done in double so that samples + 1 cannot wrap around in
+// uint32_t when samples is the largest representable value.
+double arith_avg(uint32_t samples) {
+    return (static_cast<double>(samples) + 1.0) / 2.0;
+}
+
+}
+
+// A default constructed calculator reports no samples and a zero average.
+TEST(FieldLengthCalculatorTest, empty_is_zero)
+{
+    FieldLengthCalculator calc;
+    EXPECT_EQ(0.0, calc.get_average_field_length());
+    // Unsigned literal: get_num_samples() returns uint32_t.
+    EXPECT_EQ(0u, calc.get_num_samples());
+}
+
+// With few samples the calculator reports the plain arithmetic mean of
+// everything added so far, and the sample count grows by one per sample.
+TEST(FieldLengthCalculatorTest, startup_is_average)
+{
+    FieldLengthCalculator calc;
+    calc.add_field_length(3);
+    EXPECT_DOUBLE_EQ(3.0, calc.get_average_field_length());
+    EXPECT_EQ(1u, calc.get_num_samples());
+    calc.add_field_length(4);
+    EXPECT_DOUBLE_EQ(3.5, calc.get_average_field_length());
+    EXPECT_EQ(2u, calc.get_num_samples());
+    calc.add_field_length(7);
+    EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_field_length());
+    EXPECT_EQ(3u, calc.get_num_samples());
+    calc.add_field_length(9);
+    EXPECT_DOUBLE_EQ(5.75, calc.get_average_field_length());
+    EXPECT_EQ(4u, calc.get_num_samples());
+}
+
+// The average is the arithmetic mean up to and including the sample that
+// reaches the cap; after that each new sample only replaces
+// 1 / max_num_samples of the previous average and the count stays capped.
+TEST(FieldLengthCalculatorTest, average_until_max_num_samples)
+{
+    FieldLengthCalculator calc;
+    static constexpr double epsilon = 0.000000001; // Allowed difference
+    const uint32_t max_num_samples = calc.get_max_num_samples();
+    // Feed 1, 2, ..., max_num_samples - 1: one sample short of the cap.
+    for (uint32_t i = 0; i + 1 < max_num_samples; ++i) {
+        calc.add_field_length(i + 1);
+    }
+    // Arithmetic average
+    EXPECT_NEAR(arith_avg(max_num_samples - 1), calc.get_average_field_length(), epsilon);
+    EXPECT_EQ(max_num_samples - 1, calc.get_num_samples());
+    calc.add_field_length(max_num_samples);
+    // Arithmetic average
+    EXPECT_NEAR(arith_avg(max_num_samples), calc.get_average_field_length(), epsilon);
+    EXPECT_EQ(max_num_samples, calc.get_num_samples());
+    // Expected value once the cap is reached: the previous average keeps a
+    // weight of (max_num_samples - 1) and the new sample gets a weight of 1.
+    const double decayed_avg = (arith_avg(max_num_samples) * (max_num_samples - 1) + max_num_samples + 1) / max_num_samples;
+    calc.add_field_length(max_num_samples + 1);
+    // No longer arithmetic average
+    EXPECT_LT(arith_avg(max_num_samples + 1), calc.get_average_field_length());
+    // Switched to exponential decay
+    EXPECT_NEAR(decayed_avg, calc.get_average_field_length(), epsilon);
+    EXPECT_EQ(max_num_samples, calc.get_num_samples());
+}
+
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/index/field_length_calculator.h b/searchlib/src/vespa/searchlib/index/field_length_calculator.h
new file mode 100644
index 00000000000..e3323654e20
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/index/field_length_calculator.h
@@ -0,0 +1,41 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+
+namespace search::index {
+
+/**
+ * Tracks the average length of a field over the samples added so far.
+ *
+ * Up to MAX_NUM_SAMPLES samples the result is the plain arithmetic mean.
+ * Once that cap is reached the sample count stops growing, so each new
+ * sample gets a weight of 1 / MAX_NUM_SAMPLES and older samples gradually
+ * fade, biasing the average towards the latest field lengths.
+ *
+ * Threading: both values are stored as relaxed atomics, so the getters may
+ * be called while another thread is adding samples. add_field_length() is a
+ * non-atomic read-modify-write and must only be called by one thread at a
+ * time. The average and the sample count are published independently; a
+ * concurrent reader may observe them from different updates.
+ */
+class FieldLengthCalculator {
+    std::atomic<double>   _average_field_length;
+    std::atomic<uint32_t> _num_samples; // Capped by MAX_NUM_SAMPLES
+    static constexpr uint32_t MAX_NUM_SAMPLES = 100000;
+
+public:
+    /** Creates a calculator with no samples and an average of 0.0. */
+    FieldLengthCalculator()
+        : FieldLengthCalculator(0.0, 0)
+    {
+    }
+
+    /**
+     * Creates a calculator seeded with an existing average.
+     *
+     * @param average_field_length average to start from.
+     * @param num_samples number of samples behind that average; values
+     *        above MAX_NUM_SAMPLES are clamped to MAX_NUM_SAMPLES.
+     */
+    FieldLengthCalculator(double average_field_length, uint32_t num_samples)
+        : _average_field_length(average_field_length),
+          _num_samples(std::min(num_samples, MAX_NUM_SAMPLES))
+    {
+    }
+
+    /** Returns the current average field length. */
+    double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); }
+    /** Returns the number of samples counted so far, at most MAX_NUM_SAMPLES. */
+    uint32_t get_num_samples() const { return _num_samples.load(std::memory_order_relaxed); }
+    /** Returns the cap on the sample count. */
+    static constexpr uint32_t get_max_num_samples() { return MAX_NUM_SAMPLES; }
+
+    /**
+     * Folds one more field length into the average.
+     *
+     * @param field_length length of the newly observed field.
+     */
+    void add_field_length(uint32_t field_length) {
+        uint32_t num_samples = _num_samples.load(std::memory_order_relaxed);
+        if (num_samples < MAX_NUM_SAMPLES) {
+            // Still below the cap: count the sample. At the cap the count is
+            // left alone, which turns the update below into a decaying average.
+            ++num_samples;
+            _num_samples.store(num_samples, std::memory_order_relaxed);
+        }
+        // num_samples >= 1 here, so the division is always well defined.
+        const double old_average = _average_field_length.load(std::memory_order_relaxed);
+        _average_field_length.store((old_average * (num_samples - 1) + field_length) / num_samples, std::memory_order_relaxed);
+    }
+};
+
+}