diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-05-31 16:06:08 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-31 16:06:08 +0200 |
commit | fdb31c2917af662bad4ab7a92e59698df72998ba (patch) | |
tree | 17abd28fea5ba3125f3b68cb43baee223fb7828d | |
parent | a77e3458e427cff87b4b594a7ca927427b906cea (diff) | |
parent | f7b0a4a774f0e0e8ad58862e729c91609803fc09 (diff) |
Merge pull request #9626 from vespa-engine/toregge/add-field-length-calculator
Add field length calculator, used to calculate average field length.
4 files changed, 120 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index bae47872d6c..f032bbe9c30 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -163,6 +163,7 @@ vespa_define_module( src/tests/hitcollector src/tests/index/docbuilder src/tests/index/doctypebuilder + src/tests/index/field_length_calculator src/tests/indexmetainfo src/tests/ld-library-path src/tests/memoryindex/compact_words_store diff --git a/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt new file mode 100644 index 00000000000..df09d0abaa7 --- /dev/null +++ b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_field_length_calculator_test_app TEST + SOURCES + field_length_calculator_test.cpp + DEPENDS + searchlib + gtest +) +vespa_add_test(NAME searchlib_field_length_calculator_test_app COMMAND searchlib_field_length_calculator_test_app) diff --git a/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp new file mode 100644 index 00000000000..c99d241cbc0 --- /dev/null +++ b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp @@ -0,0 +1,68 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/index/field_length_calculator.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::index::FieldLengthCalculator; + +namespace search::index { + +namespace { + +// Arithmetic average of arithmetic sequence 1, 2, ... 
, samples +double arith_avg(uint32_t samples) { + return static_cast<double>(samples + 1) / 2; +} + +} + +TEST(FieldLengthCalculatorTest, empty_is_zero) +{ + FieldLengthCalculator calc; + EXPECT_EQ(0.0, calc.get_average_field_length()); + EXPECT_EQ(0, calc.get_num_samples()); +} + +TEST(FieldLengthCalculatorTest, startup_is_average) +{ + FieldLengthCalculator calc; + calc.add_field_length(3); + EXPECT_DOUBLE_EQ(3.0, calc.get_average_field_length()); + EXPECT_EQ(1, calc.get_num_samples()); + calc.add_field_length(4); + EXPECT_DOUBLE_EQ(3.5, calc.get_average_field_length()); + EXPECT_EQ(2, calc.get_num_samples()); + calc.add_field_length(7); + EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_field_length()); + EXPECT_EQ(3, calc.get_num_samples()); + calc.add_field_length(9); + EXPECT_DOUBLE_EQ((3 + 4 + 7 + 9)/4.0, calc.get_average_field_length()); + EXPECT_EQ(4, calc.get_num_samples()); +} + +TEST(FieldLengthCalculatorTest, average_until_max_num_samples) +{ + const uint32_t max_num_samples = 5; + FieldLengthCalculator calc(0.0, 0, max_num_samples); + static constexpr double epsilon = 0.000000001; // Allowed difference + for (uint32_t i = 0; i + 1 < max_num_samples; ++i) { + calc.add_field_length(i + 1); + } + // Arithmetic average + EXPECT_NEAR(arith_avg(max_num_samples - 1), calc.get_average_field_length(), epsilon); + EXPECT_EQ(max_num_samples - 1, calc.get_num_samples()); + calc.add_field_length(max_num_samples); + // Arithmetic average + EXPECT_NEAR(arith_avg(max_num_samples), calc.get_average_field_length(), epsilon); + EXPECT_EQ(max_num_samples, calc.get_num_samples()); + calc.add_field_length(max_num_samples + 1); + // No longer arithmetic average + EXPECT_LT(arith_avg(max_num_samples + 1), calc.get_average_field_length()); + // Switched to exponential decay + EXPECT_NEAR((arith_avg(max_num_samples) * (max_num_samples - 1) + max_num_samples + 1) / max_num_samples, calc.get_average_field_length(), epsilon); + EXPECT_EQ(max_num_samples, 
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#ifndef SEARCHLIB_INDEX_FIELD_LENGTH_CALCULATOR_H
#define SEARCHLIB_INDEX_FIELD_LENGTH_CALCULATOR_H

#include <algorithm>
#include <atomic>
#include <cstdint>

namespace search::index {

/**
 * Class used to calculate average field length, with a bias towards
 * the latest field lengths when max_num_samples samples have been reached.
 *
 * While fewer than max_num_samples samples have been added, the average is
 * the plain arithmetic mean of all samples. After that the sample count
 * stops growing, so every new field length is blended in with weight
 * 1 / max_num_samples and older samples gradually lose influence.
 *
 * The average is kept in a std::atomic<double> and is read and written with
 * relaxed memory ordering.
 * NOTE(review): _num_samples is a plain integer; if get_num_samples() is
 * meant to be called concurrently with add_field_length(), confirm whether
 * it needs to be atomic as well.
 */
class FieldLengthCalculator {
    std::atomic<double> _average_field_length;
    uint32_t _num_samples; // Capped by _max_num_samples
    uint32_t _max_num_samples;

public:
    /** Creates a calculator with no samples and the default sample cap. */
    FieldLengthCalculator()
        : FieldLengthCalculator(0.0, 0)
    {
    }

    /**
     * Creates a calculator that continues from an existing average.
     *
     * @param average_field_length  average to start from.
     * @param num_samples           number of samples behind that average;
     *                              values above max_num_samples are capped.
     * @param max_num_samples       sample count at which the calculator
     *                              switches from arithmetic mean to decay.
     */
    FieldLengthCalculator(double average_field_length, uint32_t num_samples, uint32_t max_num_samples = 100000)
        : _average_field_length(average_field_length),
          _num_samples(std::min(num_samples, max_num_samples)),
          _max_num_samples(max_num_samples)
    {
    }

    /** Returns the current average field length. */
    double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); }

    /** Returns the number of samples counted so far (never above the cap). */
    uint32_t get_num_samples() const { return _num_samples; }

    /** Returns the sample cap given at construction. */
    uint32_t get_max_num_samples() const { return _max_num_samples; }

    /**
     * Folds one more field length into the average.
     *
     * @param field_length  length of the field in the newly added sample.
     */
    void add_field_length(uint32_t field_length) {
        if (_num_samples < _max_num_samples) {
            ++_num_samples;
        }
        if (_num_samples == 0) {
            // Only reachable when max_num_samples is 0: no sample can ever be
            // counted, so keep the current average instead of dividing by zero.
            return;
        }
        const double previous = _average_field_length.load(std::memory_order_relaxed);
        // The previous average stands in for (_num_samples - 1) samples and
        // the new length is the remaining one.
        const double updated = (previous * (_num_samples - 1) + field_length) / _num_samples;
        _average_field_length.store(updated, std::memory_order_relaxed);
    }
};

}

#endif // SEARCHLIB_INDEX_FIELD_LENGTH_CALCULATOR_H