diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-05-31 12:48:08 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-05-31 13:01:12 +0200 |
commit | 7193310dae6598730c6d27ad2cd4230df35a2445 (patch) | |
tree | 0b5b6724437a5a3db9a2447f27b1d670575184fc /searchlib | |
parent | 996f7da30eed7e604cb89af2af0f6d00fee46be7 (diff) |
Add field length calculator, used to calculate average field length.
Diffstat (limited to 'searchlib')
4 files changed, 119 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index bae47872d6c..f032bbe9c30 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -163,6 +163,7 @@ vespa_define_module(
     src/tests/hitcollector
     src/tests/index/docbuilder
     src/tests/index/doctypebuilder
+    src/tests/index/field_length_calculator
     src/tests/indexmetainfo
     src/tests/ld-library-path
     src/tests/memoryindex/compact_words_store
diff --git a/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt
new file mode 100644
index 00000000000..df09d0abaa7
--- /dev/null
+++ b/searchlib/src/tests/index/field_length_calculator/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_field_length_calculator_test_app TEST
+    SOURCES
+    field_length_calculator_test.cpp
+    DEPENDS
+    searchlib
+    gtest
+)
+vespa_add_test(NAME searchlib_field_length_calculator_test_app COMMAND searchlib_field_length_calculator_test_app)
diff --git a/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp
new file mode 100644
index 00000000000..f61a7f103fa
--- /dev/null
+++ b/searchlib/src/tests/index/field_length_calculator/field_length_calculator_test.cpp
@@ -0,0 +1,68 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/index/field_length_calculator.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using search::index::FieldLengthCalculator;
+
+namespace search::index {
+
+namespace {
+
+// Arithmetic average of arithmetic sequence 1, 2, ... , samples
+double arith_avg(uint32_t samples) {
+    return static_cast<double>(samples + 1) / 2;
+}
+
+}
+
+TEST(FieldLengthCalculatorTest, empty_is_zero)
+{
+    FieldLengthCalculator calc;
+    EXPECT_EQ(0.0, calc.get_average_field_length());
+    EXPECT_EQ(0, calc.get_num_samples());
+}
+
+TEST(FieldLengthCalculatorTest, startup_is_average)
+{
+    FieldLengthCalculator calc;
+    calc.add_field_length(3);
+    EXPECT_DOUBLE_EQ(3.0, calc.get_average_field_length());
+    EXPECT_EQ(1, calc.get_num_samples());
+    calc.add_field_length(4);
+    EXPECT_DOUBLE_EQ(3.5, calc.get_average_field_length());
+    EXPECT_EQ(2, calc.get_num_samples());
+    calc.add_field_length(7);
+    EXPECT_DOUBLE_EQ((3 + 4 + 7)/3.0, calc.get_average_field_length());
+    EXPECT_EQ(3, calc.get_num_samples());
+    calc.add_field_length(9);
+    EXPECT_DOUBLE_EQ(5.75, calc.get_average_field_length());
+    EXPECT_EQ(4, calc.get_num_samples());
+}
+
+TEST(FieldLengthCalculatorTest, average_until_max_num_samples)
+{
+    FieldLengthCalculator calc;
+    static constexpr double epsilon = 0.000000001; // Allowed difference
+    const uint32_t max_num_samples = calc.get_max_num_samples();
+    for (uint32_t i = 0; i + 1 < max_num_samples; ++i) {
+        calc.add_field_length(i + 1);
+    }
+    // Arithmetic average
+    EXPECT_NEAR(arith_avg(max_num_samples - 1), calc.get_average_field_length(), epsilon);
+    EXPECT_EQ(max_num_samples - 1, calc.get_num_samples());
+    calc.add_field_length(max_num_samples);
+    // Arithmetic average
+    EXPECT_NEAR(arith_avg(max_num_samples), calc.get_average_field_length(), epsilon);
+    EXPECT_EQ(max_num_samples, calc.get_num_samples());
+    calc.add_field_length(max_num_samples + 1);
+    // No longer arithmetic average
+    EXPECT_LT(arith_avg(max_num_samples + 1), calc.get_average_field_length());
+    // Switched to exponential decay
+    EXPECT_NEAR((arith_avg(max_num_samples) * (max_num_samples - 1) + max_num_samples + 1) / max_num_samples, calc.get_average_field_length(), 0.000000001);
+    EXPECT_EQ(max_num_samples, calc.get_num_samples());
+}
+
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/index/field_length_calculator.h b/searchlib/src/vespa/searchlib/index/field_length_calculator.h
new file mode 100644
index 00000000000..e3323654e20
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/index/field_length_calculator.h
@@ -0,0 +1,41 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <atomic>
+#include <algorithm>
+
+namespace search::index {
+
+/**
+ * Class used to calculate average field length, with a bias towards
+ * the latest field lengths when MAX_NUM_SAMPLES samples have been reached.
+ */
+class FieldLengthCalculator {
+    std::atomic<double> _average_field_length;
+    uint32_t _num_samples; // Capped by _max_num_samples
+    static constexpr uint32_t MAX_NUM_SAMPLES = 100000;
+
+public:
+    FieldLengthCalculator()
+        : FieldLengthCalculator(0.0, 0)
+    {
+    }
+
+    FieldLengthCalculator(double average_field_length, uint32_t num_samples)
+        : _average_field_length(average_field_length),
+          _num_samples(std::min(num_samples, MAX_NUM_SAMPLES))
+    {
+    }
+
+    double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); }
+    uint32_t get_num_samples() const { return _num_samples; }
+    static constexpr uint32_t get_max_num_samples() { return MAX_NUM_SAMPLES; }
+
+    void add_field_length(uint32_t field_length) {
+        if (_num_samples < MAX_NUM_SAMPLES) {
+            ++_num_samples;
+        }
+        _average_field_length.store((_average_field_length.load(std::memory_order_relaxed) * (_num_samples - 1) + field_length) / _num_samples, std::memory_order_relaxed);
+    }
+};
+
+}