diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-07-04 10:39:18 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-07-04 10:39:18 +0200 |
commit | 11776b1e672b4261b5e9701e7dc96c9811031799 (patch) | |
tree | 2977440463d7d4bd822763021709502496b441f6 /searchlib | |
parent | 4ccf8b0182224a61aa0d71de89ba71e1066ebc9d (diff) |
Add sort blob writers.
Diffstat (limited to 'searchlib')
9 files changed, 582 insertions, 0 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index eb199a7ba44..e885f9bfba7 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -97,6 +97,7 @@ vespa_define_module( src/tests/attribute/searchable src/tests/attribute/searchcontext src/tests/attribute/searchcontextelementiterator + src/tests/attribute/sort_blob_writers src/tests/attribute/sourceselector src/tests/attribute/stringattribute src/tests/attribute/tensorattribute diff --git a/searchlib/src/tests/attribute/sort_blob_writers/CMakeLists.txt b/searchlib/src/tests/attribute/sort_blob_writers/CMakeLists.txt new file mode 100644 index 00000000000..5aad3c55cb7 --- /dev/null +++ b/searchlib/src/tests/attribute/sort_blob_writers/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_sort_blob_writers_test_app TEST + SOURCES + sort_blob_writers_test.cpp + DEPENDS + searchlib + GTest::GTest +) +vespa_add_test(NAME searchlib_sort_blob_writers_test_app COMMAND searchlib_sort_blob_writers_test_app) diff --git a/searchlib/src/tests/attribute/sort_blob_writers/sort_blob_writers_test.cpp b/searchlib/src/tests/attribute/sort_blob_writers/sort_blob_writers_test.cpp new file mode 100644 index 00000000000..50df88b8c74 --- /dev/null +++ b/searchlib/src/tests/attribute/sort_blob_writers/sort_blob_writers_test.cpp @@ -0,0 +1,344 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/attribute/numeric_sort_blob_writer.h> +#include <vespa/searchlib/attribute/string_sort_blob_writer.h> +#include <vespa/searchlib/common/converters.h> +#include <vespa/fastlib/text/normwordfolder.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/util/arrayref.h> +#include <vespa/vespalib/util/sort.h> + +using search::attribute::NumericSortBlobWriter; +using search::attribute::SortBlobWriter; +using search::attribute::StringSortBlobWriter; +using search::common::BlobConverter; +using search::common::LowercaseConverter; +using vespalib::ConstArrayRef; + +namespace { + +using SortData = std::vector<unsigned char>; + +SortData missing_value{1}; + +template <typename T, bool asc> +SortData +serialized_present_numeric(T value) +{ + SortData s; + s.resize(1 + sizeof(T)); + s[0] = SortBlobWriter::has_value; + auto ret = vespalib::serializeForSort<vespalib::convertForSort<T, asc>>(value, s.data() + 1, s.size() - 1); + assert(size_t(ret) == s.size() - 1); + return s; +} + +SortData +serialized_present_string(const char *value, bool asc) +{ + ConstArrayRef<unsigned char> src(reinterpret_cast<const unsigned char*>(value), strlen(value) + 1); + SortData s; + s.reserve(src.size() + 1); + s.emplace_back(SortBlobWriter::has_value); + unsigned char xor_value = asc ? 0 : 255; + for (auto c : src) { + s.emplace_back(c ^ xor_value); + } + return s; +} + +template <typename T> +SortData +serialized_present(T value, bool asc) +{ + if constexpr (std::is_same_v<T, const char*>) { + return serialized_present_string(value, asc); + } else { + if (asc) { + return serialized_present_numeric<T, true>(value); + } + return serialized_present_numeric<T, false>(value); + } +} + +template <typename T, bool asc> +SortData +sort_data_numeric(std::vector<T> values) +{ + size_t len = 0; + SortData s; + while (true) { + s.clear(); + s.resize(len); + NumericSortBlobWriter<T, asc> writer; + for (auto& v : values) { + writer.candidate(v); + } + auto result = writer.write(s.data(), s.size()); + if (result >= 0) { + s.resize(result); + return s; + } + ++len; + } +} + +SortData +sort_data_string(std::vector<const char*> values, const BlobConverter* bc, bool asc) +{ + size_t len = 0; + SortData s; + while (true) { + s.clear(); + s.resize(len); + StringSortBlobWriter writer(s.data(), s.size(), bc, asc); + bool fail = false; + for (auto& v : values) { + if (!writer.candidate(v)) { + fail = true; + break; + } + } + if (!fail) { + auto result = writer.write(); + if (result >= 0) { + s.resize(result); + return s; + } + } + ++len; + } +} + +SortData +sort_data_string(std::vector<const char*> values, bool asc) +{ + return sort_data_string(values, nullptr, asc); +} + +template <typename T> +SortData +sort_data(std::vector<T> values, bool asc) +{ + if constexpr (std::is_same_v<T, const char*>) { + return sort_data_string(values, asc); + } + if (asc) { + return sort_data_numeric<T, true>(std::move(values)); + } + return sort_data_numeric<T, false>(std::move(values)); +} + +SortData +switch_sort_order(SortData value) +{ + assert(value.size() >= 1); + ConstArrayRef<unsigned char> src(value.data() + 1, value.size() - 1); + SortData s; + s.reserve(src.size() + 1); + s.emplace_back(value[0]); + for (auto c : src) { + s.emplace_back(c ^ 255); + } + return s; +} + +struct SetupHook { + SetupHook(); +}; + +SetupHook::SetupHook() +{ + Fast_NormalizeWordFolder::Setup(Fast_NormalizeWordFolder::DO_ACCENT_REMOVAL | + Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION | + Fast_NormalizeWordFolder::DO_LIGATURE_SUBSTITUTION | + Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION); +} + +SetupHook setup_hook; + +} + +template <typename Param> +class SortBlobWritersTest : public ::testing::Test +{ +protected: + SortBlobWritersTest(); + ~SortBlobWritersTest() override; +}; + + +template <typename Param> +SortBlobWritersTest<Param>::SortBlobWritersTest() + : ::testing::Test() +{ +} + +template <typename Param> +SortBlobWritersTest<Param>::~SortBlobWritersTest() = default; + +struct Int8Params { + using Type = int8_t; + static constexpr int8_t value = 42; + static SortData sort_asc; + static SortData sort_desc; + static std::vector<int8_t> values; + static constexpr int8_t min_value = -4; + static constexpr int8_t max_value = 9; +}; + +SortData Int8Params::sort_asc{0, 128 ^ 42}; +SortData Int8Params::sort_desc{0, 127 ^ 42}; +std::vector<int8_t> Int8Params::values{5, 7, -4, 9}; + +struct Int16Params { + using Type = int16_t; + static constexpr int16_t value = 43; + static SortData sort_asc; + static SortData sort_desc; + static std::vector<int16_t> values; + static constexpr int16_t min_value = -4; + static constexpr int16_t max_value = 9; +}; + +SortData Int16Params::sort_asc{0, 128, 43}; +SortData Int16Params::sort_desc{0, 127, 255 ^ 43}; +std::vector<int16_t> Int16Params::values{5, 7, -4, 9}; + +struct Int32Params { + using Type = int32_t; + static constexpr int16_t value = 44; + static SortData sort_asc; + static SortData sort_desc; + static std::vector<int32_t> values; + static constexpr int32_t min_value = -4; + static constexpr int32_t max_value = 9; +}; + +SortData Int32Params::sort_asc{0, 128, 0, 0, 44}; +SortData Int32Params::sort_desc{0, 127, 255, 255, 255 ^ 44}; +std::vector<int32_t> Int32Params::values{5, 7, -4, 9}; + +struct Int64Params { + using Type = int64_t; + static constexpr int16_t value = 45; + static SortData sort_asc; + static SortData sort_desc; + static std::vector<int64_t> values; + static constexpr int64_t min_value = -4; + static constexpr int64_t max_value = 9; +}; + +SortData Int64Params::sort_asc{0, 128, 0, 0, 0, 0, 0, 0, 45}; +SortData Int64Params::sort_desc{0, 127, 255, 255, 255, 255, 255, 255, 255 ^ 45}; +std::vector<int64_t> Int64Params::values{5, 7, -4, 9}; + +struct FloatParams { + using Type = float; + static constexpr int16_t value = 46; + static std::vector<float> values; + static std::vector<float> values_with_nan; + static std::vector<float> values_only_nan; + static constexpr float min_value = -4; + static constexpr float max_value = 9; +}; + +std::vector<float> FloatParams::values{5, 7, -4, 9}; +std::vector<float> FloatParams::values_with_nan{5, 7, std::numeric_limits<float>::quiet_NaN(), -4, 9}; +std::vector<float> FloatParams::values_only_nan{std::numeric_limits<float>::quiet_NaN()}; + +struct DoubleParams { + using Type = double; + static constexpr int16_t value = 47; + static std::vector<double> values; + static std::vector<double> values_with_nan; + static std::vector<double> values_only_nan; + static constexpr double min_value = -4; + static constexpr double max_value = 9; +}; + +std::vector<double> DoubleParams::values{5, 7, -4, 9}; +std::vector<double> DoubleParams::values_with_nan{5, 7, std::numeric_limits<double>::quiet_NaN(), -4, 9}; +std::vector<double> DoubleParams::values_only_nan{std::numeric_limits<double>::quiet_NaN()}; + +struct StringParams { + using Type = const char*; + static constexpr const char* value = "Hello"; + static std::vector<const char*> values; + static constexpr const char* min_value = "always"; + static constexpr const char* max_value = "this"; +}; + +std::vector<const char*> StringParams::values{"this", "always", "happens"}; + +using SortBlobWritersTestTypes = testing::Types<Int8Params, Int16Params, Int32Params, Int64Params, FloatParams, DoubleParams, StringParams>; + +TYPED_TEST_SUITE(SortBlobWritersTest, SortBlobWritersTestTypes); + +TYPED_TEST(SortBlobWritersTest, empty_arrays) +{ + using Type = typename TypeParam::Type; + EXPECT_EQ(missing_value, sort_data<Type>({}, true)); + EXPECT_EQ(missing_value, sort_data<Type>({}, false)); +} + +TYPED_TEST(SortBlobWritersTest, single_values) +{ + using Type = typename TypeParam::Type; + auto& value = TypeParam::value; + EXPECT_EQ(serialized_present<Type>(value, true), sort_data<Type>({value}, true)); + EXPECT_EQ(serialized_present<Type>(value, false), sort_data<Type>({value}, false)); + if constexpr (std::is_integral_v<Type>) { + EXPECT_EQ(TypeParam::sort_asc, sort_data<Type>({value}, true)); + EXPECT_EQ(TypeParam::sort_desc, sort_data<Type>({value}, false)); + } + EXPECT_EQ(switch_sort_order(sort_data<Type>({value}, false)), sort_data<Type>({value}, true)); + EXPECT_EQ(switch_sort_order(sort_data<Type>({value}, true)), sort_data<Type>({value}, false)); + EXPECT_GT(missing_value, sort_data<Type>({value}, true)); + EXPECT_GT(missing_value, sort_data<Type>({value}, false)); +} + +TYPED_TEST(SortBlobWritersTest, multiple_values) +{ + using Type = typename TypeParam::Type; + auto& values = TypeParam::values; + EXPECT_EQ(serialized_present<Type>(TypeParam::min_value, true), sort_data<Type>(values, true)); + EXPECT_EQ(serialized_present<Type>(TypeParam::max_value, false), sort_data<Type>(values, false)); +} + +template <typename Param> +using SortBlobFloatingPointWritersTest = SortBlobWritersTest<Param>; + +using SortBlobFloatingPointWritersTestTypes = testing::Types<FloatParams, DoubleParams>; + +TYPED_TEST_SUITE(SortBlobFloatingPointWritersTest, SortBlobFloatingPointWritersTestTypes); + +TYPED_TEST(SortBlobFloatingPointWritersTest, skip_nan_values) +{ + using Type = typename TypeParam::Type; + auto& values_only_nan = TypeParam::values_only_nan; + auto& values_with_nan = TypeParam::values_with_nan; + EXPECT_EQ(missing_value, sort_data<Type>(values_only_nan, true)); + EXPECT_EQ(missing_value, sort_data<Type>(values_only_nan, false)); + EXPECT_EQ(serialized_present<Type>(TypeParam::min_value, true), sort_data<Type>(values_with_nan, true)); + EXPECT_EQ(serialized_present<Type>(TypeParam::max_value, false), sort_data<Type>(values_with_nan, false)); +} + +using SortBlobStringWriterTest = SortBlobWritersTest<const char*>; + +TEST_F(SortBlobStringWriterTest, blob_converter_is_used) +{ + LowercaseConverter lowercase; + EXPECT_EQ(serialized_present_string("hello", true), sort_data_string({"Hello"}, &lowercase, true)); + EXPECT_EQ(serialized_present_string("hello", false), sort_data_string({"Hello"}, &lowercase, false)); + EXPECT_EQ(serialized_present_string("always", true), sort_data_string({"Hello", "always"}, &lowercase, true)); + EXPECT_EQ(serialized_present_string("hello", false), sort_data_string({"Hello", "always"}, &lowercase, false)); +} + +TEST_F(SortBlobStringWriterTest, prefix_is_first) +{ + EXPECT_EQ(serialized_present_string("aaa", true), sort_data_string({"aaa", "aaaa"}, true)); + EXPECT_EQ(serialized_present_string("aaaa", false), sort_data_string({"aaa", "aaaa"}, false)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt index 896226f005d..c54169ffd11 100644 --- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt @@ -100,6 +100,7 @@ vespa_add_library(searchlib_attribute OBJECT numeric_matcher.cpp numeric_range_matcher.cpp numeric_search_context.cpp + numeric_sort_blob_writer.cpp posting_list_merger.cpp postingchange.cpp postinglistattribute.cpp @@ -143,6 +144,7 @@ vespa_add_library(searchlib_attribute OBJECT string_matcher.cpp string_search_context.cpp string_search_helper.cpp + string_sort_blob_writer.cpp valuemodifier.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.cpp b/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.cpp new file mode 100644 index 00000000000..0c2aacc4ef1 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.cpp @@ -0,0 +1,77 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "numeric_sort_blob_writer.h" +#include <vespa/vespalib/util/sort.h> +#include <cmath> +#include <type_traits> + +namespace search::attribute { + +template <typename T, bool asc> +NumericSortBlobWriter<T, asc>::NumericSortBlobWriter() noexcept +: _best() +{ +} + +template <typename T, bool asc> +NumericSortBlobWriter<T, asc>::~NumericSortBlobWriter() noexcept = default; + +template <typename T, bool asc> +void +NumericSortBlobWriter<T, asc>::candidate(T val) +{ + if constexpr (std::disjunction_v<std::is_same<T,float>,std::is_same<T, double>>) { + if (std::isnan(val)) { + return; + } + } + if (_best.has_value()) { + if constexpr (asc) { + if (_best.value() <= val) { + return; + } + } else { + if (_best.value() >= val) { + return; + } + } + } + _best = val; +} + +template <typename T, bool asc> +long +NumericSortBlobWriter<T, asc>::write(void *serTo, size_t available) +{ + auto dst = static_cast<unsigned char*>(serTo); + if (_best.has_value()) { + if (available < 1 + sizeof(T)) { + return -1; + } + *dst = has_value; + auto ret = vespalib::serializeForSort<vespalib::convertForSort<T, asc>>(_best.value(), dst + 1, available - 1); + return (ret >= 0) ? (ret + 1) : -1; + } else { + if (available < 1) { + return -1; + } + *dst = missing_value; + return 1; + } +} + +template class NumericSortBlobWriter<int8_t, true>; +template class NumericSortBlobWriter<int16_t, true>; +template class NumericSortBlobWriter<int32_t, true>; +template class NumericSortBlobWriter<int64_t, true>; +template class NumericSortBlobWriter<float, true>; +template class NumericSortBlobWriter<double, true>; + +template class NumericSortBlobWriter<int8_t, false>; +template class NumericSortBlobWriter<int16_t, false>; +template class NumericSortBlobWriter<int32_t, false>; +template class NumericSortBlobWriter<int64_t, false>; +template class NumericSortBlobWriter<float, false>; +template class NumericSortBlobWriter<double, false>; + +} diff --git a/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.h b/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.h new file mode 100644 index 00000000000..b2b6d578628 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/numeric_sort_blob_writer.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "sort_blob_writer.h" +#include <optional> + +namespace search::attribute { + +/* + * Class for writing numeric sort blobs for arrays and + * weighted sets of type T with ascending or descending + * sort order. + */ +template <typename T, bool asc> +class NumericSortBlobWriter : public SortBlobWriter { + std::optional<T> _best; +public: + NumericSortBlobWriter() noexcept; + ~NumericSortBlobWriter() noexcept; + void candidate(T val); + long write(void *serTo, size_t available); +}; + +} diff --git a/searchlib/src/vespa/searchlib/attribute/sort_blob_writer.h b/searchlib/src/vespa/searchlib/attribute/sort_blob_writer.h new file mode 100644 index 00000000000..31aca300349 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/sort_blob_writer.h @@ -0,0 +1,21 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace search::attribute { + +/* + * Base class for sort blob writers. Contains definition of tags + * describing if value is present (any present value sorts before + * a missing value). + */ +class SortBlobWriter { +public: + /* + * Missing value is always sorted last. + */ + static constexpr unsigned char has_value = 0; + static constexpr unsigned char missing_value = 1; +}; + +} diff --git a/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.cpp b/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.cpp new file mode 100644 index 00000000000..dad1df35847 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.cpp @@ -0,0 +1,72 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "string_sort_blob_writer.h" +#include <vespa/searchcommon/common/iblobconverter.h> +#include <vespa/vespalib/util/arrayref.h> +#include <algorithm> +#include <cstring> + +namespace search::attribute { + +StringSortBlobWriter::StringSortBlobWriter(void* serialize_to, size_t available, const BlobConverter* bc, bool asc) noexcept + : _best_size(), + _serialize_to(static_cast<unsigned char*>(serialize_to)), + _available(available), + _bc(bc), + _asc(asc) +{ +} + +StringSortBlobWriter::~StringSortBlobWriter() noexcept = default; + +bool +StringSortBlobWriter::candidate(const char* val) +{ + int size = std::strlen(val) + 1; + vespalib::ConstBufferRef buf(val, size); + if (_bc != nullptr) { + buf = _bc->convert(buf); + } + if (_best_size.has_value()) { + auto common_size = std::min(_best_size.value(), buf.size()); + auto cmpres = std::memcmp(_serialize_to + 1, buf.data(), common_size); + if (_asc) { + if (cmpres < 0 || (cmpres == 0 && _best_size.value() < buf.size())) { + return true; + } + } else { + if (cmpres > 0 || (cmpres == 0 && _best_size.value() > buf.size())) { + return true; + } + } + } + if (_available < buf.size() + 1) { + return false; + } + _serialize_to[0] = has_value; + memcpy(_serialize_to + 1, buf.data(), buf.size()); + _best_size = buf.size(); + return true; +} + +long +StringSortBlobWriter::write() +{ + if (_best_size.has_value()) { + if (!_asc) { + vespalib::ArrayRef<unsigned char> buf(_serialize_to + 1, _best_size.value()); + for (auto& c : buf) { + c = 0xff - c; + } + } + return 1 + _best_size.value(); + } else { + if (_available < 1) { + return -1; + } + _serialize_to[0] = missing_value; + return 1; + } +} + +} diff --git a/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.h b/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.h new file mode 100644 index 00000000000..12b1d871875 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/string_sort_blob_writer.h @@ -0,0 +1,31 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "sort_blob_writer.h" +#include <optional> + +namespace search::common { class BlobConverter; } + +namespace search::attribute { + +/* + * Class for writing numeric sort blobs for arrays and + * weighted sets of string with ascending or descending + * sort order. + */ +class StringSortBlobWriter : public SortBlobWriter { + using BlobConverter = common::BlobConverter; + std::optional<size_t> _best_size; + unsigned char* _serialize_to; + size_t _available; + const BlobConverter* _bc; + const bool _asc; +public: + StringSortBlobWriter(void* serialize_to, size_t available, const BlobConverter* bc, bool asc) noexcept; + ~StringSortBlobWriter() noexcept; + bool candidate(const char* val); + long write(); +}; + +} |