diff options
7 files changed, 176 insertions, 21 deletions
diff --git a/searchcorespi/src/vespa/searchcorespi/flush/iflushtarget.h b/searchcorespi/src/vespa/searchcorespi/flush/iflushtarget.h index 31707643649..03d9ba8d55c 100644 --- a/searchcorespi/src/vespa/searchcorespi/flush/iflushtarget.h +++ b/searchcorespi/src/vespa/searchcorespi/flush/iflushtarget.h @@ -153,7 +153,7 @@ public: virtual Time getLastFlushTime() const = 0; /** - * Return if the traget itself is in bad need for a flush. + * Return if the target itself is in bad need for a flush. * * @return true if an urgent flush is needed */ diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 4237bede9d5..19cad2f3905 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -68,6 +68,7 @@ vespa_define_module( src/tests/aggregator src/tests/alignment src/tests/attribute + src/tests/attribute/attribute_header src/tests/attribute/attribute_operation src/tests/attribute/attributefilewriter src/tests/attribute/attributemanager diff --git a/searchlib/src/tests/attribute/attribute_header/CMakeLists.txt b/searchlib/src/tests/attribute/attribute_header/CMakeLists.txt new file mode 100644 index 00000000000..e72c0c6a528 --- /dev/null +++ b/searchlib/src/tests/attribute/attribute_header/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_attribute_header_test_app TEST + SOURCES + attribute_header_test.cpp + DEPENDS + searchlib + gtest +) +vespa_add_test(NAME searchlib_attribute_header_test_app COMMAND searchlib_attribute_header_test_app) diff --git a/searchlib/src/tests/attribute/attribute_header/attribute_header_test.cpp b/searchlib/src/tests/attribute/attribute_header/attribute_header_test.cpp new file mode 100644 index 00000000000..0f542d016a9 --- /dev/null +++ b/searchlib/src/tests/attribute/attribute_header/attribute_header_test.cpp @@ -0,0 +1,77 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/value_type.h> +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/searchlib/attribute/attribute_header.h> +#include <vespa/vespalib/data/fileheader.h> +#include <vespa/vespalib/gtest/gtest.h> + +#include <vespa/log/log.h> +LOG_SETUP("attribute_header_test"); + +using namespace search; +using namespace search::attribute; + +using HnswIPO = std::optional<HnswIndexParams>; +using vespalib::eval::ValueType; + +const Config tensor_cfg(BasicType::TENSOR, CollectionType::SINGLE); +const vespalib::string file_name = "my_file_name"; +const ValueType tensor_type = ValueType::from_spec("tensor<float>(x[4])"); +constexpr uint32_t num_docs = 23; +constexpr uint64_t unique_value_count = 11; +constexpr uint64_t total_value_count = 13; +constexpr uint64_t create_serial_num = 17; +constexpr uint32_t version = 19; + +vespalib::GenericHeader +populate_header(const HnswIPO& hnsw_params) +{ + AttributeHeader header(file_name, + tensor_cfg.basicType(), + tensor_cfg.collectionType(), + tensor_type, + false, + PersistentPredicateParams(), + hnsw_params, + num_docs, + unique_value_count, + total_value_count, + create_serial_num, + version); + + vespalib::GenericHeader result; + header.addTags(result); + return result; +} + +void +verify_roundtrip_serialization(const HnswIPO& hnsw_params_in) +{ + auto gen_header = populate_header(hnsw_params_in); + auto attr_header = AttributeHeader::extractTags(gen_header); + + EXPECT_EQ(tensor_cfg.basicType(), attr_header.getBasicType()); + EXPECT_EQ(tensor_cfg.collectionType(), attr_header.getCollectionType()); + EXPECT_EQ(tensor_type, attr_header.getTensorType()); + EXPECT_EQ(num_docs, attr_header.getNumDocs()); + EXPECT_EQ(create_serial_num, attr_header.getCreateSerialNum()); + EXPECT_EQ(version, attr_header.getVersion()); + EXPECT_EQ(false, attr_header.getPredicateParamsSet()); + const auto& hnsw_params_out = attr_header.get_hnsw_index_params(); + EXPECT_EQ(hnsw_params_in.has_value(), hnsw_params_out.has_value()); + if (hnsw_params_in.has_value()) { + EXPECT_EQ(hnsw_params_in.value(), hnsw_params_out.value()); + } +} + +TEST(AttributeHeaderTest, can_be_added_to_and_extracted_from_generic_header) +{ + verify_roundtrip_serialization(HnswIPO({16, 100, DistanceMetric::Euclidean})); + verify_roundtrip_serialization(HnswIPO({16, 100, DistanceMetric::Angular})); + verify_roundtrip_serialization(HnswIPO({16, 100, DistanceMetric::GeoDegrees})); + verify_roundtrip_serialization(HnswIPO()); +} + +GTEST_MAIN_RUN_ALL_TESTS() + diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp index 3d7010ba6c3..b35a88fab77 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp @@ -3,6 +3,7 @@ #include "attribute_header.h" #include <vespa/vespalib/data/fileheader.h> #include <vespa/vespalib/data/databuffer.h> +#include <vespa/vespalib/util/exceptions.h> namespace search::attribute { @@ -18,6 +19,13 @@ const vespalib::string tensorTypeTag = "tensortype"; const vespalib::string predicateArityTag = "predicate.arity"; const vespalib::string predicateLowerBoundTag = "predicate.lower_bound"; const vespalib::string predicateUpperBoundTag = "predicate.upper_bound"; +const vespalib::string hnsw_max_links_tag = "hnsw.max_links_per_node"; +const vespalib::string hnsw_neighbors_to_explore_tag = "hnsw.neighbors_to_explore_at_insert"; +const vespalib::string hnsw_distance_metric = "hnsw.distance_metric"; +const vespalib::string euclidean = "euclidean"; +const vespalib::string angular = "angular"; +const vespalib::string geodegrees = "geodegrees"; +const vespalib::string doc_id_limit_tag = "docIdLimit"; } @@ -35,6 +43,7 @@ AttributeHeader::AttributeHeader(const vespalib::string &fileName) _collectionTypeParamsSet(false), _predicateParamsSet(false), _predicateParams(), + _hnsw_index_params(), _numDocs(0), _uniqueValueCount(0), _totalValueCount(0), @@ -43,11 +52,18 @@ AttributeHeader::AttributeHeader(const vespalib::string &fileName) { } -AttributeHeader::AttributeHeader(const vespalib::string &fileName, attribute::BasicType basicType, - attribute::CollectionType collectionType, const vespalib::eval::ValueType &tensorType, - bool enumerated, const attribute::PersistentPredicateParams &predicateParams, - uint32_t numDocs, [[maybe_unused]] uint32_t fixedWidth, uint64_t uniqueValueCount, - uint64_t totalValueCount, uint64_t createSerialNum, uint32_t version) +AttributeHeader::AttributeHeader(const vespalib::string &fileName, + attribute::BasicType basicType, + attribute::CollectionType collectionType, + const vespalib::eval::ValueType &tensorType, + bool enumerated, + const attribute::PersistentPredicateParams &predicateParams, + const std::optional<HnswIndexParams>& hnsw_index_params, + uint32_t numDocs, + uint64_t uniqueValueCount, + uint64_t totalValueCount, + uint64_t createSerialNum, + uint32_t version) : _fileName(fileName), _basicType(basicType), _collectionType(collectionType), @@ -56,6 +72,7 @@ AttributeHeader::AttributeHeader(const vespalib::string &fileName, attribute::Ba _collectionTypeParamsSet(false), _predicateParamsSet(false), _predicateParams(predicateParams), + _hnsw_index_params(hnsw_index_params), _numDocs(numDocs), _uniqueValueCount(uniqueValueCount), _totalValueCount(totalValueCount), @@ -66,6 +83,35 @@ AttributeHeader::AttributeHeader(const vespalib::string &fileName, attribute::Ba AttributeHeader::~AttributeHeader() = default; +namespace { + +vespalib::string +to_string(DistanceMetric metric) +{ + switch (metric) { + case DistanceMetric::Euclidean: return euclidean; + case DistanceMetric::Angular: return angular; + case DistanceMetric::GeoDegrees: return geodegrees; + } + throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric))); +} + +DistanceMetric +to_distance_metric(const vespalib::string& metric) +{ + if (metric == euclidean) { + return DistanceMetric::Euclidean; + } else if (metric == angular) { + return DistanceMetric::Angular; + } else if (metric == geodegrees) { + return DistanceMetric::GeoDegrees; + } else { + throw vespalib::IllegalStateException("Unknown distance metric '" + metric + "'"); + } +} + +} + void AttributeHeader::internalExtractTags(const vespalib::GenericHeader &header) { @@ -91,6 +137,15 @@ AttributeHeader::internalExtractTags(const vespalib::GenericHeader &header) if (_basicType.type() == BasicType::Type::TENSOR) { assert(header.hasTag(tensorTypeTag)); _tensorType = vespalib::eval::ValueType::from_spec(header.getTag(tensorTypeTag).asString()); + if (header.hasTag(hnsw_max_links_tag)) { + assert(header.hasTag(hnsw_neighbors_to_explore_tag)); + assert(header.hasTag(hnsw_distance_metric)); + + uint32_t max_links = header.getTag(hnsw_max_links_tag).asInteger(); + uint32_t neighbors_to_explore = header.getTag(hnsw_neighbors_to_explore_tag).asInteger(); + DistanceMetric distance_metric = to_distance_metric(header.getTag(hnsw_distance_metric).asString()); + _hnsw_index_params.emplace(max_links, neighbors_to_explore, distance_metric); + } } if (_basicType.type() == BasicType::Type::PREDICATE) { if (header.hasTag(predicateArityTag)) { @@ -105,6 +160,9 @@ AttributeHeader::internalExtractTags(const vespalib::GenericHeader &header) assert(!header.hasTag(predicateUpperBoundTag)); } } + if (header.hasTag(doc_id_limit_tag)) { + _numDocs = header.getTag(doc_id_limit_tag).asInteger(); + } if (header.hasTag(versionTag)) { _version = header.getTag(versionTag).asInteger(); } @@ -130,7 +188,7 @@ AttributeHeader::addTags(vespalib::GenericHeader &header) const } header.putTag(Tag("uniqueValueCount", _uniqueValueCount)); header.putTag(Tag("totalValueCount", _totalValueCount)); - header.putTag(Tag("docIdLimit", _numDocs)); + header.putTag(Tag(doc_id_limit_tag, _numDocs)); header.putTag(Tag("frozen", 0)); header.putTag(Tag("fileBitSize", 0)); header.putTag(Tag(versionTag, _version)); @@ -142,6 +200,12 @@ AttributeHeader::addTags(vespalib::GenericHeader &header) const } if (_basicType.type() == attribute::BasicType::Type::TENSOR) { header.putTag(Tag(tensorTypeTag, _tensorType.to_spec()));; + if (_hnsw_index_params.has_value()) { + const auto& params = *_hnsw_index_params; + header.putTag(Tag(hnsw_max_links_tag, params.max_links_per_node())); + header.putTag(Tag(hnsw_neighbors_to_explore_tag, params.neighbors_to_explore_at_insert())); + header.putTag(Tag(hnsw_distance_metric, to_string(params.distance_metric()))); + } } if (_basicType.type() == attribute::BasicType::Type::PREDICATE) { const auto & params = _predicateParams; diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.h b/searchlib/src/vespa/searchlib/attribute/attribute_header.h index 24eac8336b4..583253eea0f 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_header.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.h @@ -5,8 +5,10 @@ #include <vespa/vespalib/stllike/string.h> #include <vespa/searchcommon/attribute/basictype.h> #include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/searchcommon/attribute/hnsw_index_params.h> #include <vespa/searchcommon/attribute/predicate_params.h> #include <vespa/eval/eval/value_type.h> +#include <optional> namespace vespalib { class GenericHeader; } @@ -26,6 +28,7 @@ private: bool _collectionTypeParamsSet; bool _predicateParamsSet; PersistentPredicateParams _predicateParams; + std::optional<HnswIndexParams> _hnsw_index_params; uint32_t _numDocs; uint64_t _uniqueValueCount; uint64_t _totalValueCount; @@ -42,8 +45,8 @@ public: const vespalib::eval::ValueType &tensorType, bool enumerated, const PersistentPredicateParams &predicateParams, + const std::optional<HnswIndexParams>& hnsw_index_params, uint32_t numDocs, - uint32_t fixedWidth, uint64_t uniqueValueCount, uint64_t totalValueCount, uint64_t createSerialNum, @@ -63,6 +66,7 @@ public: const PersistentPredicateParams &getPredicateParams() const { return _predicateParams; } bool getPredicateParamsSet() const { return _predicateParamsSet; } bool getCollectionTypeParamsSet() const { return _collectionTypeParamsSet; } + const std::optional<HnswIndexParams>& get_hnsw_index_params() const { return _hnsw_index_params; } static AttributeHeader extractTags(const vespalib::GenericHeader &header); void addTags(vespalib::GenericHeader &header) const; }; diff --git a/searchlib/src/vespa/searchlib/attribute/attributevector.cpp b/searchlib/src/vespa/searchlib/attribute/attributevector.cpp index ffc62d806e2..1f002ce612c 100644 --- a/searchlib/src/vespa/searchlib/attribute/attributevector.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attributevector.cpp @@ -306,19 +306,19 @@ AttributeVector::save(IAttributeSaveTarget &saveTarget, vespalib::stringref file attribute::AttributeHeader AttributeVector::createAttributeHeader(vespalib::stringref fileName) const { return attribute::AttributeHeader(fileName, - getConfig().basicType(), - getConfig().collectionType(), - getConfig().basicType().type() == BasicType::Type::TENSOR - ? getConfig().tensorType() - : vespalib::eval::ValueType::error_type(), - getEnumeratedSave(), - getConfig().predicateParams(), - getCommittedDocIdLimit(), - getFixedWidth(), - getUniqueValueCount(), - getTotalValueCount(), - getCreateSerialNum(), - getVersion()); + getConfig().basicType(), + getConfig().collectionType(), + (getConfig().basicType().type() == BasicType::Type::TENSOR + ? getConfig().tensorType() + : vespalib::eval::ValueType::error_type()), + getEnumeratedSave(), + getConfig().predicateParams(), + getConfig().hnsw_index_params(), + getCommittedDocIdLimit(), + getUniqueValueCount(), + getTotalValueCount(), + getCreateSerialNum(), + getVersion()); } void AttributeVector::onSave(IAttributeSaveTarget &) |