diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-11-24 11:09:43 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2022-11-24 11:09:43 +0000 |
commit | 93767a816d8f449482012f0c065d3b926db4f811 (patch) | |
tree | b2c91f3e59f76c12bd6786aa8d643115a7adf5c4 /searchlib | |
parent | ba11a8a87877dd3a97586255839b02782b70de87 (diff) |
Add support for multiple vectors per document in exact distance calculator.
This also fixes a problem where previously the distance between the query tensor and
a non-set attribute tensor (dense) would be calculated against the origin.
With this change the distance is std::numeric_limits<double>::max() in these cases.
Diffstat (limited to 'searchlib')
10 files changed, 169 insertions, 8 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index ae6469ae07a..6c5d8f4074f 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -219,6 +219,7 @@ vespa_define_module( src/tests/sortspec src/tests/tensor/dense_tensor_store src/tests/tensor/direct_tensor_store + src/tests/tensor/distance_calculator src/tests/tensor/distance_functions src/tests/tensor/hnsw_index src/tests/tensor/hnsw_nodeid_mapping diff --git a/searchlib/src/tests/tensor/distance_calculator/CMakeLists.txt b/searchlib/src/tests/tensor/distance_calculator/CMakeLists.txt new file mode 100644 index 00000000000..f4698ce355e --- /dev/null +++ b/searchlib/src/tests/tensor/distance_calculator/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_distance_calculator_test_app TEST + SOURCES + distance_calculator_test.cpp + DEPENDS + searchlib + searchlib_test + GTest::GTest +) +vespa_add_test(NAME searchlib_distance_calculator_test_app COMMAND searchlib_distance_calculator_test_app) + diff --git a/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp b/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp new file mode 100644 index 00000000000..11f767b546d --- /dev/null +++ b/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp @@ -0,0 +1,86 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/tensor_spec.h> +#include <vespa/eval/eval/value_type.h> +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/searchlib/attribute/attributevector.h> +#include <vespa/searchlib/tensor/distance_calculator.h> +#include <vespa/searchlib/tensor/distance_function_factory.h> +#include <vespa/searchlib/test/attribute_builder.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <iostream> + +using namespace search::attribute::test; +using namespace search::attribute; +using namespace search::tensor; +using namespace vespalib::eval; + +using search::AttributeVector; + +class DistanceCalculatorTest : public testing::Test { +public: + std::shared_ptr<AttributeVector> attr; + const ITensorAttribute* attr_tensor; + std::unique_ptr<DistanceFunction> func; + + DistanceCalculatorTest() + : attr(), + attr_tensor(), + func(make_distance_function(DistanceMetric::Euclidean, CellType::DOUBLE)) + { + } + + void build_attribute(const vespalib::string& tensor_type, + const std::vector<vespalib::string>& tensor_values) { + Config cfg(BasicType::TENSOR); + cfg.setTensorType(ValueType::from_spec(tensor_type)); + attr = AttributeBuilder("doc_tensor", cfg).fill_tensor(tensor_values).get(); + attr_tensor = dynamic_cast<const ITensorAttribute*>(attr.get()); + ASSERT_TRUE(attr_tensor != nullptr); + } + double calc_distance(uint32_t docid, const vespalib::string& query_tensor) { + auto qv = SimpleValue::from_spec(TensorSpec::from_expr(query_tensor)); + DistanceCalculator calc(*attr_tensor, *qv, *func); + return calc.calc_with_limit(docid, std::numeric_limits<double>::max()); + } + double calc_rawscore(uint32_t docid, const vespalib::string& query_tensor) { + auto qv = SimpleValue::from_spec(TensorSpec::from_expr(query_tensor)); + DistanceCalculator calc(*attr_tensor, *qv, *func); + return calc.calc_raw_score(docid); + } +}; + +constexpr double max_distance = std::numeric_limits<double>::max(); + 
+TEST_F(DistanceCalculatorTest, calculation_over_dense_tensor_attribute) +{ + build_attribute("tensor(y[2])", {"[3,10]", ""}); + vespalib::string qt = "tensor(y[2]):[7,10]"; + EXPECT_DOUBLE_EQ(16, calc_distance(1, qt)); + EXPECT_DOUBLE_EQ(max_distance, calc_distance(2, qt)); + + EXPECT_DOUBLE_EQ(1.0/(1.0 + 4.0), calc_rawscore(1, qt)); + EXPECT_DOUBLE_EQ(0.0, calc_rawscore(2, qt)); +} + +TEST_F(DistanceCalculatorTest, calculation_over_mixed_tensor_attribute) +{ + build_attribute("tensor(x{},y[2])", + {"{{x:\"a\",y:0}:3,{x:\"a\",y:1}:10,{x:\"b\",y:0}:5,{x:\"b\",y:1}:10}", + "{}", ""}); + vespalib::string qt_1 = "tensor(y[2]):[9,10]"; + vespalib::string qt_2 = "tensor(y[2]):[1,10]"; + EXPECT_DOUBLE_EQ(16, calc_distance(1, qt_1)); + EXPECT_DOUBLE_EQ(4, calc_distance(1, qt_2)); + EXPECT_DOUBLE_EQ(max_distance, calc_distance(2, qt_1)); + EXPECT_DOUBLE_EQ(max_distance, calc_distance(3, qt_1)); + + EXPECT_DOUBLE_EQ(1.0/(1.0 + 4.0), calc_rawscore(1, qt_1)); + EXPECT_DOUBLE_EQ(1.0/(1.0 + 2.0), calc_rawscore(1, qt_2)); + EXPECT_DOUBLE_EQ(0.0, calc_rawscore(2, qt_1)); + EXPECT_DOUBLE_EQ(0.0, calc_rawscore(3, qt_1)); +} + +GTEST_MAIN_RUN_ALL_TESTS() + diff --git a/searchlib/src/vespa/searchlib/tensor/distance_calculator.h b/searchlib/src/vespa/searchlib/tensor/distance_calculator.h index 3ef41906b92..320f071cbbb 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_calculator.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_calculator.h @@ -3,6 +3,7 @@ #include "distance_function.h" #include "i_tensor_attribute.h" +#include "vector_bundle.h" namespace vespalib::eval { struct Value; } @@ -43,12 +44,24 @@ public: const DistanceFunction& function() const { return *_dist_fun; } double calc_raw_score(uint32_t docid) const { - double distance = _dist_fun->calc(_query_tensor_cells, _attr_tensor.extract_cells_ref(docid)); - return _dist_fun->to_rawscore(distance); + auto vectors = _attr_tensor.get_vectors(docid); + double result = 0.0; + for (uint32_t i = 0; i < 
vectors.subspaces(); ++i) { + double distance = _dist_fun->calc(_query_tensor_cells, vectors.cells(i)); + double score = _dist_fun->to_rawscore(distance); + result = std::max(result, score); + } + return result; } double calc_with_limit(uint32_t docid, double limit) const { - return _dist_fun->calc_with_limit(_query_tensor_cells, _attr_tensor.extract_cells_ref(docid), limit); + auto vectors = _attr_tensor.get_vectors(docid); + double result = std::numeric_limits<double>::max(); + for (uint32_t i = 0; i < vectors.subspaces(); ++i) { + double distance = _dist_fun->calc_with_limit(_query_tensor_cells, vectors.cells(i), limit); + result = std::min(result, distance); + } + return result; } /** diff --git a/searchlib/src/vespa/searchlib/tensor/i_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/i_tensor_attribute.h index 3bcb55f2c2a..9b5f80b2ece 100644 --- a/searchlib/src/vespa/searchlib/tensor/i_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/i_tensor_attribute.h @@ -2,9 +2,10 @@ #pragma once -#include <memory> +#include "doc_vector_access.h" #include <vespa/eval/eval/typed_cells.h> #include <vespa/searchcommon/attribute/distance_metric.h> +#include <memory> namespace vespalib::eval { class ValueType; struct Value; } namespace vespalib::slime { struct Inserter; } @@ -16,8 +17,7 @@ class NearestNeighborIndex; /** * Interface for tensor attribute used by feature executors to get information. 
*/ -class ITensorAttribute -{ +class ITensorAttribute : public DocVectorAccess { public: virtual ~ITensorAttribute() {} virtual std::unique_ptr<vespalib::eval::Value> getTensor(uint32_t docId) const = 0; diff --git a/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.cpp b/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.cpp index a9bc69c36c9..f9459823ce4 100644 --- a/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.cpp +++ b/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "imported_tensor_attribute_vector_read_guard.h" +#include "vector_bundle.h" #include <vespa/searchlib/attribute/attributevector.h> #include <vespa/eval/eval/value.h> @@ -60,6 +61,18 @@ ImportedTensorAttributeVectorReadGuard::get_tensor_ref(uint32_t docid) const return _target_tensor_attribute.get_tensor_ref(getTargetLid(docid)); } +vespalib::eval::TypedCells +ImportedTensorAttributeVectorReadGuard::get_vector(uint32_t docid, uint32_t subspace) const +{ + return _target_tensor_attribute.get_vector(getTargetLid(docid), subspace); +} + +search::tensor::VectorBundle +ImportedTensorAttributeVectorReadGuard::get_vectors(uint32_t docid) const +{ + return _target_tensor_attribute.get_vectors(getTargetLid(docid)); +} + const vespalib::eval::ValueType & ImportedTensorAttributeVectorReadGuard::getTensorType() const { diff --git a/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.h b/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.h index acfd1821e1f..f277d39e97d 100644 --- a/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.h +++ b/searchlib/src/vespa/searchlib/tensor/imported_tensor_attribute_vector_read_guard.h @@ -40,6 +40,9 @@ public: DistanceMetric 
distance_metric() const override { return _target_tensor_attribute.distance_metric(); } uint32_t get_num_docs() const override { return getNumDocs(); } + vespalib::eval::TypedCells get_vector(uint32_t docid, uint32_t subspace) const override; + VectorBundle get_vectors(uint32_t docid) const override; + const vespalib::eval::ValueType &getTensorType() const override; void get_state(const vespalib::slime::Inserter& inserter) const override; }; diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h index 15a2db2b861..a4c30a574e5 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h @@ -3,7 +3,6 @@ #pragma once #include "i_tensor_attribute.h" -#include "doc_vector_access.h" #include "prepare_result.h" #include "subspace_type.h" #include "tensor_store.h" @@ -21,7 +20,7 @@ class NearestNeighborIndexFactory; /** * Attribute vector class used to store tensors for all documents in memory. */ -class TensorAttribute : public NotImplementedAttribute, public ITensorAttribute, public DocVectorAccess { +class TensorAttribute : public NotImplementedAttribute, public ITensorAttribute { protected: using AtomicEntryRef = vespalib::datastore::AtomicEntryRef; using EntryRef = TensorStore::EntryRef; diff --git a/searchlib/src/vespa/searchlib/test/attribute_builder.cpp b/searchlib/src/vespa/searchlib/test/attribute_builder.cpp index 04d06cd4c66..cc84355385d 100644 --- a/searchlib/src/vespa/searchlib/test/attribute_builder.cpp +++ b/searchlib/src/vespa/searchlib/test/attribute_builder.cpp @@ -1,13 +1,19 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include "attribute_builder.h" +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/tensor_spec.h> #include <vespa/searchlib/attribute/attributefactory.h> #include <vespa/searchlib/attribute/attributevector.h> #include <vespa/searchlib/attribute/floatbase.h> #include <vespa/searchlib/attribute/integerbase.h> #include <vespa/searchlib/attribute/stringbase.h> +#include <vespa/searchlib/tensor/tensor_attribute.h> #include <cassert> +using vespalib::eval::SimpleValue; +using vespalib::eval::TensorSpec; + namespace search::attribute::test { AttributeBuilder::AttributeBuilder(const vespalib::string& name, const Config& cfg) @@ -151,5 +157,24 @@ AttributeBuilder::fill_wset(std::initializer_list<WeightedStringList> values) return *this; } +AttributeBuilder& +AttributeBuilder::fill_tensor(const std::vector<vespalib::string>& values) +{ + add_docs(_attr, values.size()); + auto& real = dynamic_cast<search::tensor::TensorAttribute&>(_attr); + vespalib::string tensor_type = real.getConfig().tensorType().to_spec(); + uint32_t docid = 1; + for (const auto& value : values) { + if (!value.empty()) { + auto spec = TensorSpec::from_expr(tensor_type + ":" + value); + auto tensor = SimpleValue::from_spec(spec); + real.setTensor(docid, *tensor); + } + ++docid; + } + _attr.commit(true); + return *this; +} + } diff --git a/searchlib/src/vespa/searchlib/test/attribute_builder.h b/searchlib/src/vespa/searchlib/test/attribute_builder.h index d5c2c3aa90e..339af4e22f5 100644 --- a/searchlib/src/vespa/searchlib/test/attribute_builder.h +++ b/searchlib/src/vespa/searchlib/test/attribute_builder.h @@ -52,6 +52,16 @@ public: AttributeBuilder& fill_array(std::initializer_list<StringList> values); AttributeBuilder& fill_wset(std::initializer_list<WeightedStringList> values); + /** + * Fill this tensor attribute with the given tensor values. 
+ * + * Each string value represents the last part of a vespalib::eval::TensorSpec, + * without the tensor type as this is known from the tensor attribute. + * E.g "[1, 2]" is expanded to "tensor(x[2]):[1, 2]". + * If the string value is empty no tensor is set for that document. + */ + AttributeBuilder& fill_tensor(const std::vector<vespalib::string>& values); + std::shared_ptr<AttributeVector> get() const { return _attr_ptr; } }; |