diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-04-19 15:49:23 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2023-04-20 08:27:54 +0000 |
commit | b032a769ce1cb0518d5fe96014c794837a4641bd (patch) | |
tree | 4f69ce6c0eb646ffba6220d6d25402a699877aaf /streamingvisitors | |
parent | 92101782a9ffebe1d815fbefa5d3ae0c7eab57f1 (diff) |
Add exact nearest neighbor searcher over the streamed values of a tensor field.
Note: Integration into the searchvisitor remains.
Diffstat (limited to 'streamingvisitors')
6 files changed, 362 insertions, 4 deletions
diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt index 1e62203ea03..52e19b25900 100644 --- a/streamingvisitors/CMakeLists.txt +++ b/streamingvisitors/CMakeLists.txt @@ -18,15 +18,16 @@ vespa_define_module( src/vespa/vsm/vsm TESTS - src/tests/hitcollector - src/tests/matching_elements_filler - src/tests/querywrapper - src/tests/searchvisitor src/tests/charbuffer src/tests/docsum src/tests/document + src/tests/hitcollector + src/tests/matching_elements_filler + src/tests/nearest_neighbor_field_searcher src/tests/query_term_filter_factory + src/tests/querywrapper src/tests/rank_processor src/tests/searcher + src/tests/searchvisitor src/tests/textutil ) diff --git a/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt new file mode 100644 index 00000000000..297e939ea72 --- /dev/null +++ b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt @@ -0,0 +1,12 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vsm_nearest_neighbor_field_searcher_test_app TEST + SOURCES + nearest_neighbor_field_searcher_test.cpp + DEPENDS + searchlib + searchlib_test + streamingvisitors + GTest::GTest +) + +vespa_add_test(NAME vsm_nearest_neighbor_field_searcher_test_app COMMAND vsm_nearest_neighbor_field_searcher_test_app) diff --git a/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp new file mode 100644 index 00000000000..70c3ae1002f --- /dev/null +++ b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp @@ -0,0 +1,152 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/document/datatype/tensor_data_type.h> +#include <vespa/document/fieldvalue/tensorfieldvalue.h> +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/tensor_spec.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/eval/eval/value_type.h> +#include <vespa/searchlib/fef/indexproperties.h> +#include <vespa/searchlib/fef/properties.h> +#include <vespa/searchlib/fef/tablemanager.h> +#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h> +#include <vespa/searchlib/tensor/euclidean_distance.h> +#include <vespa/searchlib/test/mock_attribute_manager.h> +#include <vespa/searchvisitor/indexenvironment.h> +#include <vespa/searchvisitor/queryenvironment.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vsm/searcher/nearest_neighbor_field_searcher.h> + +using namespace search::attribute::test; +using namespace search::attribute; +using namespace search::fef; +using namespace search::streaming; +using namespace search::tensor; +using namespace vespalib::eval; +using namespace vsm; + +using document::TensorFieldValue; +using document::TensorDataType; + +struct MockQuery { + std::vector<std::unique_ptr<NearestNeighborQueryNode>> nodes; + QueryTermList term_list; + MockQuery& add(const vespalib::string& query_tensor_name, + double distance_threshold) { + std::unique_ptr<QueryNodeResultBase> base; + auto node = std::make_unique<NearestNeighborQueryNode>(std::move(base), query_tensor_name, "my_tensor_field", 7, search::query::Weight(11), distance_threshold); + nodes.push_back(std::move(node)); + term_list.push_back(nodes.back().get()); + return *this; + } + ~MockQuery() {} + const NearestNeighborQueryNode& get(size_t idx) const { + assert(idx < nodes.size()); + return *nodes[idx]; + } + void reset() { + for (auto term : term_list) { + term->reset(); + } + } +}; + +class NearestNeighborSearcherTest : public testing::Test { +public: + TableManager table_mgr; + streaming::IndexEnvironment index_env; + MockAttributeManager attr_mgr; + Properties query_props; + streaming::QueryEnvironment query_env; + ValueType tensor_type; + TensorDataType data_type; + SquaredEuclideanDistance dist_func; + NearestNeighborFieldSearcher searcher; + MockQuery query; + + NearestNeighborSearcherTest() + : table_mgr(), + index_env(table_mgr), + attr_mgr(), + query_props(), + query_env("", index_env, query_props, &attr_mgr), + tensor_type(ValueType::from_spec("tensor(x[2])")), + data_type(tensor_type), + dist_func(CellType::DOUBLE), + searcher(7, DistanceMetric::Euclidean), + query() + { + } + void set_query_tensor(const vespalib::string& query_tensor_name, + const vespalib::string& spec_expr) { + search::fef::indexproperties::type::QueryFeature::set(index_env.getProperties(), query_tensor_name, tensor_type.to_spec()); + auto tensor = SimpleValue::from_spec(TensorSpec::from_expr(spec_expr)); + vespalib::nbostream stream; + vespalib::eval::encode_value(*tensor, stream); + query_props.add(query_tensor_name, vespalib::stringref(stream.peek(), stream.size())); + } + void prepare() { + auto searcher_buf = std::make_shared<SearcherBuf>(); + searcher.prepare_new(query.term_list, searcher_buf, tensor_type, query_env); + } + void match(const vespalib::string& spec_expr) { + TensorFieldValue fv(data_type); + auto tensor = SimpleValue::from_spec(TensorSpec::from_expr(spec_expr)); + fv = std::move(tensor); + query.reset(); + searcher.onValue(fv); + } + void expect_match(double exp_square_distance, const NearestNeighborQueryNode& node) { + double exp_raw_score = dist_func.to_rawscore(exp_square_distance); + EXPECT_TRUE(node.evaluate()); + EXPECT_DOUBLE_EQ(exp_raw_score, node.get_raw_score().value()); + } +}; + +TEST_F(NearestNeighborSearcherTest, raw_score_calculated_with_distance_threshold) +{ + query.add("qt1", 3); + set_query_tensor("qt1", "tensor(x[2]):[1,3]"); + prepare(); + + match("tensor(x[2]):[1,5]"); + expect_match((5-3)*(5-3), query.get(0)); + + match("tensor(x[2]):[1,6]"); + expect_match((6-3)*(6-3), query.get(0)); + + match("tensor(x[2]):[1,7]"); + // This is not a match since ((7-3)*(7-3) = 16) is larger than the internal distance threshold of (3*3 = 9). + EXPECT_FALSE(query.get(0).evaluate()); +} + +TEST_F(NearestNeighborSearcherTest, raw_score_calculated_for_two_query_operators) +{ + query.add("qt1", 3); + query.add("qt2", 4); + set_query_tensor("qt1", "tensor(x[2]):[1,3]"); + set_query_tensor("qt2", "tensor(x[2]):[1,4]"); + prepare(); + + match("tensor(x[2]):[1,5]"); + expect_match((5-3)*(5-3), query.get(0)); + expect_match((5-4)*(5-4), query.get(1)); + + match("tensor(x[2]):[1,7]"); + // This is not a match since ((7-3)*(7-3) = 16) is larger than the internal distance threshold of (3*3 = 9). + EXPECT_FALSE(query.get(0).evaluate()); + expect_match((7-4)*(7-4), query.get(1)); +} + +TEST_F(NearestNeighborSearcherTest, distance_metric_from_string) +{ + using NNFS = NearestNeighborFieldSearcher; + EXPECT_EQ(DistanceMetric::Euclidean, NNFS::distance_metric_from_string("EUCLIDEAN")); + EXPECT_EQ(DistanceMetric::Angular, NNFS::distance_metric_from_string("ANGULAR")); + EXPECT_EQ(DistanceMetric::GeoDegrees, NNFS::distance_metric_from_string("GEODEGREES")); + EXPECT_EQ(DistanceMetric::InnerProduct, NNFS::distance_metric_from_string("INNERPRODUCT")); + EXPECT_EQ(DistanceMetric::Hamming, NNFS::distance_metric_from_string("HAMMING")); + EXPECT_EQ(DistanceMetric::Euclidean, NNFS::distance_metric_from_string("not_available")); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt index 0a2a9ec21d2..730cba4abba 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt +++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt @@ -15,6 +15,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT futf8strchrfieldsearcher.cpp geo_pos_field_searcher.cpp intfieldsearcher.cpp + nearest_neighbor_field_searcher.cpp strchrfieldsearcher.cpp utf8flexiblestringfieldsearcher.cpp utf8strchrfieldsearcher.cpp diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp new file mode 100644 index 00000000000..82243d67e5a --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp @@ -0,0 +1,139 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "nearest_neighbor_field_searcher.h" +#include <vespa/document/fieldvalue/tensorfieldvalue.h> +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/searchlib/fef/iqueryenvironment.h> +#include <vespa/searchlib/fef/query_value.h> +#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h> +#include <vespa/searchlib/tensor/distance_calculator.h> +#include <vespa/searchlib/tensor/distance_function.h> +#include <vespa/searchlib/tensor/distance_function_factory.h> +#include <vespa/searchlib/tensor/tensor_ext_attribute.h> +#include <vespa/vespalib/util/exceptions.h> +#include <vespa/vespalib/util/issue.h> + +using search::attribute::BasicType; +using search::attribute::CollectionType; +using search::attribute::Config; +using search::fef::QueryValue; +using search::tensor::DistanceCalculator; +using search::tensor::TensorExtAttribute; +using search::tensor::make_distance_function; +using vespalib::eval::ValueType; + +namespace { + +constexpr uint32_t scratch_docid = 0; + +std::unique_ptr<TensorExtAttribute> +make_attribute(const ValueType& tensor_type) +{ + Config cfg(BasicType::TENSOR, CollectionType::SINGLE); + cfg.setTensorType(tensor_type); + auto result = std::make_unique<TensorExtAttribute>("nnfs_attr", cfg); + uint32_t docid; + result->addDoc(docid); + assert(docid == scratch_docid); + return result; +} + +} + +namespace vsm { + +NearestNeighborFieldSearcher::NodeAndCalc::NodeAndCalc(search::streaming::NearestNeighborQueryNode* node_in, + std::unique_ptr<search::tensor::DistanceCalculator> calc_in) + : node(node_in), + calc(std::move(calc_in)), + distance_threshold(calc->function().convert_threshold(node->get_distance_threshold())) +{ +} + +NearestNeighborFieldSearcher::NearestNeighborFieldSearcher(const FieldIdT& fid, + search::attribute::DistanceMetric metric) + : FieldSearcher(fid), + _metric(metric), + _attr(), + _calcs() +{ +} + +std::unique_ptr<FieldSearcher> +NearestNeighborFieldSearcher::duplicate() const +{ + return std::make_unique<NearestNeighborFieldSearcher>(field(), _metric); +} + +void +NearestNeighborFieldSearcher::prepare_new(search::streaming::QueryTermList& qtl, const SharedSearcherBuf&, + const vespalib::eval::ValueType& tensor_type, search::fef::IQueryEnvironment& query_env) +{ + _attr = make_attribute(tensor_type); + _calcs.clear(); + for (auto term : qtl) { + auto* nn_term = term->as_nearest_neighbor_query_node(); + if (nn_term == nullptr) { + vespalib::Issue::report("Query term (%s) searching field %u is NOT a NearestNeighborQueryNode", + term->getClassName().c_str(), field()); + continue; + } + auto query_value = QueryValue::from_config(nn_term->get_query_tensor_name(), query_env.getIndexEnvironment()); + query_value.prepare_shared_state(query_env, query_env.getObjectStore()); + const auto* tensor_value = query_value.lookup_value(query_env.getObjectStore()); + if (tensor_value == nullptr) { + vespalib::Issue::report("Could not find query tensor for NearestNeighborQueryNode(%s, %s)", + nn_term->index().c_str(), nn_term->get_query_tensor_name().c_str()); + continue; + } + try { + auto calc = DistanceCalculator::make_with_validation(*_attr, *tensor_value); + _calcs.emplace_back(nn_term, std::move(calc)); + } catch (const vespalib::IllegalArgumentException& ex) { + vespalib::Issue::report("Could not create DistanceCalculator for NearestNeighborQueryNode(%s, %s): %s", + nn_term->index().c_str(), nn_term->get_query_tensor_name().c_str(), ex.what()); + } + } +} + +void +NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv) +{ + if (fv.isA(document::FieldValue::Type::TENSOR)) { + const auto* tfv = dynamic_cast<const document::TensorFieldValue*>(&fv); + if (tfv && tfv->getAsTensorPtr()) { + _attr->add(*tfv->getAsTensorPtr(), 1); + for (auto& elem : _calcs) { + double distance = elem.calc->calc_with_limit(scratch_docid, elem.distance_threshold); + if (distance <= elem.distance_threshold) { + double score = elem.calc->function().to_rawscore(distance); + elem.node->set_raw_score(score); + } + } + } + } +} + +search::attribute::DistanceMetric +NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value) +{ + using search::attribute::DistanceMetric; + // Valid string values must match the definition of DistanceMetric in + // config-model/src/main/java/com/yahoo/schema/document/Attribute.java + if (value == "EUCLIDEAN") { + return DistanceMetric::Euclidean; + } else if (value == "ANGULAR") { + return DistanceMetric::Angular; + } else if (value == "GEODEGREES") { + return DistanceMetric::GeoDegrees; + } else if (value == "INNERPRODUCT") { + return DistanceMetric::InnerProduct; + } else if (value == "HAMMING") { + return DistanceMetric::Hamming; + } + vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str()); + return DistanceMetric::Euclidean; +} + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h new file mode 100644 index 00000000000..ec78360b216 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h @@ -0,0 +1,53 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "fieldsearcher.h" +#include <vespa/eval/eval/value_type.h> +#include <vespa/searchcommon/attribute/distance_metric.h> +#include <vespa/searchlib/tensor/distance_calculator.h> +#include <vespa/searchlib/tensor/tensor_ext_attribute.h> + +namespace search::fef { class IQueryEnvironment; } + +namespace search::tensor { +class TensorExtAttribute; +} + +namespace search::streaming { class NearestNeighborQueryNode; } + +namespace vsm { + +/** + * Class used to perform exact nearest neighbor search over the streamed values of a tensor field. + * + * The raw score from the distance calculation is stored in NearestNeighborQueryNode instances + * searching this field. + */ +class NearestNeighborFieldSearcher : public FieldSearcher { +private: + struct NodeAndCalc { + search::streaming::NearestNeighborQueryNode* node; + std::unique_ptr<search::tensor::DistanceCalculator> calc; + double distance_threshold; + NodeAndCalc(search::streaming::NearestNeighborQueryNode* node_in, + std::unique_ptr<search::tensor::DistanceCalculator> calc_in); + }; + search::attribute::DistanceMetric _metric; + std::unique_ptr<search::tensor::TensorExtAttribute> _attr; + std::vector<NodeAndCalc> _calcs; + +public: + NearestNeighborFieldSearcher(const FieldIdT& fid, + search::attribute::DistanceMetric metric); + + std::unique_ptr<FieldSearcher> duplicate() const override; + // TODO: change FieldSearcher::prepare() to provide the necessary objects. + void prepare_new(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, + const vespalib::eval::ValueType& tensor_type, search::fef::IQueryEnvironment& query_env); + void onValue(const document::FieldValue& fv) override; + + static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value); +}; + +} |