aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2023-04-19 15:49:23 +0000
committerGeir Storli <geirst@yahooinc.com>2023-04-20 08:27:54 +0000
commitb032a769ce1cb0518d5fe96014c794837a4641bd (patch)
tree4f69ce6c0eb646ffba6220d6d25402a699877aaf
parent92101782a9ffebe1d815fbefa5d3ae0c7eab57f1 (diff)
Add exact nearest neighbor searcher over the streamed values of a tensor field.
Note: Integration into the searchvisitor remains.
-rw-r--r--streamingvisitors/CMakeLists.txt9
-rw-r--r--streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt12
-rw-r--r--streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp152
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt1
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp139
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h53
6 files changed, 362 insertions, 4 deletions
diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt
index 1e62203ea03..52e19b25900 100644
--- a/streamingvisitors/CMakeLists.txt
+++ b/streamingvisitors/CMakeLists.txt
@@ -18,15 +18,16 @@ vespa_define_module(
src/vespa/vsm/vsm
TESTS
- src/tests/hitcollector
- src/tests/matching_elements_filler
- src/tests/querywrapper
- src/tests/searchvisitor
src/tests/charbuffer
src/tests/docsum
src/tests/document
+ src/tests/hitcollector
+ src/tests/matching_elements_filler
+ src/tests/nearest_neighbor_field_searcher
src/tests/query_term_filter_factory
+ src/tests/querywrapper
src/tests/rank_processor
src/tests/searcher
+ src/tests/searchvisitor
src/tests/textutil
)
diff --git a/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt
new file mode 100644
index 00000000000..297e939ea72
--- /dev/null
+++ b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vsm_nearest_neighbor_field_searcher_test_app TEST
+ SOURCES
+ nearest_neighbor_field_searcher_test.cpp
+ DEPENDS
+ searchlib
+ searchlib_test
+ streamingvisitors
+ GTest::GTest
+)
+
+vespa_add_test(NAME vsm_nearest_neighbor_field_searcher_test_app COMMAND vsm_nearest_neighbor_field_searcher_test_app)
diff --git a/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp
new file mode 100644
index 00000000000..70c3ae1002f
--- /dev/null
+++ b/streamingvisitors/src/tests/nearest_neighbor_field_searcher/nearest_neighbor_field_searcher_test.cpp
@@ -0,0 +1,152 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/document/datatype/tensor_data_type.h>
+#include <vespa/document/fieldvalue/tensorfieldvalue.h>
+#include <vespa/eval/eval/simple_value.h>
+#include <vespa/eval/eval/tensor_spec.h>
+#include <vespa/eval/eval/value_codec.h>
+#include <vespa/eval/eval/value_type.h>
+#include <vespa/searchlib/fef/indexproperties.h>
+#include <vespa/searchlib/fef/properties.h>
+#include <vespa/searchlib/fef/tablemanager.h>
+#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h>
+#include <vespa/searchlib/tensor/euclidean_distance.h>
+#include <vespa/searchlib/test/mock_attribute_manager.h>
+#include <vespa/searchvisitor/indexenvironment.h>
+#include <vespa/searchvisitor/queryenvironment.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <vespa/vsm/searcher/nearest_neighbor_field_searcher.h>
+
+using namespace search::attribute::test;
+using namespace search::attribute;
+using namespace search::fef;
+using namespace search::streaming;
+using namespace search::tensor;
+using namespace vespalib::eval;
+using namespace vsm;
+
+using document::TensorFieldValue;
+using document::TensorDataType;
+
+struct MockQuery {
+ std::vector<std::unique_ptr<NearestNeighborQueryNode>> nodes;
+ QueryTermList term_list;
+ MockQuery& add(const vespalib::string& query_tensor_name,
+ double distance_threshold) {
+ std::unique_ptr<QueryNodeResultBase> base;
+ auto node = std::make_unique<NearestNeighborQueryNode>(std::move(base), query_tensor_name, "my_tensor_field", 7, search::query::Weight(11), distance_threshold);
+ nodes.push_back(std::move(node));
+ term_list.push_back(nodes.back().get());
+ return *this;
+ }
+ ~MockQuery() {}
+ const NearestNeighborQueryNode& get(size_t idx) const {
+ assert(idx < nodes.size());
+ return *nodes[idx];
+ }
+ void reset() {
+ for (auto term : term_list) {
+ term->reset();
+ }
+ }
+};
+
+class NearestNeighborSearcherTest : public testing::Test {
+public:
+ TableManager table_mgr;
+ streaming::IndexEnvironment index_env;
+ MockAttributeManager attr_mgr;
+ Properties query_props;
+ streaming::QueryEnvironment query_env;
+ ValueType tensor_type;
+ TensorDataType data_type;
+ SquaredEuclideanDistance dist_func;
+ NearestNeighborFieldSearcher searcher;
+ MockQuery query;
+
+ NearestNeighborSearcherTest()
+ : table_mgr(),
+ index_env(table_mgr),
+ attr_mgr(),
+ query_props(),
+ query_env("", index_env, query_props, &attr_mgr),
+ tensor_type(ValueType::from_spec("tensor(x[2])")),
+ data_type(tensor_type),
+ dist_func(CellType::DOUBLE),
+ searcher(7, DistanceMetric::Euclidean),
+ query()
+ {
+ }
+ void set_query_tensor(const vespalib::string& query_tensor_name,
+ const vespalib::string& spec_expr) {
+ search::fef::indexproperties::type::QueryFeature::set(index_env.getProperties(), query_tensor_name, tensor_type.to_spec());
+ auto tensor = SimpleValue::from_spec(TensorSpec::from_expr(spec_expr));
+ vespalib::nbostream stream;
+ vespalib::eval::encode_value(*tensor, stream);
+ query_props.add(query_tensor_name, vespalib::stringref(stream.peek(), stream.size()));
+ }
+ void prepare() {
+ auto searcher_buf = std::make_shared<SearcherBuf>();
+ searcher.prepare_new(query.term_list, searcher_buf, tensor_type, query_env);
+ }
+ void match(const vespalib::string& spec_expr) {
+ TensorFieldValue fv(data_type);
+ auto tensor = SimpleValue::from_spec(TensorSpec::from_expr(spec_expr));
+ fv = std::move(tensor);
+ query.reset();
+ searcher.onValue(fv);
+ }
+ void expect_match(double exp_square_distance, const NearestNeighborQueryNode& node) {
+ double exp_raw_score = dist_func.to_rawscore(exp_square_distance);
+ EXPECT_TRUE(node.evaluate());
+ EXPECT_DOUBLE_EQ(exp_raw_score, node.get_raw_score().value());
+ }
+};
+
+TEST_F(NearestNeighborSearcherTest, raw_score_calculated_with_distance_threshold)
+{
+ query.add("qt1", 3);
+ set_query_tensor("qt1", "tensor(x[2]):[1,3]");
+ prepare();
+
+ match("tensor(x[2]):[1,5]");
+ expect_match((5-3)*(5-3), query.get(0));
+
+ match("tensor(x[2]):[1,6]");
+ expect_match((6-3)*(6-3), query.get(0));
+
+ match("tensor(x[2]):[1,7]");
+ // This is not a match since ((7-3)*(7-3) = 16) is larger than the internal distance threshold of (3*3 = 9).
+ EXPECT_FALSE(query.get(0).evaluate());
+}
+
+TEST_F(NearestNeighborSearcherTest, raw_score_calculated_for_two_query_operators)
+{
+ query.add("qt1", 3);
+ query.add("qt2", 4);
+ set_query_tensor("qt1", "tensor(x[2]):[1,3]");
+ set_query_tensor("qt2", "tensor(x[2]):[1,4]");
+ prepare();
+
+ match("tensor(x[2]):[1,5]");
+ expect_match((5-3)*(5-3), query.get(0));
+ expect_match((5-4)*(5-4), query.get(1));
+
+ match("tensor(x[2]):[1,7]");
+ // This is not a match since ((7-3)*(7-3) = 16) is larger than the internal distance threshold of (3*3 = 9).
+ EXPECT_FALSE(query.get(0).evaluate());
+ expect_match((7-4)*(7-4), query.get(1));
+}
+
+TEST_F(NearestNeighborSearcherTest, distance_metric_from_string)
+{
+ using NNFS = NearestNeighborFieldSearcher;
+ EXPECT_EQ(DistanceMetric::Euclidean, NNFS::distance_metric_from_string("EUCLIDEAN"));
+ EXPECT_EQ(DistanceMetric::Angular, NNFS::distance_metric_from_string("ANGULAR"));
+ EXPECT_EQ(DistanceMetric::GeoDegrees, NNFS::distance_metric_from_string("GEODEGREES"));
+ EXPECT_EQ(DistanceMetric::InnerProduct, NNFS::distance_metric_from_string("INNERPRODUCT"));
+ EXPECT_EQ(DistanceMetric::Hamming, NNFS::distance_metric_from_string("HAMMING"));
+ EXPECT_EQ(DistanceMetric::Euclidean, NNFS::distance_metric_from_string("not_available"));
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 0a2a9ec21d2..730cba4abba 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -15,6 +15,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
futf8strchrfieldsearcher.cpp
geo_pos_field_searcher.cpp
intfieldsearcher.cpp
+ nearest_neighbor_field_searcher.cpp
strchrfieldsearcher.cpp
utf8flexiblestringfieldsearcher.cpp
utf8strchrfieldsearcher.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
new file mode 100644
index 00000000000..82243d67e5a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -0,0 +1,139 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "nearest_neighbor_field_searcher.h"
+#include <vespa/document/fieldvalue/tensorfieldvalue.h>
+#include <vespa/searchcommon/attribute/config.h>
+#include <vespa/searchlib/fef/iqueryenvironment.h>
+#include <vespa/searchlib/fef/query_value.h>
+#include <vespa/searchlib/query/streaming/nearest_neighbor_query_node.h>
+#include <vespa/searchlib/tensor/distance_calculator.h>
+#include <vespa/searchlib/tensor/distance_function.h>
+#include <vespa/searchlib/tensor/distance_function_factory.h>
+#include <vespa/searchlib/tensor/tensor_ext_attribute.h>
+#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/issue.h>
+
+using search::attribute::BasicType;
+using search::attribute::CollectionType;
+using search::attribute::Config;
+using search::fef::QueryValue;
+using search::tensor::DistanceCalculator;
+using search::tensor::TensorExtAttribute;
+using search::tensor::make_distance_function;
+using vespalib::eval::ValueType;
+
+namespace {
+
+constexpr uint32_t scratch_docid = 0;
+
+std::unique_ptr<TensorExtAttribute>
+make_attribute(const ValueType& tensor_type)
+{
+ Config cfg(BasicType::TENSOR, CollectionType::SINGLE);
+ cfg.setTensorType(tensor_type);
+ auto result = std::make_unique<TensorExtAttribute>("nnfs_attr", cfg);
+ uint32_t docid;
+ result->addDoc(docid);
+ assert(docid == scratch_docid);
+ return result;
+}
+
+}
+
+namespace vsm {
+
+NearestNeighborFieldSearcher::NodeAndCalc::NodeAndCalc(search::streaming::NearestNeighborQueryNode* node_in,
+ std::unique_ptr<search::tensor::DistanceCalculator> calc_in)
+ : node(node_in),
+ calc(std::move(calc_in)),
+ distance_threshold(calc->function().convert_threshold(node->get_distance_threshold()))
+{
+}
+
+NearestNeighborFieldSearcher::NearestNeighborFieldSearcher(const FieldIdT& fid,
+ search::attribute::DistanceMetric metric)
+ : FieldSearcher(fid),
+ _metric(metric),
+ _attr(),
+ _calcs()
+{
+}
+
+std::unique_ptr<FieldSearcher>
+NearestNeighborFieldSearcher::duplicate() const
+{
+ return std::make_unique<NearestNeighborFieldSearcher>(field(), _metric);
+}
+
+void
+NearestNeighborFieldSearcher::prepare_new(search::streaming::QueryTermList& qtl, const SharedSearcherBuf&,
+ const vespalib::eval::ValueType& tensor_type, search::fef::IQueryEnvironment& query_env)
+{
+ _attr = make_attribute(tensor_type);
+ _calcs.clear();
+ for (auto term : qtl) {
+ auto* nn_term = term->as_nearest_neighbor_query_node();
+ if (nn_term == nullptr) {
+ vespalib::Issue::report("Query term (%s) searching field %u is NOT a NearestNeighborQueryNode",
+ term->getClassName().c_str(), field());
+ continue;
+ }
+ auto query_value = QueryValue::from_config(nn_term->get_query_tensor_name(), query_env.getIndexEnvironment());
+ query_value.prepare_shared_state(query_env, query_env.getObjectStore());
+ const auto* tensor_value = query_value.lookup_value(query_env.getObjectStore());
+ if (tensor_value == nullptr) {
+ vespalib::Issue::report("Could not find query tensor for NearestNeighborQueryNode(%s, %s)",
+ nn_term->index().c_str(), nn_term->get_query_tensor_name().c_str());
+ continue;
+ }
+ try {
+ auto calc = DistanceCalculator::make_with_validation(*_attr, *tensor_value);
+ _calcs.emplace_back(nn_term, std::move(calc));
+ } catch (const vespalib::IllegalArgumentException& ex) {
+ vespalib::Issue::report("Could not create DistanceCalculator for NearestNeighborQueryNode(%s, %s): %s",
+ nn_term->index().c_str(), nn_term->get_query_tensor_name().c_str(), ex.what());
+ }
+ }
+}
+
+void
+NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
+{
+ if (fv.isA(document::FieldValue::Type::TENSOR)) {
+ const auto* tfv = dynamic_cast<const document::TensorFieldValue*>(&fv);
+ if (tfv && tfv->getAsTensorPtr()) {
+ _attr->add(*tfv->getAsTensorPtr(), 1);
+ for (auto& elem : _calcs) {
+ double distance = elem.calc->calc_with_limit(scratch_docid, elem.distance_threshold);
+ if (distance <= elem.distance_threshold) {
+ double score = elem.calc->function().to_rawscore(distance);
+ elem.node->set_raw_score(score);
+ }
+ }
+ }
+ }
+}
+
+search::attribute::DistanceMetric
+NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+{
+ using search::attribute::DistanceMetric;
+ // Valid string values must match the definition of DistanceMetric in
+ // config-model/src/main/java/com/yahoo/schema/document/Attribute.java
+ if (value == "EUCLIDEAN") {
+ return DistanceMetric::Euclidean;
+ } else if (value == "ANGULAR") {
+ return DistanceMetric::Angular;
+ } else if (value == "GEODEGREES") {
+ return DistanceMetric::GeoDegrees;
+ } else if (value == "INNERPRODUCT") {
+ return DistanceMetric::InnerProduct;
+ } else if (value == "HAMMING") {
+ return DistanceMetric::Hamming;
+ }
+ vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+ return DistanceMetric::Euclidean;
+}
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
new file mode 100644
index 00000000000..ec78360b216
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -0,0 +1,53 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "fieldsearcher.h"
+#include <vespa/eval/eval/value_type.h>
+#include <vespa/searchcommon/attribute/distance_metric.h>
+#include <vespa/searchlib/tensor/distance_calculator.h>
+#include <vespa/searchlib/tensor/tensor_ext_attribute.h>
+
+namespace search::fef { class IQueryEnvironment; }
+
+namespace search::tensor {
+class TensorExtAttribute;
+}
+
+namespace search::streaming { class NearestNeighborQueryNode; }
+
+namespace vsm {
+
+/**
+ * Class used to perform exact nearest neighbor search over the streamed values of a tensor field.
+ *
+ * The raw score from the distance calculation is stored in NearestNeighborQueryNode instances
+ * searching this field.
+ */
+class NearestNeighborFieldSearcher : public FieldSearcher {
+private:
+ struct NodeAndCalc {
+ search::streaming::NearestNeighborQueryNode* node;
+ std::unique_ptr<search::tensor::DistanceCalculator> calc;
+ double distance_threshold;
+ NodeAndCalc(search::streaming::NearestNeighborQueryNode* node_in,
+ std::unique_ptr<search::tensor::DistanceCalculator> calc_in);
+ };
+ search::attribute::DistanceMetric _metric;
+ std::unique_ptr<search::tensor::TensorExtAttribute> _attr;
+ std::vector<NodeAndCalc> _calcs;
+
+public:
+ NearestNeighborFieldSearcher(const FieldIdT& fid,
+ search::attribute::DistanceMetric metric);
+
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ // TODO: change FieldSearcher::prepare() to provide the necessary objects.
+ void prepare_new(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf,
+ const vespalib::eval::ValueType& tensor_type, search::fef::IQueryEnvironment& query_env);
+ void onValue(const document::FieldValue& fv) override;
+
+ static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+};
+
+}