diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2020-03-16 14:24:56 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-16 14:24:56 +0100 |
commit | ed775e961f076df97d7393f27e6cc865607ddc21 (patch) | |
tree | d5fb8e8ca8e129231b361c7bfce227447dc75ca8 /searchlib | |
parent | 3e2650cd1bec18a177d33a7f88335bbd4132bd1c (diff) | |
parent | 7fd2d7d18d88a1decb3ec4c1b74eb93f5659ec0c (diff) |
Merge pull request #12552 from vespa-engine/arnej/polymorphic-distance-feature
Arnej/polymorphic distance feature
Diffstat (limited to 'searchlib')
7 files changed, 332 insertions, 20 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index d754fd78394..30f7fc4f54c 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -134,6 +134,7 @@ vespa_define_module( src/tests/features/item_raw_score src/tests/features/max_reduce_prod_join_replacer src/tests/features/native_dot_product + src/tests/features/nns_distance src/tests/features/ranking_expression src/tests/features/raw_score src/tests/features/subqueries diff --git a/searchlib/src/tests/features/nns_distance/CMakeLists.txt b/searchlib/src/tests/features/nns_distance/CMakeLists.txt new file mode 100644 index 00000000000..5b4c8f86b44 --- /dev/null +++ b/searchlib/src/tests/features/nns_distance/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(searchlib_nns_distance_test_app TEST + SOURCES + nns_distance_test.cpp + DEPENDS + searchlib +) +vespa_add_test(NAME searchlib_nns_distance_test_app COMMAND searchlib_nns_distance_test_app) diff --git a/searchlib/src/tests/features/nns_distance/nns_distance_test.cpp b/searchlib/src/tests/features/nns_distance/nns_distance_test.cpp new file mode 100644 index 00000000000..ea864f9b7a0 --- /dev/null +++ b/searchlib/src/tests/features/nns_distance/nns_distance_test.cpp @@ -0,0 +1,177 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/searchlib/features/setup.h> +#include <vespa/searchlib/fef/test/indexenvironment.h> +#include <vespa/searchlib/fef/test/indexenvironmentbuilder.h> +#include <vespa/searchlib/fef/test/queryenvironment.h> +#include <vespa/searchlib/features/distancefeature.h> +#include <vespa/searchlib/fef/fef.h> +#include <vespa/searchlib/fef/test/dummy_dependency_handler.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/util/stringfmt.h> + +using search::feature_t; +using namespace search::fef; +using namespace search::fef::test; +using namespace search::features; +using CollectionType = FieldInfo::CollectionType; +using DataType = FieldInfo::DataType; + +const vespalib::string labelFeatureName("distance(label)"); +const vespalib::string fieldFeatureName("distance(bar)"); + +struct BlueprintFactoryFixture { + BlueprintFactory factory; + BlueprintFactoryFixture() : factory() + { + setup_search_features(factory); + } +}; + +struct IndexFixture { + IndexEnvironment indexEnv; + IndexFixture() : indexEnv() + { + IndexEnvironmentBuilder builder(indexEnv); + builder.addField(FieldType::ATTRIBUTE, CollectionType::SINGLE, DataType::INT64, "foo"); + builder.addField(FieldType::ATTRIBUTE, CollectionType::SINGLE, DataType::TENSOR, "bar"); + } +}; + +struct FeatureDumpFixture : public IDumpFeatureVisitor { + virtual void visitDumpFeature(const vespalib::string &) override { + TEST_ERROR("no features should be dumped"); + } + FeatureDumpFixture() : IDumpFeatureVisitor() {} +}; + +struct Labels { + virtual void inject(Properties &p) const = 0; + virtual ~Labels() {} +}; +struct NoLabel : public Labels { + virtual void inject(Properties &) const override {} +}; +struct SingleLabel : public Labels { + vespalib::string label; + uint32_t uid; + SingleLabel(const vespalib::string &l, uint32_t x) : label(l), uid(x) {} + virtual void inject(Properties &p) const override { + vespalib::asciistream key; + key << "vespa.label." << label << ".id"; + vespalib::asciistream value; + value << uid; + p.add(key.str(), value.str()); + } +}; + +struct RankFixture : BlueprintFactoryFixture, IndexFixture { + QueryEnvironment queryEnv; + RankSetup rankSetup; + MatchDataLayout mdl; + MatchData::UP match_data; + RankProgram::UP rankProgram; + std::vector<TermFieldHandle> fooHandles; + std::vector<TermFieldHandle> barHandles; + RankFixture(size_t fooCnt, size_t barCnt, const Labels &labels, const vespalib::string &featureName) + : queryEnv(&indexEnv), rankSetup(factory, indexEnv), + mdl(), match_data(), rankProgram(), fooHandles(), barHandles() + { + for (size_t i = 0; i < fooCnt; ++i) { + uint32_t fieldId = indexEnv.getFieldByName("foo")->id(); + fooHandles.push_back(mdl.allocTermField(fieldId)); + SimpleTermData term; + term.setUniqueId(i + 1); + term.addField(fieldId).setHandle(fooHandles.back()); + queryEnv.getTerms().push_back(term); + } + for (size_t i = 0; i < barCnt; ++i) { + uint32_t fieldId = indexEnv.getFieldByName("bar")->id(); + barHandles.push_back(mdl.allocTermField(fieldId)); + SimpleTermData term; + term.setUniqueId(fooCnt + i + 1); + term.addField(fieldId).setHandle(barHandles.back()); + queryEnv.getTerms().push_back(term); + } + labels.inject(queryEnv.getProperties()); + rankSetup.setFirstPhaseRank(featureName); + rankSetup.setIgnoreDefaultRankFeatures(true); + ASSERT_TRUE(rankSetup.compile()); + match_data = mdl.createMatchData(); + rankProgram = rankSetup.create_first_phase_program(); + rankProgram->setup(*match_data, queryEnv); + } + feature_t getScore(uint32_t docId) { + return Utils::getScoreFeature(*rankProgram, docId); + } + void setScore(TermFieldHandle handle, uint32_t docId, feature_t score) { + match_data->resolveTermField(handle)->setRawScore(docId, score); + } + void setFooScore(uint32_t i, uint32_t docId, feature_t score) { + ASSERT_LESS(i, fooHandles.size()); + setScore(fooHandles[i], docId, score); + } + void setBarScore(uint32_t i, uint32_t docId, feature_t score) { + ASSERT_LESS(i, barHandles.size()); + setScore(barHandles[i], docId, score); + } +}; + +TEST_F("require that blueprint can be created from factory", BlueprintFactoryFixture) { + Blueprint::SP bp = f.factory.createBlueprint("distance"); + EXPECT_TRUE(bp.get() != 0); + EXPECT_TRUE(dynamic_cast<DistanceBlueprint*>(bp.get()) != 0); +} + +TEST_FFF("require that no features are dumped", DistanceBlueprint, IndexFixture, FeatureDumpFixture) { + f1.visitDumpFeatures(f2.indexEnv, f3); +} + +TEST_FF("require that setup can be done on random label", DistanceBlueprint, IndexFixture) { + DummyDependencyHandler deps(f1); + f1.setName(vespalib::make_string("%s(random_label)", f1.getBaseName().c_str())); + EXPECT_TRUE(static_cast<Blueprint&>(f1).setup(f2.indexEnv, std::vector<vespalib::string>(1, "random_label"))); +} + +TEST_FF("require that no label gives max-double distance", NoLabel(), RankFixture(2, 2, f1, labelFeatureName)) { + EXPECT_EQUAL(std::numeric_limits<feature_t>::max(), f2.getScore(10)); +} + +TEST_FF("require that unrelated label gives max-double distance", SingleLabel("unrelated", 1), RankFixture(2, 2, f1, labelFeatureName)) { + EXPECT_EQUAL(std::numeric_limits<feature_t>::max(), f2.getScore(10)); +} + +TEST_FF("require that labeled item raw score can be obtained", SingleLabel("label", 1), RankFixture(2, 2, f1, labelFeatureName)) { + f2.setFooScore(0, 10, 5.0); + EXPECT_EQUAL(5.0, f2.getScore(10)); +} + +TEST_FF("require that field raw score can be obtained", NoLabel(), RankFixture(2, 2, f1, fieldFeatureName)) { + f2.setBarScore(0, 10, 5.0); + EXPECT_EQUAL(5.0, f2.getScore(10)); +} + +TEST_FF("require that other raw scores are ignored", SingleLabel("label", 2), RankFixture(2, 2, f1, labelFeatureName)) { + f2.setFooScore(0, 10, 1.0); + f2.setFooScore(1, 10, 2.0); + f2.setBarScore(0, 10, 5.0); + f2.setBarScore(1, 10, 6.0); + EXPECT_EQUAL(2.0, f2.getScore(10)); +} + +TEST_FF("require that the correct raw score is used", NoLabel(), RankFixture(2, 2, f1, fieldFeatureName)) { + f2.setFooScore(0, 10, 3.0); + f2.setFooScore(1, 10, 4.0); + f2.setBarScore(0, 10, 8.0); + f2.setBarScore(1, 10, 7.0); + EXPECT_EQUAL(7.0, f2.getScore(10)); +} + +TEST_FF("require that stale data is ignored", SingleLabel("label", 2), RankFixture(2, 2, f1, labelFeatureName)) { + f2.setFooScore(0, 10, 1.0); + f2.setFooScore(1, 5, 2.0); + EXPECT_EQUAL(std::numeric_limits<feature_t>::max(), f2.getScore(10)); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/features/prod_features.cpp b/searchlib/src/tests/features/prod_features.cpp index 5f70085b9ec..253749dcdb7 100644 --- a/searchlib/src/tests/features/prod_features.cpp +++ b/searchlib/src/tests/features/prod_features.cpp @@ -474,11 +474,11 @@ Test::testCloseness() } { // Test executor. - assertCloseness(1, "pos", 0); + TEST_DO(assertCloseness(1, "pos", 0)); assertCloseness(0.8, "pos", 1802661); assertCloseness(0, "pos", 9013306); // use non-existing attribute -> default distance - assertCloseness(0, "no", 0); + TEST_DO(assertCloseness(0, "no", 0)); // use non-default maxDistance assertCloseness(1, "pos", 0, 100); @@ -852,15 +852,23 @@ Test::testDistance() { // test default distance { // non-existing attribute FtFeatureTest ft(_factory, "distance(pos)"); + ft.getIndexEnv().getBuilder().addField(FieldType::ATTRIBUTE, CollectionType::SINGLE, DataType::INT64, "pos"); ft.getQueryEnv().getLocation().setValid(true); ASSERT_TRUE(ft.setup()); ASSERT_TRUE(ft.execute(RankResult().addScore("distance(pos)", 6400000000.0))); } + { // non-existing field + FtFeatureTest ft(_factory, "distance(pos)"); + ft.getQueryEnv().getLocation().setValid(true); + ASSERT_TRUE(ft.setup()); + ASSERT_TRUE(ft.execute(RankResult().addScore("distance(pos)", std::numeric_limits<feature_t>::max()))); + } { // wrong attribute type (float) FtFeatureTest ft(_factory, "distance(pos)"); AttributePtr pos = AttributeFactory::createAttribute("pos", AVC(AVBT::FLOAT, AVCT::SINGLE)); pos->commit(); ft.getIndexEnv().getAttributeMap().add(pos); + ft.getIndexEnv().getBuilder().addField(FieldType::ATTRIBUTE, CollectionType::SINGLE, DataType::INT64, "pos"); ft.getQueryEnv().getLocation().setValid(true); ASSERT_TRUE(ft.setup()); ASSERT_TRUE(ft.execute(RankResult().addScore("distance(pos)", 6400000000.0))); @@ -870,6 +878,7 @@ Test::testDistance() AttributePtr pos = AttributeFactory::createAttribute("pos", AVC(AVBT::STRING, AVCT::SINGLE)); pos->commit(); ft.getIndexEnv().getAttributeMap().add(pos); + ft.getIndexEnv().getBuilder().addField(FieldType::ATTRIBUTE, CollectionType::SINGLE, DataType::INT64, "pos"); ft.getQueryEnv().getLocation().setValid(true); ASSERT_TRUE(ft.setup()); ASSERT_TRUE(ft.execute(RankResult().addScore("distance(pos)", 6400000000.0))); @@ -879,6 +888,7 @@ Test::testDistance() AttributePtr pos = AttributeFactory::createAttribute("pos", AVC(AVBT::INT64, AVCT::WSET)); pos->commit(); ft.getIndexEnv().getAttributeMap().add(pos); + ft.getIndexEnv().getBuilder().addField(FieldType::ATTRIBUTE, CollectionType::WEIGHTEDSET, DataType::INT64, "pos"); ft.getQueryEnv().getLocation().setValid(true); ASSERT_TRUE(ft.setup()); ASSERT_TRUE(ft.execute(RankResult().addScore("distance(pos)", 6400000000.0))); @@ -896,6 +906,7 @@ Test::setupForDistanceTest(FtFeatureTest &ft, const vespalib::string & attrName, pos->addReservedDoc(); pos->addDocs(1); ft.getIndexEnv().getAttributeMap().add(pos); + ft.getIndexEnv().getBuilder().addField(FieldType::ATTRIBUTE, CollectionType::ARRAY, DataType::INT64, attrName); auto ia = dynamic_cast<IntegerAttribute *>(pos.get()); for (const auto & p : positions) { diff --git a/searchlib/src/vespa/searchlib/features/distancefeature.cpp b/searchlib/src/vespa/searchlib/features/distancefeature.cpp index 54863d310e7..588fdcf17b7 100644 --- a/searchlib/src/vespa/searchlib/features/distancefeature.cpp +++ b/searchlib/src/vespa/searchlib/features/distancefeature.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "distancefeature.h" +#include <vespa/searchcommon/common/schema.h> #include <vespa/searchlib/fef/location.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/document/datatype/positiondatatype.h> @@ -8,14 +9,76 @@ #include <vespa/vespalib/util/stash.h> #include <cmath> #include <limits> +#include "utils.h" #include <vespa/log/log.h> LOG_SETUP(".features.distancefeature"); using namespace search::fef; +using namespace search::index::schema; namespace search::features { +/** Implements the executor for converting NNS rawscore to a distance feature. */ +class ConvertRawscoreExecutor : public fef::FeatureExecutor { +private: + std::vector<fef::TermFieldHandle> _handles; + const fef::MatchData *_md; + void handle_bind_match_data(const fef::MatchData &md) override { + _md = &md; + } +public: + ConvertRawscoreExecutor(const fef::IQueryEnvironment &env, uint32_t fieldId); + ConvertRawscoreExecutor(const fef::IQueryEnvironment &env, const vespalib::string &label); + void execute(uint32_t docId) override; +}; + +ConvertRawscoreExecutor::ConvertRawscoreExecutor(const fef::IQueryEnvironment &env, uint32_t fieldId) + : _handles(), + _md(nullptr) +{ + _handles.reserve(env.getNumTerms()); + for (uint32_t i = 0; i < env.getNumTerms(); ++i) { + search::fef::TermFieldHandle handle = util::getTermFieldHandle(env, i, fieldId); + if (handle != search::fef::IllegalHandle) { + _handles.push_back(handle); + } + } +} + +ConvertRawscoreExecutor::ConvertRawscoreExecutor(const fef::IQueryEnvironment &env, const vespalib::string &label) + : _handles(), + _md(nullptr) +{ + const ITermData *term = util::getTermByLabel(env, label); + if (term != nullptr) { + // expect numFields() == 1 + for (uint32_t i = 0; i < term->numFields(); ++i) { + TermFieldHandle handle = term->field(i).getHandle(); + if (handle != IllegalHandle) { + _handles.push_back(handle); + } + } + } +} + +void +ConvertRawscoreExecutor::execute(uint32_t docId) +{ + feature_t output = std::numeric_limits<feature_t>::max(); + assert(_md); + for (auto handle : _handles) { + const TermFieldMatchData *tfmd = _md->resolveTermField(handle); + if (tfmd->getDocId() == docId) { + // add conversion from "closeness" RawScore later: + feature_t converted = tfmd->getRawScore(); + output = std::min(output, converted); + } + } + outputs().set_number(0, output); +} + + feature_t DistanceExecutor::calculateDistance(uint32_t docId) { @@ -82,7 +145,11 @@ const feature_t DistanceExecutor::DEFAULT_DISTANCE(6400000000.0); DistanceBlueprint::DistanceBlueprint() : Blueprint("distance"), - _posAttr() + _arg_string(), + _attr_id(search::index::Schema::UNKNOWN_FIELD_ID), + _use_geo_pos(false), + _use_nns_tensor(false), + _use_item_label(false) { } @@ -101,30 +168,70 @@ DistanceBlueprint::createInstance() const } bool +DistanceBlueprint::setup_geopos(const IIndexEnvironment & env, + const vespalib::string &attr) +{ + _arg_string = attr; + _use_geo_pos = true; + describeOutput("out", "The euclidean distance from the query position."); + env.hintAttributeAccess(_arg_string); + return true; +} + +bool +DistanceBlueprint::setup_nns(const IIndexEnvironment & env, + const vespalib::string &attr) +{ + _arg_string = attr; + _use_nns_tensor = true; + describeOutput("out", "The euclidean distance from the query position."); + env.hintAttributeAccess(_arg_string); + return true; +} + +bool DistanceBlueprint::setup(const IIndexEnvironment & env, const ParameterList & params) { - _posAttr = params[0].getValue(); - describeOutput("out", "The euclidean distance from the query position."); - env.hintAttributeAccess(_posAttr); - env.hintAttributeAccess(document::PositionDataType::getZCurveFieldName(_posAttr)); + vespalib::string arg = params[0].getValue(); + const FieldInfo *fi = env.getFieldByName(arg); + if (fi != nullptr && fi->hasAttribute()) { + auto dt = fi->get_data_type(); + auto ct = fi->collection(); + if (dt == DataType::TENSOR && ct == CollectionType::SINGLE) { + _attr_id = fi->id(); + return setup_nns(env, arg); + } + // could check if dt is DataType::INT64 + // could check if ct is CollectionType::SINGLE or CollectionType::ARRAY) + return setup_geopos(env, arg); + } + vespalib::string z = document::PositionDataType::getZCurveFieldName(arg); + fi = env.getFieldByName(z); + if (fi != nullptr && fi->hasAttribute()) { + return setup_geopos(env, z); + } + _arg_string = arg; + _use_item_label = true; + describeOutput("out", "The euclidean distance from the labeled query item."); return true; } FeatureExecutor & DistanceBlueprint::createExecutor(const IQueryEnvironment &env, vespalib::Stash &stash) const { + if (_use_nns_tensor) { + return stash.create<ConvertRawscoreExecutor>(env, _attr_id); + } + if (_use_item_label) { + return stash.create<ConvertRawscoreExecutor>(env, _arg_string); + } const search::attribute::IAttributeVector * pos = nullptr; const Location & location = env.getLocation(); - LOG(debug, "DistanceBlueprint::createExecutor location.valid='%s', '%s', alternatively '%s'", - location.isValid() ? "true" : "false", _posAttr.c_str(), document::PositionDataType::getZCurveFieldName(_posAttr).c_str()); - if (location.isValid()) { - pos = env.getAttributeContext().getAttribute(_posAttr); - if (pos == nullptr) { - LOG(debug, "Failed to find attribute '%s', resorting too '%s'", - _posAttr.c_str(), document::PositionDataType::getZCurveFieldName(_posAttr).c_str()); - pos = env.getAttributeContext().getAttribute(document::PositionDataType::getZCurveFieldName(_posAttr)); - } + LOG(debug, "DistanceBlueprint::createExecutor location.valid='%s', attribute='%s'", + location.isValid() ? "true" : "false", _arg_string.c_str()); + if (_use_geo_pos && location.isValid()) { + pos = env.getAttributeContext().getAttribute(_arg_string); if (pos != nullptr) { if (!pos->isIntegerType()) { LOG(warning, "The position attribute '%s' is not an integer attribute. Will use default distance.", @@ -136,7 +243,7 @@ DistanceBlueprint::createExecutor(const IQueryEnvironment &env, vespalib::Stash pos = nullptr; } } else { - LOG(warning, "The position attribute '%s' was not found. Will use default distance.", _posAttr.c_str()); + LOG(warning, "The position attribute '%s' was not found. Will use default distance.", _arg_string.c_str()); } } diff --git a/searchlib/src/vespa/searchlib/features/distancefeature.h b/searchlib/src/vespa/searchlib/features/distancefeature.h index 3c75d53ad77..e0eaa98ac4c 100644 --- a/searchlib/src/vespa/searchlib/features/distancefeature.h +++ b/searchlib/src/vespa/searchlib/features/distancefeature.h @@ -12,7 +12,7 @@ namespace search::features { */ class DistanceExecutor : public fef::FeatureExecutor { private: - const fef::Location & _location; + const fef::Location & _location; const attribute::IAttributeVector * _pos; attribute::IntegerContent _intBuf; @@ -37,7 +37,14 @@ public: */ class DistanceBlueprint : public fef::Blueprint { private: - vespalib::string _posAttr; + vespalib::string _arg_string; + uint32_t _attr_id; + bool _use_geo_pos; + bool _use_nns_tensor; + bool _use_item_label; + + bool setup_geopos(const fef::IIndexEnvironment & env, const vespalib::string &attr); + bool setup_nns(const fef::IIndexEnvironment & env, const vespalib::string &attr); public: DistanceBlueprint(); diff --git a/searchlib/src/vespa/searchlib/features/raw_score_feature.h b/searchlib/src/vespa/searchlib/features/raw_score_feature.h index 2a4eb946a68..42813cf9d22 100644 --- a/searchlib/src/vespa/searchlib/features/raw_score_feature.h +++ b/searchlib/src/vespa/searchlib/features/raw_score_feature.h @@ -10,7 +10,7 @@ class RawScoreExecutor : public fef::FeatureExecutor { private: std::vector<fef::TermFieldHandle> _handles; - const fef::MatchData *_md; + const fef::MatchData *_md; void handle_bind_match_data(const fef::MatchData &md) override; public: |