diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-04-12 11:14:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-12 11:14:16 +0200 |
commit | 2aa11fa4aa657ef668a34b34b3fee2d1aae4e526 (patch) | |
tree | 19c17eab6f3305487fbc30e279e7dd4c967bfaee /searchlib | |
parent | 0bdf1aa3c98e1b985e64cd0483bc4709a429de4f (diff) | |
parent | 0ae79683fb9ed342c68d9ec5d35274b13e803d61 (diff) |
Merge pull request #22091 from vespa-engine/geirst/simplify-global-filter-brute-force-fallback-nearest-neighbor
Simplify calculation of global filter and fallback to brute force when using nearest neighbor search
Diffstat (limited to 'searchlib')
9 files changed, 82 insertions, 52 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index c77dfb2c2d2..1d3305d2c1a 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -1031,7 +1031,7 @@ public: return SimpleValue::from_spec(spec); } - std::unique_ptr<NearestNeighborBlueprint> make_blueprint(bool approximate = true, double brute_force_limit = 0.05) { + std::unique_ptr<NearestNeighborBlueprint> make_blueprint(bool approximate = true, double global_filter_lower_limit = 0.05) { search::queryeval::FieldSpec field("foo", 0, 0); auto bp = std::make_unique<NearestNeighborBlueprint>( field, @@ -1039,7 +1039,7 @@ public: createDenseTensor(vec_2d(17, 42)), 3, approximate, 5, 100100.25, - brute_force_limit); + global_filter_lower_limit, 1.0); EXPECT_EQUAL(11u, bp->getState().estimate().estHits); EXPECT_EQUAL(approximate, bp->may_approximate()); EXPECT_EQUAL(100100.25 * 100100.25, bp->get_distance_threshold()); @@ -1052,9 +1052,16 @@ public: DenseTensorAttributeWithoutIndex() : Fixture(vec_2d_spec, FixtureTraits().dense()) {} }; +using NNBA = NearestNeighborBlueprint::Algorithm; using NearestNeighborBlueprintFixture = NearestNeighborBlueprintFixtureBase<DenseTensorAttributeMockIndex>; using NearestNeighborBlueprintWithoutIndexFixture = NearestNeighborBlueprintFixtureBase<DenseTensorAttributeWithoutIndex>; +TEST_F("NN blueprint can use brute force", NearestNeighborBlueprintFixture) +{ + auto bp = f.make_blueprint(false); + EXPECT_EQUAL(NNBA::BRUTE_FORCE, bp->get_algorithm()); +} + TEST_F("NN blueprint handles empty filter", NearestNeighborBlueprintFixture) { auto bp = f.make_blueprint(); @@ -1062,6 +1069,7 @@ TEST_F("NN blueprint handles empty filter", NearestNeighborBlueprintFixture) bp->set_global_filter(*empty_filter); EXPECT_EQUAL(3u, bp->getState().estimate().estHits); EXPECT_TRUE(bp->may_approximate()); + EXPECT_EQUAL(NNBA::INDEX_TOP_K, bp->get_algorithm()); } TEST_F("NN blueprint handles strong filter", NearestNeighborBlueprintFixture) @@ -1074,6 +1082,7 @@ TEST_F("NN blueprint handles strong filter", NearestNeighborBlueprintFixture) bp->set_global_filter(*strong_filter); EXPECT_EQUAL(1u, bp->getState().estimate().estHits); EXPECT_TRUE(bp->may_approximate()); + EXPECT_EQUAL(NNBA::INDEX_TOP_K_WITH_FILTER, bp->get_algorithm()); } TEST_F("NN blueprint handles weak filter", NearestNeighborBlueprintFixture) @@ -1091,6 +1100,7 @@ TEST_F("NN blueprint handles weak filter", NearestNeighborBlueprintFixture) bp->set_global_filter(*weak_filter); EXPECT_EQUAL(3u, bp->getState().estimate().estHits); EXPECT_TRUE(bp->may_approximate()); + EXPECT_EQUAL(NNBA::INDEX_TOP_K_WITH_FILTER, bp->get_algorithm()); } TEST_F("NN blueprint handles strong filter triggering brute force search", NearestNeighborBlueprintFixture) @@ -1103,6 +1113,7 @@ TEST_F("NN blueprint handles strong filter triggering brute force search", Neare bp->set_global_filter(*strong_filter); EXPECT_EQUAL(11u, bp->getState().estimate().estHits); EXPECT_FALSE(bp->may_approximate()); + EXPECT_EQUAL(NNBA::BRUTE_FORCE_FALLBACK, bp->get_algorithm()); } TEST_F("NN blueprint wants global filter when having index", NearestNeighborBlueprintFixture) diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index bde73faf466..888156ce352 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -784,7 +784,8 @@ public: n.get_allow_approximate(), n.get_explore_additional_hits(), n.get_distance_threshold(), - getRequestContext().get_attribute_blueprint_params().nearest_neighbor_brute_force_limit)); + getRequestContext().get_attribute_blueprint_params().global_filter_lower_limit, + getRequestContext().get_attribute_blueprint_params().global_filter_upper_limit)); } void visit(query::FuzzyTerm &n) override { visitTerm(n); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index 3d975c60d7a..39f58c5382e 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -2,6 +2,8 @@ #pragma once +#include <vespa/searchlib/fef/indexproperties.h> + namespace search::attribute { /** @@ -9,15 +11,19 @@ namespace search::attribute { */ struct AttributeBlueprintParams { - double nearest_neighbor_brute_force_limit; - - AttributeBlueprintParams(double nearest_neighbor_brute_force_limit_in) - : nearest_neighbor_brute_force_limit(nearest_neighbor_brute_force_limit_in) + double global_filter_lower_limit; + double global_filter_upper_limit; + + AttributeBlueprintParams(double global_filter_lower_limit_in, + double global_filter_upper_limit_in) + : global_filter_lower_limit(global_filter_lower_limit_in), + global_filter_upper_limit(global_filter_upper_limit_in) { } AttributeBlueprintParams() - : AttributeBlueprintParams(0.05) + : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, + fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 45c1844b28f..168f5e4369f 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -384,25 +384,9 @@ MinHitsPerThread::lookup(const Properties &props, uint32_t defaultValue) return lookupUint32(props, NAME, defaultValue); } -const vespalib::string NearestNeighborBruteForceLimit::NAME("vespa.matching.nearest_neighbor.brute_force_limit"); - -const double NearestNeighborBruteForceLimit::DEFAULT_VALUE(0.05); - -double -NearestNeighborBruteForceLimit::lookup(const Properties &props) -{ - return lookup(props, DEFAULT_VALUE); -} - -double -NearestNeighborBruteForceLimit::lookup(const Properties &props, double defaultValue) -{ - return lookupDouble(props, NAME, defaultValue); -} - const vespalib::string GlobalFilterLowerLimit::NAME("vespa.matching.global_filter.lower_limit"); -const double GlobalFilterLowerLimit::DEFAULT_VALUE(0.0); +const double GlobalFilterLowerLimit::DEFAULT_VALUE(0.05); double GlobalFilterLowerLimit::lookup(const Properties &props) diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index 5096503da06..ce3798a6c8e 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -294,20 +294,6 @@ namespace matching { }; /** - * Property to control fallback to brute force search for nearest - * neighbor query terms. If the ratio of candidates in the global - * filter (which tracks the documents that can match the query - * based on the other parts of the query) is less than this limit - * then use brute force search. - **/ - struct NearestNeighborBruteForceLimit { - static const vespalib::string NAME; - static const double DEFAULT_VALUE; - static double lookup(const Properties &props); - static double lookup(const Properties &props, double defaultValue); - }; - - /** * Property to control fallback to not building a global filter * for a query with a blueprint that wants a global filter. If the * estimated ratio of matching documents is less than this limit diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index bf5598f0a5e..5663ae07686 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -120,7 +120,6 @@ RankSetup::configure() setSoftTimeoutEnabled(softtimeout::Enabled::lookup(_indexEnv.getProperties())); setSoftTimeoutTailCost(softtimeout::TailCost::lookup(_indexEnv.getProperties())); setSoftTimeoutFactor(softtimeout::Factor::lookup(_indexEnv.getProperties())); - set_nearest_neighbor_brute_force_limit(matching::NearestNeighborBruteForceLimit::lookup(_indexEnv.getProperties())); set_global_filter_lower_limit(matching::GlobalFilterLowerLimit::lookup(_indexEnv.getProperties())); set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index a917c82bc54..bce2af8fa24 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -407,9 +407,6 @@ public: void setSoftTimeoutFactor(double v) { _softTimeoutFactor = v; } double getSoftTimeoutFactor() const { return _softTimeoutFactor; } - void set_nearest_neighbor_brute_force_limit(double v) { _nearest_neighbor_brute_force_limit = v; } - double get_nearest_neighbor_brute_force_limit() const { return _nearest_neighbor_brute_force_limit; } - void set_global_filter_lower_limit(double v) { _global_filter_lower_limit = v; } double get_global_filter_lower_limit() const { return _global_filter_lower_limit; } void set_global_filter_upper_limit(double v) { _global_filter_upper_limit = v; } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp index 2170bc4ad2a..bdcbb3db633 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp @@ -51,13 +51,30 @@ struct ConvertCellsSelector } }; +vespalib::string +to_string(NearestNeighborBlueprint::Algorithm algorithm) +{ + using NNBA = NearestNeighborBlueprint::Algorithm; + switch (algorithm) { + case NNBA::BRUTE_FORCE: return "brute force"; + case NNBA::BRUTE_FORCE_FALLBACK: return "brute force fallback"; + case NNBA::INDEX_TOP_K: return "index top k"; + case NNBA::INDEX_TOP_K_WITH_FILTER: return "index top k using global filter"; + } + return "unknown"; +} + } // namespace <unnamed> NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& field, const tensor::ITensorAttribute& attr_tensor, std::unique_ptr<Value> query_tensor, - uint32_t target_num_hits, bool approximate, uint32_t explore_additional_hits, - double distance_threshold, double brute_force_limit) + uint32_t target_num_hits, + bool approximate, + uint32_t explore_additional_hits, + double distance_threshold, + double global_filter_lower_limit, + double global_filter_upper_limit) : ComplexLeafBlueprint(field), _attr_tensor(attr_tensor), _query_tensor(std::move(query_tensor)), @@ -65,10 +82,12 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f _approximate(approximate), _explore_additional_hits(explore_additional_hits), _distance_threshold(std::numeric_limits<double>::max()), - _brute_force_limit(brute_force_limit), + _global_filter_lower_limit(global_filter_lower_limit), + _global_filter_upper_limit(global_filter_upper_limit), _fallback_dist_fun(), _distance_heap(target_num_hits), _found_hits(), + _algorithm(Algorithm::BRUTE_FORCE), _global_filter(GlobalFilter::create()), _global_filter_hits(), _global_filter_hit_ratio() @@ -114,8 +133,9 @@ NearestNeighborBlueprint::set_global_filter(const GlobalFilter &global_filter) uint32_t max_hits = _global_filter->filter()->countTrueBits(); LOG(debug, "set_global_filter getNumDocs: %u / max_hits %u", est_hits, max_hits); double max_hit_ratio = static_cast<double>(max_hits) / est_hits; - if (max_hit_ratio < _brute_force_limit) { + if (max_hit_ratio < _global_filter_lower_limit) { _approximate = false; + _algorithm = Algorithm::BRUTE_FORCE_FALLBACK; LOG(debug, "too many hits filtered out, using brute force implementation"); } else { est_hits = std::min(est_hits, max_hits); @@ -142,8 +162,10 @@ NearestNeighborBlueprint::perform_top_k() if (_global_filter->has_filter()) { auto filter = _global_filter->filter(); _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); + _algorithm = Algorithm::INDEX_TOP_K_WITH_FILTER; } else { _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); + _algorithm = Algorithm::INDEX_TOP_K; } } } @@ -171,11 +193,14 @@ NearestNeighborBlueprint::visitMembers(vespalib::ObjectVisitor& visitor) const visitor.visitInt("explore_additional_hits", _explore_additional_hits); visitor.visitBool("approximate", _approximate); visitor.visitBool("has_index", _attr_tensor.nearest_neighbor_index()); - visitor.visitInt("top_k_found_hits", _found_hits.size()); + visitor.visitString("algorithm", to_string(_algorithm)); + visitor.visitInt("top_k_hits", _found_hits.size()); visitor.openStruct("global_filter", "GlobalFilter"); - visitor.visitBool("exists", (_global_filter && _global_filter->has_filter())); - visitor.visitFloat("brute_force_limit", _brute_force_limit); + visitor.visitBool("is_set", (_global_filter != nullptr)); + visitor.visitBool("has_filter", (_global_filter && _global_filter->has_filter())); + visitor.visitFloat("lower_limit", _global_filter_lower_limit); + visitor.visitFloat("upper_limit", _global_filter_upper_limit); if (_global_filter_hits.has_value()) { visitor.visitInt("hits", _global_filter_hits.value()); } @@ -191,4 +216,11 @@ NearestNeighborBlueprint::always_needs_unpack() const return true; } +std::ostream& +operator<<(std::ostream& out, NearestNeighborBlueprint::Algorithm algorithm) +{ + out << to_string(algorithm); + return out; +} + } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h index e318c53a25a..7922036dc42 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h @@ -19,6 +19,13 @@ namespace search::queryeval { * where the query point and document points are dense tensors of order 1. */ class NearestNeighborBlueprint : public ComplexLeafBlueprint { +public: + enum class Algorithm { + BRUTE_FORCE, + BRUTE_FORCE_FALLBACK, + INDEX_TOP_K, + INDEX_TOP_K_WITH_FILTER + }; private: const tensor::ITensorAttribute& _attr_tensor; std::unique_ptr<vespalib::eval::Value> _query_tensor; @@ -26,11 +33,13 @@ private: bool _approximate; uint32_t _explore_additional_hits; double _distance_threshold; - double _brute_force_limit; + double _global_filter_lower_limit; + double _global_filter_upper_limit; search::tensor::DistanceFunction::UP _fallback_dist_fun; const search::tensor::DistanceFunction *_dist_fun; mutable NearestNeighborDistanceHeap _distance_heap; std::vector<search::tensor::NearestNeighborIndex::Neighbor> _found_hits; + Algorithm _algorithm; std::shared_ptr<const GlobalFilter> _global_filter; std::optional<uint32_t> _global_filter_hits; std::optional<double> _global_filter_hit_ratio; @@ -42,7 +51,8 @@ public: std::unique_ptr<vespalib::eval::Value> query_tensor, uint32_t target_num_hits, bool approximate, uint32_t explore_additional_hits, double distance_threshold, - double brute_force_limit); + double global_filter_lower_limit, + double global_filter_upper_limit); NearestNeighborBlueprint(const NearestNeighborBlueprint&) = delete; NearestNeighborBlueprint& operator=(const NearestNeighborBlueprint&) = delete; ~NearestNeighborBlueprint(); @@ -51,6 +61,7 @@ public: uint32_t get_target_num_hits() const { return _target_num_hits; } void set_global_filter(const GlobalFilter &global_filter) override; bool may_approximate() const { return _approximate; } + Algorithm get_algorithm() const { return _algorithm; } double get_distance_threshold() const { return _distance_threshold; } std::unique_ptr<SearchIterator> createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda, @@ -59,4 +70,7 @@ public: bool always_needs_unpack() const override; }; +std::ostream& +operator<<(std::ostream& out, NearestNeighborBlueprint::Algorithm algorithm); + } |