summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2022-04-12 11:14:16 +0200
committerGitHub <noreply@github.com>2022-04-12 11:14:16 +0200
commit2aa11fa4aa657ef668a34b34b3fee2d1aae4e526 (patch)
tree19c17eab6f3305487fbc30e279e7dd4c967bfaee /searchlib
parent0bdf1aa3c98e1b985e64cd0483bc4709a429de4f (diff)
parent0ae79683fb9ed342c68d9ec5d35274b13e803d61 (diff)
Merge pull request #22091 from vespa-engine/geirst/simplify-global-filter-brute-force-fallback-nearest-neighbor
Simplify calculation of global filter and fallback to brute force when using nearest neighbor search
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp15
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h16
-rw-r--r--searchlib/src/vespa/searchlib/fef/indexproperties.cpp18
-rw-r--r--searchlib/src/vespa/searchlib/fef/indexproperties.h14
-rw-r--r--searchlib/src/vespa/searchlib/fef/ranksetup.cpp1
-rw-r--r--searchlib/src/vespa/searchlib/fef/ranksetup.h3
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp46
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h18
9 files changed, 82 insertions, 52 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index c77dfb2c2d2..1d3305d2c1a 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -1031,7 +1031,7 @@ public:
return SimpleValue::from_spec(spec);
}
- std::unique_ptr<NearestNeighborBlueprint> make_blueprint(bool approximate = true, double brute_force_limit = 0.05) {
+ std::unique_ptr<NearestNeighborBlueprint> make_blueprint(bool approximate = true, double global_filter_lower_limit = 0.05) {
search::queryeval::FieldSpec field("foo", 0, 0);
auto bp = std::make_unique<NearestNeighborBlueprint>(
field,
@@ -1039,7 +1039,7 @@ public:
createDenseTensor(vec_2d(17, 42)),
3, approximate, 5,
100100.25,
- brute_force_limit);
+ global_filter_lower_limit, 1.0);
EXPECT_EQUAL(11u, bp->getState().estimate().estHits);
EXPECT_EQUAL(approximate, bp->may_approximate());
EXPECT_EQUAL(100100.25 * 100100.25, bp->get_distance_threshold());
@@ -1052,9 +1052,16 @@ public:
DenseTensorAttributeWithoutIndex() : Fixture(vec_2d_spec, FixtureTraits().dense()) {}
};
+using NNBA = NearestNeighborBlueprint::Algorithm;
using NearestNeighborBlueprintFixture = NearestNeighborBlueprintFixtureBase<DenseTensorAttributeMockIndex>;
using NearestNeighborBlueprintWithoutIndexFixture = NearestNeighborBlueprintFixtureBase<DenseTensorAttributeWithoutIndex>;
+TEST_F("NN blueprint can use brute force", NearestNeighborBlueprintFixture)
+{
+ auto bp = f.make_blueprint(false);
+ EXPECT_EQUAL(NNBA::BRUTE_FORCE, bp->get_algorithm());
+}
+
TEST_F("NN blueprint handles empty filter", NearestNeighborBlueprintFixture)
{
auto bp = f.make_blueprint();
@@ -1062,6 +1069,7 @@ TEST_F("NN blueprint handles empty filter", NearestNeighborBlueprintFixture)
bp->set_global_filter(*empty_filter);
EXPECT_EQUAL(3u, bp->getState().estimate().estHits);
EXPECT_TRUE(bp->may_approximate());
+ EXPECT_EQUAL(NNBA::INDEX_TOP_K, bp->get_algorithm());
}
TEST_F("NN blueprint handles strong filter", NearestNeighborBlueprintFixture)
@@ -1074,6 +1082,7 @@ TEST_F("NN blueprint handles strong filter", NearestNeighborBlueprintFixture)
bp->set_global_filter(*strong_filter);
EXPECT_EQUAL(1u, bp->getState().estimate().estHits);
EXPECT_TRUE(bp->may_approximate());
+ EXPECT_EQUAL(NNBA::INDEX_TOP_K_WITH_FILTER, bp->get_algorithm());
}
TEST_F("NN blueprint handles weak filter", NearestNeighborBlueprintFixture)
@@ -1091,6 +1100,7 @@ TEST_F("NN blueprint handles weak filter", NearestNeighborBlueprintFixture)
bp->set_global_filter(*weak_filter);
EXPECT_EQUAL(3u, bp->getState().estimate().estHits);
EXPECT_TRUE(bp->may_approximate());
+ EXPECT_EQUAL(NNBA::INDEX_TOP_K_WITH_FILTER, bp->get_algorithm());
}
TEST_F("NN blueprint handles strong filter triggering brute force search", NearestNeighborBlueprintFixture)
@@ -1103,6 +1113,7 @@ TEST_F("NN blueprint handles strong filter triggering brute force search", Neare
bp->set_global_filter(*strong_filter);
EXPECT_EQUAL(11u, bp->getState().estimate().estHits);
EXPECT_FALSE(bp->may_approximate());
+ EXPECT_EQUAL(NNBA::BRUTE_FORCE_FALLBACK, bp->get_algorithm());
}
TEST_F("NN blueprint wants global filter when having index", NearestNeighborBlueprintFixture)
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index bde73faf466..888156ce352 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -784,7 +784,8 @@ public:
n.get_allow_approximate(),
n.get_explore_additional_hits(),
n.get_distance_threshold(),
- getRequestContext().get_attribute_blueprint_params().nearest_neighbor_brute_force_limit));
+ getRequestContext().get_attribute_blueprint_params().global_filter_lower_limit,
+ getRequestContext().get_attribute_blueprint_params().global_filter_upper_limit));
}
void visit(query::FuzzyTerm &n) override { visitTerm(n); }
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h
index 3d975c60d7a..39f58c5382e 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h
@@ -2,6 +2,8 @@
#pragma once
+#include <vespa/searchlib/fef/indexproperties.h>
+
namespace search::attribute {
/**
@@ -9,15 +11,19 @@ namespace search::attribute {
*/
struct AttributeBlueprintParams
{
- double nearest_neighbor_brute_force_limit;
-
- AttributeBlueprintParams(double nearest_neighbor_brute_force_limit_in)
- : nearest_neighbor_brute_force_limit(nearest_neighbor_brute_force_limit_in)
+ double global_filter_lower_limit;
+ double global_filter_upper_limit;
+
+ AttributeBlueprintParams(double global_filter_lower_limit_in,
+ double global_filter_upper_limit_in)
+ : global_filter_lower_limit(global_filter_lower_limit_in),
+ global_filter_upper_limit(global_filter_upper_limit_in)
{
}
AttributeBlueprintParams()
- : AttributeBlueprintParams(0.05)
+ : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE,
+ fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE)
{
}
};
diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp
index 45c1844b28f..168f5e4369f 100644
--- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp
+++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp
@@ -384,25 +384,9 @@ MinHitsPerThread::lookup(const Properties &props, uint32_t defaultValue)
return lookupUint32(props, NAME, defaultValue);
}
-const vespalib::string NearestNeighborBruteForceLimit::NAME("vespa.matching.nearest_neighbor.brute_force_limit");
-
-const double NearestNeighborBruteForceLimit::DEFAULT_VALUE(0.05);
-
-double
-NearestNeighborBruteForceLimit::lookup(const Properties &props)
-{
- return lookup(props, DEFAULT_VALUE);
-}
-
-double
-NearestNeighborBruteForceLimit::lookup(const Properties &props, double defaultValue)
-{
- return lookupDouble(props, NAME, defaultValue);
-}
-
const vespalib::string GlobalFilterLowerLimit::NAME("vespa.matching.global_filter.lower_limit");
-const double GlobalFilterLowerLimit::DEFAULT_VALUE(0.0);
+const double GlobalFilterLowerLimit::DEFAULT_VALUE(0.05);
double
GlobalFilterLowerLimit::lookup(const Properties &props)
diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h
index 5096503da06..ce3798a6c8e 100644
--- a/searchlib/src/vespa/searchlib/fef/indexproperties.h
+++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h
@@ -294,20 +294,6 @@ namespace matching {
};
/**
- * Property to control fallback to brute force search for nearest
- * neighbor query terms. If the ratio of candidates in the global
- * filter (which tracks the documents that can match the query
- * based on the other parts of the query) is less than this limit
- * then use brute force search.
- **/
- struct NearestNeighborBruteForceLimit {
- static const vespalib::string NAME;
- static const double DEFAULT_VALUE;
- static double lookup(const Properties &props);
- static double lookup(const Properties &props, double defaultValue);
- };
-
- /**
* Property to control fallback to not building a global filter
* for a query with a blueprint that wants a global filter. If the
* estimated ratio of matching documents is less than this limit
diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp
index bf5598f0a5e..5663ae07686 100644
--- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp
+++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp
@@ -120,7 +120,6 @@ RankSetup::configure()
setSoftTimeoutEnabled(softtimeout::Enabled::lookup(_indexEnv.getProperties()));
setSoftTimeoutTailCost(softtimeout::TailCost::lookup(_indexEnv.getProperties()));
setSoftTimeoutFactor(softtimeout::Factor::lookup(_indexEnv.getProperties()));
- set_nearest_neighbor_brute_force_limit(matching::NearestNeighborBruteForceLimit::lookup(_indexEnv.getProperties()));
set_global_filter_lower_limit(matching::GlobalFilterLowerLimit::lookup(_indexEnv.getProperties()));
set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties()));
_mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties());
diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h
index a917c82bc54..bce2af8fa24 100644
--- a/searchlib/src/vespa/searchlib/fef/ranksetup.h
+++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h
@@ -407,9 +407,6 @@ public:
void setSoftTimeoutFactor(double v) { _softTimeoutFactor = v; }
double getSoftTimeoutFactor() const { return _softTimeoutFactor; }
- void set_nearest_neighbor_brute_force_limit(double v) { _nearest_neighbor_brute_force_limit = v; }
- double get_nearest_neighbor_brute_force_limit() const { return _nearest_neighbor_brute_force_limit; }
-
void set_global_filter_lower_limit(double v) { _global_filter_lower_limit = v; }
double get_global_filter_lower_limit() const { return _global_filter_lower_limit; }
void set_global_filter_upper_limit(double v) { _global_filter_upper_limit = v; }
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
index 2170bc4ad2a..bdcbb3db633 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
@@ -51,13 +51,30 @@ struct ConvertCellsSelector
}
};
+vespalib::string
+to_string(NearestNeighborBlueprint::Algorithm algorithm)
+{
+ using NNBA = NearestNeighborBlueprint::Algorithm;
+ switch (algorithm) {
+ case NNBA::BRUTE_FORCE: return "brute force";
+ case NNBA::BRUTE_FORCE_FALLBACK: return "brute force fallback";
+ case NNBA::INDEX_TOP_K: return "index top k";
+ case NNBA::INDEX_TOP_K_WITH_FILTER: return "index top k using global filter";
+ }
+ return "unknown";
+}
+
} // namespace <unnamed>
NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& field,
const tensor::ITensorAttribute& attr_tensor,
std::unique_ptr<Value> query_tensor,
- uint32_t target_num_hits, bool approximate, uint32_t explore_additional_hits,
- double distance_threshold, double brute_force_limit)
+ uint32_t target_num_hits,
+ bool approximate,
+ uint32_t explore_additional_hits,
+ double distance_threshold,
+ double global_filter_lower_limit,
+ double global_filter_upper_limit)
: ComplexLeafBlueprint(field),
_attr_tensor(attr_tensor),
_query_tensor(std::move(query_tensor)),
@@ -65,10 +82,12 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f
_approximate(approximate),
_explore_additional_hits(explore_additional_hits),
_distance_threshold(std::numeric_limits<double>::max()),
- _brute_force_limit(brute_force_limit),
+ _global_filter_lower_limit(global_filter_lower_limit),
+ _global_filter_upper_limit(global_filter_upper_limit),
_fallback_dist_fun(),
_distance_heap(target_num_hits),
_found_hits(),
+ _algorithm(Algorithm::BRUTE_FORCE),
_global_filter(GlobalFilter::create()),
_global_filter_hits(),
_global_filter_hit_ratio()
@@ -114,8 +133,9 @@ NearestNeighborBlueprint::set_global_filter(const GlobalFilter &global_filter)
uint32_t max_hits = _global_filter->filter()->countTrueBits();
LOG(debug, "set_global_filter getNumDocs: %u / max_hits %u", est_hits, max_hits);
double max_hit_ratio = static_cast<double>(max_hits) / est_hits;
- if (max_hit_ratio < _brute_force_limit) {
+ if (max_hit_ratio < _global_filter_lower_limit) {
_approximate = false;
+ _algorithm = Algorithm::BRUTE_FORCE_FALLBACK;
LOG(debug, "too many hits filtered out, using brute force implementation");
} else {
est_hits = std::min(est_hits, max_hits);
@@ -142,8 +162,10 @@ NearestNeighborBlueprint::perform_top_k()
if (_global_filter->has_filter()) {
auto filter = _global_filter->filter();
_found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold);
+ _algorithm = Algorithm::INDEX_TOP_K_WITH_FILTER;
} else {
_found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold);
+ _algorithm = Algorithm::INDEX_TOP_K;
}
}
}
@@ -171,11 +193,14 @@ NearestNeighborBlueprint::visitMembers(vespalib::ObjectVisitor& visitor) const
visitor.visitInt("explore_additional_hits", _explore_additional_hits);
visitor.visitBool("approximate", _approximate);
visitor.visitBool("has_index", _attr_tensor.nearest_neighbor_index());
- visitor.visitInt("top_k_found_hits", _found_hits.size());
+ visitor.visitString("algorithm", to_string(_algorithm));
+ visitor.visitInt("top_k_hits", _found_hits.size());
visitor.openStruct("global_filter", "GlobalFilter");
- visitor.visitBool("exists", (_global_filter && _global_filter->has_filter()));
- visitor.visitFloat("brute_force_limit", _brute_force_limit);
+ visitor.visitBool("is_set", (_global_filter != nullptr));
+ visitor.visitBool("has_filter", (_global_filter && _global_filter->has_filter()));
+ visitor.visitFloat("lower_limit", _global_filter_lower_limit);
+ visitor.visitFloat("upper_limit", _global_filter_upper_limit);
if (_global_filter_hits.has_value()) {
visitor.visitInt("hits", _global_filter_hits.value());
}
@@ -191,4 +216,11 @@ NearestNeighborBlueprint::always_needs_unpack() const
return true;
}
+std::ostream&
+operator<<(std::ostream& out, NearestNeighborBlueprint::Algorithm algorithm)
+{
+ out << to_string(algorithm);
+ return out;
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
index e318c53a25a..7922036dc42 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.h
@@ -19,6 +19,13 @@ namespace search::queryeval {
* where the query point and document points are dense tensors of order 1.
*/
class NearestNeighborBlueprint : public ComplexLeafBlueprint {
+public:
+ enum class Algorithm {
+ BRUTE_FORCE,
+ BRUTE_FORCE_FALLBACK,
+ INDEX_TOP_K,
+ INDEX_TOP_K_WITH_FILTER
+ };
private:
const tensor::ITensorAttribute& _attr_tensor;
std::unique_ptr<vespalib::eval::Value> _query_tensor;
@@ -26,11 +33,13 @@ private:
bool _approximate;
uint32_t _explore_additional_hits;
double _distance_threshold;
- double _brute_force_limit;
+ double _global_filter_lower_limit;
+ double _global_filter_upper_limit;
search::tensor::DistanceFunction::UP _fallback_dist_fun;
const search::tensor::DistanceFunction *_dist_fun;
mutable NearestNeighborDistanceHeap _distance_heap;
std::vector<search::tensor::NearestNeighborIndex::Neighbor> _found_hits;
+ Algorithm _algorithm;
std::shared_ptr<const GlobalFilter> _global_filter;
std::optional<uint32_t> _global_filter_hits;
std::optional<double> _global_filter_hit_ratio;
@@ -42,7 +51,8 @@ public:
std::unique_ptr<vespalib::eval::Value> query_tensor,
uint32_t target_num_hits, bool approximate, uint32_t explore_additional_hits,
double distance_threshold,
- double brute_force_limit);
+ double global_filter_lower_limit,
+ double global_filter_upper_limit);
NearestNeighborBlueprint(const NearestNeighborBlueprint&) = delete;
NearestNeighborBlueprint& operator=(const NearestNeighborBlueprint&) = delete;
~NearestNeighborBlueprint();
@@ -51,6 +61,7 @@ public:
uint32_t get_target_num_hits() const { return _target_num_hits; }
void set_global_filter(const GlobalFilter &global_filter) override;
bool may_approximate() const { return _approximate; }
+ Algorithm get_algorithm() const { return _algorithm; }
double get_distance_threshold() const { return _distance_threshold; }
std::unique_ptr<SearchIterator> createLeafSearch(const search::fef::TermFieldMatchDataArray& tfmda,
@@ -59,4 +70,7 @@ public:
bool always_needs_unpack() const override;
};
+std::ostream&
+operator<<(std::ostream& out, NearestNeighborBlueprint::Algorithm algorithm);
+
}