diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-09-21 13:32:47 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2023-09-21 13:38:52 +0000 |
commit | 9edf3caed8ecad63d1f1bb5b07510934690cc6d2 (patch) | |
tree | 209187088bfa38a085798d605ddf7bd2c68b2bd5 | |
parent | 92d656cb14e33c4aea1677241aa687bdc70d5bc1 (diff) |
Add query property to control fuzzy matching algorithm.
31 files changed, 229 insertions, 38 deletions
diff --git a/searchcore/src/tests/proton/matching/matching_test.cpp b/searchcore/src/tests/proton/matching/matching_test.cpp index 6ef462f80c4..ec549ee6f71 100644 --- a/searchcore/src/tests/proton/matching/matching_test.cpp +++ b/searchcore/src/tests/proton/matching/matching_test.cpp @@ -1135,12 +1135,15 @@ TEST("require that docsum matcher can extract matching elements from single attr EXPECT_EQUAL(list[1], 3u); } +using FMA = vespalib::FuzzyMatchingAlgorithm; + struct AttributeBlueprintParamsFixture { BlueprintFactory factory; search::fef::test::IndexEnvironment index_env; RankSetup rank_setup; Properties rank_properties; - AttributeBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor) + AttributeBlueprintParamsFixture(double lower_limit, double upper_limit, double target_hits_max_adjustment_factor, + FMA fuzzy_matching_algorithm) : factory(), index_env(), rank_setup(factory, index_env), @@ -1149,36 +1152,41 @@ struct AttributeBlueprintParamsFixture { rank_setup.set_global_filter_lower_limit(lower_limit); rank_setup.set_global_filter_upper_limit(upper_limit); rank_setup.set_target_hits_max_adjustment_factor(target_hits_max_adjustment_factor); + rank_setup.set_fuzzy_matching_algorithm(fuzzy_matching_algorithm); } void set_query_properties(vespalib::stringref lower_limit, vespalib::stringref upper_limit, - vespalib::stringref target_hits_max_adjustment_factor) { + vespalib::stringref target_hits_max_adjustment_factor, + const vespalib::string fuzzy_matching_algorithm) { rank_properties.add(GlobalFilterLowerLimit::NAME, lower_limit); rank_properties.add(GlobalFilterUpperLimit::NAME, upper_limit); rank_properties.add(TargetHitsMaxAdjustmentFactor::NAME, target_hits_max_adjustment_factor); + rank_properties.add(FuzzyAlgorithm::NAME, fuzzy_matching_algorithm); } AttributeBlueprintParams extract(uint32_t active_docids = 9, uint32_t docid_limit = 10) const { return 
MatchToolsFactory::extract_attribute_blueprint_params(rank_setup, rank_properties, active_docids, docid_limit); } }; -TEST_F("attribute blueprint params are extracted from rank profile", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0)) +TEST_F("attribute blueprint params are extracted from rank profile", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0, FMA::BruteForce)) { auto params = f.extract(); EXPECT_EQUAL(0.2, params.global_filter_lower_limit); EXPECT_EQUAL(0.8, params.global_filter_upper_limit); EXPECT_EQUAL(5.0, params.target_hits_max_adjustment_factor); + EXPECT_EQUAL(FMA::BruteForce, params.fuzzy_matching_algorithm); } -TEST_F("attribute blueprint params are extracted from query", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0)) +TEST_F("attribute blueprint params are extracted from query", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0, FMA::BruteForce)) { - f.set_query_properties("0.15", "0.75", "3.0"); + f.set_query_properties("0.15", "0.75", "3.0", "dfa_explicit"); auto params = f.extract(); EXPECT_EQUAL(0.15, params.global_filter_lower_limit); EXPECT_EQUAL(0.75, params.global_filter_upper_limit); EXPECT_EQUAL(3.0, params.target_hits_max_adjustment_factor); + EXPECT_EQUAL(FMA::DfaExplicit, params.fuzzy_matching_algorithm); } -TEST_F("global filter params are scaled with active hit ratio", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0)) +TEST_F("global filter params are scaled with active hit ratio", AttributeBlueprintParamsFixture(0.2, 0.8, 5.0, FMA::BruteForce)) { auto params = f.extract(5, 10); EXPECT_EQUAL(0.12, params.global_filter_lower_limit); diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index f62f4c60a6c..5ae671b88cb 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -331,6 +331,7 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& 
rank_setu double lower_limit = GlobalFilterLowerLimit::lookup(rank_properties, rank_setup.get_global_filter_lower_limit()); double upper_limit = GlobalFilterUpperLimit::lookup(rank_properties, rank_setup.get_global_filter_upper_limit()); double target_hits_max_adjustment_factor = TargetHitsMaxAdjustmentFactor::lookup(rank_properties, rank_setup.get_target_hits_max_adjustment_factor()); + auto fuzzy_matching_algorithm = FuzzyAlgorithm::lookup(rank_properties, rank_setup.get_fuzzy_matching_algorithm()); // Note that we count the reserved docid 0 as active. // This ensures that when searchable-copies=1, the ratio is 1.0. @@ -338,7 +339,8 @@ MatchToolsFactory::extract_attribute_blueprint_params(const RankSetup& rank_setu return {lower_limit * active_hit_ratio, upper_limit * active_hit_ratio, - target_hits_max_adjustment_factor}; + target_hits_max_adjustment_factor, + fuzzy_matching_algorithm}; } AttributeOperationTask::AttributeOperationTask(const RequestContext & requestContext, diff --git a/searchlib/src/tests/ranksetup/ranksetup_test.cpp b/searchlib/src/tests/ranksetup/ranksetup_test.cpp index f708df0a862..8d51eb56cc3 100644 --- a/searchlib/src/tests/ranksetup/ranksetup_test.cpp +++ b/searchlib/src/tests/ranksetup/ranksetup_test.cpp @@ -536,6 +536,7 @@ void RankSetupTest::testRankSetup() env.getProperties().add(matching::GlobalFilterLowerLimit::NAME, "0.3"); env.getProperties().add(matching::GlobalFilterUpperLimit::NAME, "0.7"); env.getProperties().add(matching::TargetHitsMaxAdjustmentFactor::NAME, "5.0"); + env.getProperties().add(matching::FuzzyAlgorithm::NAME, "dfa_implicit"); RankSetup rs(_factory, env); EXPECT_FALSE(rs.has_match_features()); @@ -577,6 +578,7 @@ void RankSetupTest::testRankSetup() EXPECT_EQUAL(rs.get_global_filter_lower_limit(), 0.3); EXPECT_EQUAL(rs.get_global_filter_upper_limit(), 0.7); EXPECT_EQUAL(rs.get_target_hits_max_adjustment_factor(), 5.0); + EXPECT_EQUAL(rs.get_fuzzy_matching_algorithm(), 
vespalib::FuzzyMatchingAlgorithm::DfaImplicit); } bool diff --git a/searchlib/src/vespa/searchcommon/attribute/search_context_params.h b/searchlib/src/vespa/searchcommon/attribute/search_context_params.h index 8ed7eadf919..1c3b32bd777 100644 --- a/searchlib/src/vespa/searchcommon/attribute/search_context_params.h +++ b/searchlib/src/vespa/searchcommon/attribute/search_context_params.h @@ -3,6 +3,8 @@ #pragma once #include "i_document_meta_store_context.h" +#include <vespa/searchlib/fef/indexproperties.h> +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> #include <cstddef> #include <limits> #include <cstdint> @@ -21,6 +23,8 @@ private: uint32_t _diversityCutoffGroups; bool _useBitVector; bool _diversityCutoffStrict; + vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; + public: SearchContextParams() @@ -28,13 +32,15 @@ public: _metaStoreReadGuard(nullptr), _diversityCutoffGroups(std::numeric_limits<uint32_t>::max()), _useBitVector(false), - _diversityCutoffStrict(false) + _diversityCutoffStrict(false), + _fuzzy_matching_algorithm(search::fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) { } bool useBitVector() const { return _useBitVector; } const IAttributeVector * diversityAttribute() const { return _diversityAttribute; } uint32_t diversityCutoffGroups() const { return _diversityCutoffGroups; } bool diversityCutoffStrict() const { return _diversityCutoffStrict; } const IDocumentMetaStoreContext::IReadGuard::SP * metaStoreReadGuard() const { return _metaStoreReadGuard; } + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } SearchContextParams &useBitVector(bool value) { _useBitVector = value; @@ -56,6 +62,10 @@ public: _metaStoreReadGuard = readGuard; return *this; } + SearchContextParams& fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm value) { + _fuzzy_matching_algorithm = value; + return *this; + } }; } diff --git 
a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index 453b7b321b9..1519bb14554 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -713,6 +713,7 @@ public: template <class TermNode> void visitTerm(TermNode &n) { SearchContextParams scParams = createContextParams(_field.isFilter()); + scParams.fuzzy_matching_algorithm(getRequestContext().get_attribute_blueprint_params().fuzzy_matching_algorithm); const string stack = StackDumpCreator::create(n); setResult(std::make_unique<AttributeFieldBlueprint>(_field, _attr, stack, scParams)); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index 64213235c23..1f9a3ebfa7e 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -3,6 +3,7 @@ #pragma once #include <vespa/searchlib/fef/indexproperties.h> +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search::attribute { @@ -14,20 +15,24 @@ struct AttributeBlueprintParams double global_filter_lower_limit; double global_filter_upper_limit; double target_hits_max_adjustment_factor; + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, - double target_hits_max_adjustment_factor_in) + double target_hits_max_adjustment_factor_in, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), - target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in) + 
target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), + fuzzy_matching_algorithm(fuzzy_matching_algorithm_in) { } AttributeBlueprintParams() : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, - fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE) + fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, + fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.h b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.h index 3ae342be61b..f418e698585 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.h @@ -4,6 +4,8 @@ #include "multi_string_enum_search_context.h" #include "enumhintsearchcontext.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> + namespace search::attribute { @@ -17,7 +19,12 @@ class MultiStringEnumHintSearchContext : public MultiStringEnumSearchContext<M>, public EnumHintSearchContext { public: - MultiStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, MultiValueMappingReadView<M> mv_mapping_read_view, const EnumStoreT<const char*>& enum_store, uint32_t doc_id_limit, uint64_t num_values); + MultiStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + MultiValueMappingReadView<M> mv_mapping_read_view, + const EnumStoreT<const char*>& enum_store, + uint32_t doc_id_limit, uint64_t num_values); ~MultiStringEnumHintSearchContext() override; }; diff --git 
a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.hpp b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.hpp index fc1f72c940f..f4b96a46e3d 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_hint_search_context.hpp @@ -6,8 +6,13 @@ namespace search::attribute { template <typename M> -MultiStringEnumHintSearchContext<M>::MultiStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, MultiValueMappingReadView<M> mv_mapping_read_view, const EnumStoreT<const char*>& enum_store, uint32_t doc_id_limit, uint64_t num_values) - : MultiStringEnumSearchContext<M>(std::move(qTerm), cased, toBeSearched, mv_mapping_read_view, enum_store), +MultiStringEnumHintSearchContext<M>::MultiStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + MultiValueMappingReadView<M> mv_mapping_read_view, + const EnumStoreT<const char*>& enum_store, + uint32_t doc_id_limit, uint64_t num_values) + : MultiStringEnumSearchContext<M>(std::move(qTerm), cased, fuzzy_matching_algorithm, toBeSearched, mv_mapping_read_view, enum_store), EnumHintSearchContext(enum_store.get_dictionary(), doc_id_limit, num_values) { diff --git a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.h b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.h index 1787ea0086d..c9b8e8271b1 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.h @@ -4,6 +4,7 @@ #include "multi_enum_search_context.h" #include "string_search_context.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search::attribute { @@ -15,7 
+16,11 @@ template <typename M> class MultiStringEnumSearchContext : public MultiEnumSearchContext<const char*, StringSearchContext, M> { public: - MultiStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, MultiValueMappingReadView<M> mv_mapping_read_view, const EnumStoreT<const char*>& enum_store); + MultiStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + MultiValueMappingReadView<M> mv_mapping_read_view, + const EnumStoreT<const char*>& enum_store); }; } diff --git a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.hpp b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.hpp index 1d74db04373..48d1e8b6406 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multi_string_enum_search_context.hpp @@ -9,8 +9,12 @@ namespace search::attribute { template <typename M> -MultiStringEnumSearchContext<M>::MultiStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, MultiValueMappingReadView<M> mv_mapping_read_view, const EnumStoreT<const char*>& enum_store) - : MultiEnumSearchContext<const char*, StringSearchContext, M>(StringMatcher(std::move(qTerm), cased), toBeSearched, mv_mapping_read_view, enum_store) +MultiStringEnumSearchContext<M>::MultiStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + MultiValueMappingReadView<M> mv_mapping_read_view, + const EnumStoreT<const char*>& enum_store) + : MultiEnumSearchContext<const char*, StringSearchContext, M>(StringMatcher(std::move(qTerm), cased, fuzzy_matching_algorithm), toBeSearched, mv_mapping_read_view, enum_store) { } diff --git 
a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp index 43bb1c5ebb0..53e5f0d2e12 100644 --- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp @@ -46,11 +46,12 @@ MultiValueStringAttributeT<B, M>::freezeEnumDictionary() template <typename B, typename M> std::unique_ptr<attribute::SearchContext> MultiValueStringAttributeT<B, M>::getSearch(QueryTermSimpleUP qTerm, - const attribute::SearchContextParams &) const + const attribute::SearchContextParams &params) const { bool cased = this->get_match_is_cased(); auto doc_id_limit = this->getCommittedDocIdLimit(); - return std::make_unique<attribute::MultiStringEnumHintSearchContext<M>>(std::move(qTerm), cased, *this, this->_mvMapping.make_read_view(doc_id_limit), this->_enumStore, doc_id_limit, this->getStatus().getNumValues()); + return std::make_unique<attribute::MultiStringEnumHintSearchContext<M>>(std::move(qTerm), cased, params.fuzzy_matching_algorithm(), + *this, this->_mvMapping.make_read_view(doc_id_limit), this->_enumStore, doc_id_limit, this->getStatus().getNumValues()); } template <typename B, typename M> diff --git a/searchlib/src/vespa/searchlib/attribute/multistringpostattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringpostattribute.hpp index fe52b785fa7..3da6357bb53 100644 --- a/searchlib/src/vespa/searchlib/attribute/multistringpostattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multistringpostattribute.hpp @@ -99,7 +99,7 @@ MultiValueStringPostingAttributeT<B, T>::getSearch(QueryTermSimpleUP qTerm, using SC = attribute::StringPostingSearchContext<BaseSC, SelfType, int32_t>; bool cased = this->get_match_is_cased(); auto doc_id_limit = this->getCommittedDocIdLimit(); - BaseSC base_sc(std::move(qTerm), cased, *this, this->_mvMapping.make_read_view(doc_id_limit), this->_enumStore); + BaseSC base_sc(std::move(qTerm), 
cased, params.fuzzy_matching_algorithm(), *this, this->_mvMapping.make_read_view(doc_id_limit), this->_enumStore); return std::make_unique<SC>(std::move(base_sc), params.useBitVector(), *this); } diff --git a/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.cpp b/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.cpp index 2d1748cefa5..95ba37d85be 100644 --- a/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.cpp +++ b/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.cpp @@ -5,8 +5,13 @@ namespace search::attribute { -SingleStringEnumHintSearchContext::SingleStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, EnumIndices enum_indices, const EnumStoreT<const char*>& enum_store, uint64_t num_values) - : SingleStringEnumSearchContext(std::move(qTerm), cased, toBeSearched, enum_indices, enum_store), +SingleStringEnumHintSearchContext::SingleStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + EnumIndices enum_indices, + const EnumStoreT<const char*>& enum_store, + uint64_t num_values) + : SingleStringEnumSearchContext(std::move(qTerm), cased, fuzzy_matching_algorithm, toBeSearched, enum_indices, enum_store), EnumHintSearchContext(enum_store.get_dictionary(), enum_indices.size(), num_values) { diff --git a/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.h b/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.h index f157bf17a71..595d1ac8c57 100644 --- a/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/single_string_enum_hint_search_context.h @@ -4,6 +4,7 @@ #include "single_string_enum_search_context.h" #include 
"enumhintsearchcontext.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search::attribute { @@ -16,7 +17,12 @@ class SingleStringEnumHintSearchContext : public SingleStringEnumSearchContext, public EnumHintSearchContext { public: - SingleStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, EnumIndices enum_indices, const EnumStoreT<const char*>& enum_store, uint64_t num_values); + SingleStringEnumHintSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + EnumIndices enum_indices, + const EnumStoreT<const char*>& enum_store, + uint64_t num_values); ~SingleStringEnumHintSearchContext() override; }; diff --git a/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.cpp b/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.cpp index 8d23eaf7af0..42aebe9f814 100644 --- a/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.cpp +++ b/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.cpp @@ -6,8 +6,13 @@ namespace search::attribute { -SingleStringEnumSearchContext::SingleStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, EnumIndices enum_indices, const EnumStoreT<const char*>& enum_store) - : SingleEnumSearchContext<const char*, StringSearchContext>(StringMatcher(std::move(qTerm), cased), toBeSearched, enum_indices, enum_store) +SingleStringEnumSearchContext::SingleStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + EnumIndices enum_indices, + const EnumStoreT<const char*>& enum_store) + : SingleEnumSearchContext<const char*, StringSearchContext>(StringMatcher(std::move(qTerm), cased, 
fuzzy_matching_algorithm), + toBeSearched, enum_indices, enum_store) { } diff --git a/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.h b/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.h index b8014b1b0e3..71c62af33aa 100644 --- a/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/single_string_enum_search_context.h @@ -4,6 +4,7 @@ #include "single_enum_search_context.h" #include "string_search_context.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search::attribute { @@ -14,7 +15,11 @@ namespace search::attribute { class SingleStringEnumSearchContext : public SingleEnumSearchContext<const char*, StringSearchContext> { public: - SingleStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, const AttributeVector& toBeSearched, EnumIndices enum_indices, const EnumStoreT<const char*>& enum_store); + SingleStringEnumSearchContext(std::unique_ptr<QueryTermSimple> qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm, + const AttributeVector& toBeSearched, + EnumIndices enum_indices, + const EnumStoreT<const char*>& enum_store); SingleStringEnumSearchContext(SingleStringEnumSearchContext&&) noexcept; ~SingleStringEnumSearchContext() override; }; diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp index c3f5c295260..c4c6fc97053 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp @@ -43,11 +43,12 @@ SingleValueStringAttributeT<B>::freezeEnumDictionary() template <typename B> std::unique_ptr<attribute::SearchContext> SingleValueStringAttributeT<B>::getSearch(QueryTermSimpleUP qTerm, - const attribute::SearchContextParams &) const + const attribute::SearchContextParams& params) const { bool 
cased = this->get_match_is_cased(); auto docid_limit = this->getCommittedDocIdLimit(); - return std::make_unique<attribute::SingleStringEnumHintSearchContext>(std::move(qTerm), cased, *this, this->_enumIndices.make_read_view(docid_limit), this->_enumStore, this->getStatus().getNumValues()); + return std::make_unique<attribute::SingleStringEnumHintSearchContext>(std::move(qTerm), cased, params.fuzzy_matching_algorithm(), + *this, this->_enumIndices.make_read_view(docid_limit), this->_enumStore, this->getStatus().getNumValues()); } } diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp index 60847636baa..20d672411f8 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringpostattribute.hpp @@ -146,7 +146,7 @@ SingleValueStringPostingAttributeT<B>::getSearch(QueryTermSimpleUP qTerm, using SC = attribute::StringPostingSearchContext<BaseSC, SelfType, vespalib::btree::BTreeNoLeafData>; bool cased = this->get_match_is_cased(); auto docid_limit = this->getCommittedDocIdLimit(); - BaseSC base_sc(std::move(qTerm), cased, *this, this->_enumIndices.make_read_view(docid_limit), this->_enumStore); + BaseSC base_sc(std::move(qTerm), cased, params.fuzzy_matching_algorithm(), *this, this->_enumIndices.make_read_view(docid_limit), this->_enumStore); return std::make_unique<SC>(std::move(base_sc), params.useBitVector(), *this); diff --git a/searchlib/src/vespa/searchlib/attribute/string_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/string_matcher.cpp index bc3637e7215..8b755d5f3b1 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_matcher.cpp @@ -5,9 +5,9 @@ namespace search::attribute { -StringMatcher::StringMatcher(std::unique_ptr<QueryTermSimple> query_term, bool cased) 
+StringMatcher::StringMatcher(std::unique_ptr<QueryTermSimple> query_term, bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm) : _query_term(static_cast<QueryTermUCS4 *>(query_term.release())), - _helper(*_query_term, cased) + _helper(*_query_term, cased, fuzzy_matching_algorithm) { } diff --git a/searchlib/src/vespa/searchlib/attribute/string_matcher.h b/searchlib/src/vespa/searchlib/attribute/string_matcher.h index ea4debecc0d..05089e1251a 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/string_matcher.h @@ -3,6 +3,7 @@ #pragma once #include "string_search_helper.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search { class QueryTermSimple; } @@ -18,7 +19,7 @@ private: std::unique_ptr<QueryTermUCS4> _query_term; attribute::StringSearchHelper _helper; public: - StringMatcher(std::unique_ptr<QueryTermSimple> qTerm, bool cased); + StringMatcher(std::unique_ptr<QueryTermSimple> qTerm, bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm); StringMatcher(StringMatcher&&) noexcept; ~StringMatcher(); protected: diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_context.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_context.cpp index fadf7a3151d..119b4a60d0c 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_context.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_context.cpp @@ -9,9 +9,10 @@ namespace search::attribute { -StringSearchContext::StringSearchContext(const AttributeVector& to_be_searched, std::unique_ptr<QueryTermSimple> query_term, bool cased) +StringSearchContext::StringSearchContext(const AttributeVector& to_be_searched, std::unique_ptr<QueryTermSimple> query_term, + bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm) : SearchContext(to_be_searched), - StringMatcher(std::move(query_term), cased) + StringMatcher(std::move(query_term), cased, 
fuzzy_matching_algorithm) { } diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_context.h b/searchlib/src/vespa/searchlib/attribute/string_search_context.h index a0014379436..e459153d2b8 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/string_search_context.h @@ -4,6 +4,7 @@ #include "search_context.h" #include "string_matcher.h" +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search { @@ -24,7 +25,8 @@ class StringSearchContext : public SearchContext, public StringMatcher protected: using MatcherType = StringMatcher; public: - StringSearchContext(const AttributeVector& to_be_searched, std::unique_ptr<QueryTermSimple> query_term, bool cased); + StringSearchContext(const AttributeVector& to_be_searched, std::unique_ptr<QueryTermSimple> query_term, + bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm); StringSearchContext(const AttributeVector& to_be_searched, StringMatcher&& matcher); StringSearchContext(StringSearchContext &&) noexcept; ~StringSearchContext() override; diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index 60c00a043d0..1efe39667b8 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -9,7 +9,7 @@ namespace search::attribute { -StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) +StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm) : _regex(), _fuzzyMatcher(), _term(), @@ -24,6 +24,8 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased) ? 
vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None) : vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); } else if (isFuzzy()) { + (void) fuzzy_matching_algorithm; + // TODO: Select implementation based on algorithm. _fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(), term.getFuzzyMaxEditDistance(), term.getFuzzyPrefixLength(), diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h index 3db0d4dbb5f..0e7a116a874 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.h +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.h @@ -2,6 +2,7 @@ #pragma once +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> #include <vespa/vespalib/regex/regex.h> namespace vespalib { class FuzzyMatcher; } @@ -16,7 +17,8 @@ namespace search::attribute { class StringSearchHelper { public: using FuzzyMatcher = vespalib::FuzzyMatcher; - StringSearchHelper(QueryTermUCS4 & qTerm, bool cased); + StringSearchHelper(QueryTermUCS4 & qTerm, bool cased, + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm = vespalib::FuzzyMatchingAlgorithm::BruteForce); StringSearchHelper(StringSearchHelper&&) noexcept; StringSearchHelper(const StringSearchHelper &) = delete; StringSearchHelper & operator =(const StringSearchHelper &) = delete; diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 7871e66970e..b006aebbcdb 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -438,6 +438,22 @@ TargetHitsMaxAdjustmentFactor::lookup(const Properties& props, double defaultVal return lookupDouble(props, NAME, defaultValue); } +const vespalib::string FuzzyAlgorithm::NAME("vespa.matching.fuzzy.algorithm"); +const vespalib::FuzzyMatchingAlgorithm 
FuzzyAlgorithm::DEFAULT_VALUE(vespalib::FuzzyMatchingAlgorithm::BruteForce); + +vespalib::FuzzyMatchingAlgorithm +FuzzyAlgorithm::lookup(const Properties& props) +{ + return lookup(props, DEFAULT_VALUE); +} + +vespalib::FuzzyMatchingAlgorithm +FuzzyAlgorithm::lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm default_value) +{ + auto value = lookupString(props, NAME, vespalib::to_string(default_value)); + return vespalib::fuzzy_matching_algorithm_from_string(value, default_value); +} + } // namespace matching namespace softtimeout { diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index 4f38a27d3fe..1f16d6b5f57 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -2,9 +2,10 @@ #pragma once +#include <vespa/searchlib/common/feature.h> +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> #include <vespa/vespalib/stllike/string.h> #include <vector> -#include <vespa/searchlib/common/feature.h> namespace search::fef { class Properties; } @@ -328,6 +329,16 @@ namespace matching { static double lookup(const Properties &props); static double lookup(const Properties &props, double defaultValue); }; + + /** + * Property to control the algorithm used for fuzzy matching. 
+ **/ + struct FuzzyAlgorithm { + static const vespalib::string NAME; + static const vespalib::FuzzyMatchingAlgorithm DEFAULT_VALUE; + static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props); + static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm default_value); + }; } namespace softtimeout { diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index 9d4e547feef..02b56701cdb 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -69,6 +69,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _global_filter_lower_limit(0.0), _global_filter_upper_limit(1.0), _target_hits_max_adjustment_factor(20.0), + _fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::BruteForce), _mutateOnMatch(), _mutateOnFirstPhase(), _mutateOnSecondPhase(), @@ -123,6 +124,7 @@ RankSetup::configure() set_global_filter_lower_limit(matching::GlobalFilterLowerLimit::lookup(_indexEnv.getProperties())); set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); + set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index 72432c2ed8a..3170f965e58 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -8,6 +8,7 @@ #include "blueprintresolver.h" 
#include "rank_program.h" #include <vespa/searchlib/common/stringmap.h> +#include <vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h> namespace search::fef { @@ -77,6 +78,7 @@ private: double _global_filter_lower_limit; double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; + vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; MutateOperation _mutateOnSecondPhase; @@ -396,6 +398,8 @@ public: double get_global_filter_upper_limit() const { return _global_filter_upper_limit; } void set_target_hits_max_adjustment_factor(double v) { _target_hits_max_adjustment_factor = v; } double get_target_hits_max_adjustment_factor() const { return _target_hits_max_adjustment_factor; } + void set_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm v) { _fuzzy_matching_algorithm = v; } + vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } /** * This method may be used to indicate that certain features diff --git a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt index bdbb03bcfee..5e8d29980cd 100644 --- a/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/fuzzy/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_library(vespalib_vespalib_fuzzy OBJECT SOURCES explicit_levenshtein_dfa.cpp fuzzy_matcher.cpp + fuzzy_matching_algorithm.cpp implicit_levenshtein_dfa.cpp levenshtein_dfa.cpp levenshtein_distance.cpp diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp new file mode 100644 index 00000000000..826b0beffd6 --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.cpp @@ -0,0 +1,51 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "fuzzy_matching_algorithm.h" + +namespace vespalib { + +namespace { + +const vespalib::string brute_force = "brute_force"; +const vespalib::string dfa_implicit = "dfa_implicit"; +const vespalib::string dfa_explicit = "dfa_explicit"; + +} + +vespalib::string +to_string(FuzzyMatchingAlgorithm algo) +{ + switch (algo) { + case FuzzyMatchingAlgorithm::BruteForce: + return brute_force; + case FuzzyMatchingAlgorithm::DfaImplicit: + return dfa_implicit; + case FuzzyMatchingAlgorithm::DfaExplicit: + return dfa_explicit; + default: + return ""; + } +} + +FuzzyMatchingAlgorithm +fuzzy_matching_algorithm_from_string(const vespalib::string& algo, + FuzzyMatchingAlgorithm default_algo) +{ + if (algo == brute_force) { + return FuzzyMatchingAlgorithm::BruteForce; + } else if (algo == dfa_implicit) { + return FuzzyMatchingAlgorithm::DfaImplicit; + } else if (algo == dfa_explicit) { + return FuzzyMatchingAlgorithm::DfaExplicit; + } + return default_algo; +} + +std::ostream& +operator<<(std::ostream& out, FuzzyMatchingAlgorithm algo) +{ + out << to_string(algo); + return out; +} + +} diff --git a/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h new file mode 100644 index 00000000000..83cb121fe5f --- /dev/null +++ b/vespalib/src/vespa/vespalib/fuzzy/fuzzy_matching_algorithm.h @@ -0,0 +1,26 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <ostream> + +namespace vespalib { + +/** + * Algorithms that are supported for fuzzy matching. 
+ */ +enum class FuzzyMatchingAlgorithm { + BruteForce, + DfaImplicit, + DfaExplicit +}; + +vespalib::string to_string(FuzzyMatchingAlgorithm algo); + +FuzzyMatchingAlgorithm fuzzy_matching_algorithm_from_string(const vespalib::string& algo, + FuzzyMatchingAlgorithm default_algo); + +std::ostream& operator<<(std::ostream& out, FuzzyMatchingAlgorithm algo); + +} |