diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2020-05-25 11:25:34 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-05-25 11:25:34 +0200 |
commit | 9fbf28d399520f05180904cde08ece6a27736046 (patch) | |
tree | 22788682502961fb7a9fe3c878d5d53913cd58de | |
parent | fd9418f04c6969b742144902bd6a7cb207e3aec2 (diff) | |
parent | a7adf86dcd43d949ed99b2a8881e0739a4c8ae08 (diff) |
Merge pull request #13347 from vespa-engine/toregge/add-field-match-computer-shared-state
Factor out portions of field match computer to field match computer shared state.
6 files changed, 141 insertions, 61 deletions
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt b/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt index 7786e3b45a1..a3273a4a39c 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_features_fieldmatch OBJECT SOURCES computer.cpp + computer_shared_state.cpp metrics.cpp params.cpp segmentstart.cpp diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp index 43aee8167ee..7c3c0c5d638 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp @@ -1,8 +1,10 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "computer.h" +#include "computer_shared_state.h" #include <vespa/searchlib/features/utils.h> #include <vespa/searchlib/fef/phrase_splitter_query_env.h> +#include <vespa/searchlib/fef/phrasesplitter.h> #include <vespa/searchlib/fef/properties.h> #include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/locale/c.h> @@ -15,55 +17,24 @@ using namespace search::fef; namespace search::features::fieldmatch { -Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitter &splitter, - const FieldInfo &fieldInfo, const Params &params) : - _splitter(splitter), - _fieldId(fieldInfo.id()), - _params(params), - _useCachedHits(true), - _queryTerms(), - _queryTermFieldMatch(), - _totalTermWeight(0), - _totalTermSignificance(0.0f), - _fieldLength(FieldPositionsIterator::UNKNOWN_LENGTH), - _currentMetrics(this), - _finalMetrics(this), - _simpleMetrics(params), - _segments(), - _alternativeSegmentationsTried(0), - _cachedHits() +Computer::Computer(const ComputerSharedState& shared_state, const PhraseSplitter& splitter) + : 
_shared_state(shared_state), + _splitter(splitter), + _fieldId(_shared_state.get_field_id()), + _params(_shared_state.get_params()), + _useCachedHits(_shared_state.get_use_cached_hits()), + _queryTerms(_shared_state.get_query_terms()), + _queryTermFieldMatch(_queryTerms.size()), + _totalTermWeight(_shared_state.get_total_term_weight()), + _totalTermSignificance(_shared_state.get_total_term_significance()), + _fieldLength(FieldPositionsIterator::UNKNOWN_LENGTH), + _currentMetrics(this), + _finalMetrics(this), + _simpleMetrics(_shared_state.get_simple_metrics()), + _segments(), + _alternativeSegmentationsTried(0), + _cachedHits(_queryTerms.size()) { - // Store term data for all terms searching in this field - const auto& splitter_query_env = splitter.get_query_env(); - _queryTermFieldMatch.reserve(splitter_query_env.getNumTerms()); - _cachedHits.reserve(splitter_query_env.getNumTerms()); - for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) { - QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true); - _totalTermWeight += qt.termData()->getWeight().percent(); - _totalTermSignificance += qt.significance(); - _simpleMetrics.addQueryTerm(qt.termData()->getWeight().percent()); - const ITermFieldData *field = qt.termData()->lookupField(_fieldId); - if (field != nullptr) { - qt.fieldHandle(field->getHandle()); - _queryTerms.push_back(qt); - _simpleMetrics.addSearchedTerm(qt.termData()->getWeight().percent()); - _queryTermFieldMatch.emplace_back(nullptr); - _cachedHits.emplace_back(); - } - } - - _totalTermWeight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight"). - get(vespalib::make_string("%d", _totalTermWeight)).c_str()); - _totalTermSignificance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance"). 
- get(vespalib::make_string("%f", _totalTermSignificance)).c_str()); - if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) { - _simpleMetrics.setTotalWeightInQuery(_totalTermWeight); - } - - // update current and final metrics after initialization - _currentMetrics = Metrics(this); - _finalMetrics = Metrics(this); - // num query terms searching in this field + 1 _segments.reserve(getNumQueryTerms() + 1); for (uint32_t i = 0; i < (getNumQueryTerms() + 1); ++i) { diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h index b2644208b40..f684f42708a 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h @@ -5,15 +5,21 @@ #include "params.h" #include "segmentstart.h" #include "simplemetrics.h" -#include <vespa/searchlib/fef/iqueryenvironment.h> -#include <vespa/searchlib/fef/fieldinfo.h> -#include <vespa/searchlib/fef/matchdata.h> -#include <vespa/searchlib/fef/phrasesplitter.h> #include <vespa/searchlib/features/queryterm.h> #include <vespa/searchlib/common/allocatedbitvector.h> +#include <vespa/vespalib/util/arrayref.h> + +namespace search::fef { + +class PhraseSplitter; +class TermFieldMatchData; + +} namespace search::features::fieldmatch { +class ComputerSharedState; + /** * <p>Calculates a set of metrics capturing information about the degree of agreement between a query and a field * string. This algorithm attempts to capture the property of text that very close tokens are usuall part of the same @@ -57,13 +63,10 @@ public: /** * Constructs a new computer object. * - * @param propertyNamespace The namespace used in query properties. + * @param shared_state The shared state for this computer * @param splitter The environment that holds all query information. - * @param fieldInfo The info object of the matched field. 
- * @param params The parameter object for this computer. */ - Computer(const vespalib::string &propertyNamespace, const fef::PhraseSplitter &splitter, - const fef::FieldInfo &fieldInfo, const Params &params); + Computer(const ComputerSharedState& shared_state, const fef::PhraseSplitter& splitter); /** * Resets this object according to the given document id @@ -296,12 +299,13 @@ private: }; // per query + const ComputerSharedState& _shared_state; const search::fef::PhraseSplitter & _splitter; uint32_t _fieldId; - Params _params; + const Params _params; bool _useCachedHits; - QueryTermVector _queryTerms; + const vespalib::ConstArrayRef<QueryTerm> _queryTerms; TermFieldMatchDataVector _queryTermFieldMatch; uint32_t _totalTermWeight; feature_t _totalTermSignificance; diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp new file mode 100644 index 00000000000..c6093f52619 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp @@ -0,0 +1,49 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "computer_shared_state.h" +#include <vespa/searchlib/features/utils.h> +#include <vespa/searchlib/fef/phrase_splitter_query_env.h> +#include <vespa/searchlib/fef/properties.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/locale/c.h> +#include <set> + +using namespace search::fef; + +namespace search::features::fieldmatch { + +ComputerSharedState::ComputerSharedState(const vespalib::string& propertyNamespace, const PhraseSplitterQueryEnv& splitter_query_env, + const FieldInfo& fieldInfo, const Params& params) + : _splitter_query_env(splitter_query_env), + _field_id(fieldInfo.id()), + _params(params), + _use_cached_hits(true), + _query_terms(), + _total_term_weight(0), + _total_term_significance(0.0f), + _simple_metrics(_params) +{ + // Store term data for all terms searching in this field + for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) { + QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true); + _total_term_weight += qt.termData()->getWeight().percent(); + _total_term_significance += qt.significance(); + _simple_metrics.addQueryTerm(qt.termData()->getWeight().percent()); + const ITermFieldData *field = qt.termData()->lookupField(_field_id); + if (field != nullptr) { + qt.fieldHandle(field->getHandle()); + _query_terms.push_back(qt); + _simple_metrics.addSearchedTerm(qt.termData()->getWeight().percent()); + } + } + + _total_term_weight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight"). + get(vespalib::make_string("%d", _total_term_weight)).c_str()); + _total_term_significance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance"). 
+ get(vespalib::make_string("%f", _total_term_significance)).c_str()); + if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) { + _simple_metrics.setTotalWeightInQuery(_total_term_weight); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h new file mode 100644 index 00000000000..2d0eaaf2737 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h @@ -0,0 +1,52 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "params.h" +#include "simplemetrics.h" +#include <vespa/searchlib/features/queryterm.h> + +namespace search::fef { class PhraseSplitterQueryEnv; } + +namespace search::features::fieldmatch { + +/** + * Shared state for field match computer. + */ +class ComputerSharedState { +public: + /** + * Constructs a new computer shared state object. + * + * @param propertyNamespace The namespace used in query properties. + * @param splitter_query_env The environment that holds all query information. + * @param fieldInfo The info object of the matched field. + * @param params The parameter object for this computer. 
+ */ + ComputerSharedState(const vespalib::string& propertyNamespace, const fef::PhraseSplitterQueryEnv& splitter_query_env, + const fef::FieldInfo& fieldInfo, const Params& params); + + uint32_t get_field_id() const { return _field_id; } + const Params& get_params() const { return _params; } + bool get_use_cached_hits() const { return _use_cached_hits; } + const QueryTermVector& get_query_terms() const { return _query_terms; } + uint32_t get_total_term_weight() const { return _total_term_weight; } + feature_t get_total_term_significance() const { return _total_term_significance; } + const SimpleMetrics& get_simple_metrics() const { return _simple_metrics; } + +private: + + // per query + const search::fef::PhraseSplitterQueryEnv& _splitter_query_env; + uint32_t _field_id; + Params _params; + bool _use_cached_hits; + + QueryTermVector _query_terms; + uint32_t _total_term_weight; + feature_t _total_term_significance; + + // portions per docid (not used here), portions per query + SimpleMetrics _simple_metrics; // The metrics used to compute simple features. 
+}; + +} diff --git a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp index 62b5f1d8165..9e4aa0d96ab 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp +++ b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp @@ -3,9 +3,11 @@ #include "fieldmatchfeature.h" #include "utils.h" #include <vespa/searchlib/features/fieldmatch/computer.h> +#include <vespa/searchlib/features/fieldmatch/computer_shared_state.h> #include <vespa/searchlib/fef/featurenamebuilder.h> #include <vespa/searchlib/fef/indexproperties.h> #include <vespa/searchlib/fef/phrase_splitter_query_env.h> +#include <vespa/searchlib/fef/phrasesplitter.h> #include <vespa/searchlib/fef/properties.h> #include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/locale/c.h> @@ -24,6 +26,7 @@ private: PhraseSplitterQueryEnv _splitter_env; fef::PhraseSplitter _splitter; const fef::FieldInfo & _field; + fieldmatch::ComputerSharedState _cmp_shared_state; fieldmatch::Computer _cmp; void handle_bind_match_data(const fef::MatchData &md) override; @@ -42,8 +45,8 @@ FieldMatchExecutor::FieldMatchExecutor(const IQueryEnvironment & queryEnv, _splitter_env(queryEnv, field.id()), _splitter(_splitter_env), _field(field), - _cmp(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()), - _splitter, field, params) + _cmp_shared_state(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()), _splitter_env, field, params), + _cmp(_cmp_shared_state, _splitter) { // empty } |