aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2020-05-25 11:25:34 +0200
committer GitHub <noreply@github.com> 2020-05-25 11:25:34 +0200
commit 9fbf28d399520f05180904cde08ece6a27736046 (patch)
tree 22788682502961fb7a9fe3c878d5d53913cd58de
parent fd9418f04c6969b742144902bd6a7cb207e3aec2 (diff)
parent a7adf86dcd43d949ed99b2a8881e0739a4c8ae08 (diff)
Merge pull request #13347 from vespa-engine/toregge/add-field-match-computer-shared-state
Factor out portions of field match computer to field match computer shared state.
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp 67
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/computer.h 26
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp 49
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h 52
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp 7
6 files changed, 141 insertions, 61 deletions
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt b/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt
index 7786e3b45a1..a3273a4a39c 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(searchlib_features_fieldmatch OBJECT
SOURCES
computer.cpp
+ computer_shared_state.cpp
metrics.cpp
params.cpp
segmentstart.cpp
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
index 43aee8167ee..7c3c0c5d638 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
@@ -1,8 +1,10 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "computer.h"
+#include "computer_shared_state.h"
#include <vespa/searchlib/features/utils.h>
#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
+#include <vespa/searchlib/fef/phrasesplitter.h>
#include <vespa/searchlib/fef/properties.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/locale/c.h>
@@ -15,55 +17,24 @@ using namespace search::fef;
namespace search::features::fieldmatch {
-Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitter &splitter,
- const FieldInfo &fieldInfo, const Params &params) :
- _splitter(splitter),
- _fieldId(fieldInfo.id()),
- _params(params),
- _useCachedHits(true),
- _queryTerms(),
- _queryTermFieldMatch(),
- _totalTermWeight(0),
- _totalTermSignificance(0.0f),
- _fieldLength(FieldPositionsIterator::UNKNOWN_LENGTH),
- _currentMetrics(this),
- _finalMetrics(this),
- _simpleMetrics(params),
- _segments(),
- _alternativeSegmentationsTried(0),
- _cachedHits()
+Computer::Computer(const ComputerSharedState& shared_state, const PhraseSplitter& splitter)
+ : _shared_state(shared_state),
+ _splitter(splitter),
+ _fieldId(_shared_state.get_field_id()),
+ _params(_shared_state.get_params()),
+ _useCachedHits(_shared_state.get_use_cached_hits()),
+ _queryTerms(_shared_state.get_query_terms()),
+ _queryTermFieldMatch(_queryTerms.size()),
+ _totalTermWeight(_shared_state.get_total_term_weight()),
+ _totalTermSignificance(_shared_state.get_total_term_significance()),
+ _fieldLength(FieldPositionsIterator::UNKNOWN_LENGTH),
+ _currentMetrics(this),
+ _finalMetrics(this),
+ _simpleMetrics(_shared_state.get_simple_metrics()),
+ _segments(),
+ _alternativeSegmentationsTried(0),
+ _cachedHits(_queryTerms.size())
{
- // Store term data for all terms searching in this field
- const auto& splitter_query_env = splitter.get_query_env();
- _queryTermFieldMatch.reserve(splitter_query_env.getNumTerms());
- _cachedHits.reserve(splitter_query_env.getNumTerms());
- for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) {
- QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true);
- _totalTermWeight += qt.termData()->getWeight().percent();
- _totalTermSignificance += qt.significance();
- _simpleMetrics.addQueryTerm(qt.termData()->getWeight().percent());
- const ITermFieldData *field = qt.termData()->lookupField(_fieldId);
- if (field != nullptr) {
- qt.fieldHandle(field->getHandle());
- _queryTerms.push_back(qt);
- _simpleMetrics.addSearchedTerm(qt.termData()->getWeight().percent());
- _queryTermFieldMatch.emplace_back(nullptr);
- _cachedHits.emplace_back();
- }
- }
-
- _totalTermWeight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").
- get(vespalib::make_string("%d", _totalTermWeight)).c_str());
- _totalTermSignificance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance").
- get(vespalib::make_string("%f", _totalTermSignificance)).c_str());
- if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) {
- _simpleMetrics.setTotalWeightInQuery(_totalTermWeight);
- }
-
- // update current and final metrics after initialization
- _currentMetrics = Metrics(this);
- _finalMetrics = Metrics(this);
-
// num query terms searching in this field + 1
_segments.reserve(getNumQueryTerms() + 1);
for (uint32_t i = 0; i < (getNumQueryTerms() + 1); ++i) {
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h
index b2644208b40..f684f42708a 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.h
@@ -5,15 +5,21 @@
#include "params.h"
#include "segmentstart.h"
#include "simplemetrics.h"
-#include <vespa/searchlib/fef/iqueryenvironment.h>
-#include <vespa/searchlib/fef/fieldinfo.h>
-#include <vespa/searchlib/fef/matchdata.h>
-#include <vespa/searchlib/fef/phrasesplitter.h>
#include <vespa/searchlib/features/queryterm.h>
#include <vespa/searchlib/common/allocatedbitvector.h>
+#include <vespa/vespalib/util/arrayref.h>
+
+namespace search::fef {
+
+class PhraseSplitter;
+class TermFieldMatchData;
+
+}
namespace search::features::fieldmatch {
+class ComputerSharedState;
+
/**
* <p>Calculates a set of metrics capturing information about the degree of agreement between a query and a field
* string. This algorithm attempts to capture the property of text that very close tokens are usually part of the same
@@ -57,13 +63,10 @@ public:
/**
* Constructs a new computer object.
*
- * @param propertyNamespace The namespace used in query properties.
+ * @param shared_state The shared state for this computer
* @param splitter The environment that holds all query information.
- * @param fieldInfo The info object of the matched field.
- * @param params The parameter object for this computer.
*/
- Computer(const vespalib::string &propertyNamespace, const fef::PhraseSplitter &splitter,
- const fef::FieldInfo &fieldInfo, const Params &params);
+ Computer(const ComputerSharedState& shared_state, const fef::PhraseSplitter& splitter);
/**
* Resets this object according to the given document id
@@ -296,12 +299,13 @@ private:
};
// per query
+ const ComputerSharedState& _shared_state;
const search::fef::PhraseSplitter & _splitter;
uint32_t _fieldId;
- Params _params;
+ const Params _params;
bool _useCachedHits;
- QueryTermVector _queryTerms;
+ const vespalib::ConstArrayRef<QueryTerm> _queryTerms;
TermFieldMatchDataVector _queryTermFieldMatch;
uint32_t _totalTermWeight;
feature_t _totalTermSignificance;
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp
new file mode 100644
index 00000000000..c6093f52619
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.cpp
@@ -0,0 +1,49 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "computer_shared_state.h"
+#include <vespa/searchlib/features/utils.h>
+#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
+#include <vespa/searchlib/fef/properties.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/locale/c.h>
+#include <set>
+
+using namespace search::fef;
+
+namespace search::features::fieldmatch {
+
+ComputerSharedState::ComputerSharedState(const vespalib::string& propertyNamespace, const PhraseSplitterQueryEnv& splitter_query_env,
+ const FieldInfo& fieldInfo, const Params& params)
+ : _splitter_query_env(splitter_query_env),
+ _field_id(fieldInfo.id()),
+ _params(params),
+ _use_cached_hits(true),
+ _query_terms(),
+ _total_term_weight(0),
+ _total_term_significance(0.0f),
+ _simple_metrics(_params)
+{
+ // Store term data for all terms searching in this field
+ for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) {
+ QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true);
+ _total_term_weight += qt.termData()->getWeight().percent();
+ _total_term_significance += qt.significance();
+ _simple_metrics.addQueryTerm(qt.termData()->getWeight().percent());
+ const ITermFieldData *field = qt.termData()->lookupField(_field_id);
+ if (field != nullptr) {
+ qt.fieldHandle(field->getHandle());
+ _query_terms.push_back(qt);
+ _simple_metrics.addSearchedTerm(qt.termData()->getWeight().percent());
+ }
+ }
+
+ _total_term_weight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").
+ get(vespalib::make_string("%d", _total_term_weight)).c_str());
+ _total_term_significance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance").
+ get(vespalib::make_string("%f", _total_term_significance)).c_str());
+ if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) {
+ _simple_metrics.setTotalWeightInQuery(_total_term_weight);
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h
new file mode 100644
index 00000000000..2d0eaaf2737
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer_shared_state.h
@@ -0,0 +1,52 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "params.h"
+#include "simplemetrics.h"
+#include <vespa/searchlib/features/queryterm.h>
+
+namespace search::fef { class PhraseSplitterQueryEnv; }
+
+namespace search::features::fieldmatch {
+
+/**
+ * Shared state for field match computer.
+ */
+class ComputerSharedState {
+public:
+ /**
+ * Constructs a new computer shared state object.
+ *
+ * @param propertyNamespace The namespace used in query properties.
+ * @param splitter_query_env The environment that holds all query information.
+ * @param fieldInfo The info object of the matched field.
+ * @param params The parameter object for this computer.
+ */
+ ComputerSharedState(const vespalib::string& propertyNamespace, const fef::PhraseSplitterQueryEnv& splitter_query_env,
+ const fef::FieldInfo& fieldInfo, const Params& params);
+
+ uint32_t get_field_id() const { return _field_id; }
+ const Params& get_params() const { return _params; }
+ bool get_use_cached_hits() const { return _use_cached_hits; }
+ const QueryTermVector& get_query_terms() const { return _query_terms; }
+ uint32_t get_total_term_weight() const { return _total_term_weight; }
+ feature_t get_total_term_significance() const { return _total_term_significance; }
+ const SimpleMetrics& get_simple_metrics() const { return _simple_metrics; }
+
+private:
+
+ // per query
+ const search::fef::PhraseSplitterQueryEnv& _splitter_query_env;
+ uint32_t _field_id;
+ Params _params;
+ bool _use_cached_hits;
+
+ QueryTermVector _query_terms;
+ uint32_t _total_term_weight;
+ feature_t _total_term_significance;
+
+ // holds both per-docid and per-query portions; the per-docid portions are not used here
+ SimpleMetrics _simple_metrics; // The metrics used to compute simple features.
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
index 62b5f1d8165..9e4aa0d96ab 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
@@ -3,9 +3,11 @@
#include "fieldmatchfeature.h"
#include "utils.h"
#include <vespa/searchlib/features/fieldmatch/computer.h>
+#include <vespa/searchlib/features/fieldmatch/computer_shared_state.h>
#include <vespa/searchlib/fef/featurenamebuilder.h>
#include <vespa/searchlib/fef/indexproperties.h>
#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
+#include <vespa/searchlib/fef/phrasesplitter.h>
#include <vespa/searchlib/fef/properties.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/locale/c.h>
@@ -24,6 +26,7 @@ private:
PhraseSplitterQueryEnv _splitter_env;
fef::PhraseSplitter _splitter;
const fef::FieldInfo & _field;
+ fieldmatch::ComputerSharedState _cmp_shared_state;
fieldmatch::Computer _cmp;
void handle_bind_match_data(const fef::MatchData &md) override;
@@ -42,8 +45,8 @@ FieldMatchExecutor::FieldMatchExecutor(const IQueryEnvironment & queryEnv,
_splitter_env(queryEnv, field.id()),
_splitter(_splitter_env),
_field(field),
- _cmp(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()),
- _splitter, field, params)
+ _cmp_shared_state(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()), _splitter_env, field, params),
+ _cmp(_cmp_shared_state, _splitter)
{
// empty
}