summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@broadpark.no> 2020-05-20 12:54:22 +0200
committer: Tor Egge <Tor.Egge@broadpark.no> 2020-05-20 12:56:11 +0200
commit: 2565a2a4244d0347a0e983dd921bbea130ff2d32 (patch)
tree: 0cc21fe41846ff708cb88c230abf2002a8b8d34d /searchlib
parent: 034146bc6de7babdeecae443480f11cba4c3461b (diff)
Split out constant portion of PhraseSplitter to PhraseSplitterQueryEnv.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/fef/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp 76
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h 85
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp 68
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrasesplitter.h 46
5 files changed, 169 insertions, 107 deletions
diff --git a/searchlib/src/vespa/searchlib/fef/CMakeLists.txt b/searchlib/src/vespa/searchlib/fef/CMakeLists.txt
index 08e64701c05..396775b20c5 100644
--- a/searchlib/src/vespa/searchlib/fef/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/fef/CMakeLists.txt
@@ -23,6 +23,7 @@ vespa_add_library(searchlib_fef OBJECT
parameter.cpp
parameterdescriptions.cpp
parametervalidator.cpp
+ phrase_splitter_query_env.cpp
phrasesplitter.cpp
properties.cpp
queryproperties.cpp
diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp
new file mode 100644
index 00000000000..b474317a33b
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp
@@ -0,0 +1,76 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "phrase_splitter_query_env.h"
+
+namespace search::fef {
+
+void
+PhraseSplitterQueryEnv::considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId)
+{ // Registers one original term: a phrase searching fieldId is split into one term per phrase position, anything else is passed through.
+ typedef search::fef::ITermFieldRangeAdapter FRA;
+
+ for (FRA iter(term); iter.valid(); iter.next()) { // walk the fields of the term
+ if (iter.get().getFieldId() == fieldId) {
+ TermFieldHandle h = iter.get().getHandle();
+ _maxHandle = std::max(_maxHandle, h); // track the largest original handle seen for fieldId
+ if (term.getPhraseLength() > 1) {
+ SimpleTermData prototype; // copied once per phrase position below; its handle is set later by the constructor
+ prototype.setWeight(term.getWeight());
+ prototype.setPhraseLength(1);
+ prototype.setUniqueId(term.getUniqueId());
+ prototype.addField(fieldId);
+ phraseTerms.push_back(PhraseTerm(term, _terms.size(), h)); // _terms.size() is where the split terms of this phrase start
+ for (uint32_t i = 0; i < term.getPhraseLength(); ++i) {
+ _terms.push_back(prototype);
+ _termIdxMap.push_back(TermIdx(_terms.size() - 1, true)); // exposed term refers to the split term just added
+ }
+ return; // phrase handled; do not add the pass-through entry below
+ }
+ }
+ }
+ _termIdxMap.push_back(TermIdx(termIdx, false)); // not split: exposed term refers to the original term index
+}
+
+PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId) :
+ _queryEnv(queryEnv),
+ _terms(),
+ _termIdxMap(),
+ _maxHandle(0),
+ _skipHandles(0)
+{ // Builds the split term list, assigns handles to the split terms and records how each split handle relates to its original phrase handle.
+ TermFieldHandle numHandles = 0; // total number of term fields over all original terms
+ std::vector<PhraseTerm> phraseTerms; // data about original phrase terms
+
+ for (uint32_t i = 0; i < queryEnv.getNumTerms(); ++i) {
+ const ITermData *td = queryEnv.getTerm(i);
+ assert(td != nullptr);
+ considerTerm(i, *td, phraseTerms, fieldId); // fills _terms, _termIdxMap, phraseTerms and updates _maxHandle
+ numHandles += td->numFields();
+ }
+
+ _skipHandles = _maxHandle + 1 + numHandles; // base handle for split terms; larger than every original handle seen for fieldId
+ TermFieldHandle term_handle = _skipHandles;
+ for (auto & term : _terms) {
+ // split terms get consecutive handles in _terms order, starting at _skipHandles
+ term.field(0).setHandle(term_handle);
+ ++term_handle;
+ }
+
+ for (uint32_t i = 0; i < phraseTerms.size(); ++i) { // one HowToCopy entry per position of each original phrase
+ const PhraseTerm &pterm = phraseTerms[i];
+
+ for (uint32_t j = 0; j < pterm.term.getPhraseLength(); ++j) {
+ const ITermData &splitp_td = _terms[pterm.idx + j]; // split term for position j of this phrase
+ const ITermFieldData& splitp_tfd = splitp_td.field(0); // each split term has exactly the one field added in considerTerm
+ HowToCopy meta;
+ meta.orig_handle = pterm.orig_handle;
+ meta.split_handle = splitp_tfd.getHandle();
+ meta.offsetInPhrase = j;
+ _copyInfo.push_back(meta);
+ }
+ }
+}
+
+PhraseSplitterQueryEnv::~PhraseSplitterQueryEnv() = default; // defaulted here, out of line from the header declaration
+
+}
diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h
new file mode 100644
index 00000000000..19a4cb206b9
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h
@@ -0,0 +1,85 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "iqueryenvironment.h"
+#include "simpletermdata.h"
+
+namespace search::fef {
+
+/**
+ * This class holds the constant portion of PhraseSplitter: it wraps
+ * a query environment and exposes every phrase term searching the
+ * given field as separate single terms. New SimpleTermData objects
+ * are created for each split phrase term and managed by this
+ * class. Other terms are served unmodified from the wrapped query
+ * environment.
+ *
+ * No TermFieldMatchData objects are held here; this class only
+ * records (in _copyInfo) which original term field handle each split
+ * term field handle is derived from, and its offset in the phrase.
+ *
+ * Use this class if you want to handle a phrase term the same way as
+ * single terms.
+ **/
+class PhraseSplitterQueryEnv : public IQueryEnvironment
+{
+protected:
+ struct TermIdx {
+ uint32_t idx; // index into either query environment or vector of TermData objects
+ bool splitted; // whether this term has been splitted or not
+ TermIdx(uint32_t i, bool s) : idx(i), splitted(s) {}
+ };
+ struct PhraseTerm {
+ const ITermData & term; // for original phrase
+ uint32_t idx; // index into vector of our TermData objects
+ TermFieldHandle orig_handle; // handle of the original phrase's term field for the split field
+ PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {}
+ };
+ struct HowToCopy {
+ TermFieldHandle orig_handle; // handle of the original phrase term field
+ TermFieldHandle split_handle; // handle assigned to the split term field
+ uint32_t offsetInPhrase; // 0-based position of the split term within the phrase
+ };
+
+ const IQueryEnvironment &_queryEnv;
+ std::vector<SimpleTermData> _terms; // splitted terms
+ std::vector<HowToCopy> _copyInfo;
+ std::vector<TermIdx> _termIdxMap; // renumbering of terms
+ TermFieldHandle _maxHandle; // the largest among original term field handles
+ TermFieldHandle _skipHandles; // how many handles to skip; also the handle given to the first split term
+
+ void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId);
+
+public:
+ /**
+ * Create a phrase splitter query environment based on the given query environment.
+ *
+ * @param queryEnv the query environment to wrap.
+ * @param fieldId the field where we need to split phrases
+ **/
+ PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId);
+ ~PhraseSplitterQueryEnv();
+
+ /**
+ * Number of terms exposed after splitting; each position of a split phrase counts as one term.
+ **/
+ uint32_t getNumTerms() const override { return _termIdxMap.size(); }
+
+ const ITermData * getTerm(uint32_t idx) const override { // idx is in the renumbered (post-split) term space
+ if (idx >= _termIdxMap.size()) {
+ return nullptr;
+ }
+ const TermIdx & ti = _termIdxMap[idx];
+ return ti.splitted ? &_terms[ti.idx] : _queryEnv.getTerm(ti.idx); // split terms are owned here, others come from the wrapped environment
+ }
+
+ const Properties & getProperties() const override { return _queryEnv.getProperties(); }
+ const Location & getLocation() const override { return _queryEnv.getLocation(); }
+ const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); }
+ double get_average_field_length(const vespalib::string &field_name) const override { return _queryEnv.get_average_field_length(field_name); }
+ const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); }
+};
+
+
+}
diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
index e84f61332e1..a0d53b12fe5 100644
--- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
+++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
@@ -4,74 +4,16 @@
namespace search::fef {
-void
-PhraseSplitter::considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId)
-{
- typedef search::fef::ITermFieldRangeAdapter FRA;
-
- for (FRA iter(term); iter.valid(); iter.next()) {
- if (iter.get().getFieldId() == fieldId) {
- TermFieldHandle h = iter.get().getHandle();
- _maxHandle = std::max(_maxHandle, h);
- if (term.getPhraseLength() > 1) {
- SimpleTermData prototype;
- prototype.setWeight(term.getWeight());
- prototype.setPhraseLength(1);
- prototype.setUniqueId(term.getUniqueId());
- prototype.addField(fieldId);
- phraseTerms.push_back(PhraseTerm(term, _terms.size(), h));
- for (uint32_t i = 0; i < term.getPhraseLength(); ++i) {
- _terms.push_back(prototype);
- _termIdxMap.push_back(TermIdx(_terms.size() - 1, true));
- }
- return;
- }
- }
- }
- _termIdxMap.push_back(TermIdx(termIdx, false));
-}
-
-PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId) :
- _queryEnv(queryEnv),
- _matchData(nullptr),
- _terms(),
- _termMatches(),
- _termIdxMap(),
- _maxHandle(0),
- _skipHandles(0)
+PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId)
+ : PhraseSplitterQueryEnv(queryEnv, fieldId),
+ _matchData(nullptr),
+ _termMatches()
{
- TermFieldHandle numHandles = 0; // how many handles existed in underlying data
- std::vector<PhraseTerm> phraseTerms; // data about original phrase terms
-
- for (uint32_t i = 0; i < queryEnv.getNumTerms(); ++i) {
- const ITermData *td = queryEnv.getTerm(i);
- assert(td != nullptr);
- considerTerm(i, *td, phraseTerms, fieldId);
- numHandles += td->numFields();
- }
-
- _skipHandles = _maxHandle + 1 + numHandles;
_termMatches.reserve(_terms.size());
- for (auto & term : _terms) {
- // start at _skipHandles + 0
- term.field(0).setHandle(_skipHandles + _termMatches.size());
+ for ([[maybe_unused]] auto & term : _terms) {
_termMatches.emplace_back();
_termMatches.back().setFieldId(fieldId);
}
-
- for (uint32_t i = 0; i < phraseTerms.size(); ++i) {
- const PhraseTerm &pterm = phraseTerms[i];
-
- for (uint32_t j = 0; j < pterm.term.getPhraseLength(); ++j) {
- const ITermData &splitp_td = _terms[pterm.idx + j];
- const ITermFieldData& splitp_tfd = splitp_td.field(0);
- HowToCopy meta;
- meta.orig_handle = pterm.orig_handle;
- meta.split_handle = splitp_tfd.getHandle();
- meta.offsetInPhrase = j;
- _copyInfo.push_back(meta);
- }
- }
}
PhraseSplitter::~PhraseSplitter() = default;
diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
index 8b399885496..dc7954a7fcc 100644
--- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
+++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
@@ -2,11 +2,9 @@
#pragma once
-#include "iqueryenvironment.h"
+#include "phrase_splitter_query_env.h"
#include "matchdata.h"
-#include "simpletermdata.h"
#include "termfieldmatchdata.h"
-#include "fieldinfo.h"
namespace search::fef {
@@ -25,36 +23,10 @@ namespace search::fef {
* Use this class if you want to handle a phrase term the same way as
* single terms.
**/
-class PhraseSplitter : public IQueryEnvironment
+class PhraseSplitter : public PhraseSplitterQueryEnv
{
-private:
- struct TermIdx {
- uint32_t idx; // index into either query environment or vector of TermData objects
- bool splitted; // whether this term has been splitted or not
- TermIdx(uint32_t i, bool s) : idx(i), splitted(s) {}
- };
- struct PhraseTerm {
- const ITermData & term; // for original phrase
- uint32_t idx; // index into vector of our TermData objects
- TermFieldHandle orig_handle;
- PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {}
- };
- struct HowToCopy {
- TermFieldHandle orig_handle;
- TermFieldHandle split_handle;
- uint32_t offsetInPhrase;
- };
-
- const IQueryEnvironment &_queryEnv;
const MatchData *_matchData;
- std::vector<SimpleTermData> _terms; // splitted terms
std::vector<TermFieldMatchData> _termMatches; // match objects associated with splitted terms
- std::vector<HowToCopy> _copyInfo;
- std::vector<TermIdx> _termIdxMap; // renumbering of terms
- TermFieldHandle _maxHandle; // the largest among original term field handles
- TermFieldHandle _skipHandles; // how many handles to skip
-
- void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId);
TermFieldMatchData *resolveSplittedTermField(TermFieldHandle handle) {
return &_termMatches[handle - _skipHandles];
@@ -88,15 +60,6 @@ public:
* Update the underlying TermFieldMatchData objects based on the bound MatchData object.
**/
void update();
- uint32_t getNumTerms() const override { return _termIdxMap.size(); }
-
- const ITermData * getTerm(uint32_t idx) const override {
- if (idx >= _termIdxMap.size()) {
- return nullptr;
- }
- const TermIdx & ti = _termIdxMap[idx];
- return ti.splitted ? &_terms[ti.idx] : _queryEnv.getTerm(ti.idx);
- }
/**
* Inherit doc from MatchData.
@@ -108,11 +71,6 @@ public:
return handle < _skipHandles ? _matchData->resolveTermField(handle) : resolveSplittedTermField(handle);
}
- const Properties & getProperties() const override { return _queryEnv.getProperties(); }
- const Location & getLocation() const override { return _queryEnv.getLocation(); }
- const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); }
- double get_average_field_length(const vespalib::string &field_name) const override { return _queryEnv.get_average_field_length(field_name); }
- const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); }
void bind_match_data(const fef::MatchData &md) { _matchData = &md; }
};