diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2020-05-20 12:54:22 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2020-05-20 12:56:11 +0200 |
commit | 2565a2a4244d0347a0e983dd921bbea130ff2d32 (patch) | |
tree | 0cc21fe41846ff708cb88c230abf2002a8b8d34d /searchlib | |
parent | 034146bc6de7babdeecae443480f11cba4c3461b (diff) |
Split out constant portion of PhraseSplitter to PhraseSplitterQueryEnv.
Diffstat (limited to 'searchlib')
5 files changed, 169 insertions, 107 deletions
diff --git a/searchlib/src/vespa/searchlib/fef/CMakeLists.txt b/searchlib/src/vespa/searchlib/fef/CMakeLists.txt index 08e64701c05..396775b20c5 100644 --- a/searchlib/src/vespa/searchlib/fef/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/fef/CMakeLists.txt @@ -23,6 +23,7 @@ vespa_add_library(searchlib_fef OBJECT parameter.cpp parameterdescriptions.cpp parametervalidator.cpp + phrase_splitter_query_env.cpp phrasesplitter.cpp properties.cpp queryproperties.cpp diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp new file mode 100644 index 00000000000..b474317a33b --- /dev/null +++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp @@ -0,0 +1,76 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "phrase_splitter_query_env.h" + +namespace search::fef { + +void +PhraseSplitterQueryEnv::considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId) +{ + typedef search::fef::ITermFieldRangeAdapter FRA; + + for (FRA iter(term); iter.valid(); iter.next()) { + if (iter.get().getFieldId() == fieldId) { + TermFieldHandle h = iter.get().getHandle(); + _maxHandle = std::max(_maxHandle, h); + if (term.getPhraseLength() > 1) { + SimpleTermData prototype; + prototype.setWeight(term.getWeight()); + prototype.setPhraseLength(1); + prototype.setUniqueId(term.getUniqueId()); + prototype.addField(fieldId); + phraseTerms.push_back(PhraseTerm(term, _terms.size(), h)); + for (uint32_t i = 0; i < term.getPhraseLength(); ++i) { + _terms.push_back(prototype); + _termIdxMap.push_back(TermIdx(_terms.size() - 1, true)); + } + return; + } + } + } + _termIdxMap.push_back(TermIdx(termIdx, false)); +} + +PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId) : + _queryEnv(queryEnv), + _terms(), + _termIdxMap(), + _maxHandle(0), + _skipHandles(0) +{ + TermFieldHandle numHandles = 0; // how many handles existed in underlying data + std::vector<PhraseTerm> phraseTerms; // data about original phrase terms + + for (uint32_t i = 0; i < queryEnv.getNumTerms(); ++i) { + const ITermData *td = queryEnv.getTerm(i); + assert(td != nullptr); + considerTerm(i, *td, phraseTerms, fieldId); + numHandles += td->numFields(); + } + + _skipHandles = _maxHandle + 1 + numHandles; + TermFieldHandle term_handle = _skipHandles; + for (auto & term : _terms) { + // start at _skipHandles + 0 + term.field(0).setHandle(term_handle); + ++term_handle; + } + + for (uint32_t i = 0; i < phraseTerms.size(); ++i) { + const PhraseTerm &pterm = phraseTerms[i]; + + for (uint32_t j = 0; j < pterm.term.getPhraseLength(); ++j) { + const ITermData &splitp_td = _terms[pterm.idx + j]; + const ITermFieldData& splitp_tfd = splitp_td.field(0); + HowToCopy meta; + meta.orig_handle = pterm.orig_handle; + meta.split_handle = splitp_tfd.getHandle(); + meta.offsetInPhrase = j; + _copyInfo.push_back(meta); + } + } +} + +PhraseSplitterQueryEnv::~PhraseSplitterQueryEnv() = default; + +} diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h new file mode 100644 index 00000000000..19a4cb206b9 --- /dev/null +++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h @@ -0,0 +1,85 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "iqueryenvironment.h" +#include "simpletermdata.h" + +namespace search::fef { + +/** + * This class is used to split all phrase terms in a query environment + * into separate terms. New TermData and TermFieldMatchData objects + * are created for each splitted phrase term and managed by this + * class. Unmodified single terms are served from the query + * environment and match data. + * + * The TermFieldMatchData objects managed by this class are updated + * based on the TermFieldMatchData objects associated with the + * original phrase terms. Positions are adjusted with +1 for each term + * after the first one. + * + * Use this class if you want to handle a phrase term the same way as + * single terms. + **/ +class PhraseSplitterQueryEnv : public IQueryEnvironment +{ +protected: + struct TermIdx { + uint32_t idx; // index into either query environment or vector of TermData objects + bool splitted; // whether this term has been splitted or not + TermIdx(uint32_t i, bool s) : idx(i), splitted(s) {} + }; + struct PhraseTerm { + const ITermData & term; // for original phrase + uint32_t idx; // index into vector of our TermData objects + TermFieldHandle orig_handle; + PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {} + }; + struct HowToCopy { + TermFieldHandle orig_handle; + TermFieldHandle split_handle; + uint32_t offsetInPhrase; + }; + + const IQueryEnvironment &_queryEnv; + std::vector<SimpleTermData> _terms; // splitted terms + std::vector<HowToCopy> _copyInfo; + std::vector<TermIdx> _termIdxMap; // renumbering of terms + TermFieldHandle _maxHandle; // the largest among original term field handles + TermFieldHandle _skipHandles; // how many handles to skip + + void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId); + +public: + /** + * Create a phrase splitter based on the given query environment. + * + * @param queryEnv the query environment to wrap. + * @param field the field where we need to split phrases + **/ + PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId); + ~PhraseSplitterQueryEnv(); + + /** + * Update the underlying TermFieldMatchData objects based on the bound MatchData object. + **/ + uint32_t getNumTerms() const override { return _termIdxMap.size(); } + + const ITermData * getTerm(uint32_t idx) const override { + if (idx >= _termIdxMap.size()) { + return nullptr; + } + const TermIdx & ti = _termIdxMap[idx]; + return ti.splitted ? &_terms[ti.idx] : _queryEnv.getTerm(ti.idx); + } + + const Properties & getProperties() const override { return _queryEnv.getProperties(); } + const Location & getLocation() const override { return _queryEnv.getLocation(); } + const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); } + double get_average_field_length(const vespalib::string &field_name) const override { return _queryEnv.get_average_field_length(field_name); } + const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); } +}; + + +} diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp index e84f61332e1..a0d53b12fe5 100644 --- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp +++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp @@ -4,74 +4,16 @@ namespace search::fef { -void -PhraseSplitter::considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId) -{ - typedef search::fef::ITermFieldRangeAdapter FRA; - - for (FRA iter(term); iter.valid(); iter.next()) { - if (iter.get().getFieldId() == fieldId) { - TermFieldHandle h = iter.get().getHandle(); - _maxHandle = std::max(_maxHandle, h); - if (term.getPhraseLength() > 1) { - SimpleTermData prototype; - prototype.setWeight(term.getWeight()); - prototype.setPhraseLength(1); - prototype.setUniqueId(term.getUniqueId()); - prototype.addField(fieldId); - phraseTerms.push_back(PhraseTerm(term, _terms.size(), h)); - for (uint32_t i = 0; i < term.getPhraseLength(); ++i) { - _terms.push_back(prototype); - _termIdxMap.push_back(TermIdx(_terms.size() - 1, true)); - } - return; - } - } - } - _termIdxMap.push_back(TermIdx(termIdx, false)); -} - -PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId) : - _queryEnv(queryEnv), - _matchData(nullptr), - _terms(), - _termMatches(), - _termIdxMap(), - _maxHandle(0), - _skipHandles(0) +PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId) + : PhraseSplitterQueryEnv(queryEnv, fieldId), + _matchData(nullptr), + _termMatches() { - TermFieldHandle numHandles = 0; // how many handles existed in underlying data - std::vector<PhraseTerm> phraseTerms; // data about original phrase terms - - for (uint32_t i = 0; i < queryEnv.getNumTerms(); ++i) { - const ITermData *td = queryEnv.getTerm(i); - assert(td != nullptr); - considerTerm(i, *td, phraseTerms, fieldId); - numHandles += td->numFields(); - } - - _skipHandles = _maxHandle + 1 + numHandles; _termMatches.reserve(_terms.size()); - for (auto & term : _terms) { - // start at _skipHandles + 0 - term.field(0).setHandle(_skipHandles + _termMatches.size()); + for ([[maybe_unused]] auto & term : _terms) { _termMatches.emplace_back(); _termMatches.back().setFieldId(fieldId); } - - for (uint32_t i = 0; i < phraseTerms.size(); ++i) { - const PhraseTerm &pterm = phraseTerms[i]; - - for (uint32_t j = 0; j < pterm.term.getPhraseLength(); ++j) { - const ITermData &splitp_td = _terms[pterm.idx + j]; - const ITermFieldData& splitp_tfd = splitp_td.field(0); - HowToCopy meta; - meta.orig_handle = pterm.orig_handle; - meta.split_handle = splitp_tfd.getHandle(); - meta.offsetInPhrase = j; - _copyInfo.push_back(meta); - } - } } PhraseSplitter::~PhraseSplitter() = default; diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h index 8b399885496..dc7954a7fcc 100644 --- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h +++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h @@ -2,11 +2,9 @@ #pragma once -#include "iqueryenvironment.h" +#include "phrase_splitter_query_env.h" #include "matchdata.h" -#include "simpletermdata.h" #include "termfieldmatchdata.h" -#include "fieldinfo.h" namespace search::fef { @@ -25,36 +23,10 @@ namespace search::fef { * Use this class if you want to handle a phrase term the same way as * single terms. **/ -class PhraseSplitter : public IQueryEnvironment +class PhraseSplitter : public PhraseSplitterQueryEnv { -private: - struct TermIdx { - uint32_t idx; // index into either query environment or vector of TermData objects - bool splitted; // whether this term has been splitted or not - TermIdx(uint32_t i, bool s) : idx(i), splitted(s) {} - }; - struct PhraseTerm { - const ITermData & term; // for original phrase - uint32_t idx; // index into vector of our TermData objects - TermFieldHandle orig_handle; - PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {} - }; - struct HowToCopy { - TermFieldHandle orig_handle; - TermFieldHandle split_handle; - uint32_t offsetInPhrase; - }; - - const IQueryEnvironment &_queryEnv; const MatchData *_matchData; - std::vector<SimpleTermData> _terms; // splitted terms std::vector<TermFieldMatchData> _termMatches; // match objects associated with splitted terms - std::vector<HowToCopy> _copyInfo; - std::vector<TermIdx> _termIdxMap; // renumbering of terms - TermFieldHandle _maxHandle; // the largest among original term field handles - TermFieldHandle _skipHandles; // how many handles to skip - - void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId); TermFieldMatchData *resolveSplittedTermField(TermFieldHandle handle) { return &_termMatches[handle - _skipHandles]; @@ -88,15 +60,6 @@ public: * Update the underlying TermFieldMatchData objects based on the bound MatchData object. **/ void update(); - uint32_t getNumTerms() const override { return _termIdxMap.size(); } - - const ITermData * getTerm(uint32_t idx) const override { - if (idx >= _termIdxMap.size()) { - return nullptr; - } - const TermIdx & ti = _termIdxMap[idx]; - return ti.splitted ? &_terms[ti.idx] : _queryEnv.getTerm(ti.idx); - } /** * Inherit doc from MatchData. @@ -108,11 +71,6 @@ public: return handle < _skipHandles ? _matchData->resolveTermField(handle) : resolveSplittedTermField(handle); } - const Properties & getProperties() const override { return _queryEnv.getProperties(); } - const Location & getLocation() const override { return _queryEnv.getLocation(); } - const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); } - double get_average_field_length(const vespalib::string &field_name) const override { return _queryEnv.get_average_field_length(field_name); } - const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); } void bind_match_data(const fef::MatchData &md) { _matchData = &md; } }; |