diff options
Diffstat (limited to 'searchlib')
8 files changed, 78 insertions, 52 deletions
diff --git a/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp b/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp index 419b5261510..f264a3ab949 100644 --- a/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp +++ b/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp @@ -2,6 +2,7 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/searchlib/fef/matchdatalayout.h> #include <vespa/searchlib/fef/phrasesplitter.h> +#include <vespa/searchlib/fef/phrase_splitter_query_env.h> #include <vespa/searchlib/fef/test/queryenvironment.h> #include <iomanip> #include <iostream> @@ -42,7 +43,8 @@ Benchmark::run(size_t numRuns, size_t numPositions) tmd->appendPosition(TermFieldMatchDataPosition(0, i, 0, numPositions)); } - PhraseSplitter ps(qe, 0); + PhraseSplitterQueryEnv ps_query_env(qe, 0); + PhraseSplitter ps(ps_query_env); std::cout << "Start benchmark with numRuns(" << numRuns << ") and numPositions(" << numPositions << ")" << std::endl; diff --git a/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp b/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp index 1a7c4ccc467..afd422024d1 100644 --- a/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp +++ b/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp @@ -5,6 +5,7 @@ LOG_SETUP("phrasesplitter_test"); #include <vespa/searchlib/fef/matchdatalayout.h> #include <vespa/searchlib/fef/phrasesplitter.h> +#include <vespa/searchlib/fef/phrase_splitter_query_env.h> #include <vespa/searchlib/fef/test/queryenvironment.h> namespace search { @@ -84,12 +85,13 @@ PhraseSplitterTest::testSplitter() terms.push_back(SimpleTermData()); terms.back().addField(0).setHandle(mdl.allocTermField(0)); MatchData::UP md = mdl.createMatchData(); - PhraseSplitter ps(qe, 0); - ASSERT_TRUE(ps.getNumTerms() == 1); + PhraseSplitterQueryEnv ps_query_env(qe, 0); + PhraseSplitter ps(ps_query_env); + ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 1); ps.bind_match_data(*md); 
ps.update(); // check that nothing is served from the splitter - EXPECT_EQUAL(ps.getTerm(0), &terms[0]); + EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(0), &terms[0]); TermFieldHandle handle = terms[0].lookupField(0)->getHandle(); EXPECT_EQUAL(ps.resolveTermField(handle), md->resolveTermField(handle)); } @@ -103,14 +105,15 @@ PhraseSplitterTest::testSplitter() terms.back().addField(0).setHandle(mdl.allocTermField(0)); terms.back().addField(7).setHandle(mdl.allocTermField(7)); MatchData::UP md = mdl.createMatchData(); - PhraseSplitter ps(qe, 7); - ASSERT_TRUE(ps.getNumTerms() == 3); + PhraseSplitterQueryEnv ps_query_env(qe, 7); + PhraseSplitter ps(ps_query_env); + ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 3); ps.bind_match_data(*md); ps.update(); // check that all is served from the splitter for (size_t i = 0; i < 3; ++i) { // fprintf(stderr, "checking term %d\n", (int)i); - const ITermData *td = ps.getTerm(i); + const ITermData *td = ps.get_phrase_splitter_query_env().getTerm(i); EXPECT_NOT_EQUAL(td, &terms[0]); EXPECT_NOT_EQUAL(td->lookupField(7), (ITermFieldData *)0); EXPECT_EQUAL(td->lookupField(0), (ITermFieldData *)0); @@ -135,15 +138,16 @@ PhraseSplitterTest::testSplitter() } terms[1].setPhraseLength(3); MatchData::UP md = mdl.createMatchData(); - PhraseSplitter ps(qe, 4); - ASSERT_TRUE(ps.getNumTerms() == 5); + PhraseSplitterQueryEnv ps_query_env(qe, 4); + PhraseSplitter ps(ps_query_env); + ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 5); ps.bind_match_data(*md); ps.update(); { // first term // fprintf(stderr, "first term\n"); - EXPECT_EQUAL(ps.getTerm(0), &terms[0]); - TEST_DO(assertTermData(ps.getTerm(0), 0, 1, 4, 0)); - TEST_DO(assertTermData(ps.getTerm(0), 0, 1, 7, 1)); + EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(0), &terms[0]); + TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(0), 0, 1, 4, 0)); + TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(0), 0, 1, 
7, 1)); TermFieldHandle handle = terms[0].lookupField(4)->getHandle(); EXPECT_EQUAL(ps.resolveTermField(handle), md->resolveTermField(handle)); @@ -152,7 +156,7 @@ PhraseSplitterTest::testSplitter() } for (size_t i = 0; i < 3; ++i) { // phrase // fprintf(stderr, "phrase term %zd\n", i); - const ITermData *td = ps.getTerm(i + 1); + const ITermData *td = ps.get_phrase_splitter_query_env().getTerm(i + 1); EXPECT_NOT_EQUAL(td, &terms[1]); TEST_DO(assertTermData(td, 1, 1, 4, i + 11)); // skipHandles == 11 EXPECT_EQUAL(td->lookupField(7), (ITermFieldData *)0); @@ -161,9 +165,9 @@ PhraseSplitterTest::testSplitter() } { // last term // fprintf(stderr, "last term\n"); - EXPECT_EQUAL(ps.getTerm(4), &terms[2]); - TEST_DO(assertTermData(ps.getTerm(4), 2, 1, 4, 4)); - TEST_DO(assertTermData(ps.getTerm(4), 2, 1, 7, 5)); + EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(4), &terms[2]); + TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(4), 2, 1, 4, 4)); + TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(4), 2, 1, 7, 5)); // fprintf(stderr, "inspect term %p #f %zd\n", &terms[2], terms[2].numFields()); fflush(stderr); @@ -189,8 +193,9 @@ PhraseSplitterTest::testSplitterUpdate() terms[0].setPhraseLength(2); terms[2].setPhraseLength(2); MatchData::UP md = mdl.createMatchData(); - PhraseSplitter ps(qe, 0); - ASSERT_TRUE(ps.getNumTerms() == 5); + PhraseSplitterQueryEnv ps_query_env(qe, 0); + PhraseSplitter ps(ps_query_env); + ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 5); { // first phrase TermFieldMatchData * tmd = md->resolveTermField(terms[0].lookupField(0)->getHandle()); tmd->appendPosition(TermFieldMatchDataPosition(0, 10, 0, 1000)); @@ -206,19 +211,19 @@ PhraseSplitterTest::testSplitterUpdate() ps.bind_match_data(*md); ps.update(); for (size_t i = 0; i < 2; ++i) { // first phrase - const TermFieldMatchData * tmd = ps.resolveTermField(ps.getTerm(i)->lookupField(0)->getHandle()); + const TermFieldMatchData * tmd = 
ps.resolveTermField(ps.get_phrase_splitter_query_env().getTerm(i)->lookupField(0)->getHandle()); TermFieldMatchData::PositionsIterator itr = tmd->begin(); EXPECT_EQUAL((itr++)->getPosition(), 10 + i); ASSERT_TRUE(itr == tmd->end()); } { // first term - TermFieldMatchData * tmd = md->resolveTermField(ps.getTerm(2)->lookupField(0)->getHandle()); + TermFieldMatchData * tmd = md->resolveTermField(ps.get_phrase_splitter_query_env().getTerm(2)->lookupField(0)->getHandle()); TermFieldMatchData::PositionsIterator itr = tmd->begin(); EXPECT_EQUAL((itr++)->getPosition(), 20u); ASSERT_TRUE(itr == tmd->end()); } for (size_t i = 0; i < 2; ++i) { // second phrase - const TermFieldMatchData * tmd = ps.resolveTermField(ps.getTerm(i + 3)->lookupField(0)->getHandle()); + const TermFieldMatchData * tmd = ps.resolveTermField(ps.get_phrase_splitter_query_env().getTerm(i + 3)->lookupField(0)->getHandle()); TermFieldMatchData::PositionsIterator itr = tmd->begin(); EXPECT_EQUAL((itr++)->getPosition(), 30 + i); ASSERT_TRUE(itr == tmd->end()); diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp index 6dba7b87c08..156d7aeb7eb 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp +++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp @@ -2,6 +2,7 @@ #include "computer.h" #include <vespa/searchlib/features/utils.h> +#include <vespa/searchlib/fef/phrase_splitter_query_env.h> #include <vespa/searchlib/fef/properties.h> #include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/locale/c.h> @@ -35,10 +36,11 @@ Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitt _cachedHits() { // Store term data for all terms searching in this field - _queryTermFieldMatch.reserve(splitter.getNumTerms()); - _cachedHits.reserve(splitter.getNumTerms()); - for (uint32_t i = 0; i < splitter.getNumTerms(); ++i) { - QueryTerm qt = 
QueryTermFactory::create(splitter, i, true); + const auto& splitter_query_env = splitter.get_phrase_splitter_query_env(); + _queryTermFieldMatch.reserve(splitter_query_env.getNumTerms()); + _cachedHits.reserve(splitter_query_env.getNumTerms()); + for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) { + QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true); _totalTermWeight += qt.termData()->getWeight().percent(); _totalTermSignificance += qt.significance(); _simpleMetrics.addQueryTerm(qt.termData()->getWeight().percent()); @@ -52,11 +54,11 @@ Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitt } } - _totalTermWeight = atoi(splitter.getProperties().lookup(propertyNamespace, "totalTermWeight"). + _totalTermWeight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight"). get(vespalib::make_string("%d", _totalTermWeight)).c_str()); - _totalTermSignificance = vespalib::locale::c::atof(splitter.getProperties().lookup(propertyNamespace, "totalTermSignificance"). + _totalTermSignificance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance"). get(vespalib::make_string("%f", _totalTermSignificance)).c_str()); - if (splitter.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) { + if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) { _simpleMetrics.setTotalWeightInQuery(_totalTermWeight); } @@ -139,7 +141,7 @@ Computer::handleError(uint32_t fieldPos, uint32_t docId) const static int errcnt; if (errcnt < 1000) { errcnt++; - const FieldInfo * finfo = _splitter.getIndexEnvironment().getField(getFieldId()); + const FieldInfo * finfo = _splitter.get_phrase_splitter_query_env().getIndexEnvironment().getField(getFieldId()); LOG(debug, "Bad field position %u >= fieldLength %u for field '%s' document %u. 
" "Document was probably refed during query (Ticket 7104969)", fieldPos, _fieldLength, diff --git a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp index 94240422106..de79280517d 100644 --- a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp +++ b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp @@ -5,6 +5,7 @@ #include <vespa/searchlib/features/fieldmatch/computer.h> #include <vespa/searchlib/fef/featurenamebuilder.h> #include <vespa/searchlib/fef/indexproperties.h> +#include <vespa/searchlib/fef/phrase_splitter_query_env.h> #include <vespa/searchlib/fef/properties.h> #include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/locale/c.h> @@ -20,6 +21,7 @@ namespace search::features { */ class FieldMatchExecutor : public fef::FeatureExecutor { private: + PhraseSplitterQueryEnv _splitter_env; fef::PhraseSplitter _splitter; const fef::FieldInfo & _field; fieldmatch::Computer _cmp; @@ -37,7 +39,8 @@ FieldMatchExecutor::FieldMatchExecutor(const IQueryEnvironment & queryEnv, const FieldInfo & field, [[maybe_unused]] const fieldmatch::Params & params) : FeatureExecutor(), - _splitter(queryEnv, field.id()), + _splitter_env(queryEnv, field.id()), + _splitter(_splitter_env), _field(field), _cmp(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()), _splitter, field, params) diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp index b474317a33b..593b8fb29ce 100644 --- a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp +++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp @@ -31,12 +31,13 @@ PhraseSplitterQueryEnv::considerTerm(uint32_t termIdx, const ITermData &term, st _termIdxMap.push_back(TermIdx(termIdx, false)); } -PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId) : - 
_queryEnv(queryEnv), - _terms(), - _termIdxMap(), - _maxHandle(0), - _skipHandles(0) +PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId) + : _queryEnv(queryEnv), + _terms(), + _termIdxMap(), + _maxHandle(0), + _skipHandles(0), + _field_id(fieldId) { TermFieldHandle numHandles = 0; // how many handles existed in underlying data std::vector<PhraseTerm> phraseTerms; // data about original phrase terms diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h index 19a4cb206b9..5a9c85b79db 100644 --- a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h +++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h @@ -24,7 +24,7 @@ namespace search::fef { **/ class PhraseSplitterQueryEnv : public IQueryEnvironment { -protected: +private: struct TermIdx { uint32_t idx; // index into either query environment or vector of TermData objects bool splitted; // whether this term has been splitted or not @@ -36,18 +36,20 @@ protected: TermFieldHandle orig_handle; PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {} }; +public: struct HowToCopy { TermFieldHandle orig_handle; TermFieldHandle split_handle; uint32_t offsetInPhrase; }; - +private: const IQueryEnvironment &_queryEnv; std::vector<SimpleTermData> _terms; // splitted terms std::vector<HowToCopy> _copyInfo; std::vector<TermIdx> _termIdxMap; // renumbering of terms TermFieldHandle _maxHandle; // the largest among original term field handles TermFieldHandle _skipHandles; // how many handles to skip + uint32_t _field_id; void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId); @@ -79,6 +81,12 @@ public: const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); } double get_average_field_length(const vespalib::string 
&field_name) const override { return _queryEnv.get_average_field_length(field_name); } const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); } + + // Accessor methods used by PhraseSplitter + TermFieldHandle get_skip_handles() const { return _skipHandles; } + uint32_t get_num_phrase_split_terms() const { return _terms.size(); } + uint32_t get_field_id() const { return _field_id; } + const std::vector<HowToCopy>& get_copy_info() const { return _copyInfo; } }; diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp index a0d53b12fe5..b80a9c9e085 100644 --- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp +++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp @@ -1,18 +1,19 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "phrasesplitter.h" +#include "phrase_splitter_query_env.h" namespace search::fef { -PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId) - : PhraseSplitterQueryEnv(queryEnv, fieldId), +PhraseSplitter::PhraseSplitter(const PhraseSplitterQueryEnv& phrase_splitter_query_env) + : _phrase_splitter_query_env(phrase_splitter_query_env), + _skipHandles(_phrase_splitter_query_env.get_skip_handles()), _matchData(nullptr), - _termMatches() + _termMatches(_phrase_splitter_query_env.get_num_phrase_split_terms()) { - _termMatches.reserve(_terms.size()); - for ([[maybe_unused]] auto & term : _terms) { - _termMatches.emplace_back(); - _termMatches.back().setFieldId(fieldId); + uint32_t field_id = _phrase_splitter_query_env.get_field_id(); + for (auto & term_match : _termMatches) { + term_match.setFieldId(field_id); } } @@ -33,11 +34,11 @@ PhraseSplitter::copyTermFieldMatchData(TermFieldMatchData & dst, const TermField void PhraseSplitter::update() { - for (uint32_t i = 0; i < _copyInfo.size(); ++i) { - const TermFieldMatchData 
*src = _matchData->resolveTermField(_copyInfo[i].orig_handle); - TermFieldMatchData *dst = resolveSplittedTermField(_copyInfo[i].split_handle); + for (const auto &copy_info : _phrase_splitter_query_env.get_copy_info()) { + const TermFieldMatchData *src = _matchData->resolveTermField(copy_info.orig_handle); + TermFieldMatchData *dst = resolveSplittedTermField(copy_info.split_handle); assert(src != nullptr && dst != nullptr); - copyTermFieldMatchData(*dst, *src, _copyInfo[i].offsetInPhrase); + copyTermFieldMatchData(*dst, *src, copy_info.offsetInPhrase); } } diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h index dc7954a7fcc..8cec1d2f266 100644 --- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h +++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h @@ -2,12 +2,13 @@ #pragma once -#include "phrase_splitter_query_env.h" #include "matchdata.h" #include "termfieldmatchdata.h" namespace search::fef { +class PhraseSplitterQueryEnv; + /** * This class is used to split all phrase terms in a query environment * into separate terms. New TermData and TermFieldMatchData objects @@ -23,8 +24,10 @@ namespace search::fef { * Use this class if you want to handle a phrase term the same way as * single terms. **/ -class PhraseSplitter : public PhraseSplitterQueryEnv +class PhraseSplitter { + const PhraseSplitterQueryEnv& _phrase_splitter_query_env; + TermFieldHandle _skipHandles; const MatchData *_matchData; std::vector<TermFieldMatchData> _termMatches; // match objects associated with splitted terms @@ -43,7 +46,7 @@ public: * @param queryEnv the query environment to wrap. 
* @param field the field where we need to split phrases **/ - PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId); + PhraseSplitter(const PhraseSplitterQueryEnv &phrase_splitter_query_env); ~PhraseSplitter(); /** @@ -72,6 +75,7 @@ public: } void bind_match_data(const fef::MatchData &md) { _matchData = &md; } + const PhraseSplitterQueryEnv& get_phrase_splitter_query_env() const { return _phrase_splitter_query_env; } }; } |