summary | refs | log | tree | commit | diff | stats
path: root/searchlib/src
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@broadpark.no> 2020-05-20 14:03:25 +0200
committer: Tor Egge <Tor.Egge@broadpark.no> 2020-05-20 14:10:13 +0200
commit: 1bc201d21999b1463b2123b44830351c40de57e8 (patch)
tree: d1ebd6d4ff8a403c8f3546a98dee5e1f22102fd2 /searchlib/src
parent: 2565a2a4244d0347a0e983dd921bbea130ff2d32 (diff)
Explicitly manage PhraseSplitterQueryEnv.
Diffstat (limited to 'searchlib/src')
-rw-r--r-- searchlib/src/tests/fef/phrasesplitter/benchmark.cpp 4
-rw-r--r-- searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp 45
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp 18
-rw-r--r-- searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp 5
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp 13
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h 12
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp 23
-rw-r--r-- searchlib/src/vespa/searchlib/fef/phrasesplitter.h 10
8 files changed, 78 insertions, 52 deletions
diff --git a/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp b/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp
index 419b5261510..f264a3ab949 100644
--- a/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp
+++ b/searchlib/src/tests/fef/phrasesplitter/benchmark.cpp
@@ -2,6 +2,7 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/searchlib/fef/matchdatalayout.h>
#include <vespa/searchlib/fef/phrasesplitter.h>
+#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
#include <vespa/searchlib/fef/test/queryenvironment.h>
#include <iomanip>
#include <iostream>
@@ -42,7 +43,8 @@ Benchmark::run(size_t numRuns, size_t numPositions)
tmd->appendPosition(TermFieldMatchDataPosition(0, i, 0, numPositions));
}
- PhraseSplitter ps(qe, 0);
+ PhraseSplitterQueryEnv ps_query_env(qe, 0);
+ PhraseSplitter ps(ps_query_env);
std::cout << "Start benchmark with numRuns(" << numRuns << ") and numPositions(" << numPositions << ")" << std::endl;
diff --git a/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp b/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp
index 1a7c4ccc467..afd422024d1 100644
--- a/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp
+++ b/searchlib/src/tests/fef/phrasesplitter/phrasesplitter_test.cpp
@@ -5,6 +5,7 @@ LOG_SETUP("phrasesplitter_test");
#include <vespa/searchlib/fef/matchdatalayout.h>
#include <vespa/searchlib/fef/phrasesplitter.h>
+#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
#include <vespa/searchlib/fef/test/queryenvironment.h>
namespace search {
@@ -84,12 +85,13 @@ PhraseSplitterTest::testSplitter()
terms.push_back(SimpleTermData());
terms.back().addField(0).setHandle(mdl.allocTermField(0));
MatchData::UP md = mdl.createMatchData();
- PhraseSplitter ps(qe, 0);
- ASSERT_TRUE(ps.getNumTerms() == 1);
+ PhraseSplitterQueryEnv ps_query_env(qe, 0);
+ PhraseSplitter ps(ps_query_env);
+ ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 1);
ps.bind_match_data(*md);
ps.update();
// check that nothing is served from the splitter
- EXPECT_EQUAL(ps.getTerm(0), &terms[0]);
+ EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(0), &terms[0]);
TermFieldHandle handle = terms[0].lookupField(0)->getHandle();
EXPECT_EQUAL(ps.resolveTermField(handle), md->resolveTermField(handle));
}
@@ -103,14 +105,15 @@ PhraseSplitterTest::testSplitter()
terms.back().addField(0).setHandle(mdl.allocTermField(0));
terms.back().addField(7).setHandle(mdl.allocTermField(7));
MatchData::UP md = mdl.createMatchData();
- PhraseSplitter ps(qe, 7);
- ASSERT_TRUE(ps.getNumTerms() == 3);
+ PhraseSplitterQueryEnv ps_query_env(qe, 7);
+ PhraseSplitter ps(ps_query_env);
+ ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 3);
ps.bind_match_data(*md);
ps.update();
// check that all is served from the splitter
for (size_t i = 0; i < 3; ++i) {
// fprintf(stderr, "checking term %d\n", (int)i);
- const ITermData *td = ps.getTerm(i);
+ const ITermData *td = ps.get_phrase_splitter_query_env().getTerm(i);
EXPECT_NOT_EQUAL(td, &terms[0]);
EXPECT_NOT_EQUAL(td->lookupField(7), (ITermFieldData *)0);
EXPECT_EQUAL(td->lookupField(0), (ITermFieldData *)0);
@@ -135,15 +138,16 @@ PhraseSplitterTest::testSplitter()
}
terms[1].setPhraseLength(3);
MatchData::UP md = mdl.createMatchData();
- PhraseSplitter ps(qe, 4);
- ASSERT_TRUE(ps.getNumTerms() == 5);
+ PhraseSplitterQueryEnv ps_query_env(qe, 4);
+ PhraseSplitter ps(ps_query_env);
+ ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 5);
ps.bind_match_data(*md);
ps.update();
{ // first term
// fprintf(stderr, "first term\n");
- EXPECT_EQUAL(ps.getTerm(0), &terms[0]);
- TEST_DO(assertTermData(ps.getTerm(0), 0, 1, 4, 0));
- TEST_DO(assertTermData(ps.getTerm(0), 0, 1, 7, 1));
+ EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(0), &terms[0]);
+ TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(0), 0, 1, 4, 0));
+ TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(0), 0, 1, 7, 1));
TermFieldHandle handle = terms[0].lookupField(4)->getHandle();
EXPECT_EQUAL(ps.resolveTermField(handle), md->resolveTermField(handle));
@@ -152,7 +156,7 @@ PhraseSplitterTest::testSplitter()
}
for (size_t i = 0; i < 3; ++i) { // phrase
// fprintf(stderr, "phrase term %zd\n", i);
- const ITermData *td = ps.getTerm(i + 1);
+ const ITermData *td = ps.get_phrase_splitter_query_env().getTerm(i + 1);
EXPECT_NOT_EQUAL(td, &terms[1]);
TEST_DO(assertTermData(td, 1, 1, 4, i + 11)); // skipHandles == 11
EXPECT_EQUAL(td->lookupField(7), (ITermFieldData *)0);
@@ -161,9 +165,9 @@ PhraseSplitterTest::testSplitter()
}
{ // last term
// fprintf(stderr, "last term\n");
- EXPECT_EQUAL(ps.getTerm(4), &terms[2]);
- TEST_DO(assertTermData(ps.getTerm(4), 2, 1, 4, 4));
- TEST_DO(assertTermData(ps.getTerm(4), 2, 1, 7, 5));
+ EXPECT_EQUAL(ps.get_phrase_splitter_query_env().getTerm(4), &terms[2]);
+ TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(4), 2, 1, 4, 4));
+ TEST_DO(assertTermData(ps.get_phrase_splitter_query_env().getTerm(4), 2, 1, 7, 5));
// fprintf(stderr, "inspect term %p #f %zd\n", &terms[2], terms[2].numFields());
fflush(stderr);
@@ -189,8 +193,9 @@ PhraseSplitterTest::testSplitterUpdate()
terms[0].setPhraseLength(2);
terms[2].setPhraseLength(2);
MatchData::UP md = mdl.createMatchData();
- PhraseSplitter ps(qe, 0);
- ASSERT_TRUE(ps.getNumTerms() == 5);
+ PhraseSplitterQueryEnv ps_query_env(qe, 0);
+ PhraseSplitter ps(ps_query_env);
+ ASSERT_TRUE(ps.get_phrase_splitter_query_env().getNumTerms() == 5);
{ // first phrase
TermFieldMatchData * tmd = md->resolveTermField(terms[0].lookupField(0)->getHandle());
tmd->appendPosition(TermFieldMatchDataPosition(0, 10, 0, 1000));
@@ -206,19 +211,19 @@ PhraseSplitterTest::testSplitterUpdate()
ps.bind_match_data(*md);
ps.update();
for (size_t i = 0; i < 2; ++i) { // first phrase
- const TermFieldMatchData * tmd = ps.resolveTermField(ps.getTerm(i)->lookupField(0)->getHandle());
+ const TermFieldMatchData * tmd = ps.resolveTermField(ps.get_phrase_splitter_query_env().getTerm(i)->lookupField(0)->getHandle());
TermFieldMatchData::PositionsIterator itr = tmd->begin();
EXPECT_EQUAL((itr++)->getPosition(), 10 + i);
ASSERT_TRUE(itr == tmd->end());
}
{ // first term
- TermFieldMatchData * tmd = md->resolveTermField(ps.getTerm(2)->lookupField(0)->getHandle());
+ TermFieldMatchData * tmd = md->resolveTermField(ps.get_phrase_splitter_query_env().getTerm(2)->lookupField(0)->getHandle());
TermFieldMatchData::PositionsIterator itr = tmd->begin();
EXPECT_EQUAL((itr++)->getPosition(), 20u);
ASSERT_TRUE(itr == tmd->end());
}
for (size_t i = 0; i < 2; ++i) { // second phrase
- const TermFieldMatchData * tmd = ps.resolveTermField(ps.getTerm(i + 3)->lookupField(0)->getHandle());
+ const TermFieldMatchData * tmd = ps.resolveTermField(ps.get_phrase_splitter_query_env().getTerm(i + 3)->lookupField(0)->getHandle());
TermFieldMatchData::PositionsIterator itr = tmd->begin();
EXPECT_EQUAL((itr++)->getPosition(), 30 + i);
ASSERT_TRUE(itr == tmd->end());
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
index 6dba7b87c08..156d7aeb7eb 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
+++ b/searchlib/src/vespa/searchlib/features/fieldmatch/computer.cpp
@@ -2,6 +2,7 @@
#include "computer.h"
#include <vespa/searchlib/features/utils.h>
+#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
#include <vespa/searchlib/fef/properties.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/locale/c.h>
@@ -35,10 +36,11 @@ Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitt
_cachedHits()
{
// Store term data for all terms searching in this field
- _queryTermFieldMatch.reserve(splitter.getNumTerms());
- _cachedHits.reserve(splitter.getNumTerms());
- for (uint32_t i = 0; i < splitter.getNumTerms(); ++i) {
- QueryTerm qt = QueryTermFactory::create(splitter, i, true);
+ const auto& splitter_query_env = splitter.get_phrase_splitter_query_env();
+ _queryTermFieldMatch.reserve(splitter_query_env.getNumTerms());
+ _cachedHits.reserve(splitter_query_env.getNumTerms());
+ for (uint32_t i = 0; i < splitter_query_env.getNumTerms(); ++i) {
+ QueryTerm qt = QueryTermFactory::create(splitter_query_env, i, true);
_totalTermWeight += qt.termData()->getWeight().percent();
_totalTermSignificance += qt.significance();
_simpleMetrics.addQueryTerm(qt.termData()->getWeight().percent());
@@ -52,11 +54,11 @@ Computer::Computer(const vespalib::string &propertyNamespace, const PhraseSplitt
}
}
- _totalTermWeight = atoi(splitter.getProperties().lookup(propertyNamespace, "totalTermWeight").
+ _totalTermWeight = atoi(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").
get(vespalib::make_string("%d", _totalTermWeight)).c_str());
- _totalTermSignificance = vespalib::locale::c::atof(splitter.getProperties().lookup(propertyNamespace, "totalTermSignificance").
+ _totalTermSignificance = vespalib::locale::c::atof(splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermSignificance").
get(vespalib::make_string("%f", _totalTermSignificance)).c_str());
- if (splitter.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) {
+ if (splitter_query_env.getProperties().lookup(propertyNamespace, "totalTermWeight").found()) {
_simpleMetrics.setTotalWeightInQuery(_totalTermWeight);
}
@@ -139,7 +141,7 @@ Computer::handleError(uint32_t fieldPos, uint32_t docId) const
static int errcnt;
if (errcnt < 1000) {
errcnt++;
- const FieldInfo * finfo = _splitter.getIndexEnvironment().getField(getFieldId());
+ const FieldInfo * finfo = _splitter.get_phrase_splitter_query_env().getIndexEnvironment().getField(getFieldId());
LOG(debug, "Bad field position %u >= fieldLength %u for field '%s' document %u. "
"Document was probably refed during query (Ticket 7104969)",
fieldPos, _fieldLength,
diff --git a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
index 94240422106..de79280517d 100644
--- a/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
+++ b/searchlib/src/vespa/searchlib/features/fieldmatchfeature.cpp
@@ -5,6 +5,7 @@
#include <vespa/searchlib/features/fieldmatch/computer.h>
#include <vespa/searchlib/fef/featurenamebuilder.h>
#include <vespa/searchlib/fef/indexproperties.h>
+#include <vespa/searchlib/fef/phrase_splitter_query_env.h>
#include <vespa/searchlib/fef/properties.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/locale/c.h>
@@ -20,6 +21,7 @@ namespace search::features {
*/
class FieldMatchExecutor : public fef::FeatureExecutor {
private:
+ PhraseSplitterQueryEnv _splitter_env;
fef::PhraseSplitter _splitter;
const fef::FieldInfo & _field;
fieldmatch::Computer _cmp;
@@ -37,7 +39,8 @@ FieldMatchExecutor::FieldMatchExecutor(const IQueryEnvironment & queryEnv,
const FieldInfo & field,
[[maybe_unused]] const fieldmatch::Params & params) :
FeatureExecutor(),
- _splitter(queryEnv, field.id()),
+ _splitter_env(queryEnv, field.id()),
+ _splitter(_splitter_env),
_field(field),
_cmp(vespalib::make_string("fieldMatch(%s)", _field.name().c_str()),
_splitter, field, params)
diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp
index b474317a33b..593b8fb29ce 100644
--- a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp
+++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.cpp
@@ -31,12 +31,13 @@ PhraseSplitterQueryEnv::considerTerm(uint32_t termIdx, const ITermData &term, st
_termIdxMap.push_back(TermIdx(termIdx, false));
}
-PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId) :
- _queryEnv(queryEnv),
- _terms(),
- _termIdxMap(),
- _maxHandle(0),
- _skipHandles(0)
+PhraseSplitterQueryEnv::PhraseSplitterQueryEnv(const IQueryEnvironment & queryEnv, uint32_t fieldId)
+ : _queryEnv(queryEnv),
+ _terms(),
+ _termIdxMap(),
+ _maxHandle(0),
+ _skipHandles(0),
+ _field_id(fieldId)
{
TermFieldHandle numHandles = 0; // how many handles existed in underlying data
std::vector<PhraseTerm> phraseTerms; // data about original phrase terms
diff --git a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h
index 19a4cb206b9..5a9c85b79db 100644
--- a/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h
+++ b/searchlib/src/vespa/searchlib/fef/phrase_splitter_query_env.h
@@ -24,7 +24,7 @@ namespace search::fef {
**/
class PhraseSplitterQueryEnv : public IQueryEnvironment
{
-protected:
+private:
struct TermIdx {
uint32_t idx; // index into either query environment or vector of TermData objects
bool splitted; // whether this term has been splitted or not
@@ -36,18 +36,20 @@ protected:
TermFieldHandle orig_handle;
PhraseTerm(const ITermData & t, uint32_t i, uint32_t h) : term(t), idx(i), orig_handle(h) {}
};
+public:
struct HowToCopy {
TermFieldHandle orig_handle;
TermFieldHandle split_handle;
uint32_t offsetInPhrase;
};
-
+private:
const IQueryEnvironment &_queryEnv;
std::vector<SimpleTermData> _terms; // splitted terms
std::vector<HowToCopy> _copyInfo;
std::vector<TermIdx> _termIdxMap; // renumbering of terms
TermFieldHandle _maxHandle; // the largest among original term field handles
TermFieldHandle _skipHandles; // how many handles to skip
+ uint32_t _field_id;
void considerTerm(uint32_t termIdx, const ITermData &term, std::vector<PhraseTerm> &phraseTerms, uint32_t fieldId);
@@ -79,6 +81,12 @@ public:
const attribute::IAttributeContext & getAttributeContext() const override { return _queryEnv.getAttributeContext(); }
double get_average_field_length(const vespalib::string &field_name) const override { return _queryEnv.get_average_field_length(field_name); }
const IIndexEnvironment & getIndexEnvironment() const override { return _queryEnv.getIndexEnvironment(); }
+
+ // Accessor methods used by PhraseSplitter
+ TermFieldHandle get_skip_handles() const { return _skipHandles; }
+ uint32_t get_num_phrase_split_terms() const { return _terms.size(); }
+ uint32_t get_field_id() const { return _field_id; }
+ const std::vector<HowToCopy>& get_copy_info() const { return _copyInfo; }
};
diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
index a0d53b12fe5..b80a9c9e085 100644
--- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
+++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.cpp
@@ -1,18 +1,19 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "phrasesplitter.h"
+#include "phrase_splitter_query_env.h"
namespace search::fef {
-PhraseSplitter::PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId)
- : PhraseSplitterQueryEnv(queryEnv, fieldId),
+PhraseSplitter::PhraseSplitter(const PhraseSplitterQueryEnv& phrase_splitter_query_env)
+ : _phrase_splitter_query_env(phrase_splitter_query_env),
+ _skipHandles(_phrase_splitter_query_env.get_skip_handles()),
_matchData(nullptr),
- _termMatches()
+ _termMatches(_phrase_splitter_query_env.get_num_phrase_split_terms())
{
- _termMatches.reserve(_terms.size());
- for ([[maybe_unused]] auto & term : _terms) {
- _termMatches.emplace_back();
- _termMatches.back().setFieldId(fieldId);
+ uint32_t field_id = _phrase_splitter_query_env.get_field_id();
+ for (auto & term_match : _termMatches) {
+ term_match.setFieldId(field_id);
}
}
@@ -33,11 +34,11 @@ PhraseSplitter::copyTermFieldMatchData(TermFieldMatchData & dst, const TermField
void
PhraseSplitter::update()
{
- for (uint32_t i = 0; i < _copyInfo.size(); ++i) {
- const TermFieldMatchData *src = _matchData->resolveTermField(_copyInfo[i].orig_handle);
- TermFieldMatchData *dst = resolveSplittedTermField(_copyInfo[i].split_handle);
+ for (const auto &copy_info : _phrase_splitter_query_env.get_copy_info()) {
+ const TermFieldMatchData *src = _matchData->resolveTermField(copy_info.orig_handle);
+ TermFieldMatchData *dst = resolveSplittedTermField(copy_info.split_handle);
assert(src != nullptr && dst != nullptr);
- copyTermFieldMatchData(*dst, *src, _copyInfo[i].offsetInPhrase);
+ copyTermFieldMatchData(*dst, *src, copy_info.offsetInPhrase);
}
}
diff --git a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
index dc7954a7fcc..8cec1d2f266 100644
--- a/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
+++ b/searchlib/src/vespa/searchlib/fef/phrasesplitter.h
@@ -2,12 +2,13 @@
#pragma once
-#include "phrase_splitter_query_env.h"
#include "matchdata.h"
#include "termfieldmatchdata.h"
namespace search::fef {
+class PhraseSplitterQueryEnv;
+
/**
* This class is used to split all phrase terms in a query environment
* into separate terms. New TermData and TermFieldMatchData objects
@@ -23,8 +24,10 @@ namespace search::fef {
* Use this class if you want to handle a phrase term the same way as
* single terms.
**/
-class PhraseSplitter : public PhraseSplitterQueryEnv
+class PhraseSplitter
{
+ const PhraseSplitterQueryEnv& _phrase_splitter_query_env;
+ TermFieldHandle _skipHandles;
const MatchData *_matchData;
std::vector<TermFieldMatchData> _termMatches; // match objects associated with splitted terms
@@ -43,7 +46,7 @@ public:
* @param queryEnv the query environment to wrap.
* @param field the field where we need to split phrases
**/
- PhraseSplitter(const IQueryEnvironment & queryEnv, uint32_t fieldId);
+ PhraseSplitter(const PhraseSplitterQueryEnv &phrase_splitter_query_env);
~PhraseSplitter();
/**
@@ -72,6 +75,7 @@ public:
}
void bind_match_data(const fef::MatchData &md) { _matchData = &md; }
+ const PhraseSplitterQueryEnv& get_phrase_splitter_query_env() const { return _phrase_splitter_query_env; }
};
}