about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tor Egge <tegge@vespa.ai> 2024-03-27 12:55:10 +0100
committer GitHub <noreply@github.com> 2024-03-27 12:55:10 +0100
commit 680f29db27d18fa1c2c957b92fb8bb2c67509dc6 (patch)
tree 1397974e39e4af557aea6fb7cd37e5239c775d05
parent bf0889897ea22983396290d9ba55a6fdf207d821 (diff)
parent 796b4c88c5b990b9446e3166394d8248080bcb05 (diff)
Merge pull request #30743 from vespa-engine/toregge/move-field-searcher-tokenize-member-function-to-tokenize-reader
Move UTF8StringFieldSearcherBase tokenize member function to TokenizeReader
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp 33
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp 3
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp 21
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h 3
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp 3
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h 1
8 files changed, 47 insertions, 30 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
index d8a6091fe11..5988bdd912f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -4,6 +4,19 @@
namespace vsm {
+namespace {
+
+template <bool exact_match> inline bool is_word_char(ucs4_t c);
+
+template <>
+inline bool is_word_char<false>(ucs4_t c) { return Fast_UnicodeUtil::IsWordChar(c); }
+
+// All characters are treated as word characters for exact match
+template <>
+inline constexpr bool is_word_char<true>(ucs4_t) { return true; }
+
+}
+
void
TokenizeReader::fold(ucs4_t c) {
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
@@ -18,4 +31,24 @@ TokenizeReader::fold(ucs4_t c) {
}
}
+template <bool exact_match>
+size_t
+TokenizeReader::tokenize_helper(Normalizing norm_mode)
+{
+ ucs4_t c(0);
+ while (hasNext()) {
+ if (is_word_char<exact_match>(c = next())) {
+ normalize(c, norm_mode);
+ while (hasNext() && is_word_char<exact_match>(c = next())) {
+ normalize(c, norm_mode);
+ }
+ break;
+ }
+ }
+ return complete();
+}
+
+template size_t TokenizeReader::tokenize_helper<false>(Normalizing);
+template size_t TokenizeReader::tokenize_helper<true>(Normalizing);
+
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
index 2bb5e62e0aa..f680d9b6c47 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -43,6 +43,10 @@ public:
_q = _q_start;
return token_len;
}
+ template <bool exact_match>
+ size_t tokenize_helper(Normalizing norm_mode);
+ size_t tokenize(Normalizing norm_mode) { return tokenize_helper<false>(norm_mode); }
+ size_t tokenize_exact_match(Normalizing norm_mode) { return tokenize_helper<true>(norm_mode); }
private:
void fold(ucs4_t c);
const byte *_p;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 37dc4ffb99c..c860178d583 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -26,8 +26,7 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
while ( reader.hasNext() ) {
- tokenize(reader);
- size_t fl = reader.complete();
+ size_t fl = reader.tokenize(normalize_mode());
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 5036e9bedb1..f016d08ece8 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -10,21 +10,6 @@ using search::byte;
namespace vsm {
-template<typename Reader>
-void
-UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
- ucs4_t c(0);
- Normalizing norm_mode = normalize_mode();
- while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
-
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- reader.normalize(c, norm_mode);
- while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
- reader.normalize(c, norm_mode);
- }
- }
-}
-
size_t
UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
{
@@ -38,8 +23,7 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt
TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
while ( reader.hasNext() ) {
- tokenize(reader);
- size_t fl = reader.complete();
+ size_t fl = reader.tokenize(normalize_mode());
if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
const cmptype_t *tt=term, *et=term+tsz;
for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -127,8 +111,7 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
while ( reader.hasNext() ) {
- tokenize(reader);
- size_t tokenlen = reader.complete();
+ size_t tokenlen = reader.tokenize(normalize_mode());
if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
addHit(qt, words);
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index b196f2795a4..c217a7b8866 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -60,9 +60,6 @@ public:
protected:
SharedSearcherBuf _buf;
- template<typename Reader>
- void tokenize(Reader & reader);
-
/**
* Matches the given query term against the words in the given field reference
* using exact or prefix match strategy.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index 8bbacf168cf..d5bf4e4238a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -26,8 +26,7 @@ UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
while ( reader.hasNext() ) {
- tokenize(reader);
- size_t tokenlen = reader.complete();
+ size_t tokenlen = reader.tokenize(normalize_mode());
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 1ab1b16cb86..1986db79972 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -273,8 +273,11 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
return ifm;
}
+}
+
search::Normalizing
-normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) {
+FieldSearchSpecMap::convert_normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode)
+{
switch (normalize_mode) {
case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::Normalizing::NONE;
case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::Normalizing::LOWERCASE;
@@ -283,8 +286,6 @@ normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) {
return search::Normalizing::LOWERCASE_AND_FOLD;
}
-}
-
void
FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf, const search::fef::IIndexEnvironment& index_env)
{
@@ -292,7 +293,7 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf, const search::
for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
LOG(spam, "Parsing %s", cfs.name.c_str());
FieldIdT fieldId = specMap().size();
- FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
+ FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, convert_normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
_specMap[fieldId] = std::move(fss);
_nameIdMap.add(cfs.name, fieldId);
LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index 5b5a6b9a783..8bab0cad3b6 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -101,6 +101,7 @@ public:
static vespalib::string stripNonFields(vespalib::stringref rawIndex);
search::attribute::DistanceMetric get_distance_metric(const vespalib::string& name) const;
+ static search::Normalizing convert_normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode);
private:
FieldSearchSpecMapT _specMap; // mapping from field id to field search spec