aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-09 07:35:14 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2024-01-10 08:12:55 +0000
commit02c5bce07737a899726097e577c6dd1121ca5a7c (patch)
treee6c73d2df7f9f2c55322330cbc4ba644a2bbb8e0
parent4388490c151581bc6e04059baa04b580c80577d3 (diff)
Simplify ancient carefully hand optimized code in favour of simple readable code
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp13
-rw-r--r--streamingvisitors/src/vespa/vsm/config/vsmfields.def1
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp11
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h8
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp6
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h7
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp10
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp119
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h80
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp15
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp61
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h35
12 files changed, 180 insertions, 186 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 64952dbe5b5..a691d7671f9 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -22,6 +22,7 @@ using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
using search::streaming::QueryTerm;
using search::streaming::Normalizing;
+using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
using search::streaming::QueryTermList;
using TermType = QueryTerm::Type;
using namespace vsm;
@@ -763,28 +764,32 @@ TEST("snippet modifier") {
}
}
-TEST("FieldSearchSpec constrution") {
+TEST("FieldSearchSpec construction") {
{
FieldSearchSpec f;
EXPECT_FALSE(f.valid());
EXPECT_EQUAL(0u, f.id());
EXPECT_EQUAL("", f.name());
EXPECT_EQUAL(0x100000u, f.maxLength());
+ EXPECT_EQUAL("", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode());
}
{
- FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+ FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789);
EXPECT_TRUE(f.valid());
EXPECT_EQUAL(7u, f.id());
EXPECT_EQUAL("f0", f.name());
EXPECT_EQUAL(789u, f.maxLength());
EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+ EXPECT_EQUAL("substring", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode());
}
}
TEST("snippet modifier manager") {
FieldSearchSpecMapT specMap;
- specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
- specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+ specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000);
+ specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000);
IndexFieldMapT indexMap;
indexMap["i0"].push_back(0);
indexMap["i1"].push_back(1);
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
index 442a044d38f..dac732013d2 100644
--- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -14,6 +14,7 @@ fieldspec[].name string
## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected.
fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8
fieldspec[].arg1 string default=""
+fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD
## Maximum number of chars to search per field.
fieldspec[].maxlength int default=1048576
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index 55d80413b8c..b9e1fe8f83c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -55,10 +55,8 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
_currentElementId(0),
_currentElementWeight(1),
_words(0),
- _badUtf8Count(0),
- _zeroCount(0)
+ _badUtf8Count(0)
{
- zeroStat();
}
FieldSearcher::~FieldSearcher() = default;
@@ -114,13 +112,6 @@ FieldSearcher::prepareFieldId()
}
void
-FieldSearcher::zeroStat()
-{
- _badUtf8Count = 0;
- _zeroCount = 0;
-}
-
-void
FieldSearcher::init()
{
for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index 663592ed6d3..75ace16328b 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -59,13 +59,13 @@ public:
bool exact() const { return _matchType == EXACT; }
bool cased() const { return _matchType == CASED; }
void setMatchType(MatchType mt) { _matchType = mt; }
+ MatchType match_type() const noexcept { return _matchType; }
static void init();
static search::byte fold(search::byte c) { return _foldLowCase[c]; }
static search::byte iswordchar(search::byte c) { return _wordChar[c]; }
static search::byte isspace(search::byte c) { return ! iswordchar(c); }
static size_t countWords(const FieldRef & f);
int32_t getCurrentWeight() const { return _currentElementWeight; }
- void zeroStat();
FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
size_t maxFieldLength() const { return _maxFieldLength; }
@@ -96,11 +96,9 @@ private:
int32_t _currentElementWeight; // Contains the weight of the current item being evaluated.
protected:
/// Number of terms searched.
- unsigned _words;
+ unsigned _words;
/// Number of utf8 bytes by utf8 size.
- unsigned _badUtf8Count;
- unsigned _zeroCount;
-protected:
+ unsigned _badUtf8Count;
/**
* Adds a hit to the given query term.
* For each call to onValue() a batch of words are processed, and the position is local to this batch.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
index 76fedbd1166..816317bf86d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
}
+// Maps a distance metric name to a DistanceMetric. The name is lowercased before it is
+// handed to DistanceMetricUtils::to_distance_metric(); if that throws
+// vespalib::IllegalStateException an issue is reported and Euclidean is returned.
DistanceMetric
-NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value)
{
// Valid string values must match the definition of DistanceMetric in
// config-model/src/main/java/com/yahoo/schema/document/Attribute.java
- auto v = value;
+ vespalib::string v = value;
std::transform(v.begin(), v.end(), v.begin(),
[](unsigned char c) { return std::tolower(c); });
try {
return DistanceMetricUtils::to_distance_metric(v);
} catch (vespalib::IllegalStateException&) {
- vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+ // Report the name exactly as it was given (as before this change), not the lowercased
+ // lookup key. The temporary string only exists to obtain a C string for the '%s' argument.
+ vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", vespalib::string(value).c_str());
return DistanceMetric::Euclidean;
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
index d08c2fbbc83..ecdc64d1336 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -11,10 +11,7 @@
#include <vespa/searchlib/tensor/tensor_ext_attribute.h>
namespace search::fef { class IQueryEnvironment; }
-
-namespace search::tensor {
-class TensorExtAttribute;
-}
+namespace search::tensor { class TensorExtAttribute; }
namespace vsm {
@@ -52,7 +49,7 @@ public:
search::fef::IQueryEnvironment& query_env) override;
void onValue(const document::FieldValue& fv) override;
- static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+ static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value);
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 00828bcc7b1..fa1fc83728c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -18,17 +18,15 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- const byte * e = n + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 1148083b042..ce63f55ea63 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
#include <cassert>
using search::streaming::QueryTerm;
@@ -10,107 +9,36 @@ using search::byte;
namespace vsm {
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
- if (maxSz > 0) {
- maxSz--;
- }
- ucs4_t c(*p);
- ucs4_t *q(dstbuf);
- const byte * end(p+maxSz);
-
- // Skip non-word characters between words
- for (; p < end; ) {
- if (c < 128) {
- if (!c) { break; }
- p++;
- if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = 0;
- } else {
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
- break;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- c = *p;
- }
- }
- }
-
- c = *p; // Next char
- for (; p < end;) {
- if (c < 128) { // Common case, ASCII
- if (!c) { break; }
- p++;
- if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- c = 0;
- } else {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
-
- c = *p;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- break;
- }
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
+ ucs4_t c(0);
+ Normalizing norm_mode = normalize_mode();
+ while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
+
+ if (Fast_UnicodeUtil::IsWordChar(c)) {
+ reader.normalize(c, norm_mode);
+ while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
+ reader.normalize(c, norm_mode);
}
}
- *q = 0;
- tokenlen = q - dstbuf;
- return p;
}
size_t
UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
{
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- // __builtin_prefetch(n, 0, 0);
const cmptype_t * term;
termsize_t tsz = qt.term(term);
- const byte * e = n + f.size();
if ( f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
+ cmptype_t * fn = _buf->data();
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
const cmptype_t *tt=term, *et=term+tsz;
for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -185,22 +113,17 @@ size_t
UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
{
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
const cmptype_t * term;
termsize_t tsz = qt.term(term);
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
+ cmptype_t * dstbuf = _buf->data();
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
addHit(qt, words);
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 1362b3c4f1d..ed76fb79f4e 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,6 +2,7 @@
#pragma once
#include "strchrfieldsearcher.h"
+#include <vespa/fastlib/text/normwordfolder.h>
namespace vsm {
@@ -34,9 +35,9 @@ public:
void onOffset(size_t) { }
void incBuf(size_t inc) { _cbuf += inc; }
ucs4_t * getBuf() { return _cbuf; }
- bool valid() { return true; }
- size_t size() { return (_cbuf - _bbuf); }
- bool hasOffsets() { return false; }
+ bool valid() const noexcept { return true; }
+ size_t size() const noexcept { return (_cbuf - _bbuf); }
+ bool hasOffsets() const noexcept { return false; }
};
/**
@@ -53,14 +54,81 @@ public:
explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
void onOffset(size_t of) { *_coff++ = of; }
- bool valid() { return (size() == (size_t)(_coff - _boff)); }
- bool hasOffsets() { return true; }
+ bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); }
+ bool hasOffsets() const noexcept { return true; }
};
protected:
SharedSearcherBuf _buf;
- const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+ using byte = search::byte;
+ using Normalizing = search::streaming::Normalizing;
+
+    /**
+     * Reads UTF-8 input one code point at a time and collects the normalized
+     * code points of the current token in a caller supplied output buffer.
+     * Neither buffer is owned by the reader.
+     * NOTE(review): writes through the output pointer are not bounds checked here; the
+     * callers in this change reserve f.size() + 1 elements first - confirm that suffices.
+     */
+    class TokenizeReader {
+    public:
+        /// @param p   start of the UTF-8 input
+        /// @param len number of input bytes (size_t, so that f.size() is not narrowed)
+        /// @param q   start of the output buffer for the normalized token
+        TokenizeReader(const byte *p, size_t len, ucs4_t *q) noexcept
+            : _p(p),
+              _p_end(p + len),
+              _q(q),
+              _q_start(q)
+        {}
+        /// Returns the next code point as decoded by Fast_UnicodeUtil::GetUTF8Char
+        /// (which is presumably what advances the input position).
+        ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
+        /// Appends c to the current token after applying the given normalization.
+        void normalize(ucs4_t c, Normalizing normalize_mode) {
+            switch (normalize_mode) {
+                case Normalizing::NONE:
+                    *_q++ = c;
+                    break;
+                case Normalizing::LOWERCASE:
+                    lowercase(c);
+                    break;
+                case Normalizing::LOWERCASE_AND_FOLD:
+                    fold(c);
+                    break;
+            }
+        }
+        /// True while there are unread input bytes.
+        bool hasNext() const noexcept { return _p < _p_end; }
+        /// Current input position.
+        const byte * p() const noexcept { return _p; }
+        /// Terminates the current token with 0, rewinds the output position to the start
+        /// of the buffer and returns the token length (terminator not counted).
+        size_t complete() noexcept {
+            *_q = 0;
+            const size_t token_len = _q - _q_start;
+            _q = _q_start;
+            return token_len;
+        }
+    private:
+        /// Appends the replacement string for c when one exists (an empty replacement
+        /// appends nothing); otherwise appends the lowercased and folded code point.
+        void fold(ucs4_t c) {
+            const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
+            if (repl == nullptr) {
+                lowercase(c);
+            } else if (*repl != '\0') {
+                _q = Fast_UnicodeUtil::ucs4copy(_q, repl);
+            }
+        }
+        /// Appends Fast_NormalizeWordFolder::lowercase_and_fold(c) to the current token.
+        /// NOTE(review): going by its name this also folds, so LOWERCASE is not a pure
+        /// lowercase - confirm this is intended and matches the query side.
+        void lowercase(ucs4_t c) {
+            *_q++ = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+        }
+        const byte *_p;        // next input byte to read
+        const byte *_p_end;    // one past the last input byte
+        ucs4_t     *_q;        // next output position
+        ucs4_t     *_q_start;  // start of the output buffer
+    };
+
+
+ template<typename Reader>
+ void tokenize(Reader & reader);
+
+    /// Normalization applied to field content, derived from the searcher's match type:
+    /// EXACT lowercases, CASED leaves the text as is, everything else lowercases and folds.
+    Normalizing normalize_mode() const noexcept {
+        const MatchType mt = match_type();
+        if (mt == EXACT) {
+            return Normalizing::LOWERCASE;
+        }
+        if (mt == CASED) {
+            return Normalizing::NONE;
+        }
+        return Normalizing::LOWERCASE_AND_FOLD;
+    }
/**
* Matches the given query term against the words in the given field reference
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index e28ce114225..4318d5fe1a3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const
}
size_t
-UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 4b0efd58a56..22934ba74d2 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -40,6 +40,8 @@ setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
searcher->setMatchType(FieldSearcher::EXACT);
} else if (arg1 == "word") {
searcher->setMatchType(FieldSearcher::EXACT);
+ } else if (arg1 == "cased") {
+ searcher->setMatchType(FieldSearcher::CASED);
}
}
@@ -51,6 +53,7 @@ FieldSearchSpec::FieldSearchSpec()
_maxLength(0x100000),
_searcher(),
_searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+ _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
_arg1(),
_reconfigured(false)
{
@@ -60,15 +63,15 @@ FieldSearchSpec::~FieldSearchSpec() = default;
FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
-FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
- VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
- const vespalib::string & arg1, size_t maxLength_) :
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef,
+ Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) :
_id(fid),
_name(fname),
- _maxLength(maxLength_),
+ _maxLength(maxLength_in),
_searcher(),
_searchMethod(searchDef),
- _arg1(arg1),
+ _normalize_mode(normalize_mode),
+ _arg1(arg1_in),
_reconfigured(false)
{
switch(searchDef) {
@@ -79,14 +82,16 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
case VsmfieldsConfig::Fieldspec::Searchmethod::NONE:
case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8:
case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8:
- if (arg1 == "substring") {
+ if (_arg1 == "substring") {
_searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
- } else if (arg1 == "suffix") {
+ } else if (_arg1 == "suffix") {
_searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
- } else if (arg1 == "exact") {
+ } else if (_arg1 == "exact") {
_searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
- } else if (arg1 == "word") {
+ } else if (_arg1 == "word") {
_searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
+ } else if (_arg1 == "cased") {
+ _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
} else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) {
_searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
} else {
@@ -112,12 +117,12 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
_searcher = std::make_unique<GeoPosFieldSearcher>(fid);
break;
case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR:
- auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1);
+ auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1);
_searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm);
break;
}
if (_searcher) {
- setMatchType(_searcher, arg1);
+ setMatchType(_searcher, _arg1);
_searcher->maxFieldLength(maxLength());
}
}
@@ -166,20 +171,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default;
FieldSearchSpecMap::~FieldSearchSpecMap() = default;
namespace {
- const std::string _G_empty("");
- const std::string _G_value(".value");
- const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}");
- const std::regex _G_map2("\\{\".*\"\\}");
- const std::regex _G_array("\\[[0-9]+\\]");
+ const std::string G_empty;
+ const std::string G_value(".value");
+ const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+ const std::regex G_map2("\\{\".*\"\\}");
+ const std::regex G_array("\\[[0-9]+\\]");
}
vespalib::string
FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex)
{
if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) {
- std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value);
- index = std::regex_replace(index, _G_map2, _G_value);
- index = std::regex_replace(index, _G_array, _G_empty);
+ std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+ index = std::regex_replace(index, G_map2, G_value);
+ index = std::regex_replace(index, G_array, G_empty);
return index;
}
return rawIndex;
@@ -258,17 +263,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
return ifm;
}
+// Translates the normalize setting of a field spec in the vsmfields config into the
+// corresponding streaming search normalizing mode.
+search::streaming::Normalizing
+normalize_mode(VsmfieldsConfig::Fieldspec::Normalize configured) {
+    using Normalize = VsmfieldsConfig::Fieldspec::Normalize;
+    using search::streaming::Normalizing;
+    switch (configured) {
+        case Normalize::NONE:
+            return Normalizing::NONE;
+        case Normalize::LOWERCASE:
+            return Normalizing::LOWERCASE;
+        case Normalize::LOWERCASE_AND_FOLD:
+            break;
+    }
+    // Also reached for any value that is not one of the enumerators above.
+    return Normalizing::LOWERCASE_AND_FOLD;
}
-bool
+}
+
+void
FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
{
- bool retval(true);
LOG(spam, "Parsing %zd fields", conf->fieldspec.size());
for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
LOG(spam, "Parsing %s", cfs.name.c_str());
FieldIdT fieldId = specMap().size();
- FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+ FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
_specMap[fieldId] = std::move(fss);
_nameIdMap.add(cfs.name, fieldId);
LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
@@ -283,7 +297,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
}
_documentTypeMap[di.name] = indexMapp;
}
- return retval;
}
void
@@ -338,7 +351,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const
if (!itr->second.uses_nearest_neighbor_search_method()) {
return dm;
}
- return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1());
+ return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1());
}
vespalib::asciistream &
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index 43bb5b04481..7ba9799991e 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -10,25 +10,29 @@ namespace vsm {
class FieldSearchSpec
{
public:
+ using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+ using Normalizing = search::streaming::Normalizing;
FieldSearchSpec();
- FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
- VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
- const vespalib::string & arg1, size_t maxLength);
+ FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod,
+ Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength);
~FieldSearchSpec();
FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
- const FieldSearcher & searcher() const { return *_searcher; }
- const vespalib::string & name() const { return _name; }
- FieldIdT id() const { return _id; }
- bool valid() const { return static_cast<bool>(_searcher); }
- size_t maxLength() const { return _maxLength; }
- bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; }
+ const FieldSearcher & searcher() const noexcept { return *_searcher; }
+ const vespalib::string & name() const noexcept { return _name; }
+ FieldIdT id() const noexcept { return _id; }
+ bool valid() const noexcept { return static_cast<bool>(_searcher); }
+ size_t maxLength() const noexcept { return _maxLength; }
+ Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+ const vespalib::string& arg1() const noexcept { return _arg1; }
+ bool uses_nearest_neighbor_search_method() const noexcept {
+ return _searchMethod == Searchmethod::NEAREST_NEIGHBOR;
+ }
bool uses_string_search_method() const noexcept {
- return (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) ||
- (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) ||
- (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8);
+ return (_searchMethod == Searchmethod::UTF8) ||
+ (_searchMethod == Searchmethod::AUTOUTF8) ||
+ (_searchMethod == Searchmethod::SSE2UTF8);
}
- const vespalib::string& get_arg1() const noexcept { return _arg1; }
/**
* Reconfigures the field searcher based on information in the given query term.
@@ -42,7 +46,8 @@ private:
vespalib::string _name;
size_t _maxLength;
FieldSearcherContainer _searcher;
- VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+ Searchmethod _searchMethod;
+ Normalizing _normalize_mode;
vespalib::string _arg1;
bool _reconfigured;
};
@@ -60,7 +65,7 @@ public:
* and a mapping from field name to field id. It then iterates over all document types and index names
* and creates a mapping from index name to list of field ids for each document type.
**/
- bool buildFromConfig(const VsmfieldsHandle & conf);
+ void buildFromConfig(const VsmfieldsHandle & conf);
/**
* Iterates over the given field name vector adding extra elements to the mapping from field name to field id.