Simplify ancient, carefully hand-optimized code in favour of simple, readable code

author: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-09 07:35:14 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-10 08:12:55 +0000
commit: 02c5bce07737a899726097e577c6dd1121ca5a7c (patch)
tree: e6c73d2df7f9f2c55322330cbc4ba644a2bbb8e0
parent: 4388490c151581bc6e04059baa04b580c80577d3 (diff)
12 files changed, 180 insertions, 186 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 64952dbe5b5..a691d7671f9 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -22,6 +22,7 @@ using search::streaming::HitList;
 using search::streaming::QueryNodeResultFactory;
 using search::streaming::QueryTerm;
 using search::streaming::Normalizing;
+using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
 using search::streaming::QueryTermList;
 using TermType = QueryTerm::Type;
 using namespace vsm;
@@ -763,28 +764,32 @@ TEST("snippet modifier") {
     }
 }
 
-TEST("FieldSearchSpec constrution") {
+TEST("FieldSearchSpec construction") {
     {
         FieldSearchSpec f;
         EXPECT_FALSE(f.valid());
         EXPECT_EQUAL(0u, f.id());
         EXPECT_EQUAL("", f.name());
         EXPECT_EQUAL(0x100000u, f.maxLength());
+        EXPECT_EQUAL("", f.arg1());
+        EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode());
     }
     {
-        FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+        FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789);
         EXPECT_TRUE(f.valid());
         EXPECT_EQUAL(7u, f.id());
         EXPECT_EQUAL("f0", f.name());
         EXPECT_EQUAL(789u, f.maxLength());
         EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+        EXPECT_EQUAL("substring", f.arg1());
+        EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode());
     }
 }
 
 TEST("snippet modifier manager") {
     FieldSearchSpecMapT specMap;
-    specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
-    specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+    specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000);
+    specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000);
     IndexFieldMapT indexMap;
     indexMap["i0"].push_back(0);
     indexMap["i1"].push_back(1);
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
index 442a044d38f..dac732013d2 100644
--- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -14,6 +14,7 @@ fieldspec[].name string
 ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected.
 fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8
 fieldspec[].arg1 string default=""
+fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD
 
 ## Maximum number of chars to search per field.
 fieldspec[].maxlength int default=1048576
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index 55d80413b8c..b9e1fe8f83c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -55,10 +55,8 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
       _currentElementId(0),
       _currentElementWeight(1),
       _words(0),
-      _badUtf8Count(0),
-      _zeroCount(0)
+      _badUtf8Count(0)
 {
-    zeroStat();
 }
 
 FieldSearcher::~FieldSearcher() = default;
@@ -114,13 +112,6 @@ FieldSearcher::prepareFieldId()
 }
 
 void
-FieldSearcher::zeroStat()
-{
-    _badUtf8Count = 0;
-    _zeroCount = 0;
-}
-
-void
 FieldSearcher::init()
 {
     for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index 663592ed6d3..75ace16328b 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -59,13 +59,13 @@ public:
     bool exact()                     const { return _matchType == EXACT; }
     bool cased()                     const { return _matchType == CASED; }
     void setMatchType(MatchType mt)        { _matchType = mt; }
+    MatchType match_type() const noexcept { return _matchType; }
     static void init();
     static search::byte fold(search::byte c)               { return _foldLowCase[c]; }
     static search::byte iswordchar(search::byte c)         { return _wordChar[c]; }
     static search::byte isspace(search::byte c)            { return ! iswordchar(c); }
     static size_t countWords(const FieldRef & f);
     int32_t getCurrentWeight()       const { return _currentElementWeight; }
-    void zeroStat();
     FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
     size_t maxFieldLength() const { return _maxFieldLength; }
 
@@ -96,11 +96,9 @@ private:
     int32_t       _currentElementWeight; // Contains the weight of the current item being evaluated.
 protected:
     /// Number of terms searched.
-    unsigned _words;
+    unsigned      _words;
     /// Number of utf8 bytes by utf8 size.
-    unsigned _badUtf8Count;
-    unsigned _zeroCount;
-protected:
+    unsigned      _badUtf8Count;
     /**
      * Adds a hit to the given query term.
      * For each call to onValue() a batch of words are processed, and the position is local to this batch.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
index 76fedbd1166..816317bf86d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
 }
 
 DistanceMetric
-NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value)
 {
     // Valid string values must match the definition of DistanceMetric in
     // config-model/src/main/java/com/yahoo/schema/document/Attribute.java
-    auto v = value;
+    vespalib::string v = value; // owning copy of the input: lowercased in place below and used for c_str()
     std::transform(v.begin(), v.end(), v.begin(),
                    [](unsigned char c) { return std::tolower(c); });
     try {
         return DistanceMetricUtils::to_distance_metric(v);
     } catch (vespalib::IllegalStateException&) {
-        vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+        vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str()); // NOTE(review): now reports the lowercased copy, not the caller's original spelling
         return DistanceMetric::Euclidean;
     }
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
index d08c2fbbc83..ecdc64d1336 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -11,10 +11,7 @@
 #include <vespa/searchlib/tensor/tensor_ext_attribute.h>
 
 namespace search::fef { class IQueryEnvironment; }
-
-namespace search::tensor {
-class TensorExtAttribute;
-}
+namespace search::tensor { class TensorExtAttribute; }
 
 namespace vsm {
 
@@ -52,7 +49,7 @@ public:
                  search::fef::IQueryEnvironment& query_env) override;
     void onValue(const document::FieldValue& fv) override;
 
-    static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+    static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value);
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 00828bcc7b1..fa1fc83728c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -18,17 +18,15 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
 {
     (void) mintsz;
     termcount_t words(0);
-    const byte * n = reinterpret_cast<const byte *> (f.data());
-    const byte * e = n + f.size();
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
     cmptype_t * fn = &(*_buf.get())[0];
-    size_t fl(0);
 
-    for( ; n < e; ) {
-        if (!*n) { _zeroCount++; n++; }
-        n = tokenize(n, _buf->capacity(), fn, fl);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t fl = reader.complete();
         for (auto qt : _qtl) {
             const cmptype_t * term;
             termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 1148083b042..ce63f55ea63 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
 #include <cassert>
 
 using search::streaming::QueryTerm;
@@ -10,107 +9,36 @@ using search::byte;
 
 namespace vsm {
 
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
-    if (maxSz > 0) {
-        maxSz--;
-    }
-    ucs4_t c(*p);
-    ucs4_t *q(dstbuf);
-    const byte * end(p+maxSz);
-
-    // Skip non-word characters between words
-    for (; p < end; ) {
-        if (c < 128) {
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
-                *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
-                c = 0;
-            } else {
-                c = *p;
-            }
-        } else {
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (Fast_UnicodeUtil::IsWordChar(c)) {
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != nullptr) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    *q++ = c;
-                }
-                break;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                }
-                c = *p;
-            }
-        }
-    }
-
-    c = *p;  // Next char
-    for (; p < end;) {
-        if (c < 128) {             // Common case, ASCII
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
-                c = 0;
-            } else {
-                *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
-                c = *p;
-            }
-        } else {
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != nullptr) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
-                    *q++ = c;
-                }
-
-                c = *p;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                }
-                break;
-            }
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) { // Extracts the next token from reader: skips non-word characters, then feeds each word character to reader.normalize().
+    ucs4_t c(0); // stays 0 if the reader has no input at all
+    Normalizing norm_mode = normalize_mode(); // derived from this searcher's match type
+    while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); // empty body: skip until a word character is read or the input ends
+
+    if (Fast_UnicodeUtil::IsWordChar(c)) { // the skip loop stopped on a word character, i.e. a token starts here
+        reader.normalize(c, norm_mode);
+        while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { // the non-word character that ends the token is read here and not emitted
+            reader.normalize(c, norm_mode);
         }
     }
-    *q = 0;
-    tokenlen = q - dstbuf;
-    return p;
 }
 
 size_t
 UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words(0);
-    const byte * n = reinterpret_cast<const byte *> (f.data());
-    // __builtin_prefetch(n, 0, 0);
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
-    const byte * e = n + f.size();
     if ( f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * fn = &(*_buf.get())[0];
-    size_t fl(0);
+    cmptype_t * fn = _buf->data();
 
-    for( ; n < e; ) {
-        if (!*n) { _zeroCount++; n++; }
-        n = tokenize(n, _buf->capacity(), fn, fl);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t fl = reader.complete();
         if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
             const cmptype_t *tt=term, *et=term+tsz;
             for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -185,22 +113,17 @@ size_t
 UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words = 0;
-    const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
-    const byte * srcend = srcbuf + f.size();
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * dstbuf = &(*_buf.get())[0];
-    size_t tokenlen = 0;
+    cmptype_t * dstbuf = _buf->data();
 
-    for( ; srcbuf < srcend; ) {
-        if (*srcbuf == 0) {
-            ++_zeroCount;
-            ++srcbuf;
-        }
-        srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t tokenlen = reader.complete();
         if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
             addHit(qt, words);
         }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 1362b3c4f1d..ed76fb79f4e 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,6 +2,7 @@
 #pragma once
 
 #include "strchrfieldsearcher.h"
+#include <vespa/fastlib/text/normwordfolder.h>
 
 namespace vsm {
 
@@ -34,9 +35,9 @@ public:
         void onOffset(size_t) { }
         void incBuf(size_t inc) { _cbuf += inc; }
         ucs4_t * getBuf() { return _cbuf; }
-        bool valid() { return true; }
-        size_t size() { return (_cbuf - _bbuf); }
-        bool hasOffsets() { return false; }
+        bool valid() const noexcept { return true; }
+        size_t size() const noexcept { return (_cbuf - _bbuf); }
+        bool hasOffsets() const noexcept { return false; }
     };
 
     /**
@@ -53,14 +54,81 @@ public:
         explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
         void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
         void onOffset(size_t of) { *_coff++ = of; }
-        bool valid() { return (size() == (size_t)(_coff - _boff)); }
-        bool hasOffsets() { return true; }
+        bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); }
+        bool hasOffsets() const noexcept { return true; }
     };
 
 protected:
     SharedSearcherBuf _buf;
 
-    const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+    using byte = search::byte;
+    using Normalizing = search::streaming::Normalizing;
+
+    class TokenizeReader { // Reads characters from the byte range [p, p + len) and writes normalized ucs4 characters of the current token to q.
+    public:
+        TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept // p/len: input bytes, q: destination buffer for one token at a time
+            : _p(p),
+              _p_end(p + len),
+              _q(q),
+              _q_start(q)
+        {}
+        ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } // presumably decodes one UTF-8 character and advances _p - confirm against Fast_UnicodeUtil
+        void normalize(ucs4_t c, Normalizing normalize_mode) { // Appends c to the token buffer, transformed according to normalize_mode.
+            switch (normalize_mode) {
+                case Normalizing::LOWERCASE:
+                    c = Fast_NormalizeWordFolder::lowercase_and_fold(c); // NOTE(review): LOWERCASE mode calls lowercase_and_fold - confirm that folding is intended here
+                    [[fallthrough]];
+                case Normalizing::NONE:
+                    *_q++ = c; // no bounds check: the caller must supply a large enough destination buffer
+                    break;
+                case Normalizing::LOWERCASE_AND_FOLD:
+                    fold(c);
+                    break;
+            }
+        }
+        bool hasNext() const noexcept { return _p < _p_end; } // true while unread input bytes remain
+        const byte * p() const noexcept { return _p; } // current read position in the input
+        size_t complete() noexcept { // Terminates the current token, returns its length and rewinds the write pointer for the next token.
+            *_q = 0; // zero-terminate the token in the destination buffer
+            size_t token_len = _q - _q_start;
+            _q = _q_start; // the next token overwrites this one from the start of the buffer
+            return token_len;
+        }
+    private:
+        void fold(ucs4_t c) { // Appends the replacement string for c if one exists, otherwise the lowercase_and_fold() of c.
+            const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
+            if (repl != nullptr) {
+                size_t repllen = strlen(repl);
+                if (repllen > 0) { // an empty replacement string emits nothing for c
+                    _q = Fast_UnicodeUtil::ucs4copy(_q,repl); // presumably returns the write position after the copied characters - confirm
+                }
+            } else {
+                c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+                *_q++ = c;
+            }
+        }
+        void lowercase(ucs4_t c) { // NOTE(review): no caller is visible in this class - possibly unused
+            c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+            *_q++ = c;
+        }
+        const byte *_p; // next input byte to read
+        const byte *_p_end; // one past the last input byte
+        ucs4_t     *_q; // next write position in the token buffer
+        ucs4_t     *_q_start; // start of the token buffer
+    };
+
+
+    template<typename Reader>
+    void tokenize(Reader & reader);
+
+    Normalizing normalize_mode() const noexcept { // Maps this searcher's match type to the normalization applied while tokenizing.
+        switch (match_type()) {
+            case EXACT: return Normalizing::LOWERCASE;
+            case CASED: return Normalizing::NONE;
+            default: return Normalizing::LOWERCASE_AND_FOLD; // every other match type
+        }
+        return Normalizing::LOWERCASE_AND_FOLD; // not reached: every branch of the switch above returns
+    }
 
     /**
      * Matches the given query term against the words in the given field reference
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index e28ce114225..4318d5fe1a3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const
 }
 
 size_t
-UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
 {
     (void) mintsz;
     termcount_t words = 0;
-    const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
-    const byte * srcend = srcbuf + f.size();
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
     cmptype_t * dstbuf = &(*_buf.get())[0];
-    size_t tokenlen = 0;
 
-    for( ; srcbuf < srcend; ) {
-        if (*srcbuf == 0) {
-            ++_zeroCount;
-            ++srcbuf;
-        }
-        srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t tokenlen = reader.complete();
         for (auto qt : _qtl) {
             const cmptype_t * term;
             termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 4b0efd58a56..22934ba74d2 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -40,6 +40,8 @@ setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
         searcher->setMatchType(FieldSearcher::EXACT);
     } else if (arg1 == "word") {
         searcher->setMatchType(FieldSearcher::EXACT);
+    } else if (arg1 == "cased") {
+        searcher->setMatchType(FieldSearcher::CASED);
     }
 }
 
@@ -51,6 +53,7 @@ FieldSearchSpec::FieldSearchSpec()
       _maxLength(0x100000),
       _searcher(),
       _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+      _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
       _arg1(),
       _reconfigured(false)
 {
@@ -60,15 +63,15 @@ FieldSearchSpec::~FieldSearchSpec() = default;
 FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
 FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
 
-FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
-                                 VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
-                                 const vespalib::string & arg1, size_t maxLength_) :
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef,
+                                 Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) :
     _id(fid),
     _name(fname),
-    _maxLength(maxLength_),
+    _maxLength(maxLength_in),
     _searcher(),
     _searchMethod(searchDef),
-    _arg1(arg1),
+    _normalize_mode(normalize_mode),
+    _arg1(arg1_in),
     _reconfigured(false)
 {
     switch(searchDef) {
@@ -79,14 +82,16 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
     case VsmfieldsConfig::Fieldspec::Searchmethod::NONE:
     case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8:
     case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8:
-        if (arg1 == "substring") {
+        if (_arg1 == "substring") {
             _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
-        } else if (arg1 == "suffix") {
+        } else if (_arg1 == "suffix") {
             _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
-        } else if (arg1 == "exact") {
+        } else if (_arg1 == "exact") {
             _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
-        } else if (arg1 == "word") {
+        } else if (_arg1 == "word") {
             _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
+        } else if (_arg1 == "cased") {
+            _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
         } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) {
             _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
         } else {
@@ -112,12 +117,12 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
         _searcher = std::make_unique<GeoPosFieldSearcher>(fid);
         break;
     case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR:
-        auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1);
+        auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1);
         _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm);
         break;
     }
     if (_searcher) {
-        setMatchType(_searcher, arg1);
+        setMatchType(_searcher, _arg1);
         _searcher->maxFieldLength(maxLength());
     }
 }
@@ -166,20 +171,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default;
 FieldSearchSpecMap::~FieldSearchSpecMap() = default;
 
 namespace {
-    const std::string _G_empty("");
-    const std::string _G_value(".value");
-    const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}");
-    const std::regex _G_map2("\\{\".*\"\\}");
-    const std::regex _G_array("\\[[0-9]+\\]");
+    const std::string G_empty;
+    const std::string G_value(".value");
+    const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+    const std::regex G_map2("\\{\".*\"\\}");
+    const std::regex G_array("\\[[0-9]+\\]");
 }
 
 vespalib::string
 FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex)
 {
     if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) {
-        std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value);
-        index = std::regex_replace(index, _G_map2, _G_value);
-        index = std::regex_replace(index, _G_array, _G_empty);
+        std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+        index = std::regex_replace(index, G_map2, G_value);
+        index = std::regex_replace(index, G_array, G_empty);
         return index;
     }
     return rawIndex;
@@ -258,17 +263,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
     return ifm;
 }
 
+search::streaming::Normalizing
+normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { // Translates the fieldspec[].normalize config enum into the streaming search Normalizing enum.
+    switch (normalize_mode) {
+        case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE;
+        case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE;
+        case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
+    }
+    return search::streaming::Normalizing::LOWERCASE_AND_FOLD; // fallback for a value outside the listed cases; same as the config default in vsmfields.def
 }
 
-bool
+}
+
+void
 FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
 {
-    bool retval(true);
     LOG(spam, "Parsing %zd fields", conf->fieldspec.size());
     for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
         LOG(spam, "Parsing %s", cfs.name.c_str());
         FieldIdT fieldId = specMap().size();
-        FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+        FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
         _specMap[fieldId] = std::move(fss);
         _nameIdMap.add(cfs.name, fieldId);
         LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
@@ -283,7 +297,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
         }
         _documentTypeMap[di.name] = indexMapp;
     }
-    return retval;
 }
 
 void
@@ -338,7 +351,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const
     if (!itr->second.uses_nearest_neighbor_search_method()) {
         return dm;
     }
-    return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1());
+    return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1());
 }
 
 vespalib::asciistream &
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index 43bb5b04481..7ba9799991e 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -10,25 +10,29 @@ namespace vsm {
 class FieldSearchSpec
 {
 public:
+    using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+    using Normalizing = search::streaming::Normalizing;
     FieldSearchSpec();
-    FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
-                    VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
-                    const vespalib::string & arg1, size_t maxLength);
+    FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod,
+                    Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength);
     ~FieldSearchSpec();
     FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
     FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
-    const FieldSearcher & searcher() const { return *_searcher; }
-    const vespalib::string &  name() const { return _name; }
-    FieldIdT                    id() const { return _id; }
-    bool                     valid() const { return static_cast<bool>(_searcher); }
-    size_t               maxLength() const { return _maxLength; }
-    bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; }
+    const FieldSearcher & searcher() const noexcept { return *_searcher; }
+    const vespalib::string &  name() const noexcept { return _name; }
+    FieldIdT                    id() const noexcept { return _id; }
+    bool                     valid() const noexcept { return static_cast<bool>(_searcher); }
+    size_t               maxLength() const noexcept { return _maxLength; }
+    Normalizing     normalize_mode() const noexcept { return _normalize_mode; }
+    const vespalib::string&   arg1() const noexcept { return _arg1; }
+    bool uses_nearest_neighbor_search_method() const noexcept {
+        return _searchMethod == Searchmethod::NEAREST_NEIGHBOR;
+    }
     bool uses_string_search_method() const noexcept {
-        return  (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) ||
-                (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) ||
-                (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8);
+        return  (_searchMethod == Searchmethod::UTF8) ||
+                (_searchMethod == Searchmethod::AUTOUTF8) ||
+                (_searchMethod == Searchmethod::SSE2UTF8);
     }
-    const vespalib::string& get_arg1() const noexcept { return _arg1; }
 
     /**
      * Reconfigures the field searcher based on information in the given query term.
@@ -42,7 +46,8 @@ private:
     vespalib::string       _name;
     size_t                 _maxLength;
     FieldSearcherContainer _searcher;
-    VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+    Searchmethod           _searchMethod;
+    Normalizing            _normalize_mode;
     vespalib::string       _arg1;
     bool                   _reconfigured;
 };
@@ -60,7 +65,7 @@ public:
      * and a mapping from field name to field id. It then iterates over all document types and index names
      * and creates a mapping from index name to list of field ids for each document type.
      **/
-    bool buildFromConfig(const VsmfieldsHandle & conf);
+    void buildFromConfig(const VsmfieldsHandle & conf);
 
     /**
      * Iterates over the given field name vector adding extra elements to the mapping from field name to field id.
author	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-09 07:35:14 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-10 08:12:55 +0000
commit	02c5bce07737a899726097e577c6dd1121ca5a7c (patch)
tree	e6c73d2df7f9f2c55322330cbc4ba644a2bbb8e0
parent	4388490c151581bc6e04059baa04b580c80577d3 (diff)