Merge branch 'master' into balder/sliced-parallell-or

author: Henning Baldersheim <balder@yahoo-inc.com> 2024-01-15 10:23:18 +0100
committer: GitHub <noreply@github.com> 2024-01-15 10:23:18 +0100
commit: 29a807d35ac5d9e76ea1b8d653bb25b0e4e2dc73 (patch)
tree: d55fddad443566300bd4a7fdd3ef1118a8460700 /streamingvisitors
parent: 48b1bae2a6cdf58a237aa7be59632a06aba86861 (diff)
parent: 252fbeed13b8622fbc813620dc3b4e45abc6bbe2 (diff)
38 files changed, 629 insertions, 654 deletions
diff --git a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
index 2d138d1d336..93e35e4c6d2 100644
--- a/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
+++ b/streamingvisitors/src/tests/rank_processor/rank_processor_test.cpp
@@ -40,7 +40,7 @@ protected:
 
 RankProcessorTest::RankProcessorTest()
     : testing::Test(),
-      _factory(),
+      _factory(nullptr),
       _query(),
       _query_wrapper()
 {
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 4492dfac02b..7f89071868a 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -15,12 +15,15 @@
 #include <vespa/vsm/searcher/utf8substringsearcher.h>
 #include <vespa/vsm/searcher/utf8substringsnippetmodifier.h>
 #include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
 #include <vespa/vsm/vsm/snippetmodifier.h>
 
 using namespace document;
 using search::streaming::HitList;
 using search::streaming::QueryNodeResultFactory;
 using search::streaming::QueryTerm;
+using search::streaming::Normalizing;
+using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
 using search::streaming::QueryTermList;
 using TermType = QueryTerm::Type;
 using namespace vsm;
@@ -47,7 +50,7 @@ class String
 private:
     const std::string & _str;
 public:
-    String(const std::string & str) : _str(str) {}
+    explicit String(const std::string & str) : _str(str) {}
     bool operator==(const String & rhs) const {
         return _str == rhs._str;
     }
@@ -56,14 +59,14 @@ public:
 class Query
 {
 private:
-    void setupQuery(const StringList & terms) {
-        for (size_t i = 0; i < terms.size(); ++i) {
-            ParsedQueryTerm pqt = parseQueryTerm(terms[i]);
+    void setupQuery(const StringList & terms, Normalizing normalizing) {
+        for (const auto & term : terms) {
+            ParsedQueryTerm pqt = parseQueryTerm(term);
             ParsedTerm pt = parseTerm(pqt.second);
-            qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second));
+            qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing));
         }
-        for (size_t i = 0; i < qtv.size(); ++i) {
-            qtl.push_back(qtv[i].get());
+        for (const auto & i : qtv) {
+            qtl.push_back(i.get());
         }
     }
 public:
@@ -72,14 +75,16 @@ public:
     QueryNodeResultFactory   eqnr;
     std::vector<QueryTerm::UP> qtv;
     QueryTermList          qtl;
-    Query(const StringList & terms);
+
+    explicit Query(const StringList & terms) : Query(terms, Normalizing::LOWERCASE_AND_FOLD) {}
+    Query(const StringList & terms, Normalizing normalizing);
     ~Query();
     static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) {
         size_t i = queryTerm.find(':');
         if (i != std::string::npos) {
-            return ParsedQueryTerm(queryTerm.substr(0, i), queryTerm.substr(i + 1));
+            return {queryTerm.substr(0, i), queryTerm.substr(i + 1)};
         }
-        return ParsedQueryTerm(std::string(), queryTerm);
+        return {std::string(), queryTerm};
     }
     static ParsedTerm parseTerm(const std::string & term) {
         if (term[0] == '*' && term[term.size() - 1] == '*') {
@@ -94,8 +99,8 @@ public:
     }
 };
 
-Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() {
-    setupQuery(terms);
+Query::Query(const StringList & terms, Normalizing normalizing) : eqnr(), qtv(), qtl() {
+    setupQuery(terms, normalizing);
 }
 Query::~Query() = default;
 
@@ -111,7 +116,7 @@ struct SnippetModifierSetup
 
 SnippetModifierSetup::SnippetModifierSetup(const StringList & terms)
     : query(terms),
-      searcher(new UTF8SubstringSnippetModifier()),
+      searcher(new UTF8SubstringSnippetModifier(0)),
       env(),
       modifier(searcher)
 {
@@ -254,8 +259,8 @@ getFieldValue(const StringList & fv)
 
     static ArrayDataType type(*DataType::STRING);
     ArrayFieldValue afv(type);
-    for (size_t i = 0; i < fv.size(); ++i) {
-        afv.add(StringFieldValue(fv[i]));
+    for (const auto & v : fv) {
+        afv.add(StringFieldValue(v));
     }
     return afv;
 }
@@ -265,8 +270,8 @@ getFieldValue(const LongList & fv)
 {
     static ArrayDataType type(*DataType::LONG);
     ArrayFieldValue afv(type);
-    for (size_t i = 0; i < fv.size(); ++i) {
-        afv.add(LongFieldValue(fv[i]));
+    for (long v : fv) {
+        afv.add(LongFieldValue(v));
     }
     return afv;
 }
@@ -276,8 +281,8 @@ getFieldValue(const FloatList & fv)
 {
     static ArrayDataType type(*DataType::FLOAT);
     ArrayFieldValue afv(type);
-    for (size_t i = 0; i < fv.size(); ++i) {
-        afv.add(FloatFieldValue(fv[i]));
+    for (float v : fv) {
+        afv.add(FloatFieldValue(v));
     }
     return afv;
 }
@@ -286,8 +291,8 @@ bool
 assertMatchTermSuffix(const std::string & term, const std::string & word)
 {
     QueryNodeResultFactory eqnr;
-    QueryTerm qa(eqnr.create(), term, "index", TermType::WORD);
-    QueryTerm qb(eqnr.create(), word, "index", TermType::WORD);
+    QueryTerm qa(eqnr.create(), term, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
+    QueryTerm qb(eqnr.create(), word, "index", TermType::WORD, Normalizing::LOWERCASE_AND_FOLD);
     const ucs4_t * a;
     size_t alen = qa.term(a);
     const ucs4_t * b;
@@ -299,8 +304,8 @@ void
 assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const BoolList & exp)
 {
     HitsList hl;
-    for (size_t i = 0; i < exp.size(); ++i) {
-        hl.push_back(exp[i] ? Hits().add(0) : Hits());
+    for (bool v : exp) {
+        hl.push_back(v ? Hits().add(0) : Hits());
     }
     assertSearch(fs, query, fv, hl);
 }
@@ -308,7 +313,7 @@ assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & f
 std::vector<QueryTerm::UP>
 performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv)
 {
-    Query q(query);
+    Query q(query, fs.normalize_mode());
 
     // prepare field searcher
     test::MockFieldSearcherEnv env;
@@ -316,7 +321,7 @@ performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & f
 
     // setup document
     SharedFieldPathMap sfim(new FieldPathMapT());
-    sfim->push_back(FieldPath());
+    sfim->emplace_back();
     StorageDocument doc(std::make_unique<document::Document>(), sfim, 1);
     doc.setField(0, document::FieldValue::UP(fv.clone()));
 
@@ -342,7 +347,7 @@ assertSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv
 
 bool
 assertFieldInfo(FieldSearcher & fs, const StringList & query,
-                              const FieldValue & fv, const FieldInfoList & exp)
+                const FieldValue & fv, const FieldInfoList & exp)
 {
     auto qtv = performSearch(fs, query, fv);
     if (!EXPECT_EQUAL(qtv.size(), exp.size())) return false;
@@ -358,7 +363,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query,
 void
 assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp)
 {
-    UTF8SubstringSnippetModifier mod;
+    UTF8SubstringSnippetModifier mod(0);
     performSearch(mod, query, StringFieldValue(fv));
     EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size());
     std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos());
@@ -369,7 +374,7 @@ assertSnippetModifier(const StringList & query, const std::string & fv, const st
 void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp)
 {
     FieldValue::UP mfv = setup.modifier.modify(fv);
-    const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get());
+    const auto & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get());
     const std::string & actual = lfv.getValue();
     EXPECT_EQUAL(actual.size(), exp.size());
     EXPECT_EQUAL(actual, exp);
@@ -377,11 +382,11 @@ void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv,
 
 void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms)
 {
-    if (terms.size() == 0) {
-        ASSERT_TRUE(man.getModifiers().getModifier(fId) == NULL);
+    if (terms.empty()) {
+        ASSERT_TRUE(man.getModifiers().getModifier(fId) == nullptr);
         return;
     }
-    ASSERT_TRUE(man.getModifiers().getModifier(fId) != NULL);
+    ASSERT_TRUE(man.getModifiers().getModifier(fId) != nullptr);
     UTF8SubstringSnippetModifier * searcher =
         (static_cast<SnippetModifier *>(man.getModifiers().getModifier(fId)))->getSearcher().get();
     EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size());
@@ -437,11 +442,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
     assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits()));
     assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
 
-    fs.setMatchType(FieldSearcher::PREFIX);
+    fs.match_type(FieldSearcher::PREFIX);
     assertString(fs, "oper",  field, Hits().add(0).add(2));
     assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits()));
 
-    fs.setMatchType(FieldSearcher::REGULAR);
+    fs.match_type(FieldSearcher::REGULAR);
     if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false;
 
     { // test handling of several underscores
@@ -466,7 +471,7 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
     TEST("verify correct term parsing") {
         ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index");
         ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term");
-        ASSERT_TRUE(Query::parseQueryTerm("term").first == "");
+        ASSERT_TRUE(Query::parseQueryTerm("term").first.empty());
         ASSERT_TRUE(Query::parseQueryTerm("term").second == "term");
         ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr");
         ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM);
@@ -550,12 +555,12 @@ TEST("utf8 substring search with empty term")
 TEST("utf8 suffix search") {
     UTF8SuffixStringFieldSearcher fs(0);
     std::string field = "operators and operator overloading";
-    assertString(fs, "rsand", field, Hits());
-    assertString(fs, "tor",   field, Hits().add(2));
-    assertString(fs, "tors",  field, Hits().add(0));
+    TEST_DO(assertString(fs, "rsand", field, Hits()));
+    TEST_DO(assertString(fs, "tor",   field, Hits().add(2)));
+    TEST_DO(assertString(fs, "tors",  field, Hits().add(0)));
 
-    assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()));
-    assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
+    TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())));
+    TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))));
 
     EXPECT_TRUE(testStringFieldInfo(fs));
 }
@@ -587,22 +592,22 @@ TEST("utf8 flexible searcher"){
 
     // prefix
     assertString(fs, "vesp*",  "vespa", Hits().add(0));
-    fs.setMatchType(FieldSearcher::PREFIX);
+    fs.match_type(FieldSearcher::PREFIX);
     assertString(fs, "vesp",   "vespa", Hits().add(0));
 
     // substring
-    fs.setMatchType(FieldSearcher::REGULAR);
+    fs.match_type(FieldSearcher::REGULAR);
     assertString(fs, "*esp*",  "vespa", Hits().add(0));
-    fs.setMatchType(FieldSearcher::SUBSTRING);
+    fs.match_type(FieldSearcher::SUBSTRING);
     assertString(fs, "esp",  "vespa", Hits().add(0));
 
     // suffix
-    fs.setMatchType(FieldSearcher::REGULAR);
+    fs.match_type(FieldSearcher::REGULAR);
     assertString(fs, "*espa",  "vespa", Hits().add(0));
-    fs.setMatchType(FieldSearcher::SUFFIX);
+    fs.match_type(FieldSearcher::SUFFIX);
     assertString(fs, "espa",  "vespa", Hits().add(0));
 
-    fs.setMatchType(FieldSearcher::REGULAR);
+    fs.match_type(FieldSearcher::REGULAR);
     EXPECT_TRUE(testStringFieldInfo(fs));
 }
 
@@ -656,7 +661,7 @@ TEST("integer search")
 
 TEST("floating point search")
 {
-    FloatFieldSearcher fs;
+    FloatFieldSearcher fs(0);
     TEST_DO(assertFloat(fs,         "10",    10, true));
     TEST_DO(assertFloat(fs,       "10.5",  10.5, true));
     TEST_DO(assertFloat(fs,      "-10.5", -10.5, true));
@@ -723,7 +728,7 @@ TEST("Snippet modifier search") {
                                                       "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8");
 
     { // check that resizing works
-        UTF8SubstringSnippetModifier mod;
+        UTF8SubstringSnippetModifier mod(0);
         EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u);
         EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u);
         performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa"));
@@ -760,28 +765,32 @@ TEST("snippet modifier") {
     }
 }
 
-TEST("FieldSearchSpec constrution") {
+TEST("FieldSearchSpec construction") {
     {
         FieldSearchSpec f;
         EXPECT_FALSE(f.valid());
         EXPECT_EQUAL(0u, f.id());
         EXPECT_EQUAL("", f.name());
         EXPECT_EQUAL(0x100000u, f.maxLength());
+        EXPECT_EQUAL("", f.arg1());
+        EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode());
     }
     {
-        FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+        FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789);
         EXPECT_TRUE(f.valid());
         EXPECT_EQUAL(7u, f.id());
         EXPECT_EQUAL("f0", f.name());
         EXPECT_EQUAL(789u, f.maxLength());
         EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+        EXPECT_EQUAL("substring", f.arg1());
+        EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode());
     }
 }
 
 TEST("snippet modifier manager") {
     FieldSearchSpecMapT specMap;
-    specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
-    specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+    specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000);
+    specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000);
     IndexFieldMapT indexMap;
     indexMap["i0"].push_back(0);
     indexMap["i1"].push_back(1);
@@ -822,13 +831,13 @@ TEST("snippet modifier manager") {
         Query query(StringList().add("i2:foo").add("i2:*bar*"));
         man.setup(query.qtl, specMap, indexMap, *env.field_paths, env.query_env);
         {
-            SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0));
+            auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0));
             UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
             EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
             EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
         }
         {
-            SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1));
+            auto * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1));
             UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
             EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
             EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
@@ -863,4 +872,24 @@ TEST("counting of words") {
     assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
 }
 
+vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization";
+
+void
+verifyNormalization(Normalizing normalizing, size_t expected_len, const char * expected) {
+    // Run everything the reader yields from NormalizationInput through normalize(), then check the length and the text.
+    ucs4_t normalized[256];
+    const auto * input = reinterpret_cast<const search::byte *>(NormalizationInput.c_str());
+    TokenizeReader reader(input, NormalizationInput.size(), normalized);
+    for (; reader.hasNext(); ) { reader.normalize(reader.next(), normalizing); }
+    size_t len = reader.complete();
+    EXPECT_EQUAL(expected_len, len);
+    EXPECT_EQUAL(0,  Fast_UnicodeUtil::utf8cmp(expected, normalized));
+}
+
+TEST("test normalizing") {
+    verifyNormalization(Normalizing::NONE, 52, NormalizationInput.c_str()); // NONE: output is expected to equal the input
+    verifyNormalization(Normalizing::LOWERCASE, 52, "test that somehing happens with during nårmølization"); // only the case changes, length stays 52
+    verifyNormalization(Normalizing::LOWERCASE_AND_FOLD, 54, "test that somehing happens with during naarmoelization"); // å -> aa and ø -> oe add two characters
+}
+
 TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp
index b926444e4df..f7f340a2182 100644
--- a/streamingvisitors/src/tests/textutil/textutil_test.cpp
+++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp
@@ -2,7 +2,6 @@
 #include <vespa/vespalib/testkit/testapp.h>
 
 #include <vespa/fastlib/text/normwordfolder.h>
-#include <vespa/searchlib/query/base.h>
 #include <vespa/vsm/searcher/fold.h>
 #include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
 #include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
@@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V &
     const byte * srcbuf = reinterpret_cast<const byte *>(input);
     auto dstbuf = std::make_unique<ucs4_t[]>(len + 1);
     auto offsets = std::make_unique<size_t[]>(len + 1);
-    UTF8StrChrFieldSearcher fs;
+    UTF8StrChrFieldSearcher fs(0);
     BW bw(dstbuf.get(), offsets.get());
     size_t dstlen = fs.skipSeparators(srcbuf, len, bw);
     EXPECT_EQUAL(dstlen, expdstbuf.size());
diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
index 8c1c3771917..38d0e942fbc 100644
--- a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
+++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h
@@ -17,15 +17,32 @@ private:
     search::fef::SimpleTermData   _termData;
 public:
     QueryTermData * clone() const override { return new QueryTermData(); }
-    search::fef::SimpleTermData &getTermData() { return _termData; }
+    search::fef::SimpleTermData &getTermData() noexcept { return _termData; }
+};
+
+class SearchMethodInfo { // Interface answering, per index name, how the fields behind that index are searched.
+public:
+    using Normalizing = search::streaming::Normalizing;
+    virtual ~SearchMethodInfo() = default;
+    virtual bool is_text_matching(vespalib::stringref index) const noexcept = 0; // QueryTermDataFactory uses this to decide allow_float_terms_rewrite()
+    virtual Normalizing normalizing_mode(vespalib::stringref index) const noexcept = 0; // normalizing mode reported for 'index'
+};
 
 class QueryTermDataFactory final : public search::streaming::QueryNodeResultFactory {
 public:
+    using Normalizing = search::streaming::Normalizing;
+    explicit QueryTermDataFactory(const SearchMethodInfo * searchMethodInfo) noexcept : _searchMethodInfo(searchMethodInfo) {} // nullptr is allowed; explicit prevents accidental pointer -> factory conversion
     std::unique_ptr<search::streaming::QueryNodeResultBase> create() const override {
         return std::make_unique<QueryTermData>();
     }
-    bool getRewriteFloatTerms() const override { return true; }
+    Normalizing normalizing_mode(vespalib::stringref index) const noexcept override { // delegates when info is available
+        return _searchMethodInfo ? _searchMethodInfo->normalizing_mode(index) : Normalizing::LOWERCASE_AND_FOLD; // fallback without info
+    }
+    bool allow_float_terms_rewrite(vespalib::stringref index) const noexcept override { // false when no info object was supplied
+        return _searchMethodInfo && _searchMethodInfo->is_text_matching(index);
+    }
+private:
+    const SearchMethodInfo * _searchMethodInfo; // not owned; may be nullptr
 };
 
 
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
index 4d31c71c0a0..cdd1a018d84 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
@@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult;
 using search::attribute::IAttributeVector;
 using search::expression::ConfigureStaticParams;
 using search::streaming::Query;
+using search::streaming::Normalizing;
 using search::streaming::QueryTermList;
 using storage::StorageComponent;
 using storage::VisitorEnvironment;
@@ -91,7 +92,7 @@ ForceWordfolderInit::ForceWordfolderInit()
                                     Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION);
 }
 
-static ForceWordfolderInit _G_forceNormWordFolderInit;
+static ForceWordfolderInit G_forceNormWordFolderInit;
 
 // Leftovers from FS4 protocol with limited use here.
 enum queryflags {
@@ -238,14 +239,16 @@ SearchVisitor::SummaryGenerator::fillSummary(AttributeVector::DocId lid, const H
     return {};
 }
 
-void SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj)
+void
+SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj)
 {
     auto & hitsAggr(static_cast<HitsAggregationResult &>(obj));
     hitsAggr.setSummaryGenerator(_summaryGenerator);
     _numHitsAggregators++;
 }
 
-bool SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const
+bool
+SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const
 {
     return obj.getClass().inherits(HitsAggregationResult::classId);
 }
@@ -259,7 +262,8 @@ SearchVisitor::GroupingEntry::GroupingEntry(Grouping * grouping) :
 
 SearchVisitor::GroupingEntry::~GroupingEntry() = default;
 
-void SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank)
+void
+SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank)
 {
     if (_count < _limit) {
         _grouping->aggregate(doc, rank);
@@ -310,7 +314,58 @@ SearchVisitor::SearchVisitor(StorageComponent& component,
     LOG(debug, "Created SearchVisitor");
 }
 
-void SearchVisitor::init(const Parameters & params)
+bool
+SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept {
+    // True as soon as one field added for 'index' has a search spec that uses a string search method.
+    StringFieldIdTMap fields;
+    _fieldSearchSpecMap.addFieldsFromIndex(index, fields);
+    const auto & specs = _fieldSearchSpecMap.specMap();
+    for (const auto & entry : fields.map()) {
+        auto spec = specs.find(entry.second);
+        if ((spec != specs.end()) && spec->second.uses_string_search_method()) return true;
+    }
+    return false;
+}
+
+namespace {
+
+// Number of fields in fieldIdMap that have a search spec whose searcher uses exactly 'mode'.
+// Shared by the two wrappers below so the lookup logic exists only once.
+size_t
+count_normalize_mode(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap, Normalizing mode) {
+    size_t count = 0;
+    for (const auto & fieldId : fieldIdMap.map()) {
+        auto found = specMap.find(fieldId.second);
+        if ((found != specMap.end()) && found->second.searcher().normalize_mode() == mode) {
+            count++;
+        }
+    }
+    return count;
+}
+
+size_t // size_t (not uint32_t) so the result compares against map().size() without narrowing
+count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+    return count_normalize_mode(specMap, fieldIdMap, Normalizing::LOWERCASE);
+}
+
+size_t
+count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+    return count_normalize_mode(specMap, fieldIdMap, Normalizing::NONE);
+}
+
+}
+
+SearchMethodInfo::Normalizing
+SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept {
+    StringFieldIdTMap fields; // filled with the fields added for 'index'
+    _fieldSearchSpecMap.addFieldsFromIndex(index, fields);
+    const auto & specs = _fieldSearchSpecMap.specMap();
+    if (count_normalize_none(specs, fields) == fields.map().size()) return Normalizing::NONE; // every field uses NONE
+    return (count_normalize_lowercase(specs, fields) == fields.map().size()) ? Normalizing::LOWERCASE : Normalizing::LOWERCASE_AND_FOLD;
+}
+
+void
+SearchVisitor::init(const Parameters & params)
 {
     VISITOR_TRACE(6, "About to lazily init VSM adapter");
     _attrMan.add(_documentIdAttributeBacking);
@@ -397,7 +452,14 @@ void SearchVisitor::init(const Parameters & params)
         if ( params.lookup("query", queryBlob) ) {
             LOG(spam, "Received query blob of %zu bytes", queryBlob.size());
             VISITOR_TRACE(9, vespalib::make_string("Setting up for query blob of %zu bytes", queryBlob.size()));
-            QueryTermDataFactory addOnFactory;
+            // Create mapping from field name to field id, from field id to search spec,
+            // and from index name to list of field ids
+            _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config());
+            auto additionalFields = registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs());
+            // Add extra elements to mapping from field name to field id
+            _fieldSearchSpecMap.buildFromConfig(additionalFields);
+
+            QueryTermDataFactory addOnFactory(this);
             _query = Query(addOnFactory, vespalib::stringref(queryBlob.data(), queryBlob.size()));
             _searchBuffer->reserve(0x10000);
 
@@ -408,19 +470,11 @@ void SearchVisitor::init(const Parameters & params)
                 LOG(warning, "Request without query stack count");
             }
 
-            std::vector<vespalib::string> additionalFields;
-            registerAdditionalFields(_env->get_docsum_tools()->getFieldSpecs(), additionalFields);
-
-            StringFieldIdTMap fieldsInQuery;
-            setupFieldSearchers(additionalFields, fieldsInQuery);
-
-
+            StringFieldIdTMap fieldsInQuery = setupFieldSearchers();
             setupScratchDocument(fieldsInQuery);
-
             _syntheticFieldsController.setup(_fieldSearchSpecMap.nameIdMap(), fieldsInQuery);
 
             setupAttributeVectors();
-
             setupAttributeVectorsForSorting(_sortSpec);
 
             _rankController.setRankManagerSnapshot(_env->get_rank_manager_snapshot());
@@ -436,7 +490,6 @@ void SearchVisitor::init(const Parameters & params)
             // This depends on _fieldPathMap (from setupScratchDocument),
             // and IQueryEnvironment (from setupRankProcessors).
             prepare_field_searchers();
-
         } else {
             LOG(warning, "No query received");
         }
@@ -529,10 +582,7 @@ SearchVisitor::PositionInserter::PositionInserter(AttributeVector & attribute, A
 SearchVisitor::PositionInserter::~PositionInserter() = default;
 
 void
-SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content & c)
-{
-    (void) c;
-}
+SearchVisitor::PositionInserter::onPrimitive(uint32_t, const Content &) { }
 
 void
 SearchVisitor::PositionInserter::onStructStart(const Content & c)
@@ -605,7 +655,6 @@ SearchVisitor::RankController::setupRankProcessors(Query & query,
 {
     _rankSetup = &_rankManagerSnapshot->getRankSetup(_rankProfile);
     _rankProcessor = std::make_unique<RankProcessor>(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, &attrMan);
-    LOG(debug, "Initialize rank processor");
     _rankProcessor->initForRanking(wantedHitCount);
     // register attribute vectors needed for ranking
     processAccessedAttributes(_rankProcessor->get_real_query_env(), true, attrMan, attributeFields);
@@ -637,8 +686,7 @@ SearchVisitor::RankController::rankMatchedDocument(uint32_t docId)
 {
     _rankProcessor->runRankProgram(docId);
     LOG(debug, "Rank score for matched document %u: %f",
-        docId,
-        _rankProcessor->getRankScore());
+        docId, _rankProcessor->getRankScore());
     if (_dumpFeatures) {
         _dumpProcessor->runRankProgram(docId);
         // we must transfer the score to this match data to make sure that the same hits
@@ -718,9 +766,8 @@ SearchVisitor::SyntheticFieldsController::setup(const StringFieldIdTMap & fieldR
 }
 
 void
-SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument & document)
+SearchVisitor::SyntheticFieldsController::onDocument(StorageDocument &)
 {
-    (void) document;
 }
 
 void
@@ -730,10 +777,10 @@ SearchVisitor::SyntheticFieldsController::onDocumentMatch(StorageDocument & docu
     document.setField(_documentIdFId, std::make_unique<document::StringFieldValue>(documentId));
 }
 
-void
-SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec,
-                                        std::vector<vespalib::string> & fieldList)
+std::vector<vespalib::string>
+SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec)
 {
+    std::vector<vespalib::string> fieldList;
     for (const vsm::DocsumTools::FieldSpec & spec : docsumSpec) {
         fieldList.push_back(spec.getOutputName());
         const std::vector<vespalib::string> & inputNames = spec.getInputNames();
@@ -748,25 +795,20 @@ SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::Fiel
     fieldList.emplace_back("[docid]");
     fieldList.emplace_back("[rank]");
     fieldList.emplace_back("documentid");
+    return fieldList;
 }
 
-void
-SearchVisitor::setupFieldSearchers(const std::vector<vespalib::string> & additionalFields,
-                                   StringFieldIdTMap & fieldsInQuery)
+StringFieldIdTMap
+SearchVisitor::setupFieldSearchers()
 {
-    // Create mapping from field name to field id, from field id to search spec,
-    // and from index name to list of field ids
-    _fieldSearchSpecMap.buildFromConfig(_env->get_vsm_fields_config());
-    // Add extra elements to mapping from field name to field id
-    _fieldSearchSpecMap.buildFromConfig(additionalFields);
-
     // Reconfig field searchers based on the query
     _fieldSearchSpecMap.reconfigFromQuery(_query);
 
     // Map field name to field id for all fields in the query
-    _fieldSearchSpecMap.buildFieldsInQuery(_query, fieldsInQuery);
+    StringFieldIdTMap fieldsInQuery = _fieldSearchSpecMap.buildFieldsInQuery(_query);
     // Connect field names in the query to field searchers
     _fieldSearchSpecMap.buildSearcherMap(fieldsInQuery.map(), _fieldSearcherMap);
+    return fieldsInQuery;
 }
 
 void
@@ -947,8 +989,7 @@ class SingleDocumentStore : public vsm::IDocSumCache
 {
 public:
     explicit SingleDocumentStore(const StorageDocument & doc) : _doc(doc) { }
-    const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override {
-        (void) docId;
+    const vsm::Document & getDocSum(const search::DocumentIdT &) const override {
         return _doc;
     }
 private:
@@ -959,19 +1000,12 @@ bool
 SearchVisitor::compatibleDocumentTypes(const document::DocumentType& typeA,
                                        const document::DocumentType& typeB)
 {
-    if (&typeA == &typeB) {
-        return true;
-    } else {
-        return (typeA.getName() == typeB.getName());
-    }
+    return (&typeA == &typeB) || (typeA.getName() == typeB.getName());
 }
 
 void
-SearchVisitor::handleDocuments(const document::BucketId&,
-                               DocEntryList & entries,
-                               HitCounter& hitCounter)
+SearchVisitor::handleDocuments(const document::BucketId&, DocEntryList & entries, HitCounter& )
 {
-    (void) hitCounter;
     if (!_init_called) {
         init(_params);
     }
@@ -1016,37 +1050,25 @@ SearchVisitor::handleDocument(StorageDocument & document)
         RankProcessor & rp = *_rankController.getRankProcessor();
         vespalib::string documentId(document.docDoc().getId().getScheme().toString());
         LOG(debug, "Matched document with id '%s'", documentId.c_str());
-
         document.setDocId(rp.getDocId());
-
         fillAttributeVectors(documentId, document);
-
         _rankController.rankMatchedDocument(rp.getDocId());
-
         if (_shouldFillRankAttribute) {
             _rankAttribute.add(rp.getRankScore());
         }
-
         if (_rankController.keepMatchedDocument()) {
-
             bool amongTheBest = _rankController.collectMatchedDocument(!_sortList.empty(), *this, _tmpSortBuffer, &document);
-
             _syntheticFieldsController.onDocumentMatch(document, documentId);
-
             SingleDocumentStore single(document);
             _summaryGenerator.setDocsumCache(single);
             group(document.docDoc(), rp.getRankScore(), false);
-
             if (amongTheBest) {
                 needToKeepDocument = true;
             }
-
         } else {
             _hitsRejectedCount++;
             LOG(debug, "Do not keep document with id '%s' because rank score (%f) <= rank score drop limit (%f)",
-                documentId.c_str(),
-                rp.getRankScore(),
-                _rankController.getRankSetup()->getRankScoreDropLimit());
+                documentId.c_str(), rp.getRankScore(), _rankController.getRankSetup()->getRankScoreDropLimit());
         }
     } else {
         LOG(debug, "Did not match document with id '%s'", document.docDoc().getId().getScheme().toString().c_str());
@@ -1145,7 +1167,8 @@ SearchVisitor::fillSortBuffer()
     return pos;
 }
 
-void SearchVisitor::completedBucket(const document::BucketId&, HitCounter&)
+void
+SearchVisitor::completedBucket(const document::BucketId&, HitCounter&)
 {
     LOG(debug, "Completed bucket");
 }
@@ -1157,7 +1180,8 @@ SearchVisitor::generate_query_result(HitCounter& counter)
     return std::move(_queryResult);
 }
 
-void SearchVisitor::completedVisitingInternal(HitCounter& hitCounter)
+void
+SearchVisitor::completedVisitingInternal(HitCounter& hitCounter)
 {
     if (!_init_called) {
         init(_params);
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
index ef7a41f23a5..ce40b5ba742 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
@@ -8,6 +8,7 @@
 #include "rankmanager.h"
 #include "rankprocessor.h"
 #include "searchenvironment.h"
+#include "querytermdata.h"
 #include <vespa/vsm/common/docsum.h>
 #include <vespa/vsm/common/documenttypemapping.h>
 #include <vespa/vsm/common/storagedocument.h>
@@ -42,7 +43,8 @@ class SearchEnvironmentSnapshot;
  * @brief Visitor that applies a search query to visitor data and
  * converts them to a QueryResultCommand.
  **/
-class SearchVisitor : public storage::Visitor {
+class SearchVisitor : public storage::Visitor,
+                      public SearchMethodInfo {
 public:
     SearchVisitor(storage::StorageComponent&, storage::VisitorEnvironment& vEnv,
                   const vdslib::Parameters & params);
@@ -253,19 +255,15 @@ private:
      * @param docsumSpec config with the field names used by the docsum setup.
-     * @param fieldList list of field names that are built.
+     * @return list of field names that are built.
      **/
-    static void registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec,
-                                         std::vector<vespalib::string> & fieldList);
+    static std::vector<vespalib::string> registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec);
 
     /**
      * Setup the field searchers used when matching the query with the stream of documents.
      * This includes setting up various mappings in FieldSearchSpecMap and building mapping
      * for fields used by the query.
-     *
+     * @return mapping from field name to field id for the fields used by the query.
-     * @param additionalFields list of additional field names used when setting up the mappings.
-     * @param fieldsInQuery mapping from field name to field id that are built based on the query.
      **/
-    void setupFieldSearchers(const std::vector<vespalib::string> & additionalFields,
-                             vsm::StringFieldIdTMap & fieldsInQuery);
+    vsm::StringFieldIdTMap setupFieldSearchers();
 
     /**
      * Prepare the field searchers for the given query.
@@ -488,6 +486,8 @@ private:
     vsm::StringFieldIdTMapT                 _fieldsUnion;
 
     void setupAttributeVector(const vsm::FieldPath &fieldPath);
+    bool is_text_matching(vespalib::stringref index) const noexcept override;
+    Normalizing normalizing_mode(vespalib::stringref index) const noexcept override;
 };
 
 class SearchVisitorFactory : public storage::VisitorFactory {
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
index 442a044d38f..dac732013d2 100644
--- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -14,6 +14,7 @@ fieldspec[].name string
 ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected.
 fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8
 fieldspec[].arg1 string default=""
+fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD
 
 ## Maximum number of chars to search per field.
 fieldspec[].maxlength int default=1048576
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
index 1a9238346b0..40aad418b22 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -17,6 +17,7 @@ vespa_add_library(vsm_vsmsearcher OBJECT
     intfieldsearcher.cpp
     nearest_neighbor_field_searcher.cpp
     strchrfieldsearcher.cpp
+    tokenizereader.cpp
     utf8flexiblestringfieldsearcher.cpp
     utf8strchrfieldsearcher.cpp
     utf8stringfieldsearcherbase.cpp
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
index c7e7d2e74bd..3708cca85fb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
@@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    BoolFieldSearcher(FieldIdT fId);
-    ~BoolFieldSearcher();
+    explicit BoolFieldSearcher(FieldIdT fId);
+    ~BoolFieldSearcher() override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
                  const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index c797e6751ee..5e06ae41a03 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -23,79 +23,54 @@ class force
   force() { FieldSearcher::init(); }
 };
 
-static force __forceInit;
+static force ForceInit;
 
 byte FieldSearcher::_foldLowCase[256];
 byte FieldSearcher::_wordChar[256];
 
-FieldSearcherBase::FieldSearcherBase() :
-    _qtl(),
-    _qtlFastBuffer(),
-    _qtlFastSize(0),
-    _qtlFast(nullptr)
+FieldSearcherBase::FieldSearcherBase() noexcept
+    : _qtl()
 {
 }
 
-FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) :
-    _qtl(),
-    _qtlFastBuffer(),
-    _qtlFastSize(0),
-    _qtlFast(nullptr)
+FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org)
+    : _qtl()
 {
     prepare(org._qtl);
 }
 
-FieldSearcherBase::~FieldSearcherBase()
-{
-}
-
-FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org)
-{
-    if (this != &org) {
-        prepare(org._qtl);
-    }
-    return *this;
-}
+FieldSearcherBase::~FieldSearcherBase() = default;
 
-void FieldSearcherBase::prepare(const QueryTermList & qtl)
+void
+FieldSearcherBase::prepare(const QueryTermList & qtl)
 {
     _qtl = qtl;
-    _qtlFastBuffer.resize(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13);
-    _qtlFast = reinterpret_cast<v16qi *>(reinterpret_cast<unsigned long>(&_qtlFastBuffer[0]+15) & ~0xf);
-    _qtlFastSize = 0;
-    for (auto qt : _qtl) {
-        memcpy(&_qtlFast[_qtlFastSize++], qt->getTerm(), std::min(size_t(16), qt->termLen()));
-    }
 }
 
-FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) :
-    FieldSearcherBase(),
-    _field(fId),
-    _matchType(defaultPrefix ? PREFIX : REGULAR),
-    _maxFieldLength(0x100000),
-    _currentElementId(0),
-    _currentElementWeight(1),
-    _pureUsAsciiCount(0),
-    _pureUsAsciiFieldCount(0),
-    _anyUtf8Count(0),
-    _anyUtf8FieldCount(0),
-    _words(0),
-    _badUtf8Count(0),
-    _zeroCount(0)
+FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
+    : FieldSearcherBase(),
+      _field(fId),
+      _matchType(defaultPrefix ? PREFIX : REGULAR),
+      _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
+      _maxFieldLength(0x100000),
+      _currentElementId(0),
+      _currentElementWeight(1),
+      _words(0),
+      _badUtf8Count(0)
 {
-    zeroStat();
 }
 
 FieldSearcher::~FieldSearcher() = default;
 
-bool FieldSearcher::search(const StorageDocument & doc)
+bool
+FieldSearcher::search(const StorageDocument & doc)
 {
     for (auto qt : _qtl) {
         QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
         fInfo.setHitOffset(qt->getHitList().size());
     }
     onSearch(doc);
-    for(auto qt : _qtl) {
+    for (auto qt : _qtl) {
         QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
         fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset());
         fInfo.setFieldLength(_words);
@@ -104,16 +79,16 @@ bool FieldSearcher::search(const StorageDocument & doc)
     return true;
 }
 
-void FieldSearcher::prepare(QueryTermList& qtl,
-                            const SharedSearcherBuf&,
-                            const vsm::FieldPathMapT&,
-                            search::fef::IQueryEnvironment&)
+void
+FieldSearcher::prepare(QueryTermList& qtl, const SharedSearcherBuf&,
+                       const vsm::FieldPathMapT&, search::fef::IQueryEnvironment&)
 {
     FieldSearcherBase::prepare(qtl);
     prepareFieldId();
 }
 
-size_t FieldSearcher::countWords(const FieldRef & f)
+size_t
+FieldSearcher::countWords(const FieldRef & f)
 {
     size_t words = 0;
     const char * n = f.data();
@@ -129,36 +104,16 @@ size_t FieldSearcher::countWords(const FieldRef & f)
     return words;
 }
 
-void FieldSearcher::prepareFieldId()
+void
+FieldSearcher::prepareFieldId()
 {
     for(auto qt : _qtl) {
         qt->resizeFieldId(field());
     }
 }
 
-void FieldSearcher::addStat(const FieldSearcher & toAdd)
-{
-    _pureUsAsciiCount += toAdd._pureUsAsciiCount;
-    _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount;
-    _anyUtf8Count += toAdd._anyUtf8Count;
-    _anyUtf8FieldCount += toAdd._anyUtf8FieldCount;
-    _badUtf8Count += toAdd._badUtf8Count;
-    _zeroCount += toAdd._zeroCount;
-    for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] += toAdd._utf8Count[i]; }
-}
-
-void FieldSearcher::zeroStat()
-{
-    _pureUsAsciiCount = 0;
-    _pureUsAsciiFieldCount = 0;
-    _anyUtf8Count = 0;
-    _anyUtf8FieldCount = 0;
-    _badUtf8Count = 0;
-    _zeroCount = 0;
-    for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] = 0; }
-}
-
-void FieldSearcher::init()
+void
+FieldSearcher::init()
 {
     for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) {
         _foldLowCase[i] = 0;
@@ -182,50 +137,59 @@ void FieldSearcher::init()
     _wordChar[0xd7] = 0;
     _wordChar[0xf7] = 0;
 
-    if (1) /* _doAccentRemoval */ {
-        _foldLowCase[0xc0] = 'a';
-        _foldLowCase[0xc1] = 'a';
-        _foldLowCase[0xc2] = 'a';
-        _foldLowCase[0xc3] = 'a';  // A tilde
-        _foldLowCase[0xc7] = 'c';
-        _foldLowCase[0xc8] = 'e';
-        _foldLowCase[0xc9] = 'e';
-        _foldLowCase[0xca] = 'e';
-        _foldLowCase[0xcb] = 'e';
-        _foldLowCase[0xcc] = 'i';  // I grave
-        _foldLowCase[0xcd] = 'i';
-        _foldLowCase[0xce] = 'i';
-        _foldLowCase[0xcf] = 'i';
-        _foldLowCase[0xd3] = 'o';
-        _foldLowCase[0xd4] = 'o';
-        _foldLowCase[0xda] = 'u';
-        _foldLowCase[0xdb] = 'u';
-
-        _foldLowCase[0xe0] = 'a';
-        _foldLowCase[0xe1] = 'a';
-        _foldLowCase[0xe2] = 'a';
-        _foldLowCase[0xe3] = 'a'; // a tilde
-        _foldLowCase[0xe7] = 'c';
-        _foldLowCase[0xe8] = 'e';
-        _foldLowCase[0xe9] = 'e';
-        _foldLowCase[0xea] = 'e';
-        _foldLowCase[0xeb] = 'e';
-        _foldLowCase[0xec] = 'i'; // i grave
-        _foldLowCase[0xed] = 'i';
-        _foldLowCase[0xee] = 'i';
-        _foldLowCase[0xef] = 'i';
-        _foldLowCase[0xf3] = 'o';
-        _foldLowCase[0xf4] = 'o';
-        _foldLowCase[0xfa] = 'u';
-        _foldLowCase[0xfb] = 'u';
-    }
+    _foldLowCase[0xc0] = 'a';
+    _foldLowCase[0xc1] = 'a';
+    _foldLowCase[0xc2] = 'a';
+    _foldLowCase[0xc3] = 'a';
+    _foldLowCase[0xc7] = 'c';
+    _foldLowCase[0xc8] = 'e';
+    _foldLowCase[0xc9] = 'e';
+    _foldLowCase[0xca] = 'e';
+    _foldLowCase[0xcb] = 'e';
+    _foldLowCase[0xcc] = 'i';
+    _foldLowCase[0xcd] = 'i';
+    _foldLowCase[0xce] = 'i';
+    _foldLowCase[0xcf] = 'i';
+    _foldLowCase[0xd1] = 'n';
+    _foldLowCase[0xd2] = 'o';
+    _foldLowCase[0xd3] = 'o';
+    _foldLowCase[0xd4] = 'o';
+    _foldLowCase[0xd5] = 'o';
+    _foldLowCase[0xd9] = 'u';
+    _foldLowCase[0xda] = 'u';
+    _foldLowCase[0xdb] = 'u';
+    _foldLowCase[0xdc] = 'u';
+    _foldLowCase[0xdd] = 'y';
+    _foldLowCase[0xe0] = 'a';
+    _foldLowCase[0xe1] = 'a';
+    _foldLowCase[0xe2] = 'a';
+    _foldLowCase[0xe3] = 'a';
+    _foldLowCase[0xe7] = 'c';
+    _foldLowCase[0xe8] = 'e';
+    _foldLowCase[0xe9] = 'e';
+    _foldLowCase[0xea] = 'e';
+    _foldLowCase[0xeb] = 'e';
+    _foldLowCase[0xec] = 'i';
+    _foldLowCase[0xed] = 'i';
+    _foldLowCase[0xee] = 'i';
+    _foldLowCase[0xef] = 'i';
+    _foldLowCase[0xf1] = 'n';
+    _foldLowCase[0xf2] = 'o';
+    _foldLowCase[0xf3] = 'o';
+    _foldLowCase[0xf4] = 'o';
+    _foldLowCase[0xf5] = 'o';
+    _foldLowCase[0xf9] = 'u';
+    _foldLowCase[0xfa] = 'u';
+    _foldLowCase[0xfb] = 'u';
+    _foldLowCase[0xfc] = 'u';
+    _foldLowCase[0xfd] = 'y';
+    _foldLowCase[0xff] = 'y';
 }
 
-void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
-                                  const SharedSearcherBuf& searcherBuf,
-                                  Query& query,
-                                  const vsm::FieldPathMapT& field_paths,
-                                  search::fef::IQueryEnvironment& query_env)
+void
+FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf,
+                             Query& query, const vsm::FieldPathMapT& field_paths,
+                             search::fef::IQueryEnvironment& query_env)
 {
     QueryTermList qtl;
     query.getLeaves(qtl);
@@ -269,7 +233,8 @@ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT& difm,
     LOG(debug, "Will search in %s", tmp.c_str());
 }
 
-bool FieldSearcher::onSearch(const StorageDocument & doc)
+bool
+FieldSearcher::onSearch(const StorageDocument & doc)
 {
     bool retval(true);
     size_t fNo(field());
@@ -296,10 +261,10 @@ FieldSearcher::IteratorHandler::onCollectionStart(const Content & c)
     const document::FieldValue & fv = c.getValue();
     LOG(spam, "onCollectionStart: field value '%s'", fv.toString().c_str());
     if (fv.isA(document::FieldValue::Type::ARRAY)) {
-        const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(fv);
+        const auto & afv = static_cast<const document::ArrayFieldValue &>(fv);
         LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size());
     } else if (fv.isA(document::FieldValue::Type::WSET)) {
-        const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv);
+        const auto & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv);
         LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size());
     }
 }
@@ -311,5 +276,4 @@ FieldSearcher::IteratorHandler::onStructStart(const Content & c)
     _searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue()));
 }
 
-
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index e79dacf827e..c5bca6f3899 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -14,77 +14,59 @@ namespace vsm {
 using termcount_t = size_t;
 using termsize_t = size_t;
 
-#if defined(COLLECT_CHAR_STAT)
-  #define NEED_CHAR_STAT(a) { a; }
-#else
-  #define NEED_CHAR_STAT(a)
-#endif
-
 using ucs4_t = uint32_t;
 using cmptype_t = ucs4_t;
 using SearcherBuf = vespalib::Array<cmptype_t>;
 using SharedSearcherBuf = std::shared_ptr<SearcherBuf>;
-using CharVector = std::vector<char>;
 
 class FieldSearcherBase
 {
 protected:
-    search::streaming::QueryTermList _qtl;
-private:
-    CharVector    _qtlFastBuffer;
-protected:
-    FieldSearcherBase();
+    FieldSearcherBase() noexcept;
     FieldSearcherBase(const FieldSearcherBase & org);
-    virtual ~FieldSearcherBase(void);
-    FieldSearcherBase & operator = (const FieldSearcherBase & org);
+    virtual ~FieldSearcherBase();
+    FieldSearcherBase & operator = (const FieldSearcherBase & org) = delete;
     void prepare(const search::streaming::QueryTermList & qtl);
-    size_t          _qtlFastSize;
-    search::v16qi  *_qtlFast;
+protected:
+    search::streaming::QueryTermList _qtl;
 };
 
 class FieldSearcher : public FieldSearcherBase
 {
 public:
+    using Normalizing = search::streaming::Normalizing;
     enum MatchType {
         REGULAR,
         PREFIX,
         SUBSTRING,
         SUFFIX,
-        EXACT
+        EXACT,
     };
 
-    FieldSearcher(FieldIdT fId, bool defaultPrefix=false);
+    explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {}
+    FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept;
     ~FieldSearcher() override;
     virtual std::unique_ptr<FieldSearcher> duplicate() const = 0;
     bool search(const StorageDocument & doc);
-    virtual void prepare(search::streaming::QueryTermList& qtl,
-                         const SharedSearcherBuf& buf,
-                         const vsm::FieldPathMapT& field_paths,
-                         search::fef::IQueryEnvironment& query_env);
-
-    FieldIdT field()                 const { return _field; }
-    void field(FieldIdT v)                 { _field = v; prepareFieldId(); }
-    bool prefix()                    const { return _matchType == PREFIX; }
-    bool substring()                 const { return _matchType == SUBSTRING; }
-    bool suffix()                    const { return _matchType == SUFFIX; }
-    bool exact()                     const { return _matchType == EXACT; }
-    void setMatchType(MatchType mt)        { _matchType = mt; }
+    virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf,
+                         const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env);
+
+    FieldIdT field()             const noexcept { return _field; }
+    bool prefix()                const noexcept { return _matchType == PREFIX; }
+    bool substring()             const noexcept { return _matchType == SUBSTRING; }
+    bool suffix()                const noexcept { return _matchType == SUFFIX; }
+    bool exact()                 const noexcept { return _matchType == EXACT; }
+    Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+    MatchType match_type()       const noexcept { return _matchType; }
+    void match_type(MatchType mt)         noexcept { _matchType = mt; }
+    void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; }
+    void field(FieldIdT v)                noexcept { _field = v; prepareFieldId(); }
     static void init();
     static search::byte fold(search::byte c)               { return _foldLowCase[c]; }
     static search::byte iswordchar(search::byte c)         { return _wordChar[c]; }
     static search::byte isspace(search::byte c)            { return ! iswordchar(c); }
     static size_t countWords(const FieldRef & f);
-    unsigned pureUsAsciiCount()      const { return _pureUsAsciiCount; }
-    unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; }
-    unsigned anyUtf8Count()          const { return _anyUtf8Count; }
-    unsigned anyUtf8FieldCount()     const { return _anyUtf8FieldCount; }
-    unsigned badUtf8Count()          const { return _badUtf8Count; }
-    unsigned zeroCount()             const { return _zeroCount; }
-    unsigned utf8Count(size_t sz)    const { return _utf8Count[1+sz]; }
-    const unsigned * utf8Count()     const { return _utf8Count; }
-    int32_t getCurrentWeight()       const { return _currentElementWeight; }
-    void addStat(const FieldSearcher & toAdd);
-    void zeroStat();
+    int32_t currentWeight()       const { return _currentElementWeight; }
     FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
     size_t maxFieldLength() const { return _maxFieldLength; }
 
@@ -98,7 +80,7 @@ private:
         void onStructStart(const Content & c) override;
 
     public:
-        IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {}
+        explicit IteratorHandler(FieldSearcher & searcher) noexcept : _searcher(searcher) {}
     };
     friend class IteratorHandler; // to allow calls to onValue();
 
@@ -110,33 +92,21 @@ private:
     virtual void onStructValue(const document::StructFieldValue &) { }
     FieldIdT      _field;
     MatchType     _matchType;
+    Normalizing   _normalize_mode;
     unsigned      _maxFieldLength;
     uint32_t      _currentElementId;
     int32_t       _currentElementWeight; // Contains the weight of the current item being evaluated.
-    /// Number of bytes in blocks containing pure us-ascii
-    unsigned _pureUsAsciiCount;
-    /// Number of blocks containing pure us-ascii
-    unsigned _pureUsAsciiFieldCount;
-    /// Number of bytes in blocks containing any non us-ascii
-    unsigned _anyUtf8Count;
-    /// Number of blocks containing any non us-ascii
-    unsigned _anyUtf8FieldCount;
 protected:
     /// Number of terms searched.
-    unsigned _words;
+    unsigned      _words;
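-    /// Number of utf8 bytes by utf8 size.
+    /// Count of bad utf8 input. NOTE(review): meaning inferred from the name; confirm against its users.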
-    unsigned _utf8Count[6];
-    unsigned _badUtf8Count;
-    unsigned _zeroCount;
-protected:
-    void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; }
-    void addAnyUtf8Field(size_t sz)     { _anyUtf8Count += sz; _anyUtf8FieldCount++; }
+    unsigned      _badUtf8Count;
     /**
      * Adds a hit to the given query term.
      * For each call to onValue() a batch of words are processed, and the position is local to this batch.
      **/
     void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const {
-        qt.add(_words + pos, field(), _currentElementId, getCurrentWeight());
+        qt.add(_words + pos, field(), _currentElementId, _currentElementWeight);
     }
 public:
     static search::byte _foldLowCase[256];
@@ -149,10 +119,8 @@ using FieldIdTSearcherMapT = std::vector<FieldSearcherContainer>;
 class FieldIdTSearcherMap : public FieldIdTSearcherMapT
 {
 public:
-    void prepare(const DocumentTypeIndexFieldMapT& difm,
-                 const SharedSearcherBuf& searcherBuf,
-                 search::streaming::Query& query,
-                 const vsm::FieldPathMapT& field_paths,
+    void prepare(const DocumentTypeIndexFieldMapT& difm, const SharedSearcherBuf& searcherBuf,
+                 search::streaming::Query& query, const vsm::FieldPathMapT& field_paths,
                  search::fef::IQueryEnvironment& query_env);
 };
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
index 7dd40348f47..8558522003f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
@@ -37,7 +37,7 @@ void FloatFieldSearcherT<T>::prepare(search::streaming::QueryTermList& qtl,
     _floatTerm.clear();
     FieldSearcher::prepare(qtl, buf, field_paths, query_env);
     for (auto qt : qtl) {
-    size_t sz(qt->termLen());
+        size_t sz(qt->termLen());
         if (sz) {
             auto range = qt->getRange<T>();
             _floatTerm.emplace_back(range.low, range.high, range.valid);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
index 07b3f6e1c5f..85341472c26 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
@@ -9,8 +9,8 @@ template <typename T>
 class FloatFieldSearcherT : public FieldSearcher
 {
 public:
-    FloatFieldSearcherT(FieldIdT fId=0);
-    ~FloatFieldSearcherT();
+    explicit FloatFieldSearcherT(FieldIdT fId);
+    ~FloatFieldSearcherT() override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
                  const vsm::FieldPathMapT& field_paths,
@@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { }
+    explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { }
 };
 
 class DoubleFieldSearcher : public FloatFieldSearcherTD
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { }
+    explicit DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { }
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
index a2122f08995..c0b5117d6bf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
@@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const
     return std::make_unique<FUTF8StrChrFieldSearcher>(*this);
 }
 
-FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher()
-    : UTF8StrChrFieldSearcher(),
-      _folded(4_Ki)
-{ }
 FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId)
     : UTF8StrChrFieldSearcher(fId),
       _folded(4_Ki)
@@ -36,7 +32,7 @@ FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded
   for(size_t i=0; i < sz; i++) {
     byte c = toFold[i];
     if (c>=128) { retval = false; break; }
-    folded[i] = FieldSearcher::_foldLowCase[c];
+    folded[i] = fold(c);
   }
   return retval;
 }
@@ -209,7 +205,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
     folded[f.size()+1] = 0x01;
     memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
     return match(folded, f.size(), qt);
-    NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
   } else {
     return UTF8StrChrFieldSearcher::matchTerm(f, qt);
   }
@@ -227,7 +222,6 @@ size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t min
     folded[f.size()+1] = 0x01;
     memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
     return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size());
-    NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
   } else {
     return UTF8StrChrFieldSearcher::matchTerms(f, mintsz);
   }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
index 5d5ca3d6c3c..b8aa287070a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
@@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    FUTF8StrChrFieldSearcher();
-    FUTF8StrChrFieldSearcher(FieldIdT fId);
+    explicit FUTF8StrChrFieldSearcher(FieldIdT fId);
     ~FUTF8StrChrFieldSearcher() override;
     static bool ansiFold(const char * toFold, size_t sz, char * folded);
     static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart);
     static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart);
  private:
     size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    size_t matchTerms(const FieldRef&, const size_t shortestTerm) override;
+    size_t matchTerms(const FieldRef&, size_t shortestTerm) override;
     virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt);
     size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize);
     std::vector<char> _folded;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
index 741148fbca1..17c9f23fefb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
@@ -8,8 +8,8 @@ namespace vsm {
 
 class GeoPosFieldSearcher : public FieldSearcher {
 public:
-    GeoPosFieldSearcher(FieldIdT fId=0);
-    ~GeoPosFieldSearcher();
+    explicit GeoPosFieldSearcher(FieldIdT fId);
+    ~GeoPosFieldSearcher() override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
                  const vsm::FieldPathMapT& field_paths,
@@ -21,7 +21,7 @@ protected:
     using GeoLocation = search::common::GeoLocation;
     class GeoPosInfo : public GeoLocation {
     public:
-        GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
+        explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
         bool cmp(const document::StructFieldValue & fv) const;
     };
     using GeoPosInfoListT = std::vector<GeoPosInfo>;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
index 47b83c1538d..9c63d31e3c3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
@@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    IntFieldSearcher(FieldIdT fId=0);
-    ~IntFieldSearcher();
+    explicit IntFieldSearcher(FieldIdT fId);
+    ~IntFieldSearcher() override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
                  const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
index 76fedbd1166..816317bf86d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
 }
 
 DistanceMetric
-NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value)
 {
     // Valid string values must match the definition of DistanceMetric in
     // config-model/src/main/java/com/yahoo/schema/document/Attribute.java
-    auto v = value;
+    vespalib::string v = value;
     std::transform(v.begin(), v.end(), v.begin(),
                    [](unsigned char c) { return std::tolower(c); });
     try {
         return DistanceMetricUtils::to_distance_metric(v);
     } catch (vespalib::IllegalStateException&) {
-        vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+        vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", vespalib::string(value).c_str());
         return DistanceMetric::Euclidean;
     }
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
index 5629b443c78..ecdc64d1336 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -11,10 +11,7 @@
 #include <vespa/searchlib/tensor/tensor_ext_attribute.h>
 
 namespace search::fef { class IQueryEnvironment; }
-
-namespace search::tensor {
-class TensorExtAttribute;
-}
+namespace search::tensor { class TensorExtAttribute; }
 
 namespace vsm {
 
@@ -43,7 +40,7 @@ private:
 public:
     NearestNeighborFieldSearcher(FieldIdT fid,
                                  search::attribute::DistanceMetric metric);
-    ~NearestNeighborFieldSearcher();
+    ~NearestNeighborFieldSearcher() override;
 
     std::unique_ptr<FieldSearcher> duplicate() const override;
     void prepare(search::streaming::QueryTermList& qtl,
@@ -52,7 +49,7 @@ public:
                  search::fef::IQueryEnvironment& query_env) override;
     void onValue(const document::FieldValue& fv) override;
 
-    static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+    static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value);
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
index 9ad76712092..19c723d060d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
@@ -8,8 +8,7 @@ namespace vsm {
 class StrChrFieldSearcher : public FieldSearcher
 {
 public:
-    StrChrFieldSearcher() : FieldSearcher(0) { }
-    StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
+    explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
     void onValue(const document::FieldValue & fv) override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
@@ -19,7 +18,7 @@ private:
     size_t shortestTerm() const;
     bool matchDoc(const FieldRef & field);
     virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0;
-    virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0;
+    virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0;
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
new file mode 100644
index 00000000000..d8a6091fe11
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -0,0 +1,25 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokenizereader.h"
+
+namespace vsm {
+
+/**
+ * Appends the lowercased and folded form of 'c' to the output buffer.
+ *
+ * If the word folder has a replacement string for 'c', that string is copied
+ * (an empty replacement appends nothing); otherwise the single
+ * lowercased and folded character is appended.
+ */
+void
+TokenizeReader::fold(ucs4_t c) {
+    const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
+    if (repl == nullptr) {
+        *_q++ = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+    } else if (repl[0] != '\0') {
+        // Only the first byte is needed to tell whether the replacement is empty.
+        _q = Fast_UnicodeUtil::ucs4copy(_q, repl);
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
new file mode 100644
index 00000000000..f10c8910e82
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.h
@@ -0,0 +1,79 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/streaming/querynoderesultbase.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/fastlib/text/normwordfolder.h>
+#include <cstddef>
+#include <cstdint>
+
+namespace vsm {
+
+/**
+ * Handles tokenization of utf8 input with on the fly normalization.
+ * It handles Normalizing::NONE, Normalizing::LOWERCASE, and Normalizing::LOWERCASE_AND_FOLD
+ *
+ * The reader decodes characters from the input range [p, p + len) and appends
+ * normalized characters to the caller supplied output buffer 'q'.
+ * The reader does no bounds checking on the output buffer; the caller must
+ * make sure it is large enough for the longest token plus a terminating zero.
+ */
+class TokenizeReader {
+public:
+    using byte = search::byte;
+    using Normalizing = search::streaming::Normalizing;
+    /**
+     * @param p   start of the utf8 input.
+     * @param len number of input bytes.
+     * @param q   start of the output buffer; every token is written from here.
+     */
+    TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept
+        : _p(p),
+          _p_end(p + len),
+          _q(q),
+          _q_start(q)
+    {}
+    /// Returns true while there are unread input bytes.
+    bool hasNext() const noexcept { return _p < _p_end; }
+    /**
+     * Decodes and returns the next input character, advancing the read position.
+     * NOTE(review): the decode is not given _p_end; confirm that a truncated
+     * multi-byte sequence at the end of the input cannot be read past.
+     */
+    ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
+    /// Appends 'c' to the current token after applying the given normalization.
+    void normalize(ucs4_t c, Normalizing normalize_mode) {
+        switch (normalize_mode) {
+            case Normalizing::LOWERCASE:
+                c = Fast_NormalizeWordFolder::lowercase(c);
+                [[fallthrough]];
+            case Normalizing::NONE:
+                *_q++ = c;
+                break;
+            case Normalizing::LOWERCASE_AND_FOLD:
+                fold(c);
+                break;
+        }
+    }
+    /// Current read position in the input.
+    const byte * p() const noexcept { return _p; }
+    /**
+     * Terminates the current token with a zero, rewinds the write position to
+     * the start of the output buffer and returns the token length in characters.
+     */
+    size_t complete() noexcept {
+        *_q = 0;
+        size_t token_len = _q - _q_start;
+        _q = _q_start;
+        return token_len;
+    }
+private:
+    /// Appends the lowercased and folded form of 'c'; may append zero or several characters.
+    void fold(ucs4_t c);
+    const byte *_p;       // next input byte to decode
+    const byte *_p_end;   // one past the last input byte
+    ucs4_t     *_q;       // next output position
+    ucs4_t     *_q_start; // start of the output buffer
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
index 724efb54331..70cef08428a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
@@ -7,6 +7,13 @@ using search::streaming::QueryTermList;
 
 namespace vsm {
 
+UTF8ExactStringFieldSearcher::UTF8ExactStringFieldSearcher(FieldIdT fId)
+    : UTF8StringFieldSearcherBase(fId)
+{
+    match_type(EXACT);
+    normalize_mode(Normalizing::LOWERCASE);
+}
+
 std::unique_ptr<FieldSearcher>
 UTF8ExactStringFieldSearcher::duplicate() const
 {
@@ -14,7 +21,7 @@ UTF8ExactStringFieldSearcher::duplicate() const
 }
 
 size_t
-UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
 {
     (void) mintsz;
     for (auto qt : _qtl) {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
index 997bed74787..9f590156a96 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -1,10 +1,9 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
 
-namespace vsm
-{
+namespace vsm {
 
 /**
  * This class does suffix utf8 searches.
@@ -12,14 +11,12 @@ namespace vsm
 class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase
 {
 protected:
-    virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    UTF8ExactStringFieldSearcher()             : UTF8StringFieldSearcherBase() { }
-    UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+    explicit UTF8ExactStringFieldSearcher(FieldIdT fId);
 };
 
 }
-
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 655b068e152..78f491198ad 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
     }
 }
 
-UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() :
-    UTF8StringFieldSearcherBase()
-{ }
-
 UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) :
     UTF8StringFieldSearcherBase(fId)
 { }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index 5eee6a8862a..bb1b55dffe4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -1,10 +1,9 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
 
-namespace vsm
-{
+namespace vsm {
 
 /**
  * This class does utf8 searches based on the query term type.
@@ -17,18 +16,17 @@ private:
      * Tries to match the given query term against the content of the given field reference.
      * Search strategy is choosen based on the query term type.
      **/
-    virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
 
     /**
      * Tries to match each query term in the underlying query against the content of the given field reference.
      * Search strategy is choosen based on the query term type.
      **/
-    virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    UTF8FlexibleStringFieldSearcher();
-    UTF8FlexibleStringFieldSearcher(FieldIdT fId);
+    explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 2488d198b03..37dc4ffb99c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8strchrfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::streaming::QueryTerm;
 using search::streaming::QueryTermList;
@@ -14,21 +15,19 @@ UTF8StrChrFieldSearcher::duplicate() const
 }
 
 size_t
-UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
 {
     (void) mintsz;
     termcount_t words(0);
-    const byte * n = reinterpret_cast<const byte *> (f.data());
-    const byte * e = n + f.size();
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
     cmptype_t * fn = &(*_buf.get())[0];
-    size_t fl(0);
 
-    for( ; n < e; ) {
-        if (!*n) { _zeroCount++; n++; }
-        n = tokenize(n, _buf->capacity(), fn, fl);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t fl = reader.complete();
         for (auto qt : _qtl) {
             const cmptype_t * term;
             termsize_t tsz = qt->term(term);
@@ -42,7 +41,6 @@ UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
         }
         words++;
     }
-    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
     return words;
 }
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
index cfe546bc6f6..663ee3a1a62 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
@@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    UTF8StrChrFieldSearcher()             : UTF8StringFieldSearcherBase() { }
-    UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
-
+    explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
 protected:
     size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index 4daea693e95..5036e9bedb1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,7 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
+#include "tokenizereader.h"
 #include <cassert>
 
 using search::streaming::QueryTerm;
@@ -10,115 +10,46 @@ using search::byte;
 
 namespace vsm {
 
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
-    if (maxSz > 0) {
-        maxSz--;
-    }
-    ucs4_t c(*p);
-    ucs4_t *q(dstbuf);
-    const byte * end(p+maxSz);
-
-    // Skip non-word characters between words
-    for (; p < end; ) {
-        if (c < 128) {
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) {
-                *q++ = Fast_NormalizeWordFolder::_foldCase[c];
-                c = 0;
-            } else {
-                c = *p;
-            }
-        } else {
-            const byte * oldP(p);
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (Fast_UnicodeUtil::IsWordChar(c)) {
-                _utf8Count[p-oldP-1]++;
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != NULL) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::ToFold(c);
-                    *q++ = c;
-                }
-                break;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                } else {
-                    _utf8Count[p-oldP-1]++;
-                }
-                c = *p;
-            }
-        }
-    }
-
-    c = *p;  // Next char
-    for (; p < end;) {
-        if (c < 128) {             // Common case, ASCII
-            if (!c) { break; }
-            p++;
-            if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) {
-                c = 0;
-            } else {
-                *q++ = Fast_NormalizeWordFolder::_foldCase[c];
-                c = *p;
-            }
-        } else {
-            const byte * oldP(p);
-            c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
-                _utf8Count[p-oldP-1]++;
-                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-                if (repl != NULL) {
-                    size_t repllen = strlen(repl);
-                    if (repllen > 0) {
-                        q = Fast_UnicodeUtil::ucs4copy(q,repl);
-                    }
-                } else {
-                    c = Fast_NormalizeWordFolder::ToFold(c);
-                    *q++ = c;
-                }
-
-                c = *p;
-            } else {
-                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
-                    _badUtf8Count++;
-                } else {
-                    _utf8Count[p-oldP-1]++;
-                }
-                break;
-            }
+/**
+ * Reads one token from the reader: skips leading non-word characters and then
+ * feeds consecutive word characters through reader.normalize() until a
+ * non-word character or the end of input is reached.
+ * The caller fetches the token by calling reader.complete().
+ */
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
+    ucs4_t c(0);
+    Normalizing norm_mode = normalize_mode();
+    while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
+
+    if (Fast_UnicodeUtil::IsWordChar(c)) {
+        reader.normalize(c, norm_mode);
+        while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
+            reader.normalize(c, norm_mode);
         }
     }
-    *q = 0;
-    tokenlen = q - dstbuf;
-    return p;
 }
+
+// Explicit instantiation: other translation units call tokenize() with a
+// TokenizeReader but only see the declaration in the header.
+template void UTF8StringFieldSearcherBase::tokenize<TokenizeReader>(TokenizeReader & reader);
 
 size_t
 UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words(0);
-    const byte * n = reinterpret_cast<const byte *> (f.data());
-    // __builtin_prefetch(n, 0, 0);
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
-    const byte * e = n + f.size();
     if ( f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * fn = &(*_buf.get())[0];
-    size_t fl(0);
+    cmptype_t * fn = _buf->data();
 
-    for( ; n < e; ) {
-        if (!*n) { _zeroCount++; n++; }
-        n = tokenize(n, _buf->capacity(), fn, fl);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t fl = reader.complete();
         if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
             const cmptype_t *tt=term, *et=term+tsz;
             for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -128,33 +49,35 @@ UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt
         }
         words++;
     }
-    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
     return words;
 }
 
 size_t
 UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
 {
-    const byte * n = reinterpret_cast<const byte *> (f.data());
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
     const cmptype_t * eterm = term+tsz;
-    const byte * e = n + f.size();
+    if ( f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * fn = _buf->data();
     if (tsz <= f.size()) {
         bool equal(true);
-        for (; equal && (n < e) && (term < eterm); term++) {
-            if (*term < 0x80) {
-                equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]);
-            } else {
-                cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
-                equal = (*term == c);
+        Normalizing norm_mode = normalize_mode();
+        TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+        while (equal && reader.hasNext() && (term < eterm)) {
+            reader.normalize(reader.next(), norm_mode);
+            size_t len = reader.complete();
+            for (size_t i(0); equal && (i < len); i++) {
+                equal = (term < eterm) && (*term++ == fn[i]);
             }
+            // term now points past the compared characters; a mismatch or running out of term stops the match.
         }
-        if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) {
+        if (equal && (term == eterm) && (qt.isPrefix() || ! reader.hasNext())) {
             addHit(qt,0);
         }
     }
-    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
     return 1;
 }
 
@@ -188,7 +111,6 @@ UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm &
             }
         }
     }
-    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
     return words + 1; // we must also count the last word
 }
 
@@ -196,22 +118,17 @@ size_t
 UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
 {
     termcount_t words = 0;
-    const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
-    const byte * srcend = srcbuf + f.size();
     const cmptype_t * term;
     termsize_t tsz = qt.term(term);
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
-    cmptype_t * dstbuf = &(*_buf.get())[0];
-    size_t tokenlen = 0;
+    cmptype_t * dstbuf = _buf->data();
 
-    for( ; srcbuf < srcend; ) {
-        if (*srcbuf == 0) {
-            ++_zeroCount;
-            ++srcbuf;
-        }
-        srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t tokenlen = reader.complete();
         if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
             addHit(qt, words);
         }
@@ -220,11 +137,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
     return words;
 }
 
-UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() :
-    StrChrFieldSearcher()
-{
-}
-
 UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) :
     StrChrFieldSearcher(fId)
 {
@@ -280,12 +192,12 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
         if (c < 128) {
             p++;
             if (!isSeparatorCharacter(c)) {
-                dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b));
+                dstbuf.onCharacter(Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c), (oldP - b));
             }
         } else {
             c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
             const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
-            if (repl != NULL) {
+            if (repl != nullptr) {
                 size_t repllen = strlen(repl);
                 if (repllen > 0) {
                     ucs4_t * buf = dstbuf.getBuf();
@@ -300,13 +212,11 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
                     }
                 }
             } else {
-                c = Fast_NormalizeWordFolder::ToFold(c);
+                c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
                 dstbuf.onCharacter(c, (oldP - b));
             }
             if (c == Fast_UnicodeUtil::_BadUTF8Char) {
                 _badUtf8Count++;
-            } else {
-                _utf8Count[p-oldP-1]++;
             }
         }
     }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 38aac508f4f..b196f2795a4 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -28,15 +28,15 @@ public:
         ucs4_t * _cbuf;
 
     public:
-        BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { }
-        BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { }
+        explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { }
+        BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { }
         void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; }
         void onOffset(size_t) { }
         void incBuf(size_t inc) { _cbuf += inc; }
         ucs4_t * getBuf() { return _cbuf; }
-        bool valid() { return true; }
-        size_t size() { return (_cbuf - _bbuf); }
-        bool hasOffsets() { return false; }
+        bool valid() const noexcept { return true; }
+        size_t size() const noexcept { return (_cbuf - _bbuf); }
+        bool hasOffsets() const noexcept { return false; }
     };
 
     /**
@@ -50,17 +50,18 @@ public:
         size_t * _coff;
 
     public:
-        OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
+        explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
         void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
         void onOffset(size_t of) { *_coff++ = of; }
-        bool valid() { return (size() == (size_t)(_coff - _boff)); }
-        bool hasOffsets() { return true; }
+        bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); }
+        bool hasOffsets() const noexcept { return true; }
     };
 
 protected:
     SharedSearcherBuf _buf;
 
-    const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+    template<typename Reader>
+    void tokenize(Reader & reader);
 
     /**
      * Matches the given query term against the words in the given field reference
@@ -103,9 +104,8 @@ protected:
     size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt);
 
 public:
-    UTF8StringFieldSearcherBase();
-    UTF8StringFieldSearcherBase(FieldIdT fId);
-    ~UTF8StringFieldSearcherBase();
+    explicit UTF8StringFieldSearcherBase(FieldIdT fId);
+    ~UTF8StringFieldSearcherBase() override;
     void prepare(search::streaming::QueryTermList& qtl,
                  const SharedSearcherBuf& buf,
                  const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
index 88091c6ab4e..fcc2893a71d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
@@ -1,6 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
-#include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include "utf8substringsearcher.h"
 #include <vespa/fastlib/text/unicodeutil.h>
 
 using search::byte;
@@ -45,8 +45,6 @@ UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
             for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ );
         }
     }
-
-    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
     return words + 1; // we must also count the last word
 }
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
index b1455d5c5f6..cee35993ce7 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
@@ -1,7 +1,7 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h>
+#include "utf8strchrfieldsearcher.h"
 
 namespace vsm {
 
@@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase
 {
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    UTF8SubStringFieldSearcher()             : UTF8StringFieldSearcherBase() { }
-    UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+    explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
 protected:
     size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
index 8403e69658f..6d8a399cd33 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
@@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char *
     _modified->put(_unitSep);
 }
 
-UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() :
-    UTF8StringFieldSearcherBase(),
-    _modified(new CharBuffer(32)),
-    _offsets(new std::vector<size_t>(32)),
-    _readPtr(NULL),
-    _unitSep(juniper::separators::unit_separator)
-{
-}
-
 UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) :
     UTF8StringFieldSearcherBase(fId),
     _modified(new CharBuffer(32)),
     _offsets(new std::vector<size_t>(32)),
-    _readPtr(NULL),
+    _readPtr(nullptr),
     _unitSep(juniper::separators::unit_separator)
 {
 }
@@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId,
     UTF8StringFieldSearcherBase(fId),
     _modified(modBuf),
     _offsets(offBuf),
-    _readPtr(NULL),
+    _readPtr(nullptr),
     _unitSep(juniper::separators::unit_separator)
 {
 }
 
-UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {}
+UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default;
 
 }
 
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
index ebb806de61c..99e6c29961f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
@@ -23,8 +23,8 @@ private:
     const char        * _readPtr;  // buffer to read from (field reference)
     char                _unitSep;  // the unit separator character to use
 
-    virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 
     /**
      * Copies n bytes from the field reference to the modified buffer and updates the read pointer.
@@ -51,9 +51,8 @@ public:
 
     std::unique_ptr<FieldSearcher> duplicate() const override;
 
-    UTF8SubstringSnippetModifier();
-    UTF8SubstringSnippetModifier(FieldIdT fId);
-    ~UTF8SubstringSnippetModifier();
+    explicit UTF8SubstringSnippetModifier(FieldIdT fId);
+    ~UTF8SubstringSnippetModifier() override;
 
     /**
      * Creates a new instance.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index e28ce114225..8bbacf168cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -1,5 +1,6 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8suffixstringfieldsearcher.h"
+#include "tokenizereader.h"
 
 using search::byte;
 using search::streaming::QueryTerm;
@@ -14,24 +15,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const
 }
 
 size_t
-UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
 {
     (void) mintsz;
     termcount_t words = 0;
-    const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
-    const byte * srcend = srcbuf + f.size();
     if (f.size() >= _buf->size()) {
         _buf->reserve(f.size() + 1);
     }
     cmptype_t * dstbuf = &(*_buf.get())[0];
-    size_t tokenlen = 0;
 
-    for( ; srcbuf < srcend; ) {
-        if (*srcbuf == 0) {
-            ++_zeroCount;
-            ++srcbuf;
-        }
-        srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+    TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+    while ( reader.hasNext() ) {
+        tokenize(reader);
+        size_t tokenlen = reader.complete();
         for (auto qt : _qtl) {
             const cmptype_t * term;
             termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
index 556f61a714f..dc3bc214b49 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
@@ -1,10 +1,9 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #pragma once
 
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
 
-namespace vsm
-{
+namespace vsm {
 
 /**
  * This class does suffix utf8 searches.
@@ -12,13 +11,12 @@ namespace vsm
 class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase
 {
 protected:
-    virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
-    virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
 
 public:
     std::unique_ptr<FieldSearcher> duplicate() const override;
-    UTF8SuffixStringFieldSearcher()             : UTF8StringFieldSearcherBase() { }
-    UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+    explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
 };
 
 }
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index e33408a2e26..715c19a0bb7 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -28,30 +28,30 @@ namespace vsm {
 
 namespace {
 
-void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
+void
+setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
     if (arg1 == "prefix") {
-        searcher->setMatchType(FieldSearcher::PREFIX);
+        searcher->match_type(FieldSearcher::PREFIX);
     } else if (arg1 == "substring") {
-        searcher->setMatchType(FieldSearcher::SUBSTRING);
+        searcher->match_type(FieldSearcher::SUBSTRING);
     } else if (arg1 == "suffix") {
-        searcher->setMatchType(FieldSearcher::SUFFIX);
-    } else if (arg1 == "exact") {
-        searcher->setMatchType(FieldSearcher::EXACT);
-    } else if (arg1 == "word") {
-        searcher->setMatchType(FieldSearcher::EXACT);
+        searcher->match_type(FieldSearcher::SUFFIX);
+    } else if ((arg1 == "exact") || (arg1 == "word")) {
+        searcher->match_type(FieldSearcher::EXACT);
     }
 }
 
 }
 
-FieldSearchSpec::FieldSearchSpec() :
-    _id(0),
-    _name(),
-    _maxLength(0x100000),
-    _searcher(),
-    _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
-    _arg1(),
-    _reconfigured(false)
+FieldSearchSpec::FieldSearchSpec()
+    : _id(0),
+      _name(),
+      _maxLength(0x100000),
+      _searcher(),
+      _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+      _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
+      _arg1(),
+      _reconfigured(false)
 {
 }
 FieldSearchSpec::~FieldSearchSpec() = default;
@@ -59,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default;
 FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
 FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
 
-FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
-                                 VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
-                                 const vespalib::string & arg1, size_t maxLength_) :
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef,
+                                 Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) :
     _id(fid),
     _name(fname),
-    _maxLength(maxLength_),
+    _maxLength(maxLength_in),
     _searcher(),
     _searchMethod(searchDef),
-    _arg1(arg1),
+    _normalize_mode(normalize_mode),
+    _arg1(arg1_in),
     _reconfigured(false)
 {
     switch(searchDef) {
@@ -78,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
     case VsmfieldsConfig::Fieldspec::Searchmethod::NONE:
     case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8:
     case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8:
-        if (arg1 == "substring") {
+        if (_arg1 == "substring") {
             _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
-        } else if (arg1 == "suffix") {
+        } else if (_arg1 == "suffix") {
             _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
-        } else if (arg1 == "exact") {
-            _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
-        } else if (arg1 == "word") {
+        } else if ((_arg1 == "exact") || (_arg1 == "word")) {
             _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
         } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) {
             _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
@@ -111,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
         _searcher = std::make_unique<GeoPosFieldSearcher>(fid);
         break;
     case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR:
-        auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1);
+        auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1);
         _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm);
         break;
     }
     if (_searcher) {
-        setMatchType(_searcher, arg1);
+        setMatchType(_searcher, _arg1);
         _searcher->maxFieldLength(maxLength());
+        _searcher->normalize_mode(_normalize_mode);
     }
 }
 
@@ -150,7 +149,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term)
     }
 }
 
-vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f)
+vespalib::asciistream &
+operator <<(vespalib::asciistream & os, const FieldSearchSpec & f)
 {
     os << f._id << ' ' << f._name << ' ';
     if ( ! f._searcher) {
@@ -164,62 +164,67 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default;
 FieldSearchSpecMap::~FieldSearchSpecMap() = default;
 
 namespace {
-    const std::string _G_empty("");
-    const std::string _G_value(".value");
-    const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}");
-    const std::regex _G_map2("\\{\".*\"\\}");
-    const std::regex _G_array("\\[[0-9]+\\]");
+    const std::string G_empty;
+    const std::string G_value(".value");
+    const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+    const std::regex G_map2("\\{\".*\"\\}");
+    const std::regex G_array("\\[[0-9]+\\]");
 }
 
-vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex)
+vespalib::string
+FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex)
 {
     if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) {
-        std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value);
-        index = std::regex_replace(index, _G_map2, _G_value);
-        index = std::regex_replace(index, _G_array, _G_empty);
+        std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+        index = std::regex_replace(index, G_map2, G_value);
+        index = std::regex_replace(index, G_array, G_empty);
         return index;
     }
     return rawIndex;
 }
 
-bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const
+// For each document type, adds a [field name, field id] entry to fieldIdMap for every field id registered under rawIndex.
+void
+FieldSearchSpecMap::addFieldsFromIndex(vespalib::stringref rawIndex, StringFieldIdTMap & fieldIdMap) const {
+    const vespalib::string index(stripNonFields(rawIndex)); // does not depend on the document type, so computed once
+    for (const auto & dtm : documentTypeMap()) {
+        auto fIt = dtm.second.find(index); // dtm.second: index name -> list of field ids for this document type
+        if (fIt != dtm.second.end()) {
+            for(FieldIdT fid : fIt->second) {
+                const FieldSearchSpec & spec = specMap().find(fid)->second;
+                LOG(debug, "buildFieldsInQuery = rawIndex='%.*s', index='%s'", static_cast<int>(rawIndex.size()), rawIndex.data(), index.c_str());
+                if ((rawIndex != index) && (spec.name().find(index) == 0)) {
+                    vespalib::string modIndex(rawIndex); // start from the raw index exactly as given ...
+                    modIndex.append(spec.name().substr(index.size())); // ... and append what follows the index prefix in the field name
+                    fieldIdMap.add(modIndex, spec.id());
+                } else {
+                    fieldIdMap.add(spec.name(),spec.id());
+                }
+            }
+        } else {
+            LOG(warning, "No valid indexes registered for index %.*s", static_cast<int>(rawIndex.size()), rawIndex.data()); // a stringref need not be NUL-terminated, so the read is bounded by its size
+        }
+    }
+}
+
+StringFieldIdTMap
+FieldSearchSpecMap::buildFieldsInQuery(const Query & query) const
 {
-    bool retval(true);
+    StringFieldIdTMap fieldsInQuery;
     ConstQueryTermList qtl;
     query.getLeaves(qtl);
 
     for (const auto & term : qtl) {
-        for (const auto & dtm : documentTypeMap()) {
-            const IndexFieldMapT & fim = dtm.second;
-            vespalib::string rawIndex(term->index());
-            vespalib::string index(stripNonFields(rawIndex));
-            auto fIt = fim.find(index);
-            if (fIt != fim.end()) {
-                for(FieldIdT fid : fIt->second) {
-                    const FieldSearchSpec & spec = specMap().find(fid)->second;
-                    LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str());
-                    if ((rawIndex != index) && (spec.name().find(index) == 0)) {
-                        vespalib::string modIndex(rawIndex);
-                        modIndex.append(spec.name().substr(index.size()));
-                        fieldsInQuery.add(modIndex, spec.id());
-                    } else {
-                        fieldsInQuery.add(spec.name(),spec.id());
-                    }
-                }
-            } else {
-                LOG(warning, "No valid indexes registered for index %s", term->index().c_str());
-                retval = false;
-            }
-        }
+        addFieldsFromIndex(term->index(), fieldsInQuery);
     }
-    return retval;
+    return fieldsInQuery;
 }
 
-void FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded)
+void
+FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded)
 {
-    for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) {
-        LOG(debug, "otherFieldsNeeded[%zd] = '%s'", i, otherFieldsNeeded[i].c_str());
-        _nameIdMap.add(otherFieldsNeeded[i]);
+    for (const auto & i : otherFieldsNeeded) {
+        _nameIdMap.add(i);
     }
 }
 
@@ -251,16 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
     return ifm;
 }
 
+search::streaming::Normalizing
+normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { // translates the config enum value to the search::streaming::Normalizing value of the same name
+    switch (normalize_mode) {
+        case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE;
+        case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE;
+        case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
+    }
+    return search::streaming::Normalizing::LOWERCASE_AND_FOLD; // reached only when the value matches none of the cases above
+}
+
 }
 
-bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
+void
+FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
 {
-    bool retval(true);
     LOG(spam, "Parsing %zd fields", conf->fieldspec.size());
     for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
         LOG(spam, "Parsing %s", cfs.name.c_str());
         FieldIdT fieldId = specMap().size();
-        FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+        FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
         _specMap[fieldId] = std::move(fss);
         _nameIdMap.add(cfs.name, fieldId);
         LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
@@ -275,7 +290,6 @@ bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
         }
         _documentTypeMap[di.name] = indexMapp;
     }
-    return retval;
 }
 
 void
@@ -297,12 +311,14 @@ FieldSearchSpecMap::reconfigFromQuery(const Query & query)
     }
 }
 
-bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b)
+bool
+lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b)
 {
     return a->field() < b->field();
 }
 
-void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap)
+void
+FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const
 {
     fieldSearcherMap.clear();
     for (const auto & entry : fieldsInQuery) {
@@ -328,10 +344,11 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const
     if (!itr->second.uses_nearest_neighbor_search_method()) {
         return dm;
     }
-    return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1());
+    return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1());
 }
 
-vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df)
+vespalib::asciistream &
+operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df)
 {
     os << "DocumentTypeMap = \n";
     for (const auto & dtm : df.documentTypeMap()) {
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index b0154a82dae..7ba9799991e 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -10,20 +10,29 @@ namespace vsm {
 class FieldSearchSpec
 {
 public:
+    using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+    using Normalizing = search::streaming::Normalizing;
     FieldSearchSpec();
-    FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
-                    VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
-                    const vespalib::string & arg1, size_t maxLength);
+    FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod,
+                    Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength);
     ~FieldSearchSpec();
     FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
     FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
-    const FieldSearcher & searcher() const { return *_searcher; }
-    const vespalib::string &  name() const { return _name; }
-    FieldIdT                    id() const { return _id; }
-    bool                     valid() const { return static_cast<bool>(_searcher); }
-    size_t               maxLength() const { return _maxLength; }
-    bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; }
-    const vespalib::string& get_arg1() const noexcept { return _arg1; }
+    const FieldSearcher & searcher() const noexcept { return *_searcher; }
+    const vespalib::string &  name() const noexcept { return _name; }
+    FieldIdT                    id() const noexcept { return _id; }
+    bool                     valid() const noexcept { return static_cast<bool>(_searcher); }
+    size_t               maxLength() const noexcept { return _maxLength; }
+    Normalizing     normalize_mode() const noexcept { return _normalize_mode; }
+    const vespalib::string&   arg1() const noexcept { return _arg1; }
+    bool uses_nearest_neighbor_search_method() const noexcept { // true when the configured search method is NEAREST_NEIGHBOR
+        return _searchMethod == Searchmethod::NEAREST_NEIGHBOR;
+    }
+    bool uses_string_search_method() const noexcept { // true when the configured search method is UTF8, AUTOUTF8 or SSE2UTF8
+        return  (_searchMethod == Searchmethod::UTF8) ||
+                (_searchMethod == Searchmethod::AUTOUTF8) ||
+                (_searchMethod == Searchmethod::SSE2UTF8);
+    }
 
     /**
      * Reconfigures the field searcher based on information in the given query term.
@@ -37,7 +46,8 @@ private:
     vespalib::string       _name;
     size_t                 _maxLength;
     FieldSearcherContainer _searcher;
-    VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+    Searchmethod           _searchMethod;
+    Normalizing            _normalize_mode;
     vespalib::string       _arg1;
     bool                   _reconfigured;
 };
@@ -55,7 +65,7 @@ public:
      * and a mapping from field name to field id. It then iterates over all document types and index names
      * and creates a mapping from index name to list of field ids for each document type.
      **/
-    bool buildFromConfig(const VsmfieldsHandle & conf);
+    void buildFromConfig(const VsmfieldsHandle & conf);
 
     /**
      * Iterates over the given field name vector adding extra elements to the mapping from field name to field id.
@@ -71,17 +81,13 @@ public:
      * Adds a [field name, field id] entry to the given mapping for each field name used in the given query.
      * This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs.
      **/
-    bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const;
-
-    /**
-     * Adds a [field name, field id] entry to the given mapping for each field name in the given vector.
-     **/
-    void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const;
+    StringFieldIdTMap buildFieldsInQuery(const search::streaming::Query & query) const;
+    void addFieldsFromIndex(vespalib::stringref index, StringFieldIdTMap & fieldIdMap) const;
 
     /**
      * Adds a FieldSearcher object to the given field searcher map for each field name in the other map.
      **/
-    void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap);
+    void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) const;
 
     const FieldSearchSpecMapT & specMap()                 const { return _specMap; }
     //const IndexFieldMapT & indexMap()                     const { return _documentTypeMap.begin()->second; }
@@ -89,7 +95,7 @@ public:
     const StringFieldIdTMap & nameIdMap()                 const { return _nameIdMap; }
     friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f);
 
-    static vespalib::string stripNonFields(const vespalib::string & rawIndex);
+    static vespalib::string stripNonFields(vespalib::stringref rawIndex);
     search::attribute::DistanceMetric get_distance_metric(const vespalib::string& name) const;
 
 private:
author	Henning Baldersheim <balder@yahoo-inc.com>	2024-01-15 10:23:18 +0100
committer	GitHub <noreply@github.com>	2024-01-15 10:23:18 +0100
commit	29a807d35ac5d9e76ea1b8d653bb25b0e4e2dc73 (patch)
tree	d55fddad443566300bd4a7fdd3ef1118a8460700 /streamingvisitors
parent	48b1bae2a6cdf58a237aa7be59632a06aba86861 (diff)
parent	252fbeed13b8622fbc813620dc3b4e45abc6bbe2 (diff)