Use Fast_NormalizeWordFolder as a helper (qualified access to its static members) instead of inheriting from it.

author: Henning Baldersheim <balder@yahoo-inc.com> 2023-07-23 05:29:32 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2023-07-25 07:56:57 +0000
commit: c703043e1d0ff1501ecd5c19c490a4911240744a (patch)
tree: db84e9461bce0f766658afb03c8f27de99f2b897 /streamingvisitors/src/vespa
parent: 78a211072a21ec5f368b99bce19c1b703d98152d (diff)
8 files changed, 30 insertions, 28 deletions
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp
index 2119364c2bc..148ad7daaed 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp
@@ -11,6 +11,7 @@
 #include <vespa/searchlib/fef/ranking_assets_repo.h>
 #include <vespa/vespalib/stllike/hash_map.hpp>
 #include <vespa/searchsummary/config/config-juniperrc.h>
+#include <vespa/fastlib/text/normwordfolder.h>
 #include <cassert>
 
 #include <vespa/log/log.h>
@@ -108,6 +109,7 @@ SearchEnvironment::Env::~Env()
 SearchEnvironment::SearchEnvironment(const config::ConfigUri & configUri, FNET_Transport* transport, const vespalib::string& file_distributor_connection_spec)
     : VisitorEnvironment(),
       _envMap(),
+      _wordFolder(std::make_unique<Fast_NormalizeWordFolder>()),
       _configUri(configUri),
       _transport(transport),
       _file_distributor_connection_spec(file_distributor_connection_spec)
@@ -137,7 +139,7 @@ SearchEnvironment::getEnv(const vespalib::string & searchCluster)
         auto found = _envMap.find(searchCluster);
         if (found == _envMap.end()) {
             LOG(debug, "Init VSMAdapter with config id = '%s'", searchCluster.c_str());
-            Env::SP env = std::make_shared<Env>(searchClusterUri, _wordFolder, _transport, _file_distributor_connection_spec);
+            Env::SP env = std::make_shared<Env>(searchClusterUri, *_wordFolder, _transport, _file_distributor_connection_spec);
             _envMap[searchCluster] = std::move(env);
             found = _envMap.find(searchCluster);
         }
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h
index f3bbfddd76c..05909c71ccb 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h
+++ b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h
@@ -10,10 +10,10 @@
 #include <vespa/config/retriever/simpleconfigurer.h>
 #include <vespa/config/subscription/configuri.h>
 #include <vespa/vsm/vsm/vsm-adapter.h>
-#include <vespa/fastlib/text/normwordfolder.h>
 #include <mutex>
 
 class FNET_Transport;
+class Fast_NormalizeWordFolder;
 
 namespace search::fef {
 
@@ -70,7 +70,7 @@ private:
     EnvMap                   _envMap;
     ThreadLocals             _threadLocals;
     std::mutex               _lock;
-    Fast_NormalizeWordFolder _wordFolder;
+    std::unique_ptr<Fast_NormalizeWordFolder> _wordFolder;
     config::ConfigUri        _configUri;
     FNET_Transport* const    _transport;
     vespalib::string         _file_distributor_connection_spec;
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
index 95cd4788d7f..1cfa0224b69 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
@@ -26,6 +26,7 @@
 #include <vespa/vespalib/data/slime/slime.h>
 #include <vespa/vespalib/text/stringtokenizer.h>
 #include <vespa/fnet/databuffer.h>
+#include <vespa/fastlib/text/normwordfolder.h>
 #include <optional>
 
 #include <vespa/log/log.h>
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index abc2bc9d870..dedf20021e9 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -20,6 +20,7 @@ using termsize_t = size_t;
   #define NEED_CHAR_STAT(a)
 #endif
 
+using ucs4_t = unsigned int;
 using cmptype_t = ucs4_t;
 using SearcherBuf = vespalib::Array<cmptype_t>;
 using SharedSearcherBuf = std::shared_ptr<SearcherBuf>;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index f991722d623..a7f17cb9006 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "utf8stringfieldsearcherbase.h"
+#include <vespa/fastlib/text/normwordfolder.h>
 #include <cassert>
 
 using search::streaming::QueryTerm;
@@ -24,8 +25,8 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
         if (c < 128) {
             if (!c) { break; }
             p++;
-            if (__builtin_expect(_isWord[c], false)) {
-                *q++ = _foldCase[c];
+            if (__builtin_expect(Fast_NormalizeWordFolder::_isWord[c], false)) {
+                *q++ = Fast_NormalizeWordFolder::_foldCase[c];
                 c = 0;
             } else {
                 c = *p;
@@ -35,19 +36,19 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
             c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
             if (Fast_UnicodeUtil::IsWordChar(c)) {
                 _utf8Count[p-oldP-1]++;
-                const char *repl = ReplacementString(c);
+                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
                 if (repl != NULL) {
                     size_t repllen = strlen(repl);
                     if (repllen > 0) {
                         q = Fast_UnicodeUtil::ucs4copy(q,repl);
                     }
                 } else {
-                    c = ToFold(c);
+                    c = Fast_NormalizeWordFolder::ToFold(c);
                     *q++ = c;
                 }
                 break;
             } else {
-                if (c == _BadUTF8Char) {
+                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
                     _badUtf8Count++;
                 } else {
                     _utf8Count[p-oldP-1]++;
@@ -62,10 +63,10 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
         if (c < 128) {             // Common case, ASCII
             if (!c) { break; }
             p++;
-            if (__builtin_expect(!_isWord[c], false)) {
+            if (__builtin_expect(!Fast_NormalizeWordFolder::_isWord[c], false)) {
                 c = 0;
             } else {
-                *q++ = _foldCase[c];
+                *q++ = Fast_NormalizeWordFolder::_foldCase[c];
                 c = *p;
             }
         } else {
@@ -73,20 +74,20 @@ UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t *
             c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
             if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
                 _utf8Count[p-oldP-1]++;
-                const char *repl = ReplacementString(c);
+                const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
                 if (repl != NULL) {
                     size_t repllen = strlen(repl);
                     if (repllen > 0) {
                         q = Fast_UnicodeUtil::ucs4copy(q,repl);
                     }
                 } else {
-                    c = ToFold(c);
+                    c = Fast_NormalizeWordFolder::ToFold(c);
                     *q++ = c;
                 }
 
                 c = *p;
             } else {
-                if (c == _BadUTF8Char) {
+                if (c == Fast_UnicodeUtil::_BadUTF8Char) {
                     _badUtf8Count++;
                 } else {
                     _utf8Count[p-oldP-1]++;
@@ -143,9 +144,9 @@ UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
         bool equal(true);
         for (; equal && (n < e) && (term < eterm); term++) {
             if (*term < 0x80) {
-                equal = (*term == _foldCase[*n++]);
+                equal = (*term == Fast_NormalizeWordFolder::_foldCase[*n++]);
             } else {
-                cmptype_t c = ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
+                cmptype_t c = Fast_NormalizeWordFolder::ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
                 equal = (*term == c);
             }
         }
@@ -220,20 +221,16 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
 }
 
 UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() :
-    StrChrFieldSearcher(),
-    Fast_NormalizeWordFolder(),
-    Fast_UnicodeUtil()
+    StrChrFieldSearcher()
 {
 }
 
 UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) :
-    StrChrFieldSearcher(fId),
-    Fast_NormalizeWordFolder(),
-    Fast_UnicodeUtil()
+    StrChrFieldSearcher(fId)
 {
 }
 
-UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() {}
+UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() = default;
 
 void
 UTF8StringFieldSearcherBase::prepare(search::streaming::QueryTermList& qtl,
@@ -283,11 +280,11 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
         if (c < 128) {
             p++;
             if (!isSeparatorCharacter(c)) {
-                dstbuf.onCharacter(_foldCase[c], (oldP - b));
+                dstbuf.onCharacter(Fast_NormalizeWordFolder::_foldCase[c], (oldP - b));
             }
         } else {
             c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
-            const char *repl = ReplacementString(c);
+            const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
             if (repl != NULL) {
                 size_t repllen = strlen(repl);
                 if (repllen > 0) {
@@ -303,10 +300,10 @@ UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T
                     }
                 }
             } else {
-                c = ToFold(c);
+                c = Fast_NormalizeWordFolder::ToFold(c);
                 dstbuf.onCharacter(c, (oldP - b));
             }
-            if (c == _BadUTF8Char) {
+            if (c == Fast_UnicodeUtil::_BadUTF8Char) {
                 _badUtf8Count++;
             } else {
                 _utf8Count[p-oldP-1]++;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index a017b501660..f4da5960fd3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,7 +2,6 @@
 #pragma once
 
 #include "strchrfieldsearcher.h"
-#include <vespa/fastlib/text/normwordfolder.h>
 
 namespace vsm {
 
@@ -15,7 +14,7 @@ namespace vsm {
  * Reuse of this buffer ensures better cache hit ratio because this is just a
  * scratchpad for tokenizing. It will grow till the max size and stay there.
  **/
-class UTF8StringFieldSearcherBase : public StrChrFieldSearcher, protected Fast_NormalizeWordFolder, public Fast_UnicodeUtil
+class UTF8StringFieldSearcherBase : public StrChrFieldSearcher
 {
 public:
     /**
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
index adcf7a937c1..046341b069f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include <vespa/fastlib/text/unicodeutil.h>
 
 using search::byte;
 using search::streaming::QueryTerm;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
index 89388c01354..ce14d2bf8e2 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include "utf8substringsnippetmodifier.h"
 #include <vespa/juniper/juniper_separators.h>
+#include <vespa/fastlib/text/unicodeutil.h>
 #include <cassert>
 
 using search::byte;
author	Henning Baldersheim <balder@yahoo-inc.com>	2023-07-23 05:29:32 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2023-07-25 07:56:57 +0000
commit	c703043e1d0ff1501ecd5c19c490a4911240744a (patch)
tree	db84e9461bce0f766658afb03c8f27de99f2b897 /streamingvisitors/src/vespa
parent	78a211072a21ec5f368b99bce19c1b703d98152d (diff)