- Separate methods for lowercasing, and lowercasing and folding.

- Hide implementations and use accessors.
- Minor code cleanup.
author: Henning Baldersheim <balder@yahoo-inc.com> 2023-12-21 10:56:49 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2023-12-21 10:56:49 +0000
commit: 543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
tree: f62fb82c65a6152fabe944caa4e719051f4ab032 /searchsummary
parent: ef3db955e75e6df68a2a358feb5b95e44979377f (diff)
2 files changed, 15 insertions, 16 deletions
diff --git a/searchsummary/src/vespa/juniper/tokenizer.cpp b/searchsummary/src/vespa/juniper/tokenizer.cpp
index cd3c9c410ce..211ffe7054a 100644
--- a/searchsummary/src/vespa/juniper/tokenizer.cpp
+++ b/searchsummary/src/vespa/juniper/tokenizer.cpp
@@ -8,11 +8,10 @@
 #include <vespa/log/log.h>
 LOG_SETUP(".juniper.tokenizer");
 
-JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder,
-				   const char* text, size_t len, ITokenProcessor* successor,
-                                   const juniper::SpecialTokenRegistry * registry) :
+JuniperTokenizer::JuniperTokenizer(const Fast_WordFolder* wordfolder, const char* text, size_t len,
+                                   ITokenProcessor* successor, const juniper::SpecialTokenRegistry * registry) :
     _wordfolder(wordfolder), _text(text), _len(len), _successor(successor), _registry(registry),
-    _charpos(0), _wordpos(0)
+    _charpos(0), _wordpos(0), _buffer()
 { }
 
 
@@ -32,19 +31,19 @@ void JuniperTokenizer::scan()
 
     const char* src = _text;
     const char* src_end = _text + _len;
-    const char* startpos = NULL;
+    const char* startpos = nullptr;
     ucs4_t* dst = _buffer;
     ucs4_t* dst_end = dst + TOKEN_DSTLEN;
     size_t result_len;
 
     while (src < src_end)
     {
-        if (_registry == NULL) {
+        if (_registry == nullptr) {
             // explicit prefetching seems to have negative effect with many threads
             src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
         } else {
             const char * tmpSrc = _registry->tokenize(src, src_end, dst, dst_end, startpos, result_len);
-            if (tmpSrc == NULL) {
+            if (tmpSrc == nullptr) {
                 src = _wordfolder->UCS4Tokenize(src, src_end, dst, dst_end, startpos, result_len);
             } else {
                 src = tmpSrc;
@@ -63,6 +62,6 @@ void JuniperTokenizer::scan()
     }
     token.bytepos = _len;
     token.bytelen = 0;
-    token.token = NULL;
+    token.token = nullptr;
     _successor->handle_end(token);
 }
diff --git a/searchsummary/src/vespa/juniper/tokenizer.h b/searchsummary/src/vespa/juniper/tokenizer.h
index 68ef8118f5d..910da3f67ef 100644
--- a/searchsummary/src/vespa/juniper/tokenizer.h
+++ b/searchsummary/src/vespa/juniper/tokenizer.h
@@ -12,8 +12,8 @@ class JuniperTokenizer
 {
 public:
     JuniperTokenizer(const Fast_WordFolder* wordfolder,
-                     const char* text, size_t len, ITokenProcessor* = NULL,
-                     const juniper::SpecialTokenRegistry * registry = NULL);
+                     const char* text, size_t len, ITokenProcessor* = nullptr,
+                     const juniper::SpecialTokenRegistry * registry = nullptr);
     inline void SetSuccessor(ITokenProcessor* successor) { _successor = successor; }
     void setRegistry(const juniper::SpecialTokenRegistry * registry) { _registry = registry; }
 
@@ -23,13 +23,13 @@ public:
     void scan();
 private:
     const Fast_WordFolder* _wordfolder;
-    const char* _text;  // The current input text
-    size_t _len;        // Length of the text input
-    ITokenProcessor* _successor;
+    const char*            _text;  // The current input text
+    size_t                 _len;        // Length of the text input
+    ITokenProcessor*       _successor;
     const juniper::SpecialTokenRegistry * _registry;
-    off_t _charpos;  // Last utf8 character position
-    off_t _wordpos;  // Offset in numbering of words compared to input (as result of splits)
-    ucs4_t _buffer[TOKEN_DSTLEN];  // Temp. buffer to store folding result
+    off_t                  _charpos;  // Last utf8 character position
+    off_t                  _wordpos;  // Offset in numbering of words compared to input (as result of splits)
+    ucs4_t                 _buffer[TOKEN_DSTLEN];  // Temp. buffer to store folding result
 private:
     JuniperTokenizer(const JuniperTokenizer&);
     JuniperTokenizer& operator=(const JuniperTokenizer&);
author	Henning Baldersheim <balder@yahoo-inc.com>	2023-12-21 10:56:49 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2023-12-21 10:56:49 +0000
commit	543366294b1e8ae8a01186c25b74e36ed4c3ae35 (patch)
tree	f62fb82c65a6152fabe944caa4e719051f4ab032 /searchsummary
parent	ef3db955e75e6df68a2a358feb5b95e44979377f (diff)