aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2022-05-15 00:40:43 +0200
committerGitHub <noreply@github.com>2022-05-15 00:40:43 +0200
commitdacf557add1c6a3ffab036cdf2f7dfdf9750b22e (patch)
tree3a9dfff58b98898e2e28c0337925f4f04e5eaeb0 /streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
parent2722ce9d1d1ec12d57ebd3833ce37b0958afb752 (diff)
Revert "Collapse vsm into streamingvisitors"
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp')
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp320
1 files changed, 0 insertions, 320 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
deleted file mode 100644
index 148cdf2c0c3..00000000000
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "utf8stringfieldsearcherbase.h"
-#include <cassert>
-
-using search::streaming::QueryTerm;
-using search::streaming::QueryTermList;
-using search::byte;
-
-namespace vsm {
-
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
- if (maxSz > 0) {
- maxSz--;
- }
- ucs4_t c(*p);
- ucs4_t *q(dstbuf);
- const byte * end(p+maxSz);
-
- // Skip non-word characters between words
- for (; p < end; ) {
- if (c < 128) {
- if (!c) { break; }
- p++;
- if (__builtin_expect(_isWord[c], false)) {
- *q++ = _foldCase[c];
- c = 0;
- } else {
- c = *p;
- }
- } else {
- const byte * oldP(p);
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- _utf8Count[p-oldP-1]++;
- const char *repl = ReplacementString(c);
- if (repl != NULL) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = ToFold(c);
- *q++ = c;
- }
- break;
- } else {
- if (c == _BadUTF8Char) {
- _badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
- }
- c = *p;
- }
- }
- }
-
- c = *p; // Next char
- for (; p < end;) {
- if (c < 128) { // Common case, ASCII
- if (!c) { break; }
- p++;
- if (__builtin_expect(!_isWord[c], false)) {
- c = 0;
- } else {
- *q++ = _foldCase[c];
- c = *p;
- }
- } else {
- const byte * oldP(p);
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
- _utf8Count[p-oldP-1]++;
- const char *repl = ReplacementString(c);
- if (repl != NULL) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = ToFold(c);
- *q++ = c;
- }
-
- c = *p;
- } else {
- if (c == _BadUTF8Char) {
- _badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
- }
- break;
- }
- }
- }
- *q = 0;
- tokenlen = q - dstbuf;
- return p;
-}
-
-size_t
-UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
-{
- termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- // __builtin_prefetch(n, 0, 0);
- const cmptype_t * term;
- termsize_t tsz = qt.term(term);
- const byte * e = n + f.size();
- if ( f.size() >= _buf->size()) {
- _buf->reserve(f.size() + 1);
- }
- cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
-
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
- if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
- const cmptype_t *tt=term, *et=term+tsz;
- for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
- if (tt == et) {
- addHit(qt, words);
- }
- }
- words++;
- }
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
- return words;
-}
-
-size_t
-UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
-{
- const byte * n = reinterpret_cast<const byte *> (f.data());
- const cmptype_t * term;
- termsize_t tsz = qt.term(term);
- const cmptype_t * eterm = term+tsz;
- const byte * e = n + f.size();
- if (tsz <= f.size()) {
- bool equal(true);
- for (; equal && (n < e) && (term < eterm); term++) {
- if (*term < 0x80) {
- equal = (*term == _foldCase[*n++]);
- } else {
- cmptype_t c = ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n));
- equal = (*term == c);
- }
- }
- if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) {
- addHit(qt,0);
- }
- }
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
- return 1;
-}
-
-size_t
-UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & qt)
-{
- if (qt.termLen() == 0) { return 0; }
- const byte * n = reinterpret_cast<const byte *> (f.data());
- const cmptype_t * term;
- termsize_t tsz = qt.term(term);
- if ( f.size() >= _buf->size()) {
- _buf->reserve(f.size() + 1);
- }
- cmptype_t * fntemp = &(*_buf.get())[0];
- BufferWrapper wrapper(fntemp);
- size_t fl = skipSeparators(n, f.size(), wrapper);
- const cmptype_t * fn(fntemp);
- const cmptype_t * fe = fn + fl;
- const cmptype_t * fre = fe - tsz;
- termcount_t words(0);
- for(words = 0; fn <= fre; ) {
- const cmptype_t *tt=term, *et=term+tsz, *fnt=fn;
- for (; (tt < et) && (*tt == *fnt); tt++, fnt++);
- if (tt == et) {
- fn = fnt;
- addHit(qt, words);
- } else {
- if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) {
- words++;
- for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn) ; fn++ );
- }
- }
- }
- NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
- return words + 1; // we must also count the last word
-}
-
-size_t
-UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
-{
- termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
- const cmptype_t * term;
- termsize_t tsz = qt.term(term);
- if (f.size() >= _buf->size()) {
- _buf->reserve(f.size() + 1);
- }
- cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
-
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
- if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
- addHit(qt, words);
- }
- words++;
- }
- return words;
-}
-
-UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() :
- StrChrFieldSearcher(),
- Fast_NormalizeWordFolder(),
- Fast_UnicodeUtil()
-{
-}
-
-UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) :
- StrChrFieldSearcher(fId),
- Fast_NormalizeWordFolder(),
- Fast_UnicodeUtil()
-{
-}
-
-UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() {}
-
-void
-UTF8StringFieldSearcherBase::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
-{
- StrChrFieldSearcher::prepare(qtl, buf);
- _buf = buf;
-}
-
-bool
-UTF8StringFieldSearcherBase::matchTermSuffix(const cmptype_t * term, size_t termlen,
- const cmptype_t * word, size_t wordlen)
-{
- if ((termlen <= wordlen)) {
- const cmptype_t * titr = term + termlen - 1;
- const cmptype_t * witr = word + wordlen - 1;
- bool hit = true;
- // traverse the term and the word back to front
- for (; titr >= term; --titr, --witr) {
- if (*titr != *witr) {
- hit = false;
- break;
- }
- }
- return hit;
- }
- return false;
-}
-
-bool
-UTF8StringFieldSearcherBase::isSeparatorCharacter(ucs4_t c)
-{
- return ((c < 0x20) && (c != '\n') && (c != '\t'));
-}
-
-template <typename T>
-size_t
-UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T & dstbuf) {
- const search::byte * e(p+sz);
- const search::byte * b(p);
-
- for(; p < e; ) {
- ucs4_t c(*p);
- const search::byte * oldP(p);
- if (c < 128) {
- p++;
- if (!isSeparatorCharacter(c)) {
- dstbuf.onCharacter(_foldCase[c], (oldP - b));
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- const char *repl = ReplacementString(c);
- if (repl != NULL) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- ucs4_t * buf = dstbuf.getBuf();
- ucs4_t * newBuf = Fast_UnicodeUtil::ucs4copy(buf, repl);
- if (dstbuf.hasOffsets()) {
- for (; buf < newBuf; ++buf) {
- dstbuf.incBuf(1);
- dstbuf.onOffset(oldP - b);
- }
- } else {
- dstbuf.incBuf(newBuf - buf);
- }
- }
- } else {
- c = ToFold(c);
- dstbuf.onCharacter(c, (oldP - b));
- }
- if (c == _BadUTF8Char) {
- _badUtf8Count++;
- } else {
- _utf8Count[p-oldP-1]++;
- }
- }
- }
- assert(dstbuf.valid());
- return dstbuf.size();
-}
-
-template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::BufferWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::BufferWrapper&);
-template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::OffsetWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::OffsetWrapper&);
-
-}