diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2022-05-15 00:41:35 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-15 00:41:35 +0200 |
commit | 4db8dcbf3395fd92b1348155142b85df5a754289 (patch) | |
tree | 912b02e614bc9889ea3543893cbeb699971e8156 /streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h | |
parent | 287a799b270200aca440cad376272328128a5054 (diff) |
Revert "Revert "Collapse vsm into streamingvisitors""
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h')
-rw-r--r-- | streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h | 138 |
1 files changed, 138 insertions, 0 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h new file mode 100644 index 00000000000..f540a7ac457 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -0,0 +1,138 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "strchrfieldsearcher.h" +#include <vespa/fastlib/text/normwordfolder.h> + +namespace vsm { + +/** + * This class is the base class for all utf8 string searchers. + * It contains utility functions used by the other searchers. + * As normal the prepare method is called + * after the query is built. A SharedSearcherBuf is used given to it. This is a + * buffer that is shared among all searchers that are run in the same context. + * Reuse of this buffer ensures better cache hit ratio because this is just a + * scratchpad for tokenizing. It will grow till the max size and stay there. + **/ +class UTF8StringFieldSearcherBase : public StrChrFieldSearcher, protected Fast_NormalizeWordFolder, public Fast_UnicodeUtil +{ +public: + /** + * Template class that wraps an ucs4 buffer. + * Used when invoking skipSeparators() during substring matching. + **/ + class BufferWrapper + { + protected: + ucs4_t * _bbuf; + ucs4_t * _cbuf; + + public: + BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } + BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } + void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } + void onOffset(size_t) { } + void incBuf(size_t inc) { _cbuf += inc; } + ucs4_t * getBuf() { return _cbuf; } + bool valid() { return true; } + size_t size() { return (_cbuf - _bbuf); } + bool hasOffsets() { return false; } + }; + + /** + * Template class that wraps an offset buffer in addition to an ucs4 buffer. + * The offset buffer contains offsets into the original utf8 buffer. + **/ + class OffsetWrapper : public BufferWrapper + { + private: + size_t * _boff; + size_t * _coff; + + public: + OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} + void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } + void onOffset(size_t of) { *_coff++ = of; } + bool valid() { return (size() == (size_t)(_coff - _boff)); } + bool hasOffsets() { return true; } + }; + +protected: + SharedSearcherBuf _buf; + + const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); + + /** + * Matches the given query term against the words in the given field reference + * using exact or prefix match strategy. + * + * @param f the field reference to match against. + * @param qt the query term trying to match. + * @return the number of words in the field ref. + **/ + size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the characters in the given field reference + * using substring match strategy. + * + * @param f the field reference to match against. + * @param qt the query term trying to match. + * @return the number of words in the field ref. + **/ + size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the words in the given field reference + * using suffix match strategy. + * + * @param f the field reference to match against. + * @param qt the query term trying to match. + * @return the number of words in the field ref. + **/ + size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the words in the given field reference + * using exact match strategy. + * + * @param f the field reference to match against. + * @param qt the query term trying to match. + * @return the number of words in the field ref. + **/ + size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); + +public: + UTF8StringFieldSearcherBase(); + UTF8StringFieldSearcherBase(FieldIdT fId); + ~UTF8StringFieldSearcherBase(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + /** + * Matches the given query term against the given word using suffix match strategy. + * + * @param term the buffer with the term. + * @param termLen the length of the term. + * @param word the buffer with the word. + * @param wordlen the length of the word. + * @return true if the term matches the word. + **/ + static bool matchTermSuffix(const cmptype_t * term, size_t termlen, + const cmptype_t * word, size_t wordlen); + + /** + * Checks whether the given character is a separator character. + **/ + static bool isSeparatorCharacter(ucs4_t); + + /** + * Transforms the given utf8 array into an array of ucs4 characters. + * Folding is performed. Separator characters are skipped. + **/ + template <typename T> + size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf); + +}; + +} + |