// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "strchrfieldsearcher.h" namespace vsm { /** * This class is the base class for all utf8 string searchers. * It contains utility functions used by the other searchers. * As normal the prepare method is called * after the query is built. A SharedSearcherBuf is used given to it. This is a * buffer that is shared among all searchers that are run in the same context. * Reuse of this buffer ensures better cache hit ratio because this is just a * scratchpad for tokenizing. It will grow till the max size and stay there. **/ class UTF8StringFieldSearcherBase : public StrChrFieldSearcher { public: /** * Template class that wraps an ucs4 buffer. * Used when invoking skipSeparators() during substring matching. **/ class BufferWrapper { protected: ucs4_t * _bbuf; ucs4_t * _cbuf; public: explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { } BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { } void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } void onOffset(size_t) { } void incBuf(size_t inc) { _cbuf += inc; } ucs4_t * getBuf() { return _cbuf; } bool valid() const noexcept { return true; } size_t size() const noexcept { return (_cbuf - _bbuf); } bool hasOffsets() const noexcept { return false; } }; /** * Template class that wraps an offset buffer in addition to an ucs4 buffer. * The offset buffer contains offsets into the original utf8 buffer. **/ class OffsetWrapper : public BufferWrapper { private: size_t * _boff; size_t * _coff; public: explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } void onOffset(size_t of) { *_coff++ = of; } bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); } bool hasOffsets() const noexcept { return true; } }; protected: SharedSearcherBuf _buf; template void tokenize(Reader & reader); /** * Matches the given query term against the words in the given field reference * using exact or prefix match strategy. * * @param f the field reference to match against. * @param qt the query term trying to match. * @return the number of words in the field ref. **/ size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt); /** * Matches the given query term against the characters in the given field reference * using substring match strategy. * * @param f the field reference to match against. * @param qt the query term trying to match. * @return the number of words in the field ref. **/ size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt); /** * Matches the given query term against the words in the given field reference * using suffix match strategy. * * @param f the field reference to match against. * @param qt the query term trying to match. * @return the number of words in the field ref. **/ size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt); /** * Matches the given query term against the words in the given field reference * using exact match strategy. * * @param f the field reference to match against. * @param qt the query term trying to match. * @return the number of words in the field ref. **/ size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); public: explicit UTF8StringFieldSearcherBase(FieldIdT fId); ~UTF8StringFieldSearcherBase() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env) override; /** * Matches the given query term against the given word using suffix match strategy. * * @param term the buffer with the term. * @param termLen the length of the term. * @param word the buffer with the word. * @param wordlen the length of the word. * @return true if the term matches the word. **/ static bool matchTermSuffix(const cmptype_t * term, size_t termlen, const cmptype_t * word, size_t wordlen); /** * Checks whether the given character is a separator character. **/ static bool isSeparatorCharacter(ucs4_t); /** * Transforms the given utf8 array into an array of ucs4 characters. * Folding is performed. Separator characters are skipped. **/ template size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf); }; }