streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8substringsnippetmodifier.h"
#include <vespa/juniper/juniper_separators.h>
#include <vespa/fastlib/text/unicodeutil.h>
#include <cassert>

using search::byte;
using search::streaming::QueryTerm;
using search::streaming::QueryTermList;

namespace vsm {

/**
 * Produces an independent searcher by copy-constructing from this instance.
 *
 * @return owning pointer to the newly created copy.
 */
std::unique_ptr<FieldSearcher>
UTF8SubstringSnippetModifier::duplicate() const
{
    auto clone = std::make_unique<UTF8SubstringSnippetModifier>(*this);
    return clone;
}

size_t
UTF8SubstringSnippetModifier::matchTerms(const FieldRef & f, const size_t mintsz)
{
    _modified->reset();
    _readPtr = f.data();
    const byte * src = reinterpret_cast<const byte *> (f.data());
    // resize ucs4 buffer
    if (f.size() >= _buf->size()) {
        _buf->resize(f.size() + 1);
    }
    // resize offset buffers
    if (f.size() >= _offsets->size()) {
        _offsets->resize(f.size() + 1);
    }
    // resize modified buffer
    if (f.size() + 16 > _modified->getLength()) {
        _modified->resize(f.size() + 16); // make room for some unit separators
    }
    cmptype_t * dbegin = &(*_buf.get())[0];
    OffsetWrapper wrapper(dbegin, &(*_offsets)[0]);
    size_t numchars = skipSeparators(src, f.size(), wrapper);
    const cmptype_t * ditr = dbegin;
    const cmptype_t * dend = ditr + numchars;
    const cmptype_t * drend = dend - mintsz;
    termcount_t words = 0;
    for(; ditr <= drend; ) {
        for (auto qt : _qtl) {
            const cmptype_t * term;
            termsize_t tsz = qt->term(term);

            const cmptype_t * titr = term;
            const cmptype_t * tend = term + tsz;
            const cmptype_t * dtmp = ditr;
            for (; (titr < tend) && (*titr == *dtmp); ++titr, ++dtmp);
            if (titr == tend) {
                const char * mbegin = f.data() + (*_offsets)[ditr - dbegin];
                const char * mend = f.data() + ((dtmp < dend) ? ((*_offsets)[dtmp - dbegin]) : f.size());
                if (_readPtr <= mbegin) {
                    // We will only copy from the field ref once.
                    // If we have overlapping matches only the first one will be considered.
                    insertSeparators(mbegin, mend);
                }
                addHit(*qt, words);
            }
        }
        if ( ! Fast_UnicodeUtil::IsWordChar(*ditr++) ) {
            words++;
            for(; (ditr < drend) && ! Fast_UnicodeUtil::IsWordChar(*ditr) ; ++ditr );
        }
    }
    assert(_readPtr <= (f.data() + f.size()));
    // copy remaining
    size_t toCopy = f.size() - (_readPtr - f.data());
    copyToModified(toCopy);

    return words + 1; // we must also count the last word
}

size_t
UTF8SubstringSnippetModifier::matchTerm(const FieldRef & f, QueryTerm & qt)
{
    const cmptype_t * term;
    termsize_t tsz = qt.term(term);
    return matchTerms(f, tsz);
}

/**
 * Appends the next n bytes at _readPtr to _modified and advances _readPtr
 * past them.
 *
 * @param n       number of bytes to consume from the read position.
 * @param skipSep when true, bytes for which isSeparatorCharacter() holds are
 *                consumed but not written to _modified.
 */
void
UTF8SubstringSnippetModifier::copyToModified(size_t n, bool skipSep)
{
    if (n == 0) {
        return;
    }
    if (!skipSep) {
        // Plain case: copy the whole range in one go.
        _modified->put(_readPtr, n);
        _readPtr += n;
        return;
    }
    // Filtering case: copy byte by byte, leaving out separator characters.
    const char * const stop = _readPtr + n;
    while (_readPtr < stop) {
        if (!isSeparatorCharacter(*_readPtr)) {
            _modified->put(*_readPtr);
        }
        ++_readPtr;
    }
}

/**
 * Copies the field content up to a match and then the match itself into
 * _modified, writing _unitSep immediately before and after the match.
 *
 * @param mbegin start of the match in the field; the caller ensures that
 *               _readPtr has not passed it.
 * @param mend   end of the match in the field.
 */
void
UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * mend)
{
    const size_t leadingBytes = mbegin - _readPtr;
    const size_t matchBytes = mend - mbegin;

    // Text between the current read position and the match is copied as is.
    copyToModified(leadingBytes);
    _modified->put(_unitSep);
    // Separator characters inside the match are dropped so the match is not split.
    copyToModified(matchBytes, true);
    _modified->put(_unitSep);
}

/**
 * Creates a modifier for the given field with its own freshly allocated
 * buffers: a CharBuffer constructed with 32 and an offset vector holding
 * 32 entries. The read pointer starts out as nullptr and the unit separator
 * is taken from juniper::separators::unit_separator.
 *
 * The buffers are created with std::make_shared instead of passing raw
 * 'new' expressions to the shared pointers.
 *
 * @param fId id of the field, passed on to the base class.
 */
UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) :
    UTF8StringFieldSearcherBase(fId),
    _modified(std::make_shared<CharBuffer>(32)),
    _offsets(std::make_shared<std::vector<size_t>>(32)),
    _readPtr(nullptr),
    _unitSep(juniper::separators::unit_separator)
{
}

/**
 * Creates a modifier for the given field that uses buffers supplied by the
 * caller instead of allocating its own.
 *
 * @param fId    id of the field, passed on to the base class.
 * @param modBuf buffer that receives the modified field content.
 * @param offBuf buffer used for the per-character offsets.
 */
UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(
        FieldIdT fId,
        const CharBuffer::SP & modBuf,
        const SharedOffsetBuffer & offBuf)
    : UTF8StringFieldSearcherBase(fId),
      _modified(modBuf),
      _offsets(offBuf),
      _readPtr(nullptr),
      _unitSep(juniper::separators::unit_separator)
{
}

// Destructor is defaulted here in the implementation file; no custom cleanup is performed.
UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default;

}