// File: searchlib/src/vespa/searchlib/util/token_extractor.cpp

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "token_extractor.h"
#include "linguisticsannotation.h"
#include <vespa/document/annotation/alternatespanlist.h>
#include <vespa/document/annotation/span.h>
#include <vespa/document/annotation/spanlist.h>
#include <vespa/document/annotation/spantreevisitor.h>
#include <vespa/document/fieldvalue/document.h>
#include <vespa/vespalib/text/utf8.h>
#include <vespa/vespalib/util/exceptions.h>

#include <vespa/log/log.h>
LOG_SETUP(".searchlib.util.token_extractor");

using document::AlternateSpanList;
using document::Annotation;
using document::AnnotationType;
using document::Document;
using document::FieldValue;
using document::SimpleSpanList;
using document::Span;
using document::SpanList;
using document::SpanNode;
using document::SpanTreeVisitor;
using document::StringFieldValue;
using vespalib::Utf8Reader;

namespace search::linguistics {

namespace {

/**
 * Span tree visitor that accumulates the smallest [begin_pos, end_pos)
 * range covering every leaf Span reachable from the visited node.
 *
 * Before any Span has been visited, begin_pos is 0x7fffffff and end_pos
 * is -1, so span() then yields a span with a negative length.
 */
class SpanFinder : public SpanTreeVisitor {
public:
    int32_t begin_pos;   // lowest from() seen so far
    int32_t end_pos;     // highest from() + length() seen so far

    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}

    // Covering span built from the accumulated bounds.
    Span span() { return Span(begin_pos, end_pos - begin_pos); }

    // Leaf node: widen the covering range so that it includes this span.
    void visit(const Span &node) override {
        if (node.from() < begin_pos) {
            begin_pos = node.from();
        }
        const auto node_end = node.from() + node.length();
        if (end_pos < node_end) {
            end_pos = node_end;
        }
    }
    // List of span node pointers: descend into each child.
    void visit(const SpanList &node) override {
        for (const auto & child : node) {
            child->accept(*this);
        }
    }
    // List of plain spans: let each element dispatch back to this visitor.
    void visit(const SimpleSpanList &node) override {
        for (const auto & leaf : node) {
            leaf.accept(*this);
        }
    }
    // Alternatives: every subtree contributes to the covering range.
    void visit(const AlternateSpanList &node) override {
        size_t subtree_idx = 0;
        while (subtree_idx < node.getNumSubtrees()) {
            visit(node.getSubtree(subtree_idx));
            ++subtree_idx;
        }
    }
};

/**
 * Returns the span covering all leaf spans below span_node, as
 * computed by running a SpanFinder over the node.
 */
Span
getSpan(const SpanNode &span_node)
{
    SpanFinder covering;
    span_node.accept(covering);
    return covering.span();
}

/**
 * Selects the text to use for a term.
 *
 * When fv is null the result is the part of s addressed by span
 * (span.from() bytes into s, span.length() bytes long); no bounds
 * checking is done here. When fv is non-null its raw value is returned
 * instead and s / span are ignored.
 */
vespalib::stringref
get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv)
{
    if (fv == nullptr) {
        const char* span_start = s.data() + span.from();
        return {span_start, static_cast<size_t>(span.length())};
    }
    auto raw = fv->getAsRaw();
    return {raw.first, raw.second};
}

/**
 * Returns the byte length of the longest prefix of word that ends on a
 * character boundary (as decoded by Utf8Reader) and is at most
 * max_byte_len bytes long. If the whole word fits, the reader's final
 * position is returned.
 */
size_t
truncated_word_len(vespalib::stringref word, size_t max_byte_len)
{
    Utf8Reader reader(word);
    for (auto char_start = reader.getPos(); reader.hasMore(); char_start = reader.getPos()) {
        (void) reader.getChar();
        if (reader.getPos() > max_byte_len) {
            // The character just read crosses the limit; cut before it.
            return char_start;
        }
    }
    return reader.getPos(); // Every character fit within the limit
}

// Byte limit handed to truncated_word_len() when computing the "%.*s"
// precision for the word prefix printed in log messages.
constexpr size_t max_fmt_len = 100; // Max length of word in logs

}

/**
 * @param field_name    name of the field, included in log messages
 * @param max_word_len  words longer than this many bytes are dropped
 *                      by sanitize_word()
 */
TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len)
    : _field_name(field_name),
      _max_word_len(max_word_len)
{
}

// Defaulted destructor, defined out of line in this translation unit.
TokenExtractor::~TokenExtractor() = default;

/**
 * Cleans up a candidate word before it is accepted as a term.
 *
 * - If the word contains a NUL byte, it is cut at the last character
 *   boundary that is not past the first NUL byte.
 * - If the (possibly shortened) word is longer than _max_word_len
 *   bytes, it is dropped and an empty stringref is returned.
 *
 * @param word  candidate word
 * @param doc   document the word came from; only used for log
 *              messages, and nothing is logged when it is null
 * @return the sanitized word, or an empty stringref if it was dropped
 */
vespalib::stringref
TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const
{
    const size_t original_size = word.size();
    const size_t nul_offset = strnlen(word.data(), original_size);
    if (nul_offset < original_size) {
        // Embedded NUL: keep only whole characters located before it.
        word = word.substr(0, truncated_word_len(word, nul_offset));
        if (doc != nullptr) {
            LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s", original_size, word.size(), doc->getId().toString().c_str(), _field_name.c_str(), static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
        }
    }
    if (word.size() <= _max_word_len) {
        return word;
    }
    if (doc != nullptr) {
        LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s", word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(), static_cast<int>(truncated_word_len(word, max_fmt_len)), word.data());
    }
    return {};
}

/**
 * Appends a term for span to terms if the span is usable.
 *
 * The span must have a positive length, a non-negative start and lie
 * entirely inside text; otherwise nothing is added. The word is taken
 * from fv when it is non-null, else from the spanned part of text, and
 * is then passed through sanitize_word(). Words that come back empty
 * are skipped. The third argument used to construct the term is
 * whether fv was non-null.
 */
void
TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const
{
    if (span.length() <= 0 || span.from() < 0) {
        return;
    }
    const size_t span_end = static_cast<size_t>(span.from()) + static_cast<size_t>(span.length());
    if (span_end > text.size()) {
        return; // span reaches past the end of the text
    }
    auto candidate = get_span_string_or_alternative(text, span, fv);
    candidate = sanitize_word(candidate, doc);
    if (candidate.empty()) {
        return;
    }
    terms.emplace_back(span, candidate, fv != nullptr);
}

/**
 * Extracts terms from text into terms.
 *
 * If trees contains a span tree named SPANTREE_NAME, every valid TERM
 * annotation that has a span node is offered to consider_word(), and
 * terms is sorted afterwards. If there is no such tree, the whole text
 * is offered as a single word and terms is not sorted.
 *
 * @param terms  output vector; terms are appended to it
 * @param trees  span trees of the string field value
 * @param text   the field text that spans refer to
 * @param doc    forwarded to consider_word() (may be null)
 */
void
TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const
{
    auto linguistics_tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
    if (linguistics_tree != nullptr) {
        for (const Annotation & annotation : *linguistics_tree) {
            const SpanNode *node = annotation.getSpanNode();
            if (node == nullptr || !annotation.valid()) {
                continue;
            }
            if (!(annotation.getType() == *AnnotationType::TERM)) {
                continue;
            }
            const Span covered = getSpan(*node);
            consider_word(terms, text, covered, annotation.getFieldValue(), doc);
        }
        std::sort(terms.begin(), terms.end());
        return;
    }
    /* field might not be annotated if match type is exact */
    consider_word(terms, text, Span(0, text.size()), nullptr, doc);
}

}