aboutsummaryrefslogtreecommitdiffstats
path: root/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp
blob: e2849fe793e9b342fdf6b64f38d629c2c643b861 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "tokens_converter.h"
#include <vespa/document/fieldvalue/stringfieldvalue.h>
#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/data/slime/slime.h>

using document::StringFieldValue;
using search::linguistics::TokenExtractor;
using vespalib::Memory;
using vespalib::slime::ArrayInserter;
using vespalib::slime::Cursor;
using vespalib::slime::Inserter;

namespace search::docsummary {

/**
 * Builds a converter around the given token extractor.
 *
 * The extractor is kept in _token_extractor and used later when terms are
 * extracted from a string field value. _text starts out value-initialized
 * and is assigned on each call to convert().
 *
 * NOTE(review): _token_extractor is presumably a reference member, in which
 * case the extractor must outlive this converter - confirm against the header.
 */
TokensConverter::TokensConverter(const TokenExtractor& token_extractor)
    : IStringFieldConverter(), _token_extractor(token_extractor), _text()
{}

// Destructor is defaulted here, out of line in this translation unit.
TokensConverter::~TokensConverter() = default;

template <typename ForwardIt>
void
TokensConverter::handle_alternative_index_terms(ForwardIt it, ForwardIt last, Inserter& inserter)
{
    Cursor& a = inserter.insertArray();
    ArrayInserter ai(a);
    for (;it != last; ++it) {
        handle_index_term(it->word, ai);
    }
}

/**
 * Emits a single term as a string through the given inserter.
 *
 * @param word     the term text
 * @param inserter where the string is inserted
 */
void
TokensConverter::handle_index_term(vespalib::stringref word, Inserter& inserter)
{
    const Memory term(word);
    inserter.insertString(term);
}

void
TokensConverter::handle_indexing_terms(const StringFieldValue& value, vespalib::slime::Inserter& inserter)
{
    Cursor& a = inserter.insertArray();
    ArrayInserter ai(a);
    using SpanTerm = TokenExtractor::SpanTerm;
    std::vector<SpanTerm> terms;
    auto span_trees = value.getSpanTrees();
    _token_extractor.extract(terms, span_trees, _text, nullptr);
    auto it = terms.begin();
    auto ite = terms.end();
    auto itn = it;
    for (; it != ite; it = itn) {
        for (; itn != ite && itn->span == it->span; ++itn);
        if ((itn - it) > 1) {
            handle_alternative_index_terms(it, itn, ai);
        } else {
            handle_index_term(it->word, ai);
        }
    }
}

/**
 * Converts a string field value into its terms, emitted through the inserter.
 *
 * @param input    the string field value to convert
 * @param inserter where the resulting array of terms is inserted
 */
void
TokensConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter)
{
    // _text must be set before handle_indexing_terms(), which hands it to the token extractor.
    _text = input.getValueRef();
    handle_indexing_terms(input, inserter);
}

/**
 * Always returns true.
 *
 * NOTE(review): presumably tells the caller to render weighted sets as arrays
 * when this converter is used - confirm against IStringFieldConverter.
 */
bool
TokensConverter::render_weighted_set_as_array() const
{
    constexpr bool as_array = true;
    return as_array;
}

}