aboutsummaryrefslogtreecommitdiffstats
path: root/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
blob: 7772430522038d7c0b28df8ca0afd04c9345adba (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "annotation_converter.h"
#include "i_juniper_converter.h"
#include <vespa/document/annotation/annotation.h>
#include <vespa/document/annotation/span.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
#include <vespa/juniper/juniper_separators.h>
#include <vespa/searchlib/memoryindex/field_inverter.h>
#include <vespa/searchlib/util/linguisticsannotation.h>
#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/util/exceptions.h>
#include <utility>

using document::Annotation;
using document::FieldValue;
using document::Span;
using document::StringFieldValue;
using search::linguistics::TokenExtractor;
using search::memoryindex::FieldInverter;

namespace search::docsummary {

namespace {

/**
 * Returns a non-owning view of the part of 's' that 'span' covers.
 *
 * The view starts span.from() bytes into 's' and is span.length() bytes long.
 * No bounds checking is performed here; the caller must pass a span that lies
 * within 's'.
 */
vespalib::stringref
getSpanString(vespalib::stringref s, const Span &span)
{
    const char* first = s.data() + span.from();
    const auto count = static_cast<size_t>(span.length());
    return vespalib::stringref(first, count);
}

// Default-constructed field name handed to the TokenExtractor constructor in
// handleIndexingTerms(); no other use is visible in this file.
vespalib::string dummy_field_name;

}

AnnotationConverter::AnnotationConverter(IJuniperConverter& juniper_converter)
    : IStringFieldConverter(),
      _juniper_converter(juniper_converter),
      _text(),
      _out()
{
}

// Compiler-generated destructor, defined out of line in this translation unit.
AnnotationConverter::~AnnotationConverter() = default;

/**
 * Appends the text covered by 'span' to the output buffer.
 *
 * [it, last) is the range of terms attached to 'span'. When the range is
 * empty, or holds exactly one term that is not flagged as altered, the raw
 * span text is written followed by the unit separator. In every other case
 * (several terms, or a single altered term) the annotated form is written by
 * annotateSpans().
 */
template <typename ForwardIt>
void
AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) {
    const int term_count = (last - it);
    // 'it' is only dereferenced when exactly one term is present.
    const bool emit_plain = (term_count < 1) || (term_count == 1 && !it->altered);
    if (emit_plain) {
        _out << getSpanString(_text, span) << juniper::separators::unit_separator_string;
        return;
    }
    annotateSpans(span, it, last);
}

/**
 * Appends an annotated rendering of 'span' to the output buffer.
 *
 * Layout written, in order:
 *   anchor, original span text, separator,
 *   the 'word' of every term in [it, last) joined by a single space,
 *   terminator, unit separator.
 */
template <typename ForwardIt>
void
AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, ForwardIt last) {
    _out << juniper::separators::interlinear_annotation_anchor_string;
    _out << getSpanString(_text, span);
    _out << juniper::separators::interlinear_annotation_separator_string;
    bool need_space = false;
    for (; it != last; ++it) {
        if (need_space) {
            _out << " ";
        }
        _out << it->word;
        need_space = true;
    }
    _out << juniper::separators::interlinear_annotation_terminator_string;
    _out << juniper::separators::unit_separator_string;
}

void
AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
{
    using SpanTerm = TokenExtractor::SpanTerm;
    std::vector<SpanTerm> terms;
    auto span_trees = value.getSpanTrees();
    TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
    token_extractor.extract(terms, span_trees, _text, nullptr);
    auto it = terms.begin();
    auto ite = terms.end();
    int32_t endPos = 0;
    for (; it != ite; ) {
        auto it_begin = it;
        if (it_begin->span.from() >  endPos) {
            Span tmpSpan(endPos, it_begin->span.from() - endPos);
            handleAnnotations(tmpSpan, it, it);
            endPos = it_begin->span.from();
        }
        for (; it != ite && it->span == it_begin->span; ++it);
        handleAnnotations(it_begin->span, it_begin, it);
        endPos = it_begin->span.from() + it_begin->span.length();
    }
    int32_t wantEndPos = _text.size();
    if (endPos < wantEndPos) {
        Span tmpSpan(endPos, wantEndPos - endPos);
        handleAnnotations(tmpSpan, ite, ite);
    }
}

/**
 * Converts one string field value.
 *
 * Points the text view at the value held by 'input', clears the output buffer,
 * fills it via handleIndexingTerms() and finally passes the buffer content
 * together with 'inserter' to the juniper converter.
 */
void
AnnotationConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter)
{
    // Reset per-call state before generating output.
    _text = input.getValueRef();
    _out.clear();

    handleIndexingTerms(input);

    _juniper_converter.convert(_out.str(), inserter);
}

/**
 * Always returns false for this converter.
 */
bool
AnnotationConverter::render_weighted_set_as_array() const
{
    constexpr bool as_array = false;
    return as_array;
}

}