aboutsummaryrefslogtreecommitdiffstats
path: root/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp')
-rw-r--r--searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp116
1 files changed, 27 insertions, 89 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index f4594cba4f4..77724305220 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -1,30 +1,24 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "annotation_converter.h"
#include "i_juniper_converter.h"
-#include "linguisticsannotation.h"
-#include <vespa/document/annotation/alternatespanlist.h>
#include <vespa/document/annotation/annotation.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
-#include <vespa/document/datatype/annotationtype.h>
+#include <vespa/document/annotation/span.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
#include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/memoryindex/field_inverter.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/util/exceptions.h>
#include <utility>
-using document::AlternateSpanList;
using document::Annotation;
-using document::AnnotationType;
using document::FieldValue;
-using document::SimpleSpanList;
using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
+using search::memoryindex::FieldInverter;
namespace search::docsummary {
@@ -36,48 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span)
return {s.data() + span.from(), static_cast<size_t>(span.length())};
}
-struct SpanFinder : SpanTreeVisitor {
- int32_t begin_pos;
- int32_t end_pos;
-
- SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
- Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
- void visit(const Span &node) override {
- begin_pos = std::min(begin_pos, node.from());
- end_pos = std::max(end_pos, node.from() + node.length());
- }
- void visit(const SpanList &node) override {
- for (const auto & span_ : node) {
- span_->accept(*this);
- }
- }
- void visit(const SimpleSpanList &node) override {
- for (const auto & span_ : node) {
- span_.accept(*this);
- }
- }
- void visit(const AlternateSpanList &node) override {
- for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
- visit(node.getSubtree(i));
- }
- }
-};
-
-Span getSpan(const SpanNode &span_node) {
- SpanFinder finder;
- span_node.accept(finder);
- return finder.span();
-}
-
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
-
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
- if (!value.isA(FieldValue::Type::STRING)) {
- throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC);
- }
- return static_cast<const StringFieldValue &>(value);
-}
+vespalib::string dummy_field_name;
}
@@ -95,7 +48,7 @@ template <typename ForwardIt>
void
AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) {
int annCnt = (last - it);
- if (annCnt > 1 || (annCnt == 1 && it->second)) {
+ if (annCnt > 1 || (annCnt == 1 && it->altered)) {
annotateSpans(span, it, last);
} else {
_out << getSpanString(_text, span) << juniper::separators::unit_separator_string;
@@ -109,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
<< (getSpanString(_text, span))
<< juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
while (it != last) {
- if (it->second) {
- _out << ensureStringFieldValue(*it->second).getValue();
- } else {
- _out << getSpanString(_text, span);
- }
+ _out << it->word;
if (++it != last) {
_out << " ";
}
@@ -125,41 +74,24 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
void
AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
{
- StringFieldValue::SpanTrees trees = value.getSpanTrees();
- const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME);
- using SpanTerm = std::pair<Span, const FieldValue *>;
- using SpanTermVector = std::vector<SpanTerm>;
- if (!tree) {
- // Treat a string without annotations as a single span.
- SpanTerm str(Span(0, _text.size()),
- static_cast<const FieldValue*>(nullptr));
- handleAnnotations(str.first, &str, &str + 1);
- return;
- }
- SpanTermVector terms;
- for (const Annotation& annotation : *tree) {
- // For now, skip any composite spans.
- const auto *span = dynamic_cast<const Span*>(annotation.getSpanNode());
- if ((span != nullptr) && annotation.valid() &&
- (annotation.getType() == *AnnotationType::TERM)) {
- terms.push_back(std::make_pair(getSpan(*span),
- annotation.getFieldValue()));
- }
- }
- sort(terms.begin(), terms.end());
+ using SpanTerm = TokenExtractor::SpanTerm;
+ std::vector<SpanTerm> terms;
+ auto span_trees = value.getSpanTrees();
+ TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
+ token_extractor.extract(terms, span_trees, _text, nullptr);
auto it = terms.begin();
auto ite = terms.end();
int32_t endPos = 0;
for (; it != ite; ) {
auto it_begin = it;
- if (it_begin->first.from() > endPos) {
- Span tmpSpan(endPos, it_begin->first.from() - endPos);
+ if (it_begin->span.from() > endPos) {
+ Span tmpSpan(endPos, it_begin->span.from() - endPos);
handleAnnotations(tmpSpan, it, it);
- endPos = it_begin->first.from();
+ endPos = it_begin->span.from();
}
- for (; it != ite && it->first == it_begin->first; ++it);
- handleAnnotations(it_begin->first, it_begin, it);
- endPos = it_begin->first.from() + it_begin->first.length();
+ for (; it != ite && it->span == it_begin->span; ++it);
+ handleAnnotations(it_begin->span, it_begin, it);
+ endPos = it_begin->span.from() + it_begin->span.length();
}
int32_t wantEndPos = _text.size();
if (endPos < wantEndPos) {
@@ -177,4 +109,10 @@ AnnotationConverter::convert(const StringFieldValue &input, vespalib::slime::Ins
_juniper_converter.convert(_out.str(), inserter);
}
+bool
+AnnotationConverter::render_weighted_set_as_array() const
+{
+ return false;
+}
+
}