diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-10-12 12:36:50 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-10-12 12:36:50 +0200 |
commit | 706bf2929c840606efba2763b177ae435579c1d7 (patch) | |
tree | 45db7324ec136e87809135260f2a7491ca49150a /searchsummary/src | |
parent | 686dc5941b174ffab2de1ee1da90402977947e64 (diff) |
Move more checks to TokenExtractor.
Diffstat (limited to 'searchsummary/src')
-rw-r--r-- | searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp | 40 |
1 file changed, 13 insertions, 27 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index b4f76d8e39f..bf267ab9e27 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -6,6 +6,7 @@ #include <vespa/document/annotation/span.h> #include <vespa/document/fieldvalue/stringfieldvalue.h> #include <vespa/juniper/juniper_separators.h> +#include <vespa/searchlib/memoryindex/field_inverter.h> #include <vespa/searchlib/util/linguisticsannotation.h> #include <vespa/searchlib/util/token_extractor.h> #include <vespa/vespalib/stllike/asciistream.h> @@ -17,6 +18,7 @@ using document::FieldValue; using document::Span; using document::StringFieldValue; using search::linguistics::TokenExtractor; +using search::memoryindex::FieldInverter; namespace search::docsummary { @@ -28,14 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span) return {s.data() + span.from(), static_cast<size_t>(span.length())}; } -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); - -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { - if (!value.isA(FieldValue::Type::STRING)) { - throw vespalib::IllegalArgumentException("Illegal field type. 
" + value.toString(), VESPA_STRLOC); - } - return static_cast<const StringFieldValue &>(value); -} +vespalib::string dummy_field_name; } @@ -53,7 +48,7 @@ template <typename ForwardIt> void AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) { int annCnt = (last - it); - if (annCnt > 1 || (annCnt == 1 && it->second)) { + if (annCnt > 1 || (annCnt == 1 && it->altered)) { annotateSpans(span, it, last); } else { _out << getSpanString(_text, span) << juniper::separators::unit_separator_string; @@ -67,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For << (getSpanString(_text, span)) << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR while (it != last) { - if (it->second) { - _out << ensureStringFieldValue(*it->second).getValue(); - } else { - _out << getSpanString(_text, span); - } + _out << it->word; if (++it != last) { _out << " "; } @@ -86,26 +77,21 @@ AnnotationConverter::handleIndexingTerms(const StringFieldValue& value) using SpanTerm = TokenExtractor::SpanTerm; std::vector<SpanTerm> terms; auto span_trees = value.getSpanTrees(); - if (!TokenExtractor::extract(true, terms, span_trees)) { - // Treat a string without annotations as a single span. 
- SpanTerm str(Span(0, _text.size()), - static_cast<const FieldValue*>(nullptr)); - handleAnnotations(str.first, &str, &str + 1); - return; - } + TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len); + token_extractor.extract(terms, span_trees, _text, nullptr); auto it = terms.begin(); auto ite = terms.end(); int32_t endPos = 0; for (; it != ite; ) { auto it_begin = it; - if (it_begin->first.from() > endPos) { - Span tmpSpan(endPos, it_begin->first.from() - endPos); + if (it_begin->span.from() > endPos) { + Span tmpSpan(endPos, it_begin->span.from() - endPos); handleAnnotations(tmpSpan, it, it); - endPos = it_begin->first.from(); + endPos = it_begin->span.from(); } - for (; it != ite && it->first == it_begin->first; ++it); - handleAnnotations(it_begin->first, it_begin, it); - endPos = it_begin->first.from() + it_begin->first.length(); + for (; it != ite && it->span == it_begin->span; ++it); + handleAnnotations(it_begin->span, it_begin, it); + endPos = it_begin->span.from() + it_begin->span.length(); } int32_t wantEndPos = _text.size(); if (endPos < wantEndPos) { |