Move more checks to TokenExtractor.

author: Tor Egge <Tor.Egge@online.no> 2023-10-12 12:36:50 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-10-12 12:36:50 +0200
commit: 706bf2929c840606efba2763b177ae435579c1d7 (patch)
tree: 45db7324ec136e87809135260f2a7491ca49150a /searchsummary
parent: 686dc5941b174ffab2de1ee1da90402977947e64 (diff)
1 file changed, 13 insertions, 27 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index b4f76d8e39f..bf267ab9e27 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -6,6 +6,7 @@
 #include <vespa/document/annotation/span.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
 #include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/memoryindex/field_inverter.h>
 #include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/searchlib/util/token_extractor.h>
 #include <vespa/vespalib/stllike/asciistream.h>
@@ -17,6 +18,7 @@ using document::FieldValue;
 using document::Span;
 using document::StringFieldValue;
 using search::linguistics::TokenExtractor;
+using search::memoryindex::FieldInverter;
 
 namespace search::docsummary {
 
@@ -28,14 +30,7 @@ getSpanString(vespalib::stringref s, const Span &span)
     return {s.data() + span.from(), static_cast<size_t>(span.length())};
 }
 
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
-
-const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
-    if (!value.isA(FieldValue::Type::STRING)) {
-        throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC);
-    }
-    return static_cast<const StringFieldValue &>(value);
-}
+vespalib::string dummy_field_name;
 
 }
 
@@ -53,7 +48,7 @@ template <typename ForwardIt>
 void
 AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) {
     int annCnt = (last - it);
-    if (annCnt > 1 || (annCnt == 1 && it->second)) {
+    if (annCnt > 1 || (annCnt == 1 && it->altered)) {
         annotateSpans(span, it, last);
     } else {
         _out << getSpanString(_text, span) << juniper::separators::unit_separator_string;
@@ -67,11 +62,7 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
          << (getSpanString(_text, span))
          << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
     while (it != last) {
-        if (it->second) {
-            _out << ensureStringFieldValue(*it->second).getValue();
-        } else {
-            _out << getSpanString(_text, span);
-        }
+        _out << it->word;
         if (++it != last) {
             _out << " ";
         }
@@ -86,26 +77,21 @@ AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
     using SpanTerm = TokenExtractor::SpanTerm;
     std::vector<SpanTerm> terms;
     auto span_trees = value.getSpanTrees();
-    if (!TokenExtractor::extract(true, terms, span_trees)) {
-        // Treat a string without annotations as a single span.
-        SpanTerm str(Span(0, _text.size()),
-                     static_cast<const FieldValue*>(nullptr));
-        handleAnnotations(str.first, &str, &str + 1);
-        return;
-    }
+    TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
+    token_extractor.extract(terms, span_trees, _text, nullptr);
     auto it = terms.begin();
     auto ite = terms.end();
     int32_t endPos = 0;
     for (; it != ite; ) {
         auto it_begin = it;
-        if (it_begin->first.from() >  endPos) {
-            Span tmpSpan(endPos, it_begin->first.from() - endPos);
+        if (it_begin->span.from() >  endPos) {
+            Span tmpSpan(endPos, it_begin->span.from() - endPos);
             handleAnnotations(tmpSpan, it, it);
-            endPos = it_begin->first.from();
+            endPos = it_begin->span.from();
         }
-        for (; it != ite && it->first == it_begin->first; ++it);
-        handleAnnotations(it_begin->first, it_begin, it);
-        endPos = it_begin->first.from() + it_begin->first.length();
+        for (; it != ite && it->span == it_begin->span; ++it);
+        handleAnnotations(it_begin->span, it_begin, it);
+        endPos = it_begin->span.from() + it_begin->span.length();
     }
     int32_t wantEndPos = _text.size();
     if (endPos < wantEndPos) {
author	Tor Egge <Tor.Egge@online.no>	2023-10-12 12:36:50 +0200
committer	Tor Egge <Tor.Egge@online.no>	2023-10-12 12:36:50 +0200
commit	706bf2929c840606efba2763b177ae435579c1d7 (patch)
tree	45db7324ec136e87809135260f2a7491ca49150a /searchsummary
parent	686dc5941b174ffab2de1ee1da90402977947e64 (diff)