From 52cdc152dd61aee3e6c336fd787719ab2eb6d77b Mon Sep 17 00:00:00 2001
From: Tor Egge <Tor.Egge@online.no>
Date: Tue, 10 Oct 2023 16:49:08 +0200
Subject: Consolidate extraction of tokens from annotated string field value.

---
 .../tests/proton/docsummary/docsummary_test.cpp    |  2 +-
 .../vespa/searchlib/memoryindex/field_inverter.cpp | 81 +-------------------
 .../vespa/searchlib/memoryindex/field_inverter.h   |  6 +-
 .../vespa/searchlib/test/string_field_builder.cpp  |  8 +-
 searchlib/src/vespa/searchlib/util/CMakeLists.txt  |  2 +
 .../vespa/searchlib/util/linguisticsannotation.cpp |  9 +++
 .../vespa/searchlib/util/linguisticsannotation.h   | 11 +++
 .../src/vespa/searchlib/util/token_extractor.cpp   | 86 ++++++++++++++++++++++
 .../src/vespa/searchlib/util/token_extractor.h     | 27 +++++++
 .../annotation_converter_test.cpp                  |  2 +-
 .../vespa/searchsummary/docsummary/CMakeLists.txt  |  1 -
 .../docsummary/annotation_converter.cpp            | 70 ++----------------
 .../docsummary/linguisticsannotation.cpp           |  9 ---
 .../docsummary/linguisticsannotation.h             | 11 ---
 14 files changed, 153 insertions(+), 172 deletions(-)
 create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
 create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.h
 create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.cpp
 create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.h
 delete mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp
 delete mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h

diff --git a/searchcore/src/tests/proton/docsummary/docsummary_test.cpp b/searchcore/src/tests/proton/docsummary/docsummary_test.cpp
index 1fcb1b09d94..8264ec6b680 100644
--- a/searchcore/src/tests/proton/docsummary/docsummary_test.cpp
+++ b/searchcore/src/tests/proton/docsummary/docsummary_test.cpp
@@ -23,12 +23,12 @@
 #include <vespa/searchlib/index/dummyfileheadercontext.h>
 #include <vespa/searchlib/tensor/tensor_attribute.h>
 #include <vespa/searchlib/test/doc_builder.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/searchlib/transactionlog/nosyncproxy.h>
 #include <vespa/searchlib/transactionlog/translogserver.h>
 #include <vespa/searchsummary/docsummary/i_docsum_field_writer_factory.h>
 #include <vespa/searchsummary/docsummary/i_docsum_store_document.h>
 #include <vespa/searchsummary/docsummary/i_juniper_converter.h>
-#include <vespa/searchsummary/docsummary/linguisticsannotation.h>
 #include <vespa/config-bucketspaces.h>
 #include <vespa/config/helper/configgetter.hpp>
 #include <vespa/document/annotation/annotation.h>
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 042b57f0486..c469ba67765 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -2,12 +2,8 @@
 
 #include "field_inverter.h"
 #include "ordered_field_index_inserter.h"
-#include <vespa/document/annotation/alternatespanlist.h>
 #include <vespa/document/annotation/annotation.h>
 #include <vespa/document/annotation/span.h>
-#include <vespa/document/annotation/spanlist.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
 #include <vespa/document/fieldvalue/arrayfieldvalue.h>
 #include <vespa/document/fieldvalue/document.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
@@ -30,9 +26,7 @@ LOG_SETUP(".searchlib.memoryindex.fieldinverter");
 
 namespace search::memoryindex {
 
-using document::AlternateSpanList;
 using document::Annotation;
-using document::AnnotationType;
 using document::ArrayFieldValue;
 using document::DataType;
 using document::Document;
@@ -40,79 +34,23 @@ using document::DocumentType;
 using document::Field;
 using document::FieldValue;
 using document::IntFieldValue;
-using document::SimpleSpanList;
 using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
 using document::StringFieldValue;
 using document::StructFieldValue;
 using document::WeightedSetFieldValue;
 using index::DocIdAndPosOccFeatures;
 using index::Schema;
 using search::index::schema::CollectionType;
+using search::linguistics::TokenExtractor;
 using search::util::URL;
 using vespalib::make_string;
 using vespalib::datastore::Aligner;
 
-namespace documentinverterkludge::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
-using namespace documentinverterkludge;
-
-namespace {
-
-class SpanFinder : public SpanTreeVisitor {
-public:
-    int32_t begin_pos;
-    int32_t end_pos;
-
-    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
-    Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
-    void visit(const Span &node) override {
-        begin_pos = std::min(begin_pos, node.from());
-        end_pos = std::max(end_pos, node.from() + node.length());
-    }
-    void visit(const SpanList &node) override {
-        for (const auto & span_ : node) {
-            const_cast<SpanNode *>(span_)->accept(*this);
-        }
-    }
-    void visit(const SimpleSpanList &node) override {
-        for (const auto & span_ : node) {
-            const_cast<Span &>(span_).accept(*this);
-        }
-    }
-    void visit(const AlternateSpanList &node) override {
-        for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
-            visit(node.getSubtree(i));
-        }
-    }
-};
-
-Span
-getSpan(const SpanNode &span_node)
-{
-    SpanFinder finder;
-    // The SpanNode will not be changed.
-    const_cast<SpanNode &>(span_node).accept(finder);
-    return finder.span();
-}
-
-}
-
 void
 FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc)
 {
-    _terms.clear();
-    StringFieldValue::SpanTrees spanTrees = value.getSpanTrees();
-    const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME);
-    if (tree == nullptr) {
+    auto span_trees = value.getSpanTrees();
+    if (!TokenExtractor::extract(false, _terms, span_trees)) {
         /* This is wrong unless field is exact match */
         const vespalib::string &text = value.getValue();
         if (text.empty()) {
@@ -126,19 +64,6 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document&
         return;
     }
     const vespalib::string &text = value.getValue();
-    for (const Annotation & annotation : *tree) {
-        const SpanNode *span = annotation.getSpanNode();
-        if ((span != nullptr) && annotation.valid() &&
-            (annotation.getType() == *AnnotationType::TERM))
-        {
-            Span sp = getSpan(*span);
-            if (sp.length() != 0) {
-                _terms.push_back(std::make_pair(sp,
-                                                annotation.getFieldValue()));
-            }
-        }
-    }
-    std::sort(_terms.begin(), _terms.end());
     auto it  = _terms.begin();
     auto ite = _terms.end();
     uint32_t wordRef;
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
index 99830e623eb..23e3f9ddfd8 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
@@ -5,6 +5,7 @@
 #include "i_field_index_remove_listener.h"
 #include <vespa/document/annotation/span.h>
 #include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/util/token_extractor.h>
 #include <vespa/vespalib/stllike/allocator.h>
 #include <vespa/vespalib/stllike/hash_map.h>
 #include <limits>
@@ -179,9 +180,8 @@ private:
     index::DocIdAndPosOccFeatures  _features;
     UInt32Vector                   _wordRefs;
 
-    using SpanTerm = std::pair<document::Span, const document::FieldValue *>;
-    using SpanTermVector = std::vector<SpanTerm>;
-    SpanTermVector                      _terms;
+    using SpanTerm = linguistics::TokenExtractor::SpanTerm;
+    std::vector<SpanTerm>          _terms;
 
     // Info about aborted and pending documents.
     std::vector<PositionRange>                  _abortedDocs;
diff --git a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
index e842b7b44d6..d81572d8913 100644
--- a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
+++ b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
@@ -7,6 +7,7 @@
 #include <vespa/document/annotation/spanlist.h>
 #include <vespa/document/annotation/spantree.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/fastlib/text/unicodeutil.h>
 #include <vespa/vespalib/text/utf8.h>
 
@@ -22,15 +23,10 @@ using document::SpanNode;
 using document::SpanTree;
 using vespalib::Utf8Reader;
 using vespalib::Utf8Writer;
+using search::linguistics::SPANTREE_NAME;
 
 namespace search::test {
 
-namespace {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
 StringFieldBuilder::StringFieldBuilder(const DocBuilder& doc_builder)
     : _value(),
       _span_start(0u),
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
index 500b08da815..e9661b5e919 100644
--- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
@@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT
     filesizecalculator.cpp
     fileutil.cpp
     foldedstringcompare.cpp
+    linguisticsannotation.cpp
     logutil.cpp
     rawbuf.cpp
     slime_output_raw_buf_adapter.cpp
     state_explorer_utils.cpp
+    token_extractor.cpp
     url.cpp
     DEPENDS
 )
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
new file mode 100644
index 00000000000..c8aef561319
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguisticsannotation.h"
+
+namespace search::linguistics {
+// Name of the span tree that TokenExtractor looks up in a string field value.
+const vespalib::string SPANTREE_NAME("linguistics");
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
new file mode 100644
index 00000000000..83a19bed986
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
@@ -0,0 +1,11 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace search::linguistics {
+// Name of the span tree that TokenExtractor looks up in a string field value.
+extern const vespalib::string SPANTREE_NAME;
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
new file mode 100644
index 00000000000..a5d9dc0a4a6
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -0,0 +1,107 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "token_extractor.h"
+#include "linguisticsannotation.h"
+#include <vespa/document/annotation/alternatespanlist.h>
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/annotation/spanlist.h>
+#include <vespa/document/annotation/spantreevisitor.h>
+#include <algorithm>
+
+using document::AlternateSpanList;
+using document::Annotation;
+using document::AnnotationType;
+using document::SimpleSpanList;
+using document::Span;
+using document::SpanList;
+using document::SpanNode;
+using document::SpanTreeVisitor;
+using document::StringFieldValue;
+
+namespace search::linguistics {
+
+namespace {
+
+/*
+ * Span tree visitor that computes the smallest span covering every leaf span
+ * reachable from the visited node. begin_pos/end_pos start out as sentinels
+ * (INT32_MAX / -1), so span() has a negative length if no leaf span was seen.
+ */
+class SpanFinder : public SpanTreeVisitor {
+public:
+    int32_t begin_pos;
+    int32_t end_pos;
+
+    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
+    Span span() { return Span(begin_pos, end_pos - begin_pos); }
+
+    void visit(const Span &node) override {
+        begin_pos = std::min(begin_pos, node.from());
+        end_pos = std::max(end_pos, node.from() + node.length());
+    }
+    void visit(const SpanList &node) override {
+        for (const auto & span_ : node) {
+            span_->accept(*this);
+        }
+    }
+    void visit(const SimpleSpanList &node) override {
+        for (const auto & span_ : node) {
+            span_.accept(*this);
+        }
+    }
+    void visit(const AlternateSpanList &node) override {
+        for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
+            visit(node.getSubtree(i));
+        }
+    }
+};
+
+// Returns the bounding span of span_node as computed by SpanFinder.
+Span
+getSpan(const SpanNode &span_node)
+{
+    SpanFinder finder;
+    span_node.accept(finder);
+    return finder.span();
+}
+
+}
+
+/*
+ * Extracts the terms of the linguistics span tree (SPANTREE_NAME) in trees.
+ *
+ * terms is cleared up front, so it never keeps entries from an earlier call.
+ * Returns false, with terms left empty, when trees has no such span tree.
+ * Otherwise every valid TERM annotation that has a span node contributes its
+ * bounding span paired with the annotation's field value; terms is then
+ * sorted and true is returned.
+ *
+ * Zero length bounding spans are kept only when allow_zero_length_tokens is
+ * set. Negative lengths are always dropped.
+ */
+bool
+TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees)
+{
+    terms.clear();
+    auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
+    if (tree == nullptr) {
+        return false;
+    }
+    for (const Annotation & annotation : *tree) {
+        const SpanNode *span = annotation.getSpanNode();
+        if ((span != nullptr) && annotation.valid() &&
+            (annotation.getType() == *AnnotationType::TERM))
+        {
+            Span sp = getSpan(*span);
+            // A negative length is what SpanFinder yields for a span node
+            // without leaf spans; such a span is never a usable token.
+            if (sp.length() > 0 || (allow_zero_length_tokens && sp.length() == 0)) {
+                terms.emplace_back(sp, annotation.getFieldValue());
+            }
+        }
+    }
+    std::sort(terms.begin(), terms.end());
+    return true;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
new file mode 100644
index 00000000000..5796aaa7482
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -0,0 +1,36 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <utility>
+#include <vector>
+
+namespace document {
+
+class FieldValue;
+
+}
+
+namespace search::linguistics {
+
+/*
+ * Class used to extract tokens from annotated string field value.
+ */
+class TokenExtractor {
+public:
+    // Bounding span of a term paired with the field value of its annotation.
+    using SpanTerm = std::pair<document::Span, const document::FieldValue*>;
+    /*
+     * Fills terms with the TERM annotations of the linguistics span tree in
+     * trees, sorted. Zero length spans are kept only when
+     * allow_zero_length_tokens is true. Returns false when trees has no
+     * linguistics span tree; callers should then ignore the content of terms.
+     * NOTE(review): the field value pointers presumably refer to data owned
+     * by trees, so terms should not outlive it - confirm.
+     */
+    static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees);
+};
+
+}
diff --git a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
index 16aa8c70131..0a05e078382 100644
--- a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
+++ b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
@@ -9,9 +9,9 @@
 #include <vespa/document/repo/configbuilder.h>
 #include <vespa/document/repo/fixedtyperepo.h>
 #include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/searchsummary/docsummary/annotation_converter.h>
 #include <vespa/searchsummary/docsummary/i_juniper_converter.h>
-#include <vespa/searchsummary/docsummary/linguisticsannotation.h>
 #include <vespa/vespalib/data/slime/slime.h>
 #include <vespa/vespalib/gtest/gtest.h>
 #include <vespa/vespalib/stllike/asciistream.h>
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index 9d61c61ef7a..32df047c27f 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -23,7 +23,6 @@ vespa_add_library(searchsummary_docsummary OBJECT
     juniper_dfw_term_visitor.cpp
     juniper_query_adapter.cpp
     juniperproperties.cpp
-    linguisticsannotation.cpp
     matched_elements_filter_dfw.cpp
     positionsdfw.cpp
     query_term_filter.cpp
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index 251cad47922..b4f76d8e39f 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -2,29 +2,21 @@
 
 #include "annotation_converter.h"
 #include "i_juniper_converter.h"
-#include "linguisticsannotation.h"
-#include <vespa/document/annotation/alternatespanlist.h>
 #include <vespa/document/annotation/annotation.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
-#include <vespa/document/datatype/annotationtype.h>
+#include <vespa/document/annotation/span.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
 #include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchlib/util/token_extractor.h>
 #include <vespa/vespalib/stllike/asciistream.h>
 #include <vespa/vespalib/util/exceptions.h>
 #include <utility>
 
-using document::AlternateSpanList;
 using document::Annotation;
-using document::AnnotationType;
 using document::FieldValue;
-using document::SimpleSpanList;
 using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
 using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
 
 namespace search::docsummary {
 
@@ -36,40 +28,6 @@ getSpanString(vespalib::stringref s, const Span &span)
     return {s.data() + span.from(), static_cast<size_t>(span.length())};
 }
 
-struct SpanFinder : SpanTreeVisitor {
-    int32_t begin_pos;
-    int32_t end_pos;
-
-    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
-    Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
-    void visit(const Span &node) override {
-        begin_pos = std::min(begin_pos, node.from());
-        end_pos = std::max(end_pos, node.from() + node.length());
-    }
-    void visit(const SpanList &node) override {
-        for (const auto & span_ : node) {
-            span_->accept(*this);
-        }
-    }
-    void visit(const SimpleSpanList &node) override {
-        for (const auto & span_ : node) {
-            span_.accept(*this);
-        }
-    }
-    void visit(const AlternateSpanList &node) override {
-        for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
-            visit(node.getSubtree(i));
-        }
-    }
-};
-
-Span getSpan(const SpanNode &span_node) {
-    SpanFinder finder;
-    span_node.accept(finder);
-    return finder.span();
-}
-
 const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
 
 const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
@@ -125,28 +83,16 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
 void
 AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
 {
-    StringFieldValue::SpanTrees trees = value.getSpanTrees();
-    const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME);
-    using SpanTerm = std::pair<Span, const FieldValue *>;
-    using SpanTermVector = std::vector<SpanTerm>;
-    if (!tree) {
+    using SpanTerm = TokenExtractor::SpanTerm;
+    std::vector<SpanTerm> terms;
+    auto span_trees = value.getSpanTrees();
+    if (!TokenExtractor::extract(true, terms, span_trees)) {
         // Treat a string without annotations as a single span.
         SpanTerm str(Span(0, _text.size()),
                      static_cast<const FieldValue*>(nullptr));
         handleAnnotations(str.first, &str, &str + 1);
         return;
     }
-    SpanTermVector terms;
-    for (const Annotation& annotation : *tree) {
-        // For now, skip any composite spans.
-        const auto *span = dynamic_cast<const Span*>(annotation.getSpanNode());
-        if ((span != nullptr) && annotation.valid() &&
-            (annotation.getType() == *AnnotationType::TERM)) {
-            terms.push_back(std::make_pair(getSpan(*span),
-                                           annotation.getFieldValue()));
-        }
-    }
-    sort(terms.begin(), terms.end());
     auto it = terms.begin();
     auto ite = terms.end();
     int32_t endPos = 0;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp
deleted file mode 100644
index c8aef561319..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "linguisticsannotation.h"
-
-namespace search::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h
deleted file mode 100644
index 83a19bed986..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include <vespa/vespalib/stllike/string.h>
-
-namespace search::linguistics {
-
-extern const vespalib::string SPANTREE_NAME;
-
-}
-- 
cgit v1.2.3