Consolidate extraction of tokens from annotated string field value.

author: Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
commit: 52cdc152dd61aee3e6c336fd787719ab2eb6d77b (patch)
tree: cf6f41eafc184cc748729bf4b57a9c2e928e00b8 /searchlib
parent: 3126b4608e11a85b7b58eaf99f4d48a1fb5bacfb (diff)
8 files changed, 143 insertions, 87 deletions
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 042b57f0486..c469ba67765 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -2,12 +2,8 @@
 
 #include "field_inverter.h"
 #include "ordered_field_index_inserter.h"
-#include <vespa/document/annotation/alternatespanlist.h>
 #include <vespa/document/annotation/annotation.h>
 #include <vespa/document/annotation/span.h>
-#include <vespa/document/annotation/spanlist.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
 #include <vespa/document/fieldvalue/arrayfieldvalue.h>
 #include <vespa/document/fieldvalue/document.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
@@ -30,9 +26,7 @@ LOG_SETUP(".searchlib.memoryindex.fieldinverter");
 
 namespace search::memoryindex {
 
-using document::AlternateSpanList;
 using document::Annotation;
-using document::AnnotationType;
 using document::ArrayFieldValue;
 using document::DataType;
 using document::Document;
@@ -40,79 +34,23 @@ using document::DocumentType;
 using document::Field;
 using document::FieldValue;
 using document::IntFieldValue;
-using document::SimpleSpanList;
 using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
 using document::StringFieldValue;
 using document::StructFieldValue;
 using document::WeightedSetFieldValue;
 using index::DocIdAndPosOccFeatures;
 using index::Schema;
 using search::index::schema::CollectionType;
+using search::linguistics::TokenExtractor;
 using search::util::URL;
 using vespalib::make_string;
 using vespalib::datastore::Aligner;
 
-namespace documentinverterkludge::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
-using namespace documentinverterkludge;
-
-namespace {
-
-class SpanFinder : public SpanTreeVisitor {
-public:
-    int32_t begin_pos;
-    int32_t end_pos;
-
-    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
-    Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
-    void visit(const Span &node) override {
-        begin_pos = std::min(begin_pos, node.from());
-        end_pos = std::max(end_pos, node.from() + node.length());
-    }
-    void visit(const SpanList &node) override {
-        for (const auto & span_ : node) {
-            const_cast<SpanNode *>(span_)->accept(*this);
-        }
-    }
-    void visit(const SimpleSpanList &node) override {
-        for (const auto & span_ : node) {
-            const_cast<Span &>(span_).accept(*this);
-        }
-    }
-    void visit(const AlternateSpanList &node) override {
-        for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
-            visit(node.getSubtree(i));
-        }
-    }
-};
-
-Span
-getSpan(const SpanNode &span_node)
-{
-    SpanFinder finder;
-    // The SpanNode will not be changed.
-    const_cast<SpanNode &>(span_node).accept(finder);
-    return finder.span();
-}
-
-}
-
 void
 FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc)
 {
-    _terms.clear();
-    StringFieldValue::SpanTrees spanTrees = value.getSpanTrees();
-    const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME);
-    if (tree == nullptr) {
+    auto span_trees = value.getSpanTrees();
+    if (!TokenExtractor::extract(false, _terms, span_trees)) {
         /* This is wrong unless field is exact match */
         const vespalib::string &text = value.getValue();
         if (text.empty()) {
@@ -126,19 +64,6 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document&
         return;
     }
     const vespalib::string &text = value.getValue();
-    for (const Annotation & annotation : *tree) {
-        const SpanNode *span = annotation.getSpanNode();
-        if ((span != nullptr) && annotation.valid() &&
-            (annotation.getType() == *AnnotationType::TERM))
-        {
-            Span sp = getSpan(*span);
-            if (sp.length() != 0) {
-                _terms.push_back(std::make_pair(sp,
-                                                annotation.getFieldValue()));
-            }
-        }
-    }
-    std::sort(_terms.begin(), _terms.end());
     auto it  = _terms.begin();
     auto ite = _terms.end();
     uint32_t wordRef;
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
index 99830e623eb..23e3f9ddfd8 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
@@ -5,6 +5,7 @@
 #include "i_field_index_remove_listener.h"
 #include <vespa/document/annotation/span.h>
 #include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/util/token_extractor.h>
 #include <vespa/vespalib/stllike/allocator.h>
 #include <vespa/vespalib/stllike/hash_map.h>
 #include <limits>
@@ -179,9 +180,8 @@ private:
     index::DocIdAndPosOccFeatures  _features;
     UInt32Vector                   _wordRefs;
 
-    using SpanTerm = std::pair<document::Span, const document::FieldValue *>;
-    using SpanTermVector = std::vector<SpanTerm>;
-    SpanTermVector                      _terms;
+    using SpanTerm = linguistics::TokenExtractor::SpanTerm;
+    std::vector<SpanTerm>          _terms;
 
     // Info about aborted and pending documents.
     std::vector<PositionRange>                  _abortedDocs;
diff --git a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
index e842b7b44d6..d81572d8913 100644
--- a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
+++ b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
@@ -7,6 +7,7 @@
 #include <vespa/document/annotation/spanlist.h>
 #include <vespa/document/annotation/spantree.h>
 #include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
 #include <vespa/fastlib/text/unicodeutil.h>
 #include <vespa/vespalib/text/utf8.h>
 
@@ -22,15 +23,10 @@ using document::SpanNode;
 using document::SpanTree;
 using vespalib::Utf8Reader;
 using vespalib::Utf8Writer;
+using search::linguistics::SPANTREE_NAME;
 
 namespace search::test {
 
-namespace {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
 StringFieldBuilder::StringFieldBuilder(const DocBuilder& doc_builder)
     : _value(),
       _span_start(0u),
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
index 500b08da815..e9661b5e919 100644
--- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
@@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT
     filesizecalculator.cpp
     fileutil.cpp
     foldedstringcompare.cpp
+    linguisticsannotation.cpp
     logutil.cpp
     rawbuf.cpp
     slime_output_raw_buf_adapter.cpp
     state_explorer_utils.cpp
+    token_extractor.cpp
     url.cpp
     DEPENDS
 )
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
new file mode 100644
index 00000000000..c8aef561319
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguisticsannotation.h"
+
+namespace search::linguistics {
+
+const vespalib::string SPANTREE_NAME("linguistics"); // name used to look up the span tree holding linguistics annotations
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
new file mode 100644
index 00000000000..83a19bed986
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
@@ -0,0 +1,11 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace search::linguistics {
+
+extern const vespalib::string SPANTREE_NAME; // defined in linguisticsannotation.cpp as "linguistics"
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
new file mode 100644
index 00000000000..a5d9dc0a4a6
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -0,0 +1,86 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "token_extractor.h"
+#include "linguisticsannotation.h"
+#include <vespa/document/annotation/alternatespanlist.h>
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/annotation/spanlist.h>
+#include <vespa/document/annotation/spantreevisitor.h>
+
+using document::AlternateSpanList;
+using document::Annotation;
+using document::AnnotationType;
+using document::SimpleSpanList;
+using document::Span;
+using document::SpanList;
+using document::SpanNode;
+using document::SpanTreeVisitor;
+using document::StringFieldValue;
+
+namespace search::linguistics {
+
+namespace {
+
+class SpanFinder : public SpanTreeVisitor { // visitor computing the smallest range covering every Span it visits
+public:
+    int32_t begin_pos; // lowest from() seen so far
+    int32_t end_pos; // highest from() + length() seen so far
+
+    SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} // start with an inverted (empty) range
+    Span span() { return Span(begin_pos, end_pos - begin_pos); } // bounding span; only meaningful once a Span has been visited
+
+    void visit(const Span &node) override { // leaf: widen the range to include this span
+        begin_pos = std::min(begin_pos, node.from());
+        end_pos = std::max(end_pos, node.from() + node.length());
+    }
+    void visit(const SpanList &node) override { // recurse into every child node
+        for (const auto & span_ : node) {
+            span_->accept(*this);
+        }
+    }
+    void visit(const SimpleSpanList &node) override { // children are plain spans, visited one by one
+        for (const auto & span_ : node) {
+            span_.accept(*this);
+        }
+    }
+    void visit(const AlternateSpanList &node) override { // every alternative subtree contributes to the range
+        for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
+            visit(node.getSubtree(i));
+        }
+    }
+};
+
+Span
+getSpan(const SpanNode &span_node) // returns the bounding span of all Span leaves reachable from span_node
+{
+    SpanFinder finder;
+    span_node.accept(finder); // the visitor accumulates the lowest start and highest end position
+    return finder.span();
+}
+
+}
+
+bool
+TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees)
+{
+    terms.clear(); // reset first so a false return never leaves tokens from an earlier call in 'terms'
+    auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
+    if (tree == nullptr) {
+        return false; // no linguistics span tree found; 'terms' is left empty
+    }
+    for (const Annotation & annotation : *tree) {
+        const SpanNode *span = annotation.getSpanNode();
+        if ((span != nullptr) && annotation.valid() &&
+            (annotation.getType() == *AnnotationType::TERM))
+        {
+            Span sp = getSpan(*span); // bounding span of the annotated node
+            if (sp.length() != 0 || allow_zero_length_tokens) {
+                terms.emplace_back(sp, annotation.getFieldValue());
+            }
+        }
+    }
+    std::sort(terms.begin(), terms.end()); // std::pair ordering: by span first, then by field value pointer
+    return true;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
new file mode 100644
index 00000000000..5796aaa7482
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vector>
+
+namespace document {
+
+class FieldValue;
+class StringFieldValue;
+class Span;
+
+}
+
+namespace search::linguistics {
+
+/* Class used to extract tokens from an annotated string field value: extract() collects the valid TERM annotations
+ * of the linguistics span tree into 'terms' (sorted) and returns true, or returns false when 'trees' has no such tree.
+ * Zero-length spans are kept only when allow_zero_length_tokens is set. */
+class TokenExtractor {
+public:
+    using SpanTerm = std::pair<document::Span, const document::FieldValue*>; // bounding span of a term and the annotation's field value
+    static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees);
+};
+
+}
author	Tor Egge <Tor.Egge@online.no>	2023-10-10 16:49:08 +0200
committer	Tor Egge <Tor.Egge@online.no>	2023-10-10 16:49:08 +0200
commit	52cdc152dd61aee3e6c336fd787719ab2eb6d77b (patch)
tree	cf6f41eafc184cc748729bf4b57a9c2e928e00b8 /searchlib
parent	3126b4608e11a85b7b58eaf99f4d48a1fb5bacfb (diff)