summary | refs | log | tree | commit | diff | stats
path: root/searchsummary
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
committer: Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
commit: 52cdc152dd61aee3e6c336fd787719ab2eb6d77b (patch)
tree: cf6f41eafc184cc748729bf4b57a9c2e928e00b8 /searchsummary
parent: 3126b4608e11a85b7b58eaf99f4d48a1fb5bacfb (diff)
Consolidate extraction of tokens from annotated string field value.
Diffstat (limited to 'searchsummary')
-rw-r--r-- searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp 2
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp 70
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp 9
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h 11
5 files changed, 9 insertions, 84 deletions
diff --git a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
index 16aa8c70131..0a05e078382 100644
--- a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
+++ b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp
@@ -9,9 +9,9 @@
#include <vespa/document/repo/configbuilder.h>
#include <vespa/document/repo/fixedtyperepo.h>
#include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
#include <vespa/searchsummary/docsummary/annotation_converter.h>
#include <vespa/searchsummary/docsummary/i_juniper_converter.h>
-#include <vespa/searchsummary/docsummary/linguisticsannotation.h>
#include <vespa/vespalib/data/slime/slime.h>
#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/vespalib/stllike/asciistream.h>
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index 9d61c61ef7a..32df047c27f 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -23,7 +23,6 @@ vespa_add_library(searchsummary_docsummary OBJECT
juniper_dfw_term_visitor.cpp
juniper_query_adapter.cpp
juniperproperties.cpp
- linguisticsannotation.cpp
matched_elements_filter_dfw.cpp
positionsdfw.cpp
query_term_filter.cpp
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index 251cad47922..b4f76d8e39f 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -2,29 +2,21 @@
#include "annotation_converter.h"
#include "i_juniper_converter.h"
-#include "linguisticsannotation.h"
-#include <vespa/document/annotation/alternatespanlist.h>
#include <vespa/document/annotation/annotation.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
-#include <vespa/document/datatype/annotationtype.h>
+#include <vespa/document/annotation/span.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
#include <vespa/juniper/juniper_separators.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/util/exceptions.h>
#include <utility>
-using document::AlternateSpanList;
using document::Annotation;
-using document::AnnotationType;
using document::FieldValue;
-using document::SimpleSpanList;
using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
using document::StringFieldValue;
+using search::linguistics::TokenExtractor;
namespace search::docsummary {
@@ -36,40 +28,6 @@ getSpanString(vespalib::stringref s, const Span &span)
return {s.data() + span.from(), static_cast<size_t>(span.length())};
}
-struct SpanFinder : SpanTreeVisitor {
- int32_t begin_pos;
- int32_t end_pos;
-
- SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
- Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
- void visit(const Span &node) override {
- begin_pos = std::min(begin_pos, node.from());
- end_pos = std::max(end_pos, node.from() + node.length());
- }
- void visit(const SpanList &node) override {
- for (const auto & span_ : node) {
- span_->accept(*this);
- }
- }
- void visit(const SimpleSpanList &node) override {
- for (const auto & span_ : node) {
- span_.accept(*this);
- }
- }
- void visit(const AlternateSpanList &node) override {
- for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
- visit(node.getSubtree(i));
- }
- }
-};
-
-Span getSpan(const SpanNode &span_node) {
- SpanFinder finder;
- span_node.accept(finder);
- return finder.span();
-}
-
const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline));
const StringFieldValue &ensureStringFieldValue(const FieldValue &value) {
@@ -125,28 +83,16 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For
void
AnnotationConverter::handleIndexingTerms(const StringFieldValue& value)
{
- StringFieldValue::SpanTrees trees = value.getSpanTrees();
- const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME);
- using SpanTerm = std::pair<Span, const FieldValue *>;
- using SpanTermVector = std::vector<SpanTerm>;
- if (!tree) {
+ using SpanTerm = TokenExtractor::SpanTerm;
+ std::vector<SpanTerm> terms;
+ auto span_trees = value.getSpanTrees();
+ if (!TokenExtractor::extract(true, terms, span_trees)) {
// Treat a string without annotations as a single span.
SpanTerm str(Span(0, _text.size()),
static_cast<const FieldValue*>(nullptr));
handleAnnotations(str.first, &str, &str + 1);
return;
}
- SpanTermVector terms;
- for (const Annotation& annotation : *tree) {
- // For now, skip any composite spans.
- const auto *span = dynamic_cast<const Span*>(annotation.getSpanNode());
- if ((span != nullptr) && annotation.valid() &&
- (annotation.getType() == *AnnotationType::TERM)) {
- terms.push_back(std::make_pair(getSpan(*span),
- annotation.getFieldValue()));
- }
- }
- sort(terms.begin(), terms.end());
auto it = terms.begin();
auto ite = terms.end();
int32_t endPos = 0;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp
deleted file mode 100644
index c8aef561319..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "linguisticsannotation.h"
-
-namespace search::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h
deleted file mode 100644
index 83a19bed986..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include <vespa/vespalib/stllike/string.h>
-
-namespace search::linguistics {
-
-extern const vespalib::string SPANTREE_NAME;
-
-}