diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-10-10 16:49:08 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-10-10 16:49:08 +0200 |
commit | 52cdc152dd61aee3e6c336fd787719ab2eb6d77b (patch) | |
tree | cf6f41eafc184cc748729bf4b57a9c2e928e00b8 /searchsummary | |
parent | 3126b4608e11a85b7b58eaf99f4d48a1fb5bacfb (diff) |
Consolidate extraction of tokens from annotated string field value.
Diffstat (limited to 'searchsummary')
5 files changed, 9 insertions, 84 deletions
diff --git a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp index 16aa8c70131..0a05e078382 100644 --- a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp +++ b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp @@ -9,9 +9,9 @@ #include <vespa/document/repo/configbuilder.h> #include <vespa/document/repo/fixedtyperepo.h> #include <vespa/juniper/juniper_separators.h> +#include <vespa/searchlib/util/linguisticsannotation.h> #include <vespa/searchsummary/docsummary/annotation_converter.h> #include <vespa/searchsummary/docsummary/i_juniper_converter.h> -#include <vespa/searchsummary/docsummary/linguisticsannotation.h> #include <vespa/vespalib/data/slime/slime.h> #include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/stllike/asciistream.h> diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index 9d61c61ef7a..32df047c27f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -23,7 +23,6 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_dfw_term_visitor.cpp juniper_query_adapter.cpp juniperproperties.cpp - linguisticsannotation.cpp matched_elements_filter_dfw.cpp positionsdfw.cpp query_term_filter.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index 251cad47922..b4f76d8e39f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -2,29 +2,21 @@ #include "annotation_converter.h" #include "i_juniper_converter.h" -#include "linguisticsannotation.h" -#include <vespa/document/annotation/alternatespanlist.h> #include <vespa/document/annotation/annotation.h> -#include <vespa/document/annotation/spantree.h> -#include <vespa/document/annotation/spantreevisitor.h> -#include <vespa/document/datatype/annotationtype.h> +#include <vespa/document/annotation/span.h> #include <vespa/document/fieldvalue/stringfieldvalue.h> #include <vespa/juniper/juniper_separators.h> +#include <vespa/searchlib/util/linguisticsannotation.h> +#include <vespa/searchlib/util/token_extractor.h> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/util/exceptions.h> #include <utility> -using document::AlternateSpanList; using document::Annotation; -using document::AnnotationType; using document::FieldValue; -using document::SimpleSpanList; using document::Span; -using document::SpanList; -using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using document::StringFieldValue; +using search::linguistics::TokenExtractor; namespace search::docsummary { @@ -36,40 +28,6 @@ getSpanString(vespalib::stringref s, const Span &span) return {s.data() + span.from(), static_cast<size_t>(span.length())}; } -struct SpanFinder : SpanTreeVisitor { - int32_t begin_pos; - int32_t end_pos; - - SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} - Span span() { return Span(begin_pos, end_pos - begin_pos); } - - void visit(const Span &node) override { - begin_pos = std::min(begin_pos, node.from()); - end_pos = std::max(end_pos, node.from() + node.length()); - } - void visit(const SpanList &node) override { - for (const auto & span_ : node) { - span_->accept(*this); - } - } - void visit(const SimpleSpanList &node) override { - for (const auto & span_ : node) { - span_.accept(*this); - } - } - void visit(const AlternateSpanList &node) override { - for (size_t i = 0; i < node.getNumSubtrees(); ++i) { - visit(node.getSubtree(i)); - } - } -}; - -Span getSpan(const SpanNode &span_node) { - SpanFinder finder; - span_node.accept(finder); - return finder.span(); -} - const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { @@ -125,28 +83,16 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For void AnnotationConverter::handleIndexingTerms(const StringFieldValue& value) { - StringFieldValue::SpanTrees trees = value.getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME); - using SpanTerm = std::pair<Span, const FieldValue *>; - using SpanTermVector = std::vector<SpanTerm>; - if (!tree) { + using SpanTerm = TokenExtractor::SpanTerm; + std::vector<SpanTerm> terms; + auto span_trees = value.getSpanTrees(); + if (!TokenExtractor::extract(true, terms, span_trees)) { // Treat a string without annotations as a single span. SpanTerm str(Span(0, _text.size()), static_cast<const FieldValue*>(nullptr)); handleAnnotations(str.first, &str, &str + 1); return; } - SpanTermVector terms; - for (const Annotation& annotation : *tree) { - // For now, skip any composite spans. - const auto *span = dynamic_cast<const Span*>(annotation.getSpanNode()); - if ((span != nullptr) && annotation.valid() && - (annotation.getType() == *AnnotationType::TERM)) { - terms.push_back(std::make_pair(getSpan(*span), - annotation.getFieldValue())); - } - } - sort(terms.begin(), terms.end()); auto it = terms.begin(); auto ite = terms.end(); int32_t endPos = 0; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp deleted file mode 100644 index c8aef561319..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "linguisticsannotation.h" - -namespace search::linguistics { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h deleted file mode 100644 index 83a19bed986..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <vespa/vespalib/stllike/string.h> - -namespace search::linguistics { - -extern const vespalib::string SPANTREE_NAME; - -} |