From 52cdc152dd61aee3e6c336fd787719ab2eb6d77b Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 10 Oct 2023 16:49:08 +0200 Subject: Consolidate extraction of tokens from annotated string field value. --- .../tests/proton/docsummary/docsummary_test.cpp | 2 +- .../vespa/searchlib/memoryindex/field_inverter.cpp | 81 +------------------- .../vespa/searchlib/memoryindex/field_inverter.h | 6 +- .../vespa/searchlib/test/string_field_builder.cpp | 8 +- searchlib/src/vespa/searchlib/util/CMakeLists.txt | 2 + .../vespa/searchlib/util/linguisticsannotation.cpp | 9 +++ .../vespa/searchlib/util/linguisticsannotation.h | 11 +++ .../src/vespa/searchlib/util/token_extractor.cpp | 86 ++++++++++++++++++++++ .../src/vespa/searchlib/util/token_extractor.h | 27 +++++++ .../annotation_converter_test.cpp | 2 +- .../vespa/searchsummary/docsummary/CMakeLists.txt | 1 - .../docsummary/annotation_converter.cpp | 70 ++---------------- .../docsummary/linguisticsannotation.cpp | 9 --- .../docsummary/linguisticsannotation.h | 11 --- 14 files changed, 153 insertions(+), 172 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.h create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.cpp create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.h delete mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp delete mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h diff --git a/searchcore/src/tests/proton/docsummary/docsummary_test.cpp b/searchcore/src/tests/proton/docsummary/docsummary_test.cpp index 1fcb1b09d94..8264ec6b680 100644 --- a/searchcore/src/tests/proton/docsummary/docsummary_test.cpp +++ b/searchcore/src/tests/proton/docsummary/docsummary_test.cpp @@ -23,12 +23,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp index 042b57f0486..c469ba67765 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp @@ -2,12 +2,8 @@ #include "field_inverter.h" #include "ordered_field_index_inserter.h" -#include #include #include -#include -#include -#include #include #include #include @@ -30,9 +26,7 @@ LOG_SETUP(".searchlib.memoryindex.fieldinverter"); namespace search::memoryindex { -using document::AlternateSpanList; using document::Annotation; -using document::AnnotationType; using document::ArrayFieldValue; using document::DataType; using document::Document; @@ -40,79 +34,23 @@ using document::DocumentType; using document::Field; using document::FieldValue; using document::IntFieldValue; -using document::SimpleSpanList; using document::Span; -using document::SpanList; -using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using document::StringFieldValue; using document::StructFieldValue; using document::WeightedSetFieldValue; using index::DocIdAndPosOccFeatures; using index::Schema; using search::index::schema::CollectionType; +using search::linguistics::TokenExtractor; using search::util::URL; using vespalib::make_string; using vespalib::datastore::Aligner; -namespace documentinverterkludge::linguistics { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} - -using namespace documentinverterkludge; - -namespace { - -class SpanFinder : public SpanTreeVisitor { -public: - int32_t begin_pos; - int32_t end_pos; - - SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} - Span span() { return Span(begin_pos, end_pos - begin_pos); } - - void visit(const Span &node) override { - begin_pos = std::min(begin_pos, node.from()); - end_pos = std::max(end_pos, node.from() + node.length()); - } - void visit(const SpanList &node) override { - for (const auto & span_ : node) { - const_cast(span_)->accept(*this); - } - } - void visit(const SimpleSpanList &node) override { - for (const auto & span_ : node) { - const_cast(span_).accept(*this); - } - } - void visit(const AlternateSpanList &node) override { - for (size_t i = 0; i < node.getNumSubtrees(); ++i) { - visit(node.getSubtree(i)); - } - } -}; - -Span -getSpan(const SpanNode &span_node) -{ - SpanFinder finder; - // The SpanNode will not be changed. - const_cast(span_node).accept(finder); - return finder.span(); -} - -} - void FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc) { - _terms.clear(); - StringFieldValue::SpanTrees spanTrees = value.getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME); - if (tree == nullptr) { + auto span_trees = value.getSpanTrees(); + if (!TokenExtractor::extract(false, _terms, span_trees)) { /* This is wrong unless field is exact match */ const vespalib::string &text = value.getValue(); if (text.empty()) { @@ -126,19 +64,6 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document& return; } const vespalib::string &text = value.getValue(); - for (const Annotation & annotation : *tree) { - const SpanNode *span = annotation.getSpanNode(); - if ((span != nullptr) && annotation.valid() && - (annotation.getType() == *AnnotationType::TERM)) - { - Span sp = getSpan(*span); - if (sp.length() != 0) { - _terms.push_back(std::make_pair(sp, - annotation.getFieldValue())); - } - } - } - std::sort(_terms.begin(), _terms.end()); auto it = _terms.begin(); auto ite = _terms.end(); uint32_t wordRef; diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h index 99830e623eb..23e3f9ddfd8 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h @@ -5,6 +5,7 @@ #include "i_field_index_remove_listener.h" #include #include +#include #include #include #include @@ -179,9 +180,8 @@ private: index::DocIdAndPosOccFeatures _features; UInt32Vector _wordRefs; - using SpanTerm = std::pair; - using SpanTermVector = std::vector; - SpanTermVector _terms; + using SpanTerm = linguistics::TokenExtractor::SpanTerm; + std::vector _terms; // Info about aborted and pending documents. std::vector _abortedDocs; diff --git a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp index e842b7b44d6..d81572d8913 100644 --- a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp +++ b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -22,15 +23,10 @@ using document::SpanNode; using document::SpanTree; using vespalib::Utf8Reader; using vespalib::Utf8Writer; +using search::linguistics::SPANTREE_NAME; namespace search::test { -namespace { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} - StringFieldBuilder::StringFieldBuilder(const DocBuilder& doc_builder) : _value(), _span_start(0u), diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt index 500b08da815..e9661b5e919 100644 --- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt @@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT filesizecalculator.cpp fileutil.cpp foldedstringcompare.cpp + linguisticsannotation.cpp logutil.cpp rawbuf.cpp slime_output_raw_buf_adapter.cpp state_explorer_utils.cpp + token_extractor.cpp url.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp new file mode 100644 index 00000000000..c8aef561319 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp @@ -0,0 +1,9 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguisticsannotation.h" + +namespace search::linguistics { + +const vespalib::string SPANTREE_NAME("linguistics"); + +} diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h new file mode 100644 index 00000000000..83a19bed986 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h @@ -0,0 +1,11 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include + +namespace search::linguistics { + +extern const vespalib::string SPANTREE_NAME; + +} diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp new file mode 100644 index 00000000000..a5d9dc0a4a6 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -0,0 +1,86 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "token_extractor.h" +#include "linguisticsannotation.h" +#include +#include +#include +#include + +using document::AlternateSpanList; +using document::Annotation; +using document::AnnotationType; +using document::SimpleSpanList; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTreeVisitor; +using document::StringFieldValue; + +namespace search::linguistics { + +namespace { + +class SpanFinder : public SpanTreeVisitor { +public: + int32_t begin_pos; + int32_t end_pos; + + SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} + Span span() { return Span(begin_pos, end_pos - begin_pos); } + + void visit(const Span &node) override { + begin_pos = std::min(begin_pos, node.from()); + end_pos = std::max(end_pos, node.from() + node.length()); + } + void visit(const SpanList &node) override { + for (const auto & span_ : node) { + span_->accept(*this); + } + } + void visit(const SimpleSpanList &node) override { + for (const auto & span_ : node) { + span_.accept(*this); + } + } + void visit(const AlternateSpanList &node) override { + for (size_t i = 0; i < node.getNumSubtrees(); ++i) { + visit(node.getSubtree(i)); + } + } +}; + +Span +getSpan(const SpanNode &span_node) +{ + SpanFinder finder; + span_node.accept(finder); + return finder.span(); +} + +} + +bool +TokenExtractor::extract(bool allow_zero_length_tokens, std::vector& terms, const document::StringFieldValue::SpanTrees& trees) +{ + auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); + if (tree == nullptr) { + return false; + } + terms.clear(); + for (const Annotation & annotation : *tree) { + const SpanNode *span = annotation.getSpanNode(); + if ((span != nullptr) && annotation.valid() && + (annotation.getType() == *AnnotationType::TERM)) + { + Span sp = getSpan(*span); + if (sp.length() != 0 || allow_zero_length_tokens) { + terms.emplace_back(sp, annotation.getFieldValue()); + } + } + } + std::sort(terms.begin(), terms.end()); + return true; +} + +} diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h new file mode 100644 index 00000000000..5796aaa7482 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.h @@ -0,0 +1,27 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace document { + +class FieldValue; +class StringFieldValue; +class Span; + +} + +namespace search::linguistics { + +/* + * Class used to extract tokens from annotated string field value. + */ +class TokenExtractor { +public: + using SpanTerm = std::pair; + static bool extract(bool allow_zero_length_tokens, std::vector& terms, const document::StringFieldValue::SpanTrees& trees); +}; + +} diff --git a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp index 16aa8c70131..0a05e078382 100644 --- a/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp +++ b/searchsummary/src/tests/docsummary/annotation_converter/annotation_converter_test.cpp @@ -9,9 +9,9 @@ #include #include #include +#include #include #include -#include #include #include #include diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index 9d61c61ef7a..32df047c27f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -23,7 +23,6 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_dfw_term_visitor.cpp juniper_query_adapter.cpp juniperproperties.cpp - linguisticsannotation.cpp matched_elements_filter_dfw.cpp positionsdfw.cpp query_term_filter.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index 251cad47922..b4f76d8e39f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -2,29 +2,21 @@ #include "annotation_converter.h" #include "i_juniper_converter.h" -#include "linguisticsannotation.h" -#include #include -#include -#include -#include +#include #include #include +#include +#include #include #include #include -using document::AlternateSpanList; using document::Annotation; -using document::AnnotationType; using document::FieldValue; -using document::SimpleSpanList; using document::Span; -using document::SpanList; -using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using document::StringFieldValue; +using search::linguistics::TokenExtractor; namespace search::docsummary { @@ -36,40 +28,6 @@ getSpanString(vespalib::stringref s, const Span &span) return {s.data() + span.from(), static_cast(span.length())}; } -struct SpanFinder : SpanTreeVisitor { - int32_t begin_pos; - int32_t end_pos; - - SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} - Span span() { return Span(begin_pos, end_pos - begin_pos); } - - void visit(const Span &node) override { - begin_pos = std::min(begin_pos, node.from()); - end_pos = std::max(end_pos, node.from() + node.length()); - } - void visit(const SpanList &node) override { - for (const auto & span_ : node) { - span_->accept(*this); - } - } - void visit(const SimpleSpanList &node) override { - for (const auto & span_ : node) { - span_.accept(*this); - } - } - void visit(const AlternateSpanList &node) override { - for (size_t i = 0; i < node.getNumSubtrees(); ++i) { - visit(node.getSubtree(i)); - } - } -}; - -Span getSpan(const SpanNode &span_node) { - SpanFinder finder; - span_node.accept(finder); - return finder.span(); -} - const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { @@ -125,28 +83,16 @@ AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, For void AnnotationConverter::handleIndexingTerms(const StringFieldValue& value) { - StringFieldValue::SpanTrees trees = value.getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME); - using SpanTerm = std::pair; - using SpanTermVector = std::vector; - if (!tree) { + using SpanTerm = TokenExtractor::SpanTerm; + std::vector terms; + auto span_trees = value.getSpanTrees(); + if (!TokenExtractor::extract(true, terms, span_trees)) { // Treat a string without annotations as a single span. SpanTerm str(Span(0, _text.size()), static_cast(nullptr)); handleAnnotations(str.first, &str, &str + 1); return; } - SpanTermVector terms; - for (const Annotation& annotation : *tree) { - // For now, skip any composite spans. - const auto *span = dynamic_cast(annotation.getSpanNode()); - if ((span != nullptr) && annotation.valid() && - (annotation.getType() == *AnnotationType::TERM)) { - terms.push_back(std::make_pair(getSpan(*span), - annotation.getFieldValue())); - } - } - sort(terms.begin(), terms.end()); auto it = terms.begin(); auto ite = terms.end(); int32_t endPos = 0; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp deleted file mode 100644 index c8aef561319..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "linguisticsannotation.h" - -namespace search::linguistics { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h b/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h deleted file mode 100644 index 83a19bed986..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguisticsannotation.h +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include - -namespace search::linguistics { - -extern const vespalib::string SPANTREE_NAME; - -} -- cgit v1.2.3