From 52cdc152dd61aee3e6c336fd787719ab2eb6d77b Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 10 Oct 2023 16:49:08 +0200 Subject: Consolidate extraction of tokens from annotated string field value. --- .../vespa/searchlib/memoryindex/field_inverter.cpp | 81 +------------------- .../vespa/searchlib/memoryindex/field_inverter.h | 6 +- .../vespa/searchlib/test/string_field_builder.cpp | 8 +- searchlib/src/vespa/searchlib/util/CMakeLists.txt | 2 + .../vespa/searchlib/util/linguisticsannotation.cpp | 9 +++ .../vespa/searchlib/util/linguisticsannotation.h | 11 +++ .../src/vespa/searchlib/util/token_extractor.cpp | 86 ++++++++++++++++++++++ .../src/vespa/searchlib/util/token_extractor.h | 27 +++++++ 8 files changed, 143 insertions(+), 87 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp create mode 100644 searchlib/src/vespa/searchlib/util/linguisticsannotation.h create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.cpp create mode 100644 searchlib/src/vespa/searchlib/util/token_extractor.h (limited to 'searchlib/src') diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp index 042b57f0486..c469ba67765 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp @@ -2,12 +2,8 @@ #include "field_inverter.h" #include "ordered_field_index_inserter.h" -#include #include #include -#include -#include -#include #include #include #include @@ -30,9 +26,7 @@ LOG_SETUP(".searchlib.memoryindex.fieldinverter"); namespace search::memoryindex { -using document::AlternateSpanList; using document::Annotation; -using document::AnnotationType; using document::ArrayFieldValue; using document::DataType; using document::Document; @@ -40,79 +34,23 @@ using document::DocumentType; using document::Field; using document::FieldValue; using document::IntFieldValue; -using document::SimpleSpanList; using document::Span; -using document::SpanList; -using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using document::StringFieldValue; using document::StructFieldValue; using document::WeightedSetFieldValue; using index::DocIdAndPosOccFeatures; using index::Schema; using search::index::schema::CollectionType; +using search::linguistics::TokenExtractor; using search::util::URL; using vespalib::make_string; using vespalib::datastore::Aligner; -namespace documentinverterkludge::linguistics { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} - -using namespace documentinverterkludge; - -namespace { - -class SpanFinder : public SpanTreeVisitor { -public: - int32_t begin_pos; - int32_t end_pos; - - SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} - Span span() { return Span(begin_pos, end_pos - begin_pos); } - - void visit(const Span &node) override { - begin_pos = std::min(begin_pos, node.from()); - end_pos = std::max(end_pos, node.from() + node.length()); - } - void visit(const SpanList &node) override { - for (const auto & span_ : node) { - const_cast(span_)->accept(*this); - } - } - void visit(const SimpleSpanList &node) override { - for (const auto & span_ : node) { - const_cast(span_).accept(*this); - } - } - void visit(const AlternateSpanList &node) override { - for (size_t i = 0; i < node.getNumSubtrees(); ++i) { - visit(node.getSubtree(i)); - } - } -}; - -Span -getSpan(const SpanNode &span_node) -{ - SpanFinder finder; - // The SpanNode will not be changed. - const_cast(span_node).accept(finder); - return finder.span(); -} - -} - void FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc) { - _terms.clear(); - StringFieldValue::SpanTrees spanTrees = value.getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME); - if (tree == nullptr) { + auto span_trees = value.getSpanTrees(); + if (!TokenExtractor::extract(false, _terms, span_trees)) { /* This is wrong unless field is exact match */ const vespalib::string &text = value.getValue(); if (text.empty()) { @@ -126,19 +64,6 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document& return; } const vespalib::string &text = value.getValue(); - for (const Annotation & annotation : *tree) { - const SpanNode *span = annotation.getSpanNode(); - if ((span != nullptr) && annotation.valid() && - (annotation.getType() == *AnnotationType::TERM)) - { - Span sp = getSpan(*span); - if (sp.length() != 0) { - _terms.push_back(std::make_pair(sp, - annotation.getFieldValue())); - } - } - } - std::sort(_terms.begin(), _terms.end()); auto it = _terms.begin(); auto ite = _terms.end(); uint32_t wordRef; diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h index 99830e623eb..23e3f9ddfd8 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h @@ -5,6 +5,7 @@ #include "i_field_index_remove_listener.h" #include #include +#include #include #include #include @@ -179,9 +180,8 @@ private: index::DocIdAndPosOccFeatures _features; UInt32Vector _wordRefs; - using SpanTerm = std::pair; - using SpanTermVector = std::vector; - SpanTermVector _terms; + using SpanTerm = linguistics::TokenExtractor::SpanTerm; + std::vector _terms; // Info about aborted and pending documents. std::vector _abortedDocs; diff --git a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp index e842b7b44d6..d81572d8913 100644 --- a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp +++ b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -22,15 +23,10 @@ using document::SpanNode; using document::SpanTree; using vespalib::Utf8Reader; using vespalib::Utf8Writer; +using search::linguistics::SPANTREE_NAME; namespace search::test { -namespace { - -const vespalib::string SPANTREE_NAME("linguistics"); - -} - StringFieldBuilder::StringFieldBuilder(const DocBuilder& doc_builder) : _value(), _span_start(0u), diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt index 500b08da815..e9661b5e919 100644 --- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt @@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT filesizecalculator.cpp fileutil.cpp foldedstringcompare.cpp + linguisticsannotation.cpp logutil.cpp rawbuf.cpp slime_output_raw_buf_adapter.cpp state_explorer_utils.cpp + token_extractor.cpp url.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp new file mode 100644 index 00000000000..c8aef561319 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp @@ -0,0 +1,9 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguisticsannotation.h" + +namespace search::linguistics { + +const vespalib::string SPANTREE_NAME("linguistics"); + +} diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h new file mode 100644 index 00000000000..83a19bed986 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h @@ -0,0 +1,11 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include + +namespace search::linguistics { + +extern const vespalib::string SPANTREE_NAME; + +} diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp new file mode 100644 index 00000000000..a5d9dc0a4a6 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -0,0 +1,86 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "token_extractor.h" +#include "linguisticsannotation.h" +#include +#include +#include +#include + +using document::AlternateSpanList; +using document::Annotation; +using document::AnnotationType; +using document::SimpleSpanList; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTreeVisitor; +using document::StringFieldValue; + +namespace search::linguistics { + +namespace { + +class SpanFinder : public SpanTreeVisitor { +public: + int32_t begin_pos; + int32_t end_pos; + + SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} + Span span() { return Span(begin_pos, end_pos - begin_pos); } + + void visit(const Span &node) override { + begin_pos = std::min(begin_pos, node.from()); + end_pos = std::max(end_pos, node.from() + node.length()); + } + void visit(const SpanList &node) override { + for (const auto & span_ : node) { + span_->accept(*this); + } + } + void visit(const SimpleSpanList &node) override { + for (const auto & span_ : node) { + span_.accept(*this); + } + } + void visit(const AlternateSpanList &node) override { + for (size_t i = 0; i < node.getNumSubtrees(); ++i) { + visit(node.getSubtree(i)); + } + } +}; + +Span +getSpan(const SpanNode &span_node) +{ + SpanFinder finder; + span_node.accept(finder); + return finder.span(); +} + +} + +bool +TokenExtractor::extract(bool allow_zero_length_tokens, std::vector& terms, const document::StringFieldValue::SpanTrees& trees) +{ + auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); + if (tree == nullptr) { + return false; + } + terms.clear(); + for (const Annotation & annotation : *tree) { + const SpanNode *span = annotation.getSpanNode(); + if ((span != nullptr) && annotation.valid() && + (annotation.getType() == *AnnotationType::TERM)) + { + Span sp = getSpan(*span); + if (sp.length() != 0 || allow_zero_length_tokens) { + terms.emplace_back(sp, annotation.getFieldValue()); + } + } + } + std::sort(terms.begin(), terms.end()); + return true; +} + +} diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h new file mode 100644 index 00000000000..5796aaa7482 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.h @@ -0,0 +1,27 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace document { + +class FieldValue; +class StringFieldValue; +class Span; + +} + +namespace search::linguistics { + +/* + * Class used to extract tokens from annotated string field value. + */ +class TokenExtractor { +public: + using SpanTerm = std::pair; + static bool extract(bool allow_zero_length_tokens, std::vector& terms, const document::StringFieldValue::SpanTrees& trees); +}; + +} -- cgit v1.2.3