about | summary | refs | log | tree | commit | diff | stats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
committer Tor Egge <Tor.Egge@online.no> 2023-10-10 16:49:08 +0200
commit 52cdc152dd61aee3e6c336fd787719ab2eb6d77b (patch)
tree cf6f41eafc184cc748729bf4b57a9c2e928e00b8 /searchlib
parent 3126b4608e11a85b7b58eaf99f4d48a1fb5bacfb (diff)
Consolidate extraction of tokens from annotated string field value.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp | 81
-rw-r--r-- searchlib/src/vespa/searchlib/memoryindex/field_inverter.h | 6
-rw-r--r-- searchlib/src/vespa/searchlib/test/string_field_builder.cpp | 8
-rw-r--r-- searchlib/src/vespa/searchlib/util/CMakeLists.txt | 2
-rw-r--r-- searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp | 9
-rw-r--r-- searchlib/src/vespa/searchlib/util/linguisticsannotation.h | 11
-rw-r--r-- searchlib/src/vespa/searchlib/util/token_extractor.cpp | 86
-rw-r--r-- searchlib/src/vespa/searchlib/util/token_extractor.h | 27
8 files changed, 143 insertions, 87 deletions
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
index 042b57f0486..c469ba67765 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.cpp
@@ -2,12 +2,8 @@
#include "field_inverter.h"
#include "ordered_field_index_inserter.h"
-#include <vespa/document/annotation/alternatespanlist.h>
#include <vespa/document/annotation/annotation.h>
#include <vespa/document/annotation/span.h>
-#include <vespa/document/annotation/spanlist.h>
-#include <vespa/document/annotation/spantree.h>
-#include <vespa/document/annotation/spantreevisitor.h>
#include <vespa/document/fieldvalue/arrayfieldvalue.h>
#include <vespa/document/fieldvalue/document.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
@@ -30,9 +26,7 @@ LOG_SETUP(".searchlib.memoryindex.fieldinverter");
namespace search::memoryindex {
-using document::AlternateSpanList;
using document::Annotation;
-using document::AnnotationType;
using document::ArrayFieldValue;
using document::DataType;
using document::Document;
@@ -40,79 +34,23 @@ using document::DocumentType;
using document::Field;
using document::FieldValue;
using document::IntFieldValue;
-using document::SimpleSpanList;
using document::Span;
-using document::SpanList;
-using document::SpanNode;
-using document::SpanTree;
-using document::SpanTreeVisitor;
using document::StringFieldValue;
using document::StructFieldValue;
using document::WeightedSetFieldValue;
using index::DocIdAndPosOccFeatures;
using index::Schema;
using search::index::schema::CollectionType;
+using search::linguistics::TokenExtractor;
using search::util::URL;
using vespalib::make_string;
using vespalib::datastore::Aligner;
-namespace documentinverterkludge::linguistics {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
-using namespace documentinverterkludge;
-
-namespace {
-
-class SpanFinder : public SpanTreeVisitor {
-public:
- int32_t begin_pos;
- int32_t end_pos;
-
- SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {}
- Span span() { return Span(begin_pos, end_pos - begin_pos); }
-
- void visit(const Span &node) override {
- begin_pos = std::min(begin_pos, node.from());
- end_pos = std::max(end_pos, node.from() + node.length());
- }
- void visit(const SpanList &node) override {
- for (const auto & span_ : node) {
- const_cast<SpanNode *>(span_)->accept(*this);
- }
- }
- void visit(const SimpleSpanList &node) override {
- for (const auto & span_ : node) {
- const_cast<Span &>(span_).accept(*this);
- }
- }
- void visit(const AlternateSpanList &node) override {
- for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
- visit(node.getSubtree(i));
- }
- }
-};
-
-Span
-getSpan(const SpanNode &span_node)
-{
- SpanFinder finder;
- // The SpanNode will not be changed.
- const_cast<SpanNode &>(span_node).accept(finder);
- return finder.span();
-}
-
-}
-
void
FieldInverter::processAnnotations(const StringFieldValue &value, const Document& doc)
{
- _terms.clear();
- StringFieldValue::SpanTrees spanTrees = value.getSpanTrees();
- const SpanTree *tree = StringFieldValue::findTree(spanTrees, linguistics::SPANTREE_NAME);
- if (tree == nullptr) {
+ auto span_trees = value.getSpanTrees();
+ if (!TokenExtractor::extract(false, _terms, span_trees)) {
/* This is wrong unless field is exact match */
const vespalib::string &text = value.getValue();
if (text.empty()) {
@@ -126,19 +64,6 @@ FieldInverter::processAnnotations(const StringFieldValue &value, const Document&
return;
}
const vespalib::string &text = value.getValue();
- for (const Annotation & annotation : *tree) {
- const SpanNode *span = annotation.getSpanNode();
- if ((span != nullptr) && annotation.valid() &&
- (annotation.getType() == *AnnotationType::TERM))
- {
- Span sp = getSpan(*span);
- if (sp.length() != 0) {
- _terms.push_back(std::make_pair(sp,
- annotation.getFieldValue()));
- }
- }
- }
- std::sort(_terms.begin(), _terms.end());
auto it = _terms.begin();
auto ite = _terms.end();
uint32_t wordRef;
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
index 99830e623eb..23e3f9ddfd8 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_inverter.h
@@ -5,6 +5,7 @@
#include "i_field_index_remove_listener.h"
#include <vespa/document/annotation/span.h>
#include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/stllike/allocator.h>
#include <vespa/vespalib/stllike/hash_map.h>
#include <limits>
@@ -179,9 +180,8 @@ private:
index::DocIdAndPosOccFeatures _features;
UInt32Vector _wordRefs;
- using SpanTerm = std::pair<document::Span, const document::FieldValue *>;
- using SpanTermVector = std::vector<SpanTerm>;
- SpanTermVector _terms;
+ using SpanTerm = linguistics::TokenExtractor::SpanTerm;
+ std::vector<SpanTerm> _terms;
// Info about aborted and pending documents.
std::vector<PositionRange> _abortedDocs;
diff --git a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
index e842b7b44d6..d81572d8913 100644
--- a/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
+++ b/searchlib/src/vespa/searchlib/test/string_field_builder.cpp
@@ -7,6 +7,7 @@
#include <vespa/document/annotation/spanlist.h>
#include <vespa/document/annotation/spantree.h>
#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/searchlib/util/linguisticsannotation.h>
#include <vespa/fastlib/text/unicodeutil.h>
#include <vespa/vespalib/text/utf8.h>
@@ -22,15 +23,10 @@ using document::SpanNode;
using document::SpanTree;
using vespalib::Utf8Reader;
using vespalib::Utf8Writer;
+using search::linguistics::SPANTREE_NAME;
namespace search::test {
-namespace {
-
-const vespalib::string SPANTREE_NAME("linguistics");
-
-}
-
StringFieldBuilder::StringFieldBuilder(const DocBuilder& doc_builder)
: _value(),
_span_start(0u),
diff --git a/searchlib/src/vespa/searchlib/util/CMakeLists.txt b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
index 500b08da815..e9661b5e919 100644
--- a/searchlib/src/vespa/searchlib/util/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/util/CMakeLists.txt
@@ -13,10 +13,12 @@ vespa_add_library(searchlib_util OBJECT
filesizecalculator.cpp
fileutil.cpp
foldedstringcompare.cpp
+ linguisticsannotation.cpp
logutil.cpp
rawbuf.cpp
slime_output_raw_buf_adapter.cpp
state_explorer_utils.cpp
+ token_extractor.cpp
url.cpp
DEPENDS
)
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
new file mode 100644
index 00000000000..c8aef561319
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.cpp
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguisticsannotation.h"
+
+namespace search::linguistics {
+
+const vespalib::string SPANTREE_NAME("linguistics"); // single definition of the span tree name; declared extern in linguisticsannotation.h
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/linguisticsannotation.h b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
new file mode 100644
index 00000000000..83a19bed986
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/linguisticsannotation.h
@@ -0,0 +1,11 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace search::linguistics {
+
+extern const vespalib::string SPANTREE_NAME; // span tree name shared by users of this header; defined in linguisticsannotation.cpp as "linguistics"
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
new file mode 100644
index 00000000000..a5d9dc0a4a6
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp
@@ -0,0 +1,86 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "token_extractor.h"
+#include "linguisticsannotation.h"
+#include <vespa/document/annotation/alternatespanlist.h>
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/annotation/spanlist.h>
+#include <vespa/document/annotation/spantreevisitor.h>
+
+using document::AlternateSpanList;
+using document::Annotation;
+using document::AnnotationType;
+using document::SimpleSpanList;
+using document::Span;
+using document::SpanList;
+using document::SpanNode;
+using document::SpanTreeVisitor;
+using document::StringFieldValue;
+
+namespace search::linguistics {
+
+namespace {
+
+class SpanFinder : public SpanTreeVisitor { // visitor that tracks the range [begin_pos, end_pos) covering every Span it visits
+public:
+ int32_t begin_pos; // smallest node.from() seen so far
+ int32_t end_pos; // largest node.from() + node.length() seen so far
+
+ SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} // sentinels (max int32, -1): the first std::min/std::max below replaces them for non-negative offsets
+ Span span() { return Span(begin_pos, end_pos - begin_pos); } // covering span as (from, length); if nothing was visited the sentinels give from=0x7fffffff and a negative length
+
+ void visit(const Span &node) override { // leaf case: widen the tracked range to include this span
+ begin_pos = std::min(begin_pos, node.from());
+ end_pos = std::max(end_pos, node.from() + node.length());
+ }
+ void visit(const SpanList &node) override { // forward the visitor to every child; elements are dereferenced with ->
+ for (const auto & span_ : node) {
+ span_->accept(*this);
+ }
+ }
+ void visit(const SimpleSpanList &node) override { // forward the visitor to every element; elements are accessed with '.'
+ for (const auto & span_ : node) {
+ span_.accept(*this);
+ }
+ }
+ void visit(const AlternateSpanList &node) override { // walk all getNumSubtrees() subtrees, so the range covers every alternative
+ for (size_t i = 0; i < node.getNumSubtrees(); ++i) {
+ visit(node.getSubtree(i)); // overload picked from getSubtree()'s return type - presumably SpanList; TODO confirm in alternatespanlist.h
+ }
+ }
+};
+
+Span
+getSpan(const SpanNode &span_node) // returns the one Span that SpanFinder computes for span_node
+{
+ SpanFinder finder;
+ span_node.accept(finder); // lets finder accumulate begin_pos/end_pos from the node
+ return finder.span();
+}
+
+}
+
+bool // false when trees has no tree named SPANTREE_NAME, true otherwise
+TokenExtractor::extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees)
+{ // replaces terms with one (span, field value) entry per accepted TERM annotation, sorted
+ auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME);
+ if (tree == nullptr) {
+ return false; // NOTE: terms keeps its previous contents here; clear() only runs below
+ }
+ terms.clear();
+ for (const Annotation & annotation : *tree) {
+ const SpanNode *span = annotation.getSpanNode();
+ if ((span != nullptr) && annotation.valid() && // skip annotations with no span node or failing valid()
+ (annotation.getType() == *AnnotationType::TERM)) // keep TERM annotations only
+ {
+ Span sp = getSpan(*span); // reduce the span node to one covering span
+ if (sp.length() != 0 || allow_zero_length_tokens) { // zero-length spans are dropped unless the caller opted in
+ terms.emplace_back(sp, annotation.getFieldValue()); // builds a SpanTerm pair in place
+ }
+ }
+ }
+ std::sort(terms.begin(), terms.end()); // std::pair operator<: by span first, then by the field value pointer
+ return true;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
new file mode 100644
index 00000000000..5796aaa7482
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vector>
+
+namespace document {
+
+class FieldValue;
+class StringFieldValue;
+class Span;
+
+}
+
+namespace search::linguistics {
+
+/*
+ * Extracts term tokens (span plus field value pointer) from the span trees of an annotated string field value.
+ */
+class TokenExtractor {
+public:
+ using SpanTerm = std::pair<document::Span, const document::FieldValue*>; // one token: its span and a pointer to a field value (not owned by the pair)
+ static bool extract(bool allow_zero_length_tokens, std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees); // fills terms and returns true; returns false, leaving terms as is, when the linguistics span tree is missing
+};
+
+}