about | summary | refs | log | tree | commit | diff | stats
path: root/searchlib/src/vespa/searchlib/util/token_extractor.h
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib/src/vespa/searchlib/util/token_extractor.h')
-rw-r--r-- searchlib/src/vespa/searchlib/util/token_extractor.h 63
1 files changed, 63 insertions, 0 deletions
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.h b/searchlib/src/vespa/searchlib/util/token_extractor.h
new file mode 100644
index 00000000000..4955448b0c2
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/util/token_extractor.h
@@ -0,0 +1,63 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/document/annotation/span.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/stllike/string.h>
+#include <vector>
+
+namespace document { // forward declarations of document types named in the declarations below
+
+class Document; // used below only as 'const document::Document*'
+class Span;
+class StringFieldValue;
+
+}
+
+namespace search::linguistics {
+
+/*
+ * Extracts tokens from the span trees of an annotated string field value.
+ * Results are reported as SpanTerm entries (span, word, altered flag).
+ */
+class TokenExtractor {
+public:
+    // One extracted token.
+    struct SpanTerm {
+        document::Span      span;
+        vespalib::stringref word;
+        bool                altered;
+
+        SpanTerm(const document::Span& span_, vespalib::stringref word_, bool altered_) noexcept
+            : span(span_), word(word_), altered(altered_) {}
+
+        // Default term: default-constructed span, empty word, not altered.
+        SpanTerm() noexcept : SpanTerm(document::Span(), vespalib::stringref(), false) {}
+
+        // Orders by span, then by word when the spans are equal; 'altered' is ignored.
+        bool operator<(const SpanTerm& rhs) const noexcept {
+            return (span != rhs.span) ? (span < rhs.span) : (word < rhs.word);
+        }
+    };
+
+    TokenExtractor(const vespalib::string& field_name, size_t max_word_len);
+    ~TokenExtractor();
+
+    // Extracts terms from 'trees' and 'text' into 'terms' (the only non-const argument).
+    void extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const document::Document* doc) const;
+
+    // Returns the sanitized form of 'word'; the rules are in the implementation file.
+    vespalib::stringref sanitize_word(vespalib::stringref word, const document::Document* doc) const;
+
+private:
+    // Internal helper for a single candidate word; may add to 'terms'.
+    void consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const document::Span& span, const document::FieldValue* fv, const document::Document* doc) const;
+
+    // Reference member: the referenced string must stay alive while this object is used.
+    const vespalib::string& _field_name;
+    // NOTE(review): presumably the longest word accepted - confirm in the .cpp.
+    size_t                  _max_word_len;
+};
+
+}