diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-10-12 14:34:04 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-10-12 14:34:04 +0200 |
commit | 486cea93db4e6e4239a1932737611bcfbf541996 (patch) | |
tree | f4e6e0d7614c3fc867f7f5e7bd1cbb13cb919a5f /searchsummary/src | |
parent | 4d4fa30cdb221decb7c1462f31635046748c50b2 (diff) |
Add a converter from an annotated string field value to an array of
strings representing the indexing terms.
Multiple indexing terms for the same position are placed in a nested array.
Diffstat (limited to 'searchsummary/src')
5 files changed, 301 insertions, 0 deletions
diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt new file mode 100644 index 00000000000..d9510c3a2b3 --- /dev/null +++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_linguistics_tokens_converter_test_app TEST + SOURCES + linguistics_tokens_converter_test.cpp + DEPENDS + searchsummary + GTest::gtest +) + +vespa_add_test(NAME searchsummary_linguistics_tokens_converter_test_app COMMAND searchsummary_linguistics_tokens_converter_test_app) diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp new file mode 100644 index 00000000000..c8d959361ae --- /dev/null +++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp @@ -0,0 +1,172 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/document/annotation/annotation.h> +#include <vespa/document/annotation/span.h> +#include <vespa/document/annotation/spanlist.h> +#include <vespa/document/annotation/spantree.h> +#include <vespa/document/datatype/annotationtype.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> +#include <vespa/document/repo/fixedtyperepo.h> +#include <vespa/searchlib/util/linguisticsannotation.h> +#include <vespa/searchsummary/docsummary/linguistics_tokens_converter.h> +#include <vespa/vespalib/data/simple_buffer.h> +#include <vespa/vespalib/data/slime/json_format.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/gtest/gtest.h> + +using document::Annotation; +using document::AnnotationType; +using document::DocumentType; +using document::DocumentTypeRepo; +using document::Span; +using document::SpanList; +using document::SpanTree; +using document::StringFieldValue; +using search::docsummary::LinguisticsTokensConverter; +using search::linguistics::SPANTREE_NAME; +using vespalib::SimpleBuffer; +using vespalib::Slime; +using vespalib::slime::JsonFormat; +using vespalib::slime::SlimeInserter; + +namespace { + +vespalib::string +slime_to_string(const Slime& slime) +{ + SimpleBuffer buf; + JsonFormat::encode(slime, buf, true); + return buf.get().make_string(); +} + +DocumenttypesConfig +get_document_types_config() +{ + using namespace document::config_builder; + DocumenttypesConfigBuilderHelper builder; + builder.document(42, "indexingdocument", + Struct("indexingdocument.header"), + Struct("indexingdocument.body")); + return builder.config(); +} + +} + +class LinguisticsTokensConverterTest : public testing::Test +{ +protected: + std::shared_ptr<const DocumentTypeRepo> _repo; + const DocumentType* _document_type; + document::FixedTypeRepo _fixed_repo; + + LinguisticsTokensConverterTest(); + ~LinguisticsTokensConverterTest() override; + void set_span_tree(StringFieldValue& value, 
std::unique_ptr<SpanTree> tree); + StringFieldValue make_annotated_string(bool alt_tokens); + StringFieldValue make_annotated_chinese_string(); + vespalib::string make_exp_annotated_chinese_string_tokens(); + vespalib::string convert(const StringFieldValue& fv); +}; + +LinguisticsTokensConverterTest::LinguisticsTokensConverterTest() + : testing::Test(), + _repo(std::make_unique<DocumentTypeRepo>(get_document_types_config())), + _document_type(_repo->getDocumentType("indexingdocument")), + _fixed_repo(*_repo, *_document_type) +{ +} + +LinguisticsTokensConverterTest::~LinguisticsTokensConverterTest() = default; + +void +LinguisticsTokensConverterTest::set_span_tree(StringFieldValue & value, std::unique_ptr<SpanTree> tree) +{ + StringFieldValue::SpanTrees trees; + trees.push_back(std::move(tree)); + value.setSpanTrees(trees, _fixed_repo); +} + +StringFieldValue +LinguisticsTokensConverterTest::make_annotated_string(bool alt_tokens) +{ + auto span_list_up = std::make_unique<SpanList>(); + auto span_list = span_list_up.get(); + auto tree = std::make_unique<SpanTree>(SPANTREE_NAME, std::move(span_list_up)); + tree->annotate(span_list->add(std::make_unique<Span>(0, 3)), *AnnotationType::TERM); + if (alt_tokens) { + tree->annotate(span_list->add(std::make_unique<Span>(4, 3)), *AnnotationType::TERM); + } + tree->annotate(span_list->add(std::make_unique<Span>(4, 3)), + Annotation(*AnnotationType::TERM, std::make_unique<StringFieldValue>("baz"))); + StringFieldValue value("foo bar"); + set_span_tree(value, std::move(tree)); + return value; +} + +StringFieldValue +LinguisticsTokensConverterTest::make_annotated_chinese_string() +{ + auto span_list_up = std::make_unique<SpanList>(); + auto span_list = span_list_up.get(); + auto tree = std::make_unique<SpanTree>(SPANTREE_NAME, std::move(span_list_up)); + // These chinese characters each use 3 bytes in their UTF8 encoding. 
+ tree->annotate(span_list->add(std::make_unique<Span>(0, 15)), *AnnotationType::TERM); + tree->annotate(span_list->add(std::make_unique<Span>(15, 9)), *AnnotationType::TERM); + StringFieldValue value("我就是那个大灰狼"); + set_span_tree(value, std::move(tree)); + return value; +} + +vespalib::string +LinguisticsTokensConverterTest::make_exp_annotated_chinese_string_tokens() +{ + return R"(["我就是那个","大灰狼"])"; +} + +vespalib::string +LinguisticsTokensConverterTest::convert(const StringFieldValue& fv) +{ + LinguisticsTokensConverter converter; + Slime slime; + SlimeInserter inserter(slime); + converter.convert(fv, inserter); + return slime_to_string(slime); +} + +TEST_F(LinguisticsTokensConverterTest, convert_empty_string) +{ + vespalib::string exp(R"([])"); + StringFieldValue plain_string(""); + EXPECT_EQ(exp, convert(plain_string)); +} + +TEST_F(LinguisticsTokensConverterTest, convert_plain_string) +{ + vespalib::string exp(R"(["Foo Bar Baz"])"); + StringFieldValue plain_string("Foo Bar Baz"); + EXPECT_EQ(exp, convert(plain_string)); +} + +TEST_F(LinguisticsTokensConverterTest, convert_annotated_string) +{ + vespalib::string exp(R"(["foo","baz"])"); + auto annotated_string = make_annotated_string(false); + EXPECT_EQ(exp, convert(annotated_string)); +} + +TEST_F(LinguisticsTokensConverterTest, convert_annotated_string_with_alternatives) +{ + vespalib::string exp(R"(["foo",["bar","baz"]])"); + auto annotated_string = make_annotated_string(true); + EXPECT_EQ(exp, convert(annotated_string)); +} + +TEST_F(LinguisticsTokensConverterTest, convert_annotated_chinese_string) +{ + auto exp = make_exp_annotated_chinese_string_tokens(); + auto annotated_chinese_string = make_annotated_chinese_string(); + EXPECT_EQ(exp, convert(annotated_chinese_string)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index 32df047c27f..e5ae47593e5 100644 --- 
a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -23,6 +23,7 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_dfw_term_visitor.cpp juniper_query_adapter.cpp juniperproperties.cpp + linguistics_tokens_converter.cpp matched_elements_filter_dfw.cpp positionsdfw.cpp query_term_filter.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp new file mode 100644 index 00000000000..838b0234cdb --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp @@ -0,0 +1,81 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguistics_tokens_converter.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/searchlib/memoryindex/field_inverter.h> +#include <vespa/searchlib/util/linguisticsannotation.h> +#include <vespa/searchlib/util/token_extractor.h> +#include <vespa/vespalib/data/slime/slime.h> + +using document::StringFieldValue; +using search::linguistics::TokenExtractor; +using search::memoryindex::FieldInverter; +using vespalib::Memory; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; + +namespace search::docsummary { + +namespace { + +vespalib::string dummy_field_name; + +} + +LinguisticsTokensConverter::LinguisticsTokensConverter() + : IStringFieldConverter(), + _text() +{ +} + +LinguisticsTokensConverter::~LinguisticsTokensConverter() = default; + +template <typename ForwardIt> +void +LinguisticsTokensConverter::handle_alternative_index_terms(ForwardIt it, ForwardIt last, Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + for (;it != last; ++it) { + handle_index_term(it->word, ai); + } +} + +void 
+LinguisticsTokensConverter::handle_index_term(vespalib::stringref word, Inserter& inserter) +{ + inserter.insertString(Memory(word)); +} + +void +LinguisticsTokensConverter::handle_indexing_terms(const StringFieldValue& value, vespalib::slime::Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + using SpanTerm = TokenExtractor::SpanTerm; + std::vector<SpanTerm> terms; + auto span_trees = value.getSpanTrees(); + TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len); + token_extractor.extract(terms, span_trees, _text, nullptr); + auto it = terms.begin(); + auto ite = terms.end(); + auto itn = it; + for (; it != ite; it = itn) { + for (; itn != ite && itn->span == it->span; ++itn); + if ((itn - it) > 1) { + handle_alternative_index_terms(it, itn, ai); + } else { + handle_index_term(it->word, ai); + } + } +} + +void +LinguisticsTokensConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter) +{ + _text = input.getValueRef(); + handle_indexing_terms(input, inserter); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h new file mode 100644 index 00000000000..74d1e2ab1f9 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h @@ -0,0 +1,37 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_string_field_converter.h" + +namespace document { + +class FieldValue; +class Span; + +} + +namespace search::docsummary { + +class IJuniperConverter; + +/* + * Class converting a string field value with annotations into an array + * containing the index terms. Multiple index terms at same position are + * placed in a nested array. 
+ */ +class LinguisticsTokensConverter : public IStringFieldConverter +{ + vespalib::stringref _text; + + template <typename ForwardIt> + void handle_alternative_index_terms(ForwardIt it, ForwardIt last, vespalib::slime::Inserter& inserter); + void handle_index_term(vespalib::stringref word, vespalib::slime::Inserter& inserter); + void handle_indexing_terms(const document::StringFieldValue& value, vespalib::slime::Inserter& inserter); +public: + LinguisticsTokensConverter(); + ~LinguisticsTokensConverter() override; + void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; +}; + +} |