From f67d01124f2a19e77c94039e571db3e4c60f4ed1 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 16 Oct 2023 12:58:04 +0200 Subject: Add linguistics tokens document field writer. --- .../linguistics_tokens_converter_test.cpp | 10 ++++- .../docsummary/slime_filler/slime_filler_test.cpp | 46 +++++++++++++++++++--- .../vespa/searchsummary/docsummary/CMakeLists.txt | 1 + .../docsummary/annotation_converter.cpp | 6 +++ .../docsummary/annotation_converter.h | 1 + .../docsummary/docsum_field_writer_commands.cpp | 1 + .../docsummary/docsum_field_writer_commands.h | 1 + .../docsummary/docsum_field_writer_factory.cpp | 7 ++++ .../docsummary/i_string_field_converter.h | 1 + .../docsummary/linguistics_tokens_converter.cpp | 21 +++++----- .../docsummary/linguistics_tokens_converter.h | 8 +++- .../docsummary/linguistics_tokens_dfw.cpp | 36 +++++++++++++++++ .../docsummary/linguistics_tokens_dfw.h | 28 +++++++++++++ .../searchsummary/docsummary/slime_filler.cpp | 19 ++++++--- 14 files changed, 159 insertions(+), 27 deletions(-) create mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp create mode 100644 searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h (limited to 'searchsummary/src') diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp index c8d959361ae..beaa43c7af8 100644 --- a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp +++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ using document::SpanTree; using document::StringFieldValue; using search::docsummary::LinguisticsTokensConverter; using search::linguistics::SPANTREE_NAME; +using search::linguistics::TokenExtractor; using vespalib::SimpleBuffer; using vespalib::Slime; using vespalib::slime::JsonFormat; @@ -59,6 +61,8 @@ protected: std::shared_ptr _repo; const DocumentType* _document_type; document::FixedTypeRepo _fixed_repo; + vespalib::string _dummy_field_name; + TokenExtractor _token_extractor; LinguisticsTokensConverterTest(); ~LinguisticsTokensConverterTest() override; @@ -73,7 +77,9 @@ LinguisticsTokensConverterTest::LinguisticsTokensConverterTest() : testing::Test(), _repo(std::make_unique(get_document_types_config())), _document_type(_repo->getDocumentType("indexingdocument")), - _fixed_repo(*_repo, *_document_type) + _fixed_repo(*_repo, *_document_type), + _dummy_field_name(), + _token_extractor(_dummy_field_name, 100) { } @@ -127,7 +133,7 @@ LinguisticsTokensConverterTest::make_exp_annotated_chinese_string_tokens() vespalib::string LinguisticsTokensConverterTest::convert(const StringFieldValue& fv) { - LinguisticsTokensConverter converter; + LinguisticsTokensConverter converter(_token_extractor); Slime slime; SlimeInserter inserter(slime); converter.convert(fv, inserter); diff --git a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp index 10aedc6d9d0..c20f9570ef8 100644 --- a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp +++ b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp @@ -68,6 +68,7 @@ using search::docsummary::IStringFieldConverter; using search::docsummary::ResultConfig; using search::docsummary::SlimeFiller; using search::docsummary::SlimeFillerFilter; +using vespalib::Memory; using vespalib::SimpleBuffer; using vespalib::Slime; using vespalib::eval::SimpleValue; @@ -146,17 +147,27 @@ get_document_types_config() class MockStringFieldConverter : public IStringFieldConverter { std::vector _result; + bool _render_wset_as_array; + bool _insert; public: - MockStringFieldConverter() + MockStringFieldConverter(bool render_wset_as_array, bool insert) : IStringFieldConverter(), - _result() + _result(), + _render_wset_as_array(render_wset_as_array), + _insert(insert) { } ~MockStringFieldConverter() override = default; - void convert(const document::StringFieldValue& input, vespalib::slime::Inserter&) override { + void convert(const document::StringFieldValue& input, vespalib::slime::Inserter& inserter) override { _result.emplace_back(input.getValueRef()); + if (_insert) { + inserter.insertString(Memory(input.getValueRef())); + } } const std::vector& get_result() const noexcept { return _result; } + bool render_weighted_set_as_array() const override { + return _render_wset_as_array; + } }; } @@ -188,6 +199,7 @@ protected: void expect_insert_summary_field_with_filter(const vespalib::string& exp, const FieldValue& fv, const std::vector& matching_elems); void expect_insert_summary_field_with_field_filter(const vespalib::string& exp, const FieldValue& fv, const SlimeFillerFilter* filter); void expect_insert_juniper_field(const std::vector& exp, const vespalib::string& exp_slime, const FieldValue& fv); + void expect_insert_summary_field_with_converter(const std::vector& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter); }; SlimeFillerTest::SlimeFillerTest() @@ -317,7 +329,7 @@ SlimeFillerTest::expect_insert_callback(const std::vector& exp { Slime slime; SlimeInserter inserter(slime); - MockStringFieldConverter converter; + MockStringFieldConverter converter(false, false); SlimeFiller filler(inserter, &converter, SlimeFillerFilter::all()); fv.accept(filler); auto act_null = slime_to_string(slime); @@ -361,7 +373,7 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector { Slime slime; SlimeInserter inserter(slime); - MockStringFieldConverter converter; + MockStringFieldConverter converter(false, false); SlimeFiller::insert_juniper_field(fv, inserter, converter); auto act_slime = slime_to_string(slime); EXPECT_EQ(exp_slime, act_slime); @@ -369,6 +381,18 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector EXPECT_EQ(exp, act); } +void +SlimeFillerTest::expect_insert_summary_field_with_converter(const std::vector& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter) +{ + Slime slime; + SlimeInserter inserter(slime); + SlimeFiller::insert_summary_field(fv, inserter, &converter); + auto act_slime = slime_to_string(slime); + EXPECT_EQ(exp_slime, act_slime); + auto act = converter.get_result(); + EXPECT_EQ(exp, act); +} + TEST_F(SlimeFillerTest, insert_primitive_values) { { @@ -625,4 +649,16 @@ TEST_F(SlimeFillerTest, insert_juniper_field) expect_insert_juniper_field({}, "null", make_empty_array()); } +TEST_F(SlimeFillerTest, string_field_is_not_converted_for_weighted_set_rendering) +{ + MockStringFieldConverter cvt_as_wset(false, true); + expect_insert_summary_field_with_converter({}, R"([{"item":"foo","weight":2},{"item":"bar","weight":4},{"item":"baz","weight":6}])", make_weighted_set(), cvt_as_wset); +} + +TEST_F(SlimeFillerTest, weighted_set_can_be_rendered_as_array) +{ + MockStringFieldConverter cvt_as_array(true, true); + expect_insert_summary_field_with_converter({"foo","bar","baz"}, R"(["foo","bar","baz"])", make_weighted_set(), cvt_as_array); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index e5ae47593e5..57b6004fb61 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -24,6 +24,7 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_query_adapter.cpp juniperproperties.cpp linguistics_tokens_converter.cpp + linguistics_tokens_dfw.cpp matched_elements_filter_dfw.cpp positionsdfw.cpp query_term_filter.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index bf267ab9e27..77724305220 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -109,4 +109,10 @@ AnnotationConverter::convert(const StringFieldValue &input, vespalib::slime::Ins _juniper_converter.convert(_out.str(), inserter); } +bool +AnnotationConverter::render_weighted_set_as_array() const +{ + return false; +} + } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h index b6430b35f29..b082269eb7e 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h @@ -33,6 +33,7 @@ public: AnnotationConverter(IJuniperConverter& juniper_converter); ~AnnotationConverter() override; void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp index 2ce809e1cbe..c4823f6beeb 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp @@ -12,6 +12,7 @@ const vespalib::string documentid("documentid"); const vespalib::string dynamic_teaser("dynamicteaser"); const vespalib::string empty("empty"); const vespalib::string geo_position("geopos"); +const vespalib::string linguistics_tokens("linguistics-tokens"); const vespalib::string matched_attribute_elements_filter("matchedattributeelementsfilter"); const vespalib::string matched_elements_filter("matchedelementsfilter"); const vespalib::string positions("positions"); diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h index 26bc33e7e3c..2d0b8c23855 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h @@ -18,6 +18,7 @@ extern const vespalib::string documentid; extern const vespalib::string dynamic_teaser; extern const vespalib::string empty; extern const vespalib::string geo_position; +extern const vespalib::string linguistics_tokens; extern const vespalib::string matched_attribute_elements_filter; extern const vespalib::string matched_elements_filter; extern const vespalib::string positions; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp index 9b7391dd1ab..d19d2994104 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp @@ -9,6 +9,7 @@ #include "geoposdfw.h" #include "idocsumenvironment.h" #include "juniperdfw.h" +#include "linguistics_tokens_dfw.h" #include "matched_elements_filter_dfw.h" #include "positionsdfw.h" #include "rankfeaturesdfw.h" @@ -84,6 +85,12 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie } else { throw_missing_source(command); } + } else if (command == command::linguistics_tokens) { + if (!source.empty()) { + fieldWriter = std::make_unique(source); + } else { + throw_missing_source(command); + } } else if (command == command::abs_distance) { if (has_attribute_manager()) { fieldWriter = AbsDistanceDFW::create(source.c_str(), getEnvironment().getAttributeManager()); diff --git a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h index 3b36455d09d..805b5cf3508 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h @@ -17,6 +17,7 @@ class IStringFieldConverter public: virtual ~IStringFieldConverter() = default; virtual void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) = 0; + virtual bool render_weighted_set_as_array() const = 0; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp index 838b0234cdb..b9b9d7c4c97 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp @@ -2,14 +2,11 @@ #include "linguistics_tokens_converter.h" #include -#include -#include #include #include using document::StringFieldValue; using search::linguistics::TokenExtractor; -using search::memoryindex::FieldInverter; using vespalib::Memory; using vespalib::slime::ArrayInserter; using vespalib::slime::Cursor; @@ -17,14 +14,9 @@ using vespalib::slime::Inserter; namespace search::docsummary { -namespace { - -vespalib::string dummy_field_name; - -} - -LinguisticsTokensConverter::LinguisticsTokensConverter() +LinguisticsTokensConverter::LinguisticsTokensConverter(const TokenExtractor& token_extractor) : IStringFieldConverter(), + _token_extractor(token_extractor), _text() { } @@ -56,8 +48,7 @@ LinguisticsTokensConverter::handle_indexing_terms(const StringFieldValue& value, using SpanTerm = TokenExtractor::SpanTerm; std::vector terms; auto span_trees = value.getSpanTrees(); - TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len); - token_extractor.extract(terms, span_trees, _text, nullptr); + _token_extractor.extract(terms, span_trees, _text, nullptr); auto it = terms.begin(); auto ite = terms.end(); auto itn = it; @@ -78,4 +69,10 @@ LinguisticsTokensConverter::convert(const StringFieldValue &input, vespalib::sli handle_indexing_terms(input, inserter); } +bool +LinguisticsTokensConverter::render_weighted_set_as_array() const +{ + return true; +} + } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h index cba3937c822..d752fe89ed9 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h @@ -4,6 +4,8 @@ #include "i_string_field_converter.h" +namespace search::linguistics { class TokenExtractor; } + namespace search::docsummary { /* @@ -13,16 +15,18 @@ namespace search::docsummary { */ class LinguisticsTokensConverter : public IStringFieldConverter { - vespalib::stringref _text; + const linguistics::TokenExtractor& _token_extractor; + vespalib::stringref _text; template void handle_alternative_index_terms(ForwardIt it, ForwardIt last, vespalib::slime::Inserter& inserter); void handle_index_term(vespalib::stringref word, vespalib::slime::Inserter& inserter); void handle_indexing_terms(const document::StringFieldValue& value, vespalib::slime::Inserter& inserter); public: - LinguisticsTokensConverter(); + LinguisticsTokensConverter(const linguistics::TokenExtractor& token_extractor); ~LinguisticsTokensConverter() override; void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp new file mode 100644 index 00000000000..5e94e270c53 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp @@ -0,0 +1,36 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguistics_tokens_dfw.h" +#include "i_docsum_store_document.h" +#include "linguistics_tokens_converter.h" +#include + +using search::memoryindex::FieldInverter; + +namespace search::docsummary { + +LinguisticsTokensDFW::LinguisticsTokensDFW(const vespalib::string& input_field_name) + : DocsumFieldWriter(), + _input_field_name(input_field_name), + _token_extractor(_input_field_name, FieldInverter::max_word_len) +{ +} + +LinguisticsTokensDFW::~LinguisticsTokensDFW() = default; + +bool +LinguisticsTokensDFW::isGenerated() const +{ + return false; +} + +void +LinguisticsTokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const +{ + if (doc != nullptr) { + LinguisticsTokensConverter converter(_token_extractor); + doc->insert_summary_field(_input_field_name, target, &converter); + } +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h new file mode 100644 index 00000000000..a70f0a69e4c --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h @@ -0,0 +1,28 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "docsum_field_writer.h" +#include +#include + +namespace search::docsummary { + +/* + * class for writing annotated string field values from document as + * arrays containing the indexing terms. + */ +class LinguisticsTokensDFW : public DocsumFieldWriter +{ +private: + vespalib::string _input_field_name; + linguistics::TokenExtractor _token_extractor; + +public: + explicit LinguisticsTokensDFW(const vespalib::string& input_field_name); + ~LinguisticsTokensDFW() override; + bool isGenerated() const override; + void insertField(uint32_t docid, const IDocsumStoreDocument* doc, GetDocsumsState& state, vespalib::slime::Inserter& target) const override; +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp index 7266642b18b..080129fe780 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp @@ -285,6 +285,7 @@ SlimeFiller::visit(const WeightedSetFieldValue& value) if (empty_or_empty_after_filtering(value)) { return; } + bool render_as_array = _string_converter != nullptr && _string_converter->render_weighted_set_as_array(); Cursor& a = _inserter.insertArray(); Symbol isym = a.resolve("item"); Symbol wsym = a.resolve("weight"); @@ -305,12 +306,18 @@ SlimeFiller::visit(const WeightedSetFieldValue& value) } ++matching_elements_itr; } - Cursor& o = a.addObject(); - ObjectSymbolInserter ki(o, isym); - SlimeFiller conv(ki); - entry.first->accept(conv); - int weight = static_cast(*entry.second).getValue(); - o.setLong(wsym, weight); + if (render_as_array) { + ArrayInserter ai(a); + SlimeFiller conv(ai, _string_converter, SlimeFillerFilter::all()); + entry.first->accept(conv); + } else { + Cursor& o = a.addObject(); + ObjectSymbolInserter ki(o, isym); + SlimeFiller conv(ki); + entry.first->accept(conv); + int weight = static_cast(*entry.second).getValue(); + o.setLong(wsym, weight); + } ++idx; } } -- cgit v1.2.3