diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-10-16 12:58:04 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-10-16 12:58:04 +0200 |
commit | f67d01124f2a19e77c94039e571db3e4c60f4ed1 (patch) | |
tree | 3563782080658bec8986658822c8621e34e79b71 | |
parent | 0ccfe8aab8c12ecd518f882a048f8a13fb2084f1 (diff) |
Add linguistics tokens document field writer.
23 files changed, 196 insertions, 30 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java index ddb6b004070..94b456b3f5e 100644 --- a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java +++ b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java @@ -155,7 +155,8 @@ public class SummaryClass extends Derived { summaryField.getTransform() == SummaryTransform.GEOPOS || summaryField.getTransform() == SummaryTransform.POSITIONS || summaryField.getTransform() == SummaryTransform.MATCHED_ELEMENTS_FILTER || - summaryField.getTransform() == SummaryTransform.MATCHED_ATTRIBUTE_ELEMENTS_FILTER) + summaryField.getTransform() == SummaryTransform.MATCHED_ATTRIBUTE_ELEMENTS_FILTER || + summaryField.getTransform() == SummaryTransform.LINGUISTICS_TOKENS) { return summaryField.getSingleSource(); } else if (summaryField.getTransform().isDynamic()) { diff --git a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java index c1e6dd2aea3..54a4883fa00 100644 --- a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java +++ b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java @@ -92,6 +92,8 @@ public class SummaryClassField { return Type.FEATUREDATA; } else if (transform != null && transform.equals(SummaryTransform.SUMMARYFEATURES)) { return Type.FEATUREDATA; + } else if (transform != null && transform.equals(SummaryTransform.LINGUISTICS_TOKENS)) { + return Type.JSONSTRING; } else { return Type.LONGSTRING; } diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java index 7c6d62580cb..61f68defe40 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java @@ -217,6 +217,8 @@ public class ConvertParsedFields { transform = SummaryTransform.MATCHED_ELEMENTS_FILTER; } else if (parsed.getDynamic()) { transform = SummaryTransform.DYNAMICTEASER; + } else if (parsed.getLinguisticsTokens()) { + transform = SummaryTransform.LINGUISTICS_TOKENS; } if (parsed.getBolded()) { transform = transform.bold(); diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java index 1d5d73635e7..446981f1ba4 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java @@ -18,6 +18,7 @@ class ParsedSummaryField extends ParsedBlock { private boolean isMEO = false; private boolean isFull = false; private boolean isBold = false; + private boolean isLinguisticsTokens = false; private final List<String> sources = new ArrayList<>(); private final List<String> destinations = new ArrayList<>(); @@ -37,6 +38,7 @@ class ParsedSummaryField extends ParsedBlock { boolean getDynamic() { return isDyn; } boolean getFull() { return isFull; } boolean getMatchedElementsOnly() { return isMEO; } + boolean getLinguisticsTokens() { return isLinguisticsTokens; } void addDestination(String dst) { destinations.add(dst); } void addSource(String src) { sources.add(src); } @@ -44,6 +46,7 @@ class ParsedSummaryField extends ParsedBlock { void setDynamic() { this.isDyn = true; } void setFull() { this.isFull = true; } void setMatchedElementsOnly() { this.isMEO = true; } + void setLinguisticsTokens() { this.isLinguisticsTokens = true; } void setType(ParsedType value) { verifyThat(type == null, "Cannot change type from ", type, "to", value); this.type = value; diff --git a/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java b/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java index 1d279242895..e54f8d3e881 100644 --- a/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java +++ b/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java @@ -78,7 +78,8 @@ public class IndexingOutputs extends Processor { return; } dynamicSummary.add(summaryName); - } else if (summaryTransform != SummaryTransform.ATTRIBUTE) { + } else if (summaryTransform != SummaryTransform.ATTRIBUTE && + summaryTransform != SummaryTransform.LINGUISTICS_TOKENS) { staticSummary.add(summaryName); } } diff --git a/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java b/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java index 575a3a748e6..c7c1606951e 100644 --- a/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java +++ b/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java @@ -23,7 +23,8 @@ public enum SummaryTransform { MATCHED_ELEMENTS_FILTER("matchedelementsfilter"), MATCHED_ATTRIBUTE_ELEMENTS_FILTER("matchedattributeelementsfilter"), COPY("copy"), - DOCUMENT_ID("documentid"); + DOCUMENT_ID("documentid"), + LINGUISTICS_TOKENS("linguistics-tokens"); private final String name; diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj index ae4c3b365d8..a5238afc86a 100644 --- a/config-model/src/main/javacc/SchemaParser.jj +++ b/config-model/src/main/javacc/SchemaParser.jj @@ -201,6 +201,7 @@ TOKEN : | < FULL: "full" > | < STATIC: "static" > | < DYNAMIC: "dynamic" > +| < LINGUISTICS_TOKENS: "linguistics-tokens" > | < MATCHED_ELEMENTS_ONLY: "matched-elements-only" > | < SSCONTEXTUAL: "contextual" > | < SSOVERRIDE: "override" > @@ -1128,6 +1129,7 @@ void summaryInFieldShort(ParsedField field) : <COLON> ( <DYNAMIC> { psf.setDynamic(); } | <MATCHED_ELEMENTS_ONLY> { psf.setMatchedElementsOnly(); } | (<FULL> | <STATIC>) { psf.setFull(); } + | <LINGUISTICS_TOKENS> { psf.setLinguisticsTokens(); } ) } @@ -1173,6 +1175,7 @@ void summaryTransform(ParsedSummaryField field) : { } ( <DYNAMIC> { field.setDynamic(); } | <MATCHED_ELEMENTS_ONLY> { field.setMatchedElementsOnly(); } | (<FULL> | <STATIC>) { field.setFull(); } + | <LINGUISTICS_TOKENS> { field.setLinguisticsTokens(); } ) } @@ -2712,6 +2715,7 @@ String identifier() : { } | <INLINE> | <INPUTS> | <INTEGER> + | <LINGUISTICS_TOKENS> | <LITERAL> | <LOCALE> | <LONG> diff --git a/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java b/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java index 1f18a5ed49b..4128baddcb7 100644 --- a/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java @@ -227,6 +227,19 @@ public class SummaryTestCase extends AbstractSchemaTestCase { } @Test + void linguistics_tokenizer_override() throws ParseException { + var schema = buildSchema("field foo type string { indexing: summary }", + joinLines("document-summary bar {", + " summary baz type string {", + " source: foo ", + " linguistics-tokens", + " }", + " from-disk", + "}")); + assertOverride(schema, "baz", SummaryTransform.LINGUISTICS_TOKENS.getName(), "foo", "bar"); + } + + @Test void documentid_summary_transform_requires_disk_access() { assertFalse(SummaryTransform.DOCUMENT_ID.isInMemory()); } diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp index c8d959361ae..beaa43c7af8 100644 --- a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp +++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp @@ -9,6 +9,7 @@ #include <vespa/document/repo/configbuilder.h> #include <vespa/document/repo/fixedtyperepo.h> #include <vespa/searchlib/util/linguisticsannotation.h> +#include <vespa/searchlib/util/token_extractor.h> #include <vespa/searchsummary/docsummary/linguistics_tokens_converter.h> #include <vespa/vespalib/data/simple_buffer.h> #include <vespa/vespalib/data/slime/json_format.h> @@ -25,6 +26,7 @@ using document::SpanTree; using document::StringFieldValue; using search::docsummary::LinguisticsTokensConverter; using search::linguistics::SPANTREE_NAME; +using search::linguistics::TokenExtractor; using vespalib::SimpleBuffer; using vespalib::Slime; using vespalib::slime::JsonFormat; @@ -59,6 +61,8 @@ protected: std::shared_ptr<const DocumentTypeRepo> _repo; const DocumentType* _document_type; document::FixedTypeRepo _fixed_repo; + vespalib::string _dummy_field_name; + TokenExtractor _token_extractor; LinguisticsTokensConverterTest(); ~LinguisticsTokensConverterTest() override; @@ -73,7 +77,9 @@ LinguisticsTokensConverterTest::LinguisticsTokensConverterTest() : testing::Test(), _repo(std::make_unique<DocumentTypeRepo>(get_document_types_config())), _document_type(_repo->getDocumentType("indexingdocument")), - _fixed_repo(*_repo, *_document_type) + _fixed_repo(*_repo, *_document_type), + _dummy_field_name(), + _token_extractor(_dummy_field_name, 100) { } @@ -127,7 +133,7 @@ LinguisticsTokensConverterTest::make_exp_annotated_chinese_string_tokens() vespalib::string LinguisticsTokensConverterTest::convert(const StringFieldValue& fv) { - LinguisticsTokensConverter converter; + LinguisticsTokensConverter converter(_token_extractor); Slime slime; SlimeInserter inserter(slime); converter.convert(fv, inserter); diff --git a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp index 10aedc6d9d0..c20f9570ef8 100644 --- a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp +++ b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp @@ -68,6 +68,7 @@ using search::docsummary::IStringFieldConverter; using search::docsummary::ResultConfig; using search::docsummary::SlimeFiller; using search::docsummary::SlimeFillerFilter; +using vespalib::Memory; using vespalib::SimpleBuffer; using vespalib::Slime; using vespalib::eval::SimpleValue; @@ -146,17 +147,27 @@ get_document_types_config() class MockStringFieldConverter : public IStringFieldConverter { std::vector<vespalib::string> _result; + bool _render_wset_as_array; + bool _insert; public: - MockStringFieldConverter() + MockStringFieldConverter(bool render_wset_as_array, bool insert) : IStringFieldConverter(), - _result() + _result(), + _render_wset_as_array(render_wset_as_array), + _insert(insert) { } ~MockStringFieldConverter() override = default; - void convert(const document::StringFieldValue& input, vespalib::slime::Inserter&) override { + void convert(const document::StringFieldValue& input, vespalib::slime::Inserter& inserter) override { _result.emplace_back(input.getValueRef()); + if (_insert) { + inserter.insertString(Memory(input.getValueRef())); + } } const std::vector<vespalib::string>& get_result() const noexcept { return _result; } + bool render_weighted_set_as_array() const override { + return _render_wset_as_array; + } }; } @@ -188,6 +199,7 @@ protected: void expect_insert_summary_field_with_filter(const vespalib::string& exp, const FieldValue& fv, const std::vector<uint32_t>& matching_elems); void expect_insert_summary_field_with_field_filter(const vespalib::string& exp, const FieldValue& fv, const SlimeFillerFilter* filter); void expect_insert_juniper_field(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv); + void expect_insert_summary_field_with_converter(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter); }; SlimeFillerTest::SlimeFillerTest() @@ -317,7 +329,7 @@ SlimeFillerTest::expect_insert_callback(const std::vector<vespalib::string>& exp { Slime slime; SlimeInserter inserter(slime); - MockStringFieldConverter converter; + MockStringFieldConverter converter(false, false); SlimeFiller filler(inserter, &converter, SlimeFillerFilter::all()); fv.accept(filler); auto act_null = slime_to_string(slime); @@ -361,7 +373,7 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector<vespalib::string> { Slime slime; SlimeInserter inserter(slime); - MockStringFieldConverter converter; + MockStringFieldConverter converter(false, false); SlimeFiller::insert_juniper_field(fv, inserter, converter); auto act_slime = slime_to_string(slime); EXPECT_EQ(exp_slime, act_slime); @@ -369,6 +381,18 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector<vespalib::string> EXPECT_EQ(exp, act); } +void +SlimeFillerTest::expect_insert_summary_field_with_converter(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter) +{ + Slime slime; + SlimeInserter inserter(slime); + SlimeFiller::insert_summary_field(fv, inserter, &converter); + auto act_slime = slime_to_string(slime); + EXPECT_EQ(exp_slime, act_slime); + auto act = converter.get_result(); + EXPECT_EQ(exp, act); +} + TEST_F(SlimeFillerTest, insert_primitive_values) { { @@ -625,4 +649,16 @@ TEST_F(SlimeFillerTest, insert_juniper_field) expect_insert_juniper_field({}, "null", make_empty_array()); } +TEST_F(SlimeFillerTest, string_field_is_not_converted_for_weighted_set_rendering) +{ + MockStringFieldConverter cvt_as_wset(false, true); + expect_insert_summary_field_with_converter({}, R"([{"item":"foo","weight":2},{"item":"bar","weight":4},{"item":"baz","weight":6}])", make_weighted_set(), cvt_as_wset); +} + +TEST_F(SlimeFillerTest, weighted_set_can_be_rendered_as_array) +{ + MockStringFieldConverter cvt_as_array(true, true); + expect_insert_summary_field_with_converter({"foo","bar","baz"}, R"(["foo","bar","baz"])", make_weighted_set(), cvt_as_array); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index e5ae47593e5..57b6004fb61 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -24,6 +24,7 @@ vespa_add_library(searchsummary_docsummary OBJECT juniper_query_adapter.cpp juniperproperties.cpp linguistics_tokens_converter.cpp + linguistics_tokens_dfw.cpp matched_elements_filter_dfw.cpp positionsdfw.cpp query_term_filter.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp index bf267ab9e27..77724305220 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -109,4 +109,10 @@ AnnotationConverter::convert(const StringFieldValue &input, vespalib::slime::Ins _juniper_converter.convert(_out.str(), inserter); } +bool +AnnotationConverter::render_weighted_set_as_array() const +{ + return false; +} + } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h index b6430b35f29..b082269eb7e 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h @@ -33,6 +33,7 @@ public: AnnotationConverter(IJuniperConverter& juniper_converter); ~AnnotationConverter() override; void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp index 2ce809e1cbe..c4823f6beeb 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp @@ -12,6 +12,7 @@ const vespalib::string documentid("documentid"); const vespalib::string dynamic_teaser("dynamicteaser"); const vespalib::string empty("empty"); const vespalib::string geo_position("geopos"); +const vespalib::string linguistics_tokens("linguistics-tokens"); const vespalib::string matched_attribute_elements_filter("matchedattributeelementsfilter"); const vespalib::string matched_elements_filter("matchedelementsfilter"); const vespalib::string positions("positions"); diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h index 26bc33e7e3c..2d0b8c23855 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h @@ -18,6 +18,7 @@ extern const vespalib::string documentid; extern const vespalib::string dynamic_teaser; extern const vespalib::string empty; extern const vespalib::string geo_position; +extern const vespalib::string linguistics_tokens; extern const vespalib::string matched_attribute_elements_filter; extern const vespalib::string matched_elements_filter; extern const vespalib::string positions; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp index 9b7391dd1ab..d19d2994104 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp @@ -9,6 +9,7 @@ #include "geoposdfw.h" #include "idocsumenvironment.h" #include "juniperdfw.h" +#include "linguistics_tokens_dfw.h" #include "matched_elements_filter_dfw.h" #include "positionsdfw.h" #include "rankfeaturesdfw.h" @@ -84,6 +85,12 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie } else { throw_missing_source(command); } + } else if (command == command::linguistics_tokens) { + if (!source.empty()) { + fieldWriter = std::make_unique<LinguisticsTokensDFW>(source); + } else { + throw_missing_source(command); + } } else if (command == command::abs_distance) { if (has_attribute_manager()) { fieldWriter = AbsDistanceDFW::create(source.c_str(), getEnvironment().getAttributeManager()); diff --git a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h index 3b36455d09d..805b5cf3508 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h @@ -17,6 +17,7 @@ class IStringFieldConverter public: virtual ~IStringFieldConverter() = default; virtual void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) = 0; + virtual bool render_weighted_set_as_array() const = 0; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp index 838b0234cdb..b9b9d7c4c97 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp @@ -2,14 +2,11 @@ #include "linguistics_tokens_converter.h" #include <vespa/document/fieldvalue/stringfieldvalue.h> -#include <vespa/searchlib/memoryindex/field_inverter.h> -#include <vespa/searchlib/util/linguisticsannotation.h> #include <vespa/searchlib/util/token_extractor.h> #include <vespa/vespalib/data/slime/slime.h> using document::StringFieldValue; using search::linguistics::TokenExtractor; -using search::memoryindex::FieldInverter; using vespalib::Memory; using vespalib::slime::ArrayInserter; using vespalib::slime::Cursor; @@ -17,14 +14,9 @@ using vespalib::slime::Inserter; namespace search::docsummary { -namespace { - -vespalib::string dummy_field_name; - -} - -LinguisticsTokensConverter::LinguisticsTokensConverter() +LinguisticsTokensConverter::LinguisticsTokensConverter(const TokenExtractor& token_extractor) : IStringFieldConverter(), + _token_extractor(token_extractor), _text() { } @@ -56,8 +48,7 @@ LinguisticsTokensConverter::handle_indexing_terms(const StringFieldValue& value, using SpanTerm = TokenExtractor::SpanTerm; std::vector<SpanTerm> terms; auto span_trees = value.getSpanTrees(); - TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len); - token_extractor.extract(terms, span_trees, _text, nullptr); + _token_extractor.extract(terms, span_trees, _text, nullptr); auto it = terms.begin(); auto ite = terms.end(); auto itn = it; @@ -78,4 +69,10 @@ LinguisticsTokensConverter::convert(const StringFieldValue &input, vespalib::sli handle_indexing_terms(input, inserter); } +bool +LinguisticsTokensConverter::render_weighted_set_as_array() const +{ + return true; +} + } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h index cba3937c822..d752fe89ed9 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h @@ -4,6 +4,8 @@ #include "i_string_field_converter.h" +namespace search::linguistics { class TokenExtractor; } + namespace search::docsummary { /* @@ -13,16 +15,18 @@ namespace search::docsummary { */ class LinguisticsTokensConverter : public IStringFieldConverter { - vespalib::stringref _text; + const linguistics::TokenExtractor& _token_extractor; + vespalib::stringref _text; template <typename ForwardIt> void handle_alternative_index_terms(ForwardIt it, ForwardIt last, vespalib::slime::Inserter& inserter); void handle_index_term(vespalib::stringref word, vespalib::slime::Inserter& inserter); void handle_indexing_terms(const document::StringFieldValue& value, vespalib::slime::Inserter& inserter); public: - LinguisticsTokensConverter(); + LinguisticsTokensConverter(const linguistics::TokenExtractor& token_extractor); ~LinguisticsTokensConverter() override; void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; }; } diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp new file mode 100644 index 00000000000..5e94e270c53 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp @@ -0,0 +1,36 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "linguistics_tokens_dfw.h" +#include "i_docsum_store_document.h" +#include "linguistics_tokens_converter.h" +#include <vespa/searchlib/memoryindex/field_inverter.h> + +using search::memoryindex::FieldInverter; + +namespace search::docsummary { + +LinguisticsTokensDFW::LinguisticsTokensDFW(const vespalib::string& input_field_name) + : DocsumFieldWriter(), + _input_field_name(input_field_name), + _token_extractor(_input_field_name, FieldInverter::max_word_len) +{ +} + +LinguisticsTokensDFW::~LinguisticsTokensDFW() = default; + +bool +LinguisticsTokensDFW::isGenerated() const +{ + return false; +} + +void +LinguisticsTokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const +{ + if (doc != nullptr) { + LinguisticsTokensConverter converter(_token_extractor); + doc->insert_summary_field(_input_field_name, target, &converter); + } +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h new file mode 100644 index 00000000000..a70f0a69e4c --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h @@ -0,0 +1,28 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "docsum_field_writer.h" +#include <vespa/searchlib/util/token_extractor.h> +#include <memory> + +namespace search::docsummary { + +/* + * class for writing annotated string field values from document as + * arrays containing the indexing terms. + */ +class LinguisticsTokensDFW : public DocsumFieldWriter +{ +private: + vespalib::string _input_field_name; + linguistics::TokenExtractor _token_extractor; + +public: + explicit LinguisticsTokensDFW(const vespalib::string& input_field_name); + ~LinguisticsTokensDFW() override; + bool isGenerated() const override; + void insertField(uint32_t docid, const IDocsumStoreDocument* doc, GetDocsumsState& state, vespalib::slime::Inserter& target) const override; +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp index 7266642b18b..080129fe780 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp @@ -285,6 +285,7 @@ SlimeFiller::visit(const WeightedSetFieldValue& value) if (empty_or_empty_after_filtering(value)) { return; } + bool render_as_array = _string_converter != nullptr && _string_converter->render_weighted_set_as_array(); Cursor& a = _inserter.insertArray(); Symbol isym = a.resolve("item"); Symbol wsym = a.resolve("weight"); @@ -305,12 +306,18 @@ SlimeFiller::visit(const WeightedSetFieldValue& value) } ++matching_elements_itr; } - Cursor& o = a.addObject(); - ObjectSymbolInserter ki(o, isym); - SlimeFiller conv(ki); - entry.first->accept(conv); - int weight = static_cast<const IntFieldValue&>(*entry.second).getValue(); - o.setLong(wsym, weight); + if (render_as_array) { + ArrayInserter ai(a); + SlimeFiller conv(ai, _string_converter, SlimeFillerFilter::all()); + entry.first->accept(conv); + } else { + Cursor& o = a.addObject(); + ObjectSymbolInserter ki(o, isym); + SlimeFiller conv(ki); + entry.first->accept(conv); + int weight = static_cast<const IntFieldValue&>(*entry.second).getValue(); + o.setLong(wsym, weight); + } ++idx; } } diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp index b48f556f4be..b94de154a35 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp @@ -132,6 +132,7 @@ public: } ~SnippetModifierJuniperConverter() override = default; void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; + bool render_weighted_set_as_array() const override; }; void @@ -147,6 +148,12 @@ SnippetModifierJuniperConverter::convert(const document::StringFieldValue &input } } +bool +SnippetModifierJuniperConverter::render_weighted_set_as_array() const +{ + return false; +} + /** * Class providing access to a document retrieved from an IDocsumStore * (vsm::DocsumFilter). VSM specific transforms might be applied when |