summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2023-10-16 12:58:04 +0200
committer Tor Egge <Tor.Egge@online.no> 2023-10-16 12:58:04 +0200
commit f67d01124f2a19e77c94039e571db3e4c60f4ed1 (patch)
tree 3563782080658bec8986658822c8621e34e79b71
parent 0ccfe8aab8c12ecd518f882a048f8a13fb2084f1 (diff)
Add linguistics tokens document field writer.
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java 3
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java 2
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java 2
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java 3
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java 3
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java 3
-rw-r--r-- config-model/src/main/javacc/SchemaParser.jj 4
-rw-r--r-- config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java 13
-rw-r--r-- searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp 10
-rw-r--r-- searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp 46
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp 6
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp 7
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h 1
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp 21
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h 8
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp 36
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h 28
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp 19
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp 7
23 files changed, 196 insertions, 30 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java
index ddb6b004070..94b456b3f5e 100644
--- a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java
+++ b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClass.java
@@ -155,7 +155,8 @@ public class SummaryClass extends Derived {
summaryField.getTransform() == SummaryTransform.GEOPOS ||
summaryField.getTransform() == SummaryTransform.POSITIONS ||
summaryField.getTransform() == SummaryTransform.MATCHED_ELEMENTS_FILTER ||
- summaryField.getTransform() == SummaryTransform.MATCHED_ATTRIBUTE_ELEMENTS_FILTER)
+ summaryField.getTransform() == SummaryTransform.MATCHED_ATTRIBUTE_ELEMENTS_FILTER ||
+ summaryField.getTransform() == SummaryTransform.LINGUISTICS_TOKENS)
{
return summaryField.getSingleSource();
} else if (summaryField.getTransform().isDynamic()) {
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java
index c1e6dd2aea3..54a4883fa00 100644
--- a/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java
+++ b/config-model/src/main/java/com/yahoo/schema/derived/SummaryClassField.java
@@ -92,6 +92,8 @@ public class SummaryClassField {
return Type.FEATUREDATA;
} else if (transform != null && transform.equals(SummaryTransform.SUMMARYFEATURES)) {
return Type.FEATUREDATA;
+ } else if (transform != null && transform.equals(SummaryTransform.LINGUISTICS_TOKENS)) {
+ return Type.JSONSTRING;
} else {
return Type.LONGSTRING;
}
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
index 7c6d62580cb..61f68defe40 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedFields.java
@@ -217,6 +217,8 @@ public class ConvertParsedFields {
transform = SummaryTransform.MATCHED_ELEMENTS_FILTER;
} else if (parsed.getDynamic()) {
transform = SummaryTransform.DYNAMICTEASER;
+ } else if (parsed.getLinguisticsTokens()) {
+ transform = SummaryTransform.LINGUISTICS_TOKENS;
}
if (parsed.getBolded()) {
transform = transform.bold();
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java
index 1d5d73635e7..446981f1ba4 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedSummaryField.java
@@ -18,6 +18,7 @@ class ParsedSummaryField extends ParsedBlock {
private boolean isMEO = false;
private boolean isFull = false;
private boolean isBold = false;
+ private boolean isLinguisticsTokens = false;
private final List<String> sources = new ArrayList<>();
private final List<String> destinations = new ArrayList<>();
@@ -37,6 +38,7 @@ class ParsedSummaryField extends ParsedBlock {
boolean getDynamic() { return isDyn; }
boolean getFull() { return isFull; }
boolean getMatchedElementsOnly() { return isMEO; }
+ boolean getLinguisticsTokens() { return isLinguisticsTokens; }
void addDestination(String dst) { destinations.add(dst); }
void addSource(String src) { sources.add(src); }
@@ -44,6 +46,7 @@ class ParsedSummaryField extends ParsedBlock {
void setDynamic() { this.isDyn = true; }
void setFull() { this.isFull = true; }
void setMatchedElementsOnly() { this.isMEO = true; }
+ void setLinguisticsTokens() { this.isLinguisticsTokens = true; }
void setType(ParsedType value) {
verifyThat(type == null, "Cannot change type from ", type, "to", value);
this.type = value;
diff --git a/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java b/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java
index 1d279242895..e54f8d3e881 100644
--- a/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java
+++ b/config-model/src/main/java/com/yahoo/schema/processing/IndexingOutputs.java
@@ -78,7 +78,8 @@ public class IndexingOutputs extends Processor {
return;
}
dynamicSummary.add(summaryName);
- } else if (summaryTransform != SummaryTransform.ATTRIBUTE) {
+ } else if (summaryTransform != SummaryTransform.ATTRIBUTE &&
+ summaryTransform != SummaryTransform.LINGUISTICS_TOKENS) {
staticSummary.add(summaryName);
}
}
diff --git a/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java b/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java
index 575a3a748e6..c7c1606951e 100644
--- a/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java
+++ b/config-model/src/main/java/com/yahoo/vespa/documentmodel/SummaryTransform.java
@@ -23,7 +23,8 @@ public enum SummaryTransform {
MATCHED_ELEMENTS_FILTER("matchedelementsfilter"),
MATCHED_ATTRIBUTE_ELEMENTS_FILTER("matchedattributeelementsfilter"),
COPY("copy"),
- DOCUMENT_ID("documentid");
+ DOCUMENT_ID("documentid"),
+ LINGUISTICS_TOKENS("linguistics-tokens");
private final String name;
diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj
index ae4c3b365d8..a5238afc86a 100644
--- a/config-model/src/main/javacc/SchemaParser.jj
+++ b/config-model/src/main/javacc/SchemaParser.jj
@@ -201,6 +201,7 @@ TOKEN :
| < FULL: "full" >
| < STATIC: "static" >
| < DYNAMIC: "dynamic" >
+| < LINGUISTICS_TOKENS: "linguistics-tokens" >
| < MATCHED_ELEMENTS_ONLY: "matched-elements-only" >
| < SSCONTEXTUAL: "contextual" >
| < SSOVERRIDE: "override" >
@@ -1128,6 +1129,7 @@ void summaryInFieldShort(ParsedField field) :
<COLON> ( <DYNAMIC> { psf.setDynamic(); }
| <MATCHED_ELEMENTS_ONLY> { psf.setMatchedElementsOnly(); }
| (<FULL> | <STATIC>) { psf.setFull(); }
+ | <LINGUISTICS_TOKENS> { psf.setLinguisticsTokens(); }
)
}
@@ -1173,6 +1175,7 @@ void summaryTransform(ParsedSummaryField field) : { }
( <DYNAMIC> { field.setDynamic(); }
| <MATCHED_ELEMENTS_ONLY> { field.setMatchedElementsOnly(); }
| (<FULL> | <STATIC>) { field.setFull(); }
+ | <LINGUISTICS_TOKENS> { field.setLinguisticsTokens(); }
)
}
@@ -2712,6 +2715,7 @@ String identifier() : { }
| <INLINE>
| <INPUTS>
| <INTEGER>
+ | <LINGUISTICS_TOKENS>
| <LITERAL>
| <LOCALE>
| <LONG>
diff --git a/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java b/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java
index 1f18a5ed49b..4128baddcb7 100644
--- a/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/derived/SummaryTestCase.java
@@ -227,6 +227,19 @@ public class SummaryTestCase extends AbstractSchemaTestCase {
}
@Test
+ void linguistics_tokenizer_override() throws ParseException {
+ var schema = buildSchema("field foo type string { indexing: summary }",
+ joinLines("document-summary bar {",
+ " summary baz type string {",
+ " source: foo ",
+ " linguistics-tokens",
+ " }",
+ " from-disk",
+ "}"));
+ assertOverride(schema, "baz", SummaryTransform.LINGUISTICS_TOKENS.getName(), "foo", "bar");
+ }
+
+ @Test
void documentid_summary_transform_requires_disk_access() {
assertFalse(SummaryTransform.DOCUMENT_ID.isInMemory());
}
diff --git a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp
index c8d959361ae..beaa43c7af8 100644
--- a/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp
+++ b/searchsummary/src/tests/docsummary/linguistics_tokens_converter/linguistics_tokens_converter_test.cpp
@@ -9,6 +9,7 @@
#include <vespa/document/repo/configbuilder.h>
#include <vespa/document/repo/fixedtyperepo.h>
#include <vespa/searchlib/util/linguisticsannotation.h>
+#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/searchsummary/docsummary/linguistics_tokens_converter.h>
#include <vespa/vespalib/data/simple_buffer.h>
#include <vespa/vespalib/data/slime/json_format.h>
@@ -25,6 +26,7 @@ using document::SpanTree;
using document::StringFieldValue;
using search::docsummary::LinguisticsTokensConverter;
using search::linguistics::SPANTREE_NAME;
+using search::linguistics::TokenExtractor;
using vespalib::SimpleBuffer;
using vespalib::Slime;
using vespalib::slime::JsonFormat;
@@ -59,6 +61,8 @@ protected:
std::shared_ptr<const DocumentTypeRepo> _repo;
const DocumentType* _document_type;
document::FixedTypeRepo _fixed_repo;
+ vespalib::string _dummy_field_name;
+ TokenExtractor _token_extractor;
LinguisticsTokensConverterTest();
~LinguisticsTokensConverterTest() override;
@@ -73,7 +77,9 @@ LinguisticsTokensConverterTest::LinguisticsTokensConverterTest()
: testing::Test(),
_repo(std::make_unique<DocumentTypeRepo>(get_document_types_config())),
_document_type(_repo->getDocumentType("indexingdocument")),
- _fixed_repo(*_repo, *_document_type)
+ _fixed_repo(*_repo, *_document_type),
+ _dummy_field_name(),
+ _token_extractor(_dummy_field_name, 100)
{
}
@@ -127,7 +133,7 @@ LinguisticsTokensConverterTest::make_exp_annotated_chinese_string_tokens()
vespalib::string
LinguisticsTokensConverterTest::convert(const StringFieldValue& fv)
{
- LinguisticsTokensConverter converter;
+ LinguisticsTokensConverter converter(_token_extractor);
Slime slime;
SlimeInserter inserter(slime);
converter.convert(fv, inserter);
diff --git a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp
index 10aedc6d9d0..c20f9570ef8 100644
--- a/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp
+++ b/searchsummary/src/tests/docsummary/slime_filler/slime_filler_test.cpp
@@ -68,6 +68,7 @@ using search::docsummary::IStringFieldConverter;
using search::docsummary::ResultConfig;
using search::docsummary::SlimeFiller;
using search::docsummary::SlimeFillerFilter;
+using vespalib::Memory;
using vespalib::SimpleBuffer;
using vespalib::Slime;
using vespalib::eval::SimpleValue;
@@ -146,17 +147,27 @@ get_document_types_config()
class MockStringFieldConverter : public IStringFieldConverter
{
std::vector<vespalib::string> _result;
+ bool _render_wset_as_array;
+ bool _insert;
public:
- MockStringFieldConverter()
+ MockStringFieldConverter(bool render_wset_as_array, bool insert)
: IStringFieldConverter(),
- _result()
+ _result(),
+ _render_wset_as_array(render_wset_as_array),
+ _insert(insert)
{
}
~MockStringFieldConverter() override = default;
- void convert(const document::StringFieldValue& input, vespalib::slime::Inserter&) override {
+ void convert(const document::StringFieldValue& input, vespalib::slime::Inserter& inserter) override {
_result.emplace_back(input.getValueRef());
+ if (_insert) {
+ inserter.insertString(Memory(input.getValueRef()));
+ }
}
const std::vector<vespalib::string>& get_result() const noexcept { return _result; }
+ bool render_weighted_set_as_array() const override {
+ return _render_wset_as_array;
+ }
};
}
@@ -188,6 +199,7 @@ protected:
void expect_insert_summary_field_with_filter(const vespalib::string& exp, const FieldValue& fv, const std::vector<uint32_t>& matching_elems);
void expect_insert_summary_field_with_field_filter(const vespalib::string& exp, const FieldValue& fv, const SlimeFillerFilter* filter);
void expect_insert_juniper_field(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv);
+ void expect_insert_summary_field_with_converter(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter);
};
SlimeFillerTest::SlimeFillerTest()
@@ -317,7 +329,7 @@ SlimeFillerTest::expect_insert_callback(const std::vector<vespalib::string>& exp
{
Slime slime;
SlimeInserter inserter(slime);
- MockStringFieldConverter converter;
+ MockStringFieldConverter converter(false, false);
SlimeFiller filler(inserter, &converter, SlimeFillerFilter::all());
fv.accept(filler);
auto act_null = slime_to_string(slime);
@@ -361,7 +373,7 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector<vespalib::string>
{
Slime slime;
SlimeInserter inserter(slime);
- MockStringFieldConverter converter;
+ MockStringFieldConverter converter(false, false);
SlimeFiller::insert_juniper_field(fv, inserter, converter);
auto act_slime = slime_to_string(slime);
EXPECT_EQ(exp_slime, act_slime);
@@ -369,6 +381,18 @@ SlimeFillerTest::expect_insert_juniper_field(const std::vector<vespalib::string>
EXPECT_EQ(exp, act);
}
+void
+SlimeFillerTest::expect_insert_summary_field_with_converter(const std::vector<vespalib::string>& exp, const vespalib::string& exp_slime, const FieldValue& fv, MockStringFieldConverter& converter)
+{
+ Slime slime;
+ SlimeInserter inserter(slime);
+ SlimeFiller::insert_summary_field(fv, inserter, &converter);
+ auto act_slime = slime_to_string(slime);
+ EXPECT_EQ(exp_slime, act_slime);
+ auto act = converter.get_result();
+ EXPECT_EQ(exp, act);
+}
+
TEST_F(SlimeFillerTest, insert_primitive_values)
{
{
@@ -625,4 +649,16 @@ TEST_F(SlimeFillerTest, insert_juniper_field)
expect_insert_juniper_field({}, "null", make_empty_array());
}
+TEST_F(SlimeFillerTest, string_field_is_not_converted_for_weighted_set_rendering)
+{
+ MockStringFieldConverter cvt_as_wset(false, true);
+ expect_insert_summary_field_with_converter({}, R"([{"item":"foo","weight":2},{"item":"bar","weight":4},{"item":"baz","weight":6}])", make_weighted_set(), cvt_as_wset);
+}
+
+TEST_F(SlimeFillerTest, weighted_set_can_be_rendered_as_array)
+{
+ MockStringFieldConverter cvt_as_array(true, true);
+ expect_insert_summary_field_with_converter({"foo","bar","baz"}, R"(["foo","bar","baz"])", make_weighted_set(), cvt_as_array);
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index e5ae47593e5..57b6004fb61 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -24,6 +24,7 @@ vespa_add_library(searchsummary_docsummary OBJECT
juniper_query_adapter.cpp
juniperproperties.cpp
linguistics_tokens_converter.cpp
+ linguistics_tokens_dfw.cpp
matched_elements_filter_dfw.cpp
positionsdfw.cpp
query_term_filter.cpp
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
index bf267ab9e27..77724305220 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp
@@ -109,4 +109,10 @@ AnnotationConverter::convert(const StringFieldValue &input, vespalib::slime::Ins
_juniper_converter.convert(_out.str(), inserter);
}
+bool
+AnnotationConverter::render_weighted_set_as_array() const
+{
+ return false;
+}
+
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h
index b6430b35f29..b082269eb7e 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h
+++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h
@@ -33,6 +33,7 @@ public:
AnnotationConverter(IJuniperConverter& juniper_converter);
~AnnotationConverter() override;
void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override;
+ bool render_weighted_set_as_array() const override;
};
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp
index 2ce809e1cbe..c4823f6beeb 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.cpp
@@ -12,6 +12,7 @@ const vespalib::string documentid("documentid");
const vespalib::string dynamic_teaser("dynamicteaser");
const vespalib::string empty("empty");
const vespalib::string geo_position("geopos");
+const vespalib::string linguistics_tokens("linguistics-tokens");
const vespalib::string matched_attribute_elements_filter("matchedattributeelementsfilter");
const vespalib::string matched_elements_filter("matchedelementsfilter");
const vespalib::string positions("positions");
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h
index 26bc33e7e3c..2d0b8c23855 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_commands.h
@@ -18,6 +18,7 @@ extern const vespalib::string documentid;
extern const vespalib::string dynamic_teaser;
extern const vespalib::string empty;
extern const vespalib::string geo_position;
+extern const vespalib::string linguistics_tokens;
extern const vespalib::string matched_attribute_elements_filter;
extern const vespalib::string matched_elements_filter;
extern const vespalib::string positions;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
index 9b7391dd1ab..d19d2994104 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsum_field_writer_factory.cpp
@@ -9,6 +9,7 @@
#include "geoposdfw.h"
#include "idocsumenvironment.h"
#include "juniperdfw.h"
+#include "linguistics_tokens_dfw.h"
#include "matched_elements_filter_dfw.h"
#include "positionsdfw.h"
#include "rankfeaturesdfw.h"
@@ -84,6 +85,12 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie
} else {
throw_missing_source(command);
}
+ } else if (command == command::linguistics_tokens) {
+ if (!source.empty()) {
+ fieldWriter = std::make_unique<LinguisticsTokensDFW>(source);
+ } else {
+ throw_missing_source(command);
+ }
} else if (command == command::abs_distance) {
if (has_attribute_manager()) {
fieldWriter = AbsDistanceDFW::create(source.c_str(), getEnvironment().getAttributeManager());
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h
index 3b36455d09d..805b5cf3508 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h
+++ b/searchsummary/src/vespa/searchsummary/docsummary/i_string_field_converter.h
@@ -17,6 +17,7 @@ class IStringFieldConverter
public:
virtual ~IStringFieldConverter() = default;
virtual void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) = 0;
+ virtual bool render_weighted_set_as_array() const = 0;
};
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp
index 838b0234cdb..b9b9d7c4c97 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.cpp
@@ -2,14 +2,11 @@
#include "linguistics_tokens_converter.h"
#include <vespa/document/fieldvalue/stringfieldvalue.h>
-#include <vespa/searchlib/memoryindex/field_inverter.h>
-#include <vespa/searchlib/util/linguisticsannotation.h>
#include <vespa/searchlib/util/token_extractor.h>
#include <vespa/vespalib/data/slime/slime.h>
using document::StringFieldValue;
using search::linguistics::TokenExtractor;
-using search::memoryindex::FieldInverter;
using vespalib::Memory;
using vespalib::slime::ArrayInserter;
using vespalib::slime::Cursor;
@@ -17,14 +14,9 @@ using vespalib::slime::Inserter;
namespace search::docsummary {
-namespace {
-
-vespalib::string dummy_field_name;
-
-}
-
-LinguisticsTokensConverter::LinguisticsTokensConverter()
+LinguisticsTokensConverter::LinguisticsTokensConverter(const TokenExtractor& token_extractor)
: IStringFieldConverter(),
+ _token_extractor(token_extractor),
_text()
{
}
@@ -56,8 +48,7 @@ LinguisticsTokensConverter::handle_indexing_terms(const StringFieldValue& value,
using SpanTerm = TokenExtractor::SpanTerm;
std::vector<SpanTerm> terms;
auto span_trees = value.getSpanTrees();
- TokenExtractor token_extractor(dummy_field_name, FieldInverter::max_word_len);
- token_extractor.extract(terms, span_trees, _text, nullptr);
+ _token_extractor.extract(terms, span_trees, _text, nullptr);
auto it = terms.begin();
auto ite = terms.end();
auto itn = it;
@@ -78,4 +69,10 @@ LinguisticsTokensConverter::convert(const StringFieldValue &input, vespalib::sli
handle_indexing_terms(input, inserter);
}
+bool
+LinguisticsTokensConverter::render_weighted_set_as_array() const
+{
+ return true;
+}
+
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h
index cba3937c822..d752fe89ed9 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_converter.h
@@ -4,6 +4,8 @@
#include "i_string_field_converter.h"
+namespace search::linguistics { class TokenExtractor; }
+
namespace search::docsummary {
/*
@@ -13,16 +15,18 @@ namespace search::docsummary {
*/
class LinguisticsTokensConverter : public IStringFieldConverter
{
- vespalib::stringref _text;
+ const linguistics::TokenExtractor& _token_extractor;
+ vespalib::stringref _text;
template <typename ForwardIt>
void handle_alternative_index_terms(ForwardIt it, ForwardIt last, vespalib::slime::Inserter& inserter);
void handle_index_term(vespalib::stringref word, vespalib::slime::Inserter& inserter);
void handle_indexing_terms(const document::StringFieldValue& value, vespalib::slime::Inserter& inserter);
public:
- LinguisticsTokensConverter();
+ LinguisticsTokensConverter(const linguistics::TokenExtractor& token_extractor);
~LinguisticsTokensConverter() override;
void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override;
+ bool render_weighted_set_as_array() const override;
};
}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp
new file mode 100644
index 00000000000..5e94e270c53
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.cpp
@@ -0,0 +1,36 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "linguistics_tokens_dfw.h"
+#include "i_docsum_store_document.h"
+#include "linguistics_tokens_converter.h"
+#include <vespa/searchlib/memoryindex/field_inverter.h>
+
+using search::memoryindex::FieldInverter;
+
+namespace search::docsummary {
+
+LinguisticsTokensDFW::LinguisticsTokensDFW(const vespalib::string& input_field_name)
+ : DocsumFieldWriter(),
+ _input_field_name(input_field_name),
+ _token_extractor(_input_field_name, FieldInverter::max_word_len)
+{
+}
+
+LinguisticsTokensDFW::~LinguisticsTokensDFW() = default;
+
+bool
+LinguisticsTokensDFW::isGenerated() const
+{
+ return false;
+}
+
+void
+LinguisticsTokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const
+{
+ if (doc != nullptr) {
+ LinguisticsTokensConverter converter(_token_extractor);
+ doc->insert_summary_field(_input_field_name, target, &converter);
+ }
+}
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h
new file mode 100644
index 00000000000..a70f0a69e4c
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/linguistics_tokens_dfw.h
@@ -0,0 +1,28 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "docsum_field_writer.h"
+#include <vespa/searchlib/util/token_extractor.h>
+#include <memory>
+
+namespace search::docsummary {
+
+/*
+ * class for writing annotated string field values from document as
+ * arrays containing the indexing terms.
+ */
+class LinguisticsTokensDFW : public DocsumFieldWriter
+{
+private:
+ vespalib::string _input_field_name;
+ linguistics::TokenExtractor _token_extractor;
+
+public:
+ explicit LinguisticsTokensDFW(const vespalib::string& input_field_name);
+ ~LinguisticsTokensDFW() override;
+ bool isGenerated() const override;
+ void insertField(uint32_t docid, const IDocsumStoreDocument* doc, GetDocsumsState& state, vespalib::slime::Inserter& target) const override;
+};
+
+}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp
index 7266642b18b..080129fe780 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp
@@ -285,6 +285,7 @@ SlimeFiller::visit(const WeightedSetFieldValue& value)
if (empty_or_empty_after_filtering(value)) {
return;
}
+ bool render_as_array = _string_converter != nullptr && _string_converter->render_weighted_set_as_array();
Cursor& a = _inserter.insertArray();
Symbol isym = a.resolve("item");
Symbol wsym = a.resolve("weight");
@@ -305,12 +306,18 @@ SlimeFiller::visit(const WeightedSetFieldValue& value)
}
++matching_elements_itr;
}
- Cursor& o = a.addObject();
- ObjectSymbolInserter ki(o, isym);
- SlimeFiller conv(ki);
- entry.first->accept(conv);
- int weight = static_cast<const IntFieldValue&>(*entry.second).getValue();
- o.setLong(wsym, weight);
+ if (render_as_array) {
+ ArrayInserter ai(a);
+ SlimeFiller conv(ai, _string_converter, SlimeFillerFilter::all());
+ entry.first->accept(conv);
+ } else {
+ Cursor& o = a.addObject();
+ ObjectSymbolInserter ki(o, isym);
+ SlimeFiller conv(ki);
+ entry.first->accept(conv);
+ int weight = static_cast<const IntFieldValue&>(*entry.second).getValue();
+ o.setLong(wsym, weight);
+ }
++idx;
}
}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp
index b48f556f4be..b94de154a35 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp
@@ -132,6 +132,7 @@ public:
}
~SnippetModifierJuniperConverter() override = default;
void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override;
+ bool render_weighted_set_as_array() const override;
};
void
@@ -147,6 +148,12 @@ SnippetModifierJuniperConverter::convert(const document::StringFieldValue &input
}
}
+bool
+SnippetModifierJuniperConverter::render_weighted_set_as_array() const
+{
+ return false;
+}
+
/**
* Class providing access to a document retrieved from an IDocsumStore
* (vsm::DocsumFilter). VSM specific transforms might be applied when