diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-10-04 08:56:23 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2019-10-04 09:13:31 +0000 |
commit | dea97e69131b865669760daab6af068958d5a6b4 (patch) | |
tree | 2f01de81988ebe18da019d43ecb5ca74fd9edd40 | |
parent | 6fe52f2ed1665f6fe29b74bbaec4db2c889ebacf (diff) |
Add docsum field writer that filters matched elements from a complex field retrieved from document store.
6 files changed, 346 insertions, 15 deletions
diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt index 4df636e0219..2a23dd4c495 100644 --- a/searchsummary/CMakeLists.txt +++ b/searchsummary/CMakeLists.txt @@ -25,6 +25,7 @@ vespa_define_module( src/tests/docsumformat src/tests/docsummary src/tests/docsummary/attribute_combiner + src/tests/docsummary/matched_elements_filter src/tests/docsummary/slime_summary src/tests/extractkeywords ) diff --git a/searchsummary/src/tests/docsummary/matched_elements_filter/CMakeLists.txt b/searchsummary/src/tests/docsummary/matched_elements_filter/CMakeLists.txt new file mode 100644 index 00000000000..a87f5638acc --- /dev/null +++ b/searchsummary/src/tests/docsummary/matched_elements_filter/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +find_package(GTest REQUIRED) +vespa_add_executable(searchsummary_matched_elements_filter_test_app TEST + SOURCES + matched_elements_filter_test.cpp + DEPENDS + searchsummary + GTest::GTest +) +vespa_add_test(NAME searchsummary_matched_elements_filter_test_app COMMAND searchsummary_matched_elements_filter_test_app) diff --git a/searchsummary/src/tests/docsummary/matched_elements_filter/matched_elements_filter_test.cpp b/searchsummary/src/tests/docsummary/matched_elements_filter/matched_elements_filter_test.cpp new file mode 100644 index 00000000000..40d0285b1ec --- /dev/null +++ b/searchsummary/src/tests/docsummary/matched_elements_filter/matched_elements_filter_test.cpp @@ -0,0 +1,205 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/document/datatype/datatype.h> +#include <vespa/document/datatype/structdatatype.h> +#include <vespa/document/document.h> +#include <vespa/searchlib/common/matching_elements.h> +#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h> +#include <vespa/searchsummary/docsummary/docsumstate.h> +#include <vespa/searchsummary/docsummary/idocsumenvironment.h> +#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> +#include <vespa/searchsummary/docsummary/resultpacker.h> +#include <vespa/searchsummary/docsummary/summaryfieldconverter.h> +#include <vespa/vespalib/data/slime/json_format.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <iostream> + +#include <vespa/log/log.h> +LOG_SETUP("matched_elements_filter_test"); + +using search::MatchingElements; +using search::StructFieldMapper; +using vespalib::Slime; + +using namespace document; +using namespace search::docsummary; +using namespace vespalib::slime; + +using ElementVector = std::vector<uint32_t>; + +struct SlimeValue { + Slime slime; + + SlimeValue(const std::string& json_input) + : slime() + { + size_t used = JsonFormat::decode(json_input, slime); + EXPECT_GT(used, 0); + } + SlimeValue(const Slime& slime_with_raw_field) + : slime() + { + size_t used = BinaryFormat::decode(slime_with_raw_field.get().asString(), slime); + EXPECT_GT(used, 0); + } +}; + +StructDataType::UP +make_struct_elem_type() +{ + auto result = std::make_unique<StructDataType>("elem"); + result->addField(Field("name", *DataType::STRING, true)); + result->addField(Field("weight", *DataType::INT, true)); + return result; +} + +constexpr uint32_t class_id = 3; +constexpr uint32_t doc_id = 2; + +class DocsumStore { +private: + ResultConfig _config; + ResultPacker _packer; + StructDataType::UP _elem_type; + ArrayDataType _array_type; + MapDataType _map_type; + + StructFieldValue::UP make_elem_value(const std::string& name, int weight) const { + auto result = std::make_unique<StructFieldValue>(*_elem_type); + result->setValue("name", StringFieldValue(name)); + result->setValue("weight", IntFieldValue(weight)); + return result; + } + + void write_field_value(const FieldValue& value) { + auto converted = SummaryFieldConverter::convertSummaryField(false, value); + const auto* raw_field = dynamic_cast<const RawFieldValue*>(converted.get()); + ASSERT_TRUE(raw_field); + auto raw_buf = raw_field->getAsRaw(); + bool result = _packer.AddLongString(raw_buf.first, raw_buf.second); + ASSERT_TRUE(result); + } + +public: + DocsumStore() + : _config(), + _packer(&_config), + _elem_type(make_struct_elem_type()), + _array_type(*_elem_type), + _map_type(*DataType::STRING, *_elem_type) + { + auto* result_class = _config.AddResultClass("test", class_id); + EXPECT_TRUE(result_class->AddConfigEntry("array", ResType::RES_JSONSTRING)); + EXPECT_TRUE(result_class->AddConfigEntry("map", ResType::RES_JSONSTRING)); + _config.CreateEnumMaps(); + } + const ResultConfig& get_config() const { return _config; } + const ResultClass* get_class() const { return _config.LookupResultClass(class_id); } + search::docsummary::DocsumStoreValue getMappedDocsum() { + assert(_packer.Init(class_id)); + { + ArrayFieldValue array_value(_array_type); + array_value.append(make_elem_value("a", 3)); + array_value.append(make_elem_value("b", 5)); + array_value.append(make_elem_value("c", 7)); + write_field_value(array_value); + } + { + MapFieldValue map_value(_map_type); + map_value.put(StringFieldValue("a"), *make_elem_value("a", 3)); + map_value.put(StringFieldValue("b"), *make_elem_value("b", 5)); + map_value.put(StringFieldValue("c"), *make_elem_value("c", 7)); + write_field_value(map_value); + } + const char* buf; + uint32_t buf_len; + assert(_packer.GetDocsumBlob(&buf, &buf_len)); + return DocsumStoreValue(buf, buf_len); + } +}; + +class StateCallback : public GetDocsumsStateCallback { +private: + std::string _field_name; + ElementVector _matching_elements; + +public: + StateCallback(const std::string& field_name, const ElementVector& matching_elements) + : _field_name(field_name), + _matching_elements(matching_elements) + { + } + ~StateCallback() {} + void FillSummaryFeatures(GetDocsumsState*, IDocsumEnvironment*) override {} + void FillRankFeatures(GetDocsumsState*, IDocsumEnvironment*) override {} + void ParseLocation(GetDocsumsState*) override {} + std::unique_ptr<MatchingElements> fill_matching_elements(const StructFieldMapper&) override { + auto result = std::make_unique<MatchingElements>(); + result->add_matching_elements(doc_id, _field_name, _matching_elements); + return result; + } +}; + +class MatchedElementsFilterTest : public ::testing::Test { +private: + DocsumStore _store; + + SlimeValue run_filter_field_writer(const std::string& input_field_name, const ElementVector& matching_elements) { + int input_field_enum = _store.get_config().GetFieldNameEnum().Lookup(input_field_name.c_str()); + EXPECT_GE(input_field_enum, 0); + MatchedElementsFilterDFW filter(input_field_name, input_field_enum); + + GeneralResult result(_store.get_class()); + result.inplaceUnpack(_store.getMappedDocsum()); + StateCallback callback(input_field_name, matching_elements); + GetDocsumsState state(callback); + Slime slime; + SlimeInserter inserter(slime); + + filter.insertField(doc_id, &result, &state, ResType::RES_JSONSTRING, inserter); + return SlimeValue(slime); + } + +public: + MatchedElementsFilterTest() + : _store() + { + } + void expect_filtered(const std::string& input_field_name, const ElementVector& matching_elements, const std::string& exp_slime_as_json) { + SlimeValue act = run_filter_field_writer(input_field_name, matching_elements); + SlimeValue exp(exp_slime_as_json); + EXPECT_EQ(exp.slime, act.slime); + } +}; + +TEST_F(MatchedElementsFilterTest, filters_elements_in_array_field_value) +{ + expect_filtered("array", {}, "[]"); + expect_filtered("array", {0}, "[{'name':'a','weight':3}]"); + expect_filtered("array", {1}, "[{'name':'b','weight':5}]"); + expect_filtered("array", {2}, "[{'name':'c','weight':7}]"); + expect_filtered("array", {0, 1, 2}, "[{'name':'a','weight':3}," + "{'name':'b','weight':5}," + "{'name':'c','weight':7}]"); +} + +TEST_F(MatchedElementsFilterTest, filters_elements_in_map_field_value) +{ + expect_filtered("map", {}, "[]"); + expect_filtered("map", {0}, "[{'key':'a','value':{'name':'a','weight':3}}]"); + expect_filtered("map", {1}, "[{'key':'b','value':{'name':'b','weight':5}}]"); + expect_filtered("map", {2}, "[{'key':'c','value':{'name':'c','weight':7}}]"); + expect_filtered("map", {0, 1, 2}, "[{'key':'a','value':{'name':'a','weight':3}}," + "{'key':'b','value':{'name':'b','weight':5}}," + "{'key':'c','value':{'name':'c','weight':7}}]"); +} + +TEST_F(MatchedElementsFilterTest, field_writer_is_not_generated_as_it_depends_on_data_from_document_store) +{ + MatchedElementsFilterDFW filter("foo", 0); + EXPECT_FALSE(filter.IsGenerated()); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index dccf72b2fe7..fb6a399e71c 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -4,29 +4,30 @@ vespa_add_library(searchsummary_docsummary OBJECT array_attribute_combiner_dfw.cpp attribute_combiner_dfw.cpp attribute_field_writer.cpp - resultclass.cpp - resultconfig.cpp - resultpacker.cpp - urlresult.cpp - getdocsumargs.cpp - docsumstate.cpp + attributedfw.cpp + docsumconfig.cpp docsumfieldwriter.cpp + docsumstate.cpp docsumwriter.cpp - keywordextractor.cpp - attributedfw.cpp dynamicteaserdfw.cpp - docsumconfig.cpp - rankfeaturesdfw.cpp - summaryfeaturesdfw.cpp - juniperproperties.cpp - textextractordfw.cpp geoposdfw.cpp - tokenizer.cpp - positionsdfw.cpp + getdocsumargs.cpp + juniperproperties.cpp + keywordextractor.cpp linguisticsannotation.cpp + matched_elements_filter_dfw.cpp + positionsdfw.cpp + rankfeaturesdfw.cpp + resultclass.cpp + resultconfig.cpp + resultpacker.cpp searchdatatype.cpp struct_map_attribute_combiner_dfw.cpp + summaryfeaturesdfw.cpp summaryfieldconverter.cpp + textextractordfw.cpp + tokenizer.cpp + urlresult.cpp AFTER searchsummary_config ) diff --git a/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.cpp new file mode 100644 index 00000000000..1b7533b53e3 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.cpp @@ -0,0 +1,87 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "docsumstate.h" +#include "matched_elements_filter_dfw.h" +#include <vespa/searchlib/common/matching_elements.h> +#include <vespa/searchlib/common/struct_field_mapper.h> +#include <vespa/vespalib/data/slime/binary_format.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/data/smart_buffer.h> +#include <cassert> + +using vespalib::Slime; +using vespalib::slime::ArrayInserter; +using vespalib::slime::BinaryFormat; +using vespalib::slime::Inserter; +using vespalib::slime::Inspector; +using vespalib::slime::SlimeInserter; +using vespalib::slime::inject; + +namespace search::docsummary { + +MatchedElementsFilterDFW::MatchedElementsFilterDFW(const std::string& input_field_name, uint32_t input_field_enum) + : _input_field_name(input_field_name), + _input_field_enum(input_field_enum), + _struct_field_mapper(std::make_shared<StructFieldMapper>()) +{ + // TODO: Take struct field mapper in constructor and populate based on available attribute vectors. +} + +MatchedElementsFilterDFW::~MatchedElementsFilterDFW() = default; + +namespace { + +void +decode_input_field(const ResEntry& entry, search::RawBuf& target_buf, Slime& input_field) +{ + const char* buf; + uint32_t buf_len; + entry._resolve_field(&buf, &buf_len, &target_buf); + BinaryFormat::decode(vespalib::Memory(buf, buf_len), input_field); +} + +void +filter_matching_elements_in_input_field(const Slime& input_field, const std::vector<uint32_t>& matching_elems, Slime& output_field) +{ + SlimeInserter output_inserter(output_field); + Inspector& input_inspector = input_field.get(); + ArrayInserter array_inserter(output_inserter.insertArray()); + auto elems_itr = matching_elems.begin(); + for (size_t i = 0; (i < input_inspector.entries()) && (elems_itr != matching_elems.end()); ++i) { + assert(*elems_itr >= i); + if (*elems_itr == i) { + inject(input_inspector[i], array_inserter); + ++elems_itr; + } + } +} + +void +encode_output_field(const Slime& output_field, Inserter& target) +{ + vespalib::SmartBuffer buf(4096); + BinaryFormat::encode(output_field, buf); + target.insertString(buf.obtain()); +} + +} + +void +MatchedElementsFilterDFW::insertField(uint32_t docid, GeneralResult* result, GetDocsumsState *state, + ResType type, vespalib::slime::Inserter& target) +{ + assert(type == ResType::RES_JSONSTRING); + int entry_idx = result->GetClass()->GetIndexFromEnumValue(_input_field_enum); + ResEntry* entry = result->GetEntry(entry_idx); + if (entry != nullptr) { + Slime input_field; + decode_input_field(*entry, state->_docSumFieldSpace, input_field); + + Slime output_field; + filter_matching_elements_in_input_field(input_field, state->get_matching_elements(*_struct_field_mapper).get_matching_elements(docid, _input_field_name), output_field); + + encode_output_field(output_field, target); + } +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.h b/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.h new file mode 100644 index 00000000000..b96d3595b0a --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/matched_elements_filter_dfw.h @@ -0,0 +1,27 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "docsumfieldwriter.h" + +namespace search::docsummary { + +/** + * Field writer that filters matched elements (according to the query) from a complex field + * (map of primitives, map of struct, array of struct) that is retrieved from the document store. + */ +class MatchedElementsFilterDFW : public IDocsumFieldWriter { +private: + std::string _input_field_name; + uint32_t _input_field_enum; + std::shared_ptr<StructFieldMapper> _struct_field_mapper; + +public: + MatchedElementsFilterDFW(const std::string& input_field_name, uint32_t input_field_enum); + ~MatchedElementsFilterDFW(); + bool IsGenerated() const override { return false; } + void insertField(uint32_t docid, GeneralResult* result, GetDocsumsState *state, + ResType type, vespalib::slime::Inserter& target) override; +}; + +} |