diff options
Diffstat (limited to 'searchlib')
15 files changed, 264 insertions, 220 deletions
diff --git a/searchlib/src/tests/memoryindex/urlfieldinverter/urlfieldinverter_test.cpp b/searchlib/src/tests/memoryindex/urlfieldinverter/urlfieldinverter_test.cpp index 16957abe915..daec09828f6 100644 --- a/searchlib/src/tests/memoryindex/urlfieldinverter/urlfieldinverter_test.cpp +++ b/searchlib/src/tests/memoryindex/urlfieldinverter/urlfieldinverter_test.cpp @@ -184,7 +184,7 @@ struct Fixture std::vector<std::unique_ptr<FieldInverter> > _inverters; std::unique_ptr<UrlFieldInverter> _urlInverter; test::OrderedDocumentInserter _inserter; - DocTypeBuilder::SchemaIndexFields _schemaIndexFields; + index::SchemaIndexFields _schemaIndexFields; static Schema makeSchema(Schema::CollectionType collectionType) @@ -208,7 +208,7 @@ struct Fixture _inverters.push_back(std::make_unique<FieldInverter>(_schema, fieldId)); } - DocTypeBuilder::UriField &urlField = + index::UriField &urlField = _schemaIndexFields._uriFields.front(); _urlInverter = std::make_unique<UrlFieldInverter> (collectionType, diff --git a/searchlib/src/vespa/searchlib/index/CMakeLists.txt b/searchlib/src/vespa/searchlib/index/CMakeLists.txt index 21209c32816..9143ee867c8 100644 --- a/searchlib/src/vespa/searchlib/index/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/index/CMakeLists.txt @@ -14,5 +14,7 @@ vespa_add_library(searchlib_searchlib_index OBJECT postinglistfile.cpp postinglistparams.cpp schemautil.cpp + schema_index_fields.cpp + uri_field.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp b/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp index fb02ee32b98..f79e394e535 100644 --- a/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp +++ b/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp @@ -68,158 +68,6 @@ insertStructType(document::DocumenttypesConfig::Documenttype & cfg, } -DocTypeBuilder::UriField::UriField() - : _all(Schema::UNKNOWN_FIELD_ID), - _scheme(Schema::UNKNOWN_FIELD_ID), - _host(Schema::UNKNOWN_FIELD_ID), - _port(Schema::UNKNOWN_FIELD_ID), - _path(Schema::UNKNOWN_FIELD_ID), - _query(Schema::UNKNOWN_FIELD_ID), - _fragment(Schema::UNKNOWN_FIELD_ID), - _hostname(Schema::UNKNOWN_FIELD_ID) -{ -} - - -bool -DocTypeBuilder::UriField::valid(const Schema &schema, - uint32_t fieldId, - const Schema::CollectionType &collectionType) -{ - if (fieldId == Schema::UNKNOWN_FIELD_ID) - return false; - const Schema::IndexField &field = schema.getIndexField(fieldId); - if (field.getDataType() != schema::DataType::STRING) - return false; - if (field.getCollectionType() != collectionType) - return false; - return true; -} - - -bool -DocTypeBuilder::UriField::broken(const Schema &schema, - const Schema::CollectionType & - collectionType) const -{ - return !valid(schema, _all, collectionType) && - valid(schema, _scheme, collectionType) && - valid(schema, _host, collectionType) && - valid(schema, _port, collectionType) && - valid(schema, _path, collectionType) && - valid(schema, _query, collectionType) && - valid(schema, _fragment, collectionType); -} - -bool -DocTypeBuilder::UriField::valid(const Schema &schema, - const Schema::CollectionType & - collectionType) const -{ - return valid(schema, _all, collectionType) && - valid(schema, _scheme, collectionType) && - valid(schema, _host, collectionType) && - valid(schema, _port, collectionType) && - valid(schema, _path, collectionType) && - valid(schema, _query, collectionType) && - valid(schema, _fragment, collectionType); -} - - -void -DocTypeBuilder::UriField::setup(const Schema &schema, - const vespalib::string &field) -{ - _all = schema.getIndexFieldId(field); - _scheme = schema.getIndexFieldId(field + ".scheme"); - _host = schema.getIndexFieldId(field + ".host"); - _port = schema.getIndexFieldId(field + ".port"); - _path = schema.getIndexFieldId(field + ".path"); - _query = schema.getIndexFieldId(field + ".query"); - _fragment = schema.getIndexFieldId(field + ".fragment"); - _hostname = schema.getIndexFieldId(field + ".hostname"); -} - - -void -DocTypeBuilder::UriField::markUsed(UsedFieldsMap &usedFields, - uint32_t field) -{ - if (field == Schema::UNKNOWN_FIELD_ID) - return; - assert(usedFields.size() > field); - usedFields[field] = true; -} - - -void -DocTypeBuilder::UriField::markUsed(UsedFieldsMap &usedFields) const -{ - markUsed(usedFields, _all); - markUsed(usedFields, _scheme); - markUsed(usedFields, _host); - markUsed(usedFields, _port); - markUsed(usedFields, _path); - markUsed(usedFields, _query); - markUsed(usedFields, _fragment); - markUsed(usedFields, _hostname); -} - - - -DocTypeBuilder::SchemaIndexFields::SchemaIndexFields() - : _textFields(), - _uriFields() -{ -} - -DocTypeBuilder::SchemaIndexFields::~SchemaIndexFields() {} - -void -DocTypeBuilder::SchemaIndexFields::setup(const Schema &schema) -{ - uint32_t numIndexFields = schema.getNumIndexFields(); - UsedFieldsMap usedFields; - usedFields.resize(numIndexFields); - - // Detect all URI fields (flattened structs). - for (uint32_t fieldId = 0; fieldId < numIndexFields; ++fieldId) { - const Schema::IndexField &field = schema.getIndexField(fieldId); - const vespalib::string &name = field.getName(); - size_t dotPos = name.find('.'); - if (dotPos != vespalib::string::npos) { - const vespalib::string suffix = name.substr(dotPos + 1); - if (suffix == "scheme") { - const vespalib::string shortName = name.substr(0, dotPos); - UriField uriField; - uriField.setup(schema, shortName); - if (uriField.valid(schema, field.getCollectionType())) { - _uriFields.push_back(uriField); - uriField.markUsed(usedFields); - } else if (uriField.broken(schema, - field.getCollectionType())) { - // Broken removal of unused URI fields. - uriField.markUsed(usedFields); - } - } - } - } - - // Non-URI fields are currently supposed to be text fields. - for (uint32_t fieldId = 0; fieldId < numIndexFields; ++fieldId) { - if (usedFields[fieldId]) - continue; - const Schema::IndexField &field = schema.getIndexField(fieldId); - switch (field.getDataType()) { - case schema::DataType::STRING: - _textFields.push_back(fieldId); - break; - default: - ; - } - } -} - DocTypeBuilder::DocTypeBuilder(const Schema &schema) : _schema(schema), _iFields() diff --git a/searchlib/src/vespa/searchlib/index/doctypebuilder.h b/searchlib/src/vespa/searchlib/index/doctypebuilder.h index cb0a4b9a9e6..7c19dceafe8 100644 --- a/searchlib/src/vespa/searchlib/index/doctypebuilder.h +++ b/searchlib/src/vespa/searchlib/index/doctypebuilder.h @@ -2,7 +2,7 @@ #pragma once -#include <vespa/searchcommon/common/schema.h> +#include "schema_index_fields.h" #include <vespa/document/config/config-documenttypes.h> #include <vespa/document/datatype/datatypes.h> #include <vespa/document/fieldvalue/fieldvalues.h> @@ -15,50 +15,6 @@ namespace search::index { * Builder for the indexingdocument document type based on an index schema. **/ class DocTypeBuilder { -public: - typedef std::vector<bool> UsedFieldsMap; - typedef std::vector<uint32_t> FieldIdVector; - - class UriField - { - public: - uint32_t _all; - uint32_t _scheme; - uint32_t _host; - uint32_t _port; - uint32_t _path; - uint32_t _query; - uint32_t _fragment; - uint32_t _hostname; - - private: - static void markUsed(UsedFieldsMap &usedFields, uint32_t field); - static bool valid(const Schema &schema, uint32_t fieldId, - const Schema::CollectionType &collectionType); - - public: - UriField(); - - bool broken(const Schema &schema, const Schema::CollectionType &collectionType) const; - bool valid(const Schema &schema, const Schema::CollectionType &collectionType) const; - void setup(const Schema &schema, const vespalib::string &field); - void markUsed(UsedFieldsMap &usedFields) const; - }; - - typedef std::vector<UriField> UriFieldIdVector; - - class SchemaIndexFields - { - public: - FieldIdVector _textFields; - UriFieldIdVector _uriFields; - - SchemaIndexFields(); - ~SchemaIndexFields(); - void setup(const Schema &schema); - }; - -private: const Schema &_schema; SchemaIndexFields _iFields; diff --git a/searchlib/src/vespa/searchlib/index/schema_index_fields.cpp b/searchlib/src/vespa/searchlib/index/schema_index_fields.cpp new file mode 100644 index 00000000000..7b4c0ad47bf --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/schema_index_fields.cpp @@ -0,0 +1,61 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "schema_index_fields.h" + +namespace search::index { + +SchemaIndexFields::SchemaIndexFields() + : _textFields(), + _uriFields() +{ +} + +SchemaIndexFields::~SchemaIndexFields() = default; + +void +SchemaIndexFields::setup(const Schema &schema) +{ + uint32_t numIndexFields = schema.getNumIndexFields(); + UriField::UsedFieldsMap usedFields; + usedFields.resize(numIndexFields); + + // Detect all URI fields (flattened structs). + for (uint32_t fieldId = 0; fieldId < numIndexFields; ++fieldId) { + const Schema::IndexField &field = schema.getIndexField(fieldId); + const vespalib::string &name = field.getName(); + size_t dotPos = name.find('.'); + if (dotPos != vespalib::string::npos) { + const vespalib::string suffix = name.substr(dotPos + 1); + if (suffix == "scheme") { + const vespalib::string shortName = name.substr(0, dotPos); + UriField uriField; + uriField.setup(schema, shortName); + if (uriField.valid(schema, field.getCollectionType())) { + _uriFields.push_back(uriField); + uriField.markUsed(usedFields); + } else if (uriField.broken(schema, + field.getCollectionType())) { + // Broken removal of unused URI fields. + uriField.markUsed(usedFields); + } + } + } + } + + // Non-URI fields are currently supposed to be text fields. + for (uint32_t fieldId = 0; fieldId < numIndexFields; ++fieldId) { + if (usedFields[fieldId]) { + continue; + } + const Schema::IndexField &field = schema.getIndexField(fieldId); + switch (field.getDataType()) { + case schema::DataType::STRING: + _textFields.push_back(fieldId); + break; + default: + ; + } + } +} + +} diff --git a/searchlib/src/vespa/searchlib/index/schema_index_fields.h b/searchlib/src/vespa/searchlib/index/schema_index_fields.h new file mode 100644 index 00000000000..4dbcdcdb781 --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/schema_index_fields.h @@ -0,0 +1,25 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "uri_field.h" + +namespace search::index { + +/** + * Fields from an index schema to be used for indexing + **/ +class SchemaIndexFields +{ +public: + using FieldIdVector = std::vector<uint32_t>; + using UriFieldIdVector = std::vector<UriField>; + FieldIdVector _textFields; + UriFieldIdVector _uriFields; + + SchemaIndexFields(); + ~SchemaIndexFields(); + void setup(const Schema &schema); +}; + +} diff --git a/searchlib/src/vespa/searchlib/index/uri_field.cpp b/searchlib/src/vespa/searchlib/index/uri_field.cpp new file mode 100644 index 00000000000..a5b5ec2b337 --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/uri_field.cpp @@ -0,0 +1,108 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "uri_field.h" + +namespace search::index { + +UriField::UriField() + : _all(Schema::UNKNOWN_FIELD_ID), + _scheme(Schema::UNKNOWN_FIELD_ID), + _host(Schema::UNKNOWN_FIELD_ID), + _port(Schema::UNKNOWN_FIELD_ID), + _path(Schema::UNKNOWN_FIELD_ID), + _query(Schema::UNKNOWN_FIELD_ID), + _fragment(Schema::UNKNOWN_FIELD_ID), + _hostname(Schema::UNKNOWN_FIELD_ID) +{ +} + + +bool +UriField::valid(const Schema &schema, + uint32_t fieldId, + const Schema::CollectionType &collectionType) +{ + if (fieldId == Schema::UNKNOWN_FIELD_ID) { + return false; + } + const Schema::IndexField &field = schema.getIndexField(fieldId); + if (field.getDataType() != schema::DataType::STRING) { + return false; + } + if (field.getCollectionType() != collectionType) { + return false; + } + return true; +} + + +bool +UriField::broken(const Schema &schema, + const Schema::CollectionType & + collectionType) const +{ + return !valid(schema, _all, collectionType) && + valid(schema, _scheme, collectionType) && + valid(schema, _host, collectionType) && + valid(schema, _port, collectionType) && + valid(schema, _path, collectionType) && + valid(schema, _query, collectionType) && + valid(schema, _fragment, collectionType); +} + +bool +UriField::valid(const Schema &schema, + const Schema::CollectionType & + collectionType) const +{ + return valid(schema, _all, collectionType) && + valid(schema, _scheme, collectionType) && + valid(schema, _host, collectionType) && + valid(schema, _port, collectionType) && + valid(schema, _path, collectionType) && + valid(schema, _query, collectionType) && + valid(schema, _fragment, collectionType); +} + + +void +UriField::setup(const Schema &schema, + const vespalib::string &field) +{ + _all = schema.getIndexFieldId(field); + _scheme = schema.getIndexFieldId(field + ".scheme"); + _host = schema.getIndexFieldId(field + ".host"); + _port = schema.getIndexFieldId(field + ".port"); + _path = schema.getIndexFieldId(field + ".path"); + _query = schema.getIndexFieldId(field + ".query"); + _fragment = schema.getIndexFieldId(field + ".fragment"); + _hostname = schema.getIndexFieldId(field + ".hostname"); +} + + +void +UriField::markUsed(UsedFieldsMap &usedFields, + uint32_t field) +{ + if (field == Schema::UNKNOWN_FIELD_ID) { + return; + } + assert(usedFields.size() > field); + usedFields[field] = true; +} + + +void +UriField::markUsed(UsedFieldsMap &usedFields) const +{ + markUsed(usedFields, _all); + markUsed(usedFields, _scheme); + markUsed(usedFields, _host); + markUsed(usedFields, _port); + markUsed(usedFields, _path); + markUsed(usedFields, _query); + markUsed(usedFields, _fragment); + markUsed(usedFields, _hostname); +} + +} diff --git a/searchlib/src/vespa/searchlib/index/uri_field.h b/searchlib/src/vespa/searchlib/index/uri_field.h new file mode 100644 index 00000000000..7730528d607 --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/uri_field.h @@ -0,0 +1,40 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchcommon/common/schema.h> + +namespace search::index { + +/** + * Fields from an index schema used to represent an uri. + **/ +class UriField +{ +public: + using UsedFieldsMap = std::vector<bool>; + + uint32_t _all; + uint32_t _scheme; + uint32_t _host; + uint32_t _port; + uint32_t _path; + uint32_t _query; + uint32_t _fragment; + uint32_t _hostname; + +private: + static void markUsed(UsedFieldsMap &usedFields, uint32_t field); + static bool valid(const Schema &schema, uint32_t fieldId, + const Schema::CollectionType &collectionType); + +public: + UriField(); + + bool broken(const Schema &schema, const Schema::CollectionType &collectionType) const; + bool valid(const Schema &schema, const Schema::CollectionType &collectionType) const; + void setup(const Schema &schema, const vespalib::string &field); + void markUsed(UsedFieldsMap &usedFields) const; +}; + +} diff --git a/searchlib/src/vespa/searchlib/memoryindex/documentinverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/documentinverter.cpp index e609c2aefd1..08d77fa8dce 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/documentinverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/documentinverter.cpp @@ -30,18 +30,13 @@ using document::IntFieldValue; using document::StructFieldValue; using document::DataType; using document::DocumentType; -using document::Annotation; -using document::AnnotationType; using document::AlternateSpanList; using document::Span; using document::SpanList; using document::SimpleSpanList; using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using index::DocIdAndPosOccFeatures; using index::Schema; -using vespalib::make_string; using search::util::URL; diff --git a/searchlib/src/vespa/searchlib/memoryindex/documentinverter.h b/searchlib/src/vespa/searchlib/memoryindex/documentinverter.h index 177024c9860..e92c0810ea7 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/documentinverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/documentinverter.h @@ -3,8 +3,15 @@ #pragma once #include "i_document_remove_listener.h" -#include <vespa/searchlib/index/doctypebuilder.h> - +#include <vespa/searchlib/index/schema_index_fields.h> + +namespace document { +class DataType; +class Document; +class DocumentType; +class Field; +class FieldValue; +} namespace search { class ISequencedTaskExecutor; @@ -25,14 +32,10 @@ private: const index::Schema &_schema; - typedef index::DocTypeBuilder DocTypeBuilder; - typedef DocTypeBuilder::UriField UriField; - typedef DocTypeBuilder::SchemaIndexFields SchemaIndexFields; - void addFieldPath(const document::DocumentType &docType, uint32_t fieldId); void buildFieldPath(const document::DocumentType & docType, const document::DataType *dataType); void invertNormalDocTextField(size_t fieldId, const document::FieldValue &field); - void invertNormalDocUriField(const UriField &handle, const document::FieldValue &field); + void invertNormalDocUriField(const index::UriField &handle, const document::FieldValue &field); //typedef document::FieldPath FieldPath; typedef document::Field FieldPath; @@ -40,7 +43,7 @@ private: IndexedFieldPaths _indexedFieldPaths; const document::DataType * _dataType; - DocTypeBuilder::SchemaIndexFields _schemaIndexFields; + index::SchemaIndexFields _schemaIndexFields; std::vector<std::unique_ptr<FieldInverter>> _inverters; std::vector<std::unique_ptr<UrlFieldInverter>> _urlInverters; diff --git a/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.cpp index 957573d0ad7..78f33b9fae8 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.cpp @@ -3,10 +3,14 @@ #include "fieldinverter.h" #include "ordereddocumentinserter.h" #include <vespa/document/datatype/urldatatype.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> #include <vespa/searchlib/util/url.h> #include <stdexcept> #include <vespa/vespalib/text/utf8.h> #include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/util/stringfmt.h> #include <vespa/searchlib/common/sort.h> #include <vespa/searchlib/bitcompression/compression.h> #include <vespa/searchlib/bitcompression/posocccompression.h> diff --git a/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.h b/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.h index 08aec510002..96108a50f77 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/fieldinverter.h @@ -6,7 +6,6 @@ #include <set> #include <vespa/document/fieldvalue/document.h> #include <vespa/document/datatype/datatypes.h> -#include <vespa/searchlib/index/doctypebuilder.h> #include <limits> #include "i_document_remove_listener.h" #include <vespa/searchlib/index/docidandfeatures.h> diff --git a/searchlib/src/vespa/searchlib/memoryindex/memoryindex.cpp b/searchlib/src/vespa/searchlib/memoryindex/memoryindex.cpp index ee61ef5a42f..4c9ea41183a 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/memoryindex.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/memoryindex.cpp @@ -3,6 +3,8 @@ #include "memoryindex.h" #include "postingiterator.h" #include "documentinverter.h" +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/document.h> #include <vespa/searchlib/index/schemautil.h> #include <vespa/searchlib/queryeval/create_blueprint_visitor_helper.h> #include <vespa/searchlib/queryeval/booleanmatchiteratorwrapper.h> diff --git a/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.cpp b/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.cpp index 0f8c05a44b9..be216f7c2ba 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.cpp @@ -3,10 +3,14 @@ #include "urlfieldinverter.h" #include "fieldinverter.h" #include <vespa/document/datatype/urldatatype.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> #include <vespa/searchlib/util/url.h> #include <stdexcept> #include <vespa/vespalib/text/utf8.h> #include <vespa/vespalib/text/lowercase.h> +#include <vespa/vespalib/util/stringfmt.h> #include <vespa/searchlib/common/sort.h> #include <vespa/log/log.h> diff --git a/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.h b/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.h index 84aac5a0620..74f96cd40c3 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.h +++ b/searchlib/src/vespa/searchlib/memoryindex/urlfieldinverter.h @@ -2,7 +2,8 @@ #pragma once -#include <vespa/searchlib/index/doctypebuilder.h> +#include <vespa/searchcommon/common/datatype.h> +#include <vespa/document/fieldvalue/structfieldvalue.h> namespace search { @@ -24,12 +25,8 @@ class UrlFieldInverter FieldInverter *_hostname; bool _useAnnotations; - index::Schema::CollectionType _collectionType; + index::schema::CollectionType _collectionType; -public: - using UriField = index::DocTypeBuilder::UriField; - -private: void startDoc(uint32_t docId); void endDoc(); @@ -56,7 +53,7 @@ private: void invertUrlField(const document::FieldValue &field); public: - UrlFieldInverter(index::Schema::CollectionType collectionType, + UrlFieldInverter(index::schema::CollectionType collectionType, FieldInverter *all, FieldInverter *scheme, FieldInverter *host, |