diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-10-12 17:10:37 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-12 17:10:37 +0200 |
commit | c2fff990ef096c124b235ce34e3975ccccdbe8d6 (patch) | |
tree | 194d83650f4cdd245032351abd1967b3985e3a4a | |
parent | f329a9d5e0a323b0485dcae52d90987b675808bc (diff) | |
parent | e1e90137560795397e77203b4e1a75cd3c61396f (diff) |
Merge pull request #24404 from vespa-engine/toregge/remove-docbuilder-and-add-stringfieldbuilder
Remove search::index::DocBuilder. Add search::index::StringFieldBuilder.
38 files changed, 920 insertions, 2426 deletions
diff --git a/searchcore/src/tests/proton/attribute/attribute_populator/attribute_populator_test.cpp b/searchcore/src/tests/proton/attribute/attribute_populator/attribute_populator_test.cpp index c66b2dd15dc..19b8348fb7a 100644 --- a/searchcore/src/tests/proton/attribute/attribute_populator/attribute_populator_test.cpp +++ b/searchcore/src/tests/proton/attribute/attribute_populator/attribute_populator_test.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/searchcore/proton/attribute/attribute_populator.h> +#include <vespa/document/repo/documenttyperepo.h> #include <vespa/searchcore/proton/attribute/attributemanager.h> #include <vespa/searchcore/proton/common/hw_info.h> #include <vespa/searchcore/proton/test/test.h> diff --git a/searchcore/src/tests/proton/documentdb/feedhandler/feedhandler_test.cpp b/searchcore/src/tests/proton/documentdb/feedhandler/feedhandler_test.cpp index e89d5eef078..4fc38992368 100644 --- a/searchcore/src/tests/proton/documentdb/feedhandler/feedhandler_test.cpp +++ b/searchcore/src/tests/proton/documentdb/feedhandler/feedhandler_test.cpp @@ -3,7 +3,11 @@ #include <vespa/persistence/spi/result.h> #include <vespa/document/datatype/tensor_data_type.h> #include <vespa/document/datatype/documenttype.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/tensorfieldvalue.h> #include <vespa/document/update/assignvalueupdate.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/document/repo/documenttyperepo.h> #include <vespa/document/update/documentupdate.h> #include <vespa/document/update/clearvalueupdate.h> @@ -29,7 +33,7 @@ #include <vespa/searchcore/proton/server/ireplayconfig.h> #include <vespa/searchcore/proton/test/dummy_feed_view.h> #include <vespa/searchcore/proton/test/transport_helper.h> -#include <vespa/searchlib/index/docbuilder.h> +#include 
<vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> #include <vespa/searchlib/transactionlog/translogserver.h> #include <vespa/vespalib/testkit/testapp.h> @@ -271,20 +275,33 @@ MyFeedView::~MyFeedView() = default; struct SchemaContext { - Schema::SP schema; - std::unique_ptr<DocBuilder> builder; + Schema::SP schema; + EmptyDocBuilder builder; SchemaContext(); + SchemaContext(bool has_i2); ~SchemaContext(); DocTypeName getDocType() const { - return DocTypeName(builder->getDocumentType().getName()); + return DocTypeName(builder.get_document_type().getName()); } - const std::shared_ptr<const document::DocumentTypeRepo> &getRepo() const { return builder->getDocumentTypeRepo(); } + std::shared_ptr<const document::DocumentTypeRepo> getRepo() const { return builder.get_repo_sp(); } void addField(vespalib::stringref fieldName); }; SchemaContext::SchemaContext() + : SchemaContext(false) +{ +} + +SchemaContext::SchemaContext(bool has_i2) : schema(std::make_shared<Schema>()), - builder() + builder([has_i2](auto& header) { + header.addTensorField("tensor", "tensor(x{},y{})") + .addTensorField("tensor2", "tensor(x{},y{})") + .addField("i1", document::DataType::T_STRING); + if (has_i2) { + header.addField("i2", document::DataType::T_STRING); + } + }) { schema->addAttributeField(Schema::AttributeField("tensor", DataType::TENSOR, CollectionType::SINGLE, "tensor(x{},y{})")); schema->addAttributeField(Schema::AttributeField("tensor2", DataType::TENSOR, CollectionType::SINGLE, "tensor(x{},y{})")); @@ -298,14 +315,13 @@ void SchemaContext::addField(vespalib::stringref fieldName) { schema->addIndexField(Schema::IndexField(fieldName, DataType::STRING, CollectionType::SINGLE)); - builder = std::make_unique<DocBuilder>(*schema); } struct DocumentContext { Document::SP doc; BucketId bucketId; - DocumentContext(const vespalib::string &docId, DocBuilder &builder) : - doc(builder.startDocument(docId).endDocument().release()), + 
DocumentContext(const vespalib::string &docId, EmptyDocBuilder &builder) : + doc(builder.make_document(docId)), bucketId(BucketFactory::getBucketId(doc->getId())) { } @@ -313,7 +329,7 @@ struct DocumentContext { struct TwoFieldsSchemaContext : public SchemaContext { TwoFieldsSchemaContext() - : SchemaContext() + : SchemaContext(true) { addField("i2"); } @@ -324,8 +340,8 @@ TensorDataType tensor1DType(ValueType::from_spec("tensor(x{})")); struct UpdateContext { DocumentUpdate::SP update; BucketId bucketId; - UpdateContext(const vespalib::string &docId, DocBuilder &builder) : - update(std::make_shared<DocumentUpdate>(*builder.getDocumentTypeRepo(), builder.getDocumentType(), DocumentId(docId))), + UpdateContext(const vespalib::string &docId, EmptyDocBuilder &builder) : + update(std::make_shared<DocumentUpdate>(builder.get_repo(), builder.get_document_type(), DocumentId(docId))), bucketId(BucketFactory::getBucketId(update->getId())) { } @@ -464,7 +480,7 @@ TEST_F("require that heartBeat calls FeedView's heartBeat", TEST_F("require that outdated remove is ignored", FeedHandlerFixture) { - DocumentContext doc_context("id:ns:searchdocument::foo", *f.schema.builder); + DocumentContext doc_context("id:ns:searchdocument::foo", f.schema.builder); auto op = std::make_unique<RemoveOperationWithDocId>(doc_context.bucketId, Timestamp(10), doc_context.doc->getId()); static_cast<DocumentOperation &>(*op).setPrevDbDocumentId(DbDocumentId(4)); static_cast<DocumentOperation &>(*op).setPrevTimestamp(Timestamp(10000)); @@ -476,7 +492,7 @@ TEST_F("require that outdated remove is ignored", FeedHandlerFixture) TEST_F("require that outdated put is ignored", FeedHandlerFixture) { - DocumentContext doc_context("id:ns:searchdocument::foo", *f.schema.builder); + DocumentContext doc_context("id:ns:searchdocument::foo", f.schema.builder); auto op =std::make_unique<PutOperation>(doc_context.bucketId, Timestamp(10), std::move(doc_context.doc)); static_cast<DocumentOperation 
&>(*op).setPrevTimestamp(Timestamp(10000)); FeedTokenContext token_context; @@ -496,7 +512,7 @@ addLidToRemove(RemoveDocumentsOperation &op) TEST_F("require that handleMove calls FeedView", FeedHandlerFixture) { - DocumentContext doc_context("id:ns:searchdocument::foo", *f.schema.builder); + DocumentContext doc_context("id:ns:searchdocument::foo", f.schema.builder); MoveOperation op(doc_context.bucketId, Timestamp(2), doc_context.doc, DbDocumentId(0, 2), 1); op.setDbDocumentId(DbDocumentId(1, 2)); f.runAsMaster([&]() { f.handler.handleMove(op, IDestructorCallback::SP()); }); @@ -556,7 +572,7 @@ TEST_F("require that flush cannot unprune", FeedHandlerFixture) TEST_F("require that remove of unknown document with known data type stores remove", FeedHandlerFixture) { - DocumentContext doc_context("id:test:searchdocument::foo", *f.schema.builder); + DocumentContext doc_context("id:test:searchdocument::foo", f.schema.builder); auto op = std::make_unique<RemoveOperationWithDocId>(doc_context.bucketId, Timestamp(10), doc_context.doc->getId()); FeedTokenContext token_context; f.handler.performOperation(std::move(token_context.token), std::move(op)); @@ -566,7 +582,7 @@ TEST_F("require that remove of unknown document with known data type stores remo TEST_F("require that partial update for non-existing document is tagged as such", FeedHandlerFixture) { - UpdateContext upCtx("id:test:searchdocument::foo", *f.schema.builder); + UpdateContext upCtx("id:test:searchdocument::foo", f.schema.builder); auto op = std::make_unique<UpdateOperation>(upCtx.bucketId, Timestamp(10), upCtx.update); FeedTokenContext token_context; f.handler.performOperation(std::move(token_context.token), std::move(op)); @@ -582,7 +598,7 @@ TEST_F("require that partial update for non-existing document is tagged as such" TEST_F("require that partial update for non-existing document is created if specified", FeedHandlerFixture) { f.handler.setSerialNum(15); - UpdateContext upCtx("id:test:searchdocument::foo", 
*f.schema.builder); + UpdateContext upCtx("id:test:searchdocument::foo", f.schema.builder); upCtx.update->setCreateIfNonExistent(true); f.feedView.metaStore.insert(upCtx.update->getId().getGlobalId(), MyDocumentMetaStore::Entry(5, 5, Timestamp(10))); auto op = std::make_unique<UpdateOperation>(upCtx.bucketId, Timestamp(10), upCtx.update); @@ -605,7 +621,7 @@ TEST_F("require that put is rejected if resource limit is reached", FeedHandlerF f.writeFilter._acceptWriteOperation = false; f.writeFilter._message = "Attribute resource limit reached"; - DocumentContext docCtx("id:test:searchdocument::foo", *f.schema.builder); + DocumentContext docCtx("id:test:searchdocument::foo", f.schema.builder); auto op = std::make_unique<PutOperation>(docCtx.bucketId, Timestamp(10), std::move(docCtx.doc)); FeedTokenContext token; f.handler.performOperation(std::move(token.token), std::move(op)); @@ -620,7 +636,7 @@ TEST_F("require that update is rejected if resource limit is reached", FeedHandl f.writeFilter._acceptWriteOperation = false; f.writeFilter._message = "Attribute resource limit reached"; - UpdateContext updCtx("id:test:searchdocument::foo", *f.schema.builder); + UpdateContext updCtx("id:test:searchdocument::foo", f.schema.builder); updCtx.addFieldUpdate("tensor"); auto op = std::make_unique<UpdateOperation>(updCtx.bucketId, Timestamp(10), updCtx.update); FeedTokenContext token; @@ -637,7 +653,7 @@ TEST_F("require that remove is NOT rejected if resource limit is reached", FeedH f.writeFilter._acceptWriteOperation = false; f.writeFilter._message = "Attribute resource limit reached"; - DocumentContext docCtx("id:test:searchdocument::foo", *f.schema.builder); + DocumentContext docCtx("id:test:searchdocument::foo", f.schema.builder); auto op = std::make_unique<RemoveOperationWithDocId>(docCtx.bucketId, Timestamp(10), docCtx.doc->getId()); FeedTokenContext token; f.handler.performOperation(std::move(token.token), std::move(op)); @@ -651,7 +667,7 @@ checkUpdate(FeedHandlerFixture 
&f, SchemaContext &schemaContext, const vespalib::string &fieldName, bool expectReject, bool existing) { f.handler.setSerialNum(15); - UpdateContext updCtx("id:test:searchdocument::foo", *schemaContext.builder); + UpdateContext updCtx("id:test:searchdocument::foo", schemaContext.builder); updCtx.addFieldUpdate(fieldName); if (existing) { f.feedView.metaStore.insert(updCtx.update->getId().getGlobalId(), MyDocumentMetaStore::Entry(5, 5, Timestamp(9))); @@ -733,7 +749,7 @@ TEST_F("require that tensor update with wrong tensor type fails", FeedHandlerFix TEST_F("require that put with different document type repo is ok", FeedHandlerFixture) { TwoFieldsSchemaContext schema; - DocumentContext doc_context("id:ns:searchdocument::foo", *schema.builder); + DocumentContext doc_context("id:ns:searchdocument::foo", schema.builder); auto op = std::make_unique<PutOperation>(doc_context.bucketId, Timestamp(10), std::move(doc_context.doc)); FeedTokenContext token_context; @@ -747,7 +763,7 @@ TEST_F("require that put with different document type repo is ok", FeedHandlerFi TEST_F("require that feed stats are updated", FeedHandlerFixture) { - DocumentContext doc_context("id:ns:searchdocument::foo", *f.schema.builder); + DocumentContext doc_context("id:ns:searchdocument::foo", f.schema.builder); auto op =std::make_unique<PutOperation>(doc_context.bucketId, Timestamp(10), std::move(doc_context.doc)); FeedTokenContext token_context; f.handler.performOperation(std::move(token_context.token), std::move(op)); diff --git a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.cpp b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.cpp index 9c68d7d5974..b3a2e9cad83 100644 --- a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.cpp +++ b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.cpp @@ -127,7 +127,8 @@ MyHandler::handleCompactLidSpace(const CompactLidSpaceOperation &op, std::shared } 
MyHandler::MyHandler(bool storeMoveDoneContexts, bool bucketIdEqualLid) - : _stats(), + : _builder(), + _stats(), _moveFromLid(0), _moveToLid(0), _handleMoveCnt(0), @@ -140,9 +141,8 @@ MyHandler::MyHandler(bool storeMoveDoneContexts, bool bucketIdEqualLid) _rm_listener(), _docs() { - DocBuilder builder = DocBuilder(Schema()); for (uint32_t i(0); i < 10; i++) { - auto doc = builder.startDocument(fmt("%s%d", DOC_ID.c_str(), i)).endDocument(); + auto doc = _builder.make_document(fmt("%s%d", DOC_ID.c_str(), i)); _docs.emplace_back(DocumentMetaData(i, TIMESTAMP_1, createBucketId(i), doc->getId().getGlobalId()), std::move(doc)); } } diff --git a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.h b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.h index b404fc6956a..806729a108c 100644 --- a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.h +++ b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_common.h @@ -17,11 +17,14 @@ #include <vespa/searchcore/proton/test/test.h> #include <vespa/searchcore/proton/test/dummy_document_store.h> #include <vespa/vespalib/util/idestructorcallback.h> -#include <vespa/searchlib/index/docbuilder.h> -using namespace document; +using document::BucketId; +using document::GlobalId; +using document::Document; +using document::DocumentId; +using document::DocumentTypeRepo; using namespace proton; -using namespace search::index; +using search::index::EmptyDocBuilder; using namespace search; using namespace vespalib; using vespalib::IDestructorCallback; @@ -60,6 +63,7 @@ struct MyScanIterator : public IDocumentScanIterator { }; struct MyHandler : public ILidSpaceCompactionHandler { + EmptyDocBuilder _builder; std::vector<LidUsageStats> _stats; std::vector<LidVector> _lids; mutable uint32_t _moveFromLid; diff --git a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_handler_test.cpp 
b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_handler_test.cpp index bc9cd9a93fa..fd38853dca1 100644 --- a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_handler_test.cpp +++ b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_handler_test.cpp @@ -5,7 +5,7 @@ #include <vespa/vespalib/gtest/gtest.h> struct HandlerTest : public ::testing::Test { - DocBuilder _docBuilder; + EmptyDocBuilder _docBuilder; std::shared_ptr<bucketdb::BucketDBOwner> _bucketDB; MyDocumentStore _docStore; MySubDb _subDb; @@ -15,13 +15,13 @@ struct HandlerTest : public ::testing::Test { }; HandlerTest::HandlerTest() - : _docBuilder(Schema()), + : _docBuilder(), _bucketDB(std::make_shared<bucketdb::BucketDBOwner>()), _docStore(), - _subDb(_bucketDB, _docStore, _docBuilder.getDocumentTypeRepo()), + _subDb(_bucketDB, _docStore, _docBuilder.get_repo_sp()), _handler(_subDb.maintenance_sub_db, "test") { - _docStore._readDoc = _docBuilder.startDocument(DOC_ID).endDocument(); + _docStore._readDoc = _docBuilder.make_document(DOC_ID); } HandlerTest::~HandlerTest() = default; diff --git a/searchcore/src/tests/proton/documentdb/maintenancecontroller/maintenancecontroller_test.cpp b/searchcore/src/tests/proton/documentdb/maintenancecontroller/maintenancecontroller_test.cpp index ea4d556c502..915402122b8 100644 --- a/searchcore/src/tests/proton/documentdb/maintenancecontroller/maintenancecontroller_test.cpp +++ b/searchcore/src/tests/proton/documentdb/maintenancecontroller/maintenancecontroller_test.cpp @@ -35,7 +35,6 @@ #include <vespa/searchcore/proton/test/test.h> #include <vespa/searchcore/proton/test/transport_helper.h> #include <vespa/searchlib/common/idocumentmetastore.h> -#include <vespa/searchlib/index/docbuilder.h> #include <vespa/vespalib/data/slime/slime.h> #include <vespa/vespalib/testkit/testapp.h> #include <vespa/vespalib/util/destructor_callbacks.h> @@ -99,11 +98,11 @@ class MyDocumentSubDB uint32_t _subDBId; 
DocumentMetaStore::SP _metaStoreSP; DocumentMetaStore & _metaStore; - const std::shared_ptr<const document::DocumentTypeRepo> &_repo; + std::shared_ptr<const document::DocumentTypeRepo> _repo; const DocTypeName &_docTypeName; public: - MyDocumentSubDB(uint32_t subDBId, SubDbType subDbType, const std::shared_ptr<const document::DocumentTypeRepo> &repo, + MyDocumentSubDB(uint32_t subDBId, SubDbType subDbType, std::shared_ptr<const document::DocumentTypeRepo> repo, std::shared_ptr<bucketdb::BucketDBOwner> bucketDB, const DocTypeName &docTypeName); ~MyDocumentSubDB(); @@ -136,7 +135,7 @@ public: const IDocumentMetaStore &getMetaStore() const { return _metaStore; } }; -MyDocumentSubDB::MyDocumentSubDB(uint32_t subDBId, SubDbType subDbType, const std::shared_ptr<const document::DocumentTypeRepo> &repo, +MyDocumentSubDB::MyDocumentSubDB(uint32_t subDBId, SubDbType subDbType, std::shared_ptr<const document::DocumentTypeRepo> repo, std::shared_ptr<bucketdb::BucketDBOwner> bucketDB, const DocTypeName &docTypeName) : _docs(), _subDBId(subDBId), @@ -144,7 +143,7 @@ MyDocumentSubDB::MyDocumentSubDB(uint32_t subDBId, SubDbType subDbType, const st std::move(bucketDB), DocumentMetaStore::getFixedName(), search::GrowStrategy(), subDbType)), _metaStore(*_metaStoreSP), - _repo(repo), + _repo(std::move(repo)), _docTypeName(docTypeName) { _metaStore.constructFreeList(); diff --git a/searchcore/src/tests/proton/documentdb/storeonlyfeedview/storeonlyfeedview_test.cpp b/searchcore/src/tests/proton/documentdb/storeonlyfeedview/storeonlyfeedview_test.cpp index 00694b6b78f..67342df5613 100644 --- a/searchcore/src/tests/proton/documentdb/storeonlyfeedview/storeonlyfeedview_test.cpp +++ b/searchcore/src/tests/proton/documentdb/storeonlyfeedview/storeonlyfeedview_test.cpp @@ -2,6 +2,7 @@ #include <vespa/document/base/documentid.h> #include <vespa/document/datatype/datatype.h> +#include <vespa/document/fieldvalue/document.h> #include <vespa/searchcommon/common/schema.h> #include 
<vespa/searchcore/proton/server/putdonecontext.h> #include <vespa/searchcore/proton/server/removedonecontext.h> @@ -13,7 +14,7 @@ #include <vespa/searchcore/proton/test/mock_summary_adapter.h> #include <vespa/searchcore/proton/test/transport_helper.h> #include <vespa/searchcore/proton/test/thread_utils.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/vespalib/util/destructor_callbacks.h> #include <vespa/vespalib/util/size_literals.h> #include <vespa/vespalib/testkit/testapp.h> @@ -32,7 +33,7 @@ using namespace proton; using search::DocumentIdT; using vespalib::IDestructorCallback; using search::SerialNum; -using search::index::DocBuilder; +using search::index::EmptyDocBuilder; using search::index::Schema; using storage::spi::Timestamp; using vespalib::make_string; @@ -59,9 +60,8 @@ public: }; std::shared_ptr<const DocumentTypeRepo> myGetDocumentTypeRepo() { - Schema schema; - DocBuilder builder(schema); - std::shared_ptr<const DocumentTypeRepo> repo = builder.getDocumentTypeRepo(); + EmptyDocBuilder builder; + std::shared_ptr<const DocumentTypeRepo> repo = builder.get_repo_sp(); ASSERT_TRUE(repo.get()); return repo; } diff --git a/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp b/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp index ac540ad2e2d..49f13d8c5b5 100644 --- a/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp +++ b/searchcore/src/tests/proton/feed_and_search/feed_and_search.cpp @@ -3,6 +3,8 @@ #include <vespa/document/datatype/datatype.h> #include <vespa/document/fieldvalue/document.h> #include <vespa/document/fieldvalue/fieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/searchlib/common/documentsummary.h> #include <vespa/vespalib/util/sequencedtaskexecutor.h> #include <vespa/searchlib/common/flush_token.h> @@ -10,8 +12,9 @@ #include 
<vespa/searchlib/diskindex/fusion.h> #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/fef/fef.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/memory_index.h> #include <vespa/searchlib/test/index/mock_field_length_inspector.h> #include <vespa/searchlib/query/base.h> @@ -31,6 +34,7 @@ LOG_SETUP("feed_and_search_test"); using document::DataType; using document::Document; using document::FieldValue; +using document::StringFieldValue; using search::DocumentIdT; using search::FlushToken; using search::TuneFileIndexing; @@ -44,9 +48,10 @@ using search::fef::MatchData; using search::fef::MatchDataLayout; using search::fef::TermFieldHandle; using search::fef::TermFieldMatchData; -using search::index::DocBuilder; +using search::index::EmptyDocBuilder; using search::index::DummyFileHeaderContext; using search::index::Schema; +using search::index::StringFieldBuilder; using search::index::test::MockFieldLengthInspector; using search::memoryindex::MemoryIndex; using search::query::SimpleStringTerm; @@ -113,14 +118,13 @@ Schema getSchema() { return schema; } -Document::UP buildDocument(DocBuilder & doc_builder, int id, +Document::UP buildDocument(EmptyDocBuilder & doc_builder, int id, const string &word) { ostringstream ost; ost << "id:ns:searchdocument::" << id; - doc_builder.startDocument(ost.str()); - doc_builder.startIndexField(field_name) - .addStr(noise).addStr(word).endField(); - return doc_builder.endDocument(); + auto doc = doc_builder.make_document(ost.str()); + doc->setValue(field_name, StringFieldBuilder(doc_builder).word(noise).space().word(word).build()); + return doc; } // Performs a search using a Searchable. 
@@ -165,7 +169,7 @@ void Test::requireThatMemoryIndexCanBeDumpedAndSearched() { auto indexFieldInverter = vespalib::SequencedTaskExecutor::create(invert_executor, 2); auto indexFieldWriter = vespalib::SequencedTaskExecutor::create(write_executor, 2); MemoryIndex memory_index(schema, MockFieldLengthInspector(), *indexFieldInverter, *indexFieldWriter); - DocBuilder doc_builder(schema); + EmptyDocBuilder doc_builder([](auto& header) { header.addField(field_name, DataType::T_STRING); }); Document::UP doc = buildDocument(doc_builder, doc_id1, word1); memory_index.insertDocument(doc_id1, *doc, {}); diff --git a/searchcore/src/tests/proton/index/fusionrunner_test.cpp b/searchcore/src/tests/proton/index/fusionrunner_test.cpp index 850f8a8f0d1..166d34f366b 100644 --- a/searchcore/src/tests/proton/index/fusionrunner_test.cpp +++ b/searchcore/src/tests/proton/index/fusionrunner_test.cpp @@ -1,15 +1,19 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/searchcorespi/index/fusionrunner.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/searchcore/proton/index/indexmanager.h> #include <vespa/searchcore/proton/test/transport_helper.h> -#include <vespa/searchcorespi/index/fusionrunner.h> #include <vespa/vespalib/util/isequencedtaskexecutor.h> #include <vespa/searchlib/common/flush_token.h> #include <vespa/searchlib/diskindex/diskindex.h> #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/fef/matchdatalayout.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/memory_index.h> #include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/test/index/mock_field_length_inspector.h> @@ -25,6 +29,7 @@ using document::Document; using document::FieldValue; +using document::StringFieldValue; using proton::ExecutorThreadingService; using proton::index::IndexManager; using search::FixedSourceSelector; @@ -38,9 +43,10 @@ using search::fef::MatchData; using search::fef::MatchDataLayout; using search::fef::TermFieldHandle; using search::fef::TermFieldMatchData; -using search::index::DocBuilder; +using search::index::EmptyDocBuilder; using search::index::DummyFileHeaderContext; using search::index::Schema; +using search::index::StringFieldBuilder; using search::index::schema::DataType; using search::index::test::MockFieldLengthInspector; using search::memoryindex::MemoryIndex; @@ -149,15 +155,15 @@ void Test::tearDown() { _selector.reset(0); } -Document::UP buildDocument(DocBuilder & doc_builder, int id, const string &word) { +Document::UP buildDocument(EmptyDocBuilder & doc_builder, int id, const string &word) { vespalib::asciistream ost; ost << 
"id:ns:searchdocument::" << id; - doc_builder.startDocument(ost.str()); - doc_builder.startIndexField(field_name).addStr(word).endField(); - return doc_builder.endDocument(); + auto doc = doc_builder.make_document(ost.str()); + doc->setValue(field_name, StringFieldBuilder(doc_builder).word(word).build()); + return doc; } -void addDocument(DocBuilder & doc_builder, MemoryIndex &index, ISourceSelector &selector, +void addDocument(EmptyDocBuilder & doc_builder, MemoryIndex &index, ISourceSelector &selector, uint8_t index_id, uint32_t docid, const string &word) { Document::UP doc = buildDocument(doc_builder, docid, word); index.insertDocument(docid, *doc, {}); @@ -181,7 +187,7 @@ void Test::createIndex(const string &dir, uint32_t id, bool fusion) { _selector->setDefaultSource(id - _selector->getBaseId()); Schema schema = getSchema(); - DocBuilder doc_builder(schema); + EmptyDocBuilder doc_builder([](auto& header) { header.addField(field_name, document::DataType::T_STRING); }); MemoryIndex memory_index(schema, MockFieldLengthInspector(), _service.write().indexFieldInverter(), _service.write().indexFieldWriter()); diff --git a/searchcore/src/tests/proton/index/index_writer/index_writer_test.cpp b/searchcore/src/tests/proton/index/index_writer/index_writer_test.cpp index 75e6b01b46f..7202d7f0abe 100644 --- a/searchcore/src/tests/proton/index/index_writer/index_writer_test.cpp +++ b/searchcore/src/tests/proton/index/index_writer/index_writer_test.cpp @@ -1,10 +1,12 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-#include <vespa/vespalib/testkit/testapp.h> - #include <vespa/searchcore/proton/index/index_writer.h> +#include <vespa/document/fieldvalue/document.h> #include <vespa/searchcore/proton/test/mock_index_manager.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/vespalib/util/stringfmt.h> + #include <vespa/log/log.h> LOG_SETUP("index_writer_test"); @@ -80,21 +82,18 @@ struct Fixture IIndexManager::SP iim; MyIndexManager &mim; IndexWriter iw; - Schema schema; - DocBuilder builder; + EmptyDocBuilder builder; Document::UP dummyDoc; Fixture() : iim(new MyIndexManager()), mim(static_cast<MyIndexManager &>(*iim)), iw(iim), - schema(), - builder(schema), + builder(), dummyDoc(createDoc(1234)) // This content of this is not used { } Document::UP createDoc(uint32_t lid) { - builder.startDocument(vespalib::make_string("id:ns:searchdocument::%u", lid)); - return builder.endDocument(); + return builder.make_document(vespalib::make_string("id:ns:searchdocument::%u", lid)); } void put(SerialNum serialNum, const search::DocumentIdT lid) { iw.put(serialNum, *dummyDoc, lid, {}); diff --git a/searchcore/src/tests/proton/index/indexmanager_test.cpp b/searchcore/src/tests/proton/index/indexmanager_test.cpp index b427daa4ad1..886978f7465 100644 --- a/searchcore/src/tests/proton/index/indexmanager_test.cpp +++ b/searchcore/src/tests/proton/index/indexmanager_test.cpp @@ -1,6 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include <vespa/searchcore/proton/index/indexmanager.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> +#include <vespa/document/fieldvalue/document.h> #include <vespa/searchcore/proton/test/transport_helper.h> #include <vespa/searchcorespi/index/index_manager_stats.h> #include <vespa/searchcorespi/index/indexcollection.h> @@ -9,8 +13,9 @@ #include <vespa/vespalib/util/sequencedtaskexecutor.h> #include <vespa/searchlib/common/flush_token.h> #include <vespa/searchlib/common/serialnum.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/compact_words_store.h> #include <vespa/searchlib/memoryindex/document_inverter.h> #include <vespa/searchlib/memoryindex/document_inverter_context.h> @@ -34,6 +39,7 @@ LOG_SETUP("indexmanager_test"); using document::Document; using document::FieldValue; +using document::StringFieldValue; using proton::index::IndexConfig; using proton::index::IndexManager; using vespalib::SequencedTaskExecutor; @@ -42,10 +48,11 @@ using search::TuneFileAttributes; using search::TuneFileIndexManager; using search::TuneFileIndexing; using vespalib::datastore::EntryRef; -using search::index::DocBuilder; +using search::index::EmptyDocBuilder; using search::index::DummyFileHeaderContext; using search::index::FieldLengthInfo; using search::index::Schema; +using search::index::StringFieldBuilder; using search::index::schema::DataType; using search::index::test::MockFieldLengthInspector; using search::memoryindex::CompactWordsStore; @@ -88,13 +95,13 @@ void removeTestData() { std::filesystem::remove_all(std::filesystem::path(index_dir)); } -Document::UP buildDocument(DocBuilder &doc_builder, int id, +Document::UP buildDocument(EmptyDocBuilder &doc_builder, 
int id, const string &word) { vespalib::asciistream ost; ost << "id:ns:searchdocument::" << id; - doc_builder.startDocument(ost.str()); - doc_builder.startIndexField(field_name).addStr(word).endField(); - return doc_builder.endDocument(); + auto doc = doc_builder.make_document(ost.str()); + doc->setValue(field_name, StringFieldBuilder(doc_builder).word(word).build()); + return doc; } void push_documents_and_wait(search::memoryindex::DocumentInverter &inverter) { @@ -110,7 +117,7 @@ struct IndexManagerTest : public ::testing::Test { TransportAndExecutorService _service; std::unique_ptr<IndexManager> _index_manager; Schema _schema; - DocBuilder _builder; + EmptyDocBuilder _builder; IndexManagerTest() : _serial_num(0), @@ -119,7 +126,7 @@ struct IndexManagerTest : public ::testing::Test { _service(1), _index_manager(), _schema(getSchema()), - _builder(_schema) + _builder([](auto& header) { header.addField(field_name, document::DataType::T_STRING); }) { removeTestData(); std::filesystem::create_directory(std::filesystem::path(index_dir)); diff --git a/searchcore/src/tests/proton/reprocessing/document_reprocessing_handler/document_reprocessing_handler_test.cpp b/searchcore/src/tests/proton/reprocessing/document_reprocessing_handler/document_reprocessing_handler_test.cpp index da645f9a94b..719e762288e 100644 --- a/searchcore/src/tests/proton/reprocessing/document_reprocessing_handler/document_reprocessing_handler_test.cpp +++ b/searchcore/src/tests/proton/reprocessing/document_reprocessing_handler/document_reprocessing_handler_test.cpp @@ -3,7 +3,7 @@ LOG_SETUP("document_reprocessing_handler_test"); #include <vespa/searchcore/proton/reprocessing/document_reprocessing_handler.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/vespalib/testkit/testapp.h> using namespace document; @@ -32,17 +32,17 @@ const vespalib::string DOC_ID = "id:test:searchdocument::0"; struct FixtureBase { 
DocumentReprocessingHandler _handler; - DocBuilder _docBuilder; + EmptyDocBuilder _docBuilder; FixtureBase(uint32_t docIdLimit); ~FixtureBase(); std::shared_ptr<Document> createDoc() { - return _docBuilder.startDocument(DOC_ID).endDocument(); + return _docBuilder.make_document(DOC_ID); } }; FixtureBase::FixtureBase(uint32_t docIdLimit) : _handler(docIdLimit), - _docBuilder(Schema()) + _docBuilder() { } FixtureBase::~FixtureBase() {} diff --git a/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.cpp b/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.cpp index 2cdf1c45485..f9f98705144 100644 --- a/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.cpp @@ -5,8 +5,7 @@ namespace proton::test { UserDocumentsBuilder::UserDocumentsBuilder() - : _schema(), - _builder(_schema), + : _builder(), _docs() { } @@ -17,7 +16,7 @@ UserDocumentsBuilder & UserDocumentsBuilder::createDoc(uint32_t userId, search::DocumentIdT lid) { vespalib::string docId = vespalib::make_string("id:test:searchdocument:n=%u:%u", userId, lid); - document::Document::SP doc(_builder.startDocument(docId).endDocument().release()); + document::Document::SP doc(_builder.make_document(docId)); _docs.addDoc(userId, Document(doc, lid, storage::spi::Timestamp(lid))); return *this; } diff --git a/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.h b/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.h index f05b6da11de..9e806c8a0bf 100644 --- a/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.h +++ b/searchcore/src/vespa/searchcore/proton/test/userdocumentsbuilder.h @@ -2,7 +2,7 @@ #pragma once #include "userdocuments.h" -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/vespalib/util/stringfmt.h> namespace proton::test { @@ -13,14 +13,13 @@ namespace proton::test { class 
UserDocumentsBuilder { private: - search::index::Schema _schema; - search::index::DocBuilder _builder; + search::index::EmptyDocBuilder _builder; UserDocuments _docs; public: UserDocumentsBuilder(); ~UserDocumentsBuilder(); - const std::shared_ptr<const document::DocumentTypeRepo> &getRepo() const { - return _builder.getDocumentTypeRepo(); + std::shared_ptr<const document::DocumentTypeRepo> getRepo() const { + return _builder.get_repo_sp(); } UserDocumentsBuilder &createDoc(uint32_t userId, search::DocumentIdT lid); UserDocumentsBuilder &createDocs(uint32_t userId, search::DocumentIdT begin, diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 62aca6d68cc..c8b3db42340 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -168,9 +168,8 @@ vespa_define_module( src/tests/grouping src/tests/groupingengine src/tests/hitcollector - src/tests/index/docbuilder - src/tests/index/doctypebuilder src/tests/index/field_length_calculator + src/tests/index/string_field_builder src/tests/indexmetainfo src/tests/ld-library-path src/tests/memoryindex/compact_words_store diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp index 6e60d14b8ff..8feb7b7e287 100644 --- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp +++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp @@ -1,14 +1,20 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/searchlib/diskindex/fusion.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/searchlib/common/flush_token.h> #include <vespa/searchlib/diskindex/diskindex.h> -#include <vespa/searchlib/diskindex/fusion.h> #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/diskindex/zcposoccrandread.h> #include <vespa/searchlib/fef/fieldpositionsiterator.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/index/schemautil.h> #include <vespa/searchlib/memoryindex/document_inverter.h> #include <vespa/searchlib/memoryindex/document_inverter_context.h> @@ -31,7 +37,10 @@ LOG_SETUP("fusion_test"); namespace search { +using document::ArrayFieldValue; using document::Document; +using document::StringFieldValue; +using document::WeightedSetFieldValue; using fef::FieldPositionsIterator; using fef::TermFieldMatchData; using fef::TermFieldMatchDataArray; @@ -110,26 +119,20 @@ toString(FieldPositionsIterator posItr, bool hasElements = false, bool hasWeight } std::unique_ptr<Document> -make_doc10(DocBuilder &b) +make_doc10(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("f0"). - addStr("a").addStr("b").addStr("c").addStr("d"). - addStr("e").addStr("f").addStr("z"). - endField(); - b.startIndexField("f1"). - addStr("w").addStr("x"). - addStr("y").addStr("z"). - endField(); - b.startIndexField("f2"). - startElement(4).addStr("ax").addStr("ay").addStr("z").endElement(). - startElement(5).addStr("ax").endElement(). 
- endField(); - b.startIndexField("f3"). - startElement(4).addStr("wx").addStr("z").endElement(). - endField(); - - return b.endDocument(); + auto doc = b.make_document("id:ns:searchdocument::10"); + StringFieldBuilder sfb(b); + doc->setValue("f0", sfb.tokenize("a b c d e f z").build()); + doc->setValue("f1", sfb.tokenize("w x y z").build()); + ArrayFieldValue string_array(b.get_data_type("Array<String>")); + string_array.add(sfb.tokenize("ax ay z").build()); + string_array.add(sfb.tokenize("ax").build()); + doc->setValue("f2", string_array); + WeightedSetFieldValue string_wset(b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.tokenize("wx z").build(), 4); + doc->setValue("f3", string_wset); + return doc; } Schema::IndexField @@ -151,6 +154,18 @@ make_schema(bool interleaved_features) return schema; } +EmptyDocBuilder::AddFieldsType +make_add_fields() +{ + return [](auto& header) { using namespace document::config_builder; + using DataType = document::DataType; + header.addField("f0", DataType::T_STRING) + .addField("f1", DataType::T_STRING) + .addField("f2", Array(DataType::T_STRING)) + .addField("f3", Wset(DataType::T_STRING)); + }; +} + void assert_interleaved_features(DiskIndex &d, const vespalib::string &field, const vespalib::string &term, uint32_t doc_id, uint32_t exp_num_occs, uint32_t exp_field_length) { @@ -327,7 +342,8 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire addField("f2").addField("f3"). 
addField("f4")); FieldIndexCollection fic(schema, MockFieldLengthInspector()); - DocBuilder b(schema); + EmptyDocBuilder b(make_add_fields()); + StringFieldBuilder sfb(b); auto invertThreads = SequencedTaskExecutor::create(invert_executor, 2); auto pushThreads = SequencedTaskExecutor::create(push_executor, 2); DocumentInverterContext inv_context(schema, *invertThreads, *pushThreads, fic); @@ -338,19 +354,21 @@ FusionTest::requireThatFusionIsWorking(const vespalib::string &prefix, bool dire inv.invertDocument(10, *doc, {}); myPushDocument(inv); - b.startDocument("id:ns:searchdocument::11"). - startIndexField("f3"). - startElement(-27).addStr("zz").endElement(). - endField(); - doc = b.endDocument(); + doc = b.make_document("id:ns:searchdocument::11"); + { + WeightedSetFieldValue string_wset(b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.word("zz").build(), -27); + doc->setValue("f3", string_wset); + } inv.invertDocument(11, *doc, {}); myPushDocument(inv); - b.startDocument("id:ns:searchdocument::12"). - startIndexField("f3"). - startElement(0).addStr("zz0").endElement(). 
- endField(); - doc = b.endDocument(); + doc = b.make_document("id:ns:searchdocument::12"); + { + WeightedSetFieldValue string_wset(b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.word("zz0").build(), 0); + doc->setValue("f3", string_wset); + } inv.invertDocument(12, *doc, {}); myPushDocument(inv); @@ -468,7 +486,7 @@ FusionTest::make_simple_index(const vespalib::string &dump_dir, const IFieldLeng FieldIndexCollection fic(_schema, field_length_inspector); uint32_t numDocs = 20; uint32_t numWords = 1000; - DocBuilder b(_schema); + EmptyDocBuilder b(make_add_fields()); auto invertThreads = SequencedTaskExecutor::create(invert_executor, 2); auto pushThreads = SequencedTaskExecutor::create(push_executor, 2); DocumentInverterContext inv_context(_schema, *invertThreads, *pushThreads, fic); diff --git a/searchlib/src/tests/index/docbuilder/.gitignore b/searchlib/src/tests/index/docbuilder/.gitignore deleted file mode 100644 index 999644fce87..00000000000 --- a/searchlib/src/tests/index/docbuilder/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*_test -.depend -Makefile -docbuilder_test -searchlib_docbuilder_test_app diff --git a/searchlib/src/tests/index/docbuilder/CMakeLists.txt b/searchlib/src/tests/index/docbuilder/CMakeLists.txt deleted file mode 100644 index 7a969f602ea..00000000000 --- a/searchlib/src/tests/index/docbuilder/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-vespa_add_executable(searchlib_docbuilder_test_app TEST - SOURCES - docbuilder_test.cpp - DEPENDS - searchlib -) -vespa_add_test(NAME searchlib_docbuilder_test_app COMMAND searchlib_docbuilder_test_app) diff --git a/searchlib/src/tests/index/docbuilder/docbuilder_test.cpp b/searchlib/src/tests/index/docbuilder/docbuilder_test.cpp deleted file mode 100644 index f76b61dcb78..00000000000 --- a/searchlib/src/tests/index/docbuilder/docbuilder_test.cpp +++ /dev/null @@ -1,437 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include <vespa/log/log.h> -LOG_SETUP("docbuilder_test"); -#include <boost/algorithm/string/classification.hpp> -#include <boost/algorithm/string/split.hpp> -#include <vespa/searchlib/index/docbuilder.h> -#include <vespa/vespalib/encoding/base64.h> -#include <vespa/vespalib/testkit/testapp.h> -#include <vespa/document/repo/fixedtyperepo.h> -#include <iostream> - -using namespace document; -using search::index::schema::CollectionType; - -namespace search::index { - -namespace -{ -std::string empty; -} - -namespace linguistics -{ -const vespalib::string SPANTREE_NAME("linguistics"); -} - - -TEST("test docBuilder") -{ - Schema s; - s.addIndexField(Schema::IndexField("ia", schema::DataType::STRING)); - s.addIndexField(Schema::IndexField("ib", schema::DataType::STRING, CollectionType::ARRAY)); - s.addIndexField(Schema::IndexField("ic", schema::DataType::STRING, CollectionType::WEIGHTEDSET)); - s.addUriIndexFields(Schema::IndexField("iu", schema::DataType::STRING)); - s.addUriIndexFields(Schema::IndexField("iau", schema::DataType::STRING, CollectionType::ARRAY)); - s.addUriIndexFields(Schema::IndexField("iwu", schema::DataType::STRING, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("aa", schema::DataType::INT32)); - s.addAttributeField(Schema::AttributeField("ab", schema::DataType::FLOAT)); - s.addAttributeField(Schema::AttributeField("ac", 
schema::DataType::STRING)); - s.addAttributeField(Schema::AttributeField("ad", schema::DataType::INT32, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("ae", schema::DataType::FLOAT, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("af", schema::DataType::STRING, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("ag", schema::DataType::INT32, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("ah", schema::DataType::FLOAT, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("ai", schema::DataType::STRING, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("asp1", schema::DataType::INT32)); - s.addAttributeField(Schema::AttributeField("asp2", schema::DataType::INT64)); - s.addAttributeField(Schema::AttributeField("aap1", schema::DataType::INT32, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("aap2", schema::DataType::INT64, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("awp1", schema::DataType::INT32, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("awp2", schema::DataType::INT64, CollectionType::WEIGHTEDSET)); - - DocBuilder b(s); - Document::UP doc; - std::vector<std::string> lines; - std::vector<std::string>::const_iterator itr; - std::string xml; - - { // empty - doc = b.startDocument("id:ns:searchdocument::0").endDocument(); - xml = doc->toXml(""); - boost::split(lines, xml, boost::is_any_of("\n")); - itr = lines.begin(); - EXPECT_EQUAL("<document documenttype=\"searchdocument\" documentid=\"id:ns:searchdocument::0\"/>", *itr++); - EXPECT_EQUAL("", *itr++); - EXPECT_TRUE(itr == lines.end()); - } - { // all fields set - std::vector<char> binaryBlob; - binaryBlob.push_back('\0'); - binaryBlob.push_back('\2'); - binaryBlob.push_back('\1'); - std::string raw1s("Single Raw Element"); - std::string raw1a0("Array Raw Element 0"); - std::string raw1a1("Array 
Raw Element 1"); - std::string raw1w0("Weighted Set Raw Element 0"); - std::string raw1w1("Weighted Set Raw Element 1"); - raw1s += std::string(&binaryBlob[0], - &binaryBlob[0] + binaryBlob.size()); - raw1a0 += std::string(&binaryBlob[0], - &binaryBlob[0] + binaryBlob.size()); - raw1a1 += std::string(&binaryBlob[0], - &binaryBlob[0] + binaryBlob.size()); - raw1w0 += std::string(&binaryBlob[0], - &binaryBlob[0] + binaryBlob.size()); - raw1w1 += std::string(&binaryBlob[0], - &binaryBlob[0] + binaryBlob.size()); - b.startDocument("id:ns:searchdocument::1"); - b.startIndexField("ia").addStr("foo").addStr("bar").addStr("baz").addTermAnnotation("altbaz").endField(); - b.startIndexField("ib").startElement().addStr("foo").endElement(). - startElement(1).addStr("bar").addStr("baz").endElement().endField(); - b. startIndexField("ic"). - startElement(20).addStr("bar").addStr("baz").endElement(). - startElement().addStr("foo").endElement(). - endField(); - b.startIndexField("iu"). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:81/fluke?ab=2#4"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("81"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("4"). - endSubField(). - endField(); - b.startIndexField("iau"). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:82/fluke?ab=2#8"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). 
- addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("8"). - endSubField(). - endElement(). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:82/fluke?ab=2#9"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("9"). - endSubField(). - endElement(). - endField(); - b.startIndexField("iwu"). - startElement(4). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:83/fluke?ab=2#12"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("83"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("12"). - endSubField(). - endElement(). - startElement(7). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:85/fluke?ab=2#13"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("85"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). 
- startSubField("fragment"). - addUrlTokenizedString("13"). - endSubField(). - endElement(). - endField(); - b.startAttributeField("aa").addInt(2147483647).endField(); - b.startAttributeField("ab").addFloat(1234.56).endField(); - b.startAttributeField("ac").addStr("foo baz").endField(); - b.startAttributeField("ad").startElement().addInt(10).endElement().endField(); - b.startAttributeField("ae").startElement().addFloat(10.5).endElement().endField(); - b.startAttributeField("af").startElement().addStr("foo").endElement().endField(); - b.startAttributeField("ag").startElement(2).addInt(20).endElement().endField(); - b.startAttributeField("ah").startElement(3).addFloat(20.5).endElement().endField(); - b.startAttributeField("ai").startElement(4).addStr("bar").endElement().endField(); - b.startAttributeField("asp1").addInt(1001).endField(); - b.startAttributeField("asp2").addPosition(1002, 1003).endField(); - b.startAttributeField("aap1"). - startElement().addInt(1004).endElement(). - startElement().addInt(1005).endElement(). - endField(); - b.startAttributeField("aap2"). - startElement().addPosition(1006, 1007).endElement(). - startElement().addPosition(1008, 1009).endElement(). - endField(); - b.startAttributeField("awp1"). - startElement(41).addInt(1010).endElement(). - startElement(42).addInt(1011).endElement(). - endField(); - b.startAttributeField("awp2"). - startElement(43).addPosition(1012, 1013).endElement(). - startElement(44).addPosition(1014, 1015).endElement(). 
- endField(); - doc = b.endDocument(); - xml = doc->toXml(""); - boost::split(lines, xml, boost::is_any_of("\n")); - itr = lines.begin(); - EXPECT_EQUAL("<document documenttype=\"searchdocument\" documentid=\"id:ns:searchdocument::1\">", *itr++); - EXPECT_EQUAL("<iu>", *itr++); - EXPECT_EQUAL("<all>http://www.example.com:81/fluke?ab=2#4</all>", *itr++); - EXPECT_EQUAL("<host>www.example.com</host>", *itr++); - EXPECT_EQUAL("<scheme>http</scheme>", *itr++); - EXPECT_EQUAL("<path>/fluke</path>", *itr++); - EXPECT_EQUAL("<port>81</port>", *itr++); - EXPECT_EQUAL("<query>ab=2</query>", *itr++); - EXPECT_EQUAL("<fragment>4</fragment>", *itr++); - EXPECT_EQUAL("</iu>", *itr++); - EXPECT_EQUAL("<aa>2147483647</aa>", *itr++); - EXPECT_EQUAL("<aap2>", *itr++); - EXPECT_EQUAL("<item>1047806</item>", *itr++); - EXPECT_EQUAL("<item>1048322</item>", *itr++); - EXPECT_EQUAL("</aap2>", *itr++); - EXPECT_EQUAL("<ia>foo bar baz</ia>", *itr++); - EXPECT_EQUAL("<ae>", *itr++); - EXPECT_EQUAL("<item>10.5</item>", *itr++); - EXPECT_EQUAL("</ae>", *itr++); - EXPECT_EQUAL("<ib>", *itr++); - EXPECT_EQUAL("<item>foo</item>", *itr++); - EXPECT_EQUAL("<item>bar baz</item>", *itr++); - EXPECT_EQUAL("</ib>", *itr++); - EXPECT_EQUAL("<ah>", *itr++); - EXPECT_EQUAL("<item weight=\"3\">20.5</item>", *itr++); - EXPECT_EQUAL("</ah>", *itr++); - EXPECT_EQUAL("<ic>", *itr++); - EXPECT_EQUAL("<item weight=\"20\">bar baz</item>", *itr++); - EXPECT_EQUAL("<item weight=\"1\">foo</item>", *itr++); - EXPECT_EQUAL("</ic>", *itr++); - EXPECT_EQUAL("<ac>foo baz</ac>", *itr++); - EXPECT_EQUAL("<awp2>", *itr++); - EXPECT_EQUAL("<item weight=\"43\">1048370</item>", *itr++); - EXPECT_EQUAL("<item weight=\"44\">1048382</item>", *itr++); - EXPECT_EQUAL("</awp2>", *itr++); - EXPECT_EQUAL("<iau>", *itr++); - EXPECT_EQUAL("<item>", *itr++); - EXPECT_EQUAL("<all>http://www.example.com:82/fluke?ab=2#8</all>", *itr++); - EXPECT_EQUAL("<host>www.example.com</host>", *itr++); - EXPECT_EQUAL("<scheme>http</scheme>", 
*itr++); - EXPECT_EQUAL("<path>/fluke</path>", *itr++); - EXPECT_EQUAL("<port>82</port>", *itr++); - EXPECT_EQUAL("<query>ab=2</query>", *itr++); - EXPECT_EQUAL("<fragment>8</fragment>", *itr++); - EXPECT_EQUAL("</item>", *itr++); - EXPECT_EQUAL("<item>", *itr++); - EXPECT_EQUAL("<all>http://www.flickr.com:82/fluke?ab=2#9</all>", *itr++); - EXPECT_EQUAL("<host>www.flickr.com</host>", *itr++); - EXPECT_EQUAL("<scheme>http</scheme>", *itr++); - EXPECT_EQUAL("<path>/fluke</path>", *itr++); - EXPECT_EQUAL("<port>82</port>", *itr++); - EXPECT_EQUAL("<query>ab=2</query>", *itr++); - EXPECT_EQUAL("<fragment>9</fragment>", *itr++); - EXPECT_EQUAL("</item>", *itr++); - EXPECT_EQUAL("</iau>", *itr++); - EXPECT_EQUAL("<asp2>1047758</asp2>", *itr++); - EXPECT_EQUAL("<ai>", *itr++); - EXPECT_EQUAL("<item weight=\"4\">bar</item>", *itr++); - EXPECT_EQUAL("</ai>", *itr++); - EXPECT_EQUAL("<asp1>1001</asp1>", *itr++); - EXPECT_EQUAL("<ad>", *itr++); - EXPECT_EQUAL("<item>10</item>", *itr++); - EXPECT_EQUAL("</ad>", *itr++); - EXPECT_EQUAL("<iwu>", *itr++); - EXPECT_EQUAL("<item weight=\"4\">", *itr++); - EXPECT_EQUAL("<all>http://www.example.com:83/fluke?ab=2#12</all>", *itr++); - EXPECT_EQUAL("<host>www.example.com</host>", *itr++); - EXPECT_EQUAL("<scheme>http</scheme>", *itr++); - EXPECT_EQUAL("<path>/fluke</path>", *itr++); - EXPECT_EQUAL("<port>83</port>", *itr++); - EXPECT_EQUAL("<query>ab=2</query>", *itr++); - EXPECT_EQUAL("<fragment>12</fragment>", *itr++); - EXPECT_EQUAL("</item>", *itr++); - EXPECT_EQUAL("<item weight=\"7\">", *itr++); - EXPECT_EQUAL("<all>http://www.flickr.com:85/fluke?ab=2#13</all>", *itr++); - EXPECT_EQUAL("<host>www.flickr.com</host>", *itr++); - EXPECT_EQUAL("<scheme>http</scheme>", *itr++); - EXPECT_EQUAL("<path>/fluke</path>", *itr++); - EXPECT_EQUAL("<port>85</port>", *itr++); - EXPECT_EQUAL("<query>ab=2</query>", *itr++); - EXPECT_EQUAL("<fragment>13</fragment>", *itr++); - EXPECT_EQUAL("</item>", *itr++); - EXPECT_EQUAL("</iwu>", *itr++); - 
EXPECT_EQUAL("<ab>1234.56</ab>", *itr++); - EXPECT_EQUAL("<ag>", *itr++); - EXPECT_EQUAL("<item weight=\"2\">20</item>", *itr++); - EXPECT_EQUAL("</ag>", *itr++); - EXPECT_EQUAL("<awp1>", *itr++); - EXPECT_EQUAL("<item weight=\"41\">1010</item>", *itr++); - EXPECT_EQUAL("<item weight=\"42\">1011</item>", *itr++); - EXPECT_EQUAL("</awp1>", *itr++); - EXPECT_EQUAL("<aap1>", *itr++); - EXPECT_EQUAL("<item>1004</item>", *itr++); - EXPECT_EQUAL("<item>1005</item>", *itr++); - EXPECT_EQUAL("</aap1>", *itr++); - EXPECT_EQUAL("<af>", *itr++); - EXPECT_EQUAL("<item>foo</item>", *itr++); - EXPECT_EQUAL("</af>", *itr++); - EXPECT_EQUAL("</document>", *itr++); - EXPECT_TRUE(itr == lines.end()); -#if 0 - std::cout << "onedoc xml start -----" << std::endl << - xml << std::endl << - "-------" << std::endl; - std::cout << "onedoc toString start ----" << std::endl << - doc->toString(true) << std::endl << - "-------" << std::endl; -#endif - } - { // create one more to see that everything is cleared - b.startDocument("id:ns:searchdocument::2"); - b.startIndexField("ia").addStr("yes").endField(); - b.startAttributeField("aa").addInt(20).endField(); - doc = b.endDocument(); - xml = doc->toXml(""); - boost::split(lines, xml, boost::is_any_of("\n")); - itr = lines.begin(); - EXPECT_EQUAL("<document documenttype=\"searchdocument\" documentid=\"id:ns:searchdocument::2\">", *itr++); - EXPECT_EQUAL("<aa>20</aa>", *itr++); - EXPECT_EQUAL("<ia>yes</ia>", *itr++); - EXPECT_EQUAL("</document>", *itr++); - EXPECT_TRUE(itr == lines.end()); - } - { // create field with cjk chars - b.startDocument("id:ns:searchdocument::3"); - b.startIndexField("ia"). - addStr("我就是那个"). - setAutoSpace(false). - addStr("大灰狼"). - setAutoSpace(true). 
- endField(); - doc = b.endDocument(); - xml = doc->toXml(""); - boost::split(lines, xml, boost::is_any_of("\n")); - itr = lines.begin(); - EXPECT_EQUAL("<document documenttype=\"searchdocument\" documentid=\"id:ns:searchdocument::3\">", *itr++); - EXPECT_EQUAL("<ia>我就是那个大灰狼</ia>", *itr++); - EXPECT_EQUAL("</document>", *itr++); - EXPECT_TRUE(itr == lines.end()); - const FieldValue::UP iaval = doc->getValue("ia"); - ASSERT_TRUE(iaval.get() != NULL); - const StringFieldValue *iasval = dynamic_cast<const StringFieldValue *> - (iaval.get()); - ASSERT_TRUE(iasval != NULL); - StringFieldValue::SpanTrees trees = iasval->getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME); - ASSERT_TRUE(tree != NULL); - std::vector<Span> spans; - std::vector<Span> expSpans; - for (SpanTree::const_iterator i = tree->begin(), ie = tree->end(); - i != ie; ++i) { - Annotation &ann = const_cast<Annotation &>(*i); - const Span *span = dynamic_cast<const Span *>(ann.getSpanNode()); - if (span == NULL) - continue; - spans.push_back(*span); - } - expSpans.push_back(Span(0, 15)); - expSpans.push_back(Span(0, 15)); - expSpans.push_back(Span(15, 9)); - expSpans.push_back(Span(15, 9)); - ASSERT_TRUE(expSpans == spans); -#if 0 - std::cout << "onedoc xml start -----" << std::endl << - xml << std::endl << - "-------" << std::endl; - std::cout << "onedoc toString start ----" << std::endl << - doc->toString(true) << std::endl << - "-------" << std::endl; -#endif - } -} - -TEST("test if index names are valid uri parts") { - EXPECT_FALSE(UriField::mightBePartofUri("all")); - EXPECT_FALSE(UriField::mightBePartofUri("fragment")); - EXPECT_FALSE(UriField::mightBePartofUri(".all")); - EXPECT_FALSE(UriField::mightBePartofUri("all.b")); - EXPECT_TRUE(UriField::mightBePartofUri("b.all")); - EXPECT_TRUE(UriField::mightBePartofUri("b.scheme")); - EXPECT_TRUE(UriField::mightBePartofUri("b.host")); - EXPECT_TRUE(UriField::mightBePartofUri("b.port")); - 
EXPECT_TRUE(UriField::mightBePartofUri("b.hostname")); - EXPECT_TRUE(UriField::mightBePartofUri("b.path")); - EXPECT_TRUE(UriField::mightBePartofUri("b.query")); - EXPECT_TRUE(UriField::mightBePartofUri("b.fragment")); -} - -} - -TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/index/doctypebuilder/.gitignore b/searchlib/src/tests/index/doctypebuilder/.gitignore deleted file mode 100644 index f15be1efcfe..00000000000 --- a/searchlib/src/tests/index/doctypebuilder/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*_test -.depend -Makefile -doctypebuilder_test -searchlib_doctypebuilder_test_app diff --git a/searchlib/src/tests/index/doctypebuilder/CMakeLists.txt b/searchlib/src/tests/index/doctypebuilder/CMakeLists.txt deleted file mode 100644 index 348ecde5a7c..00000000000 --- a/searchlib/src/tests/index/doctypebuilder/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_executable(searchlib_doctypebuilder_test_app TEST - SOURCES - doctypebuilder_test.cpp - DEPENDS - searchlib -) -vespa_add_test(NAME searchlib_doctypebuilder_test_app COMMAND searchlib_doctypebuilder_test_app) diff --git a/searchlib/src/tests/index/doctypebuilder/doctypebuilder_test.cpp b/searchlib/src/tests/index/doctypebuilder/doctypebuilder_test.cpp deleted file mode 100644 index 95854fa11b2..00000000000 --- a/searchlib/src/tests/index/doctypebuilder/doctypebuilder_test.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-#include <vespa/document/repo/documenttyperepo.h> -#include <vespa/searchlib/index/doctypebuilder.h> -#include <vespa/document/datatype/documenttype.h> -#include <vespa/vespalib/testkit/testapp.h> - -using namespace document; - -namespace search { -namespace index { - -using schema::CollectionType; -using schema::DataType; - -TEST("testSearchDocType") { - Schema s; - s.addIndexField(Schema::IndexField("ia", DataType::STRING)); - s.addIndexField(Schema::IndexField("ib", DataType::STRING, CollectionType::ARRAY)); - s.addIndexField(Schema::IndexField("ic", DataType::STRING, CollectionType::WEIGHTEDSET)); - s.addUriIndexFields(Schema::IndexField("iu", DataType::STRING)); - s.addUriIndexFields(Schema::IndexField("iau", DataType::STRING, CollectionType::ARRAY)); - s.addUriIndexFields(Schema::IndexField("iwu", DataType::STRING, CollectionType::WEIGHTEDSET)); - s.addAttributeField(Schema::AttributeField("aa", DataType::INT32)); - s.addAttributeField(Schema::AttributeField("spos", DataType::INT64)); - s.addAttributeField(Schema::AttributeField("apos", DataType::INT64, CollectionType::ARRAY)); - s.addAttributeField(Schema::AttributeField("wpos", DataType::INT64, CollectionType::WEIGHTEDSET)); - - DocTypeBuilder docTypeBuilder(s); - document::config::DocumenttypesConfig config = docTypeBuilder.makeConfig(); - DocumentTypeRepo repo(config); - const DocumentType *docType = repo.getDocumentType("searchdocument"); - ASSERT_TRUE(docType); - EXPECT_EQUAL(10u, docType->getFieldCount()); - - EXPECT_EQUAL("String", docType->getField("ia").getDataType().getName()); - EXPECT_EQUAL("Array<String>", - docType->getField("ib").getDataType().getName()); - EXPECT_EQUAL("WeightedSet<String>", - docType->getField("ic").getDataType().getName()); - EXPECT_EQUAL("url", docType->getField("iu").getDataType().getName()); - EXPECT_EQUAL("Array<url>", - docType->getField("iau").getDataType().getName()); - EXPECT_EQUAL("WeightedSet<url>", - docType->getField("iwu").getDataType().getName()); - - 
EXPECT_EQUAL("Int", docType->getField("aa").getDataType().getName()); - EXPECT_EQUAL("Long", docType->getField("spos").getDataType().getName()); - EXPECT_EQUAL("Array<Long>", - docType->getField("apos").getDataType().getName()); - EXPECT_EQUAL("WeightedSet<Long>", - docType->getField("wpos").getDataType().getName()); -} - -TEST("require that multiple fields can have the same type") { - Schema s; - s.addIndexField(Schema::IndexField("array1", DataType::STRING, CollectionType::ARRAY)); - s.addIndexField(Schema::IndexField("array2", DataType::STRING, CollectionType::ARRAY)); - DocTypeBuilder docTypeBuilder(s); - document::config::DocumenttypesConfig config = docTypeBuilder.makeConfig(); - DocumentTypeRepo repo(config); - const DocumentType *docType = repo.getDocumentType("searchdocument"); - ASSERT_TRUE(docType); - EXPECT_EQUAL(2u, docType->getFieldCount()); - - EXPECT_EQUAL("Array<String>", - docType->getField("array1").getDataType().getName()); - EXPECT_EQUAL("Array<String>", - docType->getField("array2").getDataType().getName()); -} - -} // namespace index -} // namespace search - -TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/index/string_field_builder/CMakeLists.txt b/searchlib/src/tests/index/string_field_builder/CMakeLists.txt new file mode 100644 index 00000000000..f8774eae5ca --- /dev/null +++ b/searchlib/src/tests/index/string_field_builder/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(searchlib_string_field_builder_test_app TEST + SOURCES + string_field_builder_test.cpp + DEPENDS + searchlib + GTest::GTest +) +vespa_add_test(NAME searchlib_string_field_builder_test_app COMMAND searchlib_string_field_builder_test_app) diff --git a/searchlib/src/tests/index/string_field_builder/string_field_builder_test.cpp b/searchlib/src/tests/index/string_field_builder/string_field_builder_test.cpp new file mode 100644 index 00000000000..8c2b641f724 --- /dev/null +++ b/searchlib/src/tests/index/string_field_builder/string_field_builder_test.cpp @@ -0,0 +1,141 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/index/string_field_builder.h> +#include <vespa/document/annotation/annotation.h> +#include <vespa/document/annotation/span.h> +#include <vespa/document/annotation/spanlist.h> +#include <vespa/document/annotation/spantree.h> +#include <vespa/document/datatype/annotationtype.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/searchlib/index/empty_doc_builder.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <cassert> +#include <iostream> + +using document::Annotation; +using document::AnnotationType; +using document::Span; +using document::SpanNode; +using document::SpanTree; +using document::StringFieldValue; +using search::index::EmptyDocBuilder; +using search::index::StringFieldBuilder; + +namespace +{ + +const vespalib::string SPANTREE_NAME("linguistics"); + +struct MyAnnotation { + int32_t start; + int32_t length; + std::optional<vespalib::string> label; + + MyAnnotation(int32_t start_in, int32_t length_in) noexcept + : start(start_in), + length(length_in), + label() + { + } + + MyAnnotation(int32_t start_in, int32_t length_in, vespalib::string label_in) noexcept + : start(start_in), + length(length_in), + label(label_in) + { + } + + bool operator==(const MyAnnotation& rhs) const noexcept; +}; + +bool 
+MyAnnotation::operator==(const MyAnnotation& rhs) const noexcept +{ + return start == rhs.start && + length == rhs.length && + label == rhs.label; +} + + +std::ostream& operator<<(std::ostream& os, const MyAnnotation& ann) { + os << "[" << ann.start << "," << ann.length << "]"; + if (ann.label.has_value()) { + os << "(\"" << ann.label.value() << "\")"; + } + return os; +} + +} + +class StringFieldBuilderTest : public testing::Test +{ +protected: + EmptyDocBuilder edb; + StringFieldBuilder sfb; + StringFieldBuilderTest(); + ~StringFieldBuilderTest(); + std::vector<MyAnnotation> get_annotations(const StringFieldValue& val); + void assert_annotations(std::vector<MyAnnotation> exp, const vespalib::string& plain, const StringFieldValue& val); +}; + +StringFieldBuilderTest::StringFieldBuilderTest() + : testing::Test(), + edb(), + sfb(edb) +{ +} + +StringFieldBuilderTest::~StringFieldBuilderTest() = default; + +std::vector<MyAnnotation> +StringFieldBuilderTest::get_annotations(const StringFieldValue& val) +{ + std::vector<MyAnnotation> result; + StringFieldValue::SpanTrees trees = val.getSpanTrees(); + const auto* tree = StringFieldValue::findTree(trees, SPANTREE_NAME); + if (tree != nullptr) { + for (auto& ann : *tree) { + assert(ann.getType() == *AnnotationType::TERM); + auto span = dynamic_cast<const Span *>(ann.getSpanNode()); + if (span == nullptr) { + continue; + } + auto ann_fv = ann.getFieldValue(); + if (ann_fv == nullptr) { + result.emplace_back(span->from(), span->length()); + } else { + result.emplace_back(span->from(), span->length(), dynamic_cast<const StringFieldValue &>(*ann_fv).getValue()); + } + } + } + return result; +} + +void +StringFieldBuilderTest::assert_annotations(std::vector<MyAnnotation> exp, const vespalib::string& plain, const StringFieldValue& val) +{ + EXPECT_EQ(exp, get_annotations(val)); + EXPECT_EQ(plain, val.getValue()); +} + +TEST_F(StringFieldBuilderTest, no_annotations) +{ + assert_annotations({}, "foo", StringFieldValue("foo")); +} 
+ +TEST_F(StringFieldBuilderTest, single_word) +{ + assert_annotations({{0, 4}}, "word", sfb.word("word").build()); +} + +TEST_F(StringFieldBuilderTest, tokenize) +{ + assert_annotations({{0, 4}, {5, 2}, {8, 1}, {10, 4}}, "this is a test", sfb.tokenize("this is a test").build()); +} + +TEST_F(StringFieldBuilderTest, alt_word) +{ + assert_annotations({{0, 3}, {4, 3}, {4, 3, "baz"}}, "foo bar", sfb.word("foo").space().word("bar").alt_word("baz").build()); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/memoryindex/document_inverter/document_inverter_test.cpp b/searchlib/src/tests/memoryindex/document_inverter/document_inverter_test.cpp index 3f8a04d9460..83746b611fb 100644 --- a/searchlib/src/tests/memoryindex/document_inverter/document_inverter_test.cpp +++ b/searchlib/src/tests/memoryindex/document_inverter/document_inverter_test.cpp @@ -1,8 +1,13 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/searchlib/index/docbuilder.h> -#include <vespa/searchlib/index/field_length_calculator.h> #include <vespa/searchlib/memoryindex/document_inverter.h> +#include <vespa/document/datatype/datatype.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> +#include <vespa/searchlib/index/field_length_calculator.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/document_inverter_context.h> #include <vespa/searchlib/memoryindex/field_index_remover.h> #include <vespa/searchlib/memoryindex/field_inverter.h> @@ -19,9 +24,10 @@ namespace search::memoryindex { using document::Document; -using index::DocBuilder; +using index::EmptyDocBuilder; using index::FieldLengthCalculator; using index::Schema; +using index::StringFieldBuilder; using index::schema::CollectionType; using 
index::schema::DataType; using vespalib::SequencedTaskExecutor; @@ -29,64 +35,68 @@ using vespalib::ISequencedTaskExecutor; namespace { +EmptyDocBuilder::AddFieldsType +make_add_fields() +{ + return [](auto& header) { using namespace document::config_builder; + using DataType = document::DataType; + header.addField("f0", DataType::T_STRING) + .addField("f1", DataType::T_STRING) + .addField("f2", Array(DataType::T_STRING)) + .addField("f3", Wset(DataType::T_STRING)); + }; +} + Document::UP -makeDoc10(DocBuilder &b) +makeDoc10(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("f0"). - addStr("a").addStr("b").addStr("c").addStr("d"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::10"); + doc->setValue("f0", sfb.tokenize("a b c d").build()); + return doc; } Document::UP -makeDoc11(DocBuilder &b) +makeDoc11(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::11"); - b.startIndexField("f0"). - addStr("a").addStr("b").addStr("e").addStr("f"). - endField(); - b.startIndexField("f1"). - addStr("a").addStr("g"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::11"); + doc->setValue("f0", sfb.tokenize("a b e f").build()); + doc->setValue("f1", sfb.tokenize("a g").build()); + return doc; } Document::UP -makeDoc12(DocBuilder &b) +makeDoc12(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::12"); - b.startIndexField("f0"). - addStr("h").addStr("doc12"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::12"); + doc->setValue("f0", sfb.tokenize("h doc12").build()); + return doc; } Document::UP -makeDoc13(DocBuilder &b) +makeDoc13(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::13"); - b.startIndexField("f0"). - addStr("i").addStr("doc13"). 
- endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::13"); + doc->setValue("f0", sfb.tokenize("i doc13").build()); + return doc; } Document::UP -makeDoc14(DocBuilder &b) +makeDoc14(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::14"); - b.startIndexField("f0"). - addStr("j").addStr("doc14"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::14"); + doc->setValue("f0", sfb.tokenize("j doc14").build()); + return doc; } Document::UP -makeDoc15(DocBuilder &b) +makeDoc15(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::15"); - return b.endDocument(); + return b.make_document("id:ns:searchdocument::15"); } } @@ -96,7 +106,7 @@ VESPA_THREAD_STACK_TAG(push_executor) struct DocumentInverterTest : public ::testing::Test { Schema _schema; - DocBuilder _b; + EmptyDocBuilder _b; std::unique_ptr<ISequencedTaskExecutor> _invertThreads; std::unique_ptr<ISequencedTaskExecutor> _pushThreads; WordStore _word_store; @@ -118,7 +128,7 @@ struct DocumentInverterTest : public ::testing::Test { DocumentInverterTest() : _schema(makeSchema()), - _b(_schema), + _b(make_add_fields()), _invertThreads(SequencedTaskExecutor::create(invert_executor, 1)), _pushThreads(SequencedTaskExecutor::create(push_executor, 1)), _word_store(), diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index dcca1f136f6..04d1f08db6f 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -1,13 +1,22 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/document/datatype/datatype.h> +#include <vespa/document/datatype/urldatatype.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/structfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/searchlib/diskindex/fusion.h> #include <vespa/searchlib/diskindex/indexbuilder.h> #include <vespa/searchlib/diskindex/zcposoccrandread.h> #include <vespa/searchlib/fef/fieldpositionsiterator.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/docidandfeatures.h> #include <vespa/searchlib/index/dummyfileheadercontext.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/document_inverter.h> #include <vespa/searchlib/memoryindex/document_inverter_context.h> #include <vespa/searchlib/memoryindex/field_index_collection.h> @@ -37,7 +46,11 @@ namespace search { using namespace fef; using namespace index; +using document::ArrayFieldValue; using document::Document; +using document::StructFieldValue; +using document::UrlDataType; +using document::WeightedSetFieldValue; using queryeval::RankedSearchIteratorBase; using queryeval::SearchIterator; using search::index::schema::CollectionType; @@ -505,6 +518,12 @@ make_single_field_schema() return result; } +EmptyDocBuilder::AddFieldsType +make_single_add_fields() +{ + return [](auto& header) { header.addField("f0", document::DataType::T_STRING); }; +} + template <typename FieldIndexType> struct FieldIndexTest : public ::testing::Test { Schema schema; @@ -706,6 +725,18 @@ make_multi_field_schema() return result; } +EmptyDocBuilder::AddFieldsType +make_multi_field_add_fields() +{ + return [](auto& header) { using 
namespace document::config_builder; + using DataType = document::DataType; + header.addField("f0", DataType::T_STRING) + .addField("f1", DataType::T_STRING) + .addField("f2", Array(DataType::T_STRING)) + .addField("f3", Wset(DataType::T_STRING)); + }; +} + struct FieldIndexCollectionTest : public ::testing::Test { Schema schema; FieldIndexCollection fic; @@ -907,16 +938,16 @@ class InverterTest : public ::testing::Test { public: Schema _schema; FieldIndexCollection _fic; - DocBuilder _b; + EmptyDocBuilder _b; std::unique_ptr<ISequencedTaskExecutor> _invertThreads; std::unique_ptr<ISequencedTaskExecutor> _pushThreads; DocumentInverterContext _inv_context; DocumentInverter _inv; - InverterTest(const Schema& schema) + InverterTest(const Schema& schema, EmptyDocBuilder::AddFieldsType add_fields) : _schema(schema), _fic(_schema, MockFieldLengthInspector()), - _b(_schema), + _b(add_fields), _invertThreads(SequencedTaskExecutor::create(invert_executor, 2)), _pushThreads(SequencedTaskExecutor::create(push_executor, 2)), _inv_context(_schema, *_invertThreads, *_pushThreads, _fic), @@ -938,91 +969,63 @@ public: class BasicInverterTest : public InverterTest { public: - BasicInverterTest() : InverterTest(make_multi_field_schema()) {} + BasicInverterTest() : InverterTest(make_multi_field_schema(), make_multi_field_add_fields()) {} }; TEST_F(BasicInverterTest, require_that_inversion_is_working) { Document::UP doc; + StringFieldBuilder sfb(_b); - _b.startDocument("id:ns:searchdocument::10"); - _b.startIndexField("f0"). - addStr("a").addStr("b").addStr("c").addStr("d"). - endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::10"); + doc->setValue("f0", sfb.tokenize("a b c d").build()); _inv.invertDocument(10, *doc, {}); myPushDocument(_inv); - _b.startDocument("id:ns:searchdocument::20"); - _b.startIndexField("f0"). - addStr("a").addStr("a").addStr("b").addStr("c").addStr("d"). 
- endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::20"); + doc->setValue("f0", sfb.tokenize("a a b c d").build()); _inv.invertDocument(20, *doc, {}); myPushDocument(_inv); - _b.startDocument("id:ns:searchdocument::30"); - _b.startIndexField("f0"). - addStr("a").addStr("b").addStr("c").addStr("d"). - addStr("e").addStr("f"). - endField(); - _b.startIndexField("f1"). - addStr("\nw2").addStr("w").addStr("x"). - addStr("\nw3").addStr("y").addStr("z"). - endField(); - _b.startIndexField("f2"). - startElement(4). - addStr("w").addStr("x"). - endElement(). - startElement(5). - addStr("y").addStr("z"). - endElement(). - endField(); - _b.startIndexField("f3"). - startElement(6). - addStr("w").addStr("x"). - endElement(). - startElement(7). - addStr("y").addStr("z"). - endElement(). - endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::30"); + doc->setValue("f0", sfb.tokenize("a b c d e f").build()); + doc->setValue("f1", sfb.word("\nw2").tokenize(" w x "). + word("\nw3").tokenize(" y z").build()); + { + ArrayFieldValue string_array(_b.get_data_type("Array<String>")); + string_array.add(sfb.tokenize("w x").build()); + string_array.add(sfb.tokenize("y z").build()); + doc->setValue("f2", string_array); + } + { + WeightedSetFieldValue string_wset(_b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.tokenize("w x").build(), 6); + string_wset.add(sfb.tokenize("y z").build(), 7); + doc->setValue("f3", string_wset); + } _inv.invertDocument(30, *doc, {}); myPushDocument(_inv); - _b.startDocument("id:ns:searchdocument::40"); - _b.startIndexField("f0"). - addStr("a").addStr("a").addStr("b").addStr("c").addStr("a"). - addStr("e").addStr("f"). 
- endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::40"); + doc->setValue("f0", sfb.tokenize("a a b c a e f").build()); _inv.invertDocument(40, *doc, {}); myPushDocument(_inv); - _b.startDocument("id:ns:searchdocument::999"); - _b.startIndexField("f0"). - addStr("this").addStr("is").addStr("_a_").addStr("test"). - addStr("for").addStr("insertion").addStr("speed").addStr("with"). - addStr("more").addStr("than").addStr("just").addStr("__a__"). - addStr("few").addStr("words").addStr("present").addStr("in"). - addStr("some").addStr("of").addStr("the").addStr("fields"). - endField(); - _b.startIndexField("f1"). - addStr("the").addStr("other").addStr("field").addStr("also"). - addStr("has").addStr("some").addStr("content"). - endField(); - _b.startIndexField("f2"). - startElement(1). - addStr("strange").addStr("things").addStr("here"). - addStr("has").addStr("some").addStr("content"). - endElement(). - endField(); - _b.startIndexField("f3"). - startElement(3). - addStr("not").addStr("a").addStr("weighty").addStr("argument"). - endElement(). - endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::999"); + doc->setValue("f0", sfb.tokenize("this is ").word("_a_"). + tokenize(" test for insertion speed with more than just "). 
+ word("__a__").tokenize(" few words present in some of the fields").build()); + doc->setValue("f1", sfb.tokenize("the other field also has some content").build()); + { + ArrayFieldValue string_array(_b.get_data_type("Array<String>")); + string_array.add(sfb.tokenize("strange things here has some content").build()); + doc->setValue("f2", string_array); + } + { + WeightedSetFieldValue string_wset(_b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.tokenize("not a weighty argument").build(), 3); + doc->setValue("f3", string_wset); + } for (uint32_t docId = 10000; docId < 20000; ++docId) { _inv.invertDocument(docId, *doc, {}); myPushDocument(_inv); @@ -1132,19 +1135,17 @@ TEST_F(BasicInverterTest, require_that_inversion_is_working) TEST_F(BasicInverterTest, require_that_inverter_handles_remove_via_document_remover) { - Document::UP doc; + StringFieldBuilder sfb(_b); - _b.startDocument("id:ns:searchdocument::1"); - _b.startIndexField("f0").addStr("a").addStr("b").endField(); - _b.startIndexField("f1").addStr("a").addStr("c").endField(); - Document::UP doc1 = _b.endDocument(); - _inv.invertDocument(1, *doc1.get(), {}); + auto doc1 = _b.make_document("id:ns:searchdocument::1"); + doc1->setValue("f0", sfb.tokenize("a b").build()); + doc1->setValue("f1", sfb.tokenize("a c").build()); + _inv.invertDocument(1, *doc1, {}); myPushDocument(_inv); - _b.startDocument("id:ns:searchdocument::2"); - _b.startIndexField("f0").addStr("b").addStr("c").endField(); - Document::UP doc2 = _b.endDocument(); - _inv.invertDocument(2, *doc2.get(), {}); + auto doc2 = _b.make_document("id:ns:searchdocument::2"); + doc2->setValue("f0", sfb.tokenize("b c").build()); + _inv.invertDocument(2, *doc2, {}); myPushDocument(_inv); EXPECT_TRUE(assertPostingList("[1]", find("a", 0))); @@ -1172,136 +1173,71 @@ make_uri_schema() return result; } +EmptyDocBuilder::AddFieldsType +make_uri_add_fields() +{ + return [](auto& header) { using namespace document::config_builder; + header.addField("iu", 
UrlDataType::getInstance().getId()) + .addField("iau", Array(UrlDataType::getInstance().getId())) + .addField("iwu", Wset(UrlDataType::getInstance().getId())); + }; +} + class UriInverterTest : public InverterTest { public: - UriInverterTest() : InverterTest(make_uri_schema()) {} + UriInverterTest() : InverterTest(make_uri_schema(), make_uri_add_fields()) {} }; TEST_F(UriInverterTest, require_that_uri_indexing_is_working) { Document::UP doc; - - _b.startDocument("id:ns:searchdocument::10"); - _b.startIndexField("iu"). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:81/fluke?ab=2#4"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("81"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("4"). - endSubField(). - endField(); - _b.startIndexField("iau"). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:82/fluke?ab=2#8"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("8"). - endSubField(). - endElement(). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:82/fluke?ab=2#9"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). 
- addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("9"). - endSubField(). - endElement(). - endField(); - _b.startIndexField("iwu"). - startElement(4). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:83/fluke?ab=2#12"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("83"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("12"). - endSubField(). - endElement(). - startElement(7). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:85/fluke?ab=2#13"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("85"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("13"). - endSubField(). - endElement(). 
- endField(); - doc = _b.endDocument(); + StringFieldBuilder sfb(_b); + sfb.url_mode(true); + StructFieldValue url_value(_b.get_data_type("url")); + + doc = _b.make_document("id:ns:searchdocument::10"); + url_value.setValue("all", sfb.tokenize("http://www.example.com:81/fluke?ab=2#4").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", sfb.tokenize("81").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("4").build()); + doc->setValue("iu", url_value); + ArrayFieldValue url_array(_b.get_data_type("Array<url>")); + url_value.setValue("all", sfb.tokenize("http://www.example.com:82/fluke?ab=2#8").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", sfb.tokenize("82").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("8").build()); + url_array.add(url_value); + url_value.setValue("all", sfb.tokenize("http://www.flickr.com:82/fluke?ab=2#9").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.flickr.com").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("fragment", sfb.tokenize("9").build()); + url_array.add(url_value); + doc->setValue("iau", url_array); + WeightedSetFieldValue url_wset(_b.get_data_type("WeightedSet<url>")); + url_value.setValue("all", sfb.tokenize("http://www.example.com:83/fluke?ab=2#12").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", 
sfb.tokenize("83").build()); + url_value.setValue("path", sfb.tokenize("/fluke").alt_word("altfluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("12").build()); + url_wset.add(url_value, 4); + url_value.setValue("all", sfb.tokenize("http://www.flickr.com:85/fluke?ab=2#13").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.flickr.com").build()); + url_value.setValue("port", sfb.tokenize("85").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("13").build()); + url_wset.add(url_value, 7); + doc->setValue("iwu", url_wset); _inv.invertDocument(10, *doc, {}); myPushDocument(_inv); @@ -1360,21 +1296,16 @@ TEST_F(UriInverterTest, require_that_uri_indexing_is_working) class CjkInverterTest : public InverterTest { public: - CjkInverterTest() : InverterTest(make_single_field_schema()) {} + CjkInverterTest() : InverterTest(make_single_field_schema(), make_single_add_fields()) {} }; TEST_F(CjkInverterTest, require_that_cjk_indexing_is_working) { Document::UP doc; + StringFieldBuilder sfb(_b); - _b.startDocument("id:ns:searchdocument::10"); - _b.startIndexField("f0"). - addStr("我就是那个"). - setAutoSpace(false). - addStr("大灰狼"). - setAutoSpace(true). 
- endField(); - doc = _b.endDocument(); + doc = _b.make_document("id:ns:searchdocument::10"); + doc->setValue("f0", sfb.word("我就是那个").word("大灰狼").build()); _inv.invertDocument(10, *doc, {}); myPushDocument(_inv); diff --git a/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp b/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp index ed049a82c42..bf3a911a579 100644 --- a/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp +++ b/searchlib/src/tests/memoryindex/field_inverter/field_inverter_test.cpp @@ -1,8 +1,14 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/document/repo/fixedtyperepo.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/field_length_calculator.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/field_index_remover.h> #include <vespa/searchlib/memoryindex/field_inverter.h> #include <vespa/searchlib/memoryindex/word_store.h> @@ -13,9 +19,12 @@ namespace search { +using document::ArrayFieldValue; using document::Document; -using index::DocBuilder; +using document::WeightedSetFieldValue; +using index::EmptyDocBuilder; using index::Schema; +using index::StringFieldBuilder; using index::schema::CollectionType; using index::schema::DataType; @@ -26,93 +35,91 @@ namespace memoryindex { namespace { Document::UP -makeDoc10(DocBuilder &b) +makeDoc10(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("f0"). 
- addStr("a").addStr("b").addStr("c").addStr("d"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::10"); + doc->setValue("f0", sfb.tokenize("a b c d").build()); + return doc; } Document::UP -makeDoc11(DocBuilder &b) +makeDoc11(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::11"); - b.startIndexField("f0"). - addStr("a").addStr("b").addStr("e").addStr("f"). - endField(); - b.startIndexField("f1"). - addStr("a").addStr("g"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::11"); + doc->setValue("f0", sfb.tokenize("a b e f").build()); + doc->setValue("f1", sfb.tokenize("a g").build()); + return doc; } Document::UP -makeDoc12(DocBuilder &b) +makeDoc12(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::12"); - b.startIndexField("f0"). - addStr("h").addStr("doc12"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::12"); + doc->setValue("f0", sfb.tokenize("h doc12").build()); + return doc; } Document::UP -makeDoc13(DocBuilder &b) +makeDoc13(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::13"); - b.startIndexField("f0"). - addStr("i").addStr("doc13"). - endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::13"); + doc->setValue("f0", sfb.tokenize("i doc13").build()); + return doc; } Document::UP -makeDoc14(DocBuilder &b) +makeDoc14(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::14"); - b.startIndexField("f0"). - addStr("j").addStr("doc14"). 
- endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::14"); + doc->setValue("f0", sfb.tokenize("j doc14").build()); + return doc; } Document::UP -makeDoc15(DocBuilder &b) +makeDoc15(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::15"); - return b.endDocument(); + return b.make_document("id:ns:searchdocument::15"); } Document::UP -makeDoc16(DocBuilder &b) +makeDoc16(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::16"); - b.startIndexField("f0").addStr("foo").addStr("bar").addStr("baz"). - addTermAnnotation("altbaz").addStr("y").addTermAnnotation("alty"). - addStr("z").endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::16"); + doc->setValue("f0", sfb.tokenize("foo bar baz").alt_word("altbaz").tokenize(" y").alt_word("alty").tokenize(" z").build()); + return doc; } Document::UP -makeDoc17(DocBuilder &b) +makeDoc17(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::17"); - b.startIndexField("f1").addStr("foo0").addStr("bar0").endField(); - b.startIndexField("f2").startElement(1).addStr("foo").addStr("bar").endElement().startElement(1).addStr("bar").endElement().endField(); - b.startIndexField("f3").startElement(3).addStr("foo2").addStr("bar2").endElement().startElement(4).addStr("bar2").endElement().endField(); - return b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::17"); + doc->setValue("f1", sfb.tokenize("foo0 bar0").build()); + ArrayFieldValue string_array(b.get_data_type("Array<String>")); + string_array.add(sfb.tokenize("foo bar").build()); + string_array.add(sfb.tokenize("bar").build()); + doc->setValue("f2", string_array); + WeightedSetFieldValue string_wset(b.get_data_type("WeightedSet<String>")); + string_wset.add(sfb.tokenize("foo2 bar2").build(), 3); + string_wset.add(sfb.tokenize("bar2").build(), 4); + doc->setValue("f3", 
string_wset); + return doc; } vespalib::string corruptWord = "corruptWord"; Document::UP -makeCorruptDocument(DocBuilder &b, size_t wordOffset) +makeCorruptDocument(EmptyDocBuilder &b, size_t wordOffset) { - b.startDocument("id:ns:searchdocument::18"); - b.startIndexField("f0").addStr("before").addStr(corruptWord).addStr("after").addStr("z").endField(); - auto doc = b.endDocument(); + StringFieldBuilder sfb(b); + auto doc = b.make_document("id:ns:searchdocument::18"); + doc->setValue("f0", sfb.tokenize("before ").word(corruptWord).tokenize(" after z").build()); vespalib::nbostream stream; doc->serialize(stream); std::vector<char> raw; @@ -127,14 +134,14 @@ makeCorruptDocument(DocBuilder &b, size_t wordOffset) } vespalib::nbostream badstream; badstream.write(&raw[0], raw.size()); - return std::make_unique<Document>(*b.getDocumentTypeRepo(), badstream); + return std::make_unique<Document>(b.get_repo(), badstream); } } struct FieldInverterTest : public ::testing::Test { Schema _schema; - DocBuilder _b; + EmptyDocBuilder _b; WordStore _word_store; FieldIndexRemover _remover; test::OrderedFieldIndexInserterBackend _inserter_backend; @@ -151,9 +158,21 @@ struct FieldInverterTest : public ::testing::Test { return schema; } + static EmptyDocBuilder::AddFieldsType + make_add_fields() + { + return [](auto& header) { using namespace document::config_builder; + using DataType = document::DataType; + header.addField("f0", DataType::T_STRING) + .addField("f1", DataType::T_STRING) + .addField("f2", Array(DataType::T_STRING)) + .addField("f3", Wset(DataType::T_STRING)); + }; + } + FieldInverterTest() : _schema(makeSchema()), - _b(_schema), + _b(make_add_fields()), _word_store(), _remover(_word_store), _inserter_backend(), diff --git a/searchlib/src/tests/memoryindex/memory_index/memory_index_test.cpp b/searchlib/src/tests/memoryindex/memory_index/memory_index_test.cpp index b3ea948dfa7..1730e34adb5 100644 --- a/searchlib/src/tests/memoryindex/memory_index/memory_index_test.cpp +++ 
b/searchlib/src/tests/memoryindex/memory_index/memory_index_test.cpp @@ -1,11 +1,15 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/searchlib/common/scheduletaskcallback.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/searchlib/fef/matchdatalayout.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/i_field_length_inspector.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/memory_index.h> #include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/queryeval/booleanmatchiteratorwrapper.h> @@ -59,6 +63,12 @@ struct MySetup : public IFieldLengthInspector { } return FieldLengthInfo(); } + void add_fields(document::config_builder::Struct& header) const { + for (uint32_t i = 0; i < schema.getNumIndexFields(); ++i) { + auto& field = schema.getIndexField(i); + header.addField(field.getName(), document::DataType::T_STRING); + } + } }; @@ -70,31 +80,38 @@ struct Index { std::unique_ptr<ISequencedTaskExecutor> _invertThreads; std::unique_ptr<ISequencedTaskExecutor> _pushThreads; MemoryIndex index; - DocBuilder builder; + EmptyDocBuilder builder; + StringFieldBuilder sfb; + std::unique_ptr<Document> builder_doc; uint32_t docid; std::string currentField; + bool add_space; Index(const MySetup &setup); ~Index(); void closeField() { if (!currentField.empty()) { - builder.endField(); + builder_doc->setValue(currentField, sfb.build()); currentField.clear(); } } Index &doc(uint32_t id) { docid = id; - builder.startDocument(vespalib::make_string("id:ns:searchdocument::%u", id)); + builder_doc = 
builder.make_document(vespalib::make_string("id:ns:searchdocument::%u", id)); return *this; } Index &field(const std::string &name) { closeField(); - builder.startIndexField(name); currentField = name; + add_space = false; return *this; } Index &add(const std::string &token) { - builder.addStr(token); + if (add_space) { + sfb.space(); + } + add_space = true; + sfb.word(token); return *this; } void internalSyncCommit() { @@ -106,7 +123,7 @@ struct Index { } Document::UP commit() { closeField(); - Document::UP d = builder.endDocument(); + Document::UP d = std::move(builder_doc); index.insertDocument(docid, *d, {}); internalSyncCommit(); return d; @@ -133,9 +150,12 @@ Index::Index(const MySetup &setup) _invertThreads(SequencedTaskExecutor::create(invert_executor, 2)), _pushThreads(SequencedTaskExecutor::create(push_executor, 2)), index(schema, setup, *_invertThreads, *_pushThreads), - builder(schema), + builder([&setup](auto& header) { setup.add_fields(header); }), + sfb(builder), + builder_doc(), docid(1), - currentField() + currentField(), + add_space(false) { } Index::~Index() = default; diff --git a/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp b/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp index 969f483eef6..3995f06628c 100644 --- a/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp +++ b/searchlib/src/tests/memoryindex/url_field_inverter/url_field_inverter_test.cpp @@ -1,11 +1,21 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/searchlib/memoryindex/url_field_inverter.h> +#include <vespa/document/datatype/urldatatype.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/structfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/document/repo/configbuilder.h> #include <vespa/document/repo/fixedtyperepo.h> -#include <vespa/searchlib/index/docbuilder.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchlib/index/empty_doc_builder.h> #include <vespa/searchlib/index/field_length_calculator.h> +#include <vespa/searchlib/index/schema_index_fields.h> +#include <vespa/searchlib/index/string_field_builder.h> #include <vespa/searchlib/memoryindex/field_index_remover.h> #include <vespa/searchlib/memoryindex/field_inverter.h> -#include <vespa/searchlib/memoryindex/url_field_inverter.h> #include <vespa/searchlib/memoryindex/word_store.h> #include <vespa/searchlib/test/memoryindex/ordered_field_index_inserter.h> #include <vespa/searchlib/test/memoryindex/ordered_field_index_inserter_backend.h> @@ -14,6 +24,10 @@ namespace search { using document::Document; +using document::ArrayFieldValue; +using document::StructFieldValue; +using document::UrlDataType; +using document::WeightedSetFieldValue; using index::schema::CollectionType; using index::schema::DataType; @@ -26,160 +40,88 @@ namespace { const vespalib::string url = "url"; Document::UP -makeDoc10Single(DocBuilder &b) +makeDoc10Single(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("url"). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:81/fluke?ab=2#4"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). 
- addUrlTokenizedString("81"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - addTermAnnotation("altfluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("4"). - endSubField(). - endField(); - return b.endDocument(); + auto doc = b.make_document("id:ns:searchdocument::10"); + StructFieldValue url_value(b.get_data_type("url")); + StringFieldBuilder sfb(b); + sfb.url_mode(true); + url_value.setValue("all", sfb.tokenize("http://www.example.com:81/fluke?ab=2#4").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", sfb.tokenize("81").build()); + url_value.setValue("path", sfb.tokenize("/fluke").alt_word("altfluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("4").build()); + doc->setValue("url", url_value); + return doc; } Document::UP -makeDoc10Array(DocBuilder &b) +makeDoc10Array(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("url"). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:82/fluke?ab=2#8"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - addTermAnnotation("altfluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("8"). - endSubField(). - endElement(). - startElement(1). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:82/fluke?ab=2#9"). - endSubField(). 
- startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("82"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("9"). - endSubField(). - endElement(). - endField(); - return b.endDocument(); + auto doc = b.make_document("id:ns:searchdocument::10"); + StringFieldBuilder sfb(b); + sfb.url_mode(true); + ArrayFieldValue url_array(b.get_data_type("Array<url>")); + StructFieldValue url_value(b.get_data_type("url")); + url_value.setValue("all", sfb.tokenize("http://www.example.com:82/fluke?ab=2#8").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", sfb.tokenize("82").build()); + url_value.setValue("path", sfb.tokenize("/fluke").alt_word("altfluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("8").build()); + url_array.add(url_value); + url_value.setValue("all", sfb.tokenize("http://www.flickr.com:82/fluke?ab=2#9").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.flickr.com").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("fragment", sfb.tokenize("9").build()); + url_array.add(url_value); + doc->setValue("url", url_array); + return doc; } Document::UP -makeDoc10WeightedSet(DocBuilder &b) +makeDoc10WeightedSet(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - b.startIndexField("url"). - startElement(4). - startSubField("all"). - addUrlTokenizedString("http://www.example.com:83/fluke?ab=2#12"). - endSubField(). 
- startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.example.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("83"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - addTermAnnotation("altfluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("12"). - endSubField(). - endElement(). - startElement(7). - startSubField("all"). - addUrlTokenizedString("http://www.flickr.com:85/fluke?ab=2#13"). - endSubField(). - startSubField("scheme"). - addUrlTokenizedString("http"). - endSubField(). - startSubField("host"). - addUrlTokenizedString("www.flickr.com"). - endSubField(). - startSubField("port"). - addUrlTokenizedString("85"). - endSubField(). - startSubField("path"). - addUrlTokenizedString("/fluke"). - endSubField(). - startSubField("query"). - addUrlTokenizedString("ab=2"). - endSubField(). - startSubField("fragment"). - addUrlTokenizedString("13"). - endSubField(). - endElement(). 
- endField(); - return b.endDocument(); + auto doc = b.make_document("id:ns:searchdocument::10"); + StringFieldBuilder sfb(b); + sfb.url_mode(true); + WeightedSetFieldValue url_wset(b.get_data_type("WeightedSet<url>")); + StructFieldValue url_value(b.get_data_type("url")); + url_value.setValue("all", sfb.tokenize("http://www.example.com:83/fluke?ab=2#12").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.example.com").build()); + url_value.setValue("port", sfb.tokenize("83").build()); + url_value.setValue("path", sfb.tokenize("/fluke").alt_word("altfluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("12").build()); + url_wset.add(url_value, 4); + url_value.setValue("all", sfb.tokenize("http://www.flickr.com:85/fluke?ab=2#13").build()); + url_value.setValue("scheme", sfb.tokenize("http").build()); + url_value.setValue("host", sfb.tokenize("www.flickr.com").build()); + url_value.setValue("port", sfb.tokenize("85").build()); + url_value.setValue("path", sfb.tokenize("/fluke").build()); + url_value.setValue("query", sfb.tokenize("ab=2").build()); + url_value.setValue("fragment", sfb.tokenize("13").build()); + url_wset.add(url_value, 7); + doc->setValue("url", url_wset); + return doc; } Document::UP -makeDoc10Empty(DocBuilder &b) +makeDoc10Empty(EmptyDocBuilder &b) { - b.startDocument("id:ns:searchdocument::10"); - return b.endDocument(); + return b.make_document("id:ns:searchdocument::10"); } } struct UrlFieldInverterTest : public ::testing::Test { Schema _schema; - DocBuilder _b; + EmptyDocBuilder _b; WordStore _word_store; FieldIndexRemover _remover; test::OrderedFieldIndexInserterBackend _inserter_backend; @@ -195,9 +137,10 @@ struct UrlFieldInverterTest : public ::testing::Test { return schema; } - UrlFieldInverterTest(Schema::CollectionType collectionType) + UrlFieldInverterTest(Schema::CollectionType collectionType, + 
EmptyDocBuilder::AddFieldsType add_fields) : _schema(makeSchema(collectionType)), - _b(_schema), + _b(add_fields), _word_store(), _remover(_word_store), _inserter_backend(), @@ -250,16 +193,32 @@ struct UrlFieldInverterTest : public ::testing::Test { UrlFieldInverterTest::~UrlFieldInverterTest() = default; +EmptyDocBuilder::AddFieldsType +add_single_url = [](auto& header) { + header.addField("url", UrlDataType::getInstance().getId()); }; + +EmptyDocBuilder::AddFieldsType +add_array_url = [](auto& header) { + using namespace document::config_builder; + header.addField("url", Array(UrlDataType::getInstance().getId())); }; + +EmptyDocBuilder::AddFieldsType +add_wset_url = [](auto& header) { + using namespace document::config_builder; + header.addField("url", Wset(UrlDataType::getInstance().getId())); }; + + + struct SingleInverterTest : public UrlFieldInverterTest { - SingleInverterTest() : UrlFieldInverterTest(CollectionType::SINGLE) {} + SingleInverterTest() : UrlFieldInverterTest(CollectionType::SINGLE, add_single_url) {} }; struct ArrayInverterTest : public UrlFieldInverterTest { - ArrayInverterTest() : UrlFieldInverterTest(CollectionType::ARRAY) {} + ArrayInverterTest() : UrlFieldInverterTest(CollectionType::ARRAY, add_array_url) {} }; struct WeightedSetInverterTest : public UrlFieldInverterTest { - WeightedSetInverterTest() : UrlFieldInverterTest(CollectionType::WEIGHTEDSET) {} + WeightedSetInverterTest() : UrlFieldInverterTest(CollectionType::WEIGHTEDSET, add_wset_url) {} }; diff --git a/searchlib/src/vespa/searchlib/index/CMakeLists.txt b/searchlib/src/vespa/searchlib/index/CMakeLists.txt index 958614844d1..afeb020598b 100644 --- a/searchlib/src/vespa/searchlib/index/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/index/CMakeLists.txt @@ -2,9 +2,7 @@ vespa_add_library(searchlib_searchlib_index OBJECT SOURCES dictionaryfile.cpp - docbuilder.cpp docidandfeatures.cpp - doctypebuilder.cpp dummyfileheadercontext.cpp empty_doc_builder.cpp indexbuilder.cpp @@ 
-15,6 +13,7 @@ vespa_add_library(searchlib_searchlib_index OBJECT postinglistparams.cpp schemautil.cpp schema_index_fields.cpp + string_field_builder.cpp uri_field.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/index/docbuilder.cpp b/searchlib/src/vespa/searchlib/index/docbuilder.cpp deleted file mode 100644 index d6169f2f396..00000000000 --- a/searchlib/src/vespa/searchlib/index/docbuilder.cpp +++ /dev/null @@ -1,814 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "docbuilder.h" -#include <vespa/document/datatype/urldatatype.h> -#include <vespa/document/datatype/documenttype.h> -#include <vespa/document/repo/documenttyperepo.h> -#include <vespa/fastlib/text/unicodeutil.h> -#include <vespa/vespalib/geo/zcurve.h> -#include <vespa/vespalib/text/utf8.h> -#include <vespa/eval/eval/value.h> -#include <vespa/vespalib/data/slime/slime.h> - -using namespace document; -using namespace search::index; - -using search::index::schema::CollectionType; -using vespalib::Utf8Reader; -using vespalib::Utf8Writer; -using vespalib::geo::ZCurve; - -namespace { - -void -insertStr(const Schema::Field & sfield, document::FieldValue * fvalue, const vespalib::string & val) -{ - if (sfield.getDataType() == schema::DataType::STRING || - sfield.getDataType() == schema::DataType::RAW) - { - (dynamic_cast<LiteralFieldValueB *>(fvalue))->setValue(val); - } else { - throw DocBuilder::Error(vespalib::make_string("Field '%s' not compatible", sfield.getName().c_str())); - } -} - -void -insertInt(const Schema::Field & sfield, document::FieldValue * fvalue, int64_t val) -{ - if (sfield.getDataType() == schema::DataType::INT8) { - (dynamic_cast<ByteFieldValue *>(fvalue))->setValue((uint8_t)val); - } else if (sfield.getDataType() == schema::DataType::INT16) { - (dynamic_cast<ShortFieldValue *>(fvalue))->setValue((int16_t)val); - } else if (sfield.getDataType() == schema::DataType::INT32) { - (dynamic_cast<IntFieldValue 
*>(fvalue))->setValue((int32_t)val); - } else if (sfield.getDataType() == schema::DataType::INT64) { - (dynamic_cast<LongFieldValue *>(fvalue))->setValue(val); - } else { - throw DocBuilder::Error(vespalib::make_string("Field '%s' not compatible", sfield.getName().c_str())); - } -} - -void -insertFloat(const Schema::Field & sfield, document::FieldValue * fvalue, double val) -{ - if (sfield.getDataType() == schema::DataType::FLOAT) { - (dynamic_cast<FloatFieldValue *>(fvalue))->setValue((float)val); - } else if (sfield.getDataType() == schema::DataType::DOUBLE) { - (dynamic_cast<DoubleFieldValue *>(fvalue))->setValue(val); - } else { - throw DocBuilder::Error(vespalib::make_string("Field '%s' not compatible", sfield.getName().c_str())); - } -} - -void insertPredicate(const Schema::Field &sfield, - document::FieldValue *fvalue, - std::unique_ptr<vespalib::Slime> val) { - if (sfield.getDataType() == schema::DataType::BOOLEANTREE) { - *(dynamic_cast<PredicateFieldValue *>(fvalue)) = - PredicateFieldValue(std::move(val)); - } else { - throw DocBuilder::Error(vespalib::make_string( - "Field '%s' not compatible", - sfield.getName().c_str())); - } -} - -void insertTensor(const Schema::Field &schemaField, - document::FieldValue *fvalue, - std::unique_ptr<vespalib::eval::Value> val) { - if (schemaField.getDataType() == schema::DataType::TENSOR) { - *(dynamic_cast<TensorFieldValue *>(fvalue)) = std::move(val); - } else { - throw DocBuilder::Error(vespalib::make_string( - "Field '%s' not compatible", - schemaField.getName().c_str())); - } -} - -void -insertPosition(const Schema::Field & sfield, - document::FieldValue * fvalue, int32_t xpos, int32_t ypos) -{ - assert(*fvalue->getDataType() == *DataType::LONG); - assert(sfield.getDataType() == schema::DataType::INT64); - (void) sfield; - int64_t zpos = ZCurve::encode(xpos, ypos); - document::LongFieldValue *zvalue = - dynamic_cast<LongFieldValue *>(fvalue); - zvalue->setValue(zpos); -} - -} - -namespace docbuilderkludge -{ - 
-namespace linguistics -{ - -const vespalib::string SPANTREE_NAME("linguistics"); - -enum TokenType { - UNKNOWN = 0, - SPACE = 1, - PUNCTUATION = 2, - SYMBOL = 3, - ALPHABETIC = 4, - NUMERIC = 5, - MARKER = 6 -}; - -} - -} - -using namespace docbuilderkludge; - -namespace { - -Annotation -makeTokenType(linguistics::TokenType type) -{ - return Annotation(*AnnotationType::TOKEN_TYPE, std::make_unique<IntFieldValue>(type)); -} - -} - -namespace search::index { - -VESPA_IMPLEMENT_EXCEPTION(DocBuilderError, vespalib::Exception); - -DocBuilder::FieldHandle::FieldHandle(const document::Field & dfield, const Schema::Field & field) : - _sfield(field), - _value(), - _element() -{ - _value = dfield.createValue(); -} - -DocBuilder::CollectionFieldHandle::CollectionFieldHandle(const document::Field & dfield, const Schema::Field & field) : - FieldHandle(dfield, field), - _elementWeight(1) -{ -} - -void -DocBuilder::CollectionFieldHandle::startElement(int32_t weight) -{ - assert(!_element); - _elementWeight = weight; - const CollectionFieldValue * value = dynamic_cast<CollectionFieldValue *>(_value.get()); - _element = value->createNested(); -} - -void -DocBuilder::CollectionFieldHandle::endElement() -{ - if (_sfield.getCollectionType() == CollectionType::ARRAY) { - onEndElement(); - ArrayFieldValue * value = dynamic_cast<ArrayFieldValue *>(_value.get()); - value->add(*_element); - } else if (_sfield.getCollectionType() == CollectionType::WEIGHTEDSET) { - onEndElement(); - WeightedSetFieldValue * value = dynamic_cast<WeightedSetFieldValue *>(_value.get()); - value->add(*_element, _elementWeight); - } else { - throw Error(vespalib::make_string("Field '%s' not compatible", _sfield.getName().c_str())); - } - _element.reset(); -} - -DocBuilder::IndexFieldHandle::IndexFieldHandle(const FixedTypeRepo & repo, const document::Field & dfield, const Schema::Field & sfield) - : CollectionFieldHandle(dfield, sfield), - _str(), - _strSymbols(0u), - _spanList(nullptr), - _spanTree(), - 
_lastSpan(nullptr), - _spanStart(0u), - _autoAnnotate(true), - _autoSpace(true), - _skipAutoSpace(true), - _uriField(false), - _subField(), - _repo(repo) -{ - _str.reserve(1023); - - if (_sfield.getCollectionType() == CollectionType::SINGLE) { - if (*_value->getDataType() == document::UrlDataType::getInstance()) { - _uriField = true; - } - } else { - const CollectionFieldValue * value = dynamic_cast<CollectionFieldValue *>(_value.get()); - if (value->getNestedType() == document::UrlDataType::getInstance()) { - _uriField = true; - } - } - startAnnotate(); -} - -void -DocBuilder::IndexFieldHandle::append(const vespalib::string &val) -{ - _strSymbols += val.size(); - _str += val; -} - -void -DocBuilder::IndexFieldHandle::addStr(const vespalib::string &val) -{ - assert(_spanTree); - if (val.empty()) { - return; - } - if (!_skipAutoSpace && _autoSpace) { - addSpace(); - } - _skipAutoSpace = false; - _spanStart = _strSymbols; - append(val); - if (_autoAnnotate) { - addSpan(); - addTermAnnotation(); - if (val[0] >= '0' && val[0] <= '9') { - addNumericTokenAnnotation(); - } else { - addAlphabeticTokenAnnotation(); - } - } -} - -void -DocBuilder::IndexFieldHandle::addSpace() -{ - addNoWordStr(" "); -} - -void -DocBuilder::IndexFieldHandle::addNoWordStr(const vespalib::string &val) -{ - assert(_spanTree); - if (val.empty()) { - return; - } - _spanStart = _strSymbols; - append(val); - if (_autoAnnotate) { - addSpan(); - if (val[0] == ' ' || val[0] == '\t') { - addSpaceTokenAnnotation(); - } else if (val[0] >= '0' && val[0] <= '9') { - addNumericTokenAnnotation(); - } else { - addAlphabeticTokenAnnotation(); - } - - } - _skipAutoSpace = true; -} - -void -DocBuilder::IndexFieldHandle::addTokenizedString(const vespalib::string &val, - bool urlMode) -{ - Utf8Reader r(val); - vespalib::string sbuf; - Utf8Writer w(sbuf); - uint32_t c = 0u; - bool oldWord = false; - assert(_uriField == urlMode); - assert(_uriField != _subField.empty()); - - while (r.hasMore()) { - c = r.getChar(); - 
bool newWord = Fast_UnicodeUtil::IsWordChar(c) || - (urlMode && (c == '-' || c == '_')); - if (oldWord != newWord) { - if (!sbuf.empty()) { - if (oldWord) { - addStr(sbuf); - } else { - addNoWordStr(sbuf); - } - sbuf.clear(); - } - oldWord = newWord; - } - w.putChar(c); - } - if (!sbuf.empty()) { - if (oldWord) { - addStr(sbuf); - } else { - addNoWordStr(sbuf); - } - } -} - -void -DocBuilder::IndexFieldHandle::addSpan(size_t start, size_t len) -{ - const SpanNode &span = _spanList->add(std::make_unique<Span>(start, len)); - _lastSpan = &span; -} - -void -DocBuilder::IndexFieldHandle::addSpan() -{ - size_t endPos = _strSymbols; - assert(endPos > _spanStart); - addSpan(_spanStart, endPos - _spanStart); - _spanStart = endPos; -} - -void -DocBuilder::IndexFieldHandle::addSpaceTokenAnnotation() -{ - assert(_spanTree); - assert(_lastSpan != nullptr); - _spanTree->annotate(*_lastSpan, makeTokenType(linguistics::SPACE)); -} - -void -DocBuilder::IndexFieldHandle::addNumericTokenAnnotation() -{ - assert(_spanTree); - assert(_lastSpan != nullptr); - _spanTree->annotate(*_lastSpan, makeTokenType(linguistics::NUMERIC)); -} - -void -DocBuilder::IndexFieldHandle::addAlphabeticTokenAnnotation() -{ - assert(_spanTree); - assert(_lastSpan != nullptr); - _spanTree->annotate(*_lastSpan, makeTokenType(linguistics::ALPHABETIC)); -} - -void -DocBuilder::IndexFieldHandle::addTermAnnotation() -{ - assert(_spanTree); - assert(_lastSpan != nullptr); - _spanTree->annotate(*_lastSpan, *AnnotationType::TERM); -} - -void -DocBuilder::IndexFieldHandle::addTermAnnotation(const vespalib::string &val) -{ - assert(_spanTree); - assert(_lastSpan != nullptr); - _spanTree->annotate(*_lastSpan, - Annotation(*AnnotationType::TERM, - std::make_unique<StringFieldValue>(val))); -} - -void -DocBuilder::IndexFieldHandle::onEndElement() -{ - // Flush data for index field. 
- assert(_subField.empty()); - if (_uriField) { - return; - } - StringFieldValue * value; - if (_sfield.getCollectionType() != CollectionType::SINGLE) { - value = dynamic_cast<StringFieldValue *>(_element.get()); - } else { - value = dynamic_cast<StringFieldValue *>(_value.get()); - } - value->setValue(_str); - // Also drop all spans no annotation for now - if (_spanTree->numAnnotations() > 0u) { - StringFieldValue::SpanTrees trees; - trees.emplace_back(std::move(_spanTree)); - value->setSpanTrees(trees, _repo); - } else { - _spanTree.reset(); - } - _spanList = nullptr; - _lastSpan = nullptr; - _spanStart = 0u; - _strSymbols = 0u; - _str.clear(); - _skipAutoSpace = true; - startAnnotate(); -} - -void -DocBuilder::IndexFieldHandle::onEndField() -{ - if (_sfield.getCollectionType() == CollectionType::SINGLE) { - onEndElement(); - } -} - -void -DocBuilder::IndexFieldHandle::startAnnotate() -{ - SpanList::UP span_list(new SpanList); - _spanList = span_list.get(); - _spanTree.reset(new SpanTree(linguistics::SPANTREE_NAME, std::move(span_list))); -} - -void -DocBuilder::IndexFieldHandle::setAutoAnnotate(bool autoAnnotate) -{ - _autoAnnotate = autoAnnotate; -} - -void -DocBuilder::IndexFieldHandle::setAutoSpace(bool autoSpace) -{ - _autoSpace = autoSpace; -} - -void -DocBuilder::IndexFieldHandle::startSubField(const vespalib::string &subField) -{ - assert(_subField.empty()); - assert(_uriField); - _subField = subField; -} - -void -DocBuilder::IndexFieldHandle::endSubField() -{ - assert(!_subField.empty()); - assert(_uriField); - StructuredFieldValue *sValue; - if (_sfield.getCollectionType() != CollectionType::SINGLE) { - sValue = dynamic_cast<StructFieldValue *>(_element.get()); - } else { - sValue = dynamic_cast<StructFieldValue *>(_value.get()); - } - const Field &f = sValue->getField(_subField); - FieldValue::UP fval(f.getDataType().createFieldValue()); - *fval = _str; - StringFieldValue *value = dynamic_cast<StringFieldValue *>(fval.get()); - 
StringFieldValue::SpanTrees trees; - trees.emplace_back(std::move(_spanTree)); - value->setSpanTrees(trees, _repo); - sValue->setValue(f, *fval); - _spanList = nullptr; - _lastSpan = nullptr; - _spanStart = 0u; - _strSymbols = 0u; - _str.clear(); - _skipAutoSpace = true; - startAnnotate(); - _subField.clear(); -} - -DocBuilder::AttributeFieldHandle:: -AttributeFieldHandle(const document::Field &dfield, - const Schema::Field &sfield) - : CollectionFieldHandle(dfield, sfield) -{ -} - -void -DocBuilder::AttributeFieldHandle::addStr(const vespalib::string & val) -{ - if (_element) { - insertStr(_sfield, _element.get(), val); - } else { - insertStr(_sfield, _value.get(), val); - } -} - -void -DocBuilder::AttributeFieldHandle::addInt(int64_t val) -{ - if (_element) { - insertInt(_sfield, _element.get(), val); - } else { - insertInt(_sfield, _value.get(), val); - } -} - -void -DocBuilder::AttributeFieldHandle::addFloat(double val) -{ - if (_element) { - insertFloat(_sfield, _element.get(), val); - } else { - insertFloat(_sfield, _value.get(), val); - } -} - -void -DocBuilder::AttributeFieldHandle::addPredicate( - std::unique_ptr<vespalib::Slime> val) -{ - if (_element) { - insertPredicate(_sfield, _element.get(), std::move(val)); - } else { - insertPredicate(_sfield, _value.get(), std::move(val)); - } -} - -void -DocBuilder::AttributeFieldHandle::addTensor( - std::unique_ptr<vespalib::eval::Value> val) -{ - if (_element) { - insertTensor(_sfield, _element.get(), std::move(val)); - } else { - insertTensor(_sfield, _value.get(), std::move(val)); - } -} - -void -DocBuilder::AttributeFieldHandle::addPosition(int32_t xpos, int32_t ypos) -{ - if (_element) { - insertPosition(_sfield, _element.get(), xpos, ypos); - } else { - insertPosition(_sfield, _value.get(), xpos, ypos); - } -} - -DocBuilder::DocumentHandle::DocumentHandle(document::Document &doc, const vespalib::string & docId) - : _type(&doc.getType()), - _doc(&doc), - _fieldHandle(), - _repo(*_doc->getRepo(), *_type) -{ 
- (void) docId; -} - -DocBuilder::DocumentHandle::~DocumentHandle() = default; - -void -DocBuilder::DocumentHandle::startIndexField(const Schema::Field & sfield) { - _fieldHandle.reset(new IndexFieldHandle(_repo, _type->getField(sfield.getName()), sfield)); -} -void -DocBuilder::DocumentHandle::startAttributeField(const Schema::Field & sfield) { - _fieldHandle.reset(new AttributeFieldHandle(_type->getField(sfield.getName()), sfield)); -} - -void -DocBuilder::DocumentHandle::endField() { - _fieldHandle->onEndField(); - _doc->setValue(_type->getField(_fieldHandle->getField().getName()), *_fieldHandle->getValue()); - _fieldHandle.reset(); -} - -DocBuilder::DocBuilder(const Schema &schema) - : _schema(schema), - _doctypes_config(DocTypeBuilder(schema).makeConfig()), - _repo(std::make_shared<DocumentTypeRepo>(_doctypes_config)), - _docType(*_repo->getDocumentType("searchdocument")), - _doc(), - _handleDoc(), - _currDoc() -{ -} - -DocBuilder::~DocBuilder() = default; - -DocBuilder & -DocBuilder::startDocument(const vespalib::string & docId) -{ - _doc = std::make_unique<Document>(_docType, DocumentId(docId)); - _doc->setRepo(*_repo); - _handleDoc = std::make_shared<DocumentHandle>(*_doc, docId); - return *this; -} - -document::Document::UP -DocBuilder::endDocument() -{ - _handleDoc->endDocument(_doc); - return std::move(_doc); -} - -DocBuilder & -DocBuilder::startIndexField(const vespalib::string & name) -{ - assert(!_handleDoc->getFieldHandle()); - uint32_t field_id = _schema.getIndexFieldId(name); - assert(field_id != Schema::UNKNOWN_FIELD_ID); - _handleDoc->startIndexField(_schema.getIndexField(field_id)); - _currDoc = _handleDoc.get(); - return *this; -} - -DocBuilder & -DocBuilder::startAttributeField(const vespalib::string & name) -{ - assert(!_handleDoc->getFieldHandle()); - uint32_t field_id = _schema.getIndexFieldId(name); - assert(field_id == Schema::UNKNOWN_FIELD_ID); - field_id = _schema.getAttributeFieldId(name); - assert(field_id != 
Schema::UNKNOWN_FIELD_ID); - _handleDoc->startAttributeField(_schema.getAttributeField(field_id)); - _currDoc = _handleDoc.get(); - return *this; -} - -DocBuilder & -DocBuilder::endField() -{ - assert(_currDoc != nullptr); - _currDoc->endField(); - _currDoc = nullptr; - return *this; -} - -DocBuilder & -DocBuilder::startElement(int32_t weight) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->startElement(weight); - return *this; -} - -DocBuilder & -DocBuilder::endElement() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->endElement(); - return *this; -} - -DocBuilder & -DocBuilder::addStr(const vespalib::string & str) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addStr(str); - return *this; -} - -DocBuilder & -DocBuilder::addSpace() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addSpace(); - return *this; -} - -DocBuilder & -DocBuilder::addNoWordStr(const vespalib::string & str) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addNoWordStr(str); - return *this; -} - -DocBuilder & -DocBuilder::addTokenizedString(const vespalib::string &str) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addTokenizedString(str, false); - return *this; -} - -DocBuilder & -DocBuilder::addUrlTokenizedString(const vespalib::string &str) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addTokenizedString(str, true); - return *this; -} - -DocBuilder & -DocBuilder::addInt(int64_t val) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addInt(val); - return *this; -} - -DocBuilder & -DocBuilder::addFloat(double val) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addFloat(val); - return *this; -} - -DocBuilder & -DocBuilder::addPredicate(std::unique_ptr<vespalib::Slime> val) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addPredicate(std::move(val)); - return *this; -} - -DocBuilder & 
-DocBuilder::addTensor(std::unique_ptr<vespalib::eval::Value> val) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addTensor(std::move(val)); - return *this; -} - -DocBuilder & -DocBuilder::addSpan(size_t start, size_t len) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addSpan(start, len); - return *this; -} - -DocBuilder & -DocBuilder::addSpan() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addSpan(); - return *this; -} - -DocBuilder & -DocBuilder::addSpaceTokenAnnotation() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addSpaceTokenAnnotation(); - return *this; -} - -DocBuilder & -DocBuilder::addNumericTokenAnnotation() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addNumericTokenAnnotation(); - return *this; -} - -DocBuilder & -DocBuilder::addAlphabeticTokenAnnotation() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addAlphabeticTokenAnnotation(); - return *this; -} - -DocBuilder& -DocBuilder::addTermAnnotation() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addTermAnnotation(); - return *this; -} - -DocBuilder & -DocBuilder::addTermAnnotation(const vespalib::string &val) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addTermAnnotation(val); - return *this; -} - -DocBuilder & -DocBuilder::addPosition(int32_t xpos, int32_t ypos) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addPosition(xpos, ypos); - return *this; -} - -DocBuilder & -DocBuilder::addRaw(const void *buf, size_t len) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->addRaw(buf, len); - return *this; -} - -DocBuilder & -DocBuilder::startSubField(const vespalib::string &subField) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->startSubField(subField); - return *this; -} - -DocBuilder & -DocBuilder::endSubField() -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->endSubField(); - return *this; -} - 
-DocBuilder & -DocBuilder::setAutoAnnotate(bool autoAnnotate) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->setAutoAnnotate(autoAnnotate); - return *this; -} - -DocBuilder & -DocBuilder::setAutoSpace(bool autoSpace) -{ - assert(_currDoc != nullptr); - _currDoc->getFieldHandle()->setAutoSpace(autoSpace); - return *this; -} - -} diff --git a/searchlib/src/vespa/searchlib/index/docbuilder.h b/searchlib/src/vespa/searchlib/index/docbuilder.h deleted file mode 100644 index a8a37b57070..00000000000 --- a/searchlib/src/vespa/searchlib/index/docbuilder.h +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "doctypebuilder.h" -#include <vespa/document/repo/fixedtyperepo.h> -#include <vespa/document/fieldvalue/fieldvalues.h> -#include <vespa/document/annotation/annotation.h> -#include <vespa/document/annotation/span.h> -#include <vespa/document/annotation/spanlist.h> -#include <vespa/document/annotation/spantree.h> -#include <vespa/vespalib/util/exception.h> -#include <vespa/vespalib/util/stringfmt.h> - -namespace vespalib::eval { struct Value; } - -namespace search::index { - -VESPA_DEFINE_EXCEPTION(DocBuilderError, vespalib::Exception); - -/** - * Builder class used to generate a search document that corresponds - * to an index schema. - **/ -class DocBuilder { -public: - typedef DocBuilderError Error; - -private: - /** - * Base class for handling the construction of a field. 
- **/ - class FieldHandle { - public: - typedef std::shared_ptr<FieldHandle> SP; - protected: - const Schema::Field & _sfield; - document::FieldValue::UP _value; - document::FieldValue::UP _element; - public: - FieldHandle(const document::Field & dfield, const Schema::Field & field); - virtual ~FieldHandle() {} - virtual void startElement(int32_t weight) { (void) weight; throw Error("Function not supported"); } - virtual void endElement() { throw Error("Function not supported"); } - virtual void addStr(const vespalib::string & val) { (void) val; throw Error("Function not supported"); } - - virtual void addSpace() { - throw Error("Function not supported"); - } - - virtual void addNoWordStr(const vespalib::string & val) { - (void) val; - throw Error("Function not supported"); - } - - virtual void addTokenizedString(const vespalib::string &val, bool urlMode) { - (void) val; - (void) urlMode; - throw Error("Function not supported"); - } - - virtual void addSpan(size_t start, size_t len) { - (void) start; - (void) len; - throw Error("Function not supported"); - } - - virtual void addSpan() { - throw Error("Function not supported"); - } - - virtual void addSpaceTokenAnnotation() { - throw Error("Function not supported"); - } - - virtual void addNumericTokenAnnotation() { - throw Error("Function not supported"); - } - - virtual void addAlphabeticTokenAnnotation() { - throw Error("Function not supported"); - } - - virtual void addTermAnnotation() { - throw Error("Function not supported"); - } - - virtual void addTermAnnotation(const vespalib::string &val) { - (void) val; - throw Error("Function not supported"); - } - - virtual void addInt(int64_t val) { (void) val; throw Error("Function not supported"); } - virtual void addFloat(double val) { (void) val; throw Error("Function not supported"); } - virtual void addPredicate(std::unique_ptr<vespalib::Slime>) { - throw Error("Function not supported"); - } - virtual void addTensor(std::unique_ptr<vespalib::eval::Value>) { - 
throw Error("Function not supported"); - } - const document::FieldValue::UP & getValue() const { return _value; } - const Schema::Field & getField() const { return _sfield; } - - virtual void onEndElement() {} - virtual void onEndField() {} - - virtual void setAutoAnnotate(bool autoAnnotate) { - (void) autoAnnotate; - throw Error("Function not supported"); - } - - virtual void setAutoSpace(bool autoSpace) { - (void) autoSpace; - throw Error("Function not supported"); - } - - virtual void addPosition(int32_t xpos, int32_t ypos) { - (void) xpos; - (void) ypos; - throw Error("Function not supported"); - } - - virtual void addRaw(const void *buf, size_t len) { - (void) buf; - (void) len; - throw Error("Function not supported"); - } - - virtual void startSubField(const vespalib::string &subField) { - (void) subField; - throw Error("Function not supported"); - } - - virtual void endSubField() { - throw Error("Function not supported"); - } - }; - - /** - * Class that can handle multi value fields. - **/ - class CollectionFieldHandle : public FieldHandle { - private: - int32_t _elementWeight; - public: - CollectionFieldHandle(const document::Field & dfield, const Schema::Field & sfield); - void startElement(int32_t weight) override; - void endElement() override; - }; - - /** - * Class for handling the construction of the content of an index field. 
- **/ - class IndexFieldHandle : public CollectionFieldHandle { - vespalib::string _str; // adjusted as word comes along - size_t _strSymbols; // symbols in string, assuming UTF8 - document::SpanList *_spanList; // owned by _spanTree - document::SpanTree::UP _spanTree; - const document::SpanNode *_lastSpan; - size_t _spanStart; // start of span - bool _autoAnnotate; // Add annotation when adding strings - bool _autoSpace; // Add space before strings - bool _skipAutoSpace; // one shot skip of adding space - bool _uriField; // URI handling (special struct case) - vespalib::string _subField; - const document::FixedTypeRepo & _repo; - - void append(const vespalib::string &val); - - public: - IndexFieldHandle(const document::FixedTypeRepo & repo, - const document::Field &dfield, - const Schema::Field &sfield); - - void addStr(const vespalib::string & val) override; - void addSpace() override; - void addNoWordStr(const vespalib::string & val) override; - void addTokenizedString(const vespalib::string &val, bool urlMode) override; - void addSpan(size_t start, size_t len) override; - void addSpan() override; - void addSpaceTokenAnnotation() override; - void addNumericTokenAnnotation() override; - void addAlphabeticTokenAnnotation() override; - void addTermAnnotation() override; - void addTermAnnotation(const vespalib::string &val) override; - void onEndElement() override; - void onEndField() override; - void startAnnotate(); - void setAutoAnnotate(bool autoAnnotate) override; - void setAutoSpace(bool autoSpace) override; - void startSubField(const vespalib::string &subField) override; - void endSubField() override; - }; - - /** - * Class for handling the construction of the content of an attribute field. 
- **/ - class AttributeFieldHandle : public CollectionFieldHandle { - public: - AttributeFieldHandle(const document::Field & dfield, const Schema::Field & sfield); - void addStr(const vespalib::string & val) override; - void addInt(int64_t val) override; - void addFloat(double val) override; - void addPredicate(std::unique_ptr<vespalib::Slime> val) override; - void addTensor(std::unique_ptr<vespalib::eval::Value> val) override; - void addPosition(int32_t xpos, int32_t ypos) override; - }; - - /** - * Class for handling the construction of a document (set of fields). - **/ - class DocumentHandle { - public: - typedef std::shared_ptr<DocumentHandle> SP; - private: - const document::DocumentType * _type; - document::Document *const _doc; - FieldHandle::SP _fieldHandle; - document::FixedTypeRepo _repo; - public: - DocumentHandle(document::Document &doc, const vespalib::string & docId); - ~DocumentHandle(); - const FieldHandle::SP & getFieldHandle() const { return _fieldHandle; } - void startIndexField(const Schema::Field & sfield); - void startAttributeField(const Schema::Field & sfield); - void endField(); - void endDocument(const document::Document::UP & doc) { - (void) doc; - } - }; - - const Schema & _schema; - document::config::DocumenttypesConfig _doctypes_config; - std::shared_ptr<const document::DocumentTypeRepo> _repo; - const document::DocumentType &_docType; - document::Document::UP _doc; // the document we are about to generate - - DocumentHandle::SP _handleDoc; // handle for all fields - DocumentHandle * _currDoc; // the current document handle - -public: - DocBuilder(const Schema & schema); - ~DocBuilder(); - - DocBuilder & startDocument(const vespalib::string & docId); - document::Document::UP endDocument(); - - DocBuilder & startIndexField(const vespalib::string & name); - DocBuilder & startAttributeField(const vespalib::string & name); - DocBuilder & endField(); - DocBuilder & startElement(int32_t weight = 1); - DocBuilder & endElement(); - DocBuilder 
& addStr(const vespalib::string & val); - DocBuilder & addSpace(); - DocBuilder & addNoWordStr(const vespalib::string & val); - DocBuilder & addInt(int64_t val); - DocBuilder & addFloat(double val); - DocBuilder & addPredicate(std::unique_ptr<vespalib::Slime> val); - DocBuilder & addTensor(std::unique_ptr<vespalib::eval::Value> val); - DocBuilder &addTokenizedString(const vespalib::string &val); - DocBuilder &addUrlTokenizedString(const vespalib::string &val); - DocBuilder &addSpan(size_t start, size_t len); - DocBuilder &addSpan(); - DocBuilder &addSpaceTokenAnnotation(); - DocBuilder &addNumericTokenAnnotation(); - DocBuilder &addAlphabeticTokenAnnotation(); - DocBuilder &addTermAnnotation(); - DocBuilder &addTermAnnotation(const vespalib::string &val); - DocBuilder &setAutoAnnotate(bool autoAnnotate); - DocBuilder &setAutoSpace(bool autoSpace); - DocBuilder &addPosition(int32_t xpos, int32_t ypos); - DocBuilder &addRaw(const void *buf, size_t len); - DocBuilder &startSubField(const vespalib::string &subField); - DocBuilder &endSubField(); - static bool hasAnnotations() { return true; } - - const document::DocumentType &getDocumentType() const { return _docType; } - const std::shared_ptr<const document::DocumentTypeRepo> &getDocumentTypeRepo() const { return _repo; } - document::config::DocumenttypesConfig getDocumenttypesConfig() const { return _doctypes_config; } -}; - -} diff --git a/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp b/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp deleted file mode 100644 index 5f655419471..00000000000 --- a/searchlib/src/vespa/searchlib/index/doctypebuilder.cpp +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
- -#include "doctypebuilder.h" -#include <vespa/document/datatype/urldatatype.h> -#include <vespa/document/datatype/tensor_data_type.h> -#include <vespa/document/datatype/documenttype.h> -#include <vespa/document/repo/configbuilder.h> -#include <set> - -using namespace document; - -namespace search::index { -namespace { - -DataType::Type convert(Schema::DataType type) { - switch (type) { - case schema::DataType::BOOL: - case schema::DataType::UINT2: - case schema::DataType::UINT4: - case schema::DataType::INT8: - return DataType::T_BYTE; - case schema::DataType::INT16: - return DataType::T_SHORT; - case schema::DataType::INT32: - return DataType::T_INT; - case schema::DataType::INT64: - return DataType::T_LONG; - case schema::DataType::FLOAT: - return DataType::T_FLOAT; - case schema::DataType::DOUBLE: - return DataType::T_DOUBLE; - case schema::DataType::STRING: - return DataType::T_STRING; - case schema::DataType::RAW: - return DataType::T_RAW; - case schema::DataType::BOOLEANTREE: - return DataType::T_PREDICATE; - case schema::DataType::TENSOR: - return DataType::T_TENSOR; - default: - break; - } - assert(!"Unknown datatype in schema"); - return DataType::MAX; -} - -void -insertStructType(document::config::DocumenttypesConfig::Documenttype & cfg, const StructDataType & structType) -{ - typedef document::config::DocumenttypesConfig DTC; - DTC::Documenttype::Datatype::Sstruct cfgStruct; - cfgStruct.name = structType.getName(); - Field::Set fieldSet = structType.getFieldSet(); - for (const Field * field : fieldSet) { - DTC::Documenttype::Datatype::Sstruct::Field sField; - sField.name = field->getName(); - sField.datatype = field->getDataType().getId(); - sField.id = field->getId(); - cfgStruct.field.push_back(sField); - } - cfg.datatype.push_back(DTC::Documenttype::Datatype()); - cfg.datatype.back().sstruct = cfgStruct; - cfg.datatype.back().id = structType.getId(); -} - -using namespace document::config_builder; - -TypeOrId makeCollection(TypeOrId datatype, 
Schema::CollectionType collection_type) { - switch (collection_type) { - case schema::CollectionType::ARRAY: - return Array(datatype); - case schema::CollectionType::WEIGHTEDSET: - // TODO: consider using array of struct<primitive,int32> to keep order - return Wset(datatype); - default: - return datatype; - } -} - -struct TypeCache { - std::map<std::pair<int, Schema::CollectionType>, TypeOrId> types; - - TypeOrId getType(TypeOrId datatype, Schema::CollectionType c_type) { - TypeOrId type = makeCollection(datatype, c_type); - std::pair<int, Schema::CollectionType> key = std::make_pair(datatype.id, c_type); - if (types.find(key) == types.end()) { - types.insert(std::make_pair(key, type)); - } - return types.find(key)->second; - } -}; - -} - -DocTypeBuilder::DocTypeBuilder(const Schema &schema) - : _schema(schema), - _iFields() -{ - _iFields.setup(schema); -} - -document::config::DocumenttypesConfig DocTypeBuilder::makeConfig() const { - using namespace document::config_builder; - TypeCache type_cache; - - typedef std::set<vespalib::string> UsedFields; - UsedFields usedFields; - - Struct header_struct("searchdocument.header"); - header_struct.setId(-1505212454); - - for (size_t i = 0; i < _iFields._textFields.size(); ++i) { - const Schema::IndexField &field = - _schema.getIndexField(_iFields._textFields[i]); - - // only handles string fields for now - assert(field.getDataType() == schema::DataType::STRING); - header_struct.addField(field.getName(), type_cache.getType( - DataType::T_STRING, field.getCollectionType())); - usedFields.insert(field.getName()); - } - - const int32_t uri_type = document::UrlDataType::getInstance().getId(); - for (size_t i = 0; i < _iFields._uriFields.size(); ++i) { - const Schema::IndexField &field = - _schema.getIndexField(_iFields._uriFields[i]._all); - - // only handles string fields for now - assert(field.getDataType() == schema::DataType::STRING); - header_struct.addField(field.getName(), type_cache.getType( - uri_type, 
field.getCollectionType())); - usedFields.insert(field.getName()); - } - - for (uint32_t i = 0; i < _schema.getNumAttributeFields(); ++i) { - const Schema::AttributeField &field = _schema.getAttributeField(i); - UsedFields::const_iterator usf = usedFields.find(field.getName()); - if (usf != usedFields.end()) { - continue; // taken as index field - } - auto type_id = convert(field.getDataType()); - if (type_id == DataType::T_TENSOR) { - header_struct.addTensorField(field.getName(), field.get_tensor_spec()); - } else { - header_struct.addField(field.getName(), type_cache.getType( - type_id, field.getCollectionType())); - } - usedFields.insert(field.getName()); - } - - DocumenttypesConfigBuilderHelper builder; - builder.document(-645763131, "searchdocument", - header_struct, Struct("searchdocument.body")); - return builder.config(); -} - -document::config::DocumenttypesConfig -DocTypeBuilder::makeConfig(const DocumentType &docType) -{ - typedef document::config::DocumenttypesConfigBuilder DTC; - DTC cfg; - { // document type - DTC::Documenttype dtype; - dtype.id = docType.getId(); - dtype.name = docType.getName(); - // TODO(vekterli): remove header/body config - dtype.headerstruct = docType.getFieldsType().getId(); - dtype.bodystruct = docType.getFieldsType().getId(); - cfg.documenttype.push_back(dtype); - } - insertStructType(cfg.documenttype[0], docType.getFieldsType()); - return cfg; -} - -} diff --git a/searchlib/src/vespa/searchlib/index/doctypebuilder.h b/searchlib/src/vespa/searchlib/index/doctypebuilder.h deleted file mode 100644 index 4db0ba5b0e3..00000000000 --- a/searchlib/src/vespa/searchlib/index/doctypebuilder.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
- -#pragma once - -#include "schema_index_fields.h" -#include <vespa/document/config/config-documenttypes.h> -#include <vespa/document/fieldvalue/fieldvalues.h> -#include <vespa/vespalib/util/exception.h> -#include <vespa/vespalib/util/stringfmt.h> - -namespace search::index { - -/** - * Builder for the indexingdocument document type based on an index schema. - **/ -class DocTypeBuilder { - const Schema &_schema; - SchemaIndexFields _iFields; - -public: - DocTypeBuilder(const Schema & schema); - document::config::DocumenttypesConfig makeConfig() const; - - static document::config::DocumenttypesConfig - makeConfig(const document::DocumentType &docType); -}; - -} diff --git a/searchlib/src/vespa/searchlib/index/empty_doc_builder.cpp b/searchlib/src/vespa/searchlib/index/empty_doc_builder.cpp index 6515d896917..fabe630432f 100644 --- a/searchlib/src/vespa/searchlib/index/empty_doc_builder.cpp +++ b/searchlib/src/vespa/searchlib/index/empty_doc_builder.cpp @@ -33,6 +33,11 @@ get_document_types_config(EmptyDocBuilder::AddFieldsType add_fields) } +EmptyDocBuilder::EmptyDocBuilder() + : EmptyDocBuilder([](auto&) noexcept {}) +{ +} + EmptyDocBuilder::EmptyDocBuilder(AddFieldsType add_fields) : _document_types_config(std::make_shared<const DocumenttypesConfig>(get_document_types_config(add_fields))), _repo(DocumentTypeRepoFactory::make(*_document_types_config)), diff --git a/searchlib/src/vespa/searchlib/index/empty_doc_builder.h b/searchlib/src/vespa/searchlib/index/empty_doc_builder.h index 7e734af4e95..18b6543bea1 100644 --- a/searchlib/src/vespa/searchlib/index/empty_doc_builder.h +++ b/searchlib/src/vespa/searchlib/index/empty_doc_builder.h @@ -28,6 +28,7 @@ class EmptyDocBuilder { const document::DocumentType* _document_type; public: using AddFieldsType = std::function<void(document::config_builder::Struct&)>; + EmptyDocBuilder(); explicit EmptyDocBuilder(AddFieldsType add_fields); ~EmptyDocBuilder(); const document::DocumentTypeRepo& get_repo() const noexcept { 
return *_repo; } diff --git a/searchlib/src/vespa/searchlib/index/string_field_builder.cpp b/searchlib/src/vespa/searchlib/index/string_field_builder.cpp new file mode 100644 index 00000000000..3212a021535 --- /dev/null +++ b/searchlib/src/vespa/searchlib/index/string_field_builder.cpp @@ -0,0 +1,140 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "string_field_builder.h" +#include "empty_doc_builder.h" +#include <vespa/document/annotation/annotation.h> +#include <vespa/document/annotation/span.h> +#include <vespa/document/annotation/spanlist.h> +#include <vespa/document/annotation/spantree.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/fastlib/text/unicodeutil.h> +#include <vespa/vespalib/text/utf8.h> + +#include <cassert> + +using document::Annotation; +using document::AnnotationType; +using document::FixedTypeRepo; +using document::StringFieldValue; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTree; +using vespalib::Utf8Reader; +using vespalib::Utf8Writer; + +namespace search::index { + +namespace { + +const vespalib::string SPANTREE_NAME("linguistics"); + +} + +StringFieldBuilder::StringFieldBuilder(const EmptyDocBuilder& empty_doc_builder) + : _value(), + _span_start(0u), + _span_list(nullptr), + _span_tree(), + _last_span(nullptr), + _url_mode(false), + _repo(empty_doc_builder.get_repo(), empty_doc_builder.get_document_type()) +{ +} + +StringFieldBuilder::~StringFieldBuilder() = default; + +void +StringFieldBuilder::start_annotate() +{ + auto span_list_up = std::make_unique<SpanList>(); + _span_list = span_list_up.get(); + _span_tree = std::make_unique<SpanTree>(SPANTREE_NAME, std::move(span_list_up)); +} + +void +StringFieldBuilder::add_span() +{ + assert(_value.size() > _span_start); + const SpanNode &span = _span_list->add(std::make_unique<Span>(_span_start, _value.size() - _span_start)); + _last_span = 
&span; + _span_start = _value.size(); +} + +StringFieldBuilder& +StringFieldBuilder::token(const vespalib::string& val, bool is_word) +{ + if (val.empty()) { + return *this; + } + if (!_span_tree) { + start_annotate(); + } + _span_start = _value.size(); + _value.append(val); + add_span(); + if (is_word) { + _span_tree->annotate(*_last_span, *AnnotationType::TERM); + } + return *this; +} + +StringFieldBuilder& +StringFieldBuilder::alt_word(const vespalib::string& val) +{ + assert(_last_span != nullptr); + _span_tree->annotate(*_last_span, + Annotation(*AnnotationType::TERM, + std::make_unique<StringFieldValue>(val))); + return *this; +} + +StringFieldBuilder& +StringFieldBuilder::tokenize(const vespalib::string& val) +{ + Utf8Reader reader(val); + vespalib::string token_buffer; + Utf8Writer writer(token_buffer); + uint32_t c = 0u; + bool old_word = false; + + while (reader.hasMore()) { + c = reader.getChar(); + bool new_word = Fast_UnicodeUtil::IsWordChar(c) || + (_url_mode && (c == '-' || c == '_')); + if (old_word != new_word) { + if (!token_buffer.empty()) { + token(token_buffer, old_word); + token_buffer.clear(); + } + old_word = new_word; + } + writer.putChar(c); + } + if (!token_buffer.empty()) { + token(token_buffer, old_word); + } + return *this; +} + + +document::StringFieldValue +StringFieldBuilder::build() +{ + StringFieldValue value(_value); + // Also drop all spans no annotation for now + if (_span_tree && _span_tree->numAnnotations() > 0u) { + StringFieldValue::SpanTrees trees; + trees.emplace_back(std::move(_span_tree)); + value.setSpanTrees(trees, _repo); + } else { + _span_tree.reset(); + } + _span_list = nullptr; + _last_span = nullptr; + _span_start = 0u; + _value.clear(); + return value; +} + +} diff --git a/searchlib/src/vespa/searchlib/index/string_field_builder.h b/searchlib/src/vespa/searchlib/index/string_field_builder.h new file mode 100644 index 00000000000..1987cbbcf74 --- /dev/null +++ 
b/searchlib/src/vespa/searchlib/index/string_field_builder.h @@ -0,0 +1,45 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <vespa/document/repo/fixedtyperepo.h> +#include <memory> + +namespace document { +class SpanList; +struct SpanNode; +class SpanTree; +class StringFieldValue; +} + +namespace search::index { + +class EmptyDocBuilder; + +/* + * Helper class to build annotated string field. + */ +class StringFieldBuilder { + vespalib::string _value; + size_t _span_start; + document::SpanList* _span_list; // owned by _span_tree + std::unique_ptr<document::SpanTree> _span_tree; + const document::SpanNode* _last_span; + bool _url_mode; + const document::FixedTypeRepo _repo; + void start_annotate(); + void add_span(); +public: + StringFieldBuilder(const EmptyDocBuilder& empty_doc_builder); + ~StringFieldBuilder(); + StringFieldBuilder& url_mode(bool url_mode_) noexcept { _url_mode = url_mode_; return *this; } + StringFieldBuilder& token(const vespalib::string& val, bool is_word); + StringFieldBuilder& word(const vespalib::string& val) { return token(val, true); } + StringFieldBuilder& space() { return token(" ", false); } + StringFieldBuilder& tokenize(const vespalib::string& val); + StringFieldBuilder& alt_word(const vespalib::string& val); + document::StringFieldValue build(); +}; + +} |