diff options
Diffstat (limited to 'searchlib')
60 files changed, 4277 insertions, 12 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 412d00a1c6a..7c67508a196 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -4,7 +4,6 @@ vespa_define_module( fastos vespalog vespalib - staging_vespalib vespaeval fnet configdefinitions @@ -12,7 +11,6 @@ vespa_define_module( fastlib_fast document config_cloudconfig - searchcommon EXTERNAL_DEPENDS ${VESPA_GLIBC_RT_LIB} @@ -53,6 +51,8 @@ vespa_define_module( src/vespa/searchlib/transactionlog src/vespa/searchlib/uca src/vespa/searchlib/util + src/vespa/searchcommon/attribute + src/vespa/searchcommon/common APPS src/apps/docstore @@ -213,6 +213,8 @@ vespa_define_module( src/tests/rankingexpression/intrinsic_blueprint_adapter src/tests/ranksetup src/tests/ranksetup/verify_feature + src/tests/searchcommon/attribute/config + src/tests/searchcommon/schema src/tests/sort src/tests/sortresults src/tests/sortspec diff --git a/searchlib/src/tests/searchcommon/.gitignore b/searchlib/src/tests/searchcommon/.gitignore new file mode 100644 index 00000000000..a3e9c375723 --- /dev/null +++ b/searchlib/src/tests/searchcommon/.gitignore @@ -0,0 +1,3 @@ +.depend +Makefile +*_test diff --git a/searchlib/src/tests/searchcommon/attribute/config/.gitignore b/searchlib/src/tests/searchcommon/attribute/config/.gitignore new file mode 100644 index 00000000000..ffdb7b1e933 --- /dev/null +++ b/searchlib/src/tests/searchcommon/attribute/config/.gitignore @@ -0,0 +1 @@ +searchcommon_attribute_config_test_app diff --git a/searchlib/src/tests/searchcommon/attribute/config/CMakeLists.txt b/searchlib/src/tests/searchcommon/attribute/config/CMakeLists.txt new file mode 100644 index 00000000000..f61138c5d73 --- /dev/null +++ b/searchlib/src/tests/searchcommon/attribute/config/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(searchcommon_attribute_config_test_app TEST + SOURCES + attribute_config_test.cpp + DEPENDS + searchlib +) +vespa_add_test(NAME searchcommon_attribute_config_test_app NO_VALGRIND COMMAND searchcommon_attribute_config_test_app) diff --git a/searchlib/src/tests/searchcommon/attribute/config/attribute_config_test.cpp b/searchlib/src/tests/searchcommon/attribute/config/attribute_config_test.cpp new file mode 100644 index 00000000000..918e14546e6 --- /dev/null +++ b/searchlib/src/tests/searchcommon/attribute/config/attribute_config_test.cpp @@ -0,0 +1,142 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/searchcommon/attribute/config.h> + +using search::attribute::Config; +using search::attribute::BasicType; +using search::attribute::CollectionType; +using vespalib::eval::ValueType; +using search::GrowStrategy; +using search::DictionaryConfig; + + +struct Fixture +{ + Config _config; + Fixture() + : _config() + { } + + Fixture(BasicType bt, + CollectionType ct = CollectionType::SINGLE, + bool fastSearch_ = false, + bool huge_ = false) + : _config(bt, ct, fastSearch_, huge_) + { } +}; + +TEST_F("test default attribute config", Fixture) +{ + EXPECT_EQUAL(BasicType::Type::NONE, f._config.basicType().type()); + EXPECT_EQUAL(CollectionType::Type::SINGLE, + f._config.collectionType().type()); + EXPECT_TRUE(!f._config.fastSearch()); + EXPECT_TRUE(!f._config.huge()); + EXPECT_TRUE(!f._config.getEnableBitVectors()); + EXPECT_TRUE(!f._config.getEnableOnlyBitVector()); + EXPECT_TRUE(!f._config.getIsFilter()); + EXPECT_TRUE(!f._config.fastAccess()); + EXPECT_TRUE(f._config.tensorType().is_error()); +} + +TEST_F("test integer weightedset attribute config", + Fixture(BasicType::Type::INT32, + CollectionType::Type::WSET)) +{ + EXPECT_EQUAL(BasicType::Type::INT32, f._config.basicType().type()); + EXPECT_EQUAL(CollectionType::Type::WSET, + 
f._config.collectionType().type()); + EXPECT_TRUE(!f._config.fastSearch()); + EXPECT_TRUE(!f._config.huge()); + EXPECT_TRUE(!f._config.getEnableBitVectors()); + EXPECT_TRUE(!f._config.getEnableOnlyBitVector()); + EXPECT_TRUE(!f._config.getIsFilter()); + EXPECT_TRUE(!f._config.fastAccess()); + EXPECT_TRUE(f._config.tensorType().is_error()); +} + + +TEST("test operator== on attribute config") +{ + Config cfg1(BasicType::Type::INT32, CollectionType::Type::WSET); + Config cfg2(BasicType::Type::INT32, CollectionType::Type::ARRAY); + Config cfg3(BasicType::Type::INT32, CollectionType::Type::WSET); + + EXPECT_TRUE(cfg1 != cfg2); + EXPECT_TRUE(cfg2 != cfg3); + EXPECT_TRUE(cfg1 == cfg3); +} + + +TEST("test operator== on attribute config for tensor type") +{ + Config cfg1(BasicType::Type::TENSOR); + Config cfg2(BasicType::Type::TENSOR); + Config cfg3(BasicType::Type::TENSOR); + + ValueType dense_x = ValueType::from_spec("tensor(x[10])"); + ValueType sparse_x = ValueType::from_spec("tensor(x{})"); + + EXPECT_TRUE(cfg1 == cfg2); + EXPECT_TRUE(cfg2 == cfg3); + EXPECT_TRUE(cfg1 == cfg3); + + cfg1.setTensorType(dense_x); + cfg3.setTensorType(dense_x); + EXPECT_EQUAL(dense_x, cfg1.tensorType()); + EXPECT_EQUAL(dense_x, cfg3.tensorType()); + EXPECT_TRUE(!cfg1.tensorType().is_error()); + EXPECT_TRUE(cfg2.tensorType().is_error()); + EXPECT_TRUE(!cfg3.tensorType().is_error()); + + EXPECT_TRUE(cfg1 != cfg2); + EXPECT_TRUE(cfg2 != cfg3); + EXPECT_TRUE(cfg1 == cfg3); + + cfg3.setTensorType(sparse_x); + EXPECT_EQUAL(sparse_x, cfg3.tensorType()); + EXPECT_TRUE(!cfg3.tensorType().is_error()); + EXPECT_TRUE(cfg1 != cfg3); +} + +TEST("Test GrowStrategy consistency") { + GrowStrategy g(1024, 0.5, 17, 0.4f); + EXPECT_EQUAL(1024u, g.getDocsInitialCapacity()); + EXPECT_EQUAL(50u, g.getDocsGrowPercent()); + EXPECT_EQUAL(0.5, g.getDocsGrowFactor()); + EXPECT_EQUAL(17u, g.getDocsGrowDelta()); + EXPECT_EQUAL(0.4f, g.getMultiValueAllocGrowFactor()); +} + +TEST("DictionaryConfig") { + using Type = 
DictionaryConfig::Type; + using Match = DictionaryConfig::Match; + EXPECT_EQUAL(Type::BTREE, DictionaryConfig().getType()); + EXPECT_EQUAL(Match::UNCASED, DictionaryConfig().getMatch()); + + EXPECT_EQUAL(Type::BTREE, DictionaryConfig(Type::BTREE).getType()); + EXPECT_EQUAL(Match::UNCASED, DictionaryConfig(Type::BTREE).getMatch()); + EXPECT_EQUAL(Match::UNCASED, DictionaryConfig(Type::BTREE, Match::UNCASED).getMatch()); + EXPECT_EQUAL(Match::CASED, DictionaryConfig(Type::BTREE, Match::CASED).getMatch()); + + EXPECT_EQUAL(Type::HASH, DictionaryConfig(Type::HASH).getType()); + EXPECT_EQUAL(Type::BTREE_AND_HASH, DictionaryConfig(Type::BTREE_AND_HASH).getType()); + + EXPECT_EQUAL(DictionaryConfig(Type::BTREE), DictionaryConfig(Type::BTREE)); + EXPECT_EQUAL(DictionaryConfig(Type::HASH), DictionaryConfig(Type::HASH)); + EXPECT_EQUAL(DictionaryConfig(Type::BTREE_AND_HASH), DictionaryConfig(Type::BTREE_AND_HASH)); + EXPECT_NOT_EQUAL(DictionaryConfig(Type::HASH), DictionaryConfig(Type::BTREE)); + EXPECT_NOT_EQUAL(DictionaryConfig(Type::BTREE), DictionaryConfig(Type::HASH)); + EXPECT_TRUE(Config().set_dictionary_config(DictionaryConfig(Type::HASH)) == + Config().set_dictionary_config(DictionaryConfig(Type::HASH))); + EXPECT_FALSE(Config().set_dictionary_config(DictionaryConfig(Type::HASH)) == + Config().set_dictionary_config(DictionaryConfig(Type::BTREE))); + EXPECT_FALSE(Config().set_dictionary_config(DictionaryConfig(Type::HASH)) != + Config().set_dictionary_config(DictionaryConfig(Type::HASH))); + EXPECT_TRUE(Config().set_dictionary_config(DictionaryConfig(Type::HASH)) != + Config().set_dictionary_config(DictionaryConfig(Type::BTREE))); +} + + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/searchcommon/schema/.gitignore b/searchlib/src/tests/searchcommon/schema/.gitignore new file mode 100644 index 00000000000..e000f0ca2c8 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/.gitignore @@ -0,0 +1,9 @@ +/.depend +/Makefile +/schema_test 
+searchcommon_schema_test_app +/schema-no-imported-fields.txt +/schema-with-timestamps.txt +/schema-without-timestamps.txt +/schema.txt +/schema2.txt diff --git a/searchlib/src/tests/searchcommon/schema/CMakeLists.txt b/searchlib/src/tests/searchcommon/schema/CMakeLists.txt new file mode 100644 index 00000000000..2304c319dea --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchcommon_schema_test_app TEST + SOURCES + schema_test.cpp + DEPENDS + searchlib + GTest::GTest +) +vespa_add_test(NAME searchcommon_schema_test_app NO_VALGRIND COMMAND searchcommon_schema_test_app) diff --git a/searchlib/src/tests/searchcommon/schema/imported-fields-cfg/attributes.cfg b/searchlib/src/tests/searchcommon/schema/imported-fields-cfg/attributes.cfg new file mode 100644 index 00000000000..9a08f7e2324 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/imported-fields-cfg/attributes.cfg @@ -0,0 +1,12 @@ +attribute[3] +attribute[0].name imported_a +attribute[0].imported true +attribute[0].datatype INT32 +attribute[0].collectiontype SINGLE +attribute[1].name imported_b +attribute[1].imported true +attribute[1].datatype STRING +attribute[1].collectiontype ARRAY +attribute[2].name regular +attribute[2].datatype INT32 +attribute[2].collectiontype SINGLE diff --git a/searchlib/src/tests/searchcommon/schema/load-save-cfg/attributes.cfg b/searchlib/src/tests/searchcommon/schema/load-save-cfg/attributes.cfg new file mode 100644 index 00000000000..09f711b6a65 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/load-save-cfg/attributes.cfg @@ -0,0 +1,22 @@ +attribute[9] +attribute[0].name a +attribute[0].datatype STRING +attribute[0].collectiontype SINGLE +attribute[1].name b +attribute[1].datatype INT8 +attribute[1].collectiontype ARRAY +attribute[2].name c +attribute[2].datatype INT16 
+attribute[2].collectiontype WEIGHTEDSET +attribute[3].name d +attribute[3].datatype INT32 +attribute[4].name e +attribute[4].datatype INT64 +attribute[5].name f +attribute[5].datatype FLOAT +attribute[6].name g +attribute[6].datatype DOUBLE +attribute[7].name h +attribute[7].datatype PREDICATE +attribute[8].name i +attribute[8].datatype TENSOR diff --git a/searchlib/src/tests/searchcommon/schema/load-save-cfg/indexschema.cfg b/searchlib/src/tests/searchcommon/schema/load-save-cfg/indexschema.cfg new file mode 100644 index 00000000000..b9d82b9b569 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/load-save-cfg/indexschema.cfg @@ -0,0 +1,13 @@ +indexfield[6] +indexfield[0].name a +indexfield[0].datatype STRING +indexfield[1].name b +indexfield[1].datatype INT64 +indexfield[2].name c +indexfield[2].datatype STRING +indexfield[2].interleavedfeatures true +fieldset[1] +fieldset[0].name default +fieldset[0].field[2] +fieldset[0].field[0].name a +fieldset[0].field[1].name c diff --git a/searchlib/src/tests/searchcommon/schema/load-save-cfg/summary.cfg b/searchlib/src/tests/searchcommon/schema/load-save-cfg/summary.cfg new file mode 100644 index 00000000000..0c2de33d076 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/load-save-cfg/summary.cfg @@ -0,0 +1,29 @@ +defaultsummaryid 0 +classes[1] +classes[0].id 0 +classes[0].name test +classes[0].fields[12] +classes[0].fields[0].name a +classes[0].fields[0].type byte +classes[0].fields[1].name b +classes[0].fields[1].type short +classes[0].fields[2].name c +classes[0].fields[2].type integer +classes[0].fields[3].name d +classes[0].fields[3].type int64 +classes[0].fields[4].name e +classes[0].fields[4].type float +classes[0].fields[5].name f +classes[0].fields[5].type double +classes[0].fields[6].name g +classes[0].fields[6].type string +classes[0].fields[7].name h +classes[0].fields[7].type longstring +classes[0].fields[8].name i +classes[0].fields[8].type xmlstring +classes[0].fields[9].name j 
+classes[0].fields[9].type jsonstring +classes[0].fields[10].name k +classes[0].fields[10].type data +classes[0].fields[11].name l +classes[0].fields[11].type longdata diff --git a/searchlib/src/tests/searchcommon/schema/schema-without-index-field-properties.txt b/searchlib/src/tests/searchcommon/schema/schema-without-index-field-properties.txt new file mode 100644 index 00000000000..4491b1242e0 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/schema-without-index-field-properties.txt @@ -0,0 +1,7 @@ +attributefield[0] +summaryfield[0] +fieldset[0] +indexfield[1] +indexfield[0].name foo +indexfield[0].datatype STRING +indexfield[0].collectiontype SINGLE diff --git a/searchlib/src/tests/searchcommon/schema/schema_test.cpp b/searchlib/src/tests/searchcommon/schema/schema_test.cpp new file mode 100644 index 00000000000..09a7359bac7 --- /dev/null +++ b/searchlib/src/tests/searchcommon/schema/schema_test.cpp @@ -0,0 +1,396 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/config/common/configparser.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchcommon/common/schemaconfigurer.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <vespa/vespalib/stllike/string.h> +#include <fstream> + +#include <vespa/log/log.h> +LOG_SETUP("schema_test"); + +using vespalib::string; + +namespace search::index { + +using schema::DataType; +using schema::CollectionType; +using SIAF = Schema::ImportedAttributeField; +using SIF = Schema::IndexField; + +void +assertField(const Schema::Field& exp, const Schema::Field& act) +{ + EXPECT_EQ(exp.getName(), act.getName()); + EXPECT_EQ(exp.getDataType(), act.getDataType()); + EXPECT_EQ(exp.getCollectionType(), act.getCollectionType()); +} + +void +assertIndexField(const Schema::IndexField& exp, + const Schema::IndexField& act) +{ + assertField(exp, act); + EXPECT_EQ(exp.getAvgElemLen(), act.getAvgElemLen()); + EXPECT_EQ(exp.use_interleaved_features(), act.use_interleaved_features()); +} + +void +assertSet(const Schema::FieldSet& exp, + const Schema::FieldSet& act) +{ + EXPECT_EQ(exp.getName(), act.getName()); + ASSERT_EQ(exp.getFields().size(), act.getFields().size()); + for (size_t i = 0; i < exp.getFields().size(); ++i) { + EXPECT_EQ(exp.getFields()[i], act.getFields()[i]); + } +} + +void +assertSchema(const Schema& exp, const Schema& act) +{ + ASSERT_EQ(exp.getNumIndexFields(), act.getNumIndexFields()); + for (size_t i = 0; i < exp.getNumIndexFields(); ++i) { + assertIndexField(exp.getIndexField(i), act.getIndexField(i)); + } + ASSERT_EQ(exp.getNumAttributeFields(), act.getNumAttributeFields()); + for (size_t i = 0; i < exp.getNumAttributeFields(); ++i) { + assertField(exp.getAttributeField(i), act.getAttributeField(i)); + } + ASSERT_EQ(exp.getNumSummaryFields(), act.getNumSummaryFields()); + for (size_t i = 0; i < exp.getNumSummaryFields(); ++i) { + assertField(exp.getSummaryField(i), act.getSummaryField(i)); + } + ASSERT_EQ(exp.getNumFieldSets(), 
act.getNumFieldSets()); + for (size_t i = 0; i < exp.getNumFieldSets(); ++i) { + assertSet(exp.getFieldSet(i), act.getFieldSet(i)); + } + const auto &expImported = exp.getImportedAttributeFields(); + const auto &actImported = act.getImportedAttributeFields(); + ASSERT_EQ(expImported.size(), actImported.size()); + for (size_t i = 0; i < expImported.size(); ++i) { + assertField(expImported[i], actImported[i]); + } +} + +TEST(SchemaTest, test_basic) +{ + Schema s; + EXPECT_EQ(0u, s.getNumIndexFields()); + EXPECT_EQ(0u, s.getNumAttributeFields()); + EXPECT_EQ(0u, s.getNumSummaryFields()); + EXPECT_EQ(0u, s.getNumImportedAttributeFields()); + + s.addIndexField(Schema::IndexField("foo", DataType::STRING)); + s.addIndexField(Schema::IndexField("bar", DataType::INT32)); + + s.addAttributeField(Schema::AttributeField("foo", DataType::STRING, CollectionType::ARRAY)); + s.addAttributeField(Schema::AttributeField("bar", DataType::INT32, CollectionType::WEIGHTEDSET)); + s.addAttributeField(Schema::AttributeField("cox", DataType::STRING)); + + s.addSummaryField(Schema::SummaryField("foo", DataType::STRING, CollectionType::ARRAY)); + s.addSummaryField(Schema::SummaryField("bar", DataType::INT32, CollectionType::WEIGHTEDSET)); + s.addSummaryField(Schema::SummaryField("cox", DataType::STRING)); + s.addSummaryField(Schema::SummaryField("fox", DataType::RAW)); + + s.addFieldSet(Schema::FieldSet("default").addField("foo").addField("bar")); + + s.addImportedAttributeField(SIAF("imported", DataType::INT32)); + + ASSERT_EQ(2u, s.getNumIndexFields()); + { + EXPECT_EQ("foo", s.getIndexField(0).getName()); + EXPECT_EQ(DataType::STRING, s.getIndexField(0).getDataType()); + EXPECT_EQ(CollectionType::SINGLE, s.getIndexField(0).getCollectionType()); + + EXPECT_EQ("bar", s.getIndexField(1).getName()); + EXPECT_EQ(DataType::INT32, s.getIndexField(1).getDataType()); + EXPECT_EQ(CollectionType::SINGLE, s.getIndexField(1).getCollectionType()); + + EXPECT_EQ(0u, s.getIndexFieldId("foo")); + 
EXPECT_EQ(1u, s.getIndexFieldId("bar")); + EXPECT_EQ(Schema::UNKNOWN_FIELD_ID, s.getIndexFieldId("cox")); + } + ASSERT_EQ(3u, s.getNumAttributeFields()); + { + EXPECT_EQ("foo", s.getAttributeField(0).getName()); + EXPECT_EQ(DataType::STRING, s.getAttributeField(0).getDataType()); + EXPECT_EQ(CollectionType::ARRAY, s.getAttributeField(0).getCollectionType()); + + EXPECT_EQ("bar", s.getAttributeField(1).getName()); + EXPECT_EQ(DataType::INT32, s.getAttributeField(1).getDataType()); + EXPECT_EQ(CollectionType::WEIGHTEDSET, s.getAttributeField(1).getCollectionType()); + + EXPECT_EQ("cox", s.getAttributeField(2).getName()); + EXPECT_EQ(DataType::STRING, s.getAttributeField(2).getDataType()); + EXPECT_EQ(CollectionType::SINGLE, s.getAttributeField(2).getCollectionType()); + + EXPECT_EQ(0u, s.getAttributeFieldId("foo")); + EXPECT_EQ(1u, s.getAttributeFieldId("bar")); + EXPECT_EQ(2u, s.getAttributeFieldId("cox")); + // "fox" is only added as a summary field, so the attribute lookup must miss. + EXPECT_EQ(Schema::UNKNOWN_FIELD_ID, s.getAttributeFieldId("fox")); + } + ASSERT_EQ(4u, s.getNumSummaryFields()); + { + EXPECT_EQ("foo", s.getSummaryField(0).getName()); + EXPECT_EQ(DataType::STRING, s.getSummaryField(0).getDataType()); + EXPECT_EQ(CollectionType::ARRAY, s.getSummaryField(0).getCollectionType()); + + EXPECT_EQ("bar", s.getSummaryField(1).getName()); + EXPECT_EQ(DataType::INT32, s.getSummaryField(1).getDataType()); + EXPECT_EQ(CollectionType::WEIGHTEDSET, s.getSummaryField(1).getCollectionType()); + + EXPECT_EQ("cox", s.getSummaryField(2).getName()); + EXPECT_EQ(DataType::STRING, s.getSummaryField(2).getDataType()); + EXPECT_EQ(CollectionType::SINGLE, s.getSummaryField(2).getCollectionType()); + + EXPECT_EQ("fox", s.getSummaryField(3).getName()); + EXPECT_EQ(DataType::RAW, s.getSummaryField(3).getDataType()); + EXPECT_EQ(CollectionType::SINGLE, s.getSummaryField(3).getCollectionType()); + + EXPECT_EQ(0u, s.getSummaryFieldId("foo")); + EXPECT_EQ(1u, s.getSummaryFieldId("bar")); + EXPECT_EQ(2u, s.getSummaryFieldId("cox")); + EXPECT_EQ(3u, 
s.getSummaryFieldId("fox")); + EXPECT_EQ(Schema::UNKNOWN_FIELD_ID, s.getSummaryFieldId("not")); + } + ASSERT_EQ(1u, s.getNumFieldSets()); + { + EXPECT_EQ("default", s.getFieldSet(0).getName()); + EXPECT_EQ(2u, s.getFieldSet(0).getFields().size()); + EXPECT_EQ("foo", s.getFieldSet(0).getFields()[0]); + EXPECT_EQ("bar", s.getFieldSet(0).getFields()[1]); + } + EXPECT_EQ(1u, s.getNumImportedAttributeFields()); + { + const auto &imported = s.getImportedAttributeFields(); + EXPECT_EQ(1u, imported.size()); + assertField(SIAF("imported", DataType::INT32, CollectionType::SINGLE), imported[0]); + } +} + +TEST(SchemaTest, test_load_and_save) +{ + using SAF = Schema::AttributeField; + using SSF = Schema::SummaryField; + using SDT = schema::DataType; + using SCT = schema::CollectionType; + using SFS = Schema::FieldSet; + + { // load from config -> save to file -> load from file + Schema s; + SchemaConfigurer configurer(s, "dir:load-save-cfg"); + EXPECT_EQ(3u, s.getNumIndexFields()); + assertIndexField(SIF("a", SDT::STRING), s.getIndexField(0)); + assertIndexField(SIF("b", SDT::INT64), s.getIndexField(1)); + assertIndexField(SIF("c", SDT::STRING).set_interleaved_features(true), s.getIndexField(2)); + + EXPECT_EQ(9u, s.getNumAttributeFields()); + assertField(SAF("a", SDT::STRING, SCT::SINGLE), + s.getAttributeField(0)); + assertField(SAF("b", SDT::INT8, SCT::ARRAY), s.getAttributeField(1)); + assertField(SAF("c", SDT::INT16, SCT::WEIGHTEDSET), + s.getAttributeField(2)); + assertField(SAF("d", SDT::INT32), s.getAttributeField(3)); + assertField(SAF("e", SDT::INT64), s.getAttributeField(4)); + assertField(SAF("f", SDT::FLOAT), s.getAttributeField(5)); + assertField(SAF("g", SDT::DOUBLE), s.getAttributeField(6)); + assertField(SAF("h", SDT::BOOLEANTREE), s.getAttributeField(7)); + assertField(SAF("i", SDT::TENSOR), s.getAttributeField(8)); + + EXPECT_EQ(12u, s.getNumSummaryFields()); + assertField(SSF("a", SDT::INT8), s.getSummaryField(0)); + assertField(SSF("b", SDT::INT16), 
s.getSummaryField(1)); + assertField(SSF("c", SDT::INT32), s.getSummaryField(2)); + assertField(SSF("d", SDT::INT64), s.getSummaryField(3)); + assertField(SSF("e", SDT::FLOAT), s.getSummaryField(4)); + assertField(SSF("f", SDT::DOUBLE), s.getSummaryField(5)); + assertField(SSF("g", SDT::STRING), s.getSummaryField(6)); + assertField(SSF("h", SDT::STRING), s.getSummaryField(7)); + assertField(SSF("i", SDT::STRING), s.getSummaryField(8)); + assertField(SSF("j", SDT::STRING), s.getSummaryField(9)); + assertField(SSF("k", SDT::RAW), s.getSummaryField(10)); + assertField(SSF("l", SDT::RAW), s.getSummaryField(11)); + + EXPECT_EQ(1u, s.getNumFieldSets()); + assertSet(SFS("default").addField("a").addField("c"), + s.getFieldSet(0)); + + Schema s2 = s; + EXPECT_TRUE(s.saveToFile("schema.txt")); + assertSchema(s, s2); // test copy constructor + Schema s3; + EXPECT_TRUE(s3.loadFromFile("schema.txt")); + assertSchema(s, s3); // test that saved file is loaded correctly + s3.addIndexField(SIF("foo", SDT::STRING)); + s3.addImportedAttributeField(SIAF("imported", DataType::INT32)); + EXPECT_TRUE(s3.loadFromFile("schema.txt")); // load should clear the current content + assertSchema(s, s3); + } + { // empty schema + Schema s; + EXPECT_TRUE(s.saveToFile("schema2.txt")); + Schema s2; + s2.addIndexField(SIF("foo", SDT::STRING)); + s2.addImportedAttributeField(SIAF("imported", DataType::INT32)); + EXPECT_TRUE(s2.loadFromFile("schema2.txt")); + assertSchema(s, s2); + } + { // load with error + Schema s; + EXPECT_TRUE(!s.loadFromFile("not.txt")); + EXPECT_TRUE(!s.saveToFile("not/not.txt")); + } +} + +void +addAllFieldTypes(const string& name, Schema& schema) +{ + Schema::IndexField index_field(name, DataType::STRING); + schema.addIndexField(index_field); + + Schema::AttributeField attribute_field(name, DataType::STRING); + schema.addAttributeField(attribute_field); + + Schema::SummaryField summary_field(name, DataType::STRING); + schema.addSummaryField(summary_field); + + 
schema.addFieldSet(Schema::FieldSet(name)); +} + +TEST(SchemaTest, require_that_schemas_can_be_added) +{ + const string name1 = "foo"; + const string name2 = "bar"; + Schema s1; + addAllFieldTypes(name1, s1); + Schema s2; + addAllFieldTypes(name2, s2); + + Schema::UP sum = Schema::make_union(s1, s2); + ASSERT_EQ(2u, sum->getNumIndexFields()); + EXPECT_TRUE(s1.getIndexField(0) == + sum->getIndexField(sum->getIndexFieldId(name1))); + EXPECT_TRUE(s2.getIndexField(0) == + sum->getIndexField(sum->getIndexFieldId(name2))); + ASSERT_EQ(2u, sum->getNumAttributeFields()); + EXPECT_TRUE(s1.getAttributeField(0) == + sum->getAttributeField(sum->getAttributeFieldId(name1))); + EXPECT_TRUE(s2.getAttributeField(0) == + sum->getAttributeField(sum->getAttributeFieldId(name2))); + ASSERT_EQ(2u, sum->getNumSummaryFields()); + EXPECT_TRUE(s1.getSummaryField(0) == + sum->getSummaryField(sum->getSummaryFieldId(name1))); + EXPECT_TRUE(s2.getSummaryField(0) == + sum->getSummaryField(sum->getSummaryFieldId(name2))); + ASSERT_EQ(2u, sum->getNumFieldSets()); + EXPECT_TRUE(s1.getFieldSet(0) == + sum->getFieldSet(sum->getFieldSetId(name1))); + EXPECT_TRUE(s2.getFieldSet(0) == + sum->getFieldSet(sum->getFieldSetId(name2))); +} + +TEST(SchemaTest, require_that_S_union_S_equals_S_for_schema_S) +{ + Schema schema; + addAllFieldTypes("foo", schema); + + Schema::UP sum = Schema::make_union(schema, schema); + EXPECT_TRUE(schema == *sum); +} + +TEST(SchemaTest, require_that_schema_can_calculate_set_difference) +{ + const string name1 = "foo"; + const string name2 = "bar"; + Schema s1; + addAllFieldTypes(name1, s1); + addAllFieldTypes(name2, s1); + Schema s2; + addAllFieldTypes(name2, s2); + + Schema::UP schema = Schema::set_difference(s1, s2); + + Schema expected; + addAllFieldTypes(name1, expected); + EXPECT_TRUE(expected == *schema); +} + +TEST(SchemaTest, require_that_schema_can_calculate_intersection) +{ + const string name1 = "foo"; + const string name2 = "bar"; + const string name3 = "baz"; + 
Schema s1; + addAllFieldTypes(name1, s1); + addAllFieldTypes(name2, s1); + Schema s2; + addAllFieldTypes(name2, s2); + addAllFieldTypes(name3, s2); + + Schema::UP schema = Schema::intersect(s1, s2); + + Schema expected; + addAllFieldTypes(name2, expected); + EXPECT_TRUE(expected == *schema); +} + +TEST(SchemaTest, require_that_incompatible_fields_are_removed_from_intersection) +{ + const string name = "foo"; + Schema s1; + s1.addIndexField(Schema::IndexField(name, DataType::STRING)); + Schema s2; + s2.addIndexField(Schema::IndexField(name, DataType::INT32)); + Schema::UP schema = Schema::intersect(s1, s2); + EXPECT_EQ(0u, schema->getNumIndexFields()); + EXPECT_FALSE(schema->isIndexField(name)); +} + +TEST(SchemaTest, require_that_imported_attribute_fields_are_not_saved_to_disk) +{ + const vespalib::string fileName = "schema-no-imported-fields.txt"; + { + Schema s; + s.addImportedAttributeField(Schema::ImportedAttributeField("imported", DataType::INT32)); + // Check the result so a failed save cannot be masked by a stale file on disk. + EXPECT_TRUE(s.saveToFile(fileName)); + } + { + Schema s; + EXPECT_TRUE(s.loadFromFile(fileName)); + EXPECT_EQ(0u, s.getNumImportedAttributeFields()); + } +} + +TEST(SchemaTest, require_that_schema_can_be_built_with_imported_attribute_fields) +{ + Schema s; + SchemaConfigurer configurer(s, "dir:imported-fields-cfg"); + + const auto &imported = s.getImportedAttributeFields(); + ASSERT_EQ(2u, imported.size()); + assertField(SIAF("imported_a", DataType::INT32, CollectionType::SINGLE), imported[0]); + assertField(SIAF("imported_b", DataType::STRING, CollectionType::ARRAY), imported[1]); + + const auto &regular = s.getAttributeFields(); + ASSERT_EQ(1u, regular.size()); + // The non-imported attribute is a plain AttributeField, not an imported one. + assertField(Schema::AttributeField("regular", DataType::INT32, CollectionType::SINGLE), regular[0]); +} + +TEST(SchemaTest, require_that_index_field_is_loaded_with_default_values_when_properties_are_not_set) +{ + Schema s; + EXPECT_TRUE(s.loadFromFile("schema-without-index-field-properties.txt")); + + const auto& index_fields = s.getIndexFields(); + ASSERT_EQ(1u, index_fields.size()); + 
assertIndexField(SIF("foo", DataType::STRING, CollectionType::SINGLE). + setAvgElemLen(512). + set_interleaved_features(false), + index_fields[0]); + assertIndexField(SIF("foo", DataType::STRING, CollectionType::SINGLE), index_fields[0]); +} + +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchcommon/.gitignore b/searchlib/src/vespa/searchcommon/.gitignore new file mode 100644 index 00000000000..f76a9d84bed --- /dev/null +++ b/searchlib/src/vespa/searchcommon/.gitignore @@ -0,0 +1,3 @@ +/.depend +/Makefile +/libsearchcommon.so.5.1 diff --git a/searchlib/src/vespa/searchcommon/attribute/.gitignore b/searchlib/src/vespa/searchcommon/attribute/.gitignore new file mode 100644 index 00000000000..7e7c0fe7fae --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/.gitignore @@ -0,0 +1,2 @@ +/.depend +/Makefile diff --git a/searchlib/src/vespa/searchcommon/attribute/CMakeLists.txt b/searchlib/src/vespa/searchcommon/attribute/CMakeLists.txt new file mode 100644 index 00000000000..704fe238ed5 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchcommon_searchcommon_attribute OBJECT + SOURCES + attribute_utils.cpp + basictype.cpp + collectiontype.cpp + config.cpp + search_context_params.cpp + status.cpp + DEPENDS +) diff --git a/searchlib/src/vespa/searchcommon/attribute/attribute_utils.cpp b/searchlib/src/vespa/searchcommon/attribute/attribute_utils.cpp new file mode 100644 index 00000000000..cd5cc58c75c --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/attribute_utils.cpp @@ -0,0 +1,23 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "attribute_utils.h" +#include <vespa/searchcommon/attribute/config.h> + +namespace search::attribute { + +bool +isUpdateableInMemoryOnly(const vespalib::string &attrName, const Config &cfg) +{ + auto basicType = cfg.basicType().type(); + return ((basicType != BasicType::Type::PREDICATE) && + (basicType != BasicType::Type::REFERENCE)) && + !isStructFieldAttribute(attrName); +} + +bool +isStructFieldAttribute(const vespalib::string &attrName) +{ + return attrName.find('.') != vespalib::string::npos; +} + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/attribute_utils.h b/searchlib/src/vespa/searchcommon/attribute/attribute_utils.h new file mode 100644 index 00000000000..e4c2a8e4727 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/attribute_utils.h @@ -0,0 +1,30 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search::attribute { + +class Config; + +/** + * Returns whether the given attribute vector is updateable only in-memory. + * + * For most attributes this is true. + * The data stored in the attribute is equal to the data stored in the field value in the document. + * + * For predicate and reference attributes this is false. + * The original data is transformed (lossy) before it is stored in the attribute. + * During update we also need to update the field value in the document. + * + * For struct field attributes this is false. + * A struct field attribute typically represents a sub-field of a more complex field (e.g. map of struct or array of struct). + * During update the complex field is first updated in the document, + * then the struct field attribute is updated based on the new content of the complex field. 
+ */ +bool isUpdateableInMemoryOnly(const vespalib::string &attrName, const Config &cfg); + +bool isStructFieldAttribute(const vespalib::string &attrName); + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/attributecontent.h b/searchlib/src/vespa/searchcommon/attribute/attributecontent.h new file mode 100644 index 00000000000..f5960ce358b --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/attributecontent.h @@ -0,0 +1,166 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "iattributevector.h" +#include <cstdint> + +namespace search::attribute { + + +/** + * This class is wrapping an array of type T and is used to hold the + * attribute vector content for a given document. The values stored for the + * given document in the attribute vector are copied into the array wrapped + * in an instance of this class. + * + * Instances own a raw heap buffer once they outgrow the inline array, so + * copying is disabled. + * + * @param T the type of the data stored in this object + **/ +template <typename T> +class AttributeContent +{ +private: + T _staticBuf[16]; + T * _dynamicBuf; + uint32_t _size; + uint32_t _capacity; + + AttributeContent(const AttributeContent & rhs) = delete; + AttributeContent & operator=(const AttributeContent & rhs) = delete; + +public: + /** + * Creates a new object with an initial capacity of 16 without dynamic allocation. + **/ + AttributeContent() : + _dynamicBuf(nullptr), + _size(0), + _capacity(16) + { + } + /** + * Destructs the object. + **/ + ~AttributeContent() { + if (_dynamicBuf != nullptr) { + delete [] _dynamicBuf; + } + } + + /** + * Returns a read-only iterator to the beginning of the underlying data array. + * + * @return iterator + **/ + const T * begin() const { + if (_dynamicBuf != nullptr) { + return _dynamicBuf; + } + return _staticBuf; + } + + /** + * Returns a read-only iterator to the end of the underlying data array. 
+ * + * @return iterator + **/ + const T * end() const { + return begin() + _size; + } + + /** + * Returns the element at the given position in the underlying data array. + * + * @return read-only reference to the element + * @param idx position into the underlying data + **/ + const T & operator[](uint32_t idx) const { + return *(begin() + idx); + } + + /** + * Returns the number of elements used in the underlying data array. + * + * @return number of elements used + **/ + uint32_t size() const { + return _size; + } + + /** + * Returns the number of elements allocated in the underlying data array. + * + * @return number of elements allocated + **/ + uint32_t capacity() const { + return _capacity; + } + + /** + * Returns a read/write pointer to the underlying data array. + * + * @return read/write pointer. + **/ + T * data() { + if (_dynamicBuf != nullptr) { + return _dynamicBuf; + } + return _staticBuf; + } + + /** + * Sets the number of elements used in the underlying data array. + * + * @param n number of elements used + **/ + void setSize(uint32_t n) { + _size = n; + } + + /** + * Allocates memory so that the underlying data array can hold the + * given number of elements (capacity) and sets the size to 0. + * A new data array will only be allocated if n > capacity(). + * + * @param n wanted number of elements + **/ + void allocate(uint32_t n) { + if (n > _capacity) { + if (_dynamicBuf != nullptr) { + delete [] _dynamicBuf; + } + _dynamicBuf = new T[n]; + _capacity = n; + _size = 0; + } + } + + /** + * Fill this buffer with the content of the given attribute vector for the given docId. 
+ * + * @param attribute the attribute vector + * @param docId the docId + **/ + void fill(const IAttributeVector & attribute, IAttributeVector::DocId docId) + { + uint32_t count = attribute.get(docId, data(), capacity()); + while (count > capacity()) { + allocate(count); + count = attribute.get(docId, data(), capacity()); + } + setSize(count); + } +}; + +typedef AttributeContent<double> FloatContent; +typedef AttributeContent<const char *> ConstCharContent; +typedef AttributeContent<IAttributeVector::largeint_t> IntegerContent; +typedef AttributeContent<IAttributeVector::EnumHandle> EnumContent; +typedef AttributeContent<IAttributeVector::WeightedInt> WeightedIntegerContent; +typedef AttributeContent<IAttributeVector::WeightedFloat> WeightedFloatContent; +typedef AttributeContent<IAttributeVector::WeightedConstChar> WeightedConstCharContent; +typedef AttributeContent<IAttributeVector::WeightedString> WeightedStringContent; +typedef AttributeContent<IAttributeVector::WeightedEnum> WeightedEnumContent; +typedef IAttributeVector::EnumHandle EnumHandle; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/basictype.cpp b/searchlib/src/vespa/searchcommon/attribute/basictype.cpp new file mode 100644 index 00000000000..5bab2fc06d2 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/basictype.cpp @@ -0,0 +1,37 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/searchcommon/attribute/basictype.h> +#include <vespa/vespalib/util/exceptions.h> + +namespace search::attribute { + +const BasicType::TypeInfo BasicType::_typeTable[BasicType::MAX_TYPE] = { + { BasicType::NONE, 0, "none" }, + { BasicType::STRING, 0, "string" }, + { BasicType::BOOL, sizeof(int8_t), "bool" }, + { BasicType::UINT2, sizeof(int8_t), "uint2" }, + { BasicType::UINT4, sizeof(int8_t), "uint4" }, + { BasicType::INT8, sizeof(int8_t), "int8" }, + { BasicType::INT16, sizeof(int16_t), "int16" }, + { BasicType::INT32, sizeof(int32_t), "int32" }, + { BasicType::INT64, sizeof(int64_t), "int64" }, + { BasicType::FLOAT, sizeof(float), "float" }, + { BasicType::DOUBLE, sizeof(double), "double" }, + { BasicType::PREDICATE, 0, "predicate" }, + { BasicType::TENSOR, 0, "tensor" }, + { BasicType::REFERENCE, 12, "reference" } +}; + +BasicType::Type +BasicType::asType(const vespalib::string &t) +{ + for (size_t i(0); i < sizeof(_typeTable)/sizeof(_typeTable[0]); i++) { + if (t == _typeTable[i]._name) { + return _typeTable[i]._type; + } + } + throw vespalib::IllegalStateException(t + " not recognized as valid attribute data type"); + return NONE; +} + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/basictype.h b/searchlib/src/vespa/searchcommon/attribute/basictype.h new file mode 100644 index 00000000000..bd7b4a2b4bc --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/basictype.h @@ -0,0 +1,63 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search::attribute { + +class BasicType +{ + public: + enum Type { + NONE = 0, + STRING = 1, + BOOL = 2, + UINT2 = 3, + UINT4 = 4, + INT8 = 5, + INT16 = 6, + INT32 = 7, + INT64 = 8, + FLOAT = 9, + DOUBLE = 10, + PREDICATE = 11, + TENSOR = 12, + REFERENCE = 13, + MAX_TYPE + }; + + explicit BasicType(int t) : _type(Type(t)) { } + explicit BasicType(unsigned int t) : _type(Type(t)) { } + BasicType(Type t) : _type(t) { } + explicit BasicType(const vespalib::string & t) : _type(asType(t)) { } + + Type type() const { return _type; } + const char * asString() const { return asString(_type); } + size_t fixedSize() const { return fixedSize(_type); } + static BasicType fromType(bool) { return BOOL; } + static BasicType fromType(int8_t) { return INT8; } + static BasicType fromType(int16_t) { return INT16; } + static BasicType fromType(int32_t) { return INT32; } + static BasicType fromType(int64_t) { return INT64; } + static BasicType fromType(float) { return FLOAT; } + static BasicType fromType(double) { return DOUBLE; } + bool operator==(const BasicType &b) const { return _type == b._type; } + bool operator!=(const BasicType &b) const { return _type != b._type; } + + private: + static const char * asString(Type t) { return _typeTable[t]._name; } + static size_t fixedSize(Type t) { return _typeTable[t]._fixedSize; } + static Type asType(const vespalib::string & t); + + Type _type; + + struct TypeInfo { + Type _type; + unsigned int _fixedSize; + const char * _name; + }; + static const TypeInfo _typeTable[MAX_TYPE]; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/collectiontype.cpp b/searchlib/src/vespa/searchcommon/attribute/collectiontype.cpp new file mode 100644 index 00000000000..b77382f6126 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/collectiontype.cpp @@ -0,0 +1,26 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. + +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/vespalib/util/exceptions.h> + +namespace search::attribute { + +const CollectionType::TypeInfo CollectionType::_typeTable[CollectionType::MAX_TYPE] = { + { CollectionType::SINGLE, "single" }, + { CollectionType::ARRAY, "array" }, + { CollectionType::WSET, "weightedset" } +}; + +CollectionType::Type +CollectionType::asType(const vespalib::string &t) +{ + for (size_t i(0); i < sizeof(_typeTable)/sizeof(_typeTable[0]); i++) { + if (t == _typeTable[i]._name) { + return _typeTable[i]._type; + } + } + throw vespalib::IllegalStateException(t + " not recognized as valid attribute collection type"); + return SINGLE; +} + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/collectiontype.h b/searchlib/src/vespa/searchcommon/attribute/collectiontype.h new file mode 100644 index 00000000000..35cb7612ed0 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/collectiontype.h @@ -0,0 +1,75 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search::attribute { + +class CollectionType +{ + public: + enum Type { + /** + * Single value type with one value stored for each document. + **/ + SINGLE = 0, + /** + * Array type with zero to n values stored for each document. + **/ + ARRAY = 1, + /** + * Weighted set type with zero to n unique values stored for each document. + * In addition each unique value is associated with a weight.
+ **/ + WSET = 2, + MAX_TYPE + }; + + CollectionType(Type t = SINGLE, bool remove = false, bool create = false) : + _type(t), + _removeIfZero(remove), + _createIfNonExistant(create) + { + } + + explicit + CollectionType(const vespalib::string & t, bool remove = false, bool create = false) : + _type(asType(t)), + _removeIfZero(remove), + _createIfNonExistant(create) + { + } + + Type type() const { return _type; } + bool isMultiValue() const { return _type != SINGLE; } + bool isWeightedSet() const { return _type == WSET; } + bool isArray() const { return _type == ARRAY; } + bool removeIfZero() const { return _removeIfZero; } + bool createIfNonExistant() const { return _createIfNonExistant; } + const char * asString() const { return asString(_type); } + void removeIfZero(bool newValue) { _removeIfZero = newValue; } + void createIfNonExistant(bool newValue) { _createIfNonExistant = newValue; } + bool operator!=(const CollectionType &b) const { return !(operator==(b)); } + bool operator==(const CollectionType &b) const { + return _type == b._type && + _removeIfZero == b._removeIfZero && + _createIfNonExistant == b._createIfNonExistant; + } + + private: + struct TypeInfo { + Type _type; + const char * _name; + }; + + static const char * asString(Type t) { return _typeTable[t]._name; } + static Type asType(const vespalib::string &t); + + Type _type; + bool _removeIfZero; + bool _createIfNonExistant; + static const TypeInfo _typeTable[MAX_TYPE]; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/config.cpp b/searchlib/src/vespa/searchcommon/attribute/config.cpp new file mode 100644 index 00000000000..0a50faa04c0 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/config.cpp @@ -0,0 +1,72 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "config.h" + +namespace search::attribute { + +namespace { + +static constexpr uint64_t MAX_UNCOMMITTED_MEMORY = 8000; + +} + +Config::Config() noexcept + : Config(BasicType::NONE, CollectionType::SINGLE, false, false) +{ +} + +Config::Config(BasicType bt, CollectionType ct, bool fastSearch_, bool huge_) noexcept + : _basicType(bt), + _type(ct), + _fastSearch(fastSearch_), + _huge(huge_), + _enableBitVectors(false), + _enableOnlyBitVector(false), + _isFilter(false), + _fastAccess(false), + _mutable(false), + _paged(false), + _maxUnCommittedMemory(MAX_UNCOMMITTED_MEMORY), + _match(Match::UNCASED), + _dictionary(), + _growStrategy(), + _compactionStrategy(), + _predicateParams(), + _tensorType(vespalib::eval::ValueType::error_type()), + _distance_metric(DistanceMetric::Euclidean), + _hnsw_index_params() +{ +} + +Config::Config(const Config &) = default; +Config & Config::operator = (const Config &) = default; +Config::Config(Config &&) noexcept = default; +Config & Config::operator = (Config &&) noexcept = default; +Config::~Config() = default; + +bool +Config::operator==(const Config &b) const +{ + return _basicType == b._basicType && + _type == b._type && + _huge == b._huge && + _fastSearch == b._fastSearch && + _enableBitVectors == b._enableBitVectors && + _enableOnlyBitVector == b._enableOnlyBitVector && + _isFilter == b._isFilter && + _fastAccess == b._fastAccess && + _mutable == b._mutable && + _paged == b._paged && + _maxUnCommittedMemory == b._maxUnCommittedMemory && + _match == b._match && + _dictionary == b._dictionary && + _growStrategy == b._growStrategy && + _compactionStrategy == b._compactionStrategy && + _predicateParams == b._predicateParams && + (_basicType.type() != BasicType::Type::TENSOR || + _tensorType == b._tensorType) && + _distance_metric == b._distance_metric && + _hnsw_index_params == b._hnsw_index_params; +} + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/config.h 
b/searchlib/src/vespa/searchcommon/attribute/config.h new file mode 100644 index 00000000000..f572f5038fc --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/config.h @@ -0,0 +1,158 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "basictype.h" +#include "collectiontype.h" +#include "hnsw_index_params.h" +#include "predicate_params.h" +#include <vespa/searchcommon/common/growstrategy.h> +#include <vespa/searchcommon/common/dictionary_config.h> +#include <vespa/eval/eval/value_type.h> +#include <vespa/vespalib/datastore/compaction_strategy.h> +#include <cassert> +#include <optional> + +namespace search::attribute { + +/** + * Configuration for an attribute vector. + * + * Used to determine which implementation to instantiate. + */ +class Config { +public: + enum class Match { CASED, UNCASED }; + using CompactionStrategy = vespalib::datastore::CompactionStrategy; + Config() noexcept; + Config(BasicType bt) noexcept : Config(bt, CollectionType::SINGLE) { } + Config(BasicType bt, CollectionType ct) noexcept : Config(bt, ct, false) { } + Config(BasicType bt, CollectionType ct, bool fastSearch_) noexcept + : Config(bt, ct, fastSearch_, false) + {} + Config(BasicType bt, CollectionType ct, bool fastSearch_, bool huge_) noexcept; + Config(const Config &); + Config & operator = (const Config &); + Config(Config &&) noexcept; + Config & operator = (Config &&) noexcept; + ~Config(); + + BasicType basicType() const { return _basicType; } + CollectionType collectionType() const { return _type; } + bool fastSearch() const { return _fastSearch; } + bool huge() const { return _huge; } + bool paged() const { return _paged; } + const PredicateParams &predicateParams() const { return _predicateParams; } + const vespalib::eval::ValueType & tensorType() const { return _tensorType; } + DistanceMetric distance_metric() const { return _distance_metric; } + const 
std::optional<HnswIndexParams>& hnsw_index_params() const { return _hnsw_index_params; } + + /** + * Check if attribute posting list can consist of a bitvector in + * addition to (or instead of) a btree. + */ + bool getEnableBitVectors() const { return _enableBitVectors; } + + /** + * Check if attribute posting list can consist of only a bitvector with + * no corresponding btree. + */ + bool getEnableOnlyBitVector() const { return _enableOnlyBitVector; } + + bool getIsFilter() const { return _isFilter; } + bool isMutable() const { return _mutable; } + + /** + * Check if this attribute should be fast accessible at all times. + * If so, attribute is kept in memory also for non-searchable documents. + */ + bool fastAccess() const { return _fastAccess; } + + const GrowStrategy & getGrowStrategy() const { return _growStrategy; } + const CompactionStrategy &getCompactionStrategy() const { return _compactionStrategy; } + const DictionaryConfig & get_dictionary_config() const { return _dictionary; } + Match get_match() const { return _match; } + Config & setHuge(bool v) { _huge = v; return *this;} + Config & setFastSearch(bool v) { _fastSearch = v; return *this; } + Config & setPredicateParams(const PredicateParams &v) { _predicateParams = v; return *this; } + Config & setTensorType(const vespalib::eval::ValueType &tensorType_in) { + _tensorType = tensorType_in; + return *this; + } + Config& set_distance_metric(DistanceMetric value) { + _distance_metric = value; + return *this; + } + Config& set_hnsw_index_params(const HnswIndexParams& params) { + assert(_distance_metric == params.distance_metric()); + _hnsw_index_params = params; + return *this; + } + Config& clear_hnsw_index_params() { + _hnsw_index_params.reset(); + return *this; + } + + /** + * Enable attribute posting list to consist of a bitvector in + * addition to (or instead of) a btree. 
+ */ + Config & setEnableBitVectors(bool enableBitVectors) { + _enableBitVectors = enableBitVectors; + return *this; + } + + /** + * Enable attribute posting list to consist of only a bitvector with + * no corresponding btree. Some information degradation might occur when + * document frequency goes down, since recreated btree representation + * will then have lost weight information. + */ + Config & setEnableOnlyBitVector(bool enableOnlyBitVector) { + _enableOnlyBitVector = enableOnlyBitVector; + return *this; + } + + /** + * Hide weight information when searching in attributes. + */ + Config & setIsFilter(bool isFilter) { _isFilter = isFilter; return *this; } + Config & setMutable(bool isMutable) { _mutable = isMutable; return *this; } + Config & setPaged(bool paged_in) { _paged = paged_in; return *this; } + Config & setFastAccess(bool v) { _fastAccess = v; return *this; } + Config & setGrowStrategy(const GrowStrategy &gs) { _growStrategy = gs; return *this; } + Config & setCompactionStrategy(const CompactionStrategy &compactionStrategy) { + _compactionStrategy = compactionStrategy; + return *this; + } + Config & set_dictionary_config(const DictionaryConfig & cfg) { _dictionary = cfg; return *this; } + Config & set_match(Match match) { _match = match; return *this; } + bool operator!=(const Config &b) const { return !(operator==(b)); } + bool operator==(const Config &b) const; + + uint64_t getMaxUnCommittedMemory() const { return _maxUnCommittedMemory; } + Config & setMaxUnCommittedMemory(uint64_t value) { _maxUnCommittedMemory = value; return *this; } + +private: + BasicType _basicType; + CollectionType _type; + bool _fastSearch; + bool _huge; + bool _enableBitVectors; + bool _enableOnlyBitVector; + bool _isFilter; + bool _fastAccess; + bool _mutable; + bool _paged; + uint64_t _maxUnCommittedMemory; + Match _match; + DictionaryConfig _dictionary; + GrowStrategy _growStrategy; + CompactionStrategy _compactionStrategy; + PredicateParams _predicateParams; + 
vespalib::eval::ValueType _tensorType; + DistanceMetric _distance_metric; + std::optional<HnswIndexParams> _hnsw_index_params; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/distance_metric.h b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h new file mode 100644 index 00000000000..26efa30bba4 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h @@ -0,0 +1,9 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace search::attribute { + +enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming }; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/hnsw_index_params.h b/searchlib/src/vespa/searchcommon/attribute/hnsw_index_params.h new file mode 100644 index 00000000000..4f9d3c5593c --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/hnsw_index_params.h @@ -0,0 +1,45 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_metric.h" + +namespace search::attribute { + +/** + * Configuration parameters for a hnsw index used together with a 1-dimensional indexed tensor + * for approximate nearest neighbor search. + */ +class HnswIndexParams { +private: + uint32_t _max_links_per_node; + uint32_t _neighbors_to_explore_at_insert; + // This is always the same as in the attribute config, and is duplicated here to simplify usage. 
+ DistanceMetric _distance_metric; + bool _multi_threaded_indexing; + +public: + HnswIndexParams(uint32_t max_links_per_node_in, + uint32_t neighbors_to_explore_at_insert_in, + DistanceMetric distance_metric_in, + bool multi_threaded_indexing_in = false) noexcept + : _max_links_per_node(max_links_per_node_in), + _neighbors_to_explore_at_insert(neighbors_to_explore_at_insert_in), + _distance_metric(distance_metric_in), + _multi_threaded_indexing(multi_threaded_indexing_in) + {} + + uint32_t max_links_per_node() const { return _max_links_per_node; } + uint32_t neighbors_to_explore_at_insert() const { return _neighbors_to_explore_at_insert; } + DistanceMetric distance_metric() const { return _distance_metric; } + bool multi_threaded_indexing() const { return _multi_threaded_indexing; } + + bool operator==(const HnswIndexParams& rhs) const { + return (_max_links_per_node == rhs._max_links_per_node && + _neighbors_to_explore_at_insert == rhs._neighbors_to_explore_at_insert && + _distance_metric == rhs._distance_metric && + _multi_threaded_indexing == rhs._multi_threaded_indexing); + } +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/i_attribute_functor.h b/searchlib/src/vespa/searchcommon/attribute/i_attribute_functor.h new file mode 100644 index 00000000000..da5127de8ee --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/i_attribute_functor.h @@ -0,0 +1,37 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <memory> + +namespace search::attribute { + +class IAttributeVector; + +/* + * Interface class for access attribute in correct attribute write + * thread as async callback from asyncForEachAttribute() call on + * attribute manager. 
+ */ +class IConstAttributeFunctor +{ +public: + virtual void operator()(const IAttributeVector &attributeVector) = 0; + virtual ~IConstAttributeFunctor() = default; +}; + +class IAttributeFunctor +{ +public: + virtual void operator()(IAttributeVector &attributeVector) = 0; + virtual ~IAttributeFunctor() = default; +}; + +class IAttributeExecutor { +public: + virtual ~IAttributeExecutor() = default; + virtual void asyncForAttribute(const vespalib::string &name, std::unique_ptr<IAttributeFunctor> func) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/i_multi_value_attribute.h b/searchlib/src/vespa/searchcommon/attribute/i_multi_value_attribute.h new file mode 100644 index 00000000000..ea1fbe0b2b4 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/i_multi_value_attribute.h @@ -0,0 +1,55 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_multi_value_read_view.h" + +namespace vespalib { class Stash; } + +namespace search::attribute { + +/** + * Interface that provides read views for different multi-value attribute types. + * + * The type-safe down-cast functions only return a valid pointer when that particular type is supported. + * Otherwise a nullptr is returned. + * The returned read view is owned by the supplied stash. 
+ */ +class IMultiValueAttribute { +public: + template<typename MultiValueType> + class MultiValueTag {}; + + template<typename T> + using ArrayTag = MultiValueTag<T>; + + using ArrayEnumTag = ArrayTag<vespalib::datastore::AtomicEntryRef>; + + template<typename T> + using WeightedSetTag = MultiValueTag<search::multivalue::WeightedValue<T>>; + + using WeightedSetEnumTag = WeightedSetTag<vespalib::datastore::AtomicEntryRef>; + + virtual ~IMultiValueAttribute() {} + + virtual const IArrayReadView<int8_t>* make_read_view(ArrayTag<int8_t>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<int16_t>* make_read_view(ArrayTag<int16_t>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<int32_t>* make_read_view(ArrayTag<int32_t>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<int64_t>* make_read_view(ArrayTag<int64_t>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<float>* make_read_view(ArrayTag<float>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<double>* make_read_view(ArrayTag<double>, vespalib::Stash&) const { return nullptr; } + virtual const IArrayReadView<const char*>* make_read_view(ArrayTag<const char*>, vespalib::Stash&) const { return nullptr; } + + virtual const IWeightedSetReadView<int8_t>* make_read_view(WeightedSetTag<int8_t>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<int16_t>* make_read_view(WeightedSetTag<int16_t>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<int32_t>* make_read_view(WeightedSetTag<int32_t>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<int64_t>* make_read_view(WeightedSetTag<int64_t>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<float>* make_read_view(WeightedSetTag<float>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<double>* 
make_read_view(WeightedSetTag<double>, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetReadView<const char*>* make_read_view(WeightedSetTag<const char*>, vespalib::Stash&) const { return nullptr; } + + virtual const IArrayEnumReadView* make_read_view(ArrayEnumTag, vespalib::Stash&) const { return nullptr; } + virtual const IWeightedSetEnumReadView* make_read_view(WeightedSetEnumTag, vespalib::Stash&) const { return nullptr; } +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/i_multi_value_read_view.h b/searchlib/src/vespa/searchcommon/attribute/i_multi_value_read_view.h new file mode 100644 index 00000000000..8e5005eae8d --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/i_multi_value_read_view.h @@ -0,0 +1,46 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "multivalue.h" +#include <vespa/vespalib/datastore/atomic_entry_ref.h> +#include <vespa/vespalib/util/arrayref.h> + +namespace search::attribute { + +/** + * Read view for the data stored in a multi-value attribute. + * @tparam MultiValueType The multi-value type of the data to access. + */ +template <typename MultiValueType> +class IMultiValueReadView { +public: + virtual ~IMultiValueReadView() {} + virtual vespalib::ConstArrayRef<MultiValueType> get_values(uint32_t docid) const = 0; +}; + +/** + * Read view for the raw data stored in an array attribute. + * @tparam T The value type of the raw data to access. + */ +template <typename T> +using IArrayReadView = IMultiValueReadView<T>; + +/** + * Read view for the raw data stored in a weighted set attribute. + * @tparam T The value type of the raw data to access. + */ +template <typename T> +using IWeightedSetReadView = IMultiValueReadView<multivalue::WeightedValue<T>>; + +/** + * Read view for the raw data stored in an enumerated array attribute. 
+ */ +using IArrayEnumReadView = IArrayReadView<vespalib::datastore::AtomicEntryRef>; + +/** + * Read view for the raw data stored in an enumerated weighted set attribute. + */ +using IWeightedSetEnumReadView = IWeightedSetReadView<vespalib::datastore::AtomicEntryRef>; + +}; diff --git a/searchlib/src/vespa/searchcommon/attribute/i_search_context.h b/searchlib/src/vespa/searchcommon/attribute/i_search_context.h new file mode 100644 index 00000000000..ff62c535e7f --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/i_search_context.h @@ -0,0 +1,74 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchcommon/common/range.h> +#include <vespa/vespalib/stllike/string.h> +#include <memory> + +namespace search::fef { class TermFieldMatchData; } +namespace search::queryeval { + class SearchIterator; + class ExecuteInfo; +} +namespace search { class QueryTermUCS4; } + +namespace search::attribute { + +class ISearchContext { +public: + using UP = std::unique_ptr<ISearchContext>; + using DocId = uint32_t; + +private: + virtual int32_t onFind(DocId docId, int32_t elementId, int32_t &weight) const = 0; + virtual int32_t onFind(DocId docId, int32_t elementId) const = 0; + +public: + virtual ~ISearchContext() {} + + virtual unsigned int approximateHits() const = 0; + + /** + * Creates an attribute search iterator associated with this + * search context. + * + * @return attribute search iterator + * + * @param matchData the attribute match data used when + * unpacking data for a hit + * + * @param strict whether the iterator should be strict or not + **/ + virtual std::unique_ptr<queryeval::SearchIterator> + createIterator(fef::TermFieldMatchData *matchData, bool strict) = 0; + + /* + * Create temporary posting lists. + * Should be called before createIterator() is called. 
+ */ + virtual void fetchPostings(const queryeval::ExecuteInfo &execInfo) = 0; + + virtual bool valid() const = 0; + virtual Int64Range getAsIntegerTerm() const = 0; + virtual const QueryTermUCS4 * queryTerm() const = 0; + virtual const vespalib::string &attributeName() const = 0; + + int32_t find(DocId docId, int32_t elementId, int32_t &weight) const { return onFind(docId, elementId, weight); } + int32_t find(DocId docId, int32_t elementId) const { return onFind(docId, elementId); } + template<typename SC> + static bool matches(const SC & sc, DocId docId, int32_t &weight) { + weight = 0; + int32_t oneWeight(0); + int32_t firstId = sc.find(docId, 0, oneWeight); + for (int32_t id(firstId); id >= 0; id = sc.find(docId, id + 1, oneWeight)) { + weight += oneWeight; + } + return firstId >= 0; + } + bool matches(DocId docId, int32_t &weight) const { return matches(*this, docId, weight); } + bool matches(DocId doc) const { return find(doc, 0) >= 0; } + +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/iattributecontext.h b/searchlib/src/vespa/searchcommon/attribute/iattributecontext.h new file mode 100644 index 00000000000..bb349057ca9 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/iattributecontext.h @@ -0,0 +1,54 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "i_attribute_functor.h" +#include "iattributevector.h" + +namespace search::attribute { + +/** + * This is an interface used to access all registered attribute vectors. + **/ +class IAttributeContext : public IAttributeExecutor { +public: + typedef vespalib::string string; + /** Convenience typedefs **/ + typedef std::unique_ptr<IAttributeContext> UP; + + /** + * Returns the attribute vector with the given name. + * + * @param name the name of the attribute vector. + * @return const view of the attribute vector or NULL if the attribute vector does not exist.
+ **/ + virtual const IAttributeVector * getAttribute(const string & name) const = 0; + + /** + * Returns the attribute vector with the given name. + * Makes sure that the underlying enum values are stable during the use of this attribute. + * + * @param name the name of the attribute vector + * @return const view of the attribute vector or NULL if the attribute vector does not exist. + **/ + virtual const IAttributeVector * getAttributeStableEnum(const string & name) const = 0; + + /** + * Fill the given list with all attribute vectors registered. + * + * @param list the list to fill in attribute vectors. + **/ + virtual void getAttributeList(std::vector<const IAttributeVector *> & list) const = 0; + + /** + * Releases all cached attribute guards. + **/ + virtual void releaseEnumGuards() {} + + /** + * Virtual destructor to allow safe subclassing. + **/ + virtual ~IAttributeContext() {} +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/iattributevector.h b/searchlib/src/vespa/searchcommon/attribute/iattributevector.h new file mode 100644 index 00000000000..fa91f301b92 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/iattributevector.h @@ -0,0 +1,457 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "collectiontype.h" +#include "basictype.h" +#include <vespa/searchcommon/common/iblobconverter.h> +#include <ostream> +#include <vector> + +namespace search { + struct IDocumentWeightAttribute; + class QueryTermSimple; +} + +namespace search::tensor { + class ITensorAttribute; +} + +namespace search::attribute { + +class IMultiValueAttribute; +class ISearchContext; +class SearchContextParams; + +/** + * This class is used to store a value and a weight. + * It is used when getting content from a weighted set attribute vector.
+ * + * @param T the type of the value stored in this object + **/ +template <typename T> +class WeightedType +{ +private: + T _value; + int32_t _weight; + +public: + WeightedType() noexcept : _value(T()), _weight(1) { } + WeightedType(T value_, int32_t weight_ = 1) noexcept : _value(value_), _weight(weight_) { } + const T & getValue() const { return _value; } + const T & value() const { return _value; } + void setValue(const T & v) { _value = v; } + int32_t getWeight() const { return _weight; } + int32_t weight() const { return _weight; } + void setWeight(int32_t w) { _weight = w; } + bool operator==(const WeightedType & rhs) const { + return _value == rhs._value && _weight == rhs._weight; + } +}; + +template <typename T> +std::ostream& +operator<<(std::ostream& os, const WeightedType<T>& value) +{ + os << "{" << value.value() << "," << value.weight() << "}"; + return os; +} + +/** + * This is a read interface used to access the content of an attribute vector. + **/ +class IAttributeVector +{ +public: + using SP = std::shared_ptr<IAttributeVector>; + using DocId = uint32_t; + using EnumHandle = uint32_t; + using largeint_t = int64_t; + using WeightedFloat = WeightedType<double>; + using WeightedInt = WeightedType<largeint_t>; + using WeightedEnum = WeightedType<EnumHandle>; + using WeightedConstChar = WeightedType<const char *>; + using WeightedString = WeightedType<vespalib::string>; + + /** + * Returns the name of this attribute vector. + * + * @return attribute name + **/ + virtual const vespalib::string & getName() const = 0; + + vespalib::stringref getNamePrefix() const { + vespalib::stringref name = getName(); + return name.substr(0, name.find('.')); + } + + /** + * Returns the number of documents stored in this attribute vector. + * + * @return number of documents + **/ + virtual uint32_t getNumDocs() const = 0; + + /** + * Returns the number of values stored for the given document. 
+ * + * @return number of values + * @param doc document identifier + **/ + virtual uint32_t getValueCount(uint32_t doc) const = 0; + + /** + * Returns the maximum number of values stored for any document. + * + * @return maximum number of values + **/ + virtual uint32_t getMaxValueCount() const = 0; + + /** + * Returns the first value stored for the given document as an integer. + * + * @param docId document identifier + * @return the integer value + **/ + virtual largeint_t getInt(DocId doc) const = 0; + + /** + * Returns the first value stored for the given document as a floating point number. + * + * @param docId document identifier + * @return the floating point value + **/ + virtual double getFloat(DocId doc) const = 0; + + /** + * Returns the first value stored for the given document as a string. + * Uses the given buffer to store the actual string if no underlying + * string storage is used for this attribute vector. + * + * @param docId document identifier + * @param buffer content buffer to optionally store the string + * @param sz the size of the buffer + * @return the string value + **/ + virtual const char * getString(DocId doc, char * buffer, size_t sz) const = 0; + + /** + * Returns the first value stored for the given document as an enum value. + * + * @param docId document identifier + * @return the enum value + **/ + virtual EnumHandle getEnum(DocId doc) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. + * + * @param docId document identifier + * @param buffer content buffer to copy integer values into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, largeint_t * buffer, uint32_t sz) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. 
+     *
+     * @param docId document identifier
+     * @param buffer content buffer to copy floating point values into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, double * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values stored for the given document into the given buffer.
+     *
+     * @param docId document identifier
+     * @param buffer content buffer to copy string values into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+//    virtual uint32_t get(DocId docId, vespalib::string * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values stored for the given document into the given buffer.
+     *
+     * @param docId document identifier
+     * @param buffer content buffer to copy const char values into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, const char ** buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the enum values stored for the given document into the given buffer.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy enum into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, EnumHandle * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values and weights stored for the given document into the given buffer.
+     * This method should only be invoked if @ref getCollectionType() returns CollectionType::WSET.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy integer values and weights into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, WeightedInt * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values and weights stored for the given document into the given buffer.
+     * This method should only be invoked if @ref getCollectionType() returns CollectionType::WSET.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy floating point values and weights into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, WeightedFloat * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values and weights stored for the given document into the given buffer.
+     * This method should only be invoked if @ref getCollectionType() returns CollectionType::WSET.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy string values and weights into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, WeightedString * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the values and weights stored for the given document into the given buffer.
+     * This method should only be invoked if @ref getCollectionType() returns CollectionType::WSET.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy const char values and weights into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, WeightedConstChar * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Copies the enum values and weights stored for the given document into the given buffer.
+     * This method should only be invoked if @ref getCollectionType() returns CollectionType::WSET.
+     *
+     * @param docId document identifier
+     * @param buffer content object to copy enum values and weights into
+     * @param sz the size of the buffer
+     * @return the number of values for this document
+     **/
+    virtual uint32_t get(DocId docId, WeightedEnum * buffer, uint32_t sz) const = 0;
+
+    /**
+     * Finds the enum value for the given string value.
+     * This method will only have effect if @ref getBasicType() returns BasicType::STRING and
+     * @ref hasEnum() returns true.
+     *
+     * @param value the string value to lookup.
+     * @param e the handle in which to store the enum value.
+     * @return true if found.
+     **/
+    virtual bool findEnum(const char * value, EnumHandle & e) const = 0;
+
+    /**
+     * Finds all enum values matching the given string value.
+     * This method will only have effect if @ref getBasicType() returns BasicType::STRING and
+     * @ref hasEnum() returns true.
+     *
+     * @param value the string value to lookup.
+     * @return vector of EnumHandles, size 0 if no match found.
+     **/
+    virtual std::vector<EnumHandle> findFoldedEnums(const char * value) const = 0;
+
+    /**
+     * Given an enum handle, returns the string it refers to.
+     * This method will only have effect if @ref getBasicType() returns BasicType::STRING and
+     * @ref hasEnum() returns true.
+     *
+     * Effectively functions as the inverse of @ref findEnum(value, handle)
+     *
+     * @param e a valid enum handle
+     * @return enum string value, or nullptr if attribute type does
+     *         not support enum handle lookups.
+     */
+    virtual const char * getStringFromEnum(EnumHandle e) const = 0;
+
+    /**
+     * Creates a context for searching this attribute with the given term.
+     * The search context is used to create the actual search iterator.
+     *
+     * @param term the term to search for.
+     * @param params optional bitvector and diversity settings for the search.
+     * @return the search context.
+     **/
+    virtual std::unique_ptr<ISearchContext> createSearchContext(std::unique_ptr<QueryTermSimple> term,
+                                                                const SearchContextParams &params) const = 0;
+
+    /**
+     * Type-safe down-cast to an attribute supporting direct document weight iterators.
+     *
+     * @return document weight attribute or nullptr if not supported.
+     */
+    virtual const IDocumentWeightAttribute *asDocumentWeightAttribute() const = 0;
+
+    /**
+     * Type-safe down-cast to a tensor attribute.
+ * + * @return tensor attribute or nullptr if not supported. + */ + virtual const tensor::ITensorAttribute *asTensorAttribute() const = 0; + + /** + * Type-safe down-cast to a multi-value attribute. + * + * @return multi-value attribute or nullptr if not supported. + */ + virtual const IMultiValueAttribute* as_multi_value_attribute() const = 0; + + /** + * Returns the basic type of this attribute vector. + * + * @return basic type + **/ + virtual BasicType::Type getBasicType() const = 0; + + /** + * Returns the number of bytes a single value in this attribute occupies. + **/ + virtual size_t getFixedWidth() const = 0; + + /** + * Returns the collection type of this attribute vector. + * + * @return collection type + **/ + virtual CollectionType::Type getCollectionType() const = 0; + + /** + * Returns whether this is an integer attribute. + **/ + virtual bool isIntegerType() const { + BasicType::Type t = getBasicType(); + return t == BasicType::BOOL || + t == BasicType::UINT2 || + t == BasicType::UINT4 || + t == BasicType::INT8 || + t == BasicType::INT16 || + t == BasicType::INT32 || + t == BasicType::INT64; + } + + /** + * Returns whether this is a floating point attribute. + **/ + virtual bool isFloatingPointType() const { + BasicType::Type t = getBasicType(); + return t == BasicType::FLOAT || t == BasicType::DOUBLE; + } + + /** + * Returns whether this is a string attribute. + **/ + virtual bool isStringType() const { + return getBasicType() == BasicType::STRING; + } + + /** + * Returns whether this is a multi value attribute. + **/ + virtual bool hasMultiValue() const { + return getCollectionType() != CollectionType::SINGLE; + } + + /** + * Returns whether this is a weighted set attribute. + **/ + virtual bool hasWeightedSetType() const { + return getCollectionType() == CollectionType::WSET; + } + + /** + * Returns whether this attribute vector has underlying enum values. + * + * @return true if it has enum values. 
+     **/
+    virtual bool hasEnum() const = 0;
+
+    /**
+     * Returns whether the attribute vector is a filter attribute.
+     *
+     * @return true if attribute vector is a filter attribute.
+     */
+    virtual bool getIsFilter() const = 0;
+
+    /**
+     * Returns whether the attribute vector is marked as fast search.
+     *
+     * @return true if attribute vector is marked as fast search.
+     */
+    virtual bool getIsFastSearch() const = 0;
+
+    /**
+     * Returns the committed docid limit for the attribute.
+     *
+     * @return committed docid limit for the attribute.
+     */
+    virtual uint32_t getCommittedDocIdLimit() const = 0;
+
+    /**
+     * Returns whether the current attribute vector is an imported attribute
+     * vector.
+     */
+    virtual bool isImported() const = 0;
+
+    /**
+     * Will serialize the values for the documentid in ascending order. The serialized form can be used by memcmp and
+     * sortorder will be preserved.
+     * @param doc The document id to serialize for.
+     * @param serTo The buffer to serialize into.
+     * @param available Number of bytes available in the serialization buffer.
+     * @param bc An optional converter to use; forwarded unchanged to onSerializeForAscendingSort().
+     * @return The number of bytes serialized, -1 if not enough space.
+     */
+    long serializeForAscendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc=nullptr) const {
+        return onSerializeForAscendingSort(doc, serTo, available, bc);
+    }
+    /**
+     * Will serialize the values for the documentid in descending order. The serialized form can be used by memcmp and
+     * sortorder will be preserved.
+     * @param doc The document id to serialize for.
+     * @param serTo The buffer to serialize into.
+     * @param available Number of bytes available in the serialization buffer.
+     * @param bc An optional converter to use; forwarded unchanged to onSerializeForDescendingSort().
+     * @return The number of bytes serialized, -1 if not enough space.
+     */
+    long serializeForDescendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc=nullptr) const {
+        return onSerializeForDescendingSort(doc, serTo, available, bc);
+    }
+
+    /**
+     * Virtual destructor to allow safe subclassing.
+     **/
+    virtual ~IAttributeVector() = default;
+
+    /**
+     * This method is used to simulate sparseness in the single value attributes.
+     * @param doc The document id to verify if attribute has an undefined value for this document.
+     * @return true if value is undefined; this default implementation always returns false.
+     */
+    virtual bool isUndefined(DocId doc) const { (void) doc; return false; }
+
+private:
+    virtual long onSerializeForAscendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc) const = 0;
+    virtual long onSerializeForDescendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc) const = 0;
+
+};
+
+}
diff --git a/searchlib/src/vespa/searchcommon/attribute/multi_value_traits.h b/searchlib/src/vespa/searchcommon/attribute/multi_value_traits.h
new file mode 100644
index 00000000000..f03b031f991
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/attribute/multi_value_traits.h
@@ -0,0 +1,35 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <type_traits>
+
+namespace search::multivalue {
+
+template <typename T> class WeightedValue;
+
+/*
+ * Check for the presence of a weight.
+ */
+template <typename T>
+struct is_WeightedValue : std::false_type {};
+
+template <typename T>
+struct is_WeightedValue<WeightedValue<T>> : std::true_type {};
+
+template <typename T>
+inline constexpr bool is_WeightedValue_v = is_WeightedValue<T>::value;
+
+/*
+ * Extract inner type.
+ */ +template <typename T> +struct ValueType { using type = T; }; + +template <typename T> +struct ValueType<WeightedValue<T>> { using type = T; }; + +template <typename T> +using ValueType_t = typename ValueType<T>::type; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/multivalue.h b/searchlib/src/vespa/searchcommon/attribute/multivalue.h new file mode 100644 index 00000000000..2ed8309188e --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/multivalue.h @@ -0,0 +1,65 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> + +namespace search::multivalue { + +template <typename T> +class WeightedValue { +public: + WeightedValue() noexcept : _v(), _w(1) { } + WeightedValue(T v, int32_t w) noexcept : _v(v), _w(w) { } + T value() const noexcept { return _v; } + const T& value_ref() const noexcept { return _v; } + T& value_ref() noexcept { return _v; } + operator T () const noexcept { return _v; } + operator T & () noexcept { return _v; } + int32_t weight() const noexcept { return _w; } + + bool operator==(const WeightedValue<T> & rhs) const { return _v == rhs._v; } + bool operator <(const WeightedValue<T> & rhs) const { return _v < rhs._v; } + bool operator >(const WeightedValue<T> & rhs) const { return _v > rhs._v; } +private: + T _v; + int32_t _w; +}; + +template <typename T> +inline int32_t get_weight(const T&) noexcept { return 1; } + +template <typename T> +inline int32_t get_weight(const WeightedValue<T>& value) noexcept { return value.weight(); } + +template <typename T> +inline T get_value(const T& value) noexcept { return value; } + +template <typename T> +inline T get_value(const WeightedValue<T>& value) noexcept { return value.value(); } + +template <typename T> +inline const T& get_value_ref(const T& value) noexcept { return value; } + +template <typename T> +inline const T& get_value_ref(const WeightedValue<T>& value) noexcept { return 
value.value_ref(); }
+
+template <typename T>
+inline T& get_value_ref(T& value) noexcept { return value; }
+
+template <typename T>
+inline T& get_value_ref(WeightedValue<T>& value) noexcept { return value.value_ref(); }
+
+template <typename M>
+struct ValueBuilder
+{
+    static M build(M value, int32_t) noexcept { return value; }
+};
+
+template <typename T>
+struct ValueBuilder<WeightedValue<T>>
+{
+    static WeightedValue<T> build(T value, int32_t weight) noexcept { return WeightedValue<T>(value, weight); }
+};
+
+}
diff --git a/searchlib/src/vespa/searchcommon/attribute/persistent_predicate_params.h b/searchlib/src/vespa/searchcommon/attribute/persistent_predicate_params.h
new file mode 100644
index 00000000000..d81eb9c5d3c
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/attribute/persistent_predicate_params.h
@@ -0,0 +1,39 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+
+namespace search::attribute {
+
+/*
+ * Persistent parameters for predicate attributes.
+ * A default-constructed object has arity 8 and bounds spanning the whole int64_t range.
+ */
+class PersistentPredicateParams {
+    uint32_t _arity;
+    int64_t _lower_bound;
+    int64_t _upper_bound;
+
+public:
+    PersistentPredicateParams()
+        : _arity(8),
+          _lower_bound(std::numeric_limits<int64_t>::min()),
+          _upper_bound(std::numeric_limits<int64_t>::max())
+    {
+    }
+    uint32_t arity() const { return _arity; }
+    int64_t lower_bound() const { return _lower_bound; }
+    int64_t upper_bound() const { return _upper_bound; }
+    void setArity(uint32_t v) { _arity = v; }
+    void setBounds(int64_t lower, int64_t upper) { _lower_bound = lower; _upper_bound = upper; }
+
+    bool operator==(const PersistentPredicateParams &rhs) const {
+        return ((_arity == rhs._arity) &&
+                (_lower_bound == rhs._lower_bound) &&
+                (_upper_bound == rhs._upper_bound));
+    }
+};
+
+}
diff --git a/searchlib/src/vespa/searchcommon/attribute/predicate_params.h b/searchlib/src/vespa/searchcommon/attribute/predicate_params.h
new file mode 100644
index 00000000000..133b7331689
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/attribute/predicate_params.h
@@ -0,0 +1,30 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "persistent_predicate_params.h"
+
+namespace search::attribute {
+
+/*
+ * Parameters for predicate attributes.
+ */ +class PredicateParams : public PersistentPredicateParams +{ + double _dense_posting_list_threshold; +public: + PredicateParams() + : PersistentPredicateParams(), + _dense_posting_list_threshold(0.4) + { + } + + double dense_posting_list_threshold() const { return _dense_posting_list_threshold; } + void setDensePostingListThreshold(double v) { _dense_posting_list_threshold = v; } + bool operator==(const PredicateParams &rhs) const { + return (PersistentPredicateParams::operator==(rhs) && + (_dense_posting_list_threshold == rhs._dense_posting_list_threshold)); + } +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/search_context_params.cpp b/searchlib/src/vespa/searchcommon/attribute/search_context_params.cpp new file mode 100644 index 00000000000..2e8aba6f5f8 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/search_context_params.cpp @@ -0,0 +1,9 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "search_context_params.h" +#include <cstdint> +#include <limits> + +namespace search::attribute { + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/search_context_params.h b/searchlib/src/vespa/searchcommon/attribute/search_context_params.h new file mode 100644 index 00000000000..168f4215ef6 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/search_context_params.h @@ -0,0 +1,53 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstddef> +#include <limits> +#include <cstdint> + +namespace search::attribute { + +class IAttributeVector; + +/** + * Params used to specify diversity and bitvector settings when creating a search context. 
+ */ +class SearchContextParams { +private: + const IAttributeVector * _diversityAttribute; + uint32_t _diversityCutoffGroups; + bool _useBitVector; + bool _diversityCutoffStrict; + +public: + SearchContextParams() + : _diversityAttribute(nullptr), + _diversityCutoffGroups(std::numeric_limits<uint32_t>::max()), + _useBitVector(false), + _diversityCutoffStrict(false) + { } + bool useBitVector() const { return _useBitVector; } + const IAttributeVector * diversityAttribute() const { return _diversityAttribute; } + uint32_t diversityCutoffGroups() const { return _diversityCutoffGroups; } + bool diversityCutoffStrict() const { return _diversityCutoffStrict; } + + SearchContextParams &useBitVector(bool value) { + _useBitVector = value; + return *this; + } + SearchContextParams &diversityAttribute(const IAttributeVector *value) { + _diversityAttribute = value; + return *this; + } + SearchContextParams &diversityCutoffGroups(uint32_t groups) { + _diversityCutoffGroups = groups; + return *this; + } + SearchContextParams &diversityCutoffStrict(bool strict) { + _diversityCutoffStrict = strict; + return *this; + } +}; + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/status.cpp b/searchlib/src/vespa/searchcommon/attribute/status.cpp new file mode 100644 index 00000000000..a7d1f5b3d38 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/status.cpp @@ -0,0 +1,86 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "status.h" +#include <vespa/vespalib/util/atomic.h> + +using namespace vespalib::atomic; + +namespace search::attribute { + +Status::Status() + : _numDocs (0), + _numValues (0), + _numUniqueValues (0), + _allocated (0), + _used (0), + _dead (0), + _unused (0), + _onHold (0), + _onHoldMax (0), + _lastSyncToken (0), + _updates (0), + _nonIdempotentUpdates (0), + _bitVectors(0) +{ +} + +Status::Status(const Status& rhs) + : _numDocs(load_relaxed(rhs._numDocs)), + _numValues(load_relaxed(rhs._numValues)), + _numUniqueValues(load_relaxed(rhs._numUniqueValues)), + _allocated(load_relaxed(rhs._allocated)), + _used(load_relaxed(rhs._used)), + _dead(load_relaxed(rhs._dead)), + _unused(load_relaxed(rhs._unused)), + _onHold(load_relaxed(rhs._onHold)), + _onHoldMax(load_relaxed(rhs._onHoldMax)), + _lastSyncToken(rhs.getLastSyncToken()), + _updates(rhs._updates), + _nonIdempotentUpdates(rhs._nonIdempotentUpdates), + _bitVectors(rhs._bitVectors) +{ +} + +Status& +Status::operator=(const Status& rhs) +{ + store_relaxed(_numDocs, load_relaxed(rhs._numDocs)); + store_relaxed(_numValues, load_relaxed(rhs._numValues)); + store_relaxed(_numUniqueValues, load_relaxed(rhs._numUniqueValues)); + store_relaxed(_allocated, load_relaxed(rhs._allocated)); + store_relaxed(_used, load_relaxed(rhs._used)); + store_relaxed(_dead, load_relaxed(rhs._dead)); + store_relaxed(_unused, load_relaxed(rhs._unused)); + store_relaxed(_onHold, load_relaxed(rhs._onHold)); + store_relaxed(_onHoldMax, load_relaxed(rhs._onHoldMax)); + setLastSyncToken(rhs.getLastSyncToken()); + _updates = rhs._updates; + _nonIdempotentUpdates = rhs._nonIdempotentUpdates; + _bitVectors = rhs._bitVectors; + return *this; +} + +vespalib::string +Status::createName(vespalib::stringref index, vespalib::stringref attr) +{ + vespalib::string name (index); + name += ".attribute."; + name += attr; + return name; +} + +void +Status::updateStatistics(uint64_t numValues, uint64_t numUniqueValue, uint64_t allocated, + uint64_t 
used, uint64_t dead, uint64_t onHold) +{ + store_relaxed(_numValues, numValues); + store_relaxed(_numUniqueValues, numUniqueValue); + store_relaxed(_allocated, allocated); + store_relaxed(_used, used); + store_relaxed(_dead, dead); + store_relaxed(_unused, allocated - used); + store_relaxed(_onHold, onHold); + store_relaxed(_onHoldMax, std::max(load_relaxed(_onHoldMax), onHold)); +} + +} diff --git a/searchlib/src/vespa/searchcommon/attribute/status.h b/searchlib/src/vespa/searchcommon/attribute/status.h new file mode 100644 index 00000000000..f2212d4c76a --- /dev/null +++ b/searchlib/src/vespa/searchcommon/attribute/status.h @@ -0,0 +1,61 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <atomic> + +namespace search::attribute { + +class Status +{ +public: + Status(); + Status(const Status& rhs); + Status& operator=(const Status& rhs); + + void updateStatistics(uint64_t numValues, uint64_t numUniqueValue, uint64_t allocated, + uint64_t used, uint64_t dead, uint64_t onHold); + + uint64_t getNumDocs() const { return _numDocs.load(std::memory_order_relaxed); } + uint64_t getNumValues() const { return _numValues.load(std::memory_order_relaxed); } + uint64_t getNumUniqueValues() const { return _numUniqueValues.load(std::memory_order_relaxed); } + uint64_t getAllocated() const { return _allocated.load(std::memory_order_relaxed); } + uint64_t getUsed() const { return _used.load(std::memory_order_relaxed); } + uint64_t getDead() const { return _dead.load(std::memory_order_relaxed); } + uint64_t getOnHold() const { return _onHold.load(std::memory_order_relaxed); } + uint64_t getOnHoldMax() const { return _onHoldMax.load(std::memory_order_relaxed); } + // This might be accessed from other threads than the writer thread. 
+ uint64_t getLastSyncToken() const { return _lastSyncToken.load(std::memory_order_relaxed); } + uint64_t getUpdateCount() const { return _updates; } + uint64_t getNonIdempotentUpdateCount() const { return _nonIdempotentUpdates; } + uint32_t getBitVectors() const { return _bitVectors; } + + void setNumDocs(uint64_t v) { _numDocs.store(v, std::memory_order_relaxed); } + void incNumDocs() { _numDocs.store(_numDocs.load(std::memory_order_relaxed) + 1u, + std::memory_order_relaxed); } + void setLastSyncToken(uint64_t v) { _lastSyncToken.store(v, std::memory_order_relaxed); } + void incUpdates(uint64_t v=1) { _updates += v; } + void incNonIdempotentUpdates(uint64_t v = 1) { _nonIdempotentUpdates += v; } + void incBitVectors() { ++_bitVectors; } + void decBitVectors() { --_bitVectors; } + + static vespalib::string + createName(vespalib::stringref index, vespalib::stringref attr); +private: + std::atomic<uint64_t> _numDocs; + std::atomic<uint64_t> _numValues; + std::atomic<uint64_t> _numUniqueValues; + std::atomic<uint64_t> _allocated; + std::atomic<uint64_t> _used; + std::atomic<uint64_t> _dead; + std::atomic<uint64_t> _unused; + std::atomic<uint64_t> _onHold; + std::atomic<uint64_t> _onHoldMax; + std::atomic<uint64_t> _lastSyncToken; + uint64_t _updates; + uint64_t _nonIdempotentUpdates; + uint32_t _bitVectors; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/common/.gitignore b/searchlib/src/vespa/searchcommon/common/.gitignore new file mode 100644 index 00000000000..7e7c0fe7fae --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/.gitignore @@ -0,0 +1,2 @@ +/.depend +/Makefile diff --git a/searchlib/src/vespa/searchcommon/common/CMakeLists.txt b/searchlib/src/vespa/searchcommon/common/CMakeLists.txt new file mode 100644 index 00000000000..6cc02ae7884 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(searchcommon_searchcommon_common OBJECT
+    SOURCES
+    datatype.cpp
+    dictionary_config.cpp
+    growstrategy.cpp
+    schema.cpp
+    schemaconfigurer.cpp
+    DEPENDS
+)
diff --git a/searchlib/src/vespa/searchcommon/common/datatype.cpp b/searchlib/src/vespa/searchcommon/common/datatype.cpp
new file mode 100644
index 00000000000..1fe3a488aac
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/common/datatype.cpp
@@ -0,0 +1,105 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "datatype.h"
+#include <vespa/config/common/exceptions.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/vespalib/util/arraysize.h>
+
+namespace search::index::schema {
+
+using config::InvalidConfigException;
+
+DataType
+dataTypeFromName(vespalib::stringref name) {
+    if (name == "BOOL") { return DataType::BOOL; }
+    else if (name == "UINT2") { return DataType::UINT2; }
+    else if (name == "UINT4") { return DataType::UINT4; }
+    else if (name == "INT8") { return DataType::INT8; }
+    else if (name == "INT16") { return DataType::INT16; }
+    else if (name == "INT32") { return DataType::INT32; }
+    else if (name == "INT64") { return DataType::INT64; }
+    else if (name == "FLOAT") { return DataType::FLOAT; }
+    else if (name == "DOUBLE") { return DataType::DOUBLE; }
+    else if (name == "STRING") { return DataType::STRING; }
+    else if (name == "RAW") { return DataType::RAW; }
+    else if (name == "BOOLEANTREE") { return DataType::BOOLEANTREE; }
+    else if (name == "TENSOR") { return DataType::TENSOR; }
+    else if (name == "REFERENCE") { return DataType::REFERENCE; }
+    else {
+        throw InvalidConfigException("Illegal enum value '" + name + "'");
+    }
+}
+
+const char *datatype_str[] = { "BOOL",
+                               "UINT2",
+                               "UINT4",
+                               "INT8",
+                               "INT16",
+                               "INT32",
+                               "INT64",
+                               "FLOAT",
+                               "DOUBLE",
+                               "STRING",
+                               "RAW",
+                               "FEATURE_NOTUSED",
+                               "BOOLEANTREE",
+                               "TENSOR",
+                               "REFERENCE"};
+
+// Returns the name of the given data type, or "UNKNOWN(<n>)" when its numeric
+// value does not index an entry of datatype_str.
+vespalib::string
+getTypeName(DataType type) {
+    size_t typeAsNum = static_cast<size_t>(type);
+    // '>=' rather than '>': an index equal to the table size is already one past the end.
+    if (typeAsNum >= vespalib::arraysize(datatype_str)) {
+        vespalib::asciistream ost;
+        ost << "UNKNOWN(" << typeAsNum << ")";
+        return ost.str();
+    }
+    return datatype_str[typeAsNum];
+}
+
+std::ostream &
+operator<<(std::ostream &os, const DataType &type)
+{
+    os << getTypeName(type);
+    return os;
+}
+
+CollectionType
+collectionTypeFromName(vespalib::stringref name) {
+    if (name == "SINGLE") { return CollectionType::SINGLE; }
+    else if (name == "ARRAY") { return CollectionType::ARRAY; }
+    else if (name == "WEIGHTEDSET") { return CollectionType::WEIGHTEDSET; }
+    else {
+        throw InvalidConfigException("Illegal enum value '" + name + "'");
+    }
+}
+
+const char *collectiontype_str[] = { "SINGLE",
+                                     "ARRAY",
+                                     "WEIGHTEDSET" };
+
+// Returns the name of the given collection type, or "UNKNOWN(<n>)" when its
+// numeric value does not index an entry of collectiontype_str.
+vespalib::string
+getTypeName(CollectionType type) {
+    size_t typeAsNum = static_cast<size_t>(type);
+    // '>=' rather than '>': an index equal to the table size is already one past the end.
+    if (typeAsNum >= vespalib::arraysize(collectiontype_str)) {
+        vespalib::asciistream ost;
+        ost << "UNKNOWN(" << typeAsNum << ")";
+        return ost.str();
+    }
+    return collectiontype_str[typeAsNum];
+}
+
+std::ostream &
+operator<<(std::ostream &os, const CollectionType &type)
+{
+    os << getTypeName(type);
+    return os;
+}
+
+}
diff --git a/searchlib/src/vespa/searchcommon/common/datatype.h b/searchlib/src/vespa/searchcommon/common/datatype.h
new file mode 100644
index 00000000000..e1c6a44b620
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/common/datatype.h
@@ -0,0 +1,47 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace search::index::schema {
+
+/**
+ * Basic data type for a field.
+ **/
+enum class DataType {
+    BOOL = 0,
+    UINT2 = 1,
+    UINT4 = 2,
+    INT8 = 3,
+    INT16 = 4,
+    INT32 = 5,
+    INT64 = 6,
+    FLOAT = 7,
+    DOUBLE = 8,
+    STRING = 9,
+    RAW = 10,
+    //FEATURE = 11,
+    BOOLEANTREE = 12,
+    TENSOR = 13,
+    REFERENCE = 14
+};
+
+/**
+ * Collection type for a field.
+ **/ +enum class CollectionType { SINGLE = 0, + ARRAY = 1, + WEIGHTEDSET = 2 +}; + +DataType dataTypeFromName(vespalib::stringref name); +vespalib::string getTypeName(DataType type); +std::ostream &operator<<(std::ostream &os, const DataType &type); + +CollectionType collectionTypeFromName(vespalib::stringref n); +vespalib::string getTypeName(CollectionType type); +std::ostream &operator<<(std::ostream &os, const CollectionType &type); + + +} diff --git a/searchlib/src/vespa/searchcommon/common/dictionary_config.cpp b/searchlib/src/vespa/searchcommon/common/dictionary_config.cpp new file mode 100644 index 00000000000..e1b990e5660 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/dictionary_config.cpp @@ -0,0 +1,39 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "dictionary_config.h" +#include <ostream> +#include <cassert> + +namespace search { + +std::ostream& +operator<<(std::ostream& os, const DictionaryConfig & cfg) { + return os << cfg.getType() << "," << cfg.getMatch(); +} + +std::ostream& +operator<<(std::ostream& os, DictionaryConfig::Type type) { + + switch (type) { + case DictionaryConfig::Type::BTREE: + return os << "BTREE"; + case DictionaryConfig::Type::HASH: + return os << "HASH"; + case DictionaryConfig::Type::BTREE_AND_HASH: + return os << "BTREE_AND_HASH"; + } + assert(false); +} + +std::ostream& +operator<<(std::ostream& os, DictionaryConfig::Match match) { + switch(match) { + case DictionaryConfig::Match::CASED: + return os << "CASE_SENSTITIVE"; + case DictionaryConfig::Match::UNCASED: + return os << "CASE_INSENSTITIVE"; + } + assert(false); +} + +} // namespace search diff --git a/searchlib/src/vespa/searchcommon/common/dictionary_config.h b/searchlib/src/vespa/searchcommon/common/dictionary_config.h new file mode 100644 index 00000000000..f51341ad799 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/dictionary_config.h @@ -0,0 +1,31 @@ +// Copyright 
/**
 * Holds the settings needed to set up a suitable dictionary:
 * the dictionary type and the match mode.
 */
class DictionaryConfig {
public:
    enum class Type { BTREE, HASH, BTREE_AND_HASH };
    enum class Match { CASED, UNCASED };

    /** Defaults to Type::BTREE with Match::UNCASED. */
    DictionaryConfig() noexcept
        : DictionaryConfig(Type::BTREE, Match::UNCASED)
    {}

    /** Uses the given type with Match::UNCASED. */
    DictionaryConfig(Type type) noexcept
        : DictionaryConfig(type, Match::UNCASED)
    {}

    /** Uses exactly the given type and match mode. */
    DictionaryConfig(Type type, Match match) noexcept
        : _type(type),
          _match(match)
    {}

    Type getType() const { return _type; }
    Match getMatch() const { return _match; }

    /** Two configs are equal when both type and match mode agree. */
    bool operator == (const DictionaryConfig & b) const {
        if (_type != b._type) {
            return false;
        }
        return _match == b._match;
    }

private:
    Type  _type;
    Match _match;
};
+ +#include "growstrategy.h" +#include <iostream> + +namespace search { + +std::ostream& operator<<(std::ostream& os, const GrowStrategy& grow_strategy) +{ + os << "{docsInitialCapacity=" << grow_strategy.getDocsInitialCapacity() << + ", docsGrowFactor=" << grow_strategy.getDocsGrowFactor() << + ", docsGrowDelta=" << grow_strategy.getDocsGrowDelta() << + ", multiValueAllocGrowFactor=" << grow_strategy.getMultiValueAllocGrowFactor() << + "}"; + return os; +} + +} diff --git a/searchlib/src/vespa/searchcommon/common/growstrategy.h b/searchlib/src/vespa/searchcommon/common/growstrategy.h new file mode 100644 index 00000000000..b9b4a42cf72 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/growstrategy.h @@ -0,0 +1,61 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/util/growstrategy.h> +#include <cstdint> +#include <iosfwd> + +namespace search { + +class GrowStrategy +{ +private: + uint32_t _docsInitialCapacity; + float _docsGrowFactor; + uint32_t _docsGrowDelta; + float _multiValueAllocGrowFactor; +public: + GrowStrategy() noexcept + : GrowStrategy(1024, 0.5, 0, 0.2) + {} + GrowStrategy(uint32_t docsInitialCapacity, float docsGrowFactor, + uint32_t docsGrowDelta, float multiValueAllocGrowFactor) noexcept + : _docsInitialCapacity(docsInitialCapacity), + _docsGrowFactor(docsGrowFactor), + _docsGrowDelta(docsGrowDelta), + _multiValueAllocGrowFactor(multiValueAllocGrowFactor) + { + } + + static GrowStrategy make(uint32_t docsInitialCapacity, float docsGrowFactor, uint32_t docsGrowDelta) { + return GrowStrategy(docsInitialCapacity, docsGrowFactor, docsGrowDelta, 0.2); + } + + uint32_t getDocsInitialCapacity() const { return _docsInitialCapacity; } + uint32_t getDocsGrowPercent() const { return _docsGrowFactor*100; } + float getDocsGrowFactor() const { return _docsGrowFactor; } + uint32_t getDocsGrowDelta() const { return _docsGrowDelta; } + float 
getMultiValueAllocGrowFactor() const { return _multiValueAllocGrowFactor; } + void setDocsInitialCapacity(uint32_t v) { _docsInitialCapacity = v; } + void setDocsGrowDelta(uint32_t v) { _docsGrowDelta = v; } + + vespalib::GrowStrategy to_generic_strategy() const { + return vespalib::GrowStrategy(_docsInitialCapacity, _docsGrowFactor, _docsGrowDelta); + } + + bool operator==(const GrowStrategy & rhs) const { + return _docsInitialCapacity == rhs._docsInitialCapacity && + _docsGrowFactor == rhs._docsGrowFactor && + _docsGrowDelta == rhs._docsGrowDelta && + _multiValueAllocGrowFactor == rhs._multiValueAllocGrowFactor; + } + bool operator!=(const GrowStrategy & rhs) const { + return !(operator==(rhs)); + } +}; + +std::ostream& operator<<(std::ostream& os, const GrowStrategy& grow_strategy); + +} + diff --git a/searchlib/src/vespa/searchcommon/common/iblobconverter.h b/searchlib/src/vespa/searchcommon/common/iblobconverter.h new file mode 100644 index 00000000000..6581c3e5ccb --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/iblobconverter.h @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/util/buffer.h> +#include <memory> + +namespace search::common { + +class BlobConverter +{ +public: + using SP = std::shared_ptr<BlobConverter>; + using UP = std::unique_ptr<BlobConverter>; + using ConstBufferRef = vespalib::ConstBufferRef; + virtual ~BlobConverter() { } + ConstBufferRef convert(const ConstBufferRef & src) const { return onConvert(src); } +private: + virtual ConstBufferRef onConvert(const ConstBufferRef & src) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/common/range.h b/searchlib/src/vespa/searchcommon/common/range.h new file mode 100644 index 00000000000..ea2553c129b --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/range.h @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. 
/**
 * A closed interval [lower, upper] over an ordered type T.
 *
 * A default-constructed range is deliberately inverted (lower is the largest
 * representable value, upper the most negative one), so valid() is false.
 */
template <typename T>
class Range {
public:
    /**
     * Create an invalid (inverted) range.
     *
     * lowest() is used for the upper bound instead of min(): for integer
     * types the two are the same value, but for floating-point types min()
     * is the smallest positive value rather than the most negative one.
     */
    Range() :
        _lower(std::numeric_limits<T>::max()),
        _upper(std::numeric_limits<T>::lowest()) { }
    /** Create a range covering the single value v. */
    Range(T v) : _lower(v), _upper(v) { }
    /** Create the range [low, high]; the bounds are stored as given. */
    Range(T low, T high) : _lower(low), _upper(high) { }
    T lower() const { return _lower; }
    T upper() const { return _upper; }
    /** True when lower <= upper. */
    bool valid() const { return _lower <= _upper; }
    /** True when the range covers exactly one value. */
    bool isPoint() const { return _lower == _upper; }
private:
    T _lower;
    T _upper;
};
prefix << i << "].field[" << fss[i].getFields().size() << "]\n"; + vespalib::asciistream tmp; + tmp << prefix << i << "].field["; + for (size_t j = 0; j < fss[i].getFields().size(); ++j) { + os << tmp.str() << j << "].name " << fss[i].getFields()[j] << "\n"; + } + } +} + +struct FieldName { + vespalib::string name; + FieldName(const config::StringVector & lines) + : name(ConfigParser::parse<vespalib::string>("name", lines)) + { + } +}; + +template <typename T> +uint32_t +getFieldId(vespalib::stringref name, const T &map) +{ + typename T::const_iterator it = map.find(name); + return (it != map.end()) ? it->second : Schema::UNKNOWN_FIELD_ID; +} + +} // namespace + +namespace search::index { + +const uint32_t Schema::UNKNOWN_FIELD_ID(std::numeric_limits<uint32_t>::max()); + +Schema::Field::Field(vespalib::stringref n, DataType dt) noexcept + : Field(n, dt, schema::CollectionType::SINGLE, "") +{ +} + +Schema::Field::Field(vespalib::stringref n, DataType dt, CollectionType ct) noexcept + : Field(n, dt, ct, "") +{ +} + +Schema::Field::Field(vespalib::stringref n, DataType dt, CollectionType ct, vespalib::stringref tensor_spec) noexcept + : _name(n), + _dataType(dt), + _collectionType(ct), + _tensor_spec(tensor_spec) +{ +} + +// XXX: Resource leak if exception is thrown. 
+Schema::Field::Field(const config::StringVector & lines) + : _name(ConfigParser::parse<vespalib::string>("name", lines)), + _dataType(schema::dataTypeFromName(ConfigParser::parse<vespalib::string>("datatype", lines))), + _collectionType(schema::collectionTypeFromName(ConfigParser::parse<vespalib::string>("collectiontype", lines))) +{ +} + +Schema::Field::Field(const Field &) noexcept = default; +Schema::Field & Schema::Field::operator = (const Field &) noexcept = default; +Schema::Field::Field(Field &&) noexcept = default; +Schema::Field & Schema::Field::operator = (Field &&) noexcept = default; + +Schema::Field::~Field() = default; + +void +Schema::Field::write(vespalib::asciistream & os, vespalib::stringref prefix) const +{ + os << prefix << "name " << _name << "\n"; + os << prefix << "datatype " << getTypeName(_dataType) << "\n"; + os << prefix << "collectiontype " << getTypeName(_collectionType) << "\n"; +} + +bool +Schema::Field::operator==(const Field &rhs) const +{ + return _name == rhs._name && + _dataType == rhs._dataType && + _collectionType == rhs._collectionType && + _tensor_spec == rhs._tensor_spec; +} + +bool +Schema::Field::operator!=(const Field &rhs) const +{ + return !((*this) == rhs); +} + +Schema::IndexField::IndexField(vespalib::stringref name, DataType dt) noexcept + : Field(name, dt), + _avgElemLen(512), + _interleaved_features(false) +{ +} + +Schema::IndexField::IndexField(vespalib::stringref name, DataType dt, + CollectionType ct) noexcept + : Field(name, dt, ct), + _avgElemLen(512), + _interleaved_features(false) +{ +} + +Schema::IndexField::IndexField(const config::StringVector &lines) + : Field(lines), + _avgElemLen(ConfigParser::parse<int32_t>("averageelementlen", lines, 512)), + _interleaved_features(ConfigParser::parse<bool>("interleavedfeatures", lines, false)) +{ +} + +Schema::IndexField::IndexField(const IndexField &) noexcept = default; +Schema::IndexField & Schema::IndexField::operator = (const IndexField &) noexcept = default; 
+Schema::IndexField::IndexField(IndexField &&) noexcept = default; +Schema::IndexField & Schema::IndexField::operator = (IndexField &&) noexcept = default; + +void +Schema::IndexField::write(vespalib::asciistream & os, vespalib::stringref prefix) const +{ + Field::write(os, prefix); + os << prefix << "averageelementlen " << static_cast<int32_t>(_avgElemLen) << "\n"; + os << prefix << "interleavedfeatures " << (_interleaved_features ? "true" : "false") << "\n"; + + // TODO: Remove prefix, phrases and positions when breaking downgrade is no longer an issue. + os << prefix << "prefix false" << "\n"; + os << prefix << "phrases false" << "\n"; + os << prefix << "positions true" << "\n"; +} + +bool +Schema::IndexField::operator==(const IndexField &rhs) const +{ + return Field::operator==(rhs) && + _avgElemLen == rhs._avgElemLen && + _interleaved_features == rhs._interleaved_features; +} + +bool +Schema::IndexField::operator!=(const IndexField &rhs) const +{ + return Field::operator!=(rhs) || + _avgElemLen != rhs._avgElemLen || + _interleaved_features != rhs._interleaved_features; +} + +Schema::FieldSet::FieldSet(const config::StringVector & lines) : + _name(ConfigParser::parse<vespalib::string>("name", lines)), + _fields() +{ + std::vector<FieldName> fn = ConfigParser::parseArray<std::vector<FieldName>>("field", lines); + for (size_t i = 0; i < fn.size(); ++i) { + _fields.push_back(fn[i].name); + } +} + +Schema::FieldSet::FieldSet(const FieldSet &) = default; +Schema::FieldSet & Schema::FieldSet::operator = (const FieldSet &) = default; + +Schema::FieldSet::~FieldSet() = default; + +bool +Schema::FieldSet::operator==(const FieldSet &rhs) const +{ + return _name == rhs._name && + _fields == rhs._fields; +} + +bool +Schema::FieldSet::operator!=(const FieldSet &rhs) const +{ + return _name != rhs._name || + _fields != rhs._fields; +} + +void +Schema::writeToStream(vespalib::asciistream &os, bool saveToDisk) const +{ + writeFields(os, "attributefield", _attributeFields); + 
writeFields(os, "summaryfield", _summaryFields); + writeFieldSets(os, "fieldset", _fieldSets); + writeFields(os, "indexfield", _indexFields); + if (!saveToDisk) { + writeFields(os, "importedattributefields", _importedAttributeFields); + } +} + +Schema::Schema() = default; + +Schema::Schema(const Schema & rhs) = default; +Schema & Schema::operator=(const Schema & rhs) = default; +Schema::Schema(Schema && rhs) = default; +Schema & Schema::operator=(Schema && rhs) = default; +Schema::~Schema() = default; + +bool +Schema::loadFromFile(const vespalib::string & fileName) +{ + std::ifstream file(fileName.c_str()); + if (!file) { + LOG(warning, "Could not open input file '%s' as part of loadFromFile()", fileName.c_str()); + return false; + } + config::StringVector lines; + std::string tmpLine; + while (file) { + getline(file, tmpLine); + lines.push_back(tmpLine); + } + _indexFields = ConfigParser::parseArray<std::vector<IndexField>>("indexfield", lines); + _attributeFields = ConfigParser::parseArray<std::vector<AttributeField>>("attributefield", lines); + _summaryFields = ConfigParser::parseArray<std::vector<SummaryField>>("summaryfield", lines); + _fieldSets = ConfigParser::parseArray<std::vector<FieldSet>>("fieldset", lines); + _importedAttributeFields.clear(); // NOTE: these are not persisted to disk + _indexIds.clear(); + for (size_t i(0), m(_indexFields.size()); i < m; i++) { + _indexIds[_indexFields[i].getName()] = i; + } + _attributeIds.clear(); + for (size_t i(0), m(_attributeFields.size()); i < m; i++) { + _attributeIds[_attributeFields[i].getName()] = i; + } + _summaryIds.clear(); + for (size_t i(0), m(_summaryFields.size()); i < m; i++) { + _summaryIds[_summaryFields[i].getName()] = i; + } + _fieldSetIds.clear(); + for (size_t i(0), m(_fieldSets.size()); i < m; i++) { + _fieldSetIds[_fieldSets[i].getName()] = i; + } + _importedAttributeIds.clear(); + return true; +} + +bool +Schema::saveToFile(const vespalib::string & fileName) const +{ + vespalib::asciistream 
os; + writeToStream(os, true); + std::ofstream file(fileName.c_str()); + if (!file) { + LOG(warning, "Could not open output file '%s' as part of saveToFile()", fileName.c_str()); + return false; + } + file << os.str(); + file.close(); + if (file.fail()) { + LOG(warning, + "Could not write to output file '%s' as part of saveToFile()", + fileName.c_str()); + return false; + } + FastOS_File s; + s.OpenReadWrite(fileName.c_str()); + if (!s.IsOpened()) { + LOG(warning, "Could not open schema file '%s' for fsync", fileName.c_str()); + return false; + } else { + if (!s.Sync()) { + LOG(warning, "Could not fsync schema file '%s'", fileName.c_str()); + return false; + } + } + return true; +} + +vespalib::string +Schema::toString() const +{ + vespalib::asciistream os; + writeToStream(os, false); + return os.str(); +} + +namespace { +Schema::IndexField +cloneIndexField(const Schema::IndexField &field, + const vespalib::string &suffix) +{ + return Schema::IndexField(field.getName() + suffix, + field.getDataType(), + field.getCollectionType()). 
+ setAvgElemLen(field.getAvgElemLen()); +} + +template <typename T, typename M> +Schema & +addField(const T &field, Schema &self, + std::vector<T> &fields, M &name2id_map) +{ + name2id_map[field.getName()] = fields.size(); + fields.push_back(field); + return self; +} +} // namespace + +Schema & +Schema::addIndexField(const IndexField &field) +{ + return addField(field, *this, _indexFields, _indexIds); +} + +Schema & +Schema::addUriIndexFields(const IndexField &field) +{ + addIndexField(field); + addIndexField(cloneIndexField(field, ".scheme")); + addIndexField(cloneIndexField(field, ".host")); + addIndexField(cloneIndexField(field, ".port")); + addIndexField(cloneIndexField(field, ".path")); + addIndexField(cloneIndexField(field, ".query")); + addIndexField(cloneIndexField(field, ".fragment")); + addIndexField(cloneIndexField(field, ".hostname")); + return *this; +} + +Schema & +Schema::addAttributeField(const AttributeField &field) +{ + return addField(field, *this, _attributeFields, _attributeIds); +} + +Schema & +Schema::addSummaryField(const SummaryField &field) +{ + return addField(field, *this, _summaryFields, _summaryIds); +} + +Schema & +Schema::addImportedAttributeField(const ImportedAttributeField &field) +{ + return addField(field, *this, _importedAttributeFields, _importedAttributeIds); +} + +Schema & +Schema::addFieldSet(const FieldSet &fieldSet) +{ + return addField(fieldSet, *this, _fieldSets, _fieldSetIds); +} + +uint32_t +Schema::getIndexFieldId(vespalib::stringref name) const +{ + return getFieldId(name, _indexIds); +} + +uint32_t +Schema::getAttributeFieldId(vespalib::stringref name) const +{ + return getFieldId(name, _attributeIds); +} + +uint32_t +Schema::getSummaryFieldId(vespalib::stringref name) const +{ + return getFieldId(name, _summaryIds); +} + +uint32_t +Schema::getFieldSetId(vespalib::stringref name) const +{ + return getFieldId(name, _fieldSetIds); +} + +bool +Schema::isIndexField(vespalib::stringref name) const +{ + return 
_indexIds.find(name) != _indexIds.end(); +} + +bool +Schema::isSummaryField(vespalib::stringref name) const +{ + return _summaryIds.find(name) != _summaryIds.end(); +} + +bool +Schema::isAttributeField(vespalib::stringref name) const +{ + return _attributeIds.find(name) != _attributeIds.end(); +} + + +void +Schema::swap(Schema &rhs) +{ + _indexFields.swap(rhs._indexFields); + _attributeFields.swap(rhs._attributeFields); + _summaryFields.swap(rhs._summaryFields); + _fieldSets.swap(rhs._fieldSets); + _importedAttributeFields.swap(rhs._importedAttributeFields); + _indexIds.swap(rhs._indexIds); + _attributeIds.swap(rhs._attributeIds); + _summaryIds.swap(rhs._summaryIds); + _fieldSetIds.swap(rhs._fieldSetIds); + _importedAttributeIds.swap(rhs._importedAttributeIds); +} + +void +Schema::clear() +{ + _indexFields.clear(); + _attributeFields.clear(); + _summaryFields.clear(); + _fieldSets.clear(); + _importedAttributeFields.clear(); + _indexIds.clear(); + _attributeIds.clear(); + _summaryIds.clear(); + _fieldSetIds.clear(); + _importedAttributeIds.clear(); +} + +namespace { +// Helper class allowing the is_matching specialization to access the schema. 
+struct IntersectHelper { + Schema::UP schema; + IntersectHelper() : schema(new Schema) {} + + template <typename T> + bool is_matching(const T &t1, const T &t2) { return t1.matchingTypes(t2); } + + template <typename T, typename Map> + void intersect(const std::vector<T> &set1, const std::vector<T> &set2, + const Map &set2_map, + std::vector<T> &intersection, Map &intersection_map) { + for (typename std::vector<T>::const_iterator + it = set1.begin(); it != set1.end(); ++it) { + typename Map::const_iterator it2 = set2_map.find(it->getName()); + if (it2 != set2_map.end()) { + if (is_matching(*it, set2[it2->second])) { + intersection_map[it->getName()] = intersection.size(); + intersection.push_back(*it); + } + } + } + } +}; + +template <> +bool IntersectHelper::is_matching(const Schema::FieldSet &f1, const Schema::FieldSet &f2) { + if (f1.getFields() != f2.getFields()) + return false; + for (const vespalib::string & field : f1.getFields()) { + if (schema->getIndexFieldId(field) == Schema::UNKNOWN_FIELD_ID) { + return false; + } + } + return true; +} + +template <typename T, typename Map> +void addEntries(const std::vector<T> &entries, std::vector<T> &v, Map &name2id_map) { + for (const T & key : entries) { + if (name2id_map.find(key.getName()) == name2id_map.end()) { + name2id_map[key.getName()] = v.size(); + v.push_back(key); + } + } +} + +template <typename T, typename Map> +void difference(const std::vector<T> &minuend, const Map &subtrahend_map, + std::vector<T> &diff, Map &diff_map) { + for (const T & key : minuend){ + if (subtrahend_map.find(key.getName()) == subtrahend_map.end()) { + diff_map[key.getName()] = diff.size(); + diff.push_back(key); + } + } +} +} // namespace + +Schema::UP +Schema::intersect(const Schema &lhs, const Schema &rhs) +{ + IntersectHelper h; + h.intersect(lhs._indexFields, rhs._indexFields, rhs._indexIds, + h.schema->_indexFields, h.schema->_indexIds); + h.intersect(lhs._attributeFields, rhs._attributeFields, rhs._attributeIds, + 
h.schema->_attributeFields, h.schema->_attributeIds); + h.intersect(lhs._summaryFields, rhs._summaryFields, rhs._summaryIds, + h.schema->_summaryFields, h.schema->_summaryIds); + h.intersect(lhs._fieldSets, rhs._fieldSets, rhs._fieldSetIds, + h.schema->_fieldSets, h.schema->_fieldSetIds); + return std::move(h.schema); +} + +Schema::UP +Schema::make_union(const Schema &lhs, const Schema &rhs) +{ + Schema::UP schema(new Schema(lhs)); + addEntries(rhs._indexFields, schema->_indexFields, schema->_indexIds); + addEntries(rhs._attributeFields, schema->_attributeFields, schema->_attributeIds); + addEntries(rhs._summaryFields, schema->_summaryFields, schema->_summaryIds); + addEntries(rhs._fieldSets, schema->_fieldSets, schema->_fieldSetIds); + return schema; +} + +Schema::UP +Schema::set_difference(const Schema &lhs, const Schema &rhs) +{ + Schema::UP schema(new Schema); + difference(lhs._indexFields, rhs._indexIds, + schema->_indexFields, schema->_indexIds); + difference(lhs._attributeFields, rhs._attributeIds, + schema->_attributeFields, schema->_attributeIds); + difference(lhs._summaryFields, rhs._summaryIds, + schema->_summaryFields, schema->_summaryIds); + difference(lhs._fieldSets, rhs._fieldSetIds, + schema->_fieldSets, schema->_fieldSetIds); + return schema; +} + +bool +Schema::operator==(const Schema &rhs) const +{ + return _indexFields == rhs._indexFields && + _attributeFields == rhs._attributeFields && + _summaryFields == rhs._summaryFields && + _fieldSets == rhs._fieldSets && + _importedAttributeFields == rhs._importedAttributeFields; +} + +bool +Schema::operator!=(const Schema &rhs) const +{ + return _indexFields != rhs._indexFields || + _attributeFields != rhs._attributeFields || + _summaryFields != rhs._summaryFields || + _fieldSets != rhs._fieldSets || + _importedAttributeFields != rhs._importedAttributeFields; +} + +bool +Schema::empty() const +{ + return _indexFields.empty() && + _attributeFields.empty() && + _summaryFields.empty() && + 
_fieldSets.empty() && + _importedAttributeFields.empty(); +} + +} diff --git a/searchlib/src/vespa/searchcommon/common/schema.h b/searchlib/src/vespa/searchcommon/common/schema.h new file mode 100644 index 00000000000..3a9bcbdd904 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/schema.h @@ -0,0 +1,411 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "datatype.h" +#include <vespa/config/common/types.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vespa/vespalib/util/ptrholder.h> + +namespace vespalib { class asciistream; } +namespace search::index { + +/** + * Schema class used to give a high-level description of the content + * of an index. + **/ +class Schema +{ +public: + using UP = std::unique_ptr<Schema>; + using SP = std::shared_ptr<Schema>; + + using DataType = schema::DataType; + using CollectionType = schema::CollectionType; + + /** + * A single field has a name, data type and collection + * type. Various aspects (index/attribute/summary) may have + * limitations on what types are supported in the back-end. + **/ + class Field + { + vespalib::string _name; + DataType _dataType; + CollectionType _collectionType; + vespalib::string _tensor_spec; + + public: + Field(vespalib::stringref n, DataType dt) noexcept; + Field(vespalib::stringref n, DataType dt, CollectionType ct) noexcept; + Field(vespalib::stringref n, DataType dt, CollectionType ct, vespalib::stringref tensor_spec) noexcept; + + /** + * Create this field based on the given config lines. 
+ **/ + Field(const config::StringVector & lines); + Field(const Field &) noexcept; + Field & operator = (const Field &) noexcept; + Field(Field &&) noexcept; + Field & operator = (Field &&) noexcept; + + virtual ~Field(); + + virtual void + write(vespalib::asciistream & os, + vespalib::stringref prefix) const; + + const vespalib::string &getName() const { return _name; } + DataType getDataType() const { return _dataType; } + CollectionType getCollectionType() const { return _collectionType; } + const vespalib::string& get_tensor_spec() const { return _tensor_spec; } + + bool matchingTypes(const Field &rhs) const { + return getDataType() == rhs.getDataType() && + getCollectionType() == rhs.getCollectionType(); + } + + bool operator==(const Field &rhs) const; + bool operator!=(const Field &rhs) const; + }; + + + /** + * A representation of an index field with extra information on + * how the index should be generated. + **/ + class IndexField : public Field { + private: + uint32_t _avgElemLen; + // TODO: Remove when posting list format with interleaved features is made default + bool _interleaved_features; + + public: + IndexField(vespalib::stringref name, DataType dt) noexcept; + IndexField(vespalib::stringref name, DataType dt, CollectionType ct) noexcept; + IndexField(const IndexField &) noexcept; + IndexField & operator = (const IndexField &) noexcept; + IndexField(IndexField &&) noexcept; + IndexField & operator = (IndexField &&) noexcept; + /** + * Create this index field based on the given config lines. 
+ **/ + IndexField(const config::StringVector &lines); + + IndexField &setAvgElemLen(uint32_t avgElemLen) { _avgElemLen = avgElemLen; return *this; } + IndexField &set_interleaved_features(bool value) { + _interleaved_features = value; + return *this; + } + + void write(vespalib::asciistream &os, + vespalib::stringref prefix) const override; + + uint32_t getAvgElemLen() const { return _avgElemLen; } + bool use_interleaved_features() const { return _interleaved_features; } + + bool operator==(const IndexField &rhs) const; + bool operator!=(const IndexField &rhs) const; + }; + + using AttributeField = Field; + using SummaryField = Field; + using ImportedAttributeField = Field; + + /** + * A field collection has a name and a list of index field names, + * and is a named physical view over the list of index fields. + **/ + class FieldSet + { + vespalib::string _name; + std::vector<vespalib::string> _fields; + + public: + FieldSet(vespalib::stringref n) : _name(n), _fields() {} + FieldSet(const FieldSet &); + FieldSet & operator =(const FieldSet &); + FieldSet(FieldSet &&) noexcept = default; + FieldSet & operator =(FieldSet &&) noexcept = default; + + /** + * Create this field collection based on the given config lines. 
+ **/ + FieldSet(const config::StringVector & lines); + + ~FieldSet(); + + FieldSet &addField(vespalib::stringref fieldName) { + _fields.push_back(fieldName); + return *this; + } + + const vespalib::string &getName() const { return _name; } + const std::vector<vespalib::string> &getFields() const { + return _fields; + } + + bool operator==(const FieldSet &rhs) const; + bool operator!=(const FieldSet &rhs) const; + }; + + static const uint32_t UNKNOWN_FIELD_ID; + +private: + std::vector<IndexField> _indexFields; + std::vector<AttributeField> _attributeFields; + std::vector<SummaryField> _summaryFields; + std::vector<FieldSet> _fieldSets; + std::vector<ImportedAttributeField> _importedAttributeFields; + using Name2IdMap = vespalib::hash_map<vespalib::string, uint32_t>; + Name2IdMap _indexIds; + Name2IdMap _attributeIds; + Name2IdMap _summaryIds; + Name2IdMap _fieldSetIds; + Name2IdMap _importedAttributeIds; + + void writeToStream(vespalib::asciistream &os, bool saveToDisk) const; + +public: + /** + * Create an initially empty schema + **/ + Schema(); + Schema(const Schema & rhs); + Schema & operator=(const Schema & rhs); + Schema(Schema && rhs); + Schema & operator=(Schema && rhs); + ~Schema(); + + /** + * Load this schema from the file with the given name. + * + * @param fileName the name of the file. + * @return true if the schema could be loaded. + **/ + bool + loadFromFile(const vespalib::string & fileName); + + /** + * Save this schema to the file with the given name. + * + * @param fileName the name of the file. + * @return true if the schema could be saved. + **/ + bool + saveToFile(const vespalib::string & fileName) const; + + vespalib::string toString() const; + + /** + * Add an index field to this schema + * + * @param field the field to add + **/ + Schema & + addIndexField(const IndexField &field); + + // Only used by tests. 
    /**
     * Get information about a specific index field using the given fieldId.
     *
     * The id is used directly as an index into the index field vector and
     * is not range checked here.
     *
     * @return the field
     * @param fieldId an index in the range [0, size - 1].
     **/
    const IndexField &
    getIndexField(uint32_t fieldId) const
    {
        return _indexFields[fieldId];
    }
+ **/ + uint32_t getIndexFieldId(vespalib::stringref name) const; + + /** + * Check if a field is an index + * + * @return true if field is an index field. + * @param name the name of the field. + **/ + bool isIndexField(vespalib::stringref name) const; + + /** + * Check if a field is a summary field + * + * @return true if field is an summary field. + * @param name the name of the field. + **/ + bool isSummaryField(vespalib::stringref name) const; + + /** + * Check if a field is a attribute field + * + * @return true if field is an attribute field. + * @param name the name of the field. + **/ + bool isAttributeField(vespalib::stringref name) const; + + /** + * Get information about a specific attribute field using the given fieldId. + * + * @return the field + * @param idx an index in the range [0, size - 1]. + **/ + const AttributeField & + getAttributeField(uint32_t fieldId) const + { + return _attributeFields[fieldId]; + } + + /** + * Returns const view of the attribute fields. + */ + const std::vector<AttributeField> &getAttributeFields() const { + return _attributeFields; + } + + /** + * Get the field id for the attribute field with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field. + **/ + uint32_t getAttributeFieldId(vespalib::stringref name) const; + + /** + * Get information about a specific summary field using the given fieldId. + * + * @return the field + * @param idx an index in the range [0, size - 1] + **/ + const SummaryField & + getSummaryField(uint32_t fieldId) const + { + return _summaryFields[fieldId]; + } + + /** + * Returns const view of the summary fields. + */ + const std::vector<SummaryField> &getSummaryFields() const { + return _summaryFields; + } + + /** + * Get the field id for the summary field with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field. 
+ **/ + uint32_t getSummaryFieldId(vespalib::stringref name) const; + + /** + * Get information about a specific field set + * + * @return the field set. + * @param idx an index in the range [0, size - 1]. + **/ + const FieldSet & + getFieldSet(uint32_t idx) const + { + return _fieldSets[idx]; + } + + /** + * Get the field id for the field set with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field set. + **/ + uint32_t + getFieldSetId(vespalib::stringref name) const; + + const std::vector<ImportedAttributeField> &getImportedAttributeFields() const { + return _importedAttributeFields; + } + + void swap(Schema &rhs); + void clear(); + + static Schema::UP intersect(const Schema &lhs, const Schema &rhs); + static Schema::UP make_union(const Schema &lhs, const Schema &rhs); + static Schema::UP set_difference(const Schema &lhs, const Schema &rhs); + + bool operator==(const Schema &rhs) const; + bool operator!=(const Schema &rhs) const; + + bool empty() const; +}; + +} diff --git a/searchlib/src/vespa/searchcommon/common/schemaconfigurer.cpp b/searchlib/src/vespa/searchcommon/common/schemaconfigurer.cpp new file mode 100644 index 00000000000..8fbebe80b4b --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/schemaconfigurer.cpp @@ -0,0 +1,239 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "schemaconfigurer.h" +#include "subscriptionproxyng.h" +#include <vespa/config-attributes.h> +#include <vespa/config-imported-fields.h> +#include <vespa/config-indexschema.h> +#include <vespa/config-summary.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/searchcommon/attribute/basictype.h> + +#include <vespa/log/log.h> +LOG_SETUP(".index.schemaconfigurer"); + +using namespace config; +using namespace vespa::config::search; + +namespace search::index { + +using schema::DataType; +using schema::CollectionType; + +namespace { + +Schema::DataType +convertIndexDataType(const IndexschemaConfig::Indexfield::Datatype &type) +{ + switch (type) { + case IndexschemaConfig::Indexfield::Datatype::STRING: + return DataType::STRING; + case IndexschemaConfig::Indexfield::Datatype::INT64: + return DataType::INT64; + } + return DataType::STRING; +} + + +Schema::CollectionType +convertIndexCollectionType(const IndexschemaConfig::Indexfield::Collectiontype &type) +{ + switch (type) { + case IndexschemaConfig::Indexfield::Collectiontype::SINGLE: + return CollectionType::SINGLE; + case IndexschemaConfig::Indexfield::Collectiontype::ARRAY: + return CollectionType::ARRAY; + case IndexschemaConfig::Indexfield::Collectiontype::WEIGHTEDSET: + return CollectionType::WEIGHTEDSET; + } + return CollectionType::SINGLE; +} + +template <typename ConfigType> +Schema::DataType +convertDataType(const ConfigType &type) +{ + switch (type) { + case ConfigType::STRING: + return DataType::STRING; + case ConfigType::BOOL: + return DataType::BOOL; + case ConfigType::UINT2: + return DataType::UINT2; + case ConfigType::UINT4: + return DataType::UINT4; + case ConfigType::INT8: + return DataType::INT8; + case ConfigType::INT16: + return DataType::INT16; + case ConfigType::INT32: + return DataType::INT32; + case ConfigType::INT64: + return DataType::INT64; + case ConfigType::FLOAT: + return DataType::FLOAT; + case 
ConfigType::DOUBLE: + return DataType::DOUBLE; + case ConfigType::PREDICATE: + return DataType::BOOLEANTREE; + case ConfigType::TENSOR: + return DataType::TENSOR; + case ConfigType::REFERENCE: + return DataType::REFERENCE; + default: + break; + } + // TODO: exception? + return DataType::STRING; +} + +template <typename ConfigType> +Schema::CollectionType +convertCollectionType(const ConfigType &type) +{ + switch (type) { + case ConfigType::SINGLE: + return CollectionType::SINGLE; + case ConfigType::ARRAY: + return CollectionType::ARRAY; + case ConfigType::WEIGHTEDSET: + return CollectionType::WEIGHTEDSET; + } + return CollectionType::SINGLE; +} + + +Schema::DataType +convertSummaryType(const vespalib::string &type) +{ + if (type == "byte") { + return DataType::INT8; + } else if (type == "short") { + return DataType::INT16; + } else if (type == "integer") { + return DataType::INT32; + } else if (type == "int64") { + return DataType::INT64; + } else if (type == "float") { + return DataType::FLOAT; + } else if (type == "double") { + return DataType::DOUBLE; + } else if (type == "string" || + type == "longstring" || + type == "xmlstring" || + type == "featuredata" || + type == "jsonstring") + { + return DataType::STRING; + } else if (type == "data" || + type == "longdata") + { + return DataType::RAW; + } + return DataType::RAW; +} + +} + +void +SchemaBuilder::build(const IndexschemaConfig &cfg, Schema &schema) +{ + for (size_t i = 0; i < cfg.indexfield.size(); ++i) { + const IndexschemaConfig::Indexfield & f = cfg.indexfield[i]; + schema.addIndexField(Schema::IndexField(f.name, convertIndexDataType(f.datatype), + convertIndexCollectionType(f.collectiontype)). + setAvgElemLen(f.averageelementlen). 
+ set_interleaved_features(f.interleavedfeatures)); + } + for (size_t i = 0; i < cfg.fieldset.size(); ++i) { + const IndexschemaConfig::Fieldset &fs = cfg.fieldset[i]; + Schema::FieldSet toAdd(fs.name); + for (size_t j = 0; j < fs.field.size(); ++j) { + toAdd.addField(fs.field[j].name); + } + schema.addFieldSet(toAdd); + } +} + + +void +SchemaBuilder::build(const AttributesConfig &cfg, Schema &schema) +{ + for (const auto &attr : cfg.attribute) { + if (attr.imported) { + schema.addImportedAttributeField(Schema::ImportedAttributeField(attr.name, + convertDataType(attr.datatype), + convertCollectionType(attr.collectiontype))); + } else { + schema.addAttributeField(Schema::Field(attr.name, + convertDataType(attr.datatype), + convertCollectionType(attr.collectiontype))); + } + } +} + + +void +SchemaBuilder::build(const SummaryConfig &cfg, Schema &schema) +{ + for (size_t i = 0; i < cfg.classes.size(); ++i) { + LOG(debug, "class with index %lu has id %d (default has id %d)", + i, cfg.classes[i].id, cfg.defaultsummaryid); + } + for (size_t i = 0; i < cfg.classes.size(); ++i) { + // use the default summary class that has all fields + if (cfg.classes[i].id == cfg.defaultsummaryid) { + for (size_t j = 0; j < cfg.classes[i].fields.size(); ++j) { + const SummaryConfig::Classes::Fields & f = + cfg.classes[i].fields[j]; + schema.addSummaryField(Schema::Field(f.name, + convertSummaryType(f.type))); + } + return; + } + } + if (cfg.classes.empty()) { + LOG(debug, + "No summary class configured that match the default summary id %d", + cfg.defaultsummaryid); + } else { + LOG(warning, + "No summary class configured that match the default summary id %d", + cfg.defaultsummaryid); + } +} + +void +SchemaConfigurer::configure(const IndexschemaConfig &cfg) +{ + SchemaBuilder::build(cfg, _schema); +} + +void +SchemaConfigurer::configure(const AttributesConfig &cfg) +{ + SchemaBuilder::build(cfg, _schema); +} + +void +SchemaConfigurer::configure(const SummaryConfig & cfg) +{ + 
SchemaBuilder::build(cfg, _schema); +} + +SchemaConfigurer::SchemaConfigurer(Schema &schema, const vespalib::string &configId) + : _schema(schema) +{ + search::SubscriptionProxyNg<SchemaConfigurer, IndexschemaConfig> + indexSchemaSubscriber(*this, &SchemaConfigurer::configure); + search::SubscriptionProxyNg<SchemaConfigurer, AttributesConfig> + attributesSubscriber(*this, &SchemaConfigurer::configure); + search::SubscriptionProxyNg<SchemaConfigurer, SummaryConfig> + summarySubscriber(*this, &SchemaConfigurer::configure); + indexSchemaSubscriber.subscribe(configId.c_str()); + attributesSubscriber.subscribe(configId.c_str()); + summarySubscriber.subscribe(configId.c_str()); +} + +} diff --git a/searchlib/src/vespa/searchcommon/common/schemaconfigurer.h b/searchlib/src/vespa/searchcommon/common/schemaconfigurer.h new file mode 100644 index 00000000000..925aefcfa25 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/schemaconfigurer.h @@ -0,0 +1,68 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace vespa::config::search::internal { + class InternalIndexschemaType; + class InternalAttributesType; + class InternalSummaryType; +} + +namespace search::index { + +class Schema; + +/** + * Schema class used to give a high-level description of the content + * of an index. + **/ +class SchemaBuilder +{ +public: + using IndexschemaConfig = const vespa::config::search::internal::InternalIndexschemaType; + using AttributesConfig = const vespa::config::search::internal::InternalAttributesType; + using SummaryConfig = const vespa::config::search::internal::InternalSummaryType; + /** + * Build from indexschema config. + * + * @param indexCfg IndexschemaConfig to use + */ + static void build(const IndexschemaConfig &cfg, Schema &schema); + /** + * Build from attribute config. 
+ * + * @param attributeCfg AttributesConfig to use + **/ + static void build(const AttributesConfig &cfg, Schema &schema); + /** + * Build from summary config. + * + * @param summaryCfg SummaryConfig to use + **/ + static void build(const SummaryConfig &cfg, Schema &schema); + +}; + +class SchemaConfigurer +{ +private: + using IndexschemaConfig = SchemaBuilder::IndexschemaConfig; + using AttributesConfig = SchemaBuilder::AttributesConfig; + using SummaryConfig = SchemaBuilder::SummaryConfig; + Schema & _schema; + void configure(const IndexschemaConfig & cfg); + void configure(const AttributesConfig & cfg); + void configure(const SummaryConfig & cfg); + +public: + /** + * Load this schema from config using the given config id. + * + * @param configId the config id used to retrieve the relevant config. + **/ + SchemaConfigurer(Schema & schema, const vespalib::string &configId); +}; + +} diff --git a/searchlib/src/vespa/searchcommon/common/subscriptionproxyng.h b/searchlib/src/vespa/searchcommon/common/subscriptionproxyng.h new file mode 100644 index 00000000000..dd24480f689 --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/subscriptionproxyng.h @@ -0,0 +1,61 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <vespa/config/helper/legacysubscriber.hpp> + +namespace search { + +template <typename ME, typename CFG> +class SubscriptionProxyNg : public config::IFetcherCallback<CFG> +{ + typedef void (ME::*Method)(const CFG &cfg); + +private: + ME &_target; + Method _method; + std::unique_ptr<config::LegacySubscriber> _subscriber; + vespalib::string _cfgId; + + SubscriptionProxyNg(const SubscriptionProxyNg&); + SubscriptionProxyNg &operator=(const SubscriptionProxyNg&); + +public: + SubscriptionProxyNg(ME &target, Method method) + : _target(target), + _method(method), + _subscriber(), + _cfgId("") + { + } + virtual ~SubscriptionProxyNg() { + unsubscribe(); + } + const char *getConfigId() const { + return _cfgId.c_str(); + } + void subscribe(const char *configId) { + if (_subscriber) { + if (configId != nullptr && strcmp(configId, _subscriber->id().c_str()) == 0) + { + return; // same id; ignore + } else { + unsubscribe(); + } + } + if (configId != nullptr && configId[0] != '\0') { + _cfgId = configId; + _subscriber = std::make_unique<config::LegacySubscriber>(); + _subscriber->subscribe<CFG>(configId, this); + } + } + void unsubscribe() { + _subscriber.reset(); + _cfgId = ""; + } + void configure(std::unique_ptr<CFG> cfg) override { + (_target.*_method)(*cfg); + } +}; + +} // namespace search + diff --git a/searchlib/src/vespa/searchcommon/common/undefinedvalues.h b/searchlib/src/vespa/searchcommon/common/undefinedvalues.h new file mode 100644 index 00000000000..bbe3198a8dc --- /dev/null +++ b/searchlib/src/vespa/searchcommon/common/undefinedvalues.h @@ -0,0 +1,69 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <cmath> +#include <limits> +#include <vespa/vespalib/stllike/string.h> + +namespace search::attribute { + +// for all integers +template <typename T> +constexpr T getUndefined() { + return std::numeric_limits<T>::min(); +} + +template <> +inline constexpr float getUndefined<float>() { + return -std::numeric_limits<float>::quiet_NaN(); +} + +template <> +inline constexpr double getUndefined<double>() { + return -std::numeric_limits<double>::quiet_NaN(); +} + + +// for all signed integers +template <typename T> +bool isUndefined(const T & value) { + return value == getUndefined<T>(); +} + +template <> +inline bool isUndefined<uint8_t>(const uint8_t &) { + return false; +} + +template <> +inline bool isUndefined<uint16_t>(const uint16_t &) { + return false; +} + +template <> +inline bool isUndefined<uint32_t>(const uint32_t &) { + return false; +} + +template <> +inline bool isUndefined<uint64_t>(const uint64_t &) { + return false; +} + +template <> +inline bool isUndefined<float>(const float & value) { + return std::isnan(value); +} + +template <> +inline bool isUndefined<double>(const double & value) { + return std::isnan(value); +} + +template <> +inline bool isUndefined<vespalib::string>(const vespalib::string & value) { + return value.empty(); +} + +} diff --git a/searchlib/src/vespa/searchlib/CMakeLists.txt b/searchlib/src/vespa/searchlib/CMakeLists.txt index dac40e0ab5f..91813a17379 100644 --- a/searchlib/src/vespa/searchlib/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/CMakeLists.txt @@ -29,10 +29,12 @@ vespa_add_library(searchlib $<TARGET_OBJECTS:searchlib_tensor> $<TARGET_OBJECTS:searchlib_transactionlog> $<TARGET_OBJECTS:searchlib_util> + $<TARGET_OBJECTS:searchcommon_searchcommon_common> + $<TARGET_OBJECTS:searchcommon_searchcommon_attribute> INSTALL lib64 DEPENDS - staging_vespalib + vespalib ${VESPA_ATOMIC_LIB} ) diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp 
b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp index b991773c50f..51a4d392839 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp +++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp @@ -450,7 +450,10 @@ void LogDataStore::compactFile(FileId fileId) IWriteData::UP compacter; FileId destinationFileId = FileId::active(); if (_bucketizer) { - if ( ! shouldCompactToActiveFile(fc->getDiskFootprint() - fc->getDiskBloat())) { + size_t disk_footprint = fc->getDiskFootprint(); + size_t disk_bloat = fc->getDiskBloat(); + size_t compacted_size = (disk_footprint <= disk_bloat) ? 0u : (disk_footprint - disk_bloat); + if ( ! shouldCompactToActiveFile(compacted_size)) { MonitorGuard guard(_updateLock); destinationFileId = allocateFileId(guard); setNewFileChunk(guard, createWritableFile(destinationFileId, fc->getLastPersistedSerialNum(), fc->getNameId().next())); @@ -464,9 +467,8 @@ void LogDataStore::compactFile(FileId fileId) fc->appendTo(_executor, *this, *compacter, fc->getNumChunks(), nullptr, CpuCategory::COMPACT); - if (destinationFileId.isActive()) { - flushActiveAndWait(0); - } else { + flushActiveAndWait(0); + if (!destinationFileId.isActive()) { MonitorGuard guard(_updateLock); auto & compactTo = dynamic_cast<WriteableFileChunk &>(*_fileChunks[destinationFileId.getId()]); flushFileAndWait(std::move(guard), compactTo, 0); diff --git a/searchlib/src/vespa/searchlib/index/field_length_calculator.h b/searchlib/src/vespa/searchlib/index/field_length_calculator.h index 35a18b432f9..15d4c5ec285 100644 --- a/searchlib/src/vespa/searchlib/index/field_length_calculator.h +++ b/searchlib/src/vespa/searchlib/index/field_length_calculator.h @@ -15,7 +15,7 @@ namespace search::index { */ class FieldLengthCalculator { std::atomic<double> _average_field_length; - uint32_t _num_samples; // Capped by _max_num_samples + std::atomic<uint32_t> _num_samples; // Capped by _max_num_samples uint32_t _max_num_samples; public: @@ -39,7 +39,7 @@ public: } 
double get_average_field_length() const { return _average_field_length.load(std::memory_order_relaxed); } - uint32_t get_num_samples() const { return _num_samples; } + uint32_t get_num_samples() const { return _num_samples.load(std::memory_order_relaxed); } uint32_t get_max_num_samples() const { return _max_num_samples; } FieldLengthInfo get_info() const { @@ -47,10 +47,12 @@ public: } void add_field_length(uint32_t field_length) { - if (_num_samples < _max_num_samples) { - ++_num_samples; + auto num_samples = get_num_samples(); + if (num_samples < _max_num_samples) { + ++num_samples; + _num_samples.store(num_samples, std::memory_order_relaxed); } - _average_field_length.store((_average_field_length.load(std::memory_order_relaxed) * (_num_samples - 1) + field_length) / _num_samples, std::memory_order_relaxed); + _average_field_length.store((_average_field_length.load(std::memory_order_relaxed) * (num_samples - 1) + field_length) / num_samples, std::memory_order_relaxed); } }; |