diff options
Diffstat (limited to 'searchcommon')
47 files changed, 3457 insertions, 0 deletions
diff --git a/searchcommon/.gitignore b/searchcommon/.gitignore new file mode 100644 index 00000000000..a9b20e8992d --- /dev/null +++ b/searchcommon/.gitignore @@ -0,0 +1,2 @@ +Makefile +Testing diff --git a/searchcommon/CMakeLists.txt b/searchcommon/CMakeLists.txt new file mode 100644 index 00000000000..8418f32e736 --- /dev/null +++ b/searchcommon/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_define_module( + DEPENDS + fastos + vespalog + vespalib + config_cloudconfig + configdefinitions + + TESTS + src/tests/attribute/config + src/tests/schema + + LIBS + src/vespa/searchcommon + src/vespa/searchcommon/attribute + src/vespa/searchcommon/common + src/vespa/searchcommon/config +) diff --git a/searchcommon/OWNERS b/searchcommon/OWNERS new file mode 100644 index 00000000000..6b6bfc6e2ac --- /dev/null +++ b/searchcommon/OWNERS @@ -0,0 +1,3 @@ +geirst +balder +tegge diff --git a/searchcommon/src/.gitignore b/searchcommon/src/.gitignore new file mode 100644 index 00000000000..8b68901f2ce --- /dev/null +++ b/searchcommon/src/.gitignore @@ -0,0 +1,4 @@ +/Makefile.ini +/config_command.sh +/project.dsw +/searchcommon.mak diff --git a/searchcommon/src/testlist.txt b/searchcommon/src/testlist.txt new file mode 100644 index 00000000000..46279ceb830 --- /dev/null +++ b/searchcommon/src/testlist.txt @@ -0,0 +1,2 @@ +tests/attribute/config +tests/schema diff --git a/searchcommon/src/tests/.gitignore b/searchcommon/src/tests/.gitignore new file mode 100644 index 00000000000..a3e9c375723 --- /dev/null +++ b/searchcommon/src/tests/.gitignore @@ -0,0 +1,3 @@ +.depend +Makefile +*_test diff --git a/searchcommon/src/tests/attribute/config/.gitignore b/searchcommon/src/tests/attribute/config/.gitignore new file mode 100644 index 00000000000..ffdb7b1e933 --- /dev/null +++ b/searchcommon/src/tests/attribute/config/.gitignore @@ -0,0 +1 @@ +searchcommon_attribute_config_test_app diff 
--git a/searchcommon/src/tests/attribute/config/CMakeLists.txt b/searchcommon/src/tests/attribute/config/CMakeLists.txt new file mode 100644 index 00000000000..d0864c68240 --- /dev/null +++ b/searchcommon/src/tests/attribute/config/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchcommon_attribute_config_test_app + SOURCES + attribute_config_test.cpp + DEPENDS + searchcommon +) +vespa_add_test(NAME searchcommon_attribute_config_test_app NO_VALGRIND COMMAND searchcommon_attribute_config_test_app) diff --git a/searchcommon/src/tests/attribute/config/DESC b/searchcommon/src/tests/attribute/config/DESC new file mode 100644 index 00000000000..b98a0b64649 --- /dev/null +++ b/searchcommon/src/tests/attribute/config/DESC @@ -0,0 +1 @@ +search::attribute::Config test. Take a look at attribute_config_test.cpp for details. diff --git a/searchcommon/src/tests/attribute/config/FILES b/searchcommon/src/tests/attribute/config/FILES new file mode 100644 index 00000000000..90f22156a0a --- /dev/null +++ b/searchcommon/src/tests/attribute/config/FILES @@ -0,0 +1 @@ +attribute_config_test.cpp diff --git a/searchcommon/src/tests/attribute/config/attribute_config_test.cpp b/searchcommon/src/tests/attribute/config/attribute_config_test.cpp new file mode 100644 index 00000000000..3a7994ee39b --- /dev/null +++ b/searchcommon/src/tests/attribute/config/attribute_config_test.cpp @@ -0,0 +1,106 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/searchcommon/attribute/config.h> + +using search::attribute::Config; +using search::attribute::BasicType; +using search::attribute::CollectionType; +using vespalib::tensor::TensorType; + + +struct Fixture +{ + Config _config; + Fixture() + : _config() + { + } + + Fixture(BasicType bt, + CollectionType ct = CollectionType::SINGLE, + bool fastSearch_ = false, + bool huge_ = false) + : _config(bt, ct, fastSearch_, huge_) + { + } +}; + +TEST_F("test default attribute config", Fixture) +{ + EXPECT_EQUAL(BasicType::Type::NONE, f._config.basicType().type()); + EXPECT_EQUAL(CollectionType::Type::SINGLE, + f._config.collectionType().type()); + EXPECT_TRUE(!f._config.fastSearch()); + EXPECT_TRUE(!f._config.huge()); + EXPECT_TRUE(!f._config.getEnableBitVectors()); + EXPECT_TRUE(!f._config.getEnableOnlyBitVector()); + EXPECT_TRUE(!f._config.getIsFilter()); + EXPECT_TRUE(!f._config.fastAccess()); + EXPECT_TRUE(!f._config.tensorType().is_valid()); +} + +TEST_F("test integer weightedset attribute config", + Fixture(BasicType::Type::INT32, + CollectionType::Type::WSET)) +{ + EXPECT_EQUAL(BasicType::Type::INT32, f._config.basicType().type()); + EXPECT_EQUAL(CollectionType::Type::WSET, + f._config.collectionType().type()); + EXPECT_TRUE(!f._config.fastSearch()); + EXPECT_TRUE(!f._config.huge()); + EXPECT_TRUE(!f._config.getEnableBitVectors()); + EXPECT_TRUE(!f._config.getEnableOnlyBitVector()); + EXPECT_TRUE(!f._config.getIsFilter()); + EXPECT_TRUE(!f._config.fastAccess()); + EXPECT_TRUE(!f._config.tensorType().is_valid()); +} + + +TEST("test operator== on attribute config") +{ + Config cfg1(BasicType::Type::INT32, CollectionType::Type::WSET); + Config cfg2(BasicType::Type::INT32, CollectionType::Type::ARRAY); + Config cfg3(BasicType::Type::INT32, CollectionType::Type::WSET); + + EXPECT_TRUE(cfg1 != cfg2); + EXPECT_TRUE(cfg2 != cfg3); + EXPECT_TRUE(cfg1 == cfg3); +} + + +TEST("test 
operator== on attribute config for tensor type") +{ + Config cfg1(BasicType::Type::TENSOR); + Config cfg2(BasicType::Type::TENSOR); + Config cfg3(BasicType::Type::TENSOR); + + TensorType dense_x = TensorType::fromSpec("tensor(x[10])"); + TensorType sparse_x = TensorType::fromSpec("tensor(x{})"); + + // invalid tensors are not equal + EXPECT_TRUE(cfg1 != cfg2); + EXPECT_TRUE(cfg2 != cfg3); + EXPECT_TRUE(cfg1 != cfg3); + + cfg1.setTensorType(dense_x); + cfg3.setTensorType(dense_x); + EXPECT_EQUAL(dense_x, cfg1.tensorType()); + EXPECT_EQUAL(dense_x, cfg3.tensorType()); + EXPECT_TRUE(cfg1.tensorType().is_valid()); + EXPECT_TRUE(!cfg2.tensorType().is_valid()); + EXPECT_TRUE(cfg3.tensorType().is_valid()); + + EXPECT_TRUE(cfg1 != cfg2); + EXPECT_TRUE(cfg2 != cfg3); + EXPECT_TRUE(cfg1 == cfg3); + + cfg3.setTensorType(sparse_x); + EXPECT_EQUAL(sparse_x, cfg3.tensorType()); + EXPECT_TRUE(cfg3.tensorType().is_valid()); + EXPECT_TRUE(cfg1 != cfg3); +} + + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchcommon/src/tests/schema/.gitignore b/searchcommon/src/tests/schema/.gitignore new file mode 100644 index 00000000000..79e714aa5a2 --- /dev/null +++ b/searchcommon/src/tests/schema/.gitignore @@ -0,0 +1,4 @@ +/.depend +/Makefile +/schema_test +searchcommon_schema_test_app diff --git a/searchcommon/src/tests/schema/CMakeLists.txt b/searchcommon/src/tests/schema/CMakeLists.txt new file mode 100644 index 00000000000..0a600a55e7b --- /dev/null +++ b/searchcommon/src/tests/schema/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(searchcommon_schema_test_app + SOURCES + schema_test.cpp + DEPENDS + searchcommon +) +vespa_add_test(NAME searchcommon_schema_test_app NO_VALGRIND COMMAND searchcommon_schema_test_app) diff --git a/searchcommon/src/tests/schema/DESC b/searchcommon/src/tests/schema/DESC new file mode 100644 index 00000000000..e357c31742a --- /dev/null +++ b/searchcommon/src/tests/schema/DESC @@ -0,0 +1 @@ +schema test. Take a look at schema_test.cpp for details. diff --git a/searchcommon/src/tests/schema/FILES b/searchcommon/src/tests/schema/FILES new file mode 100644 index 00000000000..4688fde12ce --- /dev/null +++ b/searchcommon/src/tests/schema/FILES @@ -0,0 +1 @@ +schema.cpp diff --git a/searchcommon/src/tests/schema/attributes.cfg b/searchcommon/src/tests/schema/attributes.cfg new file mode 100644 index 00000000000..09f711b6a65 --- /dev/null +++ b/searchcommon/src/tests/schema/attributes.cfg @@ -0,0 +1,22 @@ +attribute[9] +attribute[0].name a +attribute[0].datatype STRING +attribute[0].collectiontype SINGLE +attribute[1].name b +attribute[1].datatype INT8 +attribute[1].collectiontype ARRAY +attribute[2].name c +attribute[2].datatype INT16 +attribute[2].collectiontype WEIGHTEDSET +attribute[3].name d +attribute[3].datatype INT32 +attribute[4].name e +attribute[4].datatype INT64 +attribute[5].name f +attribute[5].datatype FLOAT +attribute[6].name g +attribute[6].datatype DOUBLE +attribute[7].name h +attribute[7].datatype PREDICATE +attribute[8].name i +attribute[8].datatype TENSOR diff --git a/searchcommon/src/tests/schema/indexschema.cfg b/searchcommon/src/tests/schema/indexschema.cfg new file mode 100644 index 00000000000..989f30f7499 --- /dev/null +++ b/searchcommon/src/tests/schema/indexschema.cfg @@ -0,0 +1,26 @@ +indexfield[6] +indexfield[0].name a +indexfield[0].datatype STRING +indexfield[1].name b +indexfield[1].datatype INT64 +indexfield[2].name c +indexfield[2].datatype STRING +indexfield[2].prefix true +indexfield[2].phrases false 
+indexfield[2].positions false +indexfield[3].name e +indexfield[3].datatype BOOLEANTREE +indexfield[3].collectiontype SINGLE +indexfield[4].name f +indexfield[4].indextype RISE +indexfield[4].datatype STRING +indexfield[4].collectiontype WEIGHTEDSET +indexfield[5].name g +indexfield[5].indextype RISE +indexfield[5].datatype INT64 +indexfield[5].collectiontype WEIGHTEDSET +fieldset[1] +fieldset[0].name default +fieldset[0].field[2] +fieldset[0].field[0].name a +fieldset[0].field[1].name c diff --git a/searchcommon/src/tests/schema/schema_test.cpp b/searchcommon/src/tests/schema/schema_test.cpp new file mode 100644 index 00000000000..56154c7a7d4 --- /dev/null +++ b/searchcommon/src/tests/schema/schema_test.cpp @@ -0,0 +1,387 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include <vespa/fastos/fastos.h>
#include <vespa/log/log.h>
#include <vespa/vespalib/stllike/string.h>
#include <fstream>
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/config/common/configparser.h>
#include <vespa/searchcommon/common/schemaconfigurer.h>
LOG_SETUP("schema_test");

using vespalib::string;

namespace search {
namespace index {

// Expects that two fields agree on name, data type and collection type.
void assertField(const Schema::Field & exp, const Schema::Field & act) {
    EXPECT_EQUAL(exp.getName(), act.getName());
    EXPECT_EQUAL(exp.getDataType(), act.getDataType());
    EXPECT_EQUAL(exp.getCollectionType(), act.getCollectionType());
}

// As assertField(), and additionally compares the prefix / phrases /
// positions flags of two index fields.
void assertIndexField(const Schema::IndexField & exp,
                      const Schema::IndexField & act)
{
    assertField(exp, act);
    EXPECT_EQUAL(exp.hasPrefix(), act.hasPrefix());
    EXPECT_EQUAL(exp.hasPhrases(), act.hasPhrases());
    EXPECT_EQUAL(exp.hasPositions(), act.hasPositions());
}

// Expects that two field sets have the same name and list the same field
// names in the same order.
void assertSet(const Schema::FieldSet &exp,
               const Schema::FieldSet &act)
{
    EXPECT_EQUAL(exp.getName(), act.getName());
    // ASSERT (not EXPECT): the element-wise loop below indexes both vectors.
    ASSERT_EQUAL(exp.getFields().size(), act.getFields().size());
    for (size_t i = 0; i < exp.getFields().size(); ++i) {
        EXPECT_EQUAL(exp.getFields()[i], act.getFields()[i]);
    }
}

// Compares two schemas category by category (index, attribute, summary
// fields and field sets). Each count is asserted before the element-wise
// comparison that indexes both schemas.
void assertSchema(const Schema & exp, const Schema & act) {
    ASSERT_EQUAL(exp.getNumIndexFields(), act.getNumIndexFields());
    for (size_t i = 0; i < exp.getNumIndexFields(); ++i) {
        assertIndexField(exp.getIndexField(i), act.getIndexField(i));
    }
    ASSERT_EQUAL(exp.getNumAttributeFields(), act.getNumAttributeFields());
    for (size_t i = 0; i < exp.getNumAttributeFields(); ++i) {
        assertField(exp.getAttributeField(i), act.getAttributeField(i));
    }
    ASSERT_EQUAL(exp.getNumSummaryFields(), act.getNumSummaryFields());
    for (size_t i = 0; i < exp.getNumSummaryFields(); ++i) {
        assertField(exp.getSummaryField(i), act.getSummaryField(i));
    }
    ASSERT_EQUAL(exp.getNumFieldSets(), act.getNumFieldSets());
    for (size_t i = 0; i < exp.getNumFieldSets(); ++i) {
        assertSet(exp.getFieldSet(i), act.getFieldSet(i));
    }
}

// Builds a schema programmatically and reads every category back by
// position and by name.
TEST("testBasic") {
    Schema s;
    EXPECT_EQUAL(0u, s.getNumIndexFields());
    EXPECT_EQUAL(0u, s.getNumAttributeFields());
    EXPECT_EQUAL(0u, s.getNumSummaryFields());

    s.addIndexField(Schema::IndexField("foo", Schema::STRING));
    s.addIndexField(Schema::IndexField("bar", Schema::INT32));

    s.addAttributeField(Schema::AttributeField("foo", Schema::STRING, Schema::ARRAY));
    s.addAttributeField(Schema::AttributeField("bar", Schema::INT32, Schema::WEIGHTEDSET));
    s.addAttributeField(Schema::AttributeField("cox", Schema::STRING));

    s.addSummaryField(Schema::SummaryField("foo", Schema::STRING, Schema::ARRAY));
    s.addSummaryField(Schema::SummaryField("bar", Schema::INT32, Schema::WEIGHTEDSET));
    s.addSummaryField(Schema::SummaryField("cox", Schema::STRING));
    s.addSummaryField(Schema::SummaryField("fox", Schema::RAW));

    s.addFieldSet(Schema::FieldSet("default").
                  addField("foo").addField("bar"));

    EXPECT_EQUAL(2u, s.getNumIndexFields());
    {
        EXPECT_EQUAL("foo", s.getIndexField(0).getName());
        EXPECT_EQUAL(Schema::STRING, s.getIndexField(0).getDataType());
        EXPECT_EQUAL(Schema::SINGLE, s.getIndexField(0).getCollectionType());
        EXPECT_TRUE(!s.getIndexField(0).hasPrefix());
        EXPECT_TRUE(!s.getIndexField(0).hasPhrases());
        EXPECT_TRUE(s.getIndexField(0).hasPositions());

        EXPECT_EQUAL("bar", s.getIndexField(1).getName());
        EXPECT_EQUAL(Schema::INT32, s.getIndexField(1).getDataType());
        EXPECT_EQUAL(Schema::SINGLE, s.getIndexField(1).getCollectionType());

        EXPECT_EQUAL(0u, s.getIndexFieldId("foo"));
        EXPECT_EQUAL(1u, s.getIndexFieldId("bar"));
        EXPECT_EQUAL(Schema::UNKNOWN_FIELD_ID, s.getIndexFieldId("cox"));
    }
    EXPECT_EQUAL(3u, s.getNumAttributeFields());
    {
        EXPECT_EQUAL("foo", s.getAttributeField(0).getName());
        EXPECT_EQUAL(Schema::STRING, s.getAttributeField(0).getDataType());
        EXPECT_EQUAL(Schema::ARRAY,
                     s.getAttributeField(0).getCollectionType());

        EXPECT_EQUAL("bar", s.getAttributeField(1).getName());
        EXPECT_EQUAL(Schema::INT32, s.getAttributeField(1).getDataType());
        EXPECT_EQUAL(Schema::WEIGHTEDSET,
                     s.getAttributeField(1).getCollectionType());

        EXPECT_EQUAL("cox", s.getAttributeField(2).getName());
        EXPECT_EQUAL(Schema::STRING, s.getAttributeField(2).getDataType());
        EXPECT_EQUAL(Schema::SINGLE,
                     s.getAttributeField(2).getCollectionType());

        EXPECT_EQUAL(0u, s.getAttributeFieldId("foo"));
        EXPECT_EQUAL(1u, s.getAttributeFieldId("bar"));
        EXPECT_EQUAL(2u, s.getAttributeFieldId("cox"));
        // "fox" was only added as a summary field, so the *attribute* lookup
        // must miss. (This previously queried getIndexFieldId() and therefore
        // never exercised the attribute lookup for an unknown name.)
        EXPECT_EQUAL(Schema::UNKNOWN_FIELD_ID, s.getAttributeFieldId("fox"));
    }
    EXPECT_EQUAL(4u, s.getNumSummaryFields());
    {
        EXPECT_EQUAL("foo", s.getSummaryField(0).getName());
        EXPECT_EQUAL(Schema::STRING, s.getSummaryField(0).getDataType());
        EXPECT_EQUAL(Schema::ARRAY, s.getSummaryField(0).getCollectionType());

        EXPECT_EQUAL("bar", s.getSummaryField(1).getName());
        EXPECT_EQUAL(Schema::INT32, s.getSummaryField(1).getDataType());
        EXPECT_EQUAL(Schema::WEIGHTEDSET,
                     s.getSummaryField(1).getCollectionType());

        EXPECT_EQUAL("cox", s.getSummaryField(2).getName());
        EXPECT_EQUAL(Schema::STRING, s.getSummaryField(2).getDataType());
        EXPECT_EQUAL(Schema::SINGLE, s.getSummaryField(2).getCollectionType());

        EXPECT_EQUAL("fox", s.getSummaryField(3).getName());
        EXPECT_EQUAL(Schema::RAW, s.getSummaryField(3).getDataType());
        EXPECT_EQUAL(Schema::SINGLE, s.getSummaryField(3).getCollectionType());

        EXPECT_EQUAL(0u, s.getSummaryFieldId("foo"));
        EXPECT_EQUAL(1u, s.getSummaryFieldId("bar"));
        EXPECT_EQUAL(2u, s.getSummaryFieldId("cox"));
        EXPECT_EQUAL(3u, s.getSummaryFieldId("fox"));
        EXPECT_EQUAL(Schema::UNKNOWN_FIELD_ID, s.getSummaryFieldId("not"));
    }
    EXPECT_EQUAL(1u, s.getNumFieldSets());
    {
        EXPECT_EQUAL("default", s.getFieldSet(0).getName());
        EXPECT_EQUAL(2u, s.getFieldSet(0).getFields().size());
        EXPECT_EQUAL("foo", s.getFieldSet(0).getFields()[0]);
        EXPECT_EQUAL("bar", s.getFieldSet(0).getFields()[1]);
    }
}

// Round-trips a schema: config ("dir:.") -> Schema -> file -> Schema.
TEST("testLoadAndSave") {
    typedef Schema::IndexField SIF;
    typedef Schema::AttributeField SAF;
    typedef Schema::SummaryField SSF;
    typedef Schema SDT;
    typedef Schema SCT;
    typedef Schema::FieldSet SFS;

    { // load from config -> save to file -> load from file
        Schema s;
        SchemaConfigurer configurer(s, "dir:.");
        EXPECT_EQUAL(3u, s.getNumIndexFields());
        assertIndexField(SIF("a", SDT::STRING), s.getIndexField(0));
        assertIndexField(SIF("b", SDT::INT64), s.getIndexField(1));
        assertIndexField(SIF("c", SDT::STRING).setPrefix(true)
                         .setPhrases(false).setPositions(false),
                         s.getIndexField(2));

        EXPECT_EQUAL(9u, s.getNumAttributeFields());
        assertField(SAF("a", SDT::STRING, SCT::SINGLE),
                    s.getAttributeField(0));
        assertField(SAF("b", SDT::INT8, SCT::ARRAY), s.getAttributeField(1));
        assertField(SAF("c", SDT::INT16, SCT::WEIGHTEDSET),
                    s.getAttributeField(2));
        assertField(SAF("d", SDT::INT32), s.getAttributeField(3));
        assertField(SAF("e", SDT::INT64), s.getAttributeField(4));
        assertField(SAF("f", SDT::FLOAT), s.getAttributeField(5));
        assertField(SAF("g", SDT::DOUBLE), s.getAttributeField(6));
        assertField(SAF("h", SDT::BOOLEANTREE), s.getAttributeField(7));
        assertField(SAF("i", SDT::TENSOR), s.getAttributeField(8));

        EXPECT_EQUAL(12u, s.getNumSummaryFields());
        assertField(SSF("a", SDT::INT8), s.getSummaryField(0));
        assertField(SSF("b", SDT::INT16), s.getSummaryField(1));
        assertField(SSF("c", SDT::INT32), s.getSummaryField(2));
        assertField(SSF("d", SDT::INT64), s.getSummaryField(3));
        assertField(SSF("e", SDT::FLOAT), s.getSummaryField(4));
        assertField(SSF("f", SDT::DOUBLE), s.getSummaryField(5));
        assertField(SSF("g", SDT::STRING), s.getSummaryField(6));
        assertField(SSF("h", SDT::STRING), s.getSummaryField(7));
        assertField(SSF("i", SDT::STRING), s.getSummaryField(8));
        assertField(SSF("j", SDT::STRING), s.getSummaryField(9));
        assertField(SSF("k", SDT::RAW), s.getSummaryField(10));
        assertField(SSF("l", SDT::RAW), s.getSummaryField(11));

        EXPECT_EQUAL(1u, s.getNumFieldSets());
        assertSet(SFS("default").addField("a").addField("c"),
                  s.getFieldSet(0));

        Schema s2 = s;
        EXPECT_TRUE(s.saveToFile("schema.txt"));
        assertSchema(s, s2); // test copy constructor
        Schema s3;
        EXPECT_TRUE(s3.loadFromFile("schema.txt"));
        assertSchema(s, s3); // test that saved file is loaded correctly
        s3.addIndexField(SIF("foo", SDT::STRING));
        EXPECT_TRUE(s3.loadFromFile("schema.txt")); // load should clear the current content
        assertSchema(s, s3);
    }
    { // empty schema
        Schema s;
        EXPECT_TRUE(s.saveToFile("schema2.txt"));
        Schema s2;
        s2.addIndexField(SIF("foo", SDT::STRING));
        EXPECT_TRUE(s2.loadFromFile("schema2.txt"));
        assertSchema(s, s2);
    }
    { // load with error
        Schema s;
        EXPECT_TRUE(!s.loadFromFile("not.txt"));
        EXPECT_TRUE(!s.saveToFile("not/not.txt"));
    }
}

TEST("require that schema can save and load timestamps for fields") {
    const fastos::TimeStamp timestamp(42);
    const std::string file_name = "schema-with-timestamps.txt";
    Schema s;
    Schema::IndexField f("foo", Schema::STRING);
    f.setTimestamp(timestamp);
    s.addIndexField(f);
    ASSERT_TRUE(s.saveToFile(file_name));
    Schema s2;
    ASSERT_TRUE(s2.loadFromFile(file_name));
    ASSERT_EQUAL(1u, s2.getNumIndexFields());
    ASSERT_EQUAL(timestamp, s2.getIndexField(0).getTimestamp());
}

TEST("require that timestamps are omitted when 0.") {
    const std::string file_name = "schema-without-timestamps.txt";
    Schema s;
    s.addIndexField(Schema::IndexField("foo", Schema::STRING));
    ASSERT_TRUE(s.saveToFile(file_name));

    std::ifstream file(file_name.c_str());
    ASSERT_TRUE(file.good());
    // Only lines that were actually read are inspected; testing the stream
    // before reading would check one stale/empty line after EOF.
    std::string line;
    while (std::getline(file, line)) {
        EXPECT_NOT_EQUAL("indexfield[0].timestamp 0", line);
    }

    Schema s2;
    ASSERT_TRUE(s2.loadFromFile(file_name));
    ASSERT_EQUAL(1u, s2.getNumIndexFields());
}

// Adds one STRING index field, attribute field and summary field called
// 'name' (all stamped with 'timestamp') plus an empty field set of the
// same name.
void addAllFieldTypes(const string &name, Schema &schema,
                      fastos::TimeStamp timestamp) {
    Schema::IndexField index_field(name, Schema::STRING);
    index_field.setTimestamp(timestamp);
    schema.addIndexField(index_field);

    Schema::AttributeField attribute_field(name, Schema::STRING);
    attribute_field.setTimestamp(timestamp);
    schema.addAttributeField(attribute_field);

    Schema::SummaryField summary_field(name, Schema::STRING);
    summary_field.setTimestamp(timestamp);
    schema.addSummaryField(summary_field);

    schema.addFieldSet(Schema::FieldSet(name));
}

TEST("require that schemas can be added") {
    const string name1 = "foo";
    const string name2 = "bar";
    const fastos::TimeStamp timestamp1(42);
    const fastos::TimeStamp timestamp2(84);
    Schema s1;
    addAllFieldTypes(name1, s1, timestamp1);
    Schema s2;
    addAllFieldTypes(name2, s2, timestamp2);

    Schema::UP sum = Schema::make_union(s1, s2);
    ASSERT_EQUAL(2u, sum->getNumIndexFields());
    EXPECT_TRUE(s1.getIndexField(0) ==
                sum->getIndexField(sum->getIndexFieldId(name1)));
    EXPECT_TRUE(s2.getIndexField(0) ==
                sum->getIndexField(sum->getIndexFieldId(name2)));
    ASSERT_EQUAL(2u, sum->getNumAttributeFields());
    EXPECT_TRUE(s1.getAttributeField(0) ==
                sum->getAttributeField(sum->getAttributeFieldId(name1)));
    EXPECT_TRUE(s2.getAttributeField(0) ==
                sum->getAttributeField(sum->getAttributeFieldId(name2)));
    ASSERT_EQUAL(2u, sum->getNumSummaryFields());
    EXPECT_TRUE(s1.getSummaryField(0) ==
                sum->getSummaryField(sum->getSummaryFieldId(name1)));
    EXPECT_TRUE(s2.getSummaryField(0) ==
                sum->getSummaryField(sum->getSummaryFieldId(name2)));
    ASSERT_EQUAL(2u, sum->getNumFieldSets());
    EXPECT_TRUE(s1.getFieldSet(0) ==
                sum->getFieldSet(sum->getFieldSetId(name1)));
    EXPECT_TRUE(s2.getFieldSet(0) ==
                sum->getFieldSet(sum->getFieldSetId(name2)));
}

TEST("require that S union S = S for schema S") {
    Schema schema;
    addAllFieldTypes("foo", schema, 42);

    Schema::UP sum = Schema::make_union(schema, schema);
    EXPECT_TRUE(schema == *sum);
}

TEST("require that schema can calculate set_difference") {
    const string name1 = "foo";
    const string name2 = "bar";
    const fastos::TimeStamp timestamp1(42);
    const fastos::TimeStamp timestamp2(84);
    Schema s1;
    addAllFieldTypes(name1, s1, timestamp1);
    addAllFieldTypes(name2, s1, timestamp2);
    Schema s2;
    addAllFieldTypes(name2, s2, timestamp2);

    Schema::UP schema = Schema::set_difference(s1, s2);

    Schema expected;
    addAllFieldTypes(name1, expected, timestamp1);
    EXPECT_TRUE(expected == *schema);
}

TEST("require that getOldFields returns a subset of a schema") {
    Schema schema;
    const int64_t limit_timestamp = 1000;

    // "bar" is stamped just below the limit, "foo" just above it.
    addAllFieldTypes("bar", schema, fastos::TimeStamp(limit_timestamp - 1));
    addAllFieldTypes("foo", schema, fastos::TimeStamp(limit_timestamp + 1));

    Schema::UP old_fields =
        schema.getOldFields(fastos::TimeStamp(limit_timestamp));

    EXPECT_EQUAL(1u, old_fields->getNumIndexFields());
    EXPECT_EQUAL("bar", old_fields->getIndexField(0).getName());
    EXPECT_EQUAL(1u, old_fields->getNumAttributeFields());
    EXPECT_EQUAL(1u, old_fields->getNumSummaryFields());
}

TEST("require that schema can calculate intersection") {
    const string name1 = "foo";
    const string name2 = "bar";
    const string name3 = "baz";
    const fastos::TimeStamp timestamp1(42);
    const fastos::TimeStamp timestamp2(84);
    Schema s1;
    addAllFieldTypes(name1, s1, timestamp1);
    addAllFieldTypes(name2, s1, timestamp2);
    Schema s2;
    addAllFieldTypes(name2, s2, timestamp2);
    addAllFieldTypes(name3, s2, timestamp2);

    Schema::UP schema = Schema::intersect(s1, s2);

    Schema expected;
    addAllFieldTypes(name2, expected, timestamp2);
    EXPECT_TRUE(expected == *schema);
}

TEST("require that incompatible fields are removed from intersection") {
    const string name = "foo";
    Schema s1;
    s1.addIndexField(Schema::IndexField(name, Schema::STRING));
    Schema s2;
    // Same name, different data type.
    s2.addIndexField(Schema::IndexField(name, Schema::INT32));
    Schema::UP schema = Schema::intersect(s1, s2);
    EXPECT_EQUAL(0u, schema->getNumIndexFields());
    EXPECT_FALSE(schema->isIndexField(name));
}

}  // namespace index
}  // namespace search

TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchcommon/src/tests/schema/summary.cfg b/searchcommon/src/tests/schema/summary.cfg new file mode 100644 index 00000000000..0c2de33d076 --- /dev/null +++ b/searchcommon/src/tests/schema/summary.cfg @@ -0,0 +1,29 @@ +defaultsummaryid 0 +classes[1] +classes[0].id 0 +classes[0].name test +classes[0].fields[12] +classes[0].fields[0].name a +classes[0].fields[0].type byte +classes[0].fields[1].name b +classes[0].fields[1].type short +classes[0].fields[2].name c +classes[0].fields[2].type integer +classes[0].fields[3].name d +classes[0].fields[3].type int64 +classes[0].fields[4].name e +classes[0].fields[4].type float +classes[0].fields[5].name f +classes[0].fields[5].type double 
+classes[0].fields[6].name g +classes[0].fields[6].type string +classes[0].fields[7].name h +classes[0].fields[7].type longstring +classes[0].fields[8].name i +classes[0].fields[8].type xmlstring +classes[0].fields[9].name j +classes[0].fields[9].type jsonstring +classes[0].fields[10].name k +classes[0].fields[10].type data +classes[0].fields[11].name l +classes[0].fields[11].type longdata diff --git a/searchcommon/src/vespa/searchcommon/.gitignore b/searchcommon/src/vespa/searchcommon/.gitignore new file mode 100644 index 00000000000..f76a9d84bed --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/.gitignore @@ -0,0 +1,3 @@ +/.depend +/Makefile +/libsearchcommon.so.5.1 diff --git a/searchcommon/src/vespa/searchcommon/CMakeLists.txt b/searchcommon/src/vespa/searchcommon/CMakeLists.txt new file mode 100644 index 00000000000..fa17af628ef --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchcommon + SOURCES + $<TARGET_OBJECTS:searchcommon_searchcommon_common> + $<TARGET_OBJECTS:searchcommon_searchcommon_attribute> + INSTALL lib64 + DEPENDS +) diff --git a/searchcommon/src/vespa/searchcommon/attribute/.gitignore b/searchcommon/src/vespa/searchcommon/attribute/.gitignore new file mode 100644 index 00000000000..7e7c0fe7fae --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/.gitignore @@ -0,0 +1,2 @@ +/.depend +/Makefile diff --git a/searchcommon/src/vespa/searchcommon/attribute/CMakeLists.txt b/searchcommon/src/vespa/searchcommon/attribute/CMakeLists.txt new file mode 100644 index 00000000000..5343a9eac69 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(searchcommon_searchcommon_attribute OBJECT + SOURCES + basictype.cpp + collectiontype.cpp + config.cpp + status.cpp + DEPENDS +) diff --git a/searchcommon/src/vespa/searchcommon/attribute/attributecontent.h b/searchcommon/src/vespa/searchcommon/attribute/attributecontent.h new file mode 100644 index 00000000000..60471b77608 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/attributecontent.h @@ -0,0 +1,172 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "iattributevector.h" +#include <stdint.h> + +namespace search { +namespace attribute { + + +/** + * This class is wrapping an array of type T and is used to hold the + * attribute vector content for a given document. The values stored for the + * given document in the attribute vector is copied into the array wrapped + * in an instance of this class. + * + * @param T the type of the data stored in this object + **/ +template <typename T> +class AttributeContent +{ +private: + T _staticBuf[16]; + T * _dynamicBuf; + uint32_t _size; + uint32_t _capacity; + + AttributeContent(const AttributeContent & rhs); + AttributeContent & operator=(const AttributeContent & rhs); + +public: + /** + * Creates a new object with an initial capacity of 16 without dynamic allocation. + **/ + AttributeContent() : + _dynamicBuf(NULL), + _size(0), + _capacity(16) + { + } + /** + * Destructs the object. + **/ + ~AttributeContent() { + if (_dynamicBuf != NULL) { + delete [] _dynamicBuf; + } + } + + /** + * Returns a read-only iterator to the beginning of the underlying data array. + * + * @return iterator + **/ + const T * begin() const { + if (_dynamicBuf != NULL) { + return _dynamicBuf; + } + return _staticBuf; + } + + /** + * Returns a read-only iterator to the end of the underlying data array. 
+ * + * @return iterator + **/ + const T * end() const { + return begin() + _size; + } + + /** + * Returns the element at the given position in the underlying data array. + * + * @return read-only reference to the element + * @param idx position into the underlying data + **/ + const T & operator[](uint32_t idx) const { + return *(begin() + idx); + } + + /** + * Returns the number of elements used in the underlying data array. + * + * @return number of elements used + **/ + uint32_t size() const { + return _size; + } + + /** + * Returns the number of elements allocated in the underlying data array. + * + * @return number of elements allocated + **/ + uint32_t capacity() const { + return _capacity; + } + + /** + * Returns a read/write pointer to the underlying data array. + * + * @return read/write pointer. + **/ + T * data() { + if (_dynamicBuf != NULL) { + return _dynamicBuf; + } + return _staticBuf; + } + + /** + * Sets the number of elements used in the underlying data array. + * + * @param n number of elements used + **/ + void setSize(uint32_t n) { + _size = n; + } + + /** + * Allocates memory so that the underlying data array can hold the + * given number of elements (capacity) and sets the size to 0. + * A new data array will only be allocated if n > capacity(). + * + * @param n wanted number of elements + **/ + void allocate(uint32_t n) { + if (n > _capacity) { + if (_dynamicBuf != NULL) { + delete [] _dynamicBuf; + } + _dynamicBuf = new T[n]; + _capacity = n; + _size = 0; + } + } + + /** + * Fill this buffer with the content of the given attribute vector for the given docId. 
+ * + * @param attribute the attribute vector + * @param docId the docId + **/ + void fill(const search::attribute::IAttributeVector & attribute, + search::attribute::IAttributeVector::DocId docId) + { + uint32_t count = attribute.get(docId, data(), capacity()); + if (count > capacity()) { + allocate(count); + count = attribute.get(docId, data(), capacity()); + } + setSize(count); + } +}; + + +typedef AttributeContent<double> FloatContent; +typedef AttributeContent<const char *> ConstCharContent; +typedef AttributeContent<IAttributeVector::largeint_t> IntegerContent; +typedef AttributeContent<IAttributeVector::EnumHandle> EnumContent; +typedef AttributeContent<IAttributeVector::WeightedInt> WeightedIntegerContent; +typedef AttributeContent<IAttributeVector::WeightedFloat> WeightedFloatContent; +typedef AttributeContent<IAttributeVector::WeightedConstChar> WeightedConstCharContent; +typedef AttributeContent<IAttributeVector::WeightedString> WeightedStringContent; +typedef AttributeContent<IAttributeVector::WeightedEnum> WeightedEnumContent; +typedef IAttributeVector::EnumHandle EnumHandle; + + +} // namespace attribute +} // namespace search + diff --git a/searchcommon/src/vespa/searchcommon/attribute/basictype.cpp b/searchcommon/src/vespa/searchcommon/attribute/basictype.cpp new file mode 100644 index 00000000000..b1a4539ebb8 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/basictype.cpp @@ -0,0 +1,40 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/searchcommon/attribute/basictype.h> +#include <vespa/vespalib/util/exceptions.h> + +namespace search { +namespace attribute { + +const BasicType::TypeInfo BasicType::_typeTable[BasicType::MAX_TYPE] = { + { BasicType::NONE, 0, "none" }, + { BasicType::STRING, 0, "string" }, + { BasicType::UINT1, sizeof(int8_t), "uint1" }, + { BasicType::UINT2, sizeof(int8_t), "uint2" }, + { BasicType::UINT4, sizeof(int8_t), "uint4" }, + { BasicType::INT8, sizeof(int8_t), "int8" }, + { BasicType::INT16, sizeof(int16_t), "int16" }, + { BasicType::INT32, sizeof(int32_t), "int32" }, + { BasicType::INT64, sizeof(int64_t), "int64" }, + { BasicType::FLOAT, sizeof(float), "float" }, + { BasicType::DOUBLE, sizeof(double), "double" }, + { BasicType::PREDICATE, 0, "predicate" }, + { BasicType::TENSOR, 0, "tensor" } +}; + +BasicType::Type +BasicType::asType(const vespalib::string &t) +{ + for (size_t i(0); i < sizeof(_typeTable)/sizeof(_typeTable[0]); i++) { + if (t == _typeTable[i]._name) { + return _typeTable[i]._type; + } + } + throw vespalib::IllegalStateException(t + + " not recognized as " + "valid attribute data type"); + return NONE; +} + +} +} diff --git a/searchcommon/src/vespa/searchcommon/attribute/basictype.h b/searchcommon/src/vespa/searchcommon/attribute/basictype.h new file mode 100644 index 00000000000..26b17c46f60 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/basictype.h @@ -0,0 +1,69 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search { +namespace attribute { + +class BasicType +{ + public: + enum Type { + NONE = 0, + STRING = 1, + UINT1 = 2, + UINT2 = 3, + UINT4 = 4, + INT8 = 5, + INT16 = 6, + INT32 = 7, + INT64 = 8, + FLOAT = 9, + DOUBLE = 10, + PREDICATE = 11, + TENSOR = 12, + MAX_TYPE + }; + + explicit + BasicType(int t) : _type(Type(t)) { } + explicit + BasicType(unsigned int t) : _type(Type(t)) { } + BasicType(Type t) : _type(t) { } + explicit + BasicType(const vespalib::string & t) : _type(asType(t)) { } + + Type type() const { return _type; } + const char * asString() const { return asString(_type); } + bool isUnsigned() const { return isUnsigned(_type); } + size_t fixedSize() const { return fixedSize(_type); } + static BasicType fromType(int8_t) { return INT8; } + static BasicType fromType(int16_t) { return INT16; } + static BasicType fromType(int32_t) { return INT32; } + static BasicType fromType(int64_t) { return INT64; } + static BasicType fromType(float) { return FLOAT; } + static BasicType fromType(double) { return DOUBLE; } + bool operator==(const BasicType &b) const { return _type == b._type; } + bool operator!=(const BasicType &b) const { return _type != b._type; } + + private: + static const char * asString(Type t) { return _typeTable[t]._name; } + static bool isUnsigned(Type t) { return _typeTable[t]._name[0] == 'u'; } + static size_t fixedSize(Type t) { return _typeTable[t]._fixedSize; } + static Type asType(const vespalib::string & t); + + Type _type; + + struct TypeInfo { + Type _type; + unsigned int _fixedSize; + const char * _name; + }; + static const TypeInfo _typeTable[MAX_TYPE]; +}; + +} +} + diff --git a/searchcommon/src/vespa/searchcommon/attribute/collectiontype.cpp b/searchcommon/src/vespa/searchcommon/attribute/collectiontype.cpp new file mode 100644 index 00000000000..33a79fd2929 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/collectiontype.cpp @@ -0,0 +1,30 @@ 
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/vespalib/util/exceptions.h> + +namespace search { +namespace attribute { + +const CollectionType::TypeInfo CollectionType::_typeTable[CollectionType::MAX_TYPE] = { + { CollectionType::SINGLE, "single" }, + { CollectionType::ARRAY, "array" }, + { CollectionType::WSET, "weightedset" } +}; + +CollectionType::Type +CollectionType::asType(const vespalib::string &t) +{ + for (size_t i(0); i < sizeof(_typeTable)/sizeof(_typeTable[0]); i++) { + if (t == _typeTable[i]._name) { + return _typeTable[i]._type; + } + } + throw vespalib::IllegalStateException(t + + " not recognized as valid attribute " + "collection type"); + return SINGLE; +} + +} +} diff --git a/searchcommon/src/vespa/searchcommon/attribute/collectiontype.h b/searchcommon/src/vespa/searchcommon/attribute/collectiontype.h new file mode 100644 index 00000000000..045c344dec1 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/collectiontype.h @@ -0,0 +1,78 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search { +namespace attribute { + +class CollectionType +{ + public: + enum Type { + /** + * Single value type with one value stored for each document. + **/ + SINGLE = 0, + /** + * Array type with zero to n values stored for each document. + **/ + ARRAY = 1, + /** + * Weighted set type with zero to n unique values stored for each document. + * In addition each unique value is accociated with a weight. 
+ **/ + WSET = 2, + MAX_TYPE + }; + + CollectionType(Type t = SINGLE, bool remove = false, bool create = false) : + _type(t), + _removeIfZero(remove), + _createIfNonExistant(create) + { + } + + explicit + CollectionType(const vespalib::string & t, bool remove = false, bool create = false) : + _type(asType(t)), + _removeIfZero(remove), + _createIfNonExistant(create) + { + } + + Type type() const { return _type; } + bool isMultiValue() const { return _type != SINGLE; } + bool isWeightedSet() const { return _type == WSET; } + bool isArray() const { return _type == ARRAY; } + bool removeIfZero() const { return _removeIfZero; } + bool createIfNonExistant() const { return _createIfNonExistant; } + const char * asString() const { return asString(_type); } + void removeIfZero(bool newValue) { _removeIfZero = newValue; } + void createIfNonExistant(bool newValue) { _createIfNonExistant = newValue; } + bool operator!=(const CollectionType &b) const { return !(operator==(b)); } + bool operator==(const CollectionType &b) const { + return _type == b._type && + _removeIfZero == b._removeIfZero && + _createIfNonExistant == b._createIfNonExistant; + } + + private: + struct TypeInfo { + Type _type; + const char * _name; + }; + + static const char * asString(Type t) { return _typeTable[t]._name; } + static Type asType(const vespalib::string &t); + + Type _type; + bool _removeIfZero; + bool _createIfNonExistant; + static const TypeInfo _typeTable[MAX_TYPE]; +}; + +} +} + diff --git a/searchcommon/src/vespa/searchcommon/attribute/config.cpp b/searchcommon/src/vespa/searchcommon/attribute/config.cpp new file mode 100644 index 00000000000..e160d7b9222 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/config.cpp @@ -0,0 +1,50 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/vespalib/util/exceptions.h> +#include <limits.h> + +namespace search { +namespace attribute { + +Config::Config() : + _basicType(BasicType::NONE), + _type(CollectionType::SINGLE), + _fastSearch(false), + _huge(false), + _enableBitVectors(false), + _enableOnlyBitVector(false), + _isFilter(false), + _fastAccess(false), + _maxInternalBlobSize(defaultMaxInternalBlobSize), + _arity(8), + _lower_bound(LLONG_MIN), + _upper_bound(LLONG_MAX), + _dense_posting_list_threshold(0.4), + _tensorType(vespalib::tensor::TensorType::invalid()) +{ +} + +Config::Config(BasicType bt, + CollectionType ct, + bool fastSearch_, + bool huge_) + : _basicType(bt), + _type(ct), + _fastSearch(fastSearch_), + _huge(huge_), + _enableBitVectors(false), + _enableOnlyBitVector(false), + _isFilter(false), + _fastAccess(false), + _maxInternalBlobSize(defaultMaxInternalBlobSize), + _arity(8), + _lower_bound(LLONG_MIN), + _upper_bound(LLONG_MAX), + _dense_posting_list_threshold(0.4), + _tensorType(vespalib::tensor::TensorType::invalid()) +{ +} + +} +} diff --git a/searchcommon/src/vespa/searchcommon/attribute/config.h b/searchcommon/src/vespa/searchcommon/attribute/config.h new file mode 100644 index 00000000000..b63ce37c93b --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/config.h @@ -0,0 +1,155 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchcommon/attribute/basictype.h> +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/searchcommon/common/growstrategy.h> +#include <vespa/vespalib/tensor/tensor_type.h> + +namespace search { +namespace attribute { + +const size_t defaultMaxInternalBlobSize = 0x400000000ul; + +class Config +{ +public: + Config(); + + Config(BasicType bt, + CollectionType ct = CollectionType::SINGLE, + bool fastSearch_ = false, + bool huge_ = false); + + BasicType basicType() const { return _basicType; } + CollectionType collectionType() const { return _type; } + bool fastSearch() const { return _fastSearch; } + bool huge() const { return _huge; } + size_t getMaxInternalBlobSize() const { return _maxInternalBlobSize; } + uint32_t arity() const { return _arity; } + int64_t lower_bound() const { return _lower_bound; } + int64_t upper_bound() const { return _upper_bound; } + double dense_posting_list_threshold() const { return _dense_posting_list_threshold; } + vespalib::tensor::TensorType tensorType() const { return _tensorType; } + + /** + * Check if attribute posting list can consist of a bitvector in + * addition to (or instead of) a btree. + */ + bool + getEnableBitVectors(void) const + { + return _enableBitVectors; + } + + /** + * Check if attribute posting list can consist of only a bitvector with + * no corresponding btree. + */ + bool + getEnableOnlyBitVector(void) const + { + return _enableOnlyBitVector; + } + + bool + getIsFilter(void) const + { + return _isFilter; + } + + /** + * Check if this attribute should be fast accessible at all times. + * If so, attribute is kept in memory also for non-searchable documents. 
+ */ + bool fastAccess() const { return _fastAccess; } + + const GrowStrategy & getGrowStrategy() const { return _growStrategy; } + void setHuge(bool v) { _huge = v; } + void setFastSearch(bool v) { _fastSearch = v; } + void setMaxInternalBlobSize(size_t v) { _maxInternalBlobSize = v; } + void setArity(uint32_t v) { _arity = v; } + void setBounds(int64_t lower, int64_t upper) { _lower_bound = lower; + _upper_bound = upper; } + void setDensePostingListThreshold(double v) { _dense_posting_list_threshold = v; } + void setTensorType(const vespalib::tensor::TensorType &tensorType_in) { + _tensorType = tensorType_in; + } + + /** + * Enable attribute posting list to consist of a bitvector in + * addition to (or instead of) a btree. + */ + void + setEnableBitVectors(bool enableBitVectors) + { + _enableBitVectors = enableBitVectors; + } + + /** + * Enable attribute posting list to consist of only a bitvector with + * no corresponding btree. Some information degradation might occur when + * document frequency goes down, since recreated btree representation + * will then have lost weight information. + */ + void + setEnableOnlyBitVector(bool enableOnlyBitVector) + { + _enableOnlyBitVector = enableOnlyBitVector; + } + + /** + * Hide weight information when searching in attributes. 
+ */ + void + setIsFilter(bool isFilter) + { + _isFilter = isFilter; + } + + void setFastAccess(bool v) { _fastAccess = v; } + void setGrowStrategy(const GrowStrategy &gs) { _growStrategy = gs; } + bool operator!=(const Config &b) const { return !(operator==(b)); } + + bool + operator==(const Config &b) const + { + return _basicType == b._basicType && + _type == b._type && + _huge == b._huge && + _fastSearch == b._fastSearch && + _enableBitVectors == b._enableBitVectors && + _enableOnlyBitVector == b._enableOnlyBitVector && + _isFilter == b._isFilter && + _fastAccess == b._fastAccess && + _maxInternalBlobSize == b._maxInternalBlobSize && + _growStrategy == b._growStrategy && + _arity == b._arity && + _lower_bound == b._lower_bound && + _upper_bound == b._upper_bound && + _dense_posting_list_threshold == b._dense_posting_list_threshold && + (_basicType.type() != BasicType::Type::TENSOR || + _tensorType == b._tensorType); + } + +private: + BasicType _basicType; + CollectionType _type; + bool _fastSearch; + bool _huge; + bool _enableBitVectors; + bool _enableOnlyBitVector; + bool _isFilter; + bool _fastAccess; + size_t _maxInternalBlobSize; + GrowStrategy _growStrategy; + uint32_t _arity; + int64_t _lower_bound; + int64_t _upper_bound; + double _dense_posting_list_threshold; + vespalib::tensor::TensorType _tensorType; +}; +} // namespace attribute +} // namespace search + diff --git a/searchcommon/src/vespa/searchcommon/attribute/iattributecontext.h b/searchcommon/src/vespa/searchcommon/attribute/iattributecontext.h new file mode 100644 index 00000000000..1cdb86cf274 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/iattributecontext.h @@ -0,0 +1,58 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include "iattributevector.h" +#include <vector> +#include <memory> + +namespace search { +namespace attribute { + +/** + * This is an interface used to access all registered attribute vectors. + **/ +class IAttributeContext { +public: + typedef vespalib::string string; + /** Convenience typedefs **/ + typedef std::unique_ptr<IAttributeContext> UP; + + /** + * Returns the attribute vector with the given name. + * + * @param name the name of the attribute vector. + * @return const view of the attribute vector or NULL if the attribute vector does not exists. + **/ + virtual const IAttributeVector * getAttribute(const string & name) const = 0; + + /** + * Returns the attribute vector with the given name. + * Makes sure that the underlying enum values are stable during the use of this attribute. + * + * @param name the name of the attribute vector + * @return const view of the attribute vector or NULL if the attribute vector does not exists. + **/ + virtual const IAttributeVector * getAttributeStableEnum(const string & name) const = 0; + + /** + * Fill the given list with all attribute vectors registered. + * + * @param list the list to fill in attribute vectors. + **/ + virtual void getAttributeList(std::vector<const IAttributeVector *> & list) const = 0; + + /** + * Releases all cached attribute guards. + **/ + virtual void releaseEnumGuards() {} + + /** + * Virtual destructor to allow safe subclassing. + **/ + virtual ~IAttributeContext() {} +}; + +} // namespace attribute +} // namespace search + diff --git a/searchcommon/src/vespa/searchcommon/attribute/iattributevector.h b/searchcommon/src/vespa/searchcommon/attribute/iattributevector.h new file mode 100644 index 00000000000..28f7f7df061 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/iattributevector.h @@ -0,0 +1,352 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <string> +#include <stdint.h> +#include <vespa/vespalib/stllike/string.h> +#include <vespa/searchcommon/common/iblobconverter.h> +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/searchcommon/attribute/basictype.h> + +namespace search { +namespace attribute { + +/** + * This class is used to store a value and a weight. + * It is used when getting content from a weighted set attribute vector. + * + * @param T the type of the value stored in this object + **/ +template <typename T> +class WeightedType +{ +private: + T _value; + int32_t _weight; + +public: + WeightedType() : _value(T()), _weight(1) { } + WeightedType(T value_, int32_t weight_ = 1) : _value(value_), _weight(weight_) { } + const T & getValue() const { return _value; } + const T & value() const { return _value; } + void setValue(const T & v) { _value = v; } + int32_t getWeight() const { return _weight; } + int32_t weight() const { return _weight; } + void setWeight(int32_t w) { _weight = w; } + bool operator==(const WeightedType & rhs) const { + return _value == rhs._value && _weight == rhs._weight; + } +}; + +/** + * This is a read interface used to access the content of an attribute vector. + **/ +class IAttributeVector +{ +public: + typedef uint32_t DocId; + typedef uint32_t EnumHandle; + typedef int64_t largeint_t; + typedef WeightedType<double> WeightedFloat; + typedef WeightedType<largeint_t> WeightedInt; + typedef WeightedType<EnumHandle> WeightedEnum; + typedef WeightedType<const char *> WeightedConstChar; + typedef WeightedType<vespalib::string> WeightedString; + + /** + * Returns the name of this attribute vector. + * + * @return attribute name + **/ + virtual const vespalib::string & getName() const = 0; + + /** + * Returns the number of documents stored in this attribute vector. + * + * @return number of documents + **/ + virtual uint32_t getNumDocs() const = 0; + + /** + * Returns the number of values stored for the given document. 
+ * + * @return number of values + * @param doc document identifier + **/ + virtual uint32_t getValueCount(uint32_t doc) const = 0; + + /** + * Returns the maximum number of values stored for any document. + * + * @return maximum number of values + **/ + virtual uint32_t getMaxValueCount() const = 0; + + /** + * Returns the first value stored for the given document as an integer. + * + * @param docId document identifier + * @return the integer value + **/ + virtual largeint_t getInt(DocId doc) const = 0; + + /** + * Returns the first value stored for the given document as a floating point number. + * + * @param docId document identifier + * @return the floating point value + **/ + virtual double getFloat(DocId doc) const = 0; + + /** + * Returns the first value stored for the given document as a string. + * Uses the given buffer to store the actual string if no underlying + * string storage is used for this attribute vector. + * + * @param docId document identifier + * @param buffer content buffer to optionally store the string + * @param sz the size of the buffer + * @return the string value + **/ + virtual const char * getString(DocId doc, char * buffer, size_t sz) const = 0; + + /** + * Returns the first value stored for the given document as an enum value. + * + * @param docId document identifier + * @return the enum value + **/ + virtual EnumHandle getEnum(DocId doc) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. + * + * @param docId document identifier + * @param buffer content buffer to copy integer values into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, largeint_t * buffer, uint32_t sz) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. 
+ * + * @param docId document identifier + * @param buffer content buffer to copy floating point values into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, double * buffer, uint32_t sz) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. + * + * @param docId document identifier + * @param buffer content buffer to copy string values into + * @param sz the size of the buffer + * @return the number of values for this document + **/ +// virtual uint32_t get(DocId docId, vespalib::string * buffer, uint32_t sz) const = 0; + + /** + * Copies the values stored for the given document into the given buffer. + * + * @param docId document identifier + * @param buffer content buffer to copy const char values into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, const char ** buffer, uint32_t sz) const = 0; + + /** + * Copies the enum values stored for the given document into the given buffer. + * + * @param docId document identifier + * @param buffer content object to copy enum into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, EnumHandle * buffer, uint32_t sz) const = 0; + + /** + * Copies the values and weights stored for the given document into the given buffer. + * This method should only be invoked if @ref getCollectionType(docId) returns CollectionType::WEIGHTED_SET. + * + * @param docId document identifier + * @param buffer content object to copy integer values and weights into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, WeightedInt * buffer, uint32_t sz) const = 0; + + /** + * Copies the values and weights stored for the given document into the given buffer. 
+ * This method should only be invoked if @ref getCollectionType(docId) returns CollectionType::WEIGHTED_SET. + * + * @param docId document identifier + * @param buffer content object to copy floating point values and weights into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, WeightedFloat * buffer, uint32_t sz) const = 0; + + /** + * Copies the values and weights stored for the given document into the given buffer. + * This method should only be invoked if @ref getCollectionType(docId) returns CollectionType::WEIGHTED_SET. + * + * @param docId document identifier + * @param buffer content object to copy string values and weights into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, WeightedString * buffer, uint32_t sz) const = 0; + + /** + * Copies the values and weights stored for the given document into the given buffer. + * This method should only be invoked if @ref getCollectionType(docId) returns CollectionType::WEIGHTED_SET. + * + * @param docId document identifier + * @param buffer content object to copy const char values and weights into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, WeightedConstChar * buffer, uint32_t sz) const = 0; + + /** + * Copies the enum values and weights stored for the given document into the given buffer. + * This method should only be invoked if @ref getCollectionType(docId) returns CollectionType::WEIGHTED_SET. + * + * @param docId document identifier + * @param buffer content object to copy enum values and weights into + * @param sz the size of the buffer + * @return the number of values for this document + **/ + virtual uint32_t get(DocId docId, WeightedEnum * buffer, uint32_t sz) const = 0; + + /** + * Finds the enum value for the given string value. 
+ * This method will only have effect if @ref getBasicType() returns BasicType::STRING and + * @ref hasEnum() returns true. + * + * @param value the string value to lookup. + * @param e the handle in which to store the enum value. + * @return true if found. + **/ + virtual bool findEnum(const char * value, EnumHandle & e) const = 0; + + /** + * Returns the basic type of this attribute vector. + * + * @return basic type + **/ + virtual BasicType::Type getBasicType() const = 0; + + /** + * Returns the number of bytes a single value in this attribute occupies. + **/ + virtual size_t getFixedWidth() const = 0; + + /** + * Returns the collection type of this attribute vector. + * + * @return collection type + **/ + virtual CollectionType::Type getCollectionType() const = 0; + + /** + * Returns whether this is an integer attribute. + **/ + virtual bool isIntegerType() const { + BasicType::Type t = getBasicType(); + return t == BasicType::UINT1 || + t == BasicType::UINT2 || + t == BasicType::UINT4 || + t == BasicType::INT8 || + t == BasicType::INT16 || + t == BasicType::INT32 || + t == BasicType::INT64; + } + + /** + * Returns whether this is a floating point attribute. + **/ + virtual bool isFloatingPointType() const { + BasicType::Type t = getBasicType(); + return t == BasicType::FLOAT || t == BasicType::DOUBLE; + } + + /** + * Returns whether this is a string attribute. + **/ + virtual bool isStringType() const { + return getBasicType() == BasicType::STRING; + } + + /** + * Returns whether this is a multi value attribute. + **/ + virtual bool hasMultiValue() const { + return getCollectionType() != CollectionType::SINGLE; + } + + /** + * Returns whether this is a weighted set attribute. + **/ + virtual bool hasWeightedSetType() const { + return getCollectionType() == CollectionType::WSET; + } + + /** + * Returns whether this attribute vector has underlying enum values. + * + * @return true if it has enum values. 
+ **/ + virtual bool hasEnum() const = 0; + + /** + * Will serialize the values for the documentid in ascending order. The serialized form can be used by memcmp and + * sortorder will be preserved. + * @param doc The document id to serialize for. + * @param serTo The buffer to serialize into. + * @param available. Number of bytes available in the serialization buffer. + * @param bc An optional converter to use. + * @return The number of bytes serialized, -1 if not enough space. + */ + long serializeForAscendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc=NULL) const { + return onSerializeForAscendingSort(doc, serTo, available, bc); + } + /** + * Will serialize the values for the documentid in descending order. The serialized form can be used by memcmp and + * sortorder will be preserved. + * @param doc The document id to serialize for. + * @param serTo The buffer to serialize into. + * @param available. Number of bytes available in the serialization buffer. + * @param bc An optional converter to use. + * @return The number of bytes serialized, -1 if not enough space. + */ + long serializeForDescendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc=NULL) const { + return onSerializeForDescendingSort(doc, serTo, available, bc); + } + + /** + * Virtual destructor to allow safe subclassing. + **/ + virtual ~IAttributeVector() {} + + /** + * This method is used to simulate sparseness in the single value attributes. + * @param doc The document id to verify if attribute has a undefined value for this document. + * @return true if value is undefined. 
+ */ + virtual bool isUndefined(DocId doc) const { (void) doc; return false; } + +private: + virtual long onSerializeForAscendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc) const = 0; + virtual long onSerializeForDescendingSort(DocId doc, void * serTo, long available, const common::BlobConverter * bc) const = 0; + +}; + +} // namespace fef +} // namespace search + diff --git a/searchcommon/src/vespa/searchcommon/attribute/status.cpp b/searchcommon/src/vespa/searchcommon/attribute/status.cpp new file mode 100644 index 00000000000..7543e13fdcf --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/status.cpp @@ -0,0 +1,74 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchcommon/attribute/status.h> + +namespace search { +namespace attribute { + +Status::Status(const vespalib::string &) + : _numDocs (0), + _numValues (0), + _numUniqueValues (0), + _allocated (0), + _used (0), + _dead (0), + _unused (0), + _onHold (0), + _onHoldMax (0), + _lastSyncToken (0), + _updates (0), + _nonIdempotentUpdates (0), + _bitVectors(0) +{ +} + + +Status::Status() + : _numDocs (0), + _numValues (0), + _numUniqueValues (0), + _allocated (0), + _used (0), + _dead (0), + _unused (0), + _onHold (0), + _onHoldMax (0), + _lastSyncToken (0), + _updates (0), + _nonIdempotentUpdates (0), + _bitVectors(0) +{ +} + + +vespalib::string +Status::createName(const vespalib::stringref &index, + const vespalib::stringref &attr) +{ + vespalib::string name (index); + name += ".attribute."; + name += attr; + return name; +} + + +void +Status::updateStatistics(uint64_t numValues, + uint64_t numUniqueValue, + uint64_t allocated, + uint64_t used, + uint64_t dead, + uint64_t onHold) +{ + _numValues = numValues; + _numUniqueValues = numUniqueValue; + _allocated = allocated; + _used = used; + _dead = dead; + _unused = allocated - used; + _onHold = onHold; + _onHoldMax = 
std::max(_onHoldMax, onHold); +} + +} +} diff --git a/searchcommon/src/vespa/searchcommon/attribute/status.h b/searchcommon/src/vespa/searchcommon/attribute/status.h new file mode 100644 index 00000000000..09c7cbc6028 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/attribute/status.h @@ -0,0 +1,80 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace search { +namespace attribute { + +class Status +{ +public: + // TODO: name isn't stored anywhere or used for anything + Status(const vespalib::string &name); + Status(); + + void + updateStatistics(uint64_t numValues, + uint64_t numUniqueValue, + uint64_t allocated, + uint64_t used, + uint64_t dead, + uint64_t onHold); + + uint64_t getNumDocs() const { return _numDocs; } + uint64_t getNumValues() const { return _numValues; } + uint64_t getNumUniqueValues() const { return _numUniqueValues; } + uint64_t getAllocated() const { return _allocated; } + uint64_t getUsed() const { return _used; } + uint64_t getDead() const { return _dead; } + uint64_t getOnHold() const { return _onHold; } + uint64_t getOnHoldMax() const { return _onHoldMax; } + uint64_t getLastSyncToken() const { return _lastSyncToken; } + uint64_t getUpdateCount() const { return _updates; } + uint64_t getNonIdempotentUpdateCount() const { return _nonIdempotentUpdates; } + uint32_t + getBitVectors() const + { + return _bitVectors; + } + + void setNumDocs(uint64_t v) { _numDocs = v; } + void incNumDocs() { ++_numDocs; } + void setLastSyncToken(uint64_t v) { _lastSyncToken = v; } + void incUpdates(uint64_t v=1) { _updates += v; } + void incNonIdempotentUpdates(uint64_t v = 1) { _nonIdempotentUpdates += v; } + void + incBitVectors() + { + ++_bitVectors; + } + + void + decBitVectors() + { + --_bitVectors; + } + + static vespalib::string + createName(const vespalib::stringref &index, + const vespalib::stringref & attr); 
+private: + uint64_t _numDocs; + uint64_t _numValues; + uint64_t _numUniqueValues; + uint64_t _allocated; + uint64_t _used; + uint64_t _dead; + uint64_t _unused; + uint64_t _onHold; + uint64_t _onHoldMax; + uint64_t _lastSyncToken; + uint64_t _updates; + uint64_t _nonIdempotentUpdates; + uint32_t _bitVectors; +}; + +} +} + diff --git a/searchcommon/src/vespa/searchcommon/common/.gitignore b/searchcommon/src/vespa/searchcommon/common/.gitignore new file mode 100644 index 00000000000..7e7c0fe7fae --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/.gitignore @@ -0,0 +1,2 @@ +/.depend +/Makefile diff --git a/searchcommon/src/vespa/searchcommon/common/CMakeLists.txt b/searchcommon/src/vespa/searchcommon/common/CMakeLists.txt new file mode 100644 index 00000000000..a70a71772f5 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchcommon_searchcommon_common OBJECT + SOURCES + schema.cpp + schemaconfigurer.cpp + DEPENDS +) diff --git a/searchcommon/src/vespa/searchcommon/common/growstrategy.h b/searchcommon/src/vespa/searchcommon/common/growstrategy.h new file mode 100644 index 00000000000..07d2ee1f35d --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/growstrategy.h @@ -0,0 +1,43 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once

#include <stdint.h>

namespace search {

/**
 * Plain value class bundling three unsigned settings: an initial
 * capacity, a grow percentage and a grow delta (all counted in
 * "docs" according to the member names). The class only stores,
 * exposes and compares the values; how they are applied is up to
 * the code using it.
 **/
class GrowStrategy
{
public:
    /**
     * @param docsInitialCapacity stored initial capacity (default 1024).
     * @param docsGrowPercent     stored grow percentage (default 50).
     * @param docsGrowDelta       stored grow delta (default 0).
     **/
    GrowStrategy(uint32_t docsInitialCapacity = 1024,
                 uint32_t docsGrowPercent = 50,
                 uint32_t docsGrowDelta = 0)
        : _docsInitialCapacity(docsInitialCapacity),
          _docsGrowPercent(docsGrowPercent),
          _docsGrowDelta(docsGrowDelta)
    {
    }

    // Accessors, one per stored setting.
    uint32_t getDocsInitialCapacity() const { return _docsInitialCapacity; }
    uint32_t getDocsGrowPercent() const { return _docsGrowPercent; }
    uint32_t getDocsGrowDelta() const { return _docsGrowDelta; }

    // Mutators, one per stored setting.
    void setDocsInitialCapacity(uint32_t v) { _docsInitialCapacity = v; }
    void setDocsGrowPercent(uint32_t v) { _docsGrowPercent = v; }
    void setDocsGrowDelta(uint32_t v) { _docsGrowDelta = v; }

    /** Two strategies are equal when all three settings are equal. **/
    bool operator==(const GrowStrategy & rhs) const {
        if (_docsInitialCapacity != rhs._docsInitialCapacity) {
            return false;
        }
        if (_docsGrowPercent != rhs._docsGrowPercent) {
            return false;
        }
        return _docsGrowDelta == rhs._docsGrowDelta;
    }

    /** Negation of operator==. **/
    bool operator!=(const GrowStrategy & rhs) const {
        return !(*this == rhs);
    }

private:
    uint32_t _docsInitialCapacity;
    uint32_t _docsGrowPercent;
    uint32_t _docsGrowDelta;
};

}

// diff --git a/searchcommon/src/vespa/searchcommon/common/iblobconverter.h b/searchcommon/src/vespa/searchcommon/common/iblobconverter.h
// new file mode 100644
// index 00000000000..cb3c3c3d2f3
// --- /dev/null
// +++ b/searchcommon/src/vespa/searchcommon/common/iblobconverter.h
// @@ -0,0 +1,24 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+ +#pragma once + +#include <vespa/vespalib/util/buffer.h> +#include <vespa/vespalib/util/linkedptr.h> + +namespace search { +namespace common { + +class BlobConverter +{ +public: + typedef std::shared_ptr<BlobConverter> SP; + typedef vespalib::LinkedPtr<BlobConverter> LP; + virtual ~BlobConverter() { } + vespalib::ConstBufferRef convert(const vespalib::ConstBufferRef & src) const { return onConvert(src); } +private: + virtual vespalib::ConstBufferRef onConvert(const vespalib::ConstBufferRef & src) const = 0; +}; + +} +} + diff --git a/searchcommon/src/vespa/searchcommon/common/schema.cpp b/searchcommon/src/vespa/searchcommon/common/schema.cpp new file mode 100644 index 00000000000..3215a25e55f --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/schema.cpp @@ -0,0 +1,670 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <fstream> +#include <vespa/config/common/configparser.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/util/arraysize.h> +#include "schema.h" +LOG_SETUP(".index.schema"); + +using namespace config; +using namespace search::index; +using config::InvalidConfigException; + +namespace { + +template <typename T> +void +writeFields(vespalib::asciistream & os, + const vespalib::stringref &prefix, + const std::vector<T> & fields) +{ + os << prefix << "[" << fields.size() << "]\n"; + for (size_t i = 0; i < fields.size(); ++i) { + fields[i].write(os, vespalib::make_string("%s[%zu].", prefix.c_str(), i)); + } +} + +void +writeFieldSets(vespalib::asciistream &os, + const vespalib::string &name, + const std::vector<Schema::FieldSet> &fss) +{ + vespalib::string prefix(name); + prefix += "["; + os << prefix << fss.size() << "]\n"; + for (size_t i = 0; i < fss.size(); ++i) { + os << prefix << i << "].name " << fss[i].getName() << "\n"; + os << prefix << i << "].field[" << 
fss[i].getFields().size() << "]\n"; + vespalib::asciistream tmp; + tmp << prefix << i << "].field["; + for (size_t j = 0; j < fss[i].getFields().size(); ++j) { + os << tmp.str() << j << "].name " << fss[i].getFields()[j] << "\n"; + } + } +} + +struct FieldName { + vespalib::string name; + FieldName(const std::vector<vespalib::string> & lines) + : name(ConfigParser::parse<vespalib::string>("name", lines)) + { + } +}; + +template <typename T> +uint32_t +getFieldId(const vespalib::stringref & name, const T &map) +{ + typename T::const_iterator it = map.find(name); + return (it != map.end()) ? it->second : Schema::UNKNOWN_FIELD_ID; +} + +} // namespace + +namespace search { +namespace index { + +const uint32_t Schema::UNKNOWN_FIELD_ID(std::numeric_limits<uint32_t>::max()); + +Schema::DataType Schema::dataTypeFromName(const vespalib::stringref &name) { + if (name == "UINT1") { return UINT1; } + else if (name == "UINT2") { return UINT2; } + else if (name == "UINT4") { return UINT4; } + else if (name == "INT8") { return INT8; } + else if (name == "INT16") { return INT16; } + else if (name == "INT32") { return INT32; } + else if (name == "INT64") { return INT64; } + else if (name == "FLOAT") { return FLOAT; } + else if (name == "DOUBLE") { return DOUBLE; } + else if (name == "STRING") { return STRING; } + else if (name == "RAW") { return RAW; } + else if (name == "BOOLEANTREE") { return BOOLEANTREE; } + else if (name == "TENSOR") { return TENSOR; } + else { + throw InvalidConfigException("Illegal enum value '" + name + "'"); + } +} + +const char *datatype_str[] = { "UINT1", + "UINT2", + "UINT4", + "INT8", + "INT16", + "INT32", + "INT64", + "FLOAT", + "DOUBLE", + "STRING", + "RAW", + "FEATURE_NOTUSED", + "BOOLEANTREE", + "TENSOR" }; + +vespalib::string Schema::getTypeName(DataType type) { + if (type > vespalib::arraysize(datatype_str)) { + vespalib::asciistream ost; + ost << "UNKNOWN(" << type << ")"; + return ost.str(); + } + return datatype_str[type]; +} + 
+Schema::CollectionType Schema::collectionTypeFromName( + const vespalib::stringref &name) { + if (name == "SINGLE") { return SINGLE; } + else if (name == "ARRAY") { return ARRAY; } + else if (name == "WEIGHTEDSET") { return WEIGHTEDSET; } + else { + throw InvalidConfigException("Illegal enum value '" + name + "'"); + } +} + +const char *collectiontype_str[] = { "SINGLE", + "ARRAY", + "WEIGHTEDSET" }; + +vespalib::string Schema::getTypeName(CollectionType type) { + if (type > vespalib::arraysize(collectiontype_str)) { + vespalib::asciistream ost; + ost << "UNKNOWN(" << type << ")"; + return ost.str(); + } + return collectiontype_str[type]; +} + +Schema::Field::Field(const vespalib::stringref &n, DataType dt) + : _name(n), + _dataType(dt), + _collectionType(SINGLE), + _timestamp(0) +{ +} + +Schema::Field::Field(const vespalib::stringref &n, + DataType dt, CollectionType ct) + : _name(n), + _dataType(dt), + _collectionType(ct), + _timestamp(0) +{ +} + +// XXX: Resource leak if exception is thrown. 
+Schema::Field::Field(const std::vector<vespalib::string> & lines) + : _name(ConfigParser::parse<vespalib::string>("name", lines)), + _dataType(dataTypeFromName(ConfigParser::parse<vespalib::string>( + "datatype", lines))), + _collectionType( + collectionTypeFromName(ConfigParser::parse<vespalib::string>( + "collectiontype", lines))), + _timestamp(ConfigParser::parse<int64_t>("timestamp", lines, 0)) +{ +} + +void +Schema::Field::write(vespalib::asciistream & os, const vespalib::stringref & prefix) const +{ + os << prefix << "name " << _name << "\n"; + os << prefix << "datatype " << getTypeName(_dataType) << "\n"; + os << prefix << "collectiontype " << getTypeName(_collectionType) << "\n"; + if (_timestamp) { + os << prefix << "timestamp " << _timestamp.val() << "\n"; + } +} + + +bool +Schema::Field::operator==(const Field &rhs) const +{ + return _name == rhs._name && + _dataType == rhs._dataType && + _collectionType == rhs._collectionType && + _timestamp == rhs._timestamp; +} + + +bool +Schema::Field::operator!=(const Field &rhs) const +{ + return _name != rhs._name || + _dataType != rhs._dataType || + _collectionType != rhs._collectionType || + _timestamp != rhs._timestamp; +} + + +Schema::IndexField::IndexField(const vespalib::stringref &name, DataType dt) + : Field(name, dt), + _prefix(false), + _phrases(false), + _positions(true), + _avgElemLen(512) +{ +} + +Schema::IndexField::IndexField(const vespalib::stringref &name, DataType dt, + CollectionType ct) + : Field(name, dt, ct), + _prefix(false), + _phrases(false), + _positions(true), + _avgElemLen(512) +{ +} + +Schema::IndexField::IndexField(const std::vector<vespalib::string> &lines) + : Field(lines), + _prefix(ConfigParser::parse<bool>("prefix", lines)), + _phrases(ConfigParser::parse<bool>("phrases", lines)), + _positions(ConfigParser::parse<bool>("positions", lines)), + _avgElemLen(ConfigParser::parse<int32_t>("averageelementlen", lines)) +{ +} + + +void +Schema::IndexField::write(vespalib::asciistream & 
os, const vespalib::stringref & prefix) const +{ + Field::write(os, prefix); + os << prefix << "prefix " << (_prefix ? "true" : "false") << "\n"; + os << prefix << "phrases " << (_phrases ? "true" : "false") << "\n"; + os << prefix << "positions " << (_positions ? "true" : "false") << "\n"; + os << prefix << "averageelementlen " << static_cast<int32_t>(_avgElemLen) << "\n"; +} + + +bool +Schema::IndexField::operator==(const IndexField &rhs) const +{ + return Field::operator==(rhs) && + _prefix == rhs._prefix && + _phrases == rhs._phrases && + _positions == rhs._positions && + _avgElemLen == rhs._avgElemLen; +} + + +bool +Schema::IndexField::operator!=(const IndexField &rhs) const +{ + return Field::operator!=(rhs) || + _prefix != rhs._prefix || + _phrases != rhs._phrases || + _positions != rhs._positions || + _avgElemLen != rhs._avgElemLen; +} + + +Schema::FieldSet::FieldSet(const std::vector<vespalib::string> & lines) : + _name(ConfigParser::parse<vespalib::string>("name", lines)), + _fields() +{ + std::vector<FieldName> fn = ConfigParser::parseArray<FieldName>("field", lines); + for (size_t i = 0; i < fn.size(); ++i) { + _fields.push_back(fn[i].name); + } +} + + +bool +Schema::FieldSet::operator==(const FieldSet &rhs) const +{ + return _name == rhs._name && + _fields == rhs._fields; +} + +bool +Schema::FieldSet::operator!=(const FieldSet &rhs) const +{ + return _name != rhs._name || + _fields != rhs._fields; +} + +void +Schema::writeToStream(vespalib::asciistream &os) const +{ + writeFields(os, "attributefield", _attributeFields); + writeFields(os, "summaryfield", _summaryFields); + writeFieldSets(os, "fieldset", _fieldSets); + writeFields(os, "indexfield", _indexFields); +} + +Schema::Schema() + : _indexFields(), + _attributeFields(), + _summaryFields(), + _fieldSets(), + _indexIds(), + _attributeIds(), + _summaryIds(), + _fieldSetIds() +{ +} + +bool +Schema::loadFromFile(const vespalib::stringref & fileName) +{ + std::ifstream file(fileName.c_str()); + if 
(!file) { + LOG(warning, "Could not open input file '%s' as part of loadFromFile()", fileName.c_str()); + return false; + } + std::vector<vespalib::string> lines; + std::string tmpLine; + while (file) { + getline(file, tmpLine); + lines.push_back(tmpLine); + } + _indexFields = ConfigParser::parseArray<IndexField>("indexfield", lines); + _attributeFields = ConfigParser::parseArray<AttributeField>("attributefield", lines); + _summaryFields = ConfigParser::parseArray<SummaryField>("summaryfield", lines); + _fieldSets = ConfigParser::parseArray<FieldSet>("fieldset", lines); + _indexIds.clear(); + for (size_t i(0), m(_indexFields.size()); i < m; i++) { + _indexIds[_indexFields[i].getName()] = i; + } + _attributeIds.clear(); + for (size_t i(0), m(_attributeFields.size()); i < m; i++) { + _attributeIds[_attributeFields[i].getName()] = i; + } + _summaryIds.clear(); + for (size_t i(0), m(_summaryFields.size()); i < m; i++) { + _summaryIds[_summaryFields[i].getName()] = i; + } + _fieldSetIds.clear(); + for (size_t i(0), m(_fieldSets.size()); i < m; i++) { + _fieldSetIds[_fieldSets[i].getName()] = i; + } + return true; +} + +bool +Schema::saveToFile(const vespalib::stringref & fileName) const +{ + vespalib::asciistream os; + writeToStream(os); + std::ofstream file(fileName.c_str()); + if (!file) { + LOG(warning, "Could not open output file '%s' as part of saveToFile()", fileName.c_str()); + return false; + } + file << os.str(); + file.close(); + if (file.fail()) { + LOG(warning, + "Could not write to output file '%s' as part of saveToFile()", + fileName.c_str()); + return false; + } + FastOS_File s; + s.OpenReadWrite(fileName.c_str()); + if (!s.IsOpened()) { + LOG(warning, + "Could not open schema file '%s' for fsync", + fileName.c_str()); + return false; + } else { + if (!s.Sync()) { + LOG(warning, + "Could not fsync schema file '%s'", + fileName.c_str()); + return false; + } + s.Close(); + } + return true; +} + +vespalib::string +Schema::toString() const +{ + 
vespalib::asciistream os; + writeToStream(os); + return os.str(); +} + +namespace { +Schema::IndexField +cloneIndexField(const Schema::IndexField &field, + const vespalib::string &suffix) +{ + return Schema::IndexField(field.getName() + suffix, + field.getDataType(), + field.getCollectionType()). + setPrefix(field.hasPrefix()). + setPhrases(field.hasPhrases()). + setPositions(field.hasPositions()). + setAvgElemLen(field.getAvgElemLen()); +} + +template <typename T, typename M> +Schema & +addField(const T &field, Schema &self, + std::vector<T> &fields, M &name2id_map) +{ + name2id_map[field.getName()] = fields.size(); + fields.push_back(field); + return self; +} +} // namespace + +Schema & +Schema::addIndexField(const IndexField &field) +{ + return addField(field, *this, _indexFields, _indexIds); +} + +Schema & +Schema::addUriIndexFields(const IndexField &field) +{ + addIndexField(field); + addIndexField(cloneIndexField(field, ".scheme")); + addIndexField(cloneIndexField(field, ".host")); + addIndexField(cloneIndexField(field, ".port")); + addIndexField(cloneIndexField(field, ".path")); + addIndexField(cloneIndexField(field, ".query")); + addIndexField(cloneIndexField(field, ".fragment")); + addIndexField(cloneIndexField(field, ".hostname")); + return *this; +} + +Schema & +Schema::addAttributeField(const AttributeField &field) +{ + return addField(field, *this, _attributeFields, _attributeIds); +} + +Schema & +Schema::addSummaryField(const SummaryField &field) +{ + return addField(field, *this, _summaryFields, _summaryIds); +} + +Schema & +Schema::addFieldSet(const FieldSet &fieldSet) +{ + return addField(fieldSet, *this, _fieldSets, _fieldSetIds); +} + +uint32_t +Schema::getIndexFieldId(const vespalib::stringref & name) const +{ + return getFieldId(name, _indexIds); +} + +uint32_t +Schema::getAttributeFieldId(const vespalib::stringref & name) const +{ + return getFieldId(name, _attributeIds); +} + +uint32_t +Schema::getSummaryFieldId(const vespalib::stringref & 
name) const +{ + return getFieldId(name, _summaryIds); +} + + +uint32_t +Schema::getFieldSetId(const vespalib::stringref &name) const +{ + return getFieldId(name, _fieldSetIds); +} + + +void +Schema::swap(Schema &rhs) +{ + _indexFields.swap(rhs._indexFields); + _attributeFields.swap(rhs._attributeFields); + _summaryFields.swap(rhs._summaryFields); + _fieldSets.swap(rhs._fieldSets); + _indexIds.swap(rhs._indexIds); + _attributeIds.swap(rhs._attributeIds); + _summaryIds.swap(rhs._summaryIds); + _fieldSetIds.swap(rhs._fieldSetIds); +} + + +void +Schema::clear() +{ + _indexFields.clear(); + _attributeFields.clear(); + _summaryFields.clear(); + _fieldSets.clear(); + _indexIds.clear(); + _attributeIds.clear(); + _summaryIds.clear(); + _fieldSetIds.clear(); +} + + +namespace { +// Helper class allowing the is_matching specialization to access the schema. +struct IntersectHelper { + Schema::UP schema; + IntersectHelper() : schema(new Schema) {} + + template <typename T> + bool is_matching(const T &t1, const T &t2) { return t1.matchingTypes(t2); } + + template <typename T, typename Map> + void intersect(const std::vector<T> &set1, const std::vector<T> &set2, + const Map &set2_map, + std::vector<T> &intersection, Map &intersection_map) { + for (typename std::vector<T>::const_iterator + it = set1.begin(); it != set1.end(); ++it) { + typename Map::const_iterator it2 = set2_map.find(it->getName()); + if (it2 != set2_map.end()) { + if (is_matching(*it, set2[it2->second])) { + intersection_map[it->getName()] = intersection.size(); + intersection.push_back(*it); + } + } + } + } +}; + +template <> +bool IntersectHelper::is_matching(const Schema::FieldSet &f1, + const Schema::FieldSet &f2) { + if (f1.getFields() != f2.getFields()) + return false; + const std::vector<vespalib::string> fields = f1.getFields(); + for (std::vector<vespalib::string>::const_iterator + i = fields.begin(), ie = fields.end(); i != ie; ++i) { + if (schema->getIndexFieldId(*i) == Schema::UNKNOWN_FIELD_ID) { + 
return false; + } + } + return true; +} + +template <typename T, typename Map> +void addOldEntries(const std::vector<T> &entries, + fastos::TimeStamp limit_timestamp, + std::vector<T> &v, Map &name2id_map) { + for (typename std::vector<T>::const_iterator + it = entries.begin(); it != entries.end(); ++it) { + if (it->getTimestamp() < limit_timestamp) { + name2id_map[it->getName()] = v.size(); + v.push_back(*it); + } + } +} + +template <typename T, typename Map> +void addEntries(const std::vector<T> &entries, std::vector<T> &v, + Map &name2id_map) { + for (typename std::vector<T>::const_iterator + it = entries.begin(); it != entries.end(); ++it) { + if (name2id_map.find(it->getName()) == name2id_map.end()) { + name2id_map[it->getName()] = v.size(); + v.push_back(*it); + } + } +} + +template <typename T, typename Map> +void difference(const std::vector<T> &minuend, const Map &subtrahend_map, + std::vector<T> &diff, Map &diff_map) { + for (typename std::vector<T>::const_iterator + it = minuend.begin(); it != minuend.end(); ++it) { + if (subtrahend_map.find(it->getName()) == subtrahend_map.end()) { + diff_map[it->getName()] = diff.size(); + diff.push_back(*it); + } + } +} +} // namespace + +Schema::UP +Schema::getOldFields(fastos::TimeStamp limit_timestamp) +{ + Schema::UP schema(new Schema); + addOldEntries(_indexFields, limit_timestamp, + schema->_indexFields, schema->_indexIds); + addOldEntries(_attributeFields, limit_timestamp, + schema->_attributeFields, schema->_attributeIds); + addOldEntries(_summaryFields, limit_timestamp, + schema->_summaryFields, schema->_summaryIds); + return schema; +} + +Schema::UP +Schema::intersect(const Schema &lhs, const Schema &rhs) +{ + IntersectHelper h; + h.intersect(lhs._indexFields, rhs._indexFields, rhs._indexIds, + h.schema->_indexFields, h.schema->_indexIds); + h.intersect(lhs._attributeFields, rhs._attributeFields, rhs._attributeIds, + h.schema->_attributeFields, h.schema->_attributeIds); + h.intersect(lhs._summaryFields, 
rhs._summaryFields, rhs._summaryIds, + h.schema->_summaryFields, h.schema->_summaryIds); + h.intersect(lhs._fieldSets, rhs._fieldSets, rhs._fieldSetIds, + h.schema->_fieldSets, h.schema->_fieldSetIds); + return std::move(h.schema); +} + +Schema::UP +Schema::make_union(const Schema &lhs, const Schema &rhs) +{ + Schema::UP schema(new Schema(lhs)); + addEntries(rhs._indexFields, schema->_indexFields, schema->_indexIds); + addEntries(rhs._attributeFields, schema->_attributeFields, schema->_attributeIds); + addEntries(rhs._summaryFields, schema->_summaryFields, schema->_summaryIds); + addEntries(rhs._fieldSets, schema->_fieldSets, schema->_fieldSetIds); + return schema; +} + +Schema::UP +Schema::set_difference(const Schema &lhs, const Schema &rhs) +{ + Schema::UP schema(new Schema); + difference(lhs._indexFields, rhs._indexIds, + schema->_indexFields, schema->_indexIds); + difference(lhs._attributeFields, rhs._attributeIds, + schema->_attributeFields, schema->_attributeIds); + difference(lhs._summaryFields, rhs._summaryIds, + schema->_summaryFields, schema->_summaryIds); + difference(lhs._fieldSets, rhs._fieldSetIds, + schema->_fieldSets, schema->_fieldSetIds); + return schema; +} + +bool +Schema::operator==(const Schema &rhs) const +{ + return _indexFields == rhs._indexFields && + _attributeFields == rhs._attributeFields && + _summaryFields == rhs._summaryFields && + _fieldSets == rhs._fieldSets; +} + + +bool +Schema::operator!=(const Schema &rhs) const +{ + return _indexFields != rhs._indexFields || + _attributeFields != rhs._attributeFields || + _summaryFields != rhs._summaryFields || + _fieldSets != rhs._fieldSets; +} + + +bool +Schema::empty() const +{ + return _indexFields.empty() && + _attributeFields.empty() && + _summaryFields.empty() && + _fieldSets.empty(); +} + + +} // namespace search::index +} // namespace search diff --git a/searchcommon/src/vespa/searchcommon/common/schema.h b/searchcommon/src/vespa/searchcommon/common/schema.h new file mode 100644 index 
00000000000..5f5b7dd3656 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/schema.h @@ -0,0 +1,429 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vespa/vespalib/util/ptrholder.h> +#include <vector> + +namespace search { +namespace index { + +/** + * Schema class used to give a high-level description of the content + * of an index. + **/ +class Schema +{ +public: + typedef std::unique_ptr<Schema> UP; + typedef std::shared_ptr<Schema> SP; + typedef vespalib::PtrHolder<Schema> PH; + + /** + * Basic data type for a field. + **/ + enum DataType { UINT1 = 0, + UINT2 = 1, + UINT4 = 2, + INT8 = 3, + INT16 = 4, + INT32 = 5, + INT64 = 6, + FLOAT = 7, + DOUBLE = 8, + STRING = 9, + RAW = 10, + //FEATURE = 11, + BOOLEANTREE = 12, + TENSOR = 13}; + static DataType dataTypeFromName(const vespalib::stringref &name); + static vespalib::string getTypeName(DataType type); + + /** + * Collection type for a field. + **/ + enum CollectionType { SINGLE = 0, + ARRAY = 1, + WEIGHTEDSET = 2 }; + static CollectionType collectionTypeFromName(const vespalib::stringref &n); + static vespalib::string getTypeName(CollectionType type); + + /** + * A single field has a name, data type and collection + * type. Various aspects (index/attribute/summary) may have + * limitations on what types are supported in the back-end. + **/ + class Field + { + vespalib::string _name; + DataType _dataType; + CollectionType _collectionType; + fastos::TimeStamp _timestamp; + + public: + Field(const vespalib::stringref &n, DataType dt); + Field(const vespalib::stringref &n, DataType dt, CollectionType ct); + + /** + * Create this field based on the given config lines. 
+ **/ + Field(const std::vector<vespalib::string> & lines); + + virtual ~Field() {} + + void setTimestamp(fastos::TimeStamp ts) { _timestamp = ts; } + + virtual void + write(vespalib::asciistream & os, + const vespalib::stringref & prefix) const; + + const vespalib::string &getName() const { return _name; } + DataType getDataType() const { return _dataType; } + CollectionType getCollectionType() const { return _collectionType; } + fastos::TimeStamp getTimestamp() const { return _timestamp; } + + bool matchingTypes(const Field &rhs) const + { + return getDataType() == rhs.getDataType() && + getCollectionType() == rhs.getCollectionType(); + } + + bool operator==(const Field &rhs) const; + bool operator!=(const Field &rhs) const; + }; + + /** + * A representation of an index field with extra information on + * how the index should be generated. + **/ + class IndexField : public Field + { + bool _prefix; + bool _phrases; + bool _positions; + uint32_t _avgElemLen; + + public: + IndexField(const vespalib::stringref &name, DataType dt); + IndexField(const vespalib::stringref &name, DataType dt, + CollectionType ct); + /** + * Create this index field based on the given config lines. 
+ **/ + IndexField(const std::vector<vespalib::string> &lines); + + IndexField &setPrefix(bool value) { _prefix = value; return *this; } + IndexField &setPhrases(bool value) { _phrases = value; return *this; } + IndexField &setPositions(bool value) + { _positions = value; return *this; } + IndexField &setAvgElemLen(uint32_t avgElemLen) + { _avgElemLen = avgElemLen; return *this; } + + virtual void + write(vespalib::asciistream &os, + const vespalib::stringref &prefix) const; + + bool hasPrefix() const { return _prefix; } + bool hasPhrases() const { return _phrases; } + bool hasPositions() const { return _positions; } + uint32_t getAvgElemLen() const { return _avgElemLen; } + + bool operator==(const IndexField &rhs) const; + bool operator!=(const IndexField &rhs) const; + }; + + typedef Field AttributeField; + typedef Field SummaryField; + + /** + * A field collection has a name and a list of index field names, + * and is a named physical view over the list of index fields. + **/ + class FieldSet + { + vespalib::string _name; + std::vector<vespalib::string> _fields; + + public: + FieldSet(const vespalib::stringref & n) : _name(n), _fields() {} + + /** + * Create this field collection based on the given config lines. 
+ **/ + FieldSet(const std::vector<vespalib::string> & lines); + + FieldSet &addField(const vespalib::stringref &fieldName) { + _fields.push_back(fieldName); + return *this; + } + + const vespalib::string &getName() const { return _name; } + const std::vector<vespalib::string> &getFields() const + { return _fields; } + + bool operator==(const FieldSet &rhs) const; + bool operator!=(const FieldSet &rhs) const; + }; + + static const uint32_t UNKNOWN_FIELD_ID; + +private: + std::vector<IndexField> _indexFields; + std::vector<AttributeField> _attributeFields; + std::vector<SummaryField> _summaryFields; + std::vector<FieldSet> _fieldSets; + typedef vespalib::hash_map<vespalib::string, uint32_t> Name2IdMap; + Name2IdMap _indexIds; + Name2IdMap _attributeIds; + Name2IdMap _summaryIds; + Name2IdMap _fieldSetIds; + + void writeToStream(vespalib::asciistream &os) const; + +public: + /** + * Create an initially empty schema + **/ + Schema(); + + /** + * Load this schema from the file with the given name. + * + * @param fileName the name of the file. + * @return true if the schema could be loaded. + **/ + bool + loadFromFile(const vespalib::stringref & fileName); + + /** + * Save this schema to the file with the given name. + * + * @param fileName the name of the file. + * @return true if the schema could be saved. + **/ + bool + saveToFile(const vespalib::stringref & fileName) const; + + vespalib::string toString() const; + + /** + * Add an index field to this schema + * + * @param field the field to add + **/ + Schema & + addIndexField(const IndexField &field); + + // Only used by tests. 
+ Schema & + addUriIndexFields(const IndexField &field); + + /** + * Add an attribute field to this schema + * + * @param field the field to add + **/ + Schema & + addAttributeField(const AttributeField &field); + + /** + * Add a summary field to this schema + * + * @param field the field to add + **/ + Schema & + addSummaryField(const SummaryField &field); + + /** + * Add a field set to this schema. + * + * @param collection the field set to add. + **/ + Schema & + addFieldSet(const FieldSet &collection); + + /** + * Obtain the number of index fields in this schema. + * + * @return number of fields + **/ + uint32_t getNumIndexFields() const { return _indexFields.size(); } + + /** + * Obtain the number of attribute fields in this schema. + * + * @return number of fields + **/ + uint32_t getNumAttributeFields() const { return _attributeFields.size(); } + + /** + * Obtain the number of summary fields in this schema. + * + * @return number of fields + **/ + uint32_t getNumSummaryFields() const { return _summaryFields.size(); } + + /** + * Obtain the number of field sets in this schema. + * + * @return number of field sets. + **/ + uint32_t getNumFieldSets() const { return _fieldSets.size(); } + + /** + * Get information about a specific index field using the given fieldId. + * + * @return the field + * @param idx an index in the range [0, size - 1]. + **/ + const IndexField & + getIndexField(uint32_t fieldId) const + { + return _indexFields[fieldId]; + } + + /** + * Returns const view of the index fields. + */ + const std::vector<IndexField> &getIndexFields() const { + return _indexFields; + } + + /** + * Get the field id for the index field with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field. + **/ + uint32_t getIndexFieldId(const vespalib::stringref & name) const; + + /** + * Check if a field is an index + * + * @return true if field is an index field. + * @param name the name of the field. 
+ **/ + bool + isIndexField(const vespalib::stringref & name) const + { + return _indexIds.find(name) != _indexIds.end(); + } + + /** + * Check if a field is a summary field + * + * @return true if field is an summary field. + * @param name the name of the field. + **/ + bool + isSummaryField(const vespalib::stringref & name) const + { + return _summaryIds.find(name) != _summaryIds.end(); + } + /** + * Check if a field is a attribute field + * + * @return true if field is an attribute field. + * @param name the name of the field. + **/ + bool + isAttributeField(const vespalib::stringref & name) const + { + return _attributeIds.find(name) != _attributeIds.end(); + } + + /** + * Get information about a specific attribute field using the given fieldId. + * + * @return the field + * @param idx an index in the range [0, size - 1]. + **/ + const AttributeField & + getAttributeField(uint32_t fieldId) const + { + return _attributeFields[fieldId]; + } + + /** + * Returns const view of the attribute fields. + */ + const std::vector<AttributeField> &getAttributeFields() const { + return _attributeFields; + } + + /** + * Get the field id for the attribute field with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field. + **/ + uint32_t getAttributeFieldId(const vespalib::stringref & name) const; + + /** + * Get information about a specific summary field using the given fieldId. + * + * @return the field + * @param idx an index in the range [0, size - 1] + **/ + const SummaryField & + getSummaryField(uint32_t fieldId) const + { + return _summaryFields[fieldId]; + } + + /** + * Returns const view of the summary fields. + */ + const std::vector<SummaryField> &getSummaryFields() const { + return _summaryFields; + } + + /** + * Get the field id for the summary field with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field. 
+ **/ + uint32_t getSummaryFieldId(const vespalib::stringref & name) const; + + /** + * Get information about a specific field set + * + * @return the field set. + * @param idx an index in the range [0, size - 1]. + **/ + const FieldSet & + getFieldSet(uint32_t idx) const + { + return _fieldSets[idx]; + } + + /** + * Get the field id for the field set with the given name. + * + * @return the field id or UNKNOWN_FIELD_ID if not found. + * @param name the name of the field set. + **/ + uint32_t + getFieldSetId(const vespalib::stringref &name) const; + + void swap(Schema &rhs); + void clear(); + + Schema::UP getOldFields(fastos::TimeStamp limit_timestamp); + + static Schema::UP intersect(const Schema &lhs, const Schema &rhs); + static Schema::UP make_union(const Schema &lhs, const Schema &rhs); + static Schema::UP set_difference(const Schema &lhs, const Schema &rhs); + + bool operator==(const Schema &rhs) const; + bool operator!=(const Schema &rhs) const; + + bool empty() const; +}; + +} // namespace search::index +} // namespace search diff --git a/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.cpp b/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.cpp new file mode 100644 index 00000000000..34071e241d7 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.cpp @@ -0,0 +1,241 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchcommon/config/subscriptionproxyng.h> +#include <vespa/searchcommon/common/schemaconfigurer.h> + +LOG_SETUP(".index.schemaconfigurer"); + +using namespace config; +using namespace vespa::config::search; + +namespace search { +namespace index { + + +Schema::DataType +SchemaBuilder::convert(const IndexschemaConfig::Indexfield::Datatype &type) +{ + switch (type) { + case IndexschemaConfig::Indexfield::STRING: + return Schema::STRING; + case IndexschemaConfig::Indexfield::INT64: + return Schema::INT64; + case IndexschemaConfig::Indexfield::BOOLEANTREE: + return Schema::BOOLEANTREE; + } + return Schema::STRING; +} + + +Schema::CollectionType +SchemaBuilder::convert(const IndexschemaConfig::Indexfield::Collectiontype & type) +{ + switch (type) { + case IndexschemaConfig::Indexfield::SINGLE: + return Schema::SINGLE; + case IndexschemaConfig::Indexfield::ARRAY: + return Schema::ARRAY; + case IndexschemaConfig::Indexfield::WEIGHTEDSET: + return Schema::WEIGHTEDSET; + } + return Schema::SINGLE; +} + + +Schema::DataType +SchemaBuilder::convert(const AttributesConfig::Attribute::Datatype &type) +{ + switch (type) { + case AttributesConfig::Attribute::STRING: + return Schema::STRING; + case AttributesConfig::Attribute::UINT1: + return Schema::UINT1; + case AttributesConfig::Attribute::UINT2: + return Schema::UINT2; + case AttributesConfig::Attribute::UINT4: + return Schema::UINT4; + case AttributesConfig::Attribute::INT8: + return Schema::INT8; + case AttributesConfig::Attribute::INT16: + return Schema::INT16; + case AttributesConfig::Attribute::INT32: + return Schema::INT32; + case AttributesConfig::Attribute::INT64: + return Schema::INT64; + case AttributesConfig::Attribute::FLOAT: + return Schema::FLOAT; + case AttributesConfig::Attribute::DOUBLE: + return Schema::DOUBLE; + case AttributesConfig::Attribute::PREDICATE: + return Schema::BOOLEANTREE; + case AttributesConfig::Attribute::TENSOR: 
+ return Schema::TENSOR; + default: + break; + } + // TODO: exception? + return Schema::STRING; +} + + +Schema::CollectionType +SchemaBuilder::convert(const AttributesConfig::Attribute::Collectiontype &type) +{ + switch (type) { + case AttributesConfig::Attribute::SINGLE: + return Schema::SINGLE; + case AttributesConfig::Attribute::ARRAY: + return Schema::ARRAY; + case AttributesConfig::Attribute::WEIGHTEDSET: + return Schema::WEIGHTEDSET; + } + return Schema::SINGLE; +} + + +Schema::DataType +SchemaBuilder::convertSummaryType(const vespalib::string & type) +{ + if (type == "byte") { + return Schema::INT8; + } else if (type == "short") { + return Schema::INT16; + } else if (type == "integer") { + return Schema::INT32; + } else if (type == "int64") { + return Schema::INT64; + } else if (type == "float") { + return Schema::FLOAT; + } else if (type == "double") { + return Schema::DOUBLE; + } else if (type == "string" || + type == "longstring" || + type == "xmlstring" || + type == "featuredata" || + type == "jsonstring") + { + return Schema::STRING; + } else if (type == "data" || + type == "longdata") + { + return Schema::RAW; + } + return Schema::RAW; +} + + +void +SchemaBuilder::build(const IndexschemaConfig &cfg, Schema &schema) +{ + for (size_t i = 0; i < cfg.indexfield.size(); ++i) { + const IndexschemaConfig::Indexfield & f = cfg.indexfield[i]; + if ((f.datatype == IndexschemaConfig::Indexfield::BOOLEANTREE && + f.collectiontype == IndexschemaConfig::Indexfield::SINGLE) || + (f.indextype == IndexschemaConfig::Indexfield::RISE)) + { + LOG(warning, "Your field '%s' is a rise index. Those are no longer supported as of Vespa-5.89.\n" + " Redeploy and follow instructions to mitigate.", f.name.c_str()); + } else { + schema.addIndexField(Schema::IndexField(f.name, convert(f.datatype), + convert(f.collectiontype)). + setPrefix(f.prefix). + setPhrases(f.phrases). + setPositions(f.positions). 
+ setAvgElemLen(f.averageelementlen)); + } + } + for (size_t i = 0; i < cfg.fieldset.size(); ++i) { + const IndexschemaConfig::Fieldset &fs = cfg.fieldset[i]; + Schema::FieldSet toAdd(fs.name); + for (size_t j = 0; j < fs.field.size(); ++j) { + toAdd.addField(fs.field[j].name); + } + schema.addFieldSet(toAdd); + } +} + + +void +SchemaBuilder::build(const AttributesConfig &cfg, Schema &schema) +{ + for (size_t i = 0; i < cfg.attribute.size(); ++i) { + const AttributesConfig::Attribute & a = cfg.attribute[i]; + schema.addAttributeField(Schema::Field(a.name, + convert(a.datatype), + convert(a.collectiontype))); + } +} + + +void +SchemaBuilder::build(const SummaryConfig &cfg, Schema &schema) +{ + for (size_t i = 0; i < cfg.classes.size(); ++i) { + LOG(debug, "class with index %lu has id %d (default has id %d)", + i, cfg.classes[i].id, cfg.defaultsummaryid); + } + for (size_t i = 0; i < cfg.classes.size(); ++i) { + // use the default summary class that has all fields + if (cfg.classes[i].id == cfg.defaultsummaryid) { + for (size_t j = 0; j < cfg.classes[i].fields.size(); ++j) { + const SummaryConfig::Classes::Fields & f = + cfg.classes[i].fields[j]; + schema.addSummaryField(Schema::Field(f.name, + convertSummaryType(f.type))); + } + return; + } + } + if (cfg.classes.empty()) { + LOG(debug, + "No summary class configured that match the default summary id %d", + cfg.defaultsummaryid); + } else { + LOG(warning, + "No summary class configured that match the default summary id %d", + cfg.defaultsummaryid); + } +} + + +void +SchemaConfigurer::configure(const IndexschemaConfig &cfg) +{ + SchemaBuilder::build(cfg, _schema); +} + + +void +SchemaConfigurer::configure(const AttributesConfig &cfg) +{ + SchemaBuilder::build(cfg, _schema); +} + + +void +SchemaConfigurer::configure(const SummaryConfig & cfg) +{ + SchemaBuilder::build(cfg, _schema); +} + + +SchemaConfigurer::SchemaConfigurer(Schema &schema, + const vespalib::string &configId) + : _schema(schema) +{ + 
search::SubscriptionProxyNg<SchemaConfigurer, IndexschemaConfig> + indexSchemaSubscriber(*this, &SchemaConfigurer::configure); + search::SubscriptionProxyNg<SchemaConfigurer, AttributesConfig> + attributesSubscriber(*this, &SchemaConfigurer::configure); + search::SubscriptionProxyNg<SchemaConfigurer, SummaryConfig> + summarySubscriber(*this, &SchemaConfigurer::configure); + indexSchemaSubscriber.subscribe(configId.c_str()); + attributesSubscriber.subscribe(configId.c_str()); + summarySubscriber.subscribe(configId.c_str()); +} + + +} // namespace search::index +} // namespace search diff --git a/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.h b/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.h new file mode 100644 index 00000000000..3c63d13ed28 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/schemaconfigurer.h @@ -0,0 +1,78 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/config-attributes.h> +#include <vespa/config-indexschema.h> +#include <vespa/config-summary.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchcommon/attribute/collectiontype.h> +#include <vespa/searchcommon/attribute/basictype.h> + +namespace search { +namespace index { + +/** + * Schema class used to give a high-level description of the content + * of an index. 
+ **/ +class SchemaBuilder +{ + static Schema::DataType + convert(const vespa::config::search::IndexschemaConfig::Indexfield::Datatype &type); + + static Schema::CollectionType + convert(const vespa::config::search::IndexschemaConfig::Indexfield::Collectiontype &type); + + static Schema::DataType + convert(const vespa::config::search::AttributesConfig::Attribute::Datatype &type); + + static Schema::CollectionType + convert(const vespa::config::search::AttributesConfig::Attribute::Collectiontype &type); + + static Schema::DataType + convertSummaryType(const vespalib::string &type); +public: + /** + * Build from indexschema config. + * + * @param indexCfg vespa::config::search::IndexschemaConfig to use + */ + static void + build(const vespa::config::search::IndexschemaConfig &cfg, Schema &schema); + /** + * Build from attribute config. + * + * @param attributeCfg vespa::config::search::AttributesConfig to use + **/ + static void + build(const vespa::config::search::AttributesConfig &cfg, Schema &schema); + /** + * Build from summary config. + * + * @param summaryCfg vespa::config::search::SummaryConfig to use + **/ + static void + build(const vespa::config::search::SummaryConfig &cfg, Schema &schema); +}; + +class SchemaConfigurer +{ +private: + Schema & _schema; + void configure(const vespa::config::search::IndexschemaConfig & cfg); + void configure(const vespa::config::search::AttributesConfig & cfg); + void configure(const vespa::config::search::SummaryConfig & cfg); + +public: + /** + * Load this schema from config using the given config id. + * + * @param configId the config id used to retrieve the relevant config. 
+ **/ + SchemaConfigurer(Schema & schema, const vespalib::string &configId); +}; + +} // namespace search::index +} // namespace search + diff --git a/searchcommon/src/vespa/searchcommon/common/undefinedvalues.h b/searchcommon/src/vespa/searchcommon/common/undefinedvalues.h new file mode 100644 index 00000000000..dc33153dc10 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/common/undefinedvalues.h @@ -0,0 +1,72 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cmath> +#include <limits> +#include <vespa/vespalib/stllike/string.h> + +namespace search { +namespace attribute { + +// for all integers +template <typename T> +T getUndefined() { + return std::numeric_limits<T>::min(); +} + +template <> +inline float getUndefined<float>() { + return -std::numeric_limits<float>::quiet_NaN(); +} + +template <> +inline double getUndefined<double>() { + return -std::numeric_limits<double>::quiet_NaN(); +} + + +// for all signed integers +template <typename T> +bool isUndefined(const T & value) { + return value == getUndefined<T>(); +} + +template <> +inline bool isUndefined<uint8_t>(const uint8_t &) { + return false; +} + +template <> +inline bool isUndefined<uint16_t>(const uint16_t &) { + return false; +} + +template <> +inline bool isUndefined<uint32_t>(const uint32_t &) { + return false; +} + +template <> +inline bool isUndefined<uint64_t>(const uint64_t &) { + return false; +} + +template <> +inline bool isUndefined<float>(const float & value) { + return std::isnan(value); +} + +template <> +inline bool isUndefined<double>(const double & value) { + return std::isnan(value); +} + +template <> +inline bool isUndefined<vespalib::string>(const vespalib::string & value) { + return value.empty(); +} + +} +} + diff --git a/searchcommon/src/vespa/searchcommon/config/.gitignore b/searchcommon/src/vespa/searchcommon/config/.gitignore new file mode 100644 index 
00000000000..7e7c0fe7fae --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/config/.gitignore @@ -0,0 +1,2 @@ +/.depend +/Makefile diff --git a/searchcommon/src/vespa/searchcommon/config/CMakeLists.txt b/searchcommon/src/vespa/searchcommon/config/CMakeLists.txt new file mode 100644 index 00000000000..1f8034bc136 --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/config/CMakeLists.txt @@ -0,0 +1,6 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchcommon_config INTERFACE + SOURCES + INSTALL lib64 + DEPENDS +) diff --git a/searchcommon/src/vespa/searchcommon/config/subscriptionproxyng.h b/searchcommon/src/vespa/searchcommon/config/subscriptionproxyng.h new file mode 100644 index 00000000000..d2b5570770f --- /dev/null +++ b/searchcommon/src/vespa/searchcommon/config/subscriptionproxyng.h @@ -0,0 +1,63 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <vespa/config/helper/legacysubscriber.h> +#include <vespa/vespalib/stllike/string.h> + +namespace search { + +template <typename ME, typename CFG> +class SubscriptionProxyNg : public config::IFetcherCallback<CFG> +{ + typedef void (ME::*Method)(const CFG &cfg); + +private: + ME &_target; + Method _method; + config::LegacySubscriber *_subscriber; + vespalib::string _cfgId; + + SubscriptionProxyNg(const SubscriptionProxyNg&); + SubscriptionProxyNg &operator=(const SubscriptionProxyNg&); + +public: + SubscriptionProxyNg(ME &target, Method method) + : _target(target), + _method(method), + _subscriber(NULL), + _cfgId("") + { + } + virtual ~SubscriptionProxyNg() { + unsubscribe(); + } + const char *getConfigId() const { + return _cfgId.c_str(); + } + void subscribe(const char *configId) { + if (_subscriber != NULL) { + if (configId != NULL && strcmp(configId, _subscriber->id().c_str()) == 0) + { + return; // same id; ignore + } else { + unsubscribe(); + } + } + if (configId != NULL && configId[0] != '\0') { + _cfgId = configId; + _subscriber = new config::LegacySubscriber(); + _subscriber->subscribe<CFG>(configId, this); + } + } + void unsubscribe() { + delete _subscriber; + _subscriber = NULL; + _cfgId = ""; + } + virtual void configure(std::unique_ptr<CFG> cfg) { + (_target.*_method)(*cfg); + } +}; + +} // namespace search + diff --git a/searchcommon/testrun/.gitignore b/searchcommon/testrun/.gitignore new file mode 100644 index 00000000000..8f0724a7dba --- /dev/null +++ b/searchcommon/testrun/.gitignore @@ -0,0 +1,12 @@ +/test-report.html +/test-report.html.bottom +/test-report.html.entry +/test-report.html.summary +/test-report.html.top +test.*.*.desc +test.*.*.file.* +test.*.*.files.html +test.*.*.log +tmp.* +/test.*.*.result +/Makefile |