diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /searchsummary |
Publish
Diffstat (limited to 'searchsummary')
105 files changed, 9281 insertions, 0 deletions
diff --git a/searchsummary/.gitignore b/searchsummary/.gitignore new file mode 100644 index 00000000000..be0452bed21 --- /dev/null +++ b/searchsummary/.gitignore @@ -0,0 +1,4 @@ +/target +/pom.xml.build +Makefile +Testing diff --git a/searchsummary/CMakeLists.txt b/searchsummary/CMakeLists.txt new file mode 100644 index 00000000000..f3b57ec54f5 --- /dev/null +++ b/searchsummary/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_define_module( + DEPENDS + fastos + vespalog + vespalib + staging_vespalib + fnet + configdefinitions + fastlib_fast + document + config_cloudconfig + searchcommon + persistencetypes + metrics + searchlib + juniper + + LIBS + src/vespa/searchsummary + src/vespa/searchsummary/config + src/vespa/searchsummary/docsummary + + TESTS + src/tests/docsumformat + src/tests/docsummary + src/tests/docsummary/slime_summary + src/tests/extractkeywords +) diff --git a/searchsummary/OWNERS b/searchsummary/OWNERS new file mode 100644 index 00000000000..9673ef97e16 --- /dev/null +++ b/searchsummary/OWNERS @@ -0,0 +1,2 @@ +geirst +balder diff --git a/searchsummary/pom.xml b/searchsummary/pom.xml new file mode 100644 index 00000000000..0676ac78527 --- /dev/null +++ b/searchsummary/pom.xml @@ -0,0 +1,44 @@ +<!-- Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 + http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>com.yahoo.vespa</groupId> + <artifactId>parent</artifactId> + <version>6-SNAPSHOT</version> + <relativePath>../parent/pom.xml</relativePath> + </parent> + <artifactId>searchsummary</artifactId> + <version>6-SNAPSHOT</version> + <packaging>jar</packaging> + <name>${project.artifactId}</name> + <dependencies> + <dependency> + <groupId>com.yahoo.vespa</groupId> + <artifactId>config-lib</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <groupId>com.yahoo.vespa</groupId> + <artifactId>config-class-plugin</artifactId> + <version>${project.version}</version> + <configuration> + <defFilesDirectories>src/vespa/searchsummary/config/</defFilesDirectories> + </configuration> + <executions> + <execution> + <id>config-gen</id> + <goals> + <goal>config-gen</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> diff --git a/searchsummary/src/.gitignore b/searchsummary/src/.gitignore new file mode 100644 index 00000000000..47011ff3508 --- /dev/null +++ b/searchsummary/src/.gitignore @@ -0,0 +1,4 @@ +/Makefile.ini +/config_command.sh +/project.dsw +/searchsummary.mak diff --git a/searchsummary/src/testlist.txt b/searchsummary/src/testlist.txt new file mode 100644 index 00000000000..62ea27ae736 --- /dev/null +++ b/searchsummary/src/testlist.txt @@ -0,0 +1,4 @@ +tests/docsumformat +tests/docsummary +tests/docsummary/slime_summary +tests/extractkeywords diff --git a/searchsummary/src/tests/docsumformat/.gitignore b/searchsummary/src/tests/docsumformat/.gitignore new file mode 100644 index 00000000000..2c841cbd43d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/.gitignore @@ -0,0 +1,21 @@ +*.cfg 
+*.core +*.ilk +*.out +*.pdb +*.pid +.depend +Makefile +core +core.* +datapart.* +docsum-index +docsum-pack +docsum-parse +index.cf +merged +meta-info.txt +schema.txt +summary.cf +version.txt +searchsummary_docsum-pack_app diff --git a/searchsummary/src/tests/docsumformat/CMakeLists.txt b/searchsummary/src/tests/docsumformat/CMakeLists.txt new file mode 100644 index 00000000000..ac8d2151792 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_docsum-pack_app + SOURCES + docsum-pack.cpp + DEPENDS + searchsummary +) +vespa_add_test(NAME searchsummary_docsum-pack_app COMMAND searchsummary_docsum-pack_app) diff --git a/searchsummary/src/tests/docsumformat/docsum-index.sh b/searchsummary/src/tests/docsumformat/docsum-index.sh new file mode 100755 index 00000000000..0d313191685 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/docsum-index.sh @@ -0,0 +1,16 @@ +#!/bin/sh -e +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +findex=../../../bin/findex + +echo "CLEAN" +rm -f index.cf +rm -f summary.cf +rm -rf merged +rm -rf datapart.* + +echo "DOCSUM-INDEX" +./docsum-index + +echo "AUTOINDEX" +$findex autoindex diff --git a/searchsummary/src/tests/docsumformat/docsum-pack.cpp b/searchsummary/src/tests/docsumformat/docsum-pack.cpp new file mode 100644 index 00000000000..3f1b088bd12 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/docsum-pack.cpp @@ -0,0 +1,631 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("docsum-pack"); +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/searchsummary/docsummary/urlresult.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> +#include <vespa/searchsummary/docsummary/resultpacker.h> + +using namespace search::docsummary; + + +// needed to resolve external symbol from httpd.h on AIX +void FastS_block_usr2() {} + + +class MyApp : public FastOS_Application +{ +private: + bool _rc; + uint32_t _cnt; + search::docsummary::ResultConfig _config; + search::docsummary::ResultPacker _packer; + +public: + MyApp() + : _rc(false), + _cnt(0u), + _config(), + _packer(&_config) + { + } + + // log test results + void ReportTestResult(uint32_t line, bool rc); + bool RTR(uint32_t line, bool rc) + { ReportTestResult(line, rc); return rc; } + + // compare runtime info (,but ignore result class) + bool Equal(search::docsummary::ResEntry *a, search::docsummary::ResEntry *b); + bool Equal(search::docsummary::GeneralResult *a, search::docsummary::GeneralResult *b); + + void TestFieldIndex(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, int idx); + + void TestIntValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, uint32_t value); + + void TestDoubleValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, double value); + + void TestInt64Value(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, uint64_t value); + + void TestStringValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, const char *value); + + void TestDataValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, const char *value); + + void TestBasic(); + void TestFailLong(); + void TestFailShort(); + void TestFailOrder(); + void 
TestCompress(); + void TestCompat(); + void TestBasicInplace(); + void TestCompressInplace(); + + int Main(); +}; + + +void +MyApp::ReportTestResult(uint32_t line, bool rc) +{ + _cnt++; + + if (rc) { + LOG(info, "Test case %d: SUCCESS", _cnt); + } else { + LOG(error, "Test case %d: FAIL (see %s:%d)", _cnt, __FILE__, line); + _rc = false; + } +} + + +bool +MyApp::Equal(search::docsummary::ResEntry *a, search::docsummary::ResEntry *b) +{ + if (a->_type != b->_type) + return false; + + if (a->_intval != b->_intval) + return false; + + if (a->_type != RES_INT && + memcmp(a->_pt, b->_pt, a->_intval) != 0) + return false; + + return true; +} + + +bool +MyApp::Equal(search::docsummary::GeneralResult *a, search::docsummary::GeneralResult *b) +{ + uint32_t numEntries = a->GetClass()->GetNumEntries(); + + if (b->GetClass()->GetNumEntries() != numEntries) + return false; + + for (uint32_t i = 0; i < numEntries; i++) { + + if (!Equal(a->GetEntry(i), b->GetEntry(i))) + return false; + + if (a->GetClass()->GetEntry(i)->_bindname != b->GetClass()->GetEntry(i)->_bindname) + return false; + } + + return true; +} + + +void +MyApp::TestFieldIndex(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, int idx) +{ + bool rc = (gres != NULL && + gres->GetClass()->GetIndexFromName(field) == idx); + + RTR(line, rc); +} + + +void +MyApp::TestIntValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, uint32_t value) +{ + search::docsummary::ResEntry *entry + = (gres != NULL) ? gres->GetEntry(field) : NULL; + + bool rc = (entry != NULL && + entry->_type == RES_INT && + entry->_intval == value); + + RTR(line, rc); +} + + +void +MyApp::TestDoubleValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, double value) +{ + search::docsummary::ResEntry *entry + = (gres != NULL) ? 
gres->GetEntry(field) : NULL; + + bool rc = (entry != NULL && + entry->_type == RES_DOUBLE && + entry->_doubleval == value); + + RTR(line, rc); +} + + +void +MyApp::TestInt64Value(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, uint64_t value) +{ + search::docsummary::ResEntry *entry + = (gres != NULL) ? gres->GetEntry(field) : NULL; + + bool rc = (entry != NULL && + entry->_type == RES_INT64 && + entry->_int64val == value); + + RTR(line, rc); +} + + +void +MyApp::TestStringValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, const char *value) +{ + search::docsummary::ResEntry *entry + = (gres != NULL) ? gres->GetEntry(field) : NULL; + + bool rc = (entry != NULL && + entry->_type == RES_STRING && + entry->_stringlen == strlen(value) && + strncmp(entry->_stringval, value, entry->_stringlen) == 0); + + if (!rc && entry != NULL) { + LOG(warning, + "string value '%.*s' != '%s'", + (int) entry->_stringlen, + entry->_stringval, value); + } + + RTR(line, rc); +} + + +void +MyApp::TestDataValue(uint32_t line, search::docsummary::GeneralResult *gres, + const char *field, const char *value) +{ + search::docsummary::ResEntry *entry + = (gres != NULL) ? 
gres->GetEntry(field) : NULL; + + bool rc = (entry != NULL && + entry->_type == RES_DATA && + entry->_datalen == strlen(value) && + strncmp(entry->_dataval, value, entry->_datalen) == 0); + + RTR(line, rc); +} + + +void +MyApp::TestBasic() +{ + const char *buf; + uint32_t buflen; + + search::docsummary::urlresult *res; + search::docsummary::GeneralResult *gres; + + uint32_t intval = 4; + uint16_t shortval = 2; + uint8_t byteval = 1; + float floatval = 4.5; + double doubleval = 8.75; + uint64_t int64val = 8; + const char *strval = "This is a string"; + const char *datval = "This is data"; + const char *lstrval = "This is a long string"; + const char *ldatval = "This is long data"; + + RTR(__LINE__, _packer.Init(0)); + RTR(__LINE__, _packer.AddInteger(intval)); + RTR(__LINE__, _packer.AddShort(shortval)); + RTR(__LINE__, _packer.AddByte(byteval)); + RTR(__LINE__, _packer.AddFloat(floatval)); + RTR(__LINE__, _packer.AddDouble(doubleval)); + RTR(__LINE__, _packer.AddInt64(int64val)); + RTR(__LINE__, _packer.AddString(strval, strlen(strval))); + RTR(__LINE__, _packer.AddData(datval, strlen(datval))); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, _packer.AddLongData(ldatval, strlen(ldatval))); + RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + + res = _config.Unpack(0, 0, 0, buf, buflen); + gres = res->IsGeneral() ? 
(search::docsummary::GeneralResult *) res : NULL; + + RTR(__LINE__, gres != NULL); + TestIntValue (__LINE__, gres, "integer", 4); + TestIntValue (__LINE__, gres, "short", 2); + TestIntValue (__LINE__, gres, "byte", 1); + TestDoubleValue(__LINE__, gres, "float", floatval); + TestDoubleValue(__LINE__, gres, "double", doubleval); + TestInt64Value (__LINE__, gres, "int64", int64val); + TestStringValue(__LINE__, gres, "string", strval); + TestDataValue (__LINE__, gres, "data", datval); + TestStringValue(__LINE__, gres, "longstring", lstrval); + TestDataValue (__LINE__, gres, "longdata", ldatval); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetNumEntries() == 10)); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetClassID() == 0)); + delete res; +} + + +void +MyApp::TestFailLong() +{ + const char *buf; + uint32_t buflen; + + uint32_t intval = 4; + uint16_t shortval = 2; + uint8_t byteval = 1; + float floatval = 4.5; + double doubleval = 8.75; + uint64_t int64val = 8; + const char *strval = "This is a string"; + const char *datval = "This is data"; + const char *lstrval = "This is a long string"; + const char *ldatval = "This is long data"; + + RTR(__LINE__, _packer.Init(0)); + RTR(__LINE__, _packer.AddInteger(intval)); + RTR(__LINE__, _packer.AddShort(shortval)); + RTR(__LINE__, _packer.AddByte(byteval)); + RTR(__LINE__, _packer.AddFloat(floatval)); + RTR(__LINE__, _packer.AddDouble(doubleval)); + RTR(__LINE__, _packer.AddInt64(int64val)); + RTR(__LINE__, _packer.AddString(strval, strlen(strval))); + RTR(__LINE__, _packer.AddData(datval, strlen(datval))); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, _packer.AddLongData(ldatval, strlen(ldatval))); + RTR(__LINE__, !_packer.AddByte(byteval)); + RTR(__LINE__, !_packer.GetDocsumBlob(&buf, &buflen)); +} + + +void +MyApp::TestFailShort() +{ + const char *buf; + uint32_t buflen; + + uint32_t intval = 4; + uint16_t shortval = 2; + uint8_t byteval = 1; + float floatval = 4.5; + 
double doubleval = 8.75; + uint64_t int64val = 8; + const char *strval = "This is a string"; + const char *datval = "This is data"; + const char *lstrval = "This is a long string"; + + RTR(__LINE__, _packer.Init(0)); + RTR(__LINE__, _packer.AddInteger(intval)); + RTR(__LINE__, _packer.AddShort(shortval)); + RTR(__LINE__, _packer.AddByte(byteval)); + RTR(__LINE__, _packer.AddFloat(floatval)); + RTR(__LINE__, _packer.AddDouble(doubleval)); + RTR(__LINE__, _packer.AddInt64(int64val)); + RTR(__LINE__, _packer.AddString(strval, strlen(strval))); + RTR(__LINE__, _packer.AddData(datval, strlen(datval))); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, !_packer.GetDocsumBlob(&buf, &buflen)); +} + + +void +MyApp::TestFailOrder() +{ + const char *buf; + uint32_t buflen; + + uint32_t intval = 4; + uint16_t shortval = 2; + uint8_t byteval = 1; + float floatval = 4.5; + double doubleval = 8.75; + uint64_t int64val = 8; + const char *strval = "This is a string"; + const char *datval = "This is data"; + const char *lstrval = "This is a long string"; + const char *ldatval = "This is long data"; + + RTR(__LINE__, _packer.Init(0)); + RTR(__LINE__, _packer.AddInteger(intval)); + RTR(__LINE__, _packer.AddShort(shortval)); + RTR(__LINE__, !_packer.AddString(strval, strlen(strval))); + RTR(__LINE__, !_packer.AddByte(byteval)); + RTR(__LINE__, !_packer.AddFloat(floatval)); + RTR(__LINE__, !_packer.AddDouble(doubleval)); + RTR(__LINE__, !_packer.AddInt64(int64val)); + RTR(__LINE__, !_packer.AddData(datval, strlen(datval))); + RTR(__LINE__, !_packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, !_packer.AddLongData(ldatval, strlen(ldatval))); + RTR(__LINE__, !_packer.GetDocsumBlob(&buf, &buflen)); +} + + +void +MyApp::TestCompress() +{ + const char *buf; + uint32_t buflen; + + search::docsummary::urlresult *res; + search::docsummary::GeneralResult *gres; + + const char *lstrval = "string string string"; + const char *ldatval = "data data data"; 
+ + RTR(__LINE__, _packer.Init(2)); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, _packer.AddLongData(ldatval, strlen(ldatval))); + RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + + res = _config.Unpack(0, 0, 0, buf, buflen); + gres = res->IsGeneral() ? (search::docsummary::GeneralResult *) res : NULL; + + RTR(__LINE__, gres != NULL); + TestStringValue(__LINE__, gres, "text", lstrval); + TestDataValue (__LINE__, gres, "data", ldatval); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetNumEntries() == 2)); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetClassID() == 2)); + delete res; +} + + +void +MyApp::TestCompat() +{ + const char *buf; + uint32_t buflen; + + search::docsummary::urlresult *res1; + search::docsummary::GeneralResult *gres1; + + search::docsummary::urlresult *res2; + search::docsummary::GeneralResult *gres2; + + const char *strval = "string string string string"; + const char *datval = "data data data data"; + + RTR(__LINE__, _packer.Init(1)); + RTR(__LINE__, _packer.AddData(strval, strlen(strval))); + RTR(__LINE__, _packer.AddString(datval, strlen(datval))); + RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + res1 = _config.Unpack(0, 0, 0, buf, buflen); + gres1 = res1->IsGeneral() ? (search::docsummary::GeneralResult *) res1 : NULL; + + RTR(__LINE__, _packer.Init(2)); + RTR(__LINE__, _packer.AddLongData(strval, strlen(strval))); + RTR(__LINE__, _packer.AddLongString(datval, strlen(datval))); + RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + res2 = _config.Unpack(0, 0, 0, buf, buflen); + gres2 = res2->IsGeneral() ? 
(search::docsummary::GeneralResult *) res2 : NULL; + + RTR(__LINE__, gres1 != NULL); + RTR(__LINE__, gres2 != NULL); + + TestStringValue(__LINE__, gres1, "text", strval); + TestDataValue (__LINE__, gres1, "data", datval); + TestFieldIndex (__LINE__, gres1, "text", 0); + TestFieldIndex (__LINE__, gres1, "data", 1); + RTR(__LINE__, (gres1 != NULL && + gres1->GetClass()->GetNumEntries() == 2)); + + TestStringValue(__LINE__, gres2, "text", strval); + TestDataValue (__LINE__, gres2, "data", datval); + TestFieldIndex (__LINE__, gres2, "text", 0); + TestFieldIndex (__LINE__, gres2, "data", 1); + RTR(__LINE__, (gres2 != NULL && + gres2->GetClass()->GetNumEntries() == 2)); + + RTR(__LINE__, (gres1 != NULL && + gres1->GetClass()->GetClassID() == 1)); + RTR(__LINE__, (gres2 != NULL && + gres2->GetClass()->GetClassID() == 2)); + + RTR(__LINE__, (gres1 != NULL && gres2 != NULL && + Equal(gres1, gres2))); + + delete res1; + delete res2; +} + + +void +MyApp::TestBasicInplace() +{ + const char *buf; + uint32_t buflen; + + const search::docsummary::ResultClass *resClass; + search::docsummary::GeneralResult *gres; + + uint32_t intval = 4; + uint16_t shortval = 2; + uint8_t byteval = 1; + float floatval = 4.5; + double doubleval = 8.75; + uint64_t int64val = 8; + const char *strval = "This is a string"; + const char *datval = "This is data"; + const char *lstrval = "This is a long string"; + const char *ldatval = "This is long data"; + + RTR(__LINE__, _packer.Init(0)); + RTR(__LINE__, _packer.AddInteger(intval)); + RTR(__LINE__, _packer.AddShort(shortval)); + RTR(__LINE__, _packer.AddByte(byteval)); + RTR(__LINE__, _packer.AddFloat(floatval)); + RTR(__LINE__, _packer.AddDouble(doubleval)); + RTR(__LINE__, _packer.AddInt64(int64val)); + RTR(__LINE__, _packer.AddString(strval, strlen(strval))); + RTR(__LINE__, _packer.AddData(datval, strlen(datval))); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, _packer.AddLongData(ldatval, strlen(ldatval))); + 
RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + + resClass = _config.LookupResultClass(_config.GetClassID(buf, buflen)); + if (resClass == NULL) { + gres = NULL; + } else { + DocsumStoreValue value(buf, buflen); + gres = new search::docsummary::GeneralResult(resClass, 0, 0, 0); + if (!gres->inplaceUnpack(value)) { + delete gres; + gres = NULL; + } + } + + RTR(__LINE__, gres != NULL); + TestIntValue (__LINE__, gres, "integer", 4); + TestIntValue (__LINE__, gres, "short", 2); + TestIntValue (__LINE__, gres, "byte", 1); + TestDoubleValue(__LINE__, gres, "float", floatval); + TestDoubleValue(__LINE__, gres, "double", doubleval); + TestInt64Value (__LINE__, gres, "int64", int64val); + TestStringValue(__LINE__, gres, "string", strval); + TestDataValue (__LINE__, gres, "data", datval); + TestStringValue(__LINE__, gres, "longstring", lstrval); + TestDataValue (__LINE__, gres, "longdata", ldatval); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetNumEntries() == 10)); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetClassID() == 0)); + delete gres; +} + + +void +MyApp::TestCompressInplace() +{ + const char *buf; + uint32_t buflen; + + search::RawBuf field1(32768); + search::RawBuf field2(32768); + const search::docsummary::ResultClass *resClass; + search::docsummary::GeneralResult *gres; + + const char *lstrval = "string string string"; + const char *ldatval = "data data data"; + + RTR(__LINE__, _packer.Init(2)); + RTR(__LINE__, _packer.AddLongString(lstrval, strlen(lstrval))); + RTR(__LINE__, _packer.AddLongData(ldatval, strlen(ldatval))); + RTR(__LINE__, _packer.GetDocsumBlob(&buf, &buflen)); + + resClass = _config.LookupResultClass(_config.GetClassID(buf, buflen)); + if (resClass == NULL) { + gres = NULL; + } else { + DocsumStoreValue value(buf, buflen); + gres = new search::docsummary::GeneralResult(resClass, 0, 0, 0); + if (!gres->inplaceUnpack(value)) { + delete gres; + gres = NULL; + } + } + + search::docsummary::ResEntry *e1 = (gres == NULL) ? 
NULL : gres->GetEntry("text"); + search::docsummary::ResEntry *e2 = (gres == NULL) ? NULL : gres->GetEntry("data"); + + if (e1 != NULL) + e1->_extract_field(&field1); + if (e2 != NULL) + e2->_extract_field(&field2); + + RTR(__LINE__, gres != NULL); + RTR(__LINE__, e1 != NULL); + RTR(__LINE__, e2 != NULL); + RTR(__LINE__, strcmp(field1.GetDrainPos(), lstrval) == 0); + RTR(__LINE__, strcmp(field2.GetDrainPos(), ldatval) == 0); + RTR(__LINE__, strlen(lstrval) == field1.GetUsedLen()); + RTR(__LINE__, strlen(ldatval) == field2.GetUsedLen()); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetNumEntries() == 2)); + RTR(__LINE__, (gres != NULL && + gres->GetClass()->GetClassID() == 2)); + delete gres; +} + + + +int +MyApp::Main() +{ + _rc = true; + _cnt = 0; + + search::docsummary::ResultClass *resClass; + + resClass = _config.AddResultClass("c0", 0); + resClass->AddConfigEntry("integer", RES_INT); + resClass->AddConfigEntry("short", RES_SHORT); + resClass->AddConfigEntry("byte", RES_BYTE); + resClass->AddConfigEntry("float", RES_FLOAT); + resClass->AddConfigEntry("double", RES_DOUBLE); + resClass->AddConfigEntry("int64", RES_INT64); + resClass->AddConfigEntry("string", RES_STRING); + resClass->AddConfigEntry("data", RES_DATA); + resClass->AddConfigEntry("longstring", RES_LONG_STRING); + resClass->AddConfigEntry("longdata", RES_LONG_DATA); + + resClass = _config.AddResultClass("c1", 1); + resClass->AddConfigEntry("text", RES_STRING); + resClass->AddConfigEntry("data", RES_DATA); + + resClass = _config.AddResultClass("c2", 2); + resClass->AddConfigEntry("text", RES_LONG_STRING); + resClass->AddConfigEntry("data", RES_LONG_DATA); + + TestBasic(); + TestFailLong(); + TestFailShort(); + TestFailOrder(); + TestCompress(); + TestCompat(); + TestBasicInplace(); + TestCompressInplace(); + + LOG(info, "CONCLUSION: %s", (_rc) ? "SUCCESS" : "FAIL"); + return (_rc ? 
0 : 1); +} + + +int +main(int argc, char **argv) +{ + MyApp myapp; + return myapp.Entry(argc, argv); +} diff --git a/searchsummary/src/tests/docsumformat/docsum-parse.cpp b/searchsummary/src/tests/docsumformat/docsum-parse.cpp new file mode 100644 index 00000000000..5fa7009464c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/docsum-parse.cpp @@ -0,0 +1,201 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("docsum-parse"); +#include <vespa/fnet/frt/frt.h> +#include <vespa/fastlib/io/bufferedfile.h> +#include <vespa/searchsummary/docsummary/urlresult.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> + + +// needed to resolve external symbol from httpd.h on AIX +void FastS_block_usr2() {} + + +class MyApp : public FastOS_Application +{ +public: + bool Equal(search::docsummary::ResConfigEntry *a, search::docsummary::ResConfigEntry *b); + bool Equal(search::docsummary::ResultClass *a, search::docsummary::ResultClass *b); + bool Equal(search::docsummary::ResultConfig *a, search::docsummary::ResultConfig *b); + bool TestCorrect(const char *dirname, const char *filename); + bool TestIncorrect(const char *dirname, const char *filename); + int Main(); +}; + + +bool +MyApp::Equal(search::docsummary::ResConfigEntry *a, search::docsummary::ResConfigEntry *b) +{ + return ((a->_type == b->_type) + && (strcmp(a->_bindname, b->_bindname) == 0)); +} + + +bool +MyApp::Equal(search::docsummary::ResultClass *a, search::docsummary::ResultClass *b) +{ + bool rc = true; + + rc = rc && (a->GetNumEntries() == b->GetNumEntries()); + rc = rc && (a->GetClassID() == b->GetClassID()); + rc = rc && (strcmp(a->GetClassName(), b->GetClassName()) == 0); + + for (uint32_t i = 0; rc && i < a->GetNumEntries(); i++) { + rc = rc && 
Equal(a->GetEntry(i), b->GetEntry(i)); + } + + return rc; +} + + +bool +MyApp::Equal(search::docsummary::ResultConfig *a, search::docsummary::ResultConfig *b) +{ + bool rc = true; + + search::docsummary::ResultClass *resClassA; + search::docsummary::ResultClass *resClassB; + + rc = rc && (a->GetNumResultClasses() == b->GetNumResultClasses()); + + resClassA = a->GetResultClasses(); + resClassB = b->GetResultClasses(); + + while(rc && resClassA != NULL && resClassB != NULL) { + rc = rc && Equal(resClassA, resClassB); + resClassA = resClassA->GetNextClass(); + resClassB = resClassB->GetNextClass(); + } + rc = rc && (resClassA == NULL); + rc = rc && (resClassB == NULL); + + return rc; +} + + +bool +MyApp::TestCorrect(const char *dirname, const char *filename) +{ + char str1[512]; // test input file + char str2[512]; // test output file + char str3[512]; // summary.cf verification file + + search::docsummary::ResultConfig a; + search::docsummary::ResultConfig b; + search::docsummary::ResultConfig c; + search::docsummary::ResultConfig d; + + sprintf(str1, "%s%s%s", dirname, + FastOS_FileInterface::GetPathSeparator(), filename); + sprintf(str2, "%s%sout.%s", dirname, + FastOS_FileInterface::GetPathSeparator(), filename); + sprintf(str3, "%s%sOK.%s", dirname, + FastOS_FileInterface::GetPathSeparator(), filename); + + if (!a.ReadConfig(str1)) { + LOG(error, "could not read config from : %s", str1); + return false; + } + + if (!a.WriteConfig(str2)) { + LOG(error, "could not write config to : %s", str2); + return false; + } + + if (!b.ReadConfig(str2)) { + LOG(error, "could not read config from : %s", str2); + return false; + } + + if (!c.ReadConfig(str3)) { + LOG(error, "could not read config from : %s", str3); + return false; + } + + if (!Equal(&a, &b)) { + LOG(error, "%s and %s does not contain the same config", str1, str2); + return false; + } + + if (!Equal(&a, &c)) { + LOG(error, "%s and %s does not contain the same config", str1, str3); + return false; + } + + if 
(!Equal(&b, &c)) { + LOG(error, "%s and %s does not contain the same config", str2, str3); + return false; + } + + FRT_RPCRequest *req = new FRT_RPCRequest(); + assert(req != NULL); + c.GetConfig(req); + d.SetConfig(req); + if (!Equal(&c, &d)) { + LOG(error, "RPC get/set failed (%s)", str3); + req->SubRef(); + return false; + } + req->SubRef(); + + return true; +} + + +bool +MyApp::TestIncorrect(const char *dirname, const char *filename) +{ + char str[512]; + + sprintf(str, "%s%s%s", dirname, + FastOS_FileInterface::GetPathSeparator(), filename); + + search::docsummary::ResultConfig resConfig; + + if (resConfig.ReadConfig(str)) { + LOG(error, "'%s' did not give parse error", str); + return false; + } + return true; +} + + +int +MyApp::Main() +{ + bool rc = true; + + FastOS_DirectoryScan dirScan("parsetest"); + LOG(info, "looking for input files in 'parsetest'..."); + while (dirScan.ReadNext()) { + if (strncmp(dirScan.GetName(), "correct.", 8) == 0) { + if (TestCorrect("parsetest", dirScan.GetName())) { + LOG(info, "'%s' : positive test PASSED", dirScan.GetName()); + } else { + LOG(error, "'%s' : positive test FAILED", dirScan.GetName()); + rc = false; + } + } else if (strncmp(dirScan.GetName(), "incorrect.", 10) == 0) { + if (TestIncorrect("parsetest", dirScan.GetName())) { + LOG(info, "'%s' : negative test PASSED", dirScan.GetName()); + } else { + LOG(error, "'%s' : negative test FAILED", dirScan.GetName()); + rc = false; + } + } + } + return (rc ? 0 : 1); +} + + +int +main(int argc, char **argv) +{ + MyApp myapp; + return myapp.Entry(argc, argv); +} diff --git a/searchsummary/src/tests/docsumformat/dotest.sh b/searchsummary/src/tests/docsumformat/dotest.sh new file mode 100755 index 00000000000..64097b0061d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/dotest.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +echo "running pack test..." 
+./docsum-pack > packtest.out 2>&1 +res=$? +if [ $res -eq 0 ]; then + echo "pack test PASSED" +else + echo "pack test FAILED!" + echo "please check packtest.out" + exit 1 +fi diff --git a/searchsummary/src/tests/docsumformat/parsetest/.gitignore b/searchsummary/src/tests/docsumformat/parsetest/.gitignore new file mode 100644 index 00000000000..19815d313ff --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/.gitignore @@ -0,0 +1,2 @@ +*.out +out.* diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.1 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.1 new file mode 100644 index 00000000000..8238b53f81c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.1 @@ -0,0 +1,17 @@ +idtype none + +class default id 0 +field URL type string +field TITLE type string +field TEASER type string +field DSHOST type integer +field DSKEY type integer +field BYTES type integer +field WORDS type integer +field MODDATE type integer +field CRAWLDATE type integer +field LANG1 type byte +field LANG2 type byte +field LANG3 type byte +field LANG4 type byte +field CHARSET type integer diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.2 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.2 new file mode 100644 index 00000000000..8996c2dac4c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.2 @@ -0,0 +1,14 @@ +idtype byte + +class document id 1 +field title type string +field teaser type string +field url type string +field date type integer + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.3 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.3 new file mode 100644 index 00000000000..ae29fa40335 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.3 @@ -0,0 +1,5 @@ +idtype 
none + +class default id 0 +field TITLE type string +field DATE type integer diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.4 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.4 new file mode 100644 index 00000000000..8238b53f81c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.4 @@ -0,0 +1,17 @@ +idtype none + +class default id 0 +field URL type string +field TITLE type string +field TEASER type string +field DSHOST type integer +field DSKEY type integer +field BYTES type integer +field WORDS type integer +field MODDATE type integer +field CRAWLDATE type integer +field LANG1 type byte +field LANG2 type byte +field LANG3 type byte +field LANG4 type byte +field CHARSET type integer diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.5 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.5 new file mode 100644 index 00000000000..6b6dc874a68 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.5 @@ -0,0 +1,3 @@ +idtype byte + +class myclass id 42 diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.6 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.6 new file mode 100644 index 00000000000..38416fdf45e --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.6 @@ -0,0 +1,5 @@ +idtype none + +class default id 0 +field TEASER type longstring +field DOCTEXT type longdata diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.7 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.7 new file mode 100644 index 00000000000..d1f17d25141 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.7 @@ -0,0 +1,11 @@ +idtype short + +class class_1 id 1 +field title type string +field rawteaser type data +field doctext type longdata +field dynteaser type longstring + +class class_2 id 2 +field title type string +field rawteaser type longdata diff --git 
a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.8 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.8 new file mode 100644 index 00000000000..e929b872a05 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.8 @@ -0,0 +1,7 @@ +idtype integer + +class class_50 id 50 +field title type data + +class class_100 id 100 +field title type string diff --git a/searchsummary/src/tests/docsumformat/parsetest/OK.correct.9 b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.9 new file mode 100644 index 00000000000..668505be77d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/OK.correct.9 @@ -0,0 +1,13 @@ +idtype none + +class default id 0 +field f0 type integer +field f1 type short +field f2 type byte +field f3 type float +field f4 type double +field f5 type int64 +field f6 type string +field f7 type data +field f8 type longstring +field f9 type longdata diff --git a/searchsummary/src/tests/docsumformat/parsetest/README b/searchsummary/src/tests/docsumformat/parsetest/README new file mode 100644 index 00000000000..2de83e1b0cb --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/README @@ -0,0 +1,24 @@ +The files in this directory are used to test the parsing of document +summary format config files. The files are named by the following +rules: + +incorrect.* : these files are incorrect; loading them should fail. + +correct.* : these files are correct; loading them should succeed. + +OK.correct.* : these files contain normalized config on 'summary.cf' + format that matches the config contained in + the corresponding 'correct.*' files. + +The 'docsum-parse' program loops through all files in this +directory. For each file that has a name beginning with 'incorrect.', +it checks that loading document summary format config from it +fails. For each file that has a name beginning with 'correct.', it +checks that document summary format config may be read from the +file. 
It then writes the config back to a file named 'out.correct.<>', +reads the newly generated file back in, reads the corresponding +'OK.correct.<>' file and checks that all 3 configs are exactly the +same. + +New tests may be added simply by adding files conforming to the above +rules to this directory. diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.1 b/searchsummary/src/tests/docsumformat/parsetest/correct.1 new file mode 100644 index 00000000000..0b3d57b7f9c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.1 @@ -0,0 +1,14 @@ +STRING URL +STRING TITLE +STRING TEASER +INT DSHOST +INT DSKEY +INT BYTES +INT WORDS +INT MODDATE +INT CRAWLDATE +BYTE LANG1 +BYTE LANG2 +BYTE LANG3 +BYTE LANG4 +INT CHARSET diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.2 b/searchsummary/src/tests/docsumformat/parsetest/correct.2 new file mode 100644 index 00000000000..8996c2dac4c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.2 @@ -0,0 +1,14 @@ +idtype byte + +class document id 1 +field title type string +field teaser type string +field url type string +field date type integer + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.3 b/searchsummary/src/tests/docsumformat/parsetest/correct.3 new file mode 100644 index 00000000000..8a16e3f3fd1 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.3 @@ -0,0 +1,3 @@ +idtype byte +STRING TITLE +INT DATE diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.4 b/searchsummary/src/tests/docsumformat/parsetest/correct.4 new file mode 100644 index 00000000000..8238b53f81c --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.4 @@ -0,0 +1,17 @@ +idtype none + +class default id 0 +field URL type string +field TITLE type string +field TEASER type
string +field DSHOST type integer +field DSKEY type integer +field BYTES type integer +field WORDS type integer +field MODDATE type integer +field CRAWLDATE type integer +field LANG1 type byte +field LANG2 type byte +field LANG3 type byte +field LANG4 type byte +field CHARSET type integer diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.5 b/searchsummary/src/tests/docsumformat/parsetest/correct.5 new file mode 100644 index 00000000000..d179537e208 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.5 @@ -0,0 +1,4 @@ +idtype byte +class myclass id 42 +STRING TITLE +INT DATE diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.6 b/searchsummary/src/tests/docsumformat/parsetest/correct.6 new file mode 100644 index 00000000000..a4e41ec72d8 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.6 @@ -0,0 +1,2 @@ +LONGSTRING TEASER +LONGDATA DOCTEXT diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.7 b/searchsummary/src/tests/docsumformat/parsetest/correct.7 new file mode 100644 index 00000000000..d1f17d25141 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.7 @@ -0,0 +1,11 @@ +idtype short + +class class_1 id 1 +field title type string +field rawteaser type data +field doctext type longdata +field dynteaser type longstring + +class class_2 id 2 +field title type string +field rawteaser type longdata diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.8 b/searchsummary/src/tests/docsumformat/parsetest/correct.8 new file mode 100644 index 00000000000..e929b872a05 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.8 @@ -0,0 +1,7 @@ +idtype integer + +class class_50 id 50 +field title type data + +class class_100 id 100 +field title type string diff --git a/searchsummary/src/tests/docsumformat/parsetest/correct.9 b/searchsummary/src/tests/docsumformat/parsetest/correct.9 new file mode 100644 index 
00000000000..668505be77d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/correct.9 @@ -0,0 +1,13 @@ +idtype none + +class default id 0 +field f0 type integer +field f1 type short +field f2 type byte +field f3 type float +field f4 type double +field f5 type int64 +field f6 type string +field f7 type data +field f8 type longstring +field f9 type longdata diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.1 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.1 new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.1 diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.2 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.2 new file mode 100644 index 00000000000..600380f898d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.2 @@ -0,0 +1,14 @@ +idtype int + +class document id 1 +field title type string +field teaser type string +field url type string +field date type integer + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.3 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.3 new file mode 100644 index 00000000000..35d46b73f96 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.3 @@ -0,0 +1,14 @@ +idtype byte + +class document id 1 +field title type string +field teaser type string +field url type string +field date type integer + +class image id 1 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.4 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.4 new file mode 100644 index 00000000000..f50c143b4be --- /dev/null +++ 
b/searchsummary/src/tests/docsumformat/parsetest/incorrect.4 @@ -0,0 +1,14 @@ +idtype byte + +class document id 1 +field title type string +field teaser type string +field url type string +field date type int + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.5 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.5 new file mode 100644 index 00000000000..6579c30a29d --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.5 @@ -0,0 +1,14 @@ +idtype byte + +class document id 1 +field title type string +field teaser type string +field url type string +field url type integer + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.6 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.6 new file mode 100644 index 00000000000..2ce1ab9507e --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.6 @@ -0,0 +1,13 @@ +idtype byte + +field title type string +field teaser type string +field url type string +field date type integer + +class image id 2 +field title type string +field date type integer +field width type short +field height type short +field bitmaps type byte diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.7 b/searchsummary/src/tests/docsumformat/parsetest/incorrect.7 new file mode 100644 index 00000000000..e51bb1d2d48 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.7 @@ -0,0 +1,14 @@ +STRING URL +STRING TITLE +STRING TITLE +INT DSHOST +INT DSKEY +INT BYTES +INT WORDS +INT MODDATE +INT CRAWLDATE +BYTE LANG1 +BYTE LANG2 +BYTE LANG3 +BYTE LANG4 +INT CHARSET diff --git a/searchsummary/src/tests/docsumformat/parsetest/incorrect.8 
b/searchsummary/src/tests/docsumformat/parsetest/incorrect.8 new file mode 100644 index 00000000000..7639557b734 --- /dev/null +++ b/searchsummary/src/tests/docsumformat/parsetest/incorrect.8 @@ -0,0 +1,2 @@ +idtype byte +STRING TITLE diff --git a/searchsummary/src/tests/docsummary/.gitignore b/searchsummary/src/tests/docsummary/.gitignore new file mode 100644 index 00000000000..3f0be20ca74 --- /dev/null +++ b/searchsummary/src/tests/docsummary/.gitignore @@ -0,0 +1,4 @@ +*_test +.depend +Makefile +searchsummary_positionsdfw_test_app diff --git a/searchsummary/src/tests/docsummary/CMakeLists.txt b/searchsummary/src/tests/docsummary/CMakeLists.txt new file mode 100644 index 00000000000..7eaa78d923e --- /dev/null +++ b/searchsummary/src/tests/docsummary/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_positionsdfw_test_app + SOURCES + positionsdfw_test.cpp + DEPENDS + searchsummary +) +vespa_add_test(NAME searchsummary_positionsdfw_test_app COMMAND searchsummary_positionsdfw_test_app) diff --git a/searchsummary/src/tests/docsummary/positionsdfw_test.cpp b/searchsummary/src/tests/docsummary/positionsdfw_test.cpp new file mode 100644 index 00000000000..59f91e12ef7 --- /dev/null +++ b/searchsummary/src/tests/docsummary/positionsdfw_test.cpp @@ -0,0 +1,142 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Unit tests for positionsdfw. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("positionsdfw_test"); + +#include <vespa/searchlib/attribute/extendableattributes.h> +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/searchsummary/docsummary/positionsdfw.h> +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/vespalib/testkit/testapp.h> + +using search::RawBuf; +using search::IAttributeManager; +using search::SingleInt64ExtAttribute; +using search::attribute::IAttributeContext; +using search::attribute::IAttributeVector; +using vespalib::string; +using std::vector; + +namespace search { +namespace docsummary { + +namespace { + +class Test : public vespalib::TestApp { + void requireThat2DPositionFieldIsWritten(); + +public: + int Main(); +}; + +int +Test::Main() +{ + TEST_INIT("positionsdfw_test"); + + TEST_DO(requireThat2DPositionFieldIsWritten()); + + TEST_DONE(); +} + +struct MyEnvironment : IDocsumEnvironment { + IAttributeManager *attribute_man; + + MyEnvironment() : attribute_man(0) {} + + virtual IAttributeManager *getAttributeManager() { return attribute_man; } + virtual string lookupIndex(const string &s) const { return s; } + virtual juniper::Juniper *getJuniper() { return 0; } +}; + +class MyAttributeContext : public IAttributeContext { + const IAttributeVector &_attr; +public: + MyAttributeContext(const IAttributeVector &attr) : _attr(attr) {} + virtual const IAttributeVector *getAttribute(const string &) const { + return &_attr; + } + virtual const IAttributeVector *getAttributeStableEnum( + const string &) const { abort(); } + virtual void getAttributeList(vector<const IAttributeVector *> &) const + { abort(); } +}; + +class MyAttributeManager : public IAttributeManager { + const IAttributeVector &_attr; +public: + + MyAttributeManager(const IAttributeVector &attr) : _attr(attr) {} + virtual AttributeGuard::UP getAttribute(const string &) const { + abort(); + } + 
virtual AttributeGuard::UP getAttributeStableEnum(const string &) const { + abort(); + } + virtual void getAttributeList(vector<AttributeGuard> &) const { + abort(); + } + virtual IAttributeContext::UP createContext() const { + return IAttributeContext::UP(new MyAttributeContext(_attr)); + } +}; + +struct MyGetDocsumsStateCallback : GetDocsumsStateCallback { + virtual void FillSummaryFeatures(GetDocsumsState *, IDocsumEnvironment *) {} + virtual void FillRankFeatures(GetDocsumsState *, IDocsumEnvironment *) {} + virtual void ParseLocation(GetDocsumsState *) {} +}; + +template <typename AttrType> +void checkWritePositionField(Test &test, AttrType &attr, + uint32_t doc_id, const string &expected) { + for (AttributeVector::DocId i = 0; i < doc_id + 1; ) { + attr.addDoc(i); + if (i == 007) { + attr.add((int64_t) -1); + } else if (i == 0x42) { + attr.add(0xAAAAaaaaAAAAaaaa); + } else if (i == 0x17) { + attr.add(0x5555aaaa5555aaab); + } else if (i == 42) { + attr.add(0x8000000000000000); + } else { + attr.add(i); // value = docid + } + } + + MyAttributeManager attribute_man(attr); + PositionsDFW::UP writer = + createPositionsDFW(attr.getName().c_str(), &attribute_man); + ASSERT_TRUE(writer.get()); + ResType res_type = RES_LONG_STRING; + RawBuf target(1024); + MyGetDocsumsStateCallback callback; + GetDocsumsState state(callback); + state._attributes.push_back(&attr); + + writer->WriteField(doc_id, 0, &state, res_type, &target); + + test.EXPECT_EQUAL(expected.size(), *(const uint32_t *)(target.GetDrainPos())); + const char *p = target.GetDrainPos() + 4; + test.EXPECT_EQUAL(expected, string(p, p + expected.size())); +} + +void Test::requireThat2DPositionFieldIsWritten() { + SingleInt64ExtAttribute attr("foo"); + checkWritePositionField(*this, attr, 0x3e, "<position x=\"6\" y=\"7\" latlong=\"N0.000007;E0.000006\" />"); + checkWritePositionField(*this, attr, 007, "<position x=\"-1\" y=\"-1\" latlong=\"S0.000001;W0.000001\" />"); + checkWritePositionField(*this, attr, 0x42, 
"<position x=\"0\" y=\"-1\" latlong=\"S0.000001;E0.000000\" />"); + checkWritePositionField(*this, attr, 0x17, "<position x=\"-16711935\" y=\"16711935\" latlong=\"N16.711935;W16.711935\" />"); + checkWritePositionField(*this, attr, 42, ""); + +} + +} // namespace +} // namespace docsummary +} // namespace search + +TEST_APPHOOK(search::docsummary::Test); diff --git a/searchsummary/src/tests/docsummary/slime_summary/.gitignore b/searchsummary/src/tests/docsummary/slime_summary/.gitignore new file mode 100644 index 00000000000..1df864db333 --- /dev/null +++ b/searchsummary/src/tests/docsummary/slime_summary/.gitignore @@ -0,0 +1 @@ +searchsummary_slime_summary_test_app diff --git a/searchsummary/src/tests/docsummary/slime_summary/CMakeLists.txt b/searchsummary/src/tests/docsummary/slime_summary/CMakeLists.txt new file mode 100644 index 00000000000..a2bd3bbc610 --- /dev/null +++ b/searchsummary/src/tests/docsummary/slime_summary/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_slime_summary_test_app + SOURCES + slime_summary_test.cpp + DEPENDS + searchsummary +) +vespa_add_test(NAME searchsummary_slime_summary_test_app COMMAND searchsummary_slime_summary_test_app) diff --git a/searchsummary/src/tests/docsummary/slime_summary/FILES b/searchsummary/src/tests/docsummary/slime_summary/FILES new file mode 100644 index 00000000000..6d3a81d3ffc --- /dev/null +++ b/searchsummary/src/tests/docsummary/slime_summary/FILES @@ -0,0 +1 @@ +slime_summary_test.cpp diff --git a/searchsummary/src/tests/docsummary/slime_summary/slime_summary_test.cpp b/searchsummary/src/tests/docsummary/slime_summary/slime_summary_test.cpp new file mode 100644 index 00000000000..6509491d0ac --- /dev/null +++ b/searchsummary/src/tests/docsummary/slime_summary/slime_summary_test.cpp @@ -0,0 +1,125 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/searchsummary/docsummary/docsumwriter.h> +#include <vespa/searchsummary/docsummary/resultpacker.h> +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/data/slime/simple_buffer.h> +#include <vespa/vespalib/data/slime/json_format.h> +#include <vespa/vespalib/data/slime/binary_format.h> +#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h> + +using namespace vespalib::slime::convenience; +using namespace search::docsummary; + +namespace { + +struct FieldBlock { + Slime slime; + search::RawBuf binary; + + explicit FieldBlock(const vespalib::string &jsonInput) + : slime(), binary(1024) + { + size_t used = vespalib::slime::JsonFormat::decode(jsonInput, slime); + EXPECT_EQUAL(jsonInput.size(), used); + search::SlimeOutputRawBufAdapter adapter(binary); + vespalib::slime::BinaryFormat::encode(slime, adapter); + } + const char *data() const { return binary.GetDrainPos(); } + size_t dataLen() const { return binary.GetUsedLen(); } +}; + +struct DocsumFixture : IDocsumStore, GetDocsumsStateCallback { + std::unique_ptr<DynamicDocsumWriter> writer; + std::unique_ptr<ResultPacker> packer; + GetDocsumsState state; + DocsumFixture() : writer(), packer(), state(*this) { + ResultConfig *config = new ResultConfig(); + ResultClass *cfg = config->AddResultClass("default", 0); + EXPECT_TRUE(cfg != 0); + EXPECT_TRUE(cfg->AddConfigEntry("int_field", RES_INT)); + EXPECT_TRUE(cfg->AddConfigEntry("short_field", RES_SHORT)); + EXPECT_TRUE(cfg->AddConfigEntry("byte_field", RES_BYTE)); + EXPECT_TRUE(cfg->AddConfigEntry("float_field", RES_FLOAT)); + EXPECT_TRUE(cfg->AddConfigEntry("double_field", RES_DOUBLE)); + EXPECT_TRUE(cfg->AddConfigEntry("int64_field", RES_INT64)); + EXPECT_TRUE(cfg->AddConfigEntry("string_field", RES_STRING)); + EXPECT_TRUE(cfg->AddConfigEntry("data_field", 
RES_DATA)); + EXPECT_TRUE(cfg->AddConfigEntry("longstring_field", RES_LONG_STRING)); + EXPECT_TRUE(cfg->AddConfigEntry("longdata_field", RES_LONG_DATA)); + EXPECT_TRUE(cfg->AddConfigEntry("xmlstring_field", RES_XMLSTRING)); + EXPECT_TRUE(cfg->AddConfigEntry("jsonstring_field", RES_JSONSTRING)); + EXPECT_TRUE(cfg->AddConfigEntry("bad_jsonstring_field", RES_JSONSTRING)); + config->CreateEnumMaps(); + writer.reset(new DynamicDocsumWriter(config, 0)); + packer.reset(new ResultPacker(writer->GetResultConfig())); + state._args.setFlags(search::fs4transport::GDFLAG_ALLOW_SLIME); + } + void getDocsum(Slime &slime) { + uint32_t classId; + search::RawBuf buf(4096); + writer->WriteDocsum(1u, &state, this, &buf); + ASSERT_GREATER(buf.GetUsedLen(), sizeof(classId)); + memcpy(&classId, buf.GetDrainPos(), sizeof(classId)); + buf.Drain(sizeof(classId)); + EXPECT_EQUAL(classId, ::search::fs4transport::SLIME_MAGIC_ID); + EXPECT_GREATER(vespalib::slime::BinaryFormat + ::decode(Memory(buf.GetDrainPos(), buf.GetUsedLen()), slime), 0u); + } + virtual uint32_t getNumDocs() { return 2; } + virtual DocsumStoreValue getMappedDocsum(uint32_t docid, bool useSlimeInsideFields) { + EXPECT_EQUAL(true, useSlimeInsideFields); + EXPECT_EQUAL(1u, docid); + EXPECT_TRUE(packer->Init(0)); + EXPECT_TRUE(packer->AddInteger(4)); + EXPECT_TRUE(packer->AddShort(2)); + EXPECT_TRUE(packer->AddByte(1)); + EXPECT_TRUE(packer->AddFloat(4.5)); + EXPECT_TRUE(packer->AddDouble(8.75)); + EXPECT_TRUE(packer->AddInt64(8)); + EXPECT_TRUE(packer->AddString( "string", + strlen("string"))); + EXPECT_TRUE(packer->AddData( "data", + strlen("data"))); + EXPECT_TRUE(packer->AddLongString( "long_string", + strlen("long_string"))); + EXPECT_TRUE(packer->AddLongData( "long_data", + strlen("long_data"))); + EXPECT_TRUE(packer->AddLongString( "xml_string", + strlen("xml_string"))); + FieldBlock jsf1("{foo:1, bar:2}"); + EXPECT_TRUE(packer->AddLongData(jsf1.data(), jsf1.dataLen())); + EXPECT_TRUE(packer->AddLongString("abc", 3)); + 
const char *buf; + uint32_t len; + EXPECT_TRUE(packer->GetDocsumBlob(&buf, &len)); + return DocsumStoreValue(buf, len); + } + uint32_t getSummaryClassId() const override { return 0; } + virtual void FillSummaryFeatures(GetDocsumsState *, IDocsumEnvironment *) {} + virtual void FillRankFeatures(GetDocsumsState *, IDocsumEnvironment *) {} + virtual void ParseLocation(GetDocsumsState *) {} +}; + +} // namespace <unnamed> + +TEST_FF("require that docsum can be written as slime", DocsumFixture(), Slime()) { + f1.getDocsum(f2); + EXPECT_EQUAL(f2.get()["int_field"].asLong(), 4u); + EXPECT_EQUAL(f2.get()["short_field"].asLong(), 2u); + EXPECT_EQUAL(f2.get()["byte_field"].asLong(), 1u); + EXPECT_EQUAL(f2.get()["float_field"].asDouble(), 4.5); + EXPECT_EQUAL(f2.get()["double_field"].asDouble(), 8.75); + EXPECT_EQUAL(f2.get()["int64_field"].asLong(), 8u); + EXPECT_EQUAL(f2.get()["string_field"].asString().make_string(), std::string("string")); + EXPECT_EQUAL(f2.get()["data_field"].asData().make_string(), std::string("data")); + EXPECT_EQUAL(f2.get()["longstring_field"].asString().make_string(), std::string("long_string")); + EXPECT_EQUAL(f2.get()["longdata_field"].asData().make_string(), std::string("long_data")); + EXPECT_EQUAL(f2.get()["xmlstring_field"].asString().make_string(), std::string("xml_string")); + EXPECT_EQUAL(f2.get()["jsonstring_field"]["foo"].asLong(), 1u); + EXPECT_EQUAL(f2.get()["jsonstring_field"]["bar"].asLong(), 2u); + EXPECT_EQUAL(f2.get()["bad_jsonstring_field"].type().getId(), 0u); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchsummary/src/tests/extractkeywords/.gitignore b/searchsummary/src/tests/extractkeywords/.gitignore new file mode 100644 index 00000000000..1b50b24b284 --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/.gitignore @@ -0,0 +1,7 @@ +*.core +.depend +Makefile +core +core.* +extractkeywordstest +searchsummary_extractkeywordstest_app diff --git a/searchsummary/src/tests/extractkeywords/CMakeLists.txt 
b/searchsummary/src/tests/extractkeywords/CMakeLists.txt new file mode 100644 index 00000000000..d726ffe794c --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchsummary_extractkeywordstest_app + SOURCES + extractkeywordstest.cpp + DEPENDS + searchsummary +) +vespa_add_test(NAME searchsummary_extractkeywordstest_app COMMAND sh runtests.sh) diff --git a/searchsummary/src/tests/extractkeywords/extractkeywordstest.cpp b/searchsummary/src/tests/extractkeywords/extractkeywordstest.cpp new file mode 100644 index 00000000000..59d949f40ca --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/extractkeywordstest.cpp @@ -0,0 +1,295 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/searchsummary/docsummary/keywordextractor.h> +#include <vespa/searchlib/parsequery/simplequerystack.h> +#include <vespa/searchlib/util/rawbuf.h> +#include "extractkeywordstest.h" + +#define NUMTESTS 5 + +int +ExtractKeywordsTest::Main() +{ + int doTest[NUMTESTS]; + int low, high, accnum, num; + int indicator; + bool verify = false; + int multiplier = 1; + bool failed = false; + + if (_argc == 1) + Usage(_argv[0]); + + // default initialize to not run any tests. 
+ for (int n = 0; n < NUMTESTS; n++) + doTest[n] = 0; + + // parse the command line arguments + for (int i = 1; i < _argc; i++) { + low = 0; + high = NUMTESTS - 1; + char *p = _argv[i]; + + // Check if a multiplier is specified + if (*p == '*') { + p++; + accnum = 0; + while (*p != '\0') { + num = *p - '0'; + accnum = accnum * 10 + num; + p++; + } + multiplier = accnum; + continue; + } + + // Default is to run the tests specified, unless the first char is '/' + indicator = 1; + if (*p == '/') { + p++; + indicator = 0; + } + + // Find the first number + accnum = 0; + while (*p != '-' && *p != '\0') { + num = *p - '0'; + accnum = accnum * 10 + num; + p++; + } + if (accnum >= NUMTESTS) + continue; + low = accnum; + // Check for range operator + if (*p == '-') { + p++; + // Find the second number + accnum = 0; + while (*p != '\0') { + num = *p - '0'; + accnum = accnum * 10 + num; + p++; + } + if (accnum > 0) + high = accnum < NUMTESTS ? accnum : NUMTESTS-1; + } else + high = low; + + // Indicate the runrequest for the desired range. + for (int j = low; j <= high; j++) + doTest[j] = indicator; + } + + // Remove unused tests. + // doTest[1] = 0; + + // Remember time + if (multiplier > 1) { + printf("Running all tests %d times.\n", multiplier); + verify = false; + } else { + verify = true; + } + + int testCnt = 0; + + // init keyword extractor + _extractor = new search::docsummary::KeywordExtractor(NULL); + _extractor->AddLegalIndexSpec("*"); + + FastOS_Time timer; + timer.SetNow(); + + // Actually run the tests that we wanted. 
+ for (int j = 0; j < multiplier; j++) + for (int k = 0; k < NUMTESTS; k++) + if (doTest[k] == 1) { + if (!RunTest(k, verify)) + failed = true; + testCnt++; + } + + // Print time taken + double timeTaken = timer.MilliSecsToNow(); + + printf("Time taken : %f ms\n", timeTaken); + printf("Number of tests run: %d\n", testCnt); + double avgTestPrMSec = static_cast<double>(testCnt) / timeTaken; + printf("Tests pr Sec: %f\n", avgTestPrMSec * 1000.0); + + delete _extractor; + _extractor = NULL; + + return failed ? 1 : 0; +} + +bool +ExtractKeywordsTest::ShowResult(int testNo, + const char *actual, const char *correct) +{ + const char *act_word = actual; + const char *cor_word = correct; + printf("%03d: ", testNo); + + while (*act_word != '\0') { + if (strcmp(act_word, cor_word) != 0) { + printf("fail. Keywords differ for act: %s, corr: %s\n", + act_word, cor_word); + return false; + } else { + act_word += strlen(act_word) + 1; + cor_word += strlen(cor_word) + 1; + } + } + if (*cor_word != '\0') { + printf("fail. actual list shorter than correct at %s\n", cor_word); + return false; + } + printf("ok\n"); + return true; +} + +/** + * + * @param testno The test to run. + * @param verify Verify the result of the test. 
+ */ +bool +ExtractKeywordsTest::RunTest(int testno, bool verify) +{ + search::SimpleQueryStack stack; + search::RawBuf buf(32768); + const char *correct = NULL; + const char *keywords = NULL; + + switch (testno) { + case 0: + { + // Simple term query + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foobar")); + + stack.AppendBuffer(&buf); + keywords = _extractor->ExtractKeywords(vespalib::stringref(buf.GetDrainPos(), buf.GetUsedLen())); + correct = "foobar\0\0"; + + if (verify) ShowResult(testno, keywords, correct); + free(const_cast<char *>(keywords)); + break; + } + + case 1: + { + // multi term query + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foobar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foo")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "bar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_OR, 3)); + + stack.AppendBuffer(&buf); + keywords = _extractor->ExtractKeywords(vespalib::stringref(buf.GetDrainPos(), buf.GetUsedLen())); + correct = "bar\0foo\0foobar\0\0"; + + if (verify) ShowResult(testno, keywords, correct); + free(const_cast<char *>(keywords)); + break; + } + + case 2: + { + // phrase term query + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foobar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foo")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "bar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_PHRASE, 3)); + + stack.AppendBuffer(&buf); + keywords = _extractor->ExtractKeywords(vespalib::stringref(buf.GetDrainPos(), buf.GetUsedLen())); + correct = "bar foo foobar\0\0"; + + if (verify) ShowResult(testno, keywords, correct); + free(const_cast<char *>(keywords)); + break; + } + + case 3: + { + // multiple phrase and term query + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "xyzzy")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, 
"xyz")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_PHRASE, 2)); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foobar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foo")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "bar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_PHRASE, 3)); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "baz")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "zog")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_AND, 3)); + + stack.AppendBuffer(&buf); + keywords = _extractor->ExtractKeywords(vespalib::stringref(buf.GetDrainPos(), buf.GetUsedLen())); + correct = "zog\0baz\0bar foo foobar\0xyz xyzzy\0\0"; + + if (verify) ShowResult(testno, keywords, correct); + free(const_cast<char *>(keywords)); + break; + } + + case 4: + { + // phrase term query with wrong argument items + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foobar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "foo")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_AND, 2)); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_TERM, "bar")); + stack.Push(new search::ParseItem(search::ParseItem::ITEM_PHRASE, 2)); + + stack.AppendBuffer(&buf); + keywords = _extractor->ExtractKeywords(vespalib::stringref(buf.GetDrainPos(), buf.GetUsedLen())); + correct = "\0"; + + if (verify) ShowResult(testno, keywords, correct); + free(const_cast<char *>(keywords)); + break; + } + + default: + { + printf("%03d: no such test\n", testno); + return false; + } + } + + bool result = true; + /* + if (verify) { + result = ShowResult(testno, pq->GetStack(), correct); + delete correct; + } else { + result = true; + } + delete pq; + */ + return result; +} + +void +ExtractKeywordsTest::Usage(char *progname) +{ + printf("%s {testnospec}+\n\ + Where testnospec is:\n\ + num: single test\n\ + num-num: inclusive 
range (open range permitted)\n",progname); + printf("There are tests from %d to %d\n\n", 0, NUMTESTS-1); + exit(-1); +} + +int +main(int argc, char** argv) +{ + ExtractKeywordsTest tester; + return tester.Entry(argc, argv); +} + diff --git a/searchsummary/src/tests/extractkeywords/extractkeywordstest.h b/searchsummary/src/tests/extractkeywords/extractkeywordstest.h new file mode 100644 index 00000000000..1a037fcb9cd --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/extractkeywordstest.h @@ -0,0 +1,34 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/fastos/fastos.h> + +namespace search { +namespace docsummary { +class KeywordExtractor; +} +} + +class ExtractKeywordsTest : public FastOS_Application +{ +private: + ExtractKeywordsTest(const ExtractKeywordsTest &); + ExtractKeywordsTest& operator=(const ExtractKeywordsTest &); + + search::docsummary::KeywordExtractor *_extractor; + + int Main(); + void Usage(char *progname); + bool ShowResult(int testNo, const char *actual, const char *correct); + bool RunTest(int i, bool verify); + +public: + ExtractKeywordsTest(void) + : _extractor(NULL) + { + } +}; + diff --git a/searchsummary/src/tests/extractkeywords/runtests.sh b/searchsummary/src/tests/extractkeywords/runtests.sh new file mode 100755 index 00000000000..2c09bb25460 --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/runtests.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# +# $Id$ +# +# Copyright (C) 2000-2003 Fast Search & Transfer ASA +# Copyright (C) 2003 Overture Services Norway AS +# +# All Rights Reserved +# + +if $VALGRIND ./searchsummary_extractkeywordstest_app - +then + : +else + echo FAILED: searchsummary_extractkeywordstest_app test failed + exit 1 +fi + +if $VALGRIND ./searchsummary_extractkeywordstest_app - '*1000' +then + : +else + echo FAILED: searchsummary_extractkeywordstest_app test failed + exit 1 +fi + +echo SUCCESS: searchsummary_extractkeywordstest_app test completed +exit 0 diff --git a/searchsummary/src/tests/extractkeywords/testowner.ATS b/searchsummary/src/tests/extractkeywords/testowner.ATS new file mode 100644 index 00000000000..6d03b0836a4 --- /dev/null +++ b/searchsummary/src/tests/extractkeywords/testowner.ATS @@ -0,0 +1 @@ +vlarsen diff --git a/searchsummary/src/vespa/searchsummary/.gitignore b/searchsummary/src/vespa/searchsummary/.gitignore new file mode 100644 index 00000000000..4ecafa4a29f --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/.gitignore @@ -0,0 +1,3 @@ +/.depend +/Makefile +/libsearchsummary.so.5.1 diff --git a/searchsummary/src/vespa/searchsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/CMakeLists.txt new file mode 100644 index 00000000000..078c1b137a8 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(searchsummary + SOURCES + $<TARGET_OBJECTS:searchsummary_config> + $<TARGET_OBJECTS:searchsummary_docsummary> + INSTALL lib64 + DEPENDS +) diff --git a/searchsummary/src/vespa/searchsummary/config/.gitignore b/searchsummary/src/vespa/searchsummary/config/.gitignore new file mode 100644 index 00000000000..0d614ad8ec7 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/config/.gitignore @@ -0,0 +1,5 @@ +*.So +.depend +Makefile +config-*.cpp +config-*.h diff --git a/searchsummary/src/vespa/searchsummary/config/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/config/CMakeLists.txt new file mode 100644 index 00000000000..5619b2f0a26 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/config/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchsummary_config OBJECT + SOURCES + DEPENDS +) +vespa_generate_config(searchsummary_config juniperrc.def) +install(FILES juniperrc.def DESTINATION var/db/vespa/config_server/serverdb/classes) diff --git a/searchsummary/src/vespa/searchsummary/config/juniperrc.def b/searchsummary/src/vespa/searchsummary/config/juniperrc.def new file mode 100644 index 00000000000..4a748b2f604 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/config/juniperrc.def @@ -0,0 +1,78 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=vespa.config.search.summary + +## Set the length (in #characters) of the dynamically generated +## summaries. This is a hint to the module that generates the +## dynamic summary. The result may be slightly longer or shorter +## depending on the structure of the available document text and +## the submitted query. +length int default=256 + +## The number of (possibly partial) sets of keywords +## matching the query, to attempt to include in the summary. 
The larger this +## value is set relative to the length parameter, the more +## dense the keywords may appear in the summary. +max_matches int default=3 + +## Minimal desired length of the generated summary in +## bytes. This is the shortest summary length for which the number of +## matches will be respected. E.g. if +## a summary appears to become shorter than min_length bytes with +## max_matches matches, then additional matches will be used if available. +min_length int default=128 + +## Make sure the prefix (length controlled by 'juniper.dynsum.length') +## of all fields with summary: dynamic are returned in the dynamic +## summary if a query does not hit in those fields +prefix bool default=true + +## The maximal number of bytes of context to prepend and append to +## each of the selected query keyword hits. This parameter defines the +## max size a summary would become if there are few keyword hits +## (max_matches set low or document contained few matches of the keywords). +surround_max int default=128 + +## The size of the sliding window used to determine if +## multiple query terms occur together. The larger the value, the more +## likely the system will find (and present in dynamic summary) complete +## matches containing all the search terms. The downside is a potential +## performance overhead of keeping candidates for matches longer during +## matching, and consequently updating more candidates that eventually +## get thrown away. +winsize int default=200 + +## This value multiplied with the winsize gives the size of a fallback +## window used to break out when searching for phrase term matches. +winsize_fallback_multiplier double default=10.0 + +## This value specifies the maximum number of match candidates that are +## managed for a non-leaf query node when matching the query against the +## input text. 
+max_match_candidates int default=1000 + +## The minimal number of bytes in a query keyword for +## it to be subject to the simple Juniper stemming algorithm. Keywords +## that are shorter than or equal to this limit will only yield exact +## matches in the dynamic summaries. +stem_min_length int default=5 + +## The maximal number of bytes that a word in the document +## can be longer than the keyword itself to yield a match. E.g. for +## the default values, if the keyword is 7 bytes long, it will match any +## word with length less than or equal to 10 for which the keyword is a prefix. +stem_max_extend int default=3 + + +## The parameters above may also be overridden on a per-field basis +## using the following array. +override[].fieldname string +override[].length int default=256 +override[].max_matches int default=3 +override[].min_length int default=128 +override[].prefix bool default=true +override[].surround_max int default=128 +override[].winsize int default=200 +override[].winsize_fallback_multiplier double default=10.0 +override[].max_match_candidates int default=1000 +override[].stem_min_length int default=5 +override[].stem_max_extend int default=3 diff --git a/searchsummary/src/vespa/searchsummary/docsummary/.gitignore b/searchsummary/src/vespa/searchsummary/docsummary/.gitignore new file mode 100644 index 00000000000..ee8938b6bf4 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/.gitignore @@ -0,0 +1,6 @@ +*.So +*.exe +*.ilk +*.pdb +.depend* +Makefile diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt new file mode 100644 index 00000000000..bbd7dc1e177 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(searchsummary_docsummary OBJECT + SOURCES + resultclass.cpp + resultconfig.cpp + resultpacker.cpp + urlresult.cpp + getdocsumargs.cpp + docsumstate.cpp + docsumfieldwriter.cpp + docsumwriter.cpp + keywordextractor.cpp + attributedfw.cpp + dynamicteaserdfw.cpp + docsumconfig.cpp + rankfeaturesdfw.cpp + summaryfeaturesdfw.cpp + juniperproperties.cpp + textextractordfw.cpp + docsumformat.cpp + geoposdfw.cpp + tokenizer.cpp + positionsdfw.cpp + AFTER + searchsummary_config +) diff --git a/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.cpp new file mode 100644 index 00000000000..4eef9e6a9c6 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.cpp @@ -0,0 +1,435 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchlib/attribute/stringbase.h> +#include <vespa/searchlib/attribute/integerbase.h> +#include <vespa/searchlib/attribute/floatbase.h> +#include <vespa/searchlib/attribute/tensorattribute.h> +#include <vespa/searchsummary/docsummary/docsumwriter.h> +#include <vespa/searchsummary/docsummary/attributedfw.h> +#include <vespa/vespalib/tensor/tensor.h> +#include <vespa/vespalib/tensor/serialization/slime_binary_format.h> + +LOG_SETUP(".searchlib.docsummary.attributedfw"); + +using namespace search; +using search::attribute::IAttributeContext; +using search::attribute::IAttributeVector; +using search::attribute::BasicType; + +namespace search { +namespace docsummary { + +ResType inferType(const IAttributeVector & vec) { + ResType retval; + if (vec.hasMultiValue()) { + retval = RES_STRING; + } else { + if (vec.isStringType()) { + retval = RES_STRING; + } else { + size_t fw = vec.getFixedWidth(); + if (vec.isIntegerType()) { + if (fw == sizeof(uint8_t)) { + retval = RES_BYTE; + } else if (fw == 
sizeof(uint16_t)) { + retval = RES_SHORT; + } else if (fw == sizeof(uint32_t)) { + retval = RES_INT; + } else { + retval = RES_INT64; + } + } else if (vec.isFloatingPointType()) { + retval = (fw == sizeof(float)) ? RES_FLOAT : RES_DOUBLE; + } else { + retval = RES_STRING; + } + } + } + return retval; +} + +//----------------------------------------------------------------------------- + +AttrDFW::AttrDFW(const vespalib::string & attrName) : + _attrName(attrName) +{ +} + +//----------------------------------------------------------------------------- + +class SingleAttrDFW : public AttrDFW +{ +public: + SingleAttrDFW(const vespalib::string & attrName) : + AttrDFW(attrName) + { } + virtual uint32_t WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + RawBuf *target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); + virtual bool isDefaultValue(uint32_t docid, const GetDocsumsState * state) const; +}; + +uint32_t +SingleAttrDFW::WriteField(uint32_t docid, + GeneralResult *, + GetDocsumsState * state, + ResType type, + RawBuf *target) +{ + const char *s=""; + const IAttributeVector & v = vec(*state); + switch (type) { + case RES_INT: { + uint32_t val = v.getInt(docid); + target->append(&val, sizeof(val)); + return sizeof(val); + break; } + case RES_SHORT: { + uint16_t val = v.getInt(docid); + target->append(&val, sizeof(val)); + return sizeof(val); + break; } + case RES_BYTE: { + uint8_t val = v.getInt(docid); + target->append(&val, sizeof(val)); + return sizeof(val); + break; } + case RES_FLOAT: { + float val = v.getFloat(docid); + target->append(&val, sizeof(val)); + return sizeof(val); + break; } + case RES_DOUBLE: { + double val = v.getFloat(docid); + target->append(&val, sizeof(val)); + return sizeof(val); + break; } + case RES_INT64: { + uint64_t val = v.getInt(docid); + target->append(&val, sizeof(val)); + return sizeof(val); 
+ break; } + case RES_STRING: + case RES_DATA: { + s = v.getString(docid, NULL, 0); // no need to pass in a buffer, this attribute has a string storage. + uint32_t len = strlen(s); + uint16_t slen = (len < 0xffff) ? len : 0xffff; + target->append(&slen, sizeof(slen)); + target->append(s, slen); + return (sizeof(slen) + slen); + break; } + case RES_JSONSTRING: { + BasicType::Type t = v.getBasicType(); + switch (t) { + case BasicType::TENSOR: { + const attribute::TensorAttribute &tv = + static_cast<const attribute::TensorAttribute &>(v); + const auto tensor = tv.getTensor(docid); + vespalib::string str; + if (tensor) { + auto slime = + vespalib::tensor::SlimeBinaryFormat::serialize(*tensor); + vespalib::slime::SimpleBuffer buf; + vespalib::slime::JsonFormat::encode(*slime, buf, true); + str = buf.get().make_string(); + } else { + // No tensor value => empty object + str = ""; + } + uint32_t slen = str.size(); + target->append(&slen, sizeof(slen)); + target->append(str.c_str(), slen); + return (sizeof(slen) + slen); + } + default: + break; + }; + } + /* FALLTHROUGH */ + case RES_XMLSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: + case RES_LONG_DATA: { + s = v.getString(docid, NULL, 0); // no need to pass in a buffer, this attribute has a string storage. 
+ uint32_t slen = strlen(s); + target->append(&slen, sizeof(slen)); + target->append(s, slen); + return (sizeof(slen) + slen); + break; } + default: + return 0; + } + return 0; +} + +bool SingleAttrDFW::isDefaultValue(uint32_t docid, const GetDocsumsState * state) const +{ + return vec(*state).isUndefined(docid); +} + +void +SingleAttrDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState * state, + ResType type, + vespalib::slime::Inserter &target) +{ + const char *s=""; + const IAttributeVector & v = vec(*state); + switch (type) { + case RES_INT: { + uint32_t val = v.getInt(docid); + target.insertLong(val); + break; + } + case RES_SHORT: { + uint16_t val = v.getInt(docid); + target.insertLong(val); + break; + } + case RES_BYTE: { + uint8_t val = v.getInt(docid); + target.insertLong(val); + break; + } + case RES_FLOAT: { + float val = v.getFloat(docid); + target.insertDouble(val); + break; + } + case RES_DOUBLE: { + double val = v.getFloat(docid); + target.insertDouble(val); + break; + } + case RES_INT64: { + uint64_t val = v.getInt(docid); + target.insertLong(val); + break; + } + case RES_JSONSTRING: { + BasicType::Type t = v.getBasicType(); + switch (t) { + case BasicType::TENSOR: { + const attribute::TensorAttribute &tv = + static_cast<const attribute::TensorAttribute &>(v); + const auto tensor = tv.getTensor(docid); + if (tensor) { + vespalib::tensor::SlimeBinaryFormat::serialize(target, *tensor); + } else { + // No tensor value => no object + } + return; + } + default: + break; + }; + } + /* FALLTHROUGH */ + case RES_XMLSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: + case RES_STRING: { + s = v.getString(docid, NULL, 0); // no need to pass in a buffer, this attribute has a string storage. + target.insertString(vespalib::slime::Memory(s)); + break; + } + case RES_LONG_DATA: + case RES_DATA: { + s = v.getString(docid, NULL, 0); // no need to pass in a buffer, this attribute has a string storage. 
+ target.insertData(vespalib::slime::Memory(s)); + break; + } + default: + // unknown type, will be missing, should not happen + return; + } +} + + +//----------------------------------------------------------------------------- + +class MultiAttrDFW : public AttrDFW +{ +public: + MultiAttrDFW(const vespalib::string & attrName) : AttrDFW(attrName) {} + virtual uint32_t WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + RawBuf *target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); + +}; + +uint32_t +MultiAttrDFW::WriteField(uint32_t docid, + GeneralResult *, + GetDocsumsState * state, + ResType type, + RawBuf *target) +{ + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + uint32_t written = 0; + uint16_t str_len_16 = 0; + uint32_t str_len_32 = 0; + int str_len_ofs = target->GetUsedLen(); + vespalib::JSONStringer & jsonStr = state->_jsonStringer; + + if (isLong) { + target->append(&str_len_32, sizeof(str_len_32)); + } else { + target->append(&str_len_16, sizeof(str_len_16)); + } + const IAttributeVector & v = vec(*state); + uint32_t entries = v.getValueCount(docid); + { + std::vector<IAttributeVector::WeightedString> elements(entries); + entries = std::min(entries, v.get(docid, &elements[0], entries)); + jsonStr.clear(); + jsonStr.beginArray(); + for (uint32_t i = 0; i < entries; ++i) { + if (v.hasWeightedSetType()) { + jsonStr.beginArray(); + jsonStr.appendString(elements[i].getValue()); + jsonStr.appendInt64(elements[i].getWeight()); + jsonStr.endArray(); + } else { + jsonStr.appendString(elements[i].getValue()); + } + } + jsonStr.endArray(); + (*target) += jsonStr.toString().c_str(); + jsonStr.clear(); + } + + // calculate number of bytes written + written = target->GetUsedLen() - str_len_ofs; + + // patch in correct field length + if (isLong) { + str_len_32 = written - sizeof(str_len_32); + 
memcpy(target->GetWritableDrainPos(str_len_ofs), + &str_len_32, sizeof(str_len_32)); + } else { + str_len_16 = written - sizeof(str_len_16); + if (str_len_16 != written - sizeof(str_len_16)) { + target->truncate(str_len_ofs); + str_len_16 = 0; + target->append(&str_len_16, sizeof(uint16_t)); + *target += "***OVERFLOW***"; + written = target->GetUsedLen() - str_len_ofs; + str_len_16 = written - sizeof(uint16_t); + assert(str_len_16 == written - sizeof(uint16_t)); + } + memcpy(target->GetWritableDrainPos(str_len_ofs), + &str_len_16, sizeof(str_len_16)); + } + return written; +} + +void +MultiAttrDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState *state, + ResType, + vespalib::slime::Inserter &target) +{ + using vespalib::slime::Cursor; + using vespalib::slime::Memory; + const IAttributeVector & v = vec(*state); + uint32_t entries = v.getValueCount(docid); + bool isWeightedSet = v.hasWeightedSetType(); + + Cursor &arr = target.insertArray(); + BasicType::Type t = v.getBasicType(); + switch (t) { + case BasicType::NONE: + case BasicType::STRING: { + std::vector<IAttributeVector::WeightedString> elements(entries); + entries = std::min(entries, v.get(docid, &elements[0], entries)); + for (uint32_t i = 0; i < entries; ++i) { + const vespalib::string &sv = elements[i].getValue(); + Memory value(sv.c_str(), sv.size()); + if (isWeightedSet) { + Cursor &elem = arr.addObject(); + elem.setString("item", value); + elem.setLong("weight", elements[i].getWeight()); + } else { + arr.addString(value); + } + } + return; } + case BasicType::UINT1: + case BasicType::UINT2: + case BasicType::UINT4: + case BasicType::INT8: + case BasicType::INT16: + case BasicType::INT32: + case BasicType::INT64: { + std::vector<IAttributeVector::WeightedInt> elements(entries); + entries = std::min(entries, v.get(docid, &elements[0], entries)); + for (uint32_t i = 0; i < entries; ++i) { + if (isWeightedSet) { + Cursor &elem = arr.addObject(); + elem.setLong("item", 
elements[i].getValue()); + elem.setLong("weight", elements[i].getWeight()); + } else { + arr.addLong(elements[i].getValue()); + } + } + return; } + case BasicType::FLOAT: + case BasicType::DOUBLE: { + std::vector<IAttributeVector::WeightedFloat> elements(entries); + entries = std::min(entries, v.get(docid, &elements[0], entries)); + for (uint32_t i = 0; i < entries; ++i) { + if (isWeightedSet) { + Cursor &elem = arr.addObject(); + elem.setDouble("item", elements[i].getValue()); + elem.setLong("weight", elements[i].getWeight()); + } else { + arr.addDouble(elements[i].getValue()); + } + } + return; } + default: + // should not happen + LOG(error, "bad value for type: %u\n", t); + LOG_ASSERT(false); + } +} + +//----------------------------------------------------------------------------- + +IDocsumFieldWriter * +AttributeDFWFactory::create(IAttributeManager & vecMan, const char *vecName) +{ + IAttributeContext::UP ctx = vecMan.createContext(); + const IAttributeVector * vec = ctx->getAttribute(vecName); + if (vec == NULL) { + LOG(warning, "No valid attribute vector found: %s", vecName); + return NULL; + } + if (vec->hasMultiValue()) { + return new MultiAttrDFW(vec->getName()); + } else { + return new SingleAttrDFW(vec->getName()); + } +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.h b/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.h new file mode 100644 index 00000000000..b8ac9b30510 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/attributedfw.h @@ -0,0 +1,27 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> + +namespace search { +namespace docsummary { + +class AttrDFW : public IDocsumFieldWriter +{ +private: + vespalib::string _attrName; +protected: + const attribute::IAttributeVector & vec(const GetDocsumsState & s) const { + return *s.getAttribute(getIndex()); + } + virtual const vespalib::string & getAttributeName() const { return _attrName; } +public: + AttrDFW(const vespalib::string & attrName); + virtual bool IsGenerated() const { return true; } +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp new file mode 100644 index 00000000000..18d383d5bfd --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp @@ -0,0 +1,124 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchsummary/docsummary/docsumconfig.h> +#include <vespa/searchsummary/docsummary/rankfeaturesdfw.h> +#include <vespa/searchsummary/docsummary/summaryfeaturesdfw.h> +#include <vespa/searchsummary/docsummary/textextractordfw.h> +#include <vespa/searchsummary/docsummary/geoposdfw.h> +#include <vespa/searchsummary/docsummary/positionsdfw.h> +#include <vespa/searchsummary/docsummary/juniperdfw.h> +#include <vespa/vespalib/util/vstringfmt.h> + +LOG_SETUP(".searchlib.docsummary.docsumconfig"); + + +namespace search { +namespace docsummary { + +using vespalib::IllegalArgumentException; +using vespalib::make_string; + +IDocsumFieldWriter::UP +DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string & overrideName, const string & argument, bool & rc) +{ + const ResultConfig & resultConfig = getResultConfig(); + rc = false; + IDocsumFieldWriter::UP fieldWriter; + if (overrideName == "dynamicteaser") { + if ( ! argument.empty() ) { + const char *langFieldName = "something unused"; + DynamicTeaserDFW *fw = new DynamicTeaserDFW(getEnvironment()->getJuniper()); + fieldWriter.reset(fw); + rc = fw->Init(fieldName.c_str(), langFieldName, resultConfig, argument.c_str()); + } else { + throw IllegalArgumentException("Missing argument"); + } + } else if (overrideName == "textextractor") { + if ( ! 
argument.empty() ) { + TextExtractorDFW * fw = new TextExtractorDFW(); + fieldWriter.reset(fw); + rc = fw->init(fieldName, argument, resultConfig); + } else { + throw IllegalArgumentException("Missing argument"); + } + } else if (overrideName == "summaryfeatures") { + SummaryFeaturesDFW *fw = new SummaryFeaturesDFW(); + fieldWriter.reset(fw); + fw->init(getEnvironment()); + rc = true; + } else if (overrideName == "rankfeatures") { + RankFeaturesDFW * fw = new RankFeaturesDFW(); + fw->init(getEnvironment()); + fieldWriter.reset(fw); + rc = true; + } else if (overrideName == "empty") { + EmptyDFW *fw = new EmptyDFW(); + fieldWriter.reset(fw); + rc = true; + } else if (overrideName == "copy") { + if ( ! argument.empty() ) { + CopyDFW *fw = new CopyDFW(); + fieldWriter.reset(fw); + rc = fw->Init(resultConfig, argument.c_str()); + } else { + throw IllegalArgumentException("Missing argument"); + } + } else if (overrideName == "absdist") { + if (getEnvironment()) { + IAttributeManager *am = getEnvironment()->getAttributeManager(); + fieldWriter = createAbsDistanceDFW(argument.c_str(), am); + rc = fieldWriter.get(); + } + } else if (overrideName == "positions") { + if (getEnvironment()) { + IAttributeManager *am = getEnvironment()->getAttributeManager(); + fieldWriter = createPositionsDFW(argument.c_str(), am); + rc = fieldWriter.get(); + } + } else if (overrideName == "geopos") { + if (getEnvironment()) { + IAttributeManager *am = getEnvironment()->getAttributeManager(); + fieldWriter = GeoPositionDFW::create(argument.c_str(), am); + rc = fieldWriter.get(); + } + } else if (overrideName == "attribute") { + const char *vectorName = argument.c_str(); + if (getEnvironment() && getEnvironment()->getAttributeManager()) { + IDocsumFieldWriter *fw = AttributeDFWFactory::create(*getEnvironment()->getAttributeManager(), vectorName); + fieldWriter.reset(fw); + rc = fw != NULL; + } + } else { + throw IllegalArgumentException("unknown override operation '" + overrideName + "' for 
field '" + fieldName + "'."); + } + return fieldWriter; +} + +void +DynamicDocsumConfig::configure(const vespa::config::search::SummarymapConfig &cfg) +{ + std::vector<string> strCfg; + if ((cfg.defaultoutputclass != -1) && !_writer->SetDefaultOutputClass(cfg.defaultoutputclass)) { + throw IllegalArgumentException(make_string("could not set default output class to %d", cfg.defaultoutputclass)); + } + for (size_t i = 0; i < cfg.override.size(); ++i) { + const vespa::config::search::SummarymapConfig::Override & o = cfg.override[i]; + // DYNAMIC TEASER + bool rc(false); + IDocsumFieldWriter::UP fieldWriter = createFieldWriter(o.field, o.command, o.arguments, rc); + if (rc && fieldWriter.get() != NULL) { + rc = _writer->Override(o.field.c_str(), fieldWriter.release()); // OBJECT HAND-OVER + } + if (!rc) { + throw IllegalArgumentException(o.command + " override operation failed during initialization"); + } + } +} + + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.h new file mode 100644 index 00000000000..04f2890e7c1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.h @@ -0,0 +1,41 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/searchsummary/docsummary/docsumwriter.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/searchsummary/docsummary/idocsumenvironment.h> +#include <vespa/config-summarymap.h> + +namespace search { +namespace docsummary { + +class DynamicDocsumConfig +{ +public: + DynamicDocsumConfig(IDocsumEnvironment * env, DynamicDocsumWriter * writer) : + _env(env), + _writer(writer) + { + } + virtual ~DynamicDocsumConfig() { } + void configure(const vespa::config::search::SummarymapConfig &cfg); +protected: + typedef vespalib::string string; + IDocsumEnvironment * getEnvironment() { return _env; } + const IDocsumEnvironment * getEnvironment() const { return _env; } + const ResultConfig & getResultConfig() const { return *_writer->GetResultConfig(); } + + virtual IDocsumFieldWriter::UP + createFieldWriter(const string & fieldName, const string & overrideName, + const string & argument, bool & rc); +private: + IDocsumEnvironment * _env; + DynamicDocsumWriter * _writer; +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.cpp new file mode 100644 index 00000000000..fa379397476 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.cpp @@ -0,0 +1,286 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <math.h> +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/searchlib/common/documentlocations.h> +#include <vespa/searchlib/common/location.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/searchsummary/docsummary/idocsumenvironment.h> +#include <vespa/searchsummary/docsummary/docsumformat.h> +#include <vespa/searchlib/parsequery/stackdumpiterator.h> + +LOG_SETUP(".searchlib.docsummary.docsumfieldwriter"); + +namespace search { +namespace docsummary { + +using search::attribute::IAttributeContext; +using search::attribute::IAttributeVector; +using search::attribute::BasicType; +using search::common::Location; + +//-------------------------------------------------------------------------- + +const vespalib::string IDocsumFieldWriter::_empty(""); + +//-------------------------------------------------------------------------- + +EmptyDFW::EmptyDFW() +{ +} + + +EmptyDFW::~EmptyDFW() +{ +} + +void +EmptyDFW::insertField(uint32_t /*docid*/, + GeneralResult *, + GetDocsumsState *, + ResType, + vespalib::slime::Inserter &target) +{ + // insert explicitly-empty field? 
+ // target.insertNix(); + (void)target; + return; +} + +uint32_t +EmptyDFW::WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target) +{ + (void) docid; + (void) gres; + (void) state; + return DocsumFormat::addEmpty(type, *target); +} + +//-------------------------------------------------------------------------- + +CopyDFW::CopyDFW() + : _inputFieldEnumValue(static_cast<uint32_t>(-1)) +{ +} + + +CopyDFW::~CopyDFW() +{ +} + + +bool +CopyDFW::Init(const ResultConfig & config, const char *inputField) +{ + _inputFieldEnumValue = config.GetFieldNameEnum().Lookup(inputField); + + if (_inputFieldEnumValue >= config.GetFieldNameEnum().GetNumEntries()) { + LOG(warning, "no docsum format contains field '%s'; copied fields will be empty", inputField); + } + + for (ResultConfig::const_iterator it(config.begin()), mt(config.end()); it != mt; it++) { + const ResConfigEntry *entry = + it->GetEntry(it->GetIndexFromEnumValue(_inputFieldEnumValue)); + + if (entry != NULL && + !IsRuntimeCompatible(entry->_type, RES_INT) && + !IsRuntimeCompatible(entry->_type, RES_DOUBLE) && + !IsRuntimeCompatible(entry->_type, RES_INT64) && + !IsRuntimeCompatible(entry->_type, RES_STRING) && + !IsRuntimeCompatible(entry->_type, RES_DATA)) { + + LOG(warning, "cannot use docsum field '%s' as input to copy; type conflict with result class %d (%s)", + inputField, it->GetClassID(), it->GetClassName()); + return false; + } + } + return true; +} + + +void +CopyDFW::insertField(uint32_t /*docid*/, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target) +{ + int idx = gres->GetClass()->GetIndexFromEnumValue(_inputFieldEnumValue); + ResEntry *entry = gres->GetEntry(idx); + + if (entry != NULL && + IsRuntimeCompatible(entry->_type, type)) + { + switch (type) { + case RES_INT: { + uint32_t val32 = entry->_intval; + target.insertLong(val32); + break; } + + case RES_SHORT: { + uint16_t val16 = 
entry->_intval; + target.insertLong(val16); + break; } + + case RES_BYTE: { + uint8_t val8 = entry->_intval; + target.insertLong(val8); + break; } + + case RES_FLOAT: { + float valfloat = entry->_doubleval; + target.insertDouble(valfloat); + break; } + + case RES_DOUBLE: { + double valdouble = entry->_doubleval; + target.insertDouble(valdouble); + break; } + + case RES_INT64: { + uint64_t valint64 = entry->_int64val; + target.insertLong(valint64); + break; } + + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: + case RES_STRING: { + uint32_t len; + const char *spt; + // resolve field + entry->_resolve_field(&spt, &len, + &state->_docSumFieldSpace); + vespalib::slime::Memory value(spt, len); + target.insertString(value); + break; } + + case RES_LONG_DATA: + case RES_DATA: { + uint32_t len; + const char *dpt; + // resolve field + entry->_resolve_field(&dpt, &len, + &state->_docSumFieldSpace); + vespalib::slime::Memory value(dpt, len); + target.insertData(value); + break; } + } + } +} + +uint32_t +CopyDFW::WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target) +{ + (void) docid; + + uint32_t written = 0; + + int idx = gres->GetClass()->GetIndexFromEnumValue(_inputFieldEnumValue); + ResEntry *entry = gres->GetEntry(idx); + + DocsumFormat::Appender appender(*target); + + if (entry != NULL && + IsRuntimeCompatible(entry->_type, type)) { + + // copy field + + switch (type) { + + case RES_INT: { + written += appender.addInt32(entry->_intval); + break; } + + case RES_SHORT: { + written += appender.addShort(entry->_intval); + break; } + + case RES_BYTE: { + written += appender.addByte(entry->_intval); + break; } + + case RES_FLOAT: { + written += appender.addFloat(entry->_doubleval); + break; } + + case RES_DOUBLE: { + written += appender.addDouble(entry->_doubleval); + break; } + + case RES_INT64: { + written += appender.addInt64(entry->_int64val); + break; } + + case 
RES_STRING: { + uint32_t len; + const char *spt; + // resolve field + entry->_resolve_field(&spt, &len, + &state->_docSumFieldSpace); + written += appender.addShortData(spt, len); + break; } + + case RES_DATA: { + uint32_t len; + const char *dpt; + // resolve field + entry->_resolve_field(&dpt, &len, + &state->_docSumFieldSpace); + written += appender.addShortData(dpt, len); + break; } + + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: { + + uint32_t flen = entry->_len; + uint32_t slen = entry->_get_length(); + + // preserve compression flag + target->append(&flen, sizeof(flen)); + written += sizeof(flen); + target->append(entry->_stringval, slen); + written += slen; + + break; } + + case RES_LONG_DATA: { + + uint32_t flen = entry->_len; + uint32_t dlen = entry->_get_length(); + + // preserve compression flag + target->append(&flen, sizeof(flen)); + written += sizeof(flen); + target->append(entry->_dataval, dlen); + written += dlen; + + break; } + } + } else { + // insert empty field + written += appender.addEmpty(type); + } + + return written; +} + +//-------------------------------------------------------------------------- + +} // namespace docsummary +} // namespace search diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.h new file mode 100644 index 00000000000..4986697c5bd --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumfieldwriter.h @@ -0,0 +1,116 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
// Copyright (C) 1998-2003 Fast Search & Transfer ASA
// Copyright (C) 2003 Overture Services Norway AS

#pragma once

#include <vespa/searchlib/util/rawbuf.h>
#include <vespa/searchlib/attribute/iattributemanager.h>
#include <vespa/searchsummary/docsummary/urlresult.h>
#include <vespa/searchsummary/docsummary/docsumstate.h>
#include <vespa/searchsummary/docsummary/resultconfig.h>
#include <vespa/vespalib/data/slime/inserter.h>

namespace search {
namespace docsummary {


using search::IAttributeManager;

/**
 * Interface for an object that produces the value of a single docsum
 * field, either appended to a raw buffer (WriteField) or inserted into
 * slime (insertField).
 **/
class IDocsumFieldWriter
{
public:
    typedef std::unique_ptr<IDocsumFieldWriter> UP;
    IDocsumFieldWriter() : _index(0) { }
    virtual ~IDocsumFieldWriter() {}

    // Thin forwarders to the ResultConfig type-compatibility checks.
    static bool IsBinaryCompatible(ResType a, ResType b)
    { return ResultConfig::IsBinaryCompatible(a, b); }

    static bool IsRuntimeCompatible(ResType a, ResType b)
    { return ResultConfig::IsRuntimeCompatible(a, b); }

    /**
     * @return true if this writer counts as 'generated'. When every field
     *         of an output class has a generated writer, the docsum
     *         writer calls the field writers with a NULL 'gres'.
     **/
    virtual bool IsGenerated() const = 0;

    /**
     * Append the field value to 'target'.
     *
     * @return number of bytes appended (callers sum the return values)
     * @param docid local document id
     * @param gres unpacked input docsum; NULL when the docsum is generated
     * @param state per-request docsum state
     * @param type result type of the output field
     * @param target buffer receiving the field
     **/
    virtual uint32_t WriteField(uint32_t docid,
                                GeneralResult *gres,
                                GetDocsumsState *state,
                                ResType type,
                                search::RawBuf *target) = 0;

    /**
     * Slime counterpart of WriteField; same parameters, but the value is
     * handed to 'target' instead of being serialized into a buffer.
     **/
    virtual void insertField(uint32_t docid,
                             GeneralResult *gres,
                             GetDocsumsState *state,
                             ResType type,
                             vespalib::slime::Inserter &target) = 0;

    // Name of the attribute this writer reads; empty (the default) means
    // no attribute is looked up for it when the docsum state is set up.
    virtual const vespalib::string & getAttributeName() const { return _empty; }

    // Default: never report the value as a default value.
    virtual bool isDefaultValue(uint32_t docid, const GetDocsumsState * state) const {
        (void) docid;
        (void) state;
        return false;
    }

    // Slot index of this writer; the docsum writer sets it to the field
    // enum value when the writer is registered as an override.
    void setIndex(size_t v) { _index = v; }
    size_t getIndex() const { return _index; }
private:
    size_t _index;
    static const vespalib::string _empty;
};

//--------------------------------------------------------------------------

/**
 * Generated writer (IsGenerated() == true) whose WriteField emits the
 * empty value for the requested type.
 **/
class EmptyDFW : public IDocsumFieldWriter
{
public:
    EmptyDFW();
    virtual ~EmptyDFW();

    virtual bool IsGenerated() const { return true; }
    virtual uint32_t WriteField(uint32_t docid,
                                GeneralResult *gres,
                                GetDocsumsState *state,
                                ResType type,
                                search::RawBuf *target);
    virtual void insertField(uint32_t docid,
                             GeneralResult *gres,
                             GetDocsumsState *state,
                             ResType type,
                             vespalib::slime::Inserter &target);
};

//--------------------------------------------------------------------------

/**
 * Non-generated writer that copies the value of another field (named in
 * Init) from the unpacked input docsum.
 **/
class CopyDFW : public IDocsumFieldWriter
{
private:
    // enum value of the input field; static_cast<uint32_t>(-1) until Init
    uint32_t _inputFieldEnumValue;

public:
    CopyDFW();
    virtual ~CopyDFW();

    /**
     * Resolve 'inputField' against 'config'.
     *
     * @return false if some result class has the field with a type that
     *         cannot be used as copy input
     **/
    bool Init(const ResultConfig & config, const char *inputField);

    virtual bool IsGenerated() const { return false; }
    virtual uint32_t WriteField(uint32_t docid,
                                GeneralResult *gres,
                                GetDocsumsState *state,
                                ResType type,
                                search::RawBuf *target);
    virtual void insertField(uint32_t docid,
                             GeneralResult *gres,
                             GetDocsumsState *state,
                             ResType type,
                             vespalib::slime::Inserter &target);
};

//--------------------------------------------------------------------------

/**
 * Factory for attribute-backed field writers. The private constructor
 * means the class is only used through its static create function.
 * NOTE(review): ownership of the returned writer is not visible here;
 * presumably the caller owns it - confirm against the implementation.
 **/
class AttributeDFWFactory
{
private:
    AttributeDFWFactory();
public:
    static IDocsumFieldWriter *create(IAttributeManager & vecMan, const char *vecName);
};

} // namespace docsummary
} // namespace search

diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.cpp
new file mode 100644
index 00000000000..a837fca3bdb
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.cpp
@@ -0,0 +1,108 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchsummary/docsummary/docsumformat.h> + +namespace search { +namespace docsummary { + +LOG_SETUP(".searchlib.docsummary.docsumformat"); + + +size_t +DocsumFormat::addByte(search::RawBuf &target, uint8_t value) + +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addShort(search::RawBuf &target, uint16_t value) +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addInt32(search::RawBuf &target, uint32_t value) +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addFloat(search::RawBuf &target, float value) +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addDouble(search::RawBuf &target, double value) +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addInt64(search::RawBuf &target, uint64_t value) +{ + target.append(&value, sizeof(value)); + return sizeof(value); +} + +size_t +DocsumFormat::addShortData(search::RawBuf &target, const char *buf, uint32_t buflen) +{ + uint16_t len = (buflen > 0xffff ? 
0xffff : buflen); + target.append(&len, sizeof(len)); + target.append(buf, len); + + return sizeof(len) + len; +} + +size_t +DocsumFormat::addLongData(search::RawBuf &target, const char *buf, uint32_t buflen) +{ + target.append(&buflen, sizeof(buflen)); + target.append(buf, buflen); + + return sizeof(buflen) + buflen; +} + +size_t +DocsumFormat::addEmpty(ResType type, search::RawBuf &target) +{ + switch (type) { + case RES_BYTE: + return addByte(target, 0); + case RES_SHORT: + return addShort(target, 0); + case RES_INT: + return addInt32(target, 0); + case RES_INT64: + return addInt64(target, 0L); + case RES_FLOAT: + return addFloat(target, 0.0f); + case RES_DOUBLE: + return addDouble(target, 0.0); + case RES_STRING: + case RES_DATA: + return addShortData(target, "", 0); + case RES_LONG_STRING: + case RES_LONG_DATA: + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + return addLongData(target, "", 0); + } + LOG_ASSERT(type <= RES_FEATUREDATA); + return 0; +} + +} // namespace docsummary +} // namespace search diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.h new file mode 100644 index 00000000000..50bce13efd3 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumformat.h @@ -0,0 +1,67 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
// Copyright (C) 1998-2003 Fast Search & Transfer ASA
// Copyright (C) 2003 Overture Services Norway AS

#pragma once

#include <vespa/searchlib/util/rawbuf.h>
#include <vespa/searchsummary/docsummary/resultclass.h>

namespace search {
namespace docsummary {

/**
 * Static helpers that append docsum field values to a raw buffer.
 * Every function returns the number of bytes it appended.
 **/
class DocsumFormat
{
public:
    // Fixed-size values.
    static size_t addByte(search::RawBuf &target, uint8_t value);
    static size_t addShort(search::RawBuf &target, uint16_t value);
    static size_t addInt32(search::RawBuf &target, uint32_t value);
    static size_t addFloat(search::RawBuf &target, float value);
    static size_t addDouble(search::RawBuf &target, double value);
    static size_t addInt64(search::RawBuf &target, uint64_t value);

    // Length-prefixed values: 'short' uses a 16-bit length (payload is
    // cut at 0xffff bytes), 'long' uses a 32-bit length.
    static size_t addShortData(search::RawBuf &target, const char *buf, uint32_t buflen);
    static size_t addLongData(search::RawBuf &target, const char *buf, uint32_t buflen);

    // Appends the zero / zero-length value matching 'type'.
    static size_t addEmpty(ResType type, search::RawBuf &target);

    /**
     * Convenience wrapper bound to one target buffer; each member
     * forwards to the static function of the same name.
     * Holds a reference only, so the buffer must outlive the appender.
     **/
    class Appender {
    private:
        search::RawBuf &_target;
    public:
        Appender(search::RawBuf &target) : _target(target) {}

        size_t addByte(uint8_t value) {
            return DocsumFormat::addByte(_target, value);
        }
        size_t addShort(uint16_t value) {
            return DocsumFormat::addShort(_target, value);
        }
        size_t addInt32(uint32_t value) {
            return DocsumFormat::addInt32(_target, value);
        }
        size_t addFloat(float value) {
            return DocsumFormat::addFloat(_target, value);
        }
        size_t addDouble(double value) {
            return DocsumFormat::addDouble(_target, value);
        }
        size_t addInt64(uint64_t value) {
            return DocsumFormat::addInt64(_target, value);
        }
        size_t addShortData(const char *buf, uint32_t buflen) {
            return DocsumFormat::addShortData(_target, buf, buflen);
        }
        size_t addLongData(const char *buf, uint32_t buflen) {
            return DocsumFormat::addLongData(_target, buf, buflen);
        }

        size_t addEmpty(ResType type) {
            return DocsumFormat::addEmpty(type, _target);
        }
    };

};

}
}

diff --git
a/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.cpp new file mode 100644 index 00000000000..f8139dcb2a9 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.cpp @@ -0,0 +1,50 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/searchsummary/docsummary/docsumstate.h> + +namespace search { +namespace docsummary { + +GetDocsumsState::GetDocsumsState(GetDocsumsStateCallback &callback) + : _args(), + _docsumbuf(NULL), + _docsumcnt(0), + _kwExtractor(NULL), + _keywords(NULL), + _callback(callback), + _dynteaser(), + _docSumFieldSpace(_docSumFieldSpaceStore, sizeof(_docSumFieldSpaceStore)), // only alloc buffer if needed + _attrCtx(), + _attributes(), + _jsonStringer(), + _parsedLocation(), + _summaryFeatures(NULL), + _summaryFeaturesCached(false), + _rankFeatures(NULL) +{ + _dynteaser._docid = static_cast<uint32_t>(-1); + _dynteaser._input = static_cast<uint32_t>(-1); + _dynteaser._lang = static_cast<uint32_t>(-1); + _dynteaser._config = NULL; + _dynteaser._query = NULL; + _dynteaser._result = NULL; +} + + +GetDocsumsState::~GetDocsumsState() +{ + free(_docsumbuf); + free(_keywords); + if (_dynteaser._result != NULL) { + juniper::ReleaseResult(_dynteaser._result); + } + if (_dynteaser._query != NULL) { + juniper::ReleaseQueryHandle(_dynteaser._query); + } +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.h new file mode 100644 index 00000000000..48ba84fb1e4 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumstate.h @@ -0,0 +1,86 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root.
// Copyright (C) 1998-2003 Fast Search & Transfer ASA
// Copyright (C) 2003 Overture Services Norway AS

#pragma once

#include <vespa/juniper/rpinterface.h>

#include <vespa/searchlib/util/rawbuf.h>
#include <vespa/searchlib/attribute/attributeguard.h>
#include <vespa/searchsummary/docsummary/getdocsumargs.h>
#include <vespa/searchsummary/docsummary/idocsumenvironment.h>
#include <vespa/searchsummary/docsummary/keywordextractor.h>
#include <vespa/searchlib/common/featureset.h>
#include <vespa/searchlib/common/location.h>
#include <vespa/vespalib/util/jsonwriter.h>


namespace search {
namespace docsummary {

class GetDocsumsState;

/**
 * Callback interface held by reference in GetDocsumsState. Each method
 * receives the state it is expected to work on.
 * NOTE(review): presumably each call fills in the matching state member
 * (summary features, rank features, parsed location) - confirm against
 * the implementations.
 **/
class GetDocsumsStateCallback
{
public:
    virtual void FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env) = 0;
    virtual void FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env) = 0;
    virtual void ParseLocation(GetDocsumsState * state) = 0;
    virtual ~GetDocsumsStateCallback(void) { }
};

/**
 * Per-thread memory shared between all docsum field generators.
 **/
class GetDocsumsState
{
private:
    // declared but not defined: the state cannot be copied
    GetDocsumsState(const GetDocsumsState &);
    GetDocsumsState& operator=(const GetDocsumsState &);

public:
    // Unchecked lookup into _attributes; 'index' must be a valid slot.
    const search::attribute::IAttributeVector * getAttribute(size_t index) const { return _attributes[index]; }

    GetDocsumArgs _args;      // from getdocsums request

    uint32_t *_docsumbuf;     // from getdocsums request (released with free() in the destructor)
    uint32_t _docsumcnt;      // from getdocsums request

    KeywordExtractor *_kwExtractor;
    char *_keywords;          // list of keywords from query (released with free() in the destructor)

    GetDocsumsStateCallback &_callback;

    // Cached juniper state; _query and _result are released in the
    // destructor when non-null.
    struct DynTeaserState {
        uint32_t _docid;               // document id ('cache key')
        uint32_t _input;               // input field ('cache key')
        uint32_t _lang;                // lang field ('cache key')
        juniper::Config *_config;      // juniper config ('cache key')
        juniper::QueryHandle *_query;  // juniper query representation
        juniper::Result *_result;      // juniper analyze result
    } _dynteaser;

    // Scratch buffer handed to field resolution; the constructor points
    // it at _docSumFieldSpaceStore so small fields need no allocation.
    search::RawBuf _docSumFieldSpace;
    char _docSumFieldSpaceStore[2048];
    search::attribute::IAttributeContext::UP _attrCtx;
    // one slot per field writer; see getAttribute()
    std::vector<const search::attribute::IAttributeVector *> _attributes;
    vespalib::JSONStringer _jsonStringer;

    // used by AbsDistanceDFW
    std::unique_ptr<search::common::Location> _parsedLocation;

    // used by SummaryFeaturesDFW
    FeatureSet::SP _summaryFeatures;
    bool _summaryFeaturesCached;

    // used by RankFeaturesDFW
    FeatureSet::SP _rankFeatures;

    GetDocsumsState(GetDocsumsStateCallback &callback);
    ~GetDocsumsState();
};

}
}

diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumstore.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumstore.h
new file mode 100644
index 00000000000..fcdb16e2e05
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumstore.h
@@ -0,0 +1,57 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/fastos/fastos.h> +#include <utility> + +#include "docsumstorevalue.h" + +namespace search { +namespace docsummary { + + +/** + * Interface for object able to fetch docsum blobs based on local + * document id. + **/ +class IDocsumStore +{ +public: + /** + * Convenience typedef. + */ + typedef std::unique_ptr<IDocsumStore> UP; + + /** + * Destructor. No cleanup needed for base class. + */ + virtual ~IDocsumStore(void) { } + + /** + * @return total number of documents. + **/ + virtual uint32_t getNumDocs() = 0; + + /** + * Get a reference to a docsum blob in memory. The docsum store + * owns the memory (which is either mmap()ed or from a memory-based + * index of some kind). + * + * @return docsum blob location and size + * @param docid local document id + * @param useSlimeInsideFields use serialized slime instead of json for structured fields + **/ + virtual DocsumStoreValue getMappedDocsum(uint32_t docid, bool useSlimeInsideFields) = 0; + + /** + * Will return default input class used. + **/ + virtual uint32_t getSummaryClassId() const = 0; +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumstorevalue.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumstorevalue.h new file mode 100644 index 00000000000..9116ecf1395 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumstorevalue.h @@ -0,0 +1,63 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/fastos/fastos.h> +#include <utility> + +namespace search { +namespace docsummary { + +/** + * Simple wrapper class containing the location and size of a docsum + * blob located in memory. The memory containing the docsum blob is + * owned by the object that emitted the docsum store value object. 
+ * Always start with an uint32_t representing the result class ID. + **/ +class DocsumStoreValue +{ +private: + std::pair<const char *, uint32_t> _value; + +public: + /** + * Construct object representing an empty docsum blob. + **/ + DocsumStoreValue() : _value(static_cast<const char*>(0), 0) {} + + /** + * Construct object encapsulating the given location and size. + * + * @param pt_ docsum location + * @param len_ docsum size + **/ + DocsumStoreValue(const char *pt_, uint32_t len_) : _value(pt_, len_) {} + + /** + * @return docsum blob location + **/ + const char *pt() const { return _value.first; } + + /** + * @return docsum blob size + **/ + uint32_t len() const { return _value.second; } + + /** + * @return pointer to start of serialized docsum fields + **/ + const char *fieldsPt() const { return _value.first + sizeof(uint32_t); } + + /** + * @return size of serialized docsum fields + **/ + uint32_t fieldsSz() const { return _value.second - sizeof(uint32_t); } + + /** + * @return true if this has a valid blob + **/ + bool valid() const { return (_value.first != 0) && (_value.second >= sizeof(uint32_t)); } +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.cpp new file mode 100644 index 00000000000..8f0df1915db --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.cpp @@ -0,0 +1,517 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/fastlib/text/normwordfolder.h> +#include <vespa/searchsummary/docsummary/docsumwriter.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/searchsummary/docsummary/docsumstore.h> +#include <vespa/searchsummary/docsummary/keywordextractor.h> +#include <vespa/searchsummary/docsummary/docsumformat.h> +#include <vespa/searchlib/common/transport.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h> + +using namespace vespalib::slime::convenience; + +namespace search { +namespace docsummary { + +LOG_SETUP(".searchlib.docsummary.docsumwriter"); + + +uint32_t +IDocsumWriter::slime2RawBuf(const Slime & slime, RawBuf & buf) +{ + const uint32_t preUsed = buf.GetUsedLen(); + const uint32_t magic = ::search::fs4transport::SLIME_MAGIC_ID; + buf.append(&magic, sizeof(magic)); + SlimeOutputRawBufAdapter adapter(buf); + vespalib::slime::BinaryFormat::encode(slime, adapter); + return (buf.GetUsedLen() - preUsed); +} + +uint32_t +DynamicDocsumWriter::WriteClassID(uint32_t classID, search::RawBuf *target) +{ + uint32_t written = 0; + + target->append(&classID, sizeof(classID)); + written = sizeof(classID); + + return written; +} + + +uint32_t +DynamicDocsumWriter::GenerateDocsum(uint32_t docid, + GetDocsumsState *state, + const ResultClass *outputClass, + search::RawBuf *target) +{ + uint32_t written = 0; + + written += WriteClassID(outputClass->GetClassID(), target); + + for (uint32_t i = 0; i < outputClass->GetNumEntries(); i++) { + + const ResConfigEntry *outCfg = outputClass->GetEntry(i); + IDocsumFieldWriter *writer = _overrideTable[outCfg->_enumValue]; + LOG_ASSERT(writer != NULL); + + written += writer->WriteField(docid, NULL, state, outCfg->_type, target); + } + + return written; +} + + +uint32_t 
+DynamicDocsumWriter::RepackDocsum(GeneralResult *gres, + GetDocsumsState *state, + const ResultClass *outputClass, + search::RawBuf *target) +{ + uint32_t written = 0; + + written += WriteClassID(outputClass->GetClassID(), target); + + DocsumFormat::Appender appender(*target); + + for (uint32_t i = 0; i < outputClass->GetNumEntries(); i++) { + + const ResConfigEntry *outCfg = outputClass->GetEntry(i); + IDocsumFieldWriter *writer = _overrideTable[outCfg->_enumValue]; + + if (writer != NULL) { + + written += writer->WriteField(gres->GetDocID(), gres, state, + outCfg->_type, target); + + } else { + + int inIdx = gres->GetClass()->GetIndexFromEnumValue(outCfg->_enumValue); + const ResConfigEntry *inCfg = gres->GetClass()->GetEntry(inIdx); + + if (inCfg != NULL && inCfg->_type == outCfg->_type) { + + // copy field + + const ResEntry *entry = gres->GetEntry(inIdx); + LOG_ASSERT(entry != NULL); + + switch (outCfg->_type) { + + case RES_INT: { + written += appender.addInt32(entry->_intval); + break; } + + case RES_SHORT: { + written += appender.addShort(entry->_intval); + break; } + + case RES_BYTE: { + written += appender.addByte(entry->_intval); + break; } + + case RES_FLOAT: { + written += appender.addFloat(entry->_doubleval); + break; } + + case RES_DOUBLE: { + written += appender.addDouble(entry->_doubleval); + break; } + + case RES_INT64: { + written += appender.addInt64(entry->_int64val); + break; } + + case RES_STRING: { + uint32_t slen = entry->_stringlen; + const char *sval = entry->_stringval; + written += appender.addShortData(sval, slen); + break; } + + case RES_DATA: { + uint32_t dlen = entry->_datalen; + const char *dval = entry->_dataval; + written += appender.addShortData(dval, dlen); + break; } + + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: { + uint32_t flen = entry->_len; + uint32_t slen = entry->_get_length(); + + // preserve compression flag + target->append(&flen, sizeof(flen)); + written += 
sizeof(flen); + target->append(entry->_stringval, slen); + written += slen; + break; } + + case RES_LONG_DATA: { + uint32_t flen = entry->_len; + uint32_t dlen = entry->_get_length(); + + // preserve compression flag + target->append(&flen, sizeof(flen)); + written += sizeof(flen); + target->append(entry->_dataval, dlen); + written += dlen; + break; } + } + + } else { + // insert empty field + written += appender.addEmpty(outCfg->_type); + } + } + } // END for loop + + return written; +} + + +DynamicDocsumWriter::ResolveClassInfo +DynamicDocsumWriter::resolveClassInfo(vespalib::stringref outputClassName, uint32_t inputClassId) const +{ + DynamicDocsumWriter::ResolveClassInfo rci = resolveOutputClass(outputClassName); + if (!rci.mustSkip && !rci.allGenerated) { + resolveInputClass(rci, inputClassId); + } + return rci; +} + +DynamicDocsumWriter::ResolveClassInfo +DynamicDocsumWriter::resolveOutputClass(vespalib::stringref summaryClass) const +{ + DynamicDocsumWriter::ResolveClassInfo result; + uint32_t id = _defaultOutputClass; + id = _resultConfig->LookupResultClassId(summaryClass, id); + + if (id != ResultConfig::NoClassID()) { + const ResultClass *oC = _resultConfig->LookupResultClass(id); + if (oC == NULL) { + LOG(warning, "Illegal docsum class requested: %d, using empty docsum for documents", id); + result.mustSkip = true; + } else { + result.outputClass = oC; + const ResultClass::DynamicInfo *rcInfo = oC->getDynamicInfo(); + if (rcInfo->_generateCnt == oC->GetNumEntries()) { + LOG_ASSERT(rcInfo->_overrideCnt == rcInfo->_generateCnt); + result.allGenerated = true; + } + result.outputClassInfo = rcInfo; + } + } + result.outputClassId = id; + return result; +} + +void +DynamicDocsumWriter::resolveInputClass(ResolveClassInfo &rci, uint32_t id) const +{ + rci.inputClass = _resultConfig->LookupResultClass(id); + if (rci.inputClass == NULL) { + rci.mustSkip = true; + return; + } + if (rci.outputClass == NULL) { + LOG_ASSERT(rci.outputClassId == 
ResultConfig::NoClassID()); + rci.outputClassId = id; + rci.outputClass = rci.inputClass; + rci.outputClassInfo = rci.inputClass->getDynamicInfo(); + } + if ((rci.inputClass == rci.outputClass) && (rci.outputClassInfo->_overrideCnt == 0)) { + rci.mustRepack = false; + } +} + +void +DynamicDocsumWriter::resolveInputClass(ResolveClassInfo &rci, DocsumStoreValue blob) const +{ + uint32_t id = _resultConfig->GetClassID(blob.pt(), blob.len()); + resolveInputClass(rci, id); +} + + +uint32_t +DynamicDocsumWriter::oldWriteDocsum(uint32_t docid, + GetDocsumsState *state, + IDocsumStore *docinfos, + search::RawBuf *target) +{ + ResolveClassInfo rci = resolveOutputClass(state->_args.getResultClassName()); + if (rci.mustSkip) { + return 0; + } else if (rci.allGenerated) { + // generate docsum entry on-the-fly + return GenerateDocsum(docid, state, rci.outputClass, target); + } + // look up docsum entry + DocsumStoreValue value = docinfos->getMappedDocsum(docid, false); + resolveInputClass(rci, value); + if (rci.mustSkip) { + return 0; + } else if (rci.mustRepack) { + // re-pack docsum blob + GeneralResult gres(rci.inputClass, 0, docid, 0); + if (gres.inplaceUnpack(value)) { + return RepackDocsum(&gres, state, rci.outputClass, target); + } else { // unpack failed + LOG(error, "Unpack failed: illegal docsum entry for document %d", docid); + } + } else { + // pass-through docsum blob + target->append(value.pt(), value.len()); + return value.len(); + } + return 0; +} + + +static void convertEntry(GetDocsumsState *state, + const ResConfigEntry *resCfg, + const ResEntry *entry, + Inserter &inserter, + Slime &slime) +{ + using vespalib::slime::BinaryFormat; + const char *ptr; + uint32_t len; + + LOG_ASSERT(resCfg != 0 && entry != 0); + switch (resCfg->_type) { + case RES_INT: + case RES_SHORT: + case RES_BYTE: + inserter.insertLong(entry->_intval); + break; + case RES_FLOAT: + case RES_DOUBLE: + inserter.insertDouble(entry->_doubleval); + break; + case RES_INT64: + 
inserter.insertLong(entry->_int64val); + break; + case RES_STRING: + case RES_LONG_STRING: + case RES_FEATUREDATA: + case RES_XMLSTRING: + entry->_resolve_field(&ptr, &len, &state->_docSumFieldSpace); + inserter.insertString(Memory(ptr, len)); + break; + case RES_DATA: + case RES_LONG_DATA: + entry->_resolve_field(&ptr, &len, &state->_docSumFieldSpace); + inserter.insertData(Memory(ptr, len)); + break; + case RES_JSONSTRING: + entry->_resolve_field(&ptr, &len, &state->_docSumFieldSpace); + if (len != 0) { + // note: 'JSONSTRING' really means 'structured data', + // and in this code path we depend on calling the + // getMappedDocsum api with flag useSlimeInsideFields=true + size_t d = BinaryFormat::decode_into(Memory(ptr, len), slime, inserter); + if (d != len) { + LOG(warning, "could not decode %u bytes: %zu bytes decoded", len, d); + } + } + break; + } +} + + +void +DynamicDocsumWriter::insertDocsum(const ResolveClassInfo & rci, + uint32_t docid, + GetDocsumsState *state, + IDocsumStore *docinfos, + vespalib::Slime & slime, + vespalib::slime::Inserter & topInserter) +{ + if (rci.allGenerated) { + // generate docsum entry on-the-fly + vespalib::slime::Cursor & docsum = topInserter.insertObject(); + for (uint32_t i = 0; i < rci.outputClass->GetNumEntries(); ++i) { + const ResConfigEntry *resCfg = rci.outputClass->GetEntry(i); + IDocsumFieldWriter *writer = _overrideTable[resCfg->_enumValue]; + if (! writer->isDefaultValue(docid, state)) { + const Memory field_name(resCfg->_bindname.data(), + resCfg->_bindname.size()); + ObjectInserter inserter(docsum, field_name); + writer->insertField(docid, NULL, state, resCfg->_type, inserter); + } + } + } else { + // look up docsum entry + DocsumStoreValue value = docinfos->getMappedDocsum(docid, true); + // re-pack docsum blob + GeneralResult gres(rci.inputClass, 0, docid, 0); + if (! 
gres.inplaceUnpack(value)) { + LOG(error, "Unpack failed: illegal docsum entry for document %d", docid); + topInserter.insertNix(); + return; + } + vespalib::slime::Cursor & docsum = topInserter.insertObject(); + for (uint32_t i = 0; i < rci.outputClass->GetNumEntries(); ++i) { + const ResConfigEntry *outCfg = rci.outputClass->GetEntry(i); + IDocsumFieldWriter *writer = _overrideTable[outCfg->_enumValue]; + const Memory field_name(outCfg->_bindname.data(), outCfg->_bindname.size()); + ObjectInserter inserter(docsum, field_name); + if (writer != NULL) { + writer->insertField(docid, &gres, state, outCfg->_type, inserter); + } else { + if (rci.inputClass == rci.outputClass) { + convertEntry(state, outCfg, gres.GetEntry(i), inserter, slime); + } else { + int inIdx = rci.inputClass->GetIndexFromEnumValue(outCfg->_enumValue); + const ResConfigEntry *inCfg = rci.inputClass->GetEntry(inIdx); + if (inCfg != NULL && inCfg->_type == outCfg->_type) { + // copy field + const ResEntry *entry = gres.GetEntry(inIdx); + LOG_ASSERT(entry != NULL); + convertEntry(state, outCfg, entry, inserter, slime); + } + } + } + } + } +} + + +DynamicDocsumWriter::DynamicDocsumWriter( ResultConfig *config, KeywordExtractor *extractor) + : _resultConfig(config), + _keywordExtractor(extractor), + _defaultOutputClass(ResultConfig::NoClassID()), + _numClasses(config->GetNumResultClasses()), + _numEnumValues(config->GetFieldNameEnum().GetNumEntries()), + _classInfoTable(NULL), + _overrideTable(NULL) +{ + LOG_ASSERT(config != NULL); + _classInfoTable = new ResultClass::DynamicInfo[_numClasses]; + _overrideTable = new IDocsumFieldWriter*[_numEnumValues]; + + uint32_t i = 0; + for (ResultConfig::iterator it(config->begin()), mt(config->end()); it != mt; it++, i++) { + _classInfoTable[i]._overrideCnt = 0; + _classInfoTable[i]._generateCnt = 0; + it->setDynamicInfo(&(_classInfoTable[i])); + } + LOG_ASSERT(i == _numClasses); + + for (i = 0; i < _numEnumValues; i++) + _overrideTable[i] = NULL; +} + + 
+DynamicDocsumWriter::~DynamicDocsumWriter() +{ + delete _resultConfig; + delete _keywordExtractor; + + delete [] _classInfoTable; + + for (uint32_t i = 0; i < _numEnumValues; i++) + delete _overrideTable[i]; + delete [] _overrideTable; + +} + +bool +DynamicDocsumWriter::SetDefaultOutputClass(uint32_t classID) +{ + const ResultClass *resClass = _resultConfig->LookupResultClass(classID); + + if (resClass == NULL || + _defaultOutputClass != ResultConfig::NoClassID()) + { + if (resClass == NULL) { + LOG(warning, "cannot set default output docsum class to %d; class not defined", classID); + } else if (_defaultOutputClass != ResultConfig::NoClassID()) { + LOG(warning, "cannot set default output docsum class to %d; value already set", classID); + } + return false; + } + _defaultOutputClass = classID; + return true; +} + + +bool +DynamicDocsumWriter::Override(const char *fieldName, IDocsumFieldWriter *writer) +{ + uint32_t fieldEnumValue = _resultConfig->GetFieldNameEnum().Lookup(fieldName); + + if (fieldEnumValue >= _numEnumValues || + _overrideTable[fieldEnumValue] != NULL) + { + + if (fieldEnumValue >= _numEnumValues) { + LOG(warning, "cannot override docsum field '%s'; undefined field name", fieldName); + } else if (_overrideTable[fieldEnumValue] != NULL) { + LOG(warning, "cannot override docsum field '%s'; already overridden", fieldName); + } + delete writer; + return false; + } + + writer->setIndex(fieldEnumValue); + _overrideTable[fieldEnumValue] = writer; + + for (ResultConfig::iterator it(_resultConfig->begin()), mt(_resultConfig->end()); it != mt; it++) { + + if (it->GetIndexFromEnumValue(fieldEnumValue) >= 0) { + ResultClass::DynamicInfo *info = it->getDynamicInfo(); + info->_overrideCnt++; + if (writer->IsGenerated()) + info->_generateCnt++; + } + } + + return true; +} + + +void +DynamicDocsumWriter::InitState(IAttributeManager & attrMan, GetDocsumsState *state) +{ + state->_kwExtractor = _keywordExtractor; + state->_attrCtx = attrMan.createContext(); + 
state->_attributes.resize(_numEnumValues); + for (size_t i(0); i < state->_attributes.size(); i++) { + const IDocsumFieldWriter *fw = _overrideTable[i]; + if (fw) { + const vespalib::string & attributeName = fw->getAttributeName(); + if (!attributeName.empty()) { + state->_attributes[i] = state->_attrCtx->getAttribute(attributeName); + } + } + } +} + + +uint32_t +DynamicDocsumWriter::WriteDocsum(uint32_t docid, + GetDocsumsState *state, + IDocsumStore *docinfos, + search::RawBuf *target) +{ + if ((state->_args.getFlags() & ::search::fs4transport::GDFLAG_ALLOW_SLIME) != 0) { + vespalib::Slime slime; + vespalib::slime::SlimeInserter inserter(slime); + insertDocsum(resolveClassInfo(state->_args.getResultClassName(), docinfos->getSummaryClassId()), docid, state, docinfos, slime, inserter); + return slime2RawBuf(slime, *target); + } + return oldWriteDocsum(docid, state, docinfos, target); +} + + +} // namespace search::docsummary +} // namespace search diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.h b/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.h new file mode 100644 index 00000000000..0dd7204ba16 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumwriter.h @@ -0,0 +1,123 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA
+// Copyright (C) 2003 Overture Services Norway AS
+
+#pragma once
+
+#include <vespa/searchlib/util/rawbuf.h>
+#include <vespa/searchlib/attribute/iattributemanager.h>
+#include <vespa/searchsummary/docsummary/urlresult.h>
+#include <vespa/searchsummary/docsummary/resultconfig.h>
+#include <vespa/searchsummary/docsummary/docsumstate.h>
+#include <vespa/searchsummary/docsummary/docsumstore.h>
+#include <vespa/searchsummary/docsummary/keywordextractor.h>
+#include <vespa/searchsummary/docsummary/docsumfieldwriter.h>
+#include <vespa/fastlib/text/unicodeutil.h>
+#include <vespa/fastlib/text/wordfolder.h>
+#include "juniperproperties.h"
+
+// NOTE(review): this using-declaration is visible to every file that includes
+// this header; removing it could break includers, so it is left as is.
+using search::IAttributeManager;
+
+namespace search {
+namespace docsummary {
+
+/**
+ * Interface for producing document summaries (docsums), either serialized
+ * into a RawBuf or inserted into a Slime structure.
+ **/
+class IDocsumWriter
+{
+public:
+    /**
+     * Result of resolving which input and output result classes apply to a
+     * docsum. A default-constructed instance has mustSkip and allGenerated
+     * false, mustRepack true, no output class id and NULL class pointers.
+     **/
+    struct ResolveClassInfo {
+        bool mustSkip;
+        bool allGenerated;
+        bool mustRepack;
+        uint32_t outputClassId;
+        const ResultClass *outputClass;
+        const ResultClass::DynamicInfo *outputClassInfo;
+        const ResultClass *inputClass;
+        ResolveClassInfo()
+            : mustSkip(false), allGenerated(false), mustRepack(true),
+              outputClassId(ResultConfig::NoClassID()),
+              outputClass(NULL), outputClassInfo(NULL), inputClass(NULL)
+        {}
+    };
+
+    virtual ~IDocsumWriter() {}
+    /** Prepare the given state before docsums are written with it. */
+    virtual void InitState(IAttributeManager & attrMan, GetDocsumsState *state) = 0;
+    /** Write the docsum for docid into target. */
+    virtual uint32_t WriteDocsum(uint32_t docid,
+                                 GetDocsumsState *state,
+                                 IDocsumStore *docinfos,
+                                 search::RawBuf *target) = 0;
+    /** Insert the docsum for docid through the given Slime inserter. */
+    virtual void insertDocsum(const ResolveClassInfo & rci,
+                              uint32_t docid,
+                              GetDocsumsState *state,
+                              IDocsumStore *docinfos,
+                              vespalib::Slime & slime,
+                              vespalib::slime::Inserter & target) = 0;
+    /** Resolve class information from an output class name and an input class id. */
+    virtual ResolveClassInfo resolveClassInfo(vespalib::stringref outputClassName, uint32_t inputClassId) const = 0;
+
+    /** Serialize a Slime structure into buf. */
+    static uint32_t slime2RawBuf(const vespalib::Slime & slime, RawBuf & buf);
+};
+
+//--------------------------------------------------------------------------
+
+/**
+ * Docsum writer driven by a ResultConfig, with optional per-field override
+ * writers registered through Override().
+ *
+ * The config and keyword extractor passed to the constructor, and every
+ * writer passed to Override(), are deleted by this class (see the
+ * destructor and Override() in docsumwriter.cpp).
+ **/
+class DynamicDocsumWriter : public IDocsumWriter
+{
+private:
+    // Not copyable: declared but not implemented.
+    DynamicDocsumWriter(const DynamicDocsumWriter &);
+    DynamicDocsumWriter& operator=(const DynamicDocsumWriter &);
+
+
+private:
+    ResultConfig *_resultConfig;
+    KeywordExtractor *_keywordExtractor;
+    uint32_t _defaultOutputClass;
+    uint32_t _numClasses;
+    uint32_t _numEnumValues;
+    // One entry per result class (_numClasses entries).
+    ResultClass::DynamicInfo *_classInfoTable;
+    // One entry per field name enum value (_numEnumValues entries);
+    // NULL where the field is not overridden.
+    IDocsumFieldWriter **_overrideTable;
+
+    uint32_t WriteClassID(uint32_t classID, search::RawBuf *target);
+
+    uint32_t GenerateDocsum(uint32_t docid,
+                            GetDocsumsState *state,
+                            const ResultClass *outputClass,
+                            search::RawBuf *target);
+
+    uint32_t RepackDocsum(GeneralResult *gres,
+                          GetDocsumsState *state,
+                          const ResultClass *outputClass,
+                          search::RawBuf *target);
+
+    void resolveInputClass(ResolveClassInfo &rci, uint32_t id) const;
+    void resolveInputClass(ResolveClassInfo &rci, DocsumStoreValue blob) const;
+    ResolveClassInfo resolveOutputClass(vespalib::stringref outputClassName) const;
+
+    // Used by WriteDocsum when the request does not allow Slime output.
+    uint32_t oldWriteDocsum(uint32_t docid, GetDocsumsState *state,
+                            IDocsumStore *docinfos,
+                            search::RawBuf *target);
+
+public:
+    DynamicDocsumWriter(ResultConfig *config, KeywordExtractor *extractor);
+    virtual ~DynamicDocsumWriter();
+
+    ResultConfig *GetResultConfig() { return _resultConfig; }
+
+    bool SetDefaultOutputClass(uint32_t classID);
+    bool Override(const char *fieldName, IDocsumFieldWriter *writer);
+    void InitState(IAttributeManager & attrMan, GetDocsumsState *state) override;
+    uint32_t WriteDocsum(uint32_t docid,
+                         GetDocsumsState *state,
+                         IDocsumStore *docinfos,
+                         search::RawBuf *target) override;
+
+    void insertDocsum(const ResolveClassInfo & outputClassInfo,
+                      uint32_t docid,
+                      GetDocsumsState *state,
+                      IDocsumStore *docinfos,
+                      vespalib::Slime & slime,
+                      vespalib::slime::Inserter & target) override;
+
+    ResolveClassInfo resolveClassInfo(vespalib::stringref outputClassName, uint32_t inputClassId) const override;
+};
+
+}
+}
+
diff --git
a/searchsummary/src/vespa/searchsummary/docsummary/dynamicteaserdfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/dynamicteaserdfw.cpp
new file mode 100644
index 00000000000..8c2c0b2e65c
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/dynamicteaserdfw.cpp
@@ -0,0 +1,494 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA
+// Copyright (C) 2003 Overture Services Norway AS
+
+#include <vespa/fastos/fastos.h>
+#include <cstdio>
+#include <vespa/log/log.h>
+#include "juniperdfw.h"
+#include <vespa/searchlib/parsequery/stackdumpiterator.h>
+#include <vespa/searchlib/util/rawbuf.h>
+#include <vespa/searchlib/queryeval/split_float.h>
+
+#include <vespa/searchlib/fef/properties.h>
+#include <vespa/searchsummary/docsummary/docsumwriter.h>
+#include <vespa/searchsummary/docsummary/docsumfieldwriter.h>
+#include <vespa/searchsummary/docsummary/docsumstate.h>
+#include <vespa/searchsummary/docsummary/keywordextractor.h>
+#include <vespa/searchsummary/docsummary/docsumformat.h>
+#include <vespa/vespalib/objects/nbostream.h>
+#include <vespa/juniper/config.h>
+
+LOG_SETUP(".searchlib.docsummary.dynamicteaserdfw");
+
+namespace juniper
+{
+
+
+/**
+ * Index name, term and weight of a query item that is described explicitly
+ * instead of being read from a query stack dump.
+ *
+ * Only pointers and lengths are stored; the character data is not copied,
+ * so it must outlive this struct.
+ **/
+struct ExplicitItemData
+{
+    const char *_index;
+    uint32_t _indexlen;
+    const char *_term;
+    uint32_t _termlen;
+    uint32_t _weight;
+
+    /** Creates an empty item: NULL pointers, zero lengths, zero weight. */
+    ExplicitItemData()
+        : _index(NULL), _indexlen(0), _term(NULL), _termlen(0), _weight(0)
+    {}
+
+    ExplicitItemData(const char *index, uint32_t indexlen, const char* term,
+                     uint32_t termlen, uint32_t weight = 0)
+        : _index(index), _indexlen(indexlen), _term(term), _termlen(termlen), _weight(weight)
+    {}
+};
+
+
+
+/**
+ * This struct is used to point to the traversal state located on
+ * the stack of the IQuery Traverse method. This is needed because
+ * the Traverse method is const.
+ **/ +struct QueryItem +{ + search::SimpleQueryStackDumpIterator *_si; + const ExplicitItemData *_data; + QueryItem() : _si(NULL), _data(NULL) {} + QueryItem(search::SimpleQueryStackDumpIterator *si) : _si(si), _data(NULL) {} + QueryItem(ExplicitItemData *data) : _si(NULL), _data(data) {} +private: + QueryItem(const QueryItem&); + QueryItem& operator= (const QueryItem&); +}; +}; + +namespace search { +class Property; + +namespace fef { +class TermVisitor : public IPropertiesVisitor +{ +public: + juniper::IQueryVisitor *_visitor; + juniper::QueryItem _item; + + TermVisitor(juniper::IQueryVisitor *visitor) : + _visitor(visitor), _item() {} + + virtual void visitProperty(const Property::Value &key, const Property &values); + +}; + +void +TermVisitor::visitProperty(const Property::Value &key, const Property &values) +{ + juniper::ExplicitItemData data; + juniper::QueryItem item(&data); + int index = 0; + int numBlocks = atoi(values.getAt(index++).c_str()); + data._index = key.c_str(); + data._indexlen = key.length(); + + _visitor->VisitAND(&item, numBlocks); + + for (int i = 0; i < numBlocks; i++) { + const Property::Value * s = & values.getAt(index++); + if ((*s)[0] == '"') { + s = & values.getAt(index++); + int phraseLen = atoi(s->c_str()); + _visitor->VisitPHRASE(&item, phraseLen); + s = & values.getAt(index++); + while ((*s)[0] != '"') { + data._term = s->c_str(); + data._termlen = s->length(); + _visitor->VisitKeyword(&item, s->c_str(), s->length()); + s = & values.getAt(index++); + } + } else { + data._term = s->c_str(); + data._termlen = s->length(); + _visitor->VisitKeyword(&item, s->c_str(), s->length()); + } + } +} + +} + +namespace docsummary { + +class JuniperQueryAdapter : public juniper::IQuery +{ +private: + JuniperQueryAdapter(const JuniperQueryAdapter&); + JuniperQueryAdapter operator= (const JuniperQueryAdapter&); + + KeywordExtractor *_kwExtractor; + const vespalib::stringref _buf; + const search::fef::Properties *_highlightTerms; + 
juniper::IQueryVisitor *_visitor; + +public: + JuniperQueryAdapter(KeywordExtractor *kwExtractor, + const vespalib::stringref &buf, + const search::fef::Properties *highlightTerms = NULL) + : _kwExtractor(kwExtractor), _buf(buf), _highlightTerms(highlightTerms), _visitor(NULL) {} + + // TODO: put this functionality into the stack dump iterator + bool SkipItem(search::SimpleQueryStackDumpIterator *iterator) const + { + uint32_t skipCount = iterator->getArity(); + + while (skipCount > 0) { + if (!iterator->next()) + return false; // stack too small + skipCount = skipCount - 1 + iterator->getArity(); + } + return true; + } + + virtual bool Traverse(juniper::IQueryVisitor *v) const; + + virtual int Weight(const juniper::QueryItem* item) const + { + if (item->_si != NULL) { + return item->_si->GetWeight().percent(); + } else { + return item->_data->_weight; + } + } + virtual juniper::ItemCreator Creator(const juniper::QueryItem* item) const + { + // cast master: Knut Omang + if (item->_si != NULL) { + return (juniper::ItemCreator) item->_si->getCreator(); + } else { + return juniper::CREA_ORIG; + } + } + virtual const char *Index(const juniper::QueryItem* item, size_t *len) const + { + if (item->_si != NULL) { + const char *ret; + item->_si->getIndexName(&ret, len); + return ret; + } else { + return item->_data->_index; + } + + } + virtual bool UsefulIndex(const juniper::QueryItem* item) const + { + const char *buf; + size_t buflen; + + if (_kwExtractor == NULL) + return true; + + if (item->_si != NULL) { + item->_si->getIndexName(&buf, &buflen); + } else { + buf = item->_data->_index; + buflen = item->_data->_indexlen; + } + return _kwExtractor->IsLegalIndex(buf, buflen); + } +}; + + + +bool +JuniperQueryAdapter::Traverse(juniper::IQueryVisitor *v) const +{ + bool rc = true; + search::SimpleQueryStackDumpIterator iterator(_buf); + juniper::QueryItem item(&iterator); + const char *buf; + size_t buflen; + + if (_highlightTerms->numKeys() > 0) { + v->VisitAND(&item, 2); + 
} + while (rc && iterator.next()) { + bool isSpecialToken = search::ParseItem::getFlag(iterator.getFlags(), search::ParseItem::IFLAG_SPECIALTOKEN); + switch (iterator.getType()) { + case search::ParseItem::ITEM_OR: + case search::ParseItem::ITEM_WEAK_AND: + case search::ParseItem::ITEM_EQUIV: + case search::ParseItem::ITEM_WORD_ALTERNATIVES: + // XXX unhandled + // case search::ParseItem::ITEM_WAND: + // case search::ParseItem::ITEM_WEIGHTED_SET: + // case search::ParseItem::ITEM_DOT_PRODUCT: + if (!v->VisitOR(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_AND: + if (!v->VisitAND(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_NOT: + if (!v->VisitANDNOT(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_RANK: + if (!v->VisitRANK(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_TERM: + case search::ParseItem::ITEM_EXACTSTRINGTERM: + case search::ParseItem::ITEM_PURE_WEIGHTED_STRING: + // XXX unhandled + // case search::ParseItem::ITEM_PURE_WEIGHTED_LONG: + iterator.getTerm(&buf, &buflen); + v->VisitKeyword(&item, buf, buflen, false, isSpecialToken); + break; + case search::ParseItem::ITEM_NUMTERM: + iterator.getTerm(&buf, &buflen); + { + vespalib::string termStr(buf, buflen); + queryeval::SplitFloat splitter(termStr); + if (splitter.parts() > 1) { + if (v->VisitPHRASE(&item, splitter.parts())) { + for (size_t i = 0; i < splitter.parts(); ++i) { + v->VisitKeyword(&item, + splitter.getPart(i).c_str(), + splitter.getPart(i).size(), false); + } + } + } else if (splitter.parts() == 1) { + v->VisitKeyword(&item, + splitter.getPart(0).c_str(), + splitter.getPart(0).size(), false); + } else { + v->VisitKeyword(&item, buf, buflen, false, true); + } + } + break; + case search::ParseItem::ITEM_PHRASE: + if (!v->VisitPHRASE(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; 
+ case search::ParseItem::ITEM_PAREN: + if (!v->VisitOther(&item, iterator.getArity())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_PREFIXTERM: + case search::ParseItem::ITEM_SUBSTRINGTERM: + // XXX unhandled + // case search::ParseItem::ITEM_SUFFIXTERM: + iterator.getTerm(&buf, &buflen); + v->VisitKeyword(&item, buf, buflen, true, isSpecialToken); + break; + case search::ParseItem::ITEM_ANY: +#if (JUNIPER_RP_API_MINOR_VERSION >= 1) + if (!v->VisitANY(&item, iterator.getArity())) +#else + if (!v->VisitOR(&item, iterator.getArity())) +#endif + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_NEAR: + if (!v->VisitNEAR(&item, iterator.getArity(),iterator.getArg1())) + rc = SkipItem(&iterator); + break; + case search::ParseItem::ITEM_ONEAR: + if (!v->VisitWITHIN(&item, iterator.getArity(),iterator.getArg1())) + rc = SkipItem(&iterator); + break; + // XXX unhandled + // case search::ParseItem::ITEM_REGEXP: + // case search::ParseItem::ITEM_PREDICATE_QUERY: + default: + rc = false; + } + } + + if (_highlightTerms->numKeys() > 1) { + v->VisitAND(&item, _highlightTerms->numKeys()); + } + fef::TermVisitor tv(v); + _highlightTerms->visitProperties(tv); + + return rc; +} + +JuniperDFW::JuniperDFW(juniper::Juniper * juniper) + : _inputFieldEnumValue(static_cast<uint32_t>(-1)) + , _juniperConfig() + , _langFieldEnumValue(static_cast<uint32_t>(-1)) + , _juniper(juniper) +{ +} + + +JuniperDFW::~JuniperDFW() +{ +} + +bool +JuniperDFW::Init( + const char *fieldName, + const char *langFieldName, + const ResultConfig & config, + const char *inputField) +{ + bool rc = true; + const util::StringEnum & enums(config.GetFieldNameEnum()); + if (langFieldName != NULL) + _langFieldEnumValue = enums.Lookup(langFieldName); + _juniperConfig = _juniper->CreateConfig(fieldName); + if (_juniperConfig.get() == NULL) { + LOG(warning, "could not create juniper config for field '%s'", fieldName); + rc = false; + } + + _inputFieldEnumValue = 
enums.Lookup(inputField); + + if (_inputFieldEnumValue >= enums.GetNumEntries()) { + LOG(warning, "no docsum format contains field '%s'; dynamic teasers will be empty", + inputField); + } + return rc; +} + +bool +JuniperTeaserDFW::Init( + const char *fieldName, + const char *langFieldName, + const ResultConfig & config, + const char *inputField) +{ + bool rc = JuniperDFW::Init(fieldName, langFieldName, config, inputField); + + for (ResultConfig::const_iterator it(config.begin()), mt(config.end()); rc && it != mt; it++) { + + const ResConfigEntry *entry = + it->GetEntry(it->GetIndexFromEnumValue(_inputFieldEnumValue)); + + if (entry != NULL && + !IsRuntimeCompatible(entry->_type, RES_STRING) && + !IsRuntimeCompatible(entry->_type, RES_DATA)) + { + LOG(warning, "cannot use docsum field '%s' as input to dynamicteaser; bad type in result class %d (%s)", + inputField, it->GetClassID(), it->GetClassName()); + rc = false; + } + } + return rc; +} + +vespalib::string +DynamicTeaserDFW::makeDynamicTeaser(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state) +{ + if (state->_dynteaser._query == NULL) { + JuniperQueryAdapter iq(state->_kwExtractor, + state->_args.getStackDump(), + &state->_args.highlightTerms()); + state->_dynteaser._query = _juniper->CreateQueryHandle(iq, NULL); + } + + if (docid != state->_dynteaser._docid || + _inputFieldEnumValue != state->_dynteaser._input || + _langFieldEnumValue != state->_dynteaser._lang || + !juniper::AnalyseCompatible(_juniperConfig.get(), state->_dynteaser._config)) { + LOG(debug, "makeDynamicTeaser: docid (%d,%d), fieldenum (%d,%d), lang (%d,%d) analyse %s", + docid, state->_dynteaser._docid, + _inputFieldEnumValue, state->_dynteaser._input, + _langFieldEnumValue, state->_dynteaser._lang, + (juniper::AnalyseCompatible(_juniperConfig.get(), state->_dynteaser._config) ? 
"no" : "yes")); + + if (state->_dynteaser._result != NULL) + juniper::ReleaseResult(state->_dynteaser._result); + + state->_dynteaser._docid = docid; + state->_dynteaser._input = _inputFieldEnumValue; + state->_dynteaser._lang = _langFieldEnumValue; + state->_dynteaser._config = _juniperConfig.get(); + state->_dynteaser._result = NULL; + + int idx = gres->GetClass()->GetIndexFromEnumValue(_inputFieldEnumValue); + ResEntry *entry = gres->GetEntry(idx); + + if (entry != NULL && + state->_dynteaser._query != NULL) { + + // obtain Juniper input + const char *buf; + uint32_t buflen; + + entry->_resolve_field(&buf, &buflen, + &state->_docSumFieldSpace); + + if (LOG_WOULD_LOG(spam)) { + std::ostringstream hexDump; + hexDump << vespalib::HexDump(buf, buflen); + LOG(spam, "makeDynamicTeaser: docid=%d, input='%s', hexdump:\n%s", + docid, std::string(buf, buflen).c_str(), + hexDump.str().c_str()); + } + + uint32_t langid = static_cast<uint32_t>(-1); + + state->_dynteaser._result = + juniper::Analyse(_juniperConfig.get(), state->_dynteaser._query, + buf, buflen, docid, _inputFieldEnumValue, langid); + } + } + + juniper::Summary *teaser = (state->_dynteaser._result != NULL) + ? juniper::GetTeaser(state->_dynteaser._result, _juniperConfig.get()) + : NULL; + + if (LOG_WOULD_LOG(debug)) { + std::ostringstream hexDump; + if (teaser != NULL) { + hexDump << vespalib::HexDump(teaser->Text(), teaser->Length()); + } + LOG(debug, "makeDynamicTeaser: docid=%d, teaser='%s', hexdump:\n%s", + docid, (teaser != NULL ? 
std::string(teaser->Text(), teaser->Length()).c_str() : "NULL"), + hexDump.str().c_str()); + } + + if (teaser != NULL) { + return vespalib::string(teaser->Text(), + teaser->Length()); + } else { + return vespalib::string(); + } +} + +uint32_t +DynamicTeaserDFW::WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target) +{ + vespalib::string teaser = makeDynamicTeaser(docid, gres, state); + + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + if (isLong) { + return DocsumFormat::addLongData(*target, teaser.c_str(), teaser.size()); + } else { + return DocsumFormat::addShortData(*target, teaser.c_str(), teaser.size()); + } +} + +void +DynamicTeaserDFW::insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType, + vespalib::slime::Inserter &target) +{ + vespalib::string teaser = makeDynamicTeaser(docid, gres, state); + vespalib::slime::Memory value(teaser.c_str(), teaser.size()); + target.insertString(value); +} + +} // namespace docsummary +} // namespace search + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.cpp new file mode 100644 index 00000000000..e15f0b8c986 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.cpp @@ -0,0 +1,195 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.docsummary.geoposdfw"); +#include "geoposdfw.h" +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/searchlib/common/documentlocations.h> +#include <vespa/searchlib/common/location.h> +#include <vespa/vespalib/util/jsonwriter.h> + +namespace search { +namespace docsummary { + +using attribute::IAttributeVector; +using attribute::IAttributeContext; + +GeoPositionDFW::GeoPositionDFW(const vespalib::string & attrName) : + AttrDFW(attrName) +{ +} + +namespace { + +void fmtZcurve(int64_t zval, vespalib::slime::Inserter &target) +{ + int32_t docx = 0; + int32_t docy = 0; + vespalib::geo::ZCurve::decode(zval, &docx, &docy); + if (docx == 0 && docy == INT_MIN) { + LOG(spam, "skipping empty zcurve value"); + } else { + vespalib::slime::Cursor &obj = target.insertObject(); + obj.setLong("y", docy); + obj.setLong("x", docx); + } +} + +void fmtZcurve(int64_t zval, vespalib::JSONWriter json) +{ + int32_t docx = 0; + int32_t docy = 0; + vespalib::geo::ZCurve::decode(zval, &docx, &docy); + json.beginObject(); + json.appendKey("y"); json.appendInt64(docy); + json.appendKey("x"); json.appendInt64(docx); + json.endObject(); +} + +} // namespace <unnamed> + +vespalib::asciistream +GeoPositionDFW::formatField(const IAttributeVector & attribute, uint32_t docid) +{ + vespalib::asciistream target; + vespalib::JSONWriter json(target); + + if (attribute.hasMultiValue()) { + uint32_t entries = attribute.getValueCount(docid); + LOG(debug, "docid=%d, entries=%d", docid, entries); + json.beginArray(); + if (attribute.hasWeightedSetType()) { + std::vector<IAttributeVector::WeightedInt> elements(entries); + entries = attribute.get(docid, &elements[0], entries); + for (uint32_t i = 0; i < entries; ++i) { + json.beginObject(); + int64_t pos = elements[i].getValue(); + json.appendKey("item"); + fmtZcurve(pos, json); + json.appendKey("weight"); + json.appendInt64(elements[i].getWeight()); + 
json.endObject(); + } + } else { + std::vector<IAttributeVector::largeint_t> elements(16); + uint32_t numValues = attribute.get(docid, &elements[0], elements.size()); + if (numValues > elements.size()) { + elements.resize(numValues); + numValues = attribute.get(docid, &elements[0], elements.size()); + assert(numValues <= elements.size()); + } + LOG(debug, "docid=%d, numValues=%d", docid, numValues); + for (uint32_t i = 0; i < numValues; i++) { + int64_t pos = elements[i]; + fmtZcurve(pos, json); + } + } + } else { + int64_t pos = attribute.getInt(docid); + LOG(debug, "docid=%d, pos=%ld", docid, pos); + fmtZcurve(pos, json); + } + return target; +} + +void +GeoPositionDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState * dsState, + ResType, + vespalib::slime::Inserter &target) +{ + using vespalib::slime::Cursor; + using vespalib::slime::ObjectInserter; + using vespalib::slime::ArrayInserter; + + const IAttributeVector & attribute = vec(*dsState); + if (attribute.hasMultiValue()) { + uint32_t entries = attribute.getValueCount(docid); + Cursor &arr = target.insertArray(); + if (attribute.hasWeightedSetType()) { + std::vector<IAttributeVector::WeightedInt> elements(entries); + entries = attribute.get(docid, &elements[0], entries); + for (uint32_t i = 0; i < entries; ++i) { + Cursor &elem = arr.addObject(); + int64_t pos = elements[i].getValue(); + ObjectInserter obj(elem, "item"); + fmtZcurve(pos, obj); + elem.setLong("weight", elements[i].getWeight()); + } + } else { + std::vector<IAttributeVector::largeint_t> elements(16); + uint32_t numValues = attribute.get(docid, &elements[0], elements.size()); + if (numValues > elements.size()) { + elements.resize(numValues); + numValues = attribute.get(docid, &elements[0], elements.size()); + assert(numValues <= elements.size()); + } + for (uint32_t i = 0; i < numValues; i++) { + int64_t pos = elements[i]; + ArrayInserter obj(arr); + fmtZcurve(pos, obj); + } + } + } else { + int64_t pos = 
attribute.getInt(docid); + fmtZcurve(pos, target); + } +} + +uint32_t +GeoPositionDFW::WriteField(uint32_t docid, + GeneralResult *, + GetDocsumsState * dsState, + ResType type, + search::RawBuf * target) +{ + int str_len_ofs = target->GetUsedLen(); + + vespalib::asciistream val(formatField(vec(*dsState), docid)); + + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + if (isLong) { + uint32_t str_len_32 = val.size(); + target->append(&str_len_32, sizeof(str_len_32)); + target->append(val.c_str(), str_len_32); + } else { + uint16_t str_len_16 = val.size(); + target->append(&str_len_16, sizeof(str_len_16)); + target->append(val.c_str(), str_len_16); + } + // calculate number of bytes written + uint32_t written = target->GetUsedLen() - str_len_ofs; + return written; +} + +GeoPositionDFW::UP +GeoPositionDFW::create(const char *attribute_name, + IAttributeManager *attribute_manager) +{ + GeoPositionDFW::UP ret; + if (attribute_manager != NULL) { + if (!attribute_name) { + LOG(warning, "create: missing attribute name '%p'", attribute_name); + return ret; + } + IAttributeContext::UP context = attribute_manager->createContext(); + if (!context.get()) { + LOG(warning, "create: could not create context from attribute manager"); + return ret; + } + const IAttributeVector *attribute = context->getAttribute(attribute_name); + if (!attribute) { + LOG(warning, "create: could not get attribute '%s' from context", attribute_name); + return ret; + } + } + ret.reset(new GeoPositionDFW(attribute_name)); + return ret; +} + + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.h b/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.h new file mode 100644 index 00000000000..91d834d45a7 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/geoposdfw.h @@ -0,0 +1,36 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/attributedfw.h>
+
+namespace search {
+namespace docsummary {
+
+/**
+ * This is the docsum field writer used to extract the position (as a string) from a zcurve attribute
+ **/
+class GeoPositionDFW : public AttrDFW
+{
+private:
+    /** Format the position(s) stored for docid in the given attribute vector. */
+    vespalib::asciistream formatField(const attribute::IAttributeVector & v, uint32_t docid);
+public:
+    typedef std::unique_ptr<GeoPositionDFW> UP;
+    GeoPositionDFW(const vespalib::string & attrName);
+    /** Write the position field for docid into target. */
+    virtual uint32_t WriteField(uint32_t docid,
+                                GeneralResult * gres,
+                                GetDocsumsState * state,
+                                ResType type,
+                                search::RawBuf * target);
+    /** Insert the position field for docid through the Slime inserter. */
+    virtual void insertField(uint32_t docid,
+                             GeneralResult *gres,
+                             GetDocsumsState *state,
+                             ResType type,
+                             vespalib::slime::Inserter &target);
+    /** Factory: create a writer for the named attribute. */
+    static UP create(const char *attribute_name,
+                     IAttributeManager *attribute_manager);
+};
+
+}
+}
+
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.cpp b/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.cpp
new file mode 100644
index 00000000000..1316a370a7c
--- /dev/null
+++ b/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.cpp
@@ -0,0 +1,96 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/searchsummary/docsummary/getdocsumargs.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> + +namespace search { +namespace docsummary { + +GetDocsumArgs::GetDocsumArgs() + : _ranking(), + _qflags(0), + _resultClassName(), + _stackItems(0), + _stackDump(), + _location(), + _timeout(30 * fastos::TimeStamp::SEC), + _flags(0u), + _propertiesMap(), + _isLocationSet(false) +{ +} + + +GetDocsumArgs::~GetDocsumArgs() +{ +} + +void +GetDocsumArgs::setTimeout(const fastos::TimeStamp & timeout) +{ + _timeout = timeout; +} + +fastos::TimeStamp +GetDocsumArgs::getTimeout() const +{ + return _timeout; +} + + +void +GetDocsumArgs::Reset() +{ + _ranking.clear(); + _qflags = 0; + _stackItems = 0; + _timeout = 30 * fastos::TimeStamp::SEC; + _flags = 0; + _resultClassName.clear(); + _stackDump.clear(); + _location.clear(); + _isLocationSet = false; + { + PropsMap tmp; + std::swap(_propertiesMap, tmp); + } +} + + +void +GetDocsumArgs::Copy(GetDocsumArgs *src) +{ + if (src == this) { + return; + } + *src = *this; +} + +void +GetDocsumArgs::initFromDocsumRequest(const search::engine::DocsumRequest &req) +{ + _ranking = req.ranking; + _qflags = req.queryFlags; + _resultClassName = req.resultClassName; + _stackItems = req.stackItems; + _stackDump = req.stackDump; + _location = req.location; + _timeout = req.getTimeLeft(); + _flags = req._flags; + _propertiesMap = req.propertiesMap; + _isLocationSet = (_location.size() > 0); +} + +void +GetDocsumArgs::SetStackDump(uint32_t stackItems, uint32_t stackDumpLen, const char *stackDump) +{ + _stackItems = stackItems; + _stackDump.resize(stackDumpLen); + memcpy(&_stackDump[0], stackDump, _stackDump.size()); +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.h b/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.h new file mode 100644 
index 00000000000..4b4a9e29f02 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/getdocsumargs.h @@ -0,0 +1,91 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2003 Overture Services Norway AS +// Copyright (C) 1999-2003 Fast Search & Transfer ASA + +#pragma once + +#include <vespa/searchlib/fef/properties.h> +#include <vespa/searchlib/common/packets.h> +#include <vespa/searchlib/engine/docsumrequest.h> +#include <vespa/searchlib/engine/propertiesmap.h> + +namespace search { +namespace docsummary { + +class GetDocsumArgs +{ +public: + typedef engine::PropertiesMap PropsMap; + +private: + vespalib::string _ranking; + uint32_t _qflags; + vespalib::string _resultClassName; + uint32_t _stackItems; + std::vector<char> _stackDump; + vespalib::string _location; + fastos::TimeStamp _timeout; + uint32_t _flags; + PropsMap _propertiesMap; + + bool _isLocationSet; + +public: + GetDocsumArgs(); + ~GetDocsumArgs(); + + void Reset(); + void Copy(GetDocsumArgs *src); + void initFromDocsumRequest(const search::engine::DocsumRequest &req); + + void SetRankProfile(const vespalib::string &ranking) { _ranking = ranking; } + void SetQueryFlags(uint32_t qflags) { _qflags = qflags; } + void SetResultClassName(uint32_t len, const char *name) { + _resultClassName.assign(name, len); + } + void setResultClassName(const vespalib::stringref & name) { _resultClassName = name; } + void SetStackDump(uint32_t stackItems, + uint32_t stackDumpLen, const char *stackDump); + void SetLocation(uint32_t locationLen, const char *location) { + if ((_isLocationSet = (location != NULL))) { + _location.assign(location, locationLen); + } + } + + void + setFlags(uint32_t flags) + { + _flags = flags; + } + + void setTimeout(const fastos::TimeStamp & timeout); + fastos::TimeStamp getTimeout() const; + + const vespalib::string & getRankProfile() const { return _ranking; } + const vespalib::string & 
getResultClassName() const { return _resultClassName; } + const vespalib::string & getLocation() const { return _location; } + const vespalib::stringref getStackDump() const { + return vespalib::stringref(&_stackDump[0], _stackDump.size()); + } + + uint32_t GetQueryFlags() const { return _qflags; } + uint32_t GetStackItems() const { return _stackItems; } + uint32_t GetLocationLen() const { return _location.size(); } + uint32_t getFlags() const { return _flags; } + + const PropsMap &propertiesMap() const { return _propertiesMap; } + + const search::fef::Properties &rankProperties() const { + return _propertiesMap.rankProperties(); + } + const search::fef::Properties &featureOverrides() const { + return _propertiesMap.featureOverrides(); + } + const search::fef::Properties &highlightTerms() const { + return _propertiesMap.highlightTerms(); + } +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/idocsumenvironment.h b/searchsummary/src/vespa/searchsummary/docsummary/idocsumenvironment.h new file mode 100644 index 00000000000..546acd24f3f --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/idocsumenvironment.h @@ -0,0 +1,24 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/juniper/rpinterface.h> + +namespace search { +namespace docsummary { + +/** + * Abstract view of information available to rewriters for generating docsum fields. 
/**
 * Abstract view of information available to rewriters for generating docsum fields.
 **/
class IDocsumEnvironment {
public:
    /** Returns the attribute manager (ownership is not expressed by the signature). */
    virtual search::IAttributeManager * getAttributeManager() = 0;
    /** Maps the index name 's' to another name; KeywordExtractor treats an empty result as "not legal". */
    virtual vespalib::string lookupIndex(const vespalib::string & s) const = 0;
    /** Returns the juniper instance (ownership is not expressed by the signature). */
    virtual juniper::Juniper * getJuniper() = 0;
    virtual ~IDocsumEnvironment() {}
};

}
}

// ---- searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h ----
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include <vespa/vespalib/stllike/string.h>

namespace search {
namespace docsummary {

/**
 * Interface for a tokenizer.
 */
class ITokenizer
{
public:
    /**
     * Representation of a token with type and text and optional stemmed variant.
     * Text and stem are held as non-owning string references into the
     * character ranges passed to the constructors.
     */
    class Token
    {
    public:
        enum Type {
            WORD,        // Fast_UnicodeUtil::IsWordChar() returns true
            NON_WORD,    // Fast_UnicodeUtil::IsWordChar() returns false
            PUNCTUATION, // Fast_UnicodeUtil::IsTerminalPunctuationChar() returns true
            ANNOTATION,  // Interlinear annotation
            NOT_DEF
        };
    private:
        vespalib::stringref _text;
        vespalib::stringref _stem;
        Type                _type;

    public:
        /** Creates a token without stem covering [textBegin, textEnd). */
        Token(const char * textBegin, const char * textEnd, Type type) :
            _text(textBegin, textEnd - textBegin), _stem(), _type(type) {}
        /** Creates a token covering [textBegin, textEnd) with stem [stemBegin, stemEnd). */
        Token(const char * textBegin, const char * textEnd, const char * stemBegin, const char * stemEnd, Type type) :
            _text(textBegin, textEnd - textBegin), _stem(stemBegin, stemEnd - stemBegin), _type(type) {}
        const vespalib::stringref & getText() const { return _text; }
        const vespalib::stringref & getStem() const { return _stem; }
        // NOTE(review): relies on a default-constructed stringref reporting a NULL c_str() - confirm.
        bool hasStem() const { return _stem.c_str() != NULL; }
        Type getType() const { return _type; }
    };

    virtual ~ITokenizer() {}

    /**
     * Reset the tokenizer using the given buffer.
     */
    virtual void reset(const char * buf, size_t len) = 0;

    /**
     * Returns the size of the underlying buffer.
     */
    virtual size_t getBufferSize() const = 0;

    /**
     * Returns true if the text buffer has more tokens.
     */
    virtual bool hasMoreTokens() = 0;

    /**
     * Returns the next token from the text buffer.
     */
    virtual Token getNextToken() = 0;
};

}
}

// ---- searchsummary/src/vespa/searchsummary/docsummary/juniperdfw.h ----
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#pragma once

#include <vespa/searchlib/util/rawbuf.h>
#include <vespa/searchsummary/docsummary/urlresult.h>
#include <vespa/searchsummary/docsummary/docsumstate.h>
#include <vespa/searchsummary/docsummary/resultconfig.h>
#include <vespa/vespalib/data/slime/inserter.h>
#include "docsumfieldwriter.h"

namespace search {
namespace docsummary {

/**
 * Base class for docsum field writers that use juniper. It holds the
 * juniper instance pointer, a juniper config object and the enum values
 * of the input and language fields. Only subclasses can construct it.
 */
class JuniperDFW : public IDocsumFieldWriter
{
public:
    virtual bool Init(
            const char *fieldName,
            const char *langFieldName,
            const ResultConfig & config,
            const char *inputField);
protected:
    JuniperDFW(juniper::Juniper * juniper);
    virtual ~JuniperDFW();

    uint32_t                         _inputFieldEnumValue;
    std::unique_ptr<juniper::Config> _juniperConfig;
    uint32_t                         _langFieldEnumValue;
    juniper::Juniper                *_juniper;
private:
    virtual bool IsGenerated() const { return false; }
    // Copying is disabled: declared private, no definition visible here.
    JuniperDFW(const JuniperDFW &);
    JuniperDFW & operator=(const JuniperDFW &);
};


/**
 * JuniperDFW specialization with its own Init(); constructible by subclasses only.
 */
class JuniperTeaserDFW : public JuniperDFW
{
public:
    virtual bool Init(
            const char *fieldName,
            const char *langFieldName,
            const ResultConfig & config,
            const char *inputField);
protected:
    JuniperTeaserDFW(juniper::Juniper * juniper) : JuniperDFW(juniper) { }
};


/**
 * Concrete teaser writer exposing both output paths of a field writer:
 * WriteField() (RawBuf target) and insertField() (slime inserter target).
 */
class DynamicTeaserDFW : public JuniperTeaserDFW
{
public:
    DynamicTeaserDFW(juniper::Juniper * juniper) : JuniperTeaserDFW(juniper) { }

    vespalib::string makeDynamicTeaser(uint32_t docid,
                                       GeneralResult *gres,
                                       GetDocsumsState *state);

    virtual uint32_t WriteField(uint32_t docid,
                                GeneralResult *gres,
                                GetDocsumsState *state,
                                ResType type,
                                search::RawBuf *target);
    virtual void insertField(uint32_t docid,
                             GeneralResult *gres,
                             GetDocsumsState *state,
                             ResType type,
                             vespalib::slime::Inserter &target);
};

}  // namespace docsummary
}  // namespace search

// ---- searchsummary/src/vespa/searchsummary/docsummary/juniperproperties.cpp ----
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h> +#include <vespa/searchcommon/config/subscriptionproxyng.h> +#include <vespa/vespalib/util/vstringfmt.h> +#include "juniperproperties.h" + +using vespa::config::search::summary::JuniperrcConfig; + +namespace search { +namespace docsummary { + +JuniperProperties::JuniperProperties() : + _properties() +{ + reset(); +} + +JuniperProperties::JuniperProperties(const JuniperrcConfig &cfg) : + _properties() +{ + reset(); + configure(cfg); +} + +JuniperProperties::~JuniperProperties() { + // empty +} + +void +JuniperProperties::reset() +{ + _properties.clear(); + //_properties["juniper.debug_mask"] = "0"; + //_properties["juniper.dynsum.connectors"] = "\x1F\x1D"; + _properties["juniper.dynsum.continuation"] = "\x1E"; + _properties["juniper.dynsum.escape_markup"] = "off"; + _properties["juniper.dynsum.fallback"] = "prefix"; + _properties["juniper.dynsum.highlight_off"] = "\x1F"; + _properties["juniper.dynsum.highlight_on"] = "\x1F"; + _properties["juniper.dynsum.preserve_white_space"] = "on"; + //_properties["juniper.dynsum.length"] = "256"; + //_properties["juniper.dynsum.max_matches"] = "3"; + //_properties["juniper.dynsum.min_length"] = "128"; + //_properties["juniper.dynsum.separators"] = "\x1F\x1D"; + //_properties["juniper.dynsum.surround_max"] = "128"; + _properties["juniper.matcher.winsize"] = "200"; + _properties["juniper.matcher.winsize_fallback_multiplier"] = "10.0"; + _properties["juniper.matcher.max_match_candidates"] = "1000"; + //_properties["juniper.proximity.factor"] = "0.25"; + //_properties["juniper.stem.max_extend"] = "3"; + //_properties["juniper.stem.min_length"] = "5"; +} + +void +JuniperProperties::configure(const JuniperrcConfig &cfg) +{ + reset(); + _properties["juniper.dynsum.fallback"] = cfg.prefix ? 
"prefix" : "none"; + _properties["juniper.dynsum.length"] = vespalib::make_vespa_string("%d", cfg.length); + _properties["juniper.dynsum.max_matches"] = vespalib::make_vespa_string("%d", cfg.maxMatches); + _properties["juniper.dynsum.min_length"] = vespalib::make_vespa_string("%d", cfg.minLength); + _properties["juniper.dynsum.surround_max"] = vespalib::make_vespa_string("%d", cfg.surroundMax); + _properties["juniper.matcher.winsize"] = vespalib::make_vespa_string("%d", cfg.winsize); + _properties["juniper.matcher.winsize_fallback_multiplier"] = vespalib::make_vespa_string("%f", cfg.winsizeFallbackMultiplier); + _properties["juniper.matcher.max_match_candidates"] = vespalib::make_vespa_string("%d", cfg.maxMatchCandidates); + _properties["juniper.stem.min_length"] = vespalib::make_vespa_string("%d", cfg.stemMinLength); + _properties["juniper.stem.max_extend"] = vespalib::make_vespa_string("%d", cfg.stemMaxExtend); + + for (uint32_t i = 0; i < cfg.override.size(); ++i) { + const JuniperrcConfig::Override &override = cfg.override[i]; + const vespalib::string keyDynsum = vespalib::make_vespa_string("%s.dynsum.", override.fieldname.c_str()); + const vespalib::string keyMatcher = vespalib::make_vespa_string("%s.matcher.", override.fieldname.c_str()); + const vespalib::string keyStem = vespalib::make_vespa_string("%s.stem.", override.fieldname.c_str()); + + _properties[keyDynsum + "fallback"] = override.prefix ? 
"prefix" : "none"; + _properties[keyDynsum + "length"] = vespalib::make_vespa_string("%d", override.length); + _properties[keyDynsum + "max_matches"] = vespalib::make_vespa_string("%d", override.maxMatches); + _properties[keyDynsum + "min_length"] = vespalib::make_vespa_string("%d", override.minLength); + _properties[keyDynsum + "surround_max"] = vespalib::make_vespa_string("%d", override.surroundMax); + + _properties[keyMatcher + "winsize"] = vespalib::make_vespa_string("%d", override.winsize); + _properties[keyMatcher + "winsize_fallback_multiplier"] = vespalib::make_vespa_string("%f", override.winsizeFallbackMultiplier); + _properties[keyMatcher + "max_match_candidates"] = vespalib::make_vespa_string("%d", override.maxMatchCandidates); + + _properties[keyStem + "min_length"] = vespalib::make_vespa_string("%d", override.stemMinLength); + _properties[keyStem + "max_extend"] = vespalib::make_vespa_string("%d", override.stemMaxExtend); + } +} + +void +JuniperProperties::subscribe(const char *configId) +{ + SubscriptionProxyNg<JuniperProperties, JuniperrcConfig> subscriber(*this, &JuniperProperties::configure); + subscriber.subscribe(configId); +} + +const char * +JuniperProperties::GetProperty(const char *name, const char *def) +{ + std::map<vespalib::string, vespalib::string>::const_iterator it = _properties.find(name); + return it != _properties.end() ? it->second.c_str() : def; +} + +void +JuniperProperties::SetProperty(const vespalib::string &key, const vespalib::string &val) +{ + _properties[key] = val; +} + +} // namespace docsummary +} // namespace search + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/juniperproperties.h b/searchsummary/src/vespa/searchsummary/docsummary/juniperproperties.h new file mode 100644 index 00000000000..5e9cdce48c7 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/juniperproperties.h @@ -0,0 +1,70 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +#pragma once + +#include <vespa/juniper/IJuniperProperties.h> +#include <map> +#include <vespa/searchsummary/config/config-juniperrc.h> +#include <string> + +namespace search { +namespace docsummary { + +class JuniperProperties : public IJuniperProperties { +private: + std::map<vespalib::string, vespalib::string> _properties; + + /** + * Resets the property map to all default values. This is used for the empty constructor and also called before + * retrieving configured properties. + */ + void reset(); + + +public: + /** + * Constructs a juniper property object with default values set. + */ + JuniperProperties(); + /** + * Constructs a juniper property object with default values set. + */ + JuniperProperties(const vespa::config::search::summary::JuniperrcConfig &cfg); + + /** + * Destructor. Frees any allocated resources. + */ + virtual ~JuniperProperties(); + + /** + * This method subscribes to config from the given configuration id. This does the necessary mapping from + * user-friendly configuration parameters to juniper specific properties. Note that no exceptions thrown by the + * configuration framework are caught in this method. Please refer to the config framework for details on what to + * expect. + * + * @param configId The config id to subscribe to. + */ + void subscribe(const char *configId); + + /** + * Implements configure callback for config subscription. + * + * @param cfg The configuration object. + */ + void configure(const vespa::config::search::summary::JuniperrcConfig &cfg); + + // Inherit doc from IJuniperProperties. + const char *GetProperty(const char *name, const char *def = NULL); + + /** + * Sets the value of a given named property. If the property already exists, it is overwritten. If it does not + * exist, it is added. + * + * @param key The name of the property to set. + * @param val The value to set for the property. 
+ */ + void SetProperty(const vespalib::string &key, const vespalib::string &val); +}; + +} // namespace docsummary +} // namespace search + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.cpp b/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.cpp new file mode 100644 index 00000000000..6d567f9a6da --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.cpp @@ -0,0 +1,233 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchlib/parsequery/stackdumpiterator.h> +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/searchsummary/docsummary/docsumstate.h> +#include <vespa/searchsummary/docsummary/keywordextractor.h> + + +LOG_SETUP(".searchlib.docsummary.keywordextractor"); + +/** Tell us what parts of the query we are interested in */ + +namespace search { +namespace docsummary { + + +bool useful(search::ParseItem::ItemCreator creator) +{ + switch (creator) + { + case search::ParseItem::CREA_ORIG: + return true; + default: + return false; + } +} + + +KeywordExtractor::KeywordExtractor(IDocsumEnvironment * env) + : _env(env), + _legalPrefixes(NULL), + _legalIndexes() +{ +} + + +KeywordExtractor::~KeywordExtractor() +{ + while (_legalPrefixes != NULL) { + IndexPrefix *tmp = _legalPrefixes; + _legalPrefixes = tmp->_next; + delete tmp; + } +} + + +void +KeywordExtractor::AddLegalIndexSpec(const char *spec) +{ + if (spec == NULL) + return; + + vespalib::string toks(spec); // tokens + vespalib::string tok; // single token + size_t offset; // offset into tokens buffer + size_t seppos; // separator position + + offset = 0; + while ((seppos = toks.find(';', offset)) != vespalib::string::npos) { + if (seppos == offset) { + offset++; // don't 
want empty tokens + } else { + tok = toks.substr(offset, seppos - offset); + offset = seppos + 1; + if (tok[tok.size() - 1] == '*') { + tok.resize(tok.size() - 1); + AddLegalIndexPrefix(tok.c_str()); + } else { + AddLegalIndexName(tok.c_str()); + } + } + } + if (toks.size() > offset) { // catch last token + tok = toks.substr(offset); + if (tok[tok.size() - 1] == '*') { + tok.resize(tok.size() - 1); + AddLegalIndexPrefix(tok.c_str()); + } else { + AddLegalIndexName(tok.c_str()); + } + } +} + + +vespalib::string +KeywordExtractor::GetLegalIndexSpec() +{ + vespalib::string spec; + + if (_legalPrefixes != NULL) { + for (IndexPrefix *pt = _legalPrefixes; + pt != NULL; pt = pt->_next) { + if (spec.size() > 0) + spec.append(';'); + spec.append(pt->_prefix); + spec.append('*'); + } + } + + for (Set::const_iterator it(_legalIndexes.begin()), mt(_legalIndexes.end()); it != mt; it++) { + if (spec.size() > 0) + spec.append(';'); + spec.append(*it); + } + return spec; +} + + +bool +KeywordExtractor::IsLegalIndex(const char *idxName, size_t idxNameLen) const +{ + vespalib::string resolvedIdxName; + vespalib::string idxS(idxName, idxNameLen); + + if (_env != NULL) { + resolvedIdxName = _env->lookupIndex(idxS); + } else { + + if ( ! 
idxS.empty() ) { + resolvedIdxName = idxS; + } else { + resolvedIdxName = "__defaultindex"; + } + } + + if (resolvedIdxName.empty()) + return false; + + return (IsLegalIndexPrefix(resolvedIdxName.c_str()) || + IsLegalIndexName(resolvedIdxName.c_str())); +} + + +char * +KeywordExtractor::ExtractKeywords(const vespalib::stringref &buf) const +{ + const char *str_ptr; + size_t str_len; + search::SimpleQueryStackDumpIterator si(buf); + char keywordstore[4096]; // Initial storage for keywords buffer + search::RawBuf keywords(keywordstore, sizeof(keywordstore)); + + while (si.next()) { + search::ParseItem::ItemCreator creator = si.getCreator(); + switch (si.getType()) { + case search::ParseItem::ITEM_NOT: + /** + * @todo Must consider only the first argument on the stack. + * Difficult without recursion. + */ + break; + + case search::ParseItem::ITEM_PHRASE: + { + // Must take the next arity TERMS and put together + bool phraseterms_was_added = false; + int phraseterms = si.getArity(); + for (int i = 0; i < phraseterms; i++) { + si.next(); + search::ParseItem::ItemType newtype = si.getType(); + if (newtype != search::ParseItem::ITEM_TERM && + newtype != search::ParseItem::ITEM_NUMTERM) + { + // stack syntax error + // LOG(debug, "Extracting keywords found a non-term in a phrase"); + // making a clean escape. 
+ keywords.reset(); + goto iteratorloopend; + } else { + si.getIndexName(&str_ptr, &str_len); + if (!IsLegalIndex(str_ptr, str_len)) + continue; + // Found a term + si.getTerm(&str_ptr, &str_len); + search::ParseItem::ItemCreator term_creator = si.getCreator(); + if (str_len > 0 && useful(term_creator)) { + // Actual term to add + if (phraseterms_was_added) + // Not the first term in the phrase + keywords += " "; + else + phraseterms_was_added = true; + + keywords.append(str_ptr, str_len); + } + } + } + if (phraseterms_was_added) + // Terms was added, so 0-terminate the string + keywords.append("\0", 1); + + break; + } + case search::ParseItem::ITEM_PREFIXTERM: + case search::ParseItem::ITEM_SUBSTRINGTERM: + case search::ParseItem::ITEM_EXACTSTRINGTERM: + case search::ParseItem::ITEM_NUMTERM: + case search::ParseItem::ITEM_TERM: + si.getIndexName(&str_ptr, &str_len); + if (!IsLegalIndex(str_ptr, str_len)) + continue; + // add a new keyword + si.getTerm(&str_ptr, &str_len); + if (str_len > 0 && useful(creator)) { + // An actual string to add + keywords.append(str_ptr, str_len); + keywords.append("\0", 1); + } + break; + + default: + // Do nothing to AND, RANK, OR + break; + } + } + iteratorloopend: + // Add a 'blank' keyword + keywords.append("\0", 1); + + // Must now allocate a string and copy the data from the rawbuf + void *result = malloc(keywords.GetUsedLen()); + if (result != NULL) { + memcpy(result, keywords.GetDrainPos(), keywords.GetUsedLen()); + } + return static_cast<char *>(result); +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.h b/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.h new file mode 100644 index 00000000000..750cfb2cdee --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/keywordextractor.h @@ -0,0 +1,164 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/vespalib/stllike/hash_set.h> +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/searchsummary/docsummary/idocsumenvironment.h> + +namespace search { +namespace docsummary { + +class KeywordExtractor +{ +private: + KeywordExtractor(const KeywordExtractor &); + KeywordExtractor& operator=(const KeywordExtractor &); + +public: + + class IndexPrefix + { + private: + IndexPrefix(const IndexPrefix &); + IndexPrefix& operator=(const IndexPrefix &); + + public: + char *_prefix; + int _prefixLen; + IndexPrefix *_next; + + + IndexPrefix(const char *prefix, + IndexPrefix **list) + : _prefix(NULL), + _prefixLen(0), + _next(NULL) + { + _prefix = strdup(prefix); + assert(_prefix != NULL); + _prefixLen = strlen(prefix); + _next = *list; + *list = this; + } + + ~IndexPrefix() + { + free(_prefix); + } + + bool Match(const char *idxName) const + { + return (strncmp(idxName, _prefix, _prefixLen) == 0); + } + }; + +private: + typedef vespalib::hash_set<vespalib::string> Set; + IDocsumEnvironment *_env; + IndexPrefix *_legalPrefixes; + Set _legalIndexes; + + + bool IsLegalIndexPrefix(const char *idxName) const + { + for (const IndexPrefix *pt = _legalPrefixes; + pt != NULL; + pt = pt->_next) + { + if (pt->Match(idxName)) + return true; + } + return false; + } + + bool IsLegalIndexName(const char *idxName) const + { + return _legalIndexes.find(idxName) != _legalIndexes.end(); + } + +public: + explicit KeywordExtractor(IDocsumEnvironment * env); + ~KeywordExtractor(); + + + /** + * Add a prefix to the set of legal index name prefixes. + * + * @param prefix the index name prefix to add. + **/ + void AddLegalIndexPrefix(const char *prefix) + { + //Self destructing construction + new IndexPrefix(prefix, &_legalPrefixes); + } + + + /** + * Add a name to the set of legal index names. + * + * @param idxName the index name to add. 
+ **/ + void AddLegalIndexName(const char *idxName) + { + _legalIndexes.insert(idxName); + } + + + /** + * Parse the input string as a ';' separated list of index names and + * index name prefixes. A '*' following a token in the list denotes + * that the token is an index name prefix. Add the index names and + * index name prefixes to the set of legal values. + * + * @param spec list of legal index names and prefixes. + **/ + void AddLegalIndexSpec(const char *spec); + + + /** + * Create a spec on the same format as accepted by the @ref + * AddLegalIndexSpec method. Freeing the returned spec is the + * responsibility of the caller of this method. + * + * @return spec defining legal index names and prefixes. + **/ + vespalib::string GetLegalIndexSpec(); + + + /** + * Determine wether the given index name is legal by checking it + * against the current set of legal index names and index name + * prefixes held by this object. + * + * @return true if the given index name is legal. + **/ + bool IsLegalIndex(const char *idxName, size_t idxNameLen) const; + + + /** + * Extract keywords from a stack dump of a SimpleQueryStack. + * + * The words are extracted as follows: For AND and OR operators, all + * TERM items occuring in a legal index (the set of legal indexes is + * defined by invoking the @ref AddLegalIndex and @ref + * AddLegalIndexPrefix methods) are extracted. + * + * For PHRASE operators, the TERMS in a phrase are put together with + * space between them. + * + * @todo For NOT operators, only the first operand is considered. + * + * @param buf Pointer to buffer with simple query stack dump. + * @param bufLen Length of stack dump buffer + * @return Pointer to a buffer containing zero-terminated keywords, + * with an empty word at the end. 
+ */ + char *ExtractKeywords(const vespalib::stringref &buf) const; +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.cpp new file mode 100644 index 00000000000..6c47b305f09 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.cpp @@ -0,0 +1,337 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include "positionsdfw.h" +#include <vespa/log/log.h> + +LOG_SETUP(".searchlib.docsummary.positionsdfw"); + +namespace search { +namespace docsummary { + +using search::attribute::IAttributeContext; +using search::attribute::IAttributeVector; +using search::attribute::BasicType; +using search::common::Location; + +AbsDistanceDFW::AbsDistanceDFW(const vespalib::string & attrName) : + AttrDFW(attrName) +{ +} + +uint64_t +AbsDistanceDFW::findMinDistance(uint32_t docid, + GetDocsumsState *state) +{ + search::common::Location &location = *state->_parsedLocation; + const IAttributeVector & attribute(vec(*state)); + + uint64_t absdist = std::numeric_limits<int64_t>::max(); + int32_t docx = 0; + int32_t docy = 0; + std::vector<IAttributeVector::largeint_t> pos(16); + uint32_t numValues = attribute.get(docid, &pos[0], pos.size()); + if (numValues > pos.size()) { + pos.resize(numValues); + numValues = attribute.get(docid, &pos[0], pos.size()); + assert(numValues <= pos.size()); + } + for (uint32_t i = 0; i < numValues; i++) { + int64_t docxy(pos[i]); + vespalib::geo::ZCurve::decode(docxy, &docx, &docy); + uint32_t dx; + if (location.getX() > docx) { + dx = location.getX() - docx; + } else { + dx = docx - location.getX(); + } + if (location.getXAspect() != 0) { + dx = ((uint64_t) dx * location.getXAspect()) >> 32; + } + uint32_t dy; + if (location.getY() > docy) { + dy = location.getY() - docy; + } else { + dy = docy - location.getY(); + } + 
uint64_t dist2 = dx * (uint64_t) dx + + dy * (uint64_t) dy; + if (dist2 < absdist) { + absdist = dist2; + } + } + return (uint64_t) sqrt((double) absdist); +} + +void +AbsDistanceDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target) +{ + bool forceEmpty = true; + + const vespalib::string &locationStr = state->_args.getLocation(); + if (locationStr.size() > 0) { + if (state->_parsedLocation.get() == NULL) { + state->_callback.ParseLocation(state); + } + assert(state->_parsedLocation.get() != NULL); + if (state->_parsedLocation->getParseError() == NULL) { + forceEmpty = false; + } + } + if (forceEmpty) return; + + uint64_t absdist = findMinDistance(docid, state); + + if (type == RES_INT) { + target.insertLong(absdist); + } else { + vespalib::string value = vespalib::stringify(absdist); + vespalib::slime::Memory data(value.c_str(), value.size()); + + if (type == RES_STRING || + type == RES_LONG_STRING || + type == RES_XMLSTRING) + { + target.insertString(data); + } + if (type == RES_LONG_DATA || + type == RES_DATA) + { + target.insertData(data); + } + } +} + + +uint32_t +AbsDistanceDFW::WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target) +{ + (void) gres; + + bool forceEmpty = true; + + const vespalib::string &locationStr = state->_args.getLocation(); + if (locationStr.size() > 0) { + if (state->_parsedLocation.get() == NULL) { + state->_callback.ParseLocation(state); + } + assert(state->_parsedLocation.get() != NULL); + if (state->_parsedLocation->getParseError() == NULL) { + forceEmpty = false; + } + } + + uint32_t written = 0; + if (!forceEmpty) { + uint64_t absdist = findMinDistance(docid, state); + + if (type != RES_INT) { + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + uint16_t str_len_16 = 0; + uint32_t str_len_32 = 0; + int str_len_ofs = target->GetUsedLen(); + + if (isLong) + target->append(&str_len_32, 
sizeof(str_len_32)); + else + target->append(&str_len_16, sizeof(str_len_16)); + + target->addNum64(absdist, 1, ' '); + + // calculate number of bytes written + written = target->GetUsedLen() - str_len_ofs; + + // patch in correct field length + if (isLong) { + str_len_32 = written - sizeof(str_len_32); + memcpy(target->GetWritableDrainPos(str_len_ofs), &str_len_32, + sizeof(str_len_32)); + } else { + str_len_16 = written - sizeof(str_len_16); + memcpy(target->GetWritableDrainPos(str_len_ofs), &str_len_16, + sizeof(str_len_16)); + } + } else { + uint32_t val32 = (uint32_t) absdist; + target->append(&val32, sizeof(val32)); + written = sizeof(val32); + } + } else { + if (type != RES_INT) { + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + uint16_t str_len_16 = 0; + uint32_t str_len_32 = 0; + int str_len_ofs = target->GetUsedLen(); + + if (isLong) + target->append(&str_len_32, sizeof(str_len_32)); + else + target->append(&str_len_16, sizeof(str_len_16)); + + // calculate number of bytes written + written = target->GetUsedLen() - str_len_ofs; + } else { + uint32_t val32 = 0u; + target->append(&val32, sizeof(val32)); + written = sizeof(val32); + } + } + return written; +} + +//-------------------------------------------------------------------------- + +PositionsDFW::PositionsDFW(const vespalib::string & attrName) : + AttrDFW(attrName) +{ +} + +vespalib::asciistream +PositionsDFW::formatField(const attribute::IAttributeVector & attribute, uint32_t docid, ResType type) +{ + vespalib::asciistream target; + int32_t docx = 0; + int32_t docy = 0; + + std::vector<IAttributeVector::largeint_t> pos(16); + uint32_t numValues = attribute.get(docid, &pos[0], pos.size()); + if (numValues > pos.size()) { + pos.resize(numValues); + numValues = attribute.get(docid, &pos[0], pos.size()); + assert(numValues <= pos.size()); + } + LOG(debug, "docid=%d, numValues=%d", docid, numValues); + + bool isShort = ! 
IsBinaryCompatible(type, RES_LONG_STRING); + for (uint32_t i = 0; i < numValues; i++) { + int64_t docxy(pos[i]); + vespalib::geo::ZCurve::decode(docxy, &docx, &docy); + if (docx == 0 && docy == INT_MIN) { + LOG(spam, "skipping empty zcurve value"); + continue; + } + double degrees_ns = docy; degrees_ns /= 1000000.0; + double degrees_ew = docx; degrees_ew /= 1000000.0; + + target << "<position x=\"" << docx << "\" y=\"" << docy << "\""; + target << " latlong=\""; + target << vespalib::FloatSpec::fixed; + if (degrees_ns < 0) { + target << "S" << (-degrees_ns); + } else { + target << "N" << degrees_ns; + } + target << ";"; + if (degrees_ew < 0) { + target << "W" << (-degrees_ew); + } else { + target << "E" << degrees_ew; + } + target << "\" />"; + if (isShort && target.size() > 30000) { + target << "<overflow />"; + break; + } + } + return target; +} + + +uint32_t +PositionsDFW::WriteField(uint32_t docid, + GeneralResult *, + GetDocsumsState * dsState, + ResType type, + search::RawBuf *target) +{ + int str_len_ofs = target->GetUsedLen(); + + vespalib::asciistream val(formatField(vec(*dsState), docid, type)); + + bool isLong = IsBinaryCompatible(type, RES_LONG_STRING); + if (isLong) { + uint32_t str_len_32 = val.size(); + target->append(&str_len_32, sizeof(str_len_32)); + target->append(val.c_str(), str_len_32); + } else { + uint16_t str_len_16 = val.size(); + target->append(&str_len_16, sizeof(str_len_16)); + target->append(val.c_str(), str_len_16); + } + // calculate number of bytes written + uint32_t written = target->GetUsedLen() - str_len_ofs; + return written; +} + + +void +PositionsDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState * dsState, + ResType type, + vespalib::slime::Inserter &target) +{ + vespalib::asciistream val(formatField(vec(*dsState), docid, type)); + target.insertString(vespalib::slime::Memory(val.c_str(), val.size())); +} + +//-------------------------------------------------------------------------- + +PositionsDFW::UP 
createPositionsDFW(const char *attribute_name, + IAttributeManager *attribute_manager) +{ + PositionsDFW::UP ret; + if (attribute_manager != NULL) { + if (!attribute_name) { + LOG(debug, "createPositionsDFW: missing attribute name '%p'", attribute_name); + return ret; + } + IAttributeContext::UP context = attribute_manager->createContext(); + if (!context.get()) { + LOG(debug, "createPositionsDFW: could not create context from attribute manager"); + return ret; + } + const IAttributeVector *attribute = context->getAttribute(attribute_name); + if (!attribute) { + LOG(debug, "createPositionsDFW: could not get attribute '%s' from context", attribute_name); + return ret; + } + } + ret.reset(new PositionsDFW(attribute_name)); + return ret; +} + +AbsDistanceDFW::UP createAbsDistanceDFW(const char *attribute_name, + IAttributeManager *attribute_manager) +{ + AbsDistanceDFW::UP ret; + if (attribute_manager != NULL) { + if (!attribute_name) { + LOG(debug, "createAbsDistanceDFW: missing attribute name '%p'", attribute_name); + return ret; + } + IAttributeContext::UP context = attribute_manager->createContext(); + if (!context.get()) { + LOG(debug, "createAbsDistanceDFW: could not create context from attribute manager"); + return ret; + } + const IAttributeVector *attribute = context->getAttribute(attribute_name); + if (!attribute) { + LOG(debug, "createAbsDistanceDFW: could not get attribute '%s' from context", attribute_name); + return ret; + } + } + ret.reset(new AbsDistanceDFW(attribute_name)); + return ret; +} + +} // namespace docsummary +} // namespace search diff --git a/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.h b/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.h new file mode 100644 index 00000000000..301bf2ceecc --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/positionsdfw.h @@ -0,0 +1,63 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchsummary/docsummary/attributedfw.h> + +namespace search { +namespace docsummary { + +class AbsDistanceDFW : public AttrDFW +{ +private: + uint64_t findMinDistance(uint32_t docid, GetDocsumsState *state); +public: + AbsDistanceDFW(const vespalib::string & attrName); + + virtual bool IsGenerated() const { return true; } + virtual uint32_t WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); +}; + +//-------------------------------------------------------------------------- + +class PositionsDFW : public AttrDFW +{ +private: + vespalib::asciistream formatField(const attribute::IAttributeVector & v, uint32_t docid, ResType type); + +public: + typedef std::unique_ptr<PositionsDFW> UP; + + PositionsDFW(const vespalib::string & attrName); + + virtual bool IsGenerated() const { return true; } + virtual uint32_t WriteField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + search::RawBuf *target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); +}; + +PositionsDFW::UP createPositionsDFW(const char *attribute_name, + IAttributeManager *index_man); + +AbsDistanceDFW::UP createAbsDistanceDFW(const char *attribute_name, + IAttributeManager *index_man); + +} // namespace docsummary +} // namespace search + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.cpp new file mode 100644 index 00000000000..8f5055f6d1d --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.cpp @@ -0,0 +1,113 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchlib/common/featureset.h> +#include <vespa/searchlib/common/packets.h> +#include <vespa/searchsummary/docsummary/rankfeaturesdfw.h> +#include <vespa/searchlib/common/feature.h> +#include "docsumformat.h" + +LOG_SETUP(".searchlib.docsummary.rankfeaturesdfw"); + +namespace search { +namespace docsummary { + +RankFeaturesDFW::RankFeaturesDFW() : + _env(NULL) +{ +} + +RankFeaturesDFW::~RankFeaturesDFW() +{ +} + +void +RankFeaturesDFW::init(IDocsumEnvironment * env) +{ + _env = env; +} + +uint32_t +RankFeaturesDFW::WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target) +{ + (void) gres; + + if (state->_rankFeatures.get() == NULL) { + state->_callback.FillRankFeatures(state, _env); + if (state->_rankFeatures.get() == NULL) { // still no rank features to write + return DocsumFormat::addEmpty(type, *target); + } + } + + uint32_t written = 0; + + const FeatureSet::StringVector & names = state->_rankFeatures->getNames(); + const feature_t * values = state->_rankFeatures->getFeaturesByDocId(docid); + vespalib::JSONStringer & json(state->_jsonStringer); + if (values != NULL) { + json.clear(); + json.beginObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + featureDump(json, names[i], values[i]); + } + json.endObject(); + written += SummaryFeaturesDFW::writeString(json.toString(), type, target); + json.clear(); + } else { + written += DocsumFormat::addEmpty(type, *target); + } + + return written; +} + + +void +RankFeaturesDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target) +{ + if (state->_rankFeatures.get() == NULL) { + state->_callback.FillRankFeatures(state, _env); + if (state->_rankFeatures.get() == NULL) { // still no rank features to write + return; + } + } + const FeatureSet::StringVector & names = 
state->_rankFeatures->getNames(); + const feature_t * values = state->_rankFeatures->getFeaturesByDocId(docid); + if (type == RES_FEATUREDATA && values != NULL) { + vespalib::slime::Cursor& obj = target.insertObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + vespalib::slime::Memory name(names[i].c_str(), names[i].size()); + obj.setDouble(name, values[i]); + } + return; + } + vespalib::JSONStringer & json(state->_jsonStringer); + if (values != NULL) { + json.clear(); + json.beginObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + featureDump(json, names[i], values[i]); + } + json.endObject(); + vespalib::slime::Memory value(json.toString().c_str(), + json.toString().size()); + if (type == RES_STRING || type == RES_LONG_STRING) { + target.insertString(value); + } + if (type == RES_DATA || type == RES_LONG_DATA) { + target.insertData(value); + } + json.clear(); + } +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.h b/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.h new file mode 100644 index 00000000000..a04271a16c1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/rankfeaturesdfw.h @@ -0,0 +1,39 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <map> +#include <string> +#include <vespa/searchsummary/docsummary/summaryfeaturesdfw.h> + +namespace search { +namespace docsummary { + +class RankFeaturesDFW : public FeaturesDFW +{ +private: + RankFeaturesDFW(const RankFeaturesDFW &); + RankFeaturesDFW & operator=(const RankFeaturesDFW &); + + IDocsumEnvironment * _env; + +public: + RankFeaturesDFW(); + virtual ~RankFeaturesDFW(); + void init(IDocsumEnvironment * env); + virtual bool IsGenerated() const { return true; } + virtual uint32_t WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultclass.cpp b/searchsummary/src/vespa/searchsummary/docsummary/resultclass.cpp new file mode 100644 index 00000000000..19d8baccf45 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultclass.cpp @@ -0,0 +1,114 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/fnet/frt/frt.h> +#include <vespa/searchsummary/docsummary/resultclass.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> + +#include <zlib.h> + +LOG_SETUP(".searchlib.docsummary.resultclass"); + +namespace search { +namespace docsummary { + +ResultClass::ResultClass(const char *name, uint32_t id, util::StringEnum & fieldEnum) + : _name(name), + _classID(id), + _entries(), + _nameMap(), + _fieldEnum(fieldEnum), + _enumMap(), + _dynInfo(NULL) +{ +} + + +ResultClass::~ResultClass() +{ +} + + +bool +ResultClass::AddConfigEntry(const char *name, ResType type) +{ + if (_nameMap.find(name) != _nameMap.end()) + return false; + + _nameMap[name] = _entries.size(); + ResConfigEntry e; + e._type = type; + e._bindname = name; + e._enumValue = _fieldEnum.Add(name); + assert(e._enumValue >= 0); + _entries.push_back(e); + return true; +} + + +void +ResultClass::CreateEnumMap() +{ + _enumMap.resize(_fieldEnum.GetNumEntries()); + + for (uint32_t i(0), m(_enumMap.size()); i < m; i++) { + _enumMap[i] = -1; + } + for (uint32_t i(0); i < _entries.size(); i++) { + _enumMap[_entries[i]._enumValue] = i; + } +} + + +bool +ResEntry::_extract_field(search::RawBuf *target) const +{ + bool rc = true; + target->reset(); + + if (ResultConfig::IsVariableSize(_type)) { + if (_is_compressed()) { // COMPRESSED + + uint32_t len = _get_length(); + uint32_t realLen = 0; + + if (len >= sizeof(uint32_t)) + realLen = _get_real_length(); + else + rc = false; + + if (realLen > 0) { + uLongf rlen = realLen; + char *fillPos = target->GetWritableFillPos(realLen + 1 < 32000 ? 
+ 32000 : realLen + 1); + if ((uncompress((Bytef *)fillPos, &rlen, + (const Bytef *)(_get_compressed()), + len - sizeof(realLen)) == Z_OK) && + rlen == realLen) { + fillPos[realLen] = '\0'; + target->Fill(realLen); + } else { + rc = false; + } + } + } else { // UNCOMPRESSED + uint32_t len = _len; + if (len + 1 < 32000) + target->preAlloc(32000); + else + target->preAlloc(len + 1); + char *fillPos = target->GetWritableFillPos(len + 1 < 32000 ? + 32000 : len + 1); + memcpy(fillPos, _pt, len); + fillPos[len] = '\0'; + target->Fill(len); + } + } + return rc; +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultclass.h b/searchsummary/src/vespa/searchsummary/docsummary/resultclass.h new file mode 100644 index 00000000000..e35408a796c --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultclass.h @@ -0,0 +1,291 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/vespalib/stllike/string.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <vespa/searchlib/util/stringenum.h> + +namespace search { +namespace docsummary { + +/** + * This enumeration contains values denoting the different types of + * docsum fields. NOTE: The internal implementation depends on RES_INT + * having the value 0. All types < RES_STRING must be fixed size and + * all types >= RES_STRING must be variable size. + **/ +enum ResType { + RES_INT = 0, + RES_SHORT, + RES_BYTE, + RES_FLOAT, + RES_DOUBLE, + RES_INT64, + RES_STRING, + RES_DATA, + RES_LONG_STRING, + RES_LONG_DATA, + RES_XMLSTRING, + RES_JSONSTRING, + RES_FEATUREDATA +}; + + +/** + * This struct describes a single docsum field (name and type).
A + * docsum blob is unpacked into an array of ResEntry instances + * by interpreting it as described by an array of ResConfigEntry + * instances. + **/ +struct ResConfigEntry { + ResType _type; + vespalib::string _bindname; + int _enumValue; +}; + + +/** + * This struct holds the actual value of a single docsum field. A + * docsum blob is unpacked into an array of ResEntry instances + * by interpreting it as described by an array of ResConfigEntry + * instances. Note that type normalization is performed when unpacking + * docsum fields. Fields of type RES_BYTE and RES_SHORT are promoted + * to RES_INT. Fields of type RES_FLOAT are promoted to RES_DOUBLE. + **/ +struct ResEntry +{ + ResType _type; + union { + uint32_t _intval; + uint32_t _stringlen; + uint32_t _datalen; + uint32_t _len; + uint64_t _int64val; + double _doubleval; + }; + union { + char *_stringval; + char *_dataval; + void *_pt; + }; + + bool _extract_field(search::RawBuf *target) const; + + uint32_t _get_length() const { return (_len & 0x7fffffff); } + bool _is_compressed() const { return (_len & 0x80000000) != 0; } + uint32_t _get_real_length() const + { + // precond: IsVariableSize(_type) && _len >= sizeof(uint32_t) + + uint32_t rlen; + memcpy(&rlen, _pt, sizeof(rlen)); + return rlen; + } + const void *_get_compressed() const + { + // precond: IsVariableSize(_type) && _len >= sizeof(uint32_t) + + return (const void *)(((const char *) _pt) + sizeof(uint32_t)); + } + void _resolve_field(const char **buf, uint32_t *buflen, + search::RawBuf *target) const + { + // precond: IsVariableSize(_type) + + if (_is_compressed()) { + if (_extract_field(target)) { + *buf = target->GetDrainPos(); + *buflen = target->GetUsedLen(); + } else { + *buf = NULL; + *buflen = 0; + } + } else { + *buf = (char *) _pt; + *buflen = _len; + } + } +}; + +/** + * This class represents a specific docsum format (docsum class). 
It + * contains an array of ResConfigEntry instances (config + * entries) that may be used to unpack docsum blobs into + * ResEntry arrays. It also contains methods for mapping both + * field name and field name enum value into field index. The field + * index may then be used to access the actual field in the + * GeneralResult object representing the unpacked docsum blob. + **/ +class ResultClass +{ +public: + struct DynamicInfo + { + uint32_t _overrideCnt; // # fields overridden + uint32_t _generateCnt; // # fields generated + }; + +private: + ResultClass(const ResultClass &); + ResultClass& operator=(const ResultClass &); + typedef vespalib::hash_map<vespalib::string, int> NameIdMap; + typedef std::vector<ResConfigEntry> Configs; + + vespalib::string _name; // name of this class + uint32_t _classID; // ID of this class + Configs _entries; // config entries for this result class + NameIdMap _nameMap; // fieldname -> entry index + util::StringEnum &_fieldEnum; // fieldname -> f.n. enum value [SHARED] + std::vector<int> _enumMap; // fieldname enum value -> entry index + DynamicInfo *_dynInfo; // fields overridden and generated + +public: + typedef std::unique_ptr<ResultClass> UP; + + /** + * Constructor. Assign name and id to this result class. Also gain + * ref. to shared string enum object and insert into linked list. + * + * @param name the name of this result class. + * @param id the numeric id of this result class. + * @param fieldEnum shared object used to enumerate field names. + **/ + ResultClass(const char *name, uint32_t id, util::StringEnum & fieldEnum); + + /** + * Destructor. Delete internal structures. + **/ + ~ResultClass(); + + + /** + * Attach dynamic field data to this result class. + * + * @param data pointer to dynamic field data. + **/ + void setDynamicInfo(DynamicInfo *data) { _dynInfo = data; } + + + /** + * Obtain pointer to dynamic field data attached to this result class. + * + * @return pointer to dynamic field data. 
+ **/ + DynamicInfo *getDynamicInfo() const { return _dynInfo; } + + + /** + * Obtain the name of this result class. + * + * @return name of this result class. + **/ + const char *GetClassName() const { return _name.c_str(); } + + + /** + * Obtain the numeric id of this result class. + * + * @return numeric id of this result class. + **/ + uint32_t GetClassID() const { return _classID; } + + + /** + * Obtain the number of config entries (size of the + * ResConfigEntry array) held by this result class. + * + * @return number of config entries held by this object. + **/ + uint32_t GetNumEntries() const { return _entries.size(); } + + + /** + * Add a config entry to this result class. Each config entry + * contains the name and type of a field present in the docsum blobs + * conforming to this result class. This method will fail if the + * field name given already has been used to name a field in this + * result class. + * + * @return true(success)/false(fail) + * @param name the name of the field to add. + * @param type the type of the field to add. + **/ + bool AddConfigEntry(const char *name, ResType type); + + + /** + * This method may be called to create an internal mapping from + * field name enumerated value to field index. When building up a + * result configuration possibly containing several result classes, + * all field names are enumerated (across all result classes), + * assigning a single unique integer value to each field name. This + * is done with the StringEnum object given to the + * constructor. This way, fastserver components that want to + * reference a unique field name may use the enumerated value + * instead of the string itself. NOTE: This method must be called in + * order to use the GetIndexFromEnumValue method. NOTE2: This method + * is called by the ResultConfig::CreateEnumMaps method; no + * need to call it directly. + **/ + void CreateEnumMap(); + + + /** + * Obtain the field index from the field name. 
The field index may + * be used to look up a config entry in this object, or to look up a + * result entry in a GeneralResult object. NOTE: When using + * the return value from this method to look up a result entry in a + * GeneralResult object, make sure that the + * GeneralResult object has this object as its result + * class. NOTE2: This method is called by the + * GeneralResult::GetEntry(string) method; no need to call it + * directly. + * + * @return field index or -1 if not found. + **/ + int GetIndexFromName(const char* name) const + { + NameIdMap::const_iterator found(_nameMap.find(name)); + return (found != _nameMap.end()) ? found->second : -1; + } + + + /** + * Obtain the field index from the field name enumerated value. The + * field index may be used to look up a config entry in this object, + * or to look up a result entry in a GeneralResult + * object. NOTE: When using the return value from this method to + * look up a result entry in a GeneralResult object, make sure + * that the GeneralResult object has this object as its + * result class. NOTE2: This method is called by the + * GeneralResult::GetEntryFromEnumValue method; no need to + * call it directly. NOTE3: You need to call the CreateEnumMap + * method before calling this one. + * + * @return field index or -1 if not found. + **/ + int GetIndexFromEnumValue(uint32_t value) const + { + return (value < _enumMap.size()) ? _enumMap[value] : -1; + } + + + /** + * Obtain config entry by field index. + * + * @return config entry or NULL if not found. + **/ + const ResConfigEntry *GetEntry(uint32_t offset) const + { + return (offset < _entries.size()) ?
&_entries[offset] : NULL; + } +}; + + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.cpp b/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.cpp new file mode 100644 index 00000000000..de635d14854 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.cpp @@ -0,0 +1,246 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> +#include <vespa/searchsummary/docsummary/urlresult.h> +#include <vespa/vespalib/util/exceptions.h> + +LOG_SETUP(".searchlib.docsummary.resultconfig"); + +namespace search { +namespace docsummary { + +void +ResultConfig::Clean() +{ + _classLookup.clear(); + _nameLookup.clear(); +} + + +void +ResultConfig::Init() +{ +} + + +ResultConfig::ResultConfig() + : _defaultSummaryId(-1), + _classLookup(), + _nameLookup() +{ + Init(); +} + + +ResultConfig::~ResultConfig() +{ + Clean(); +} + + +const char * +ResultConfig::GetResTypeName(ResType type) +{ + switch (type) { + case RES_INT: return "integer"; + case RES_SHORT: return "short"; + case RES_BYTE: return "byte"; + case RES_FLOAT: return "float"; + case RES_DOUBLE: return "double"; + case RES_INT64: return "int64"; + case RES_STRING: return "string"; + case RES_DATA: return "data"; + case RES_LONG_STRING: return "longstring"; + case RES_LONG_DATA: return "longdata"; + case RES_XMLSTRING: return "xmlstring"; + case RES_JSONSTRING: return "jsonstring"; + case RES_FEATUREDATA: return "featuredata"; + } + return "unknown-type"; +} + +void +ResultConfig::Reset() +{ + if (! 
_classLookup.empty() || _fieldEnum.GetNumEntries() > 0) { + Clean(); + Init(); + } +} + + +ResultClass * +ResultConfig::AddResultClass(const char *name, uint32_t id) +{ + ResultClass *ret = NULL; + + if (id != NoClassID() && (_classLookup.find(id) == _classLookup.end())) { + ResultClass::UP rc(new ResultClass(name, id, _fieldEnum)); + ret = rc.get(); + _classLookup[id] = std::move(rc); + if (_nameLookup.find(name) != _nameLookup.end()) { + LOG(warning, "Duplicate result class name: %s " + "(now maps to class id %u)", name, id); + } + _nameLookup[name] = id; + } + return ret; +} + + +const ResultClass* +ResultConfig::LookupResultClass(uint32_t id) const +{ + IdMap::const_iterator it(_classLookup.find(id)); + return (it != _classLookup.end()) ? it->second.get() : NULL; +} + +uint32_t +ResultConfig::LookupResultClassId(const vespalib::string &name, uint32_t def) const +{ + NameMap::const_iterator found(_nameLookup.find(name)); + return (found != _nameLookup.end()) ? found->second : def; +} + +uint32_t +ResultConfig::LookupResultClassId(const vespalib::string &name) const +{ + return LookupResultClassId(name, (name.empty() || (name == "default")) ? 
_defaultSummaryId : NoClassID()); +} + + +void +ResultConfig::CreateEnumMaps() +{ + for (IdMap::iterator it(_classLookup.begin()), mt(_classLookup.end()); it != mt; it++) { + it ->second->CreateEnumMap(); + } +} + + +bool +ResultConfig::ReadConfig(const vespa::config::search::SummaryConfig &cfg, const char *configId) +{ + bool rc = true; + Reset(); + int maxclassID = 0x7fffffff; // avoid negative classids + _defaultSummaryId = cfg.defaultsummaryid; + for (uint32_t i = 0; rc && i < cfg.classes.size(); i++) { + if (cfg.classes[i].name.empty()) { + LOG(warning, "%s classes[%d]: empty name", configId, i); + } + int classID = cfg.classes[i].id; + if (classID < 0 || classID > maxclassID) { + LOG(error, "%s classes[%d]: bad id %d", configId, i, classID); + rc = false; + break; + } + ResultClass *resClass = AddResultClass(cfg.classes[i].name.c_str(), classID); + if (resClass == NULL) { + LOG(error, + "%s: unable to add classes[%d] name %s", + configId, i, cfg.classes[i].name.c_str()); + rc = false; + break; + } + for (unsigned int j = 0; rc && j < cfg.classes[i].fields.size(); j++) { + const char *fieldtype = cfg.classes[i].fields[j].type.c_str(); + const char *fieldname = cfg.classes[i].fields[j].name.c_str(); + LOG(debug, "Reconfiguring class '%s' field '%s' of type '%s'", cfg.classes[i].name.c_str(), fieldname, fieldtype); + if (strcmp(fieldtype, "integer") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_INT); + } else if (strcmp(fieldtype, "short") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_SHORT); + } else if (strcmp(fieldtype, "byte") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_BYTE); + } else if (strcmp(fieldtype, "float") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_FLOAT); + } else if (strcmp(fieldtype, "double") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_DOUBLE); + } else if (strcmp(fieldtype, "int64") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_INT64); + } else if (strcmp(fieldtype, "string") == 0) { 
+ rc = resClass->AddConfigEntry(fieldname, RES_STRING); + } else if (strcmp(fieldtype, "data") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_DATA); + } else if (strcmp(fieldtype, "longstring") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_LONG_STRING); + } else if (strcmp(fieldtype, "longdata") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_LONG_DATA); + } else if (strcmp(fieldtype, "xmlstring") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_XMLSTRING); + } else if (strcmp(fieldtype, "jsonstring") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_JSONSTRING); + } else if (strcmp(fieldtype, "featuredata") == 0) { + rc = resClass->AddConfigEntry(fieldname, RES_FEATUREDATA); + } else { // FAIL: unknown field type + LOG(error, + "%s %s.fields[%d]: unknown type '%s'", + configId, cfg.classes[i].name.c_str(), j, fieldtype); + rc = false; + break; + } + if (!rc) { // FAIL: duplicate field name + LOG(error, + "%s %s.fields[%d]: duplicate name '%s'", + configId, cfg.classes[i].name.c_str(), j, fieldname); + break; + } + } + } + if (rc) { + CreateEnumMaps(); // create mappings needed by TVM + } else { + Reset(); // FAIL, discard all config + } + return rc; +} + +uint32_t +ResultConfig::GetClassID(const char *buf, uint32_t buflen) +{ + uint32_t ret = NoClassID(); + uint32_t tmp32; + + if (buflen >= sizeof(tmp32)) { + memcpy(&tmp32, buf, sizeof(tmp32)); + ret = tmp32; + } + return ret; +} + +urlresult* +ResultConfig::Unpack(uint32_t partition, + uint32_t docid, + HitRank metric, + const char *buf, + uint32_t buflen) const +{ + urlresult *ret = NULL; + const ResultClass *resClass = NULL; + uint32_t tmp32; + + if (buflen >= sizeof(tmp32)) { + memcpy(&tmp32, buf, sizeof(tmp32)); + buf += sizeof(tmp32); + buflen -= sizeof(tmp32); + resClass = LookupResultClass(tmp32); + } + + if (resClass != NULL && (buflen > 0)) { + ret = new GeneralResult(resClass, partition, docid, metric); + if (ret->unpack(buf, buflen) != 0) { // FAIL: unpack + delete 
ret; + ret = NULL; + } + } + + return (ret != NULL) ? ret : new badurlresult(partition, docid, metric); +} + +} // namespace docsummary +} // namespace search diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.h b/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.h new file mode 100644 index 00000000000..ed01dcdf6b3 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultconfig.h @@ -0,0 +1,301 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/config-summary.h> +#include <vespa/fnet/frt/frt.h> +#include <vespa/searchlib/util/rawbuf.h> +#include <vespa/searchlib/util/stringenum.h> +#include <vespa/searchsummary/docsummary/resultclass.h> +#include <vespa/searchsummary/docsummary/urlresult.h> + +namespace search { +namespace docsummary { + +/** + * This class represents the overall result configuration. A result + * configuration may contain multiple result classes, where each + * result class represents a specific docsum blob format. The first n + * (32) bits in the docsum blob defines the id of a result + * class. The rest of the data contained in the docsum blob is then + * defined by the sequence of config entries held by the result class + * with the given id. Unpacking of docsum blobs is performed by first + * extracting the result class id and then using the appropriate + * result class to unpack the rest of the docsum fields. The + * extraction of the class id is done by the Unpack method in this + * class, while the unpacking of the docsum fields is done by a + * GeneralResult object backed by a ResultClass object. 
+ **/ +class ResultConfig +{ +private: + ResultConfig(const ResultConfig &); + ResultConfig& operator=(const ResultConfig &); + + typedef vespalib::hash_map<vespalib::string, uint32_t> NameMap; + typedef vespalib::hash_map<uint32_t, ResultClass::UP> IdMap; + uint32_t _defaultSummaryId; + search::util::StringEnum _fieldEnum; + IdMap _classLookup; + NameMap _nameLookup; // name -> class id + + void Clean(); + void Init(); + +public: + class iterator { + public: + iterator(IdMap::iterator it) : _it(it) { } + iterator operator ++(int) { iterator tmp(_it); ++_it; return tmp; } + iterator & operator ++() { ++_it; return *this; } + bool operator == (const iterator & b) const { return _it == b._it; } + bool operator != (const iterator & b) const { return _it != b._it; } + ResultClass & operator *() { return *_it->second; } + ResultClass * operator ->() { return _it->second.get(); } + private: + IdMap::iterator _it; + }; + + class const_iterator { + public: + const_iterator(IdMap::const_iterator it) : _it(it) { } + const_iterator operator ++(int) { const_iterator tmp(_it); ++_it; return tmp; } + const_iterator & operator ++() { ++_it; return *this; } + bool operator == (const const_iterator & b) const { return _it == b._it; } + bool operator != (const const_iterator & b) const { return _it != b._it; } + const ResultClass & operator *() const { return *_it->second; } + const ResultClass * operator ->() const { return _it->second.get(); } + private: + IdMap::const_iterator _it; + }; + + iterator begin() { return iterator(_classLookup.begin()); } + iterator end() { return iterator(_classLookup.end()); } + const_iterator begin() const { return const_iterator(_classLookup.begin()); } + const_iterator end() const { return const_iterator(_classLookup.end()); } + + /** + * Constructor. Create an initially empty result configuration. + * NOTE: This method simply calls the Init method. + **/ + ResultConfig(); + + /** + * Destructor. Delete all internal structures. 
NOTE: This method + * simply calls the Clean method. + **/ + ~ResultConfig(); + + + /** + * @return value denoting an undefined class id. + **/ + static uint32_t NoClassID() { return static_cast<uint32_t>(-1); } + + + /** + * Determine if a result field type is of variable size. + * + * @return true for variable size field types, false for fixed + * size field types + **/ + static bool IsVariableSize(ResType t) { return (t >= RES_STRING); } + + + /** + * Determine if a pair of result field types are binary + * compatible. A pair of types are binary compatible if the packed + * representation is identical. + * + * @return true if the given types are binary compatible. + * @param a enum value of a result field type. + * @param b enum value of a result field type. + **/ + static bool IsBinaryCompatible(ResType a, ResType b) + { + if (a == b) { + return true; + } + switch (a) { + case RES_STRING: + case RES_DATA: + return (b == RES_STRING || b == RES_DATA); + case RES_LONG_STRING: + case RES_LONG_DATA: + case RES_XMLSTRING: + case RES_FEATUREDATA: + case RES_JSONSTRING: + return (b == RES_LONG_STRING || b == RES_LONG_DATA || + b == RES_XMLSTRING || b == RES_FEATUREDATA || b == RES_JSONSTRING); + default: + return false; + } + return false; + } + + + /** + * Determine if a pair of result field types are runtime + * compatible. A pair of types are runtime compatible if the + * unpacked (@ref ResEntry) representation is identical. + * + * @return true if the given types are runtime compatible. + * @param a enum value of a result field type. + * @param b enum value of a result field type. 
+ **/ + static bool IsRuntimeCompatible(ResType a, ResType b) + { + switch (a) { + case RES_INT: + case RES_SHORT: + case RES_BYTE: + return (b == RES_INT || b == RES_SHORT || b == RES_BYTE); + case RES_FLOAT: + case RES_DOUBLE: + return (b == RES_FLOAT || b == RES_DOUBLE); + case RES_INT64: + return b == RES_INT64; + case RES_STRING: + case RES_LONG_STRING: + case RES_XMLSTRING: + case RES_JSONSTRING: + return (b == RES_STRING || b == RES_LONG_STRING || b == RES_XMLSTRING || b == RES_JSONSTRING); + case RES_DATA: + case RES_LONG_DATA: + return (b == RES_DATA || b == RES_LONG_DATA); + case RES_FEATUREDATA: + return (b == RES_FEATUREDATA); + } + return false; + } + + + /** + * @return the name of the given result field type. + * @param resType enum value of a result field type. + **/ + static const char *GetResTypeName(ResType type); + + /** + * Discard the current configuration and start over. After this + * method returns, the state of this object will be equal to the + * state right after it was created. This method may call both Clean + * and Init. + **/ + void Reset(); + + + /** + * Add a new result class to this result configuration. This will + * create a new result class object and insert it into the lookup + * structure. This method will fail if another class with the same + * ID has already been added or if the given ID is illegal. + * + * @return newly created result class object or NULL. + * @param name name of result class to add. + * @param classID id of result class to add. + **/ + ResultClass *AddResultClass(const char *name, uint32_t classID); + + + /** + * Obtain result class from the result class id. This method is used + * when unpacking docsum blobs. + * + * @return result class with the given id or NULL if not found. + * @param classID the id of the result class to look up. + **/ + const ResultClass *LookupResultClass(uint32_t classID) const; + + + /** + * Obtain result class id from the result class name. 
+ * + * @return result class id or 'def' if not found + * @param name the name of the result class + * @param def default return value if not found + **/ + uint32_t LookupResultClassId(const vespalib::string &name, uint32_t def) const; + + /** + * Obtain result class id from the result class name. + * + * @return result class id or configured default if empty or "default". + * @param name the name of the result class, NoClassId(-1) meaning undefined + **/ + uint32_t LookupResultClassId(const vespalib::string &name) const; + + + /** + * Obtain the number of result classes held by this result + * configuration. + * + * @return number of result classes. + **/ + uint32_t GetNumResultClasses() const { return _classLookup.size(); } + + + /** + * Obtain the string enumeration object that holds the mapping from + * field name to field name enumerated value. + * + * @return field name enumeration. + **/ + const search::util::StringEnum & GetFieldNameEnum() const { return _fieldEnum; } + + + /** + * This method calls the CreateEnumMap on all result classes held by + * this object. This is needed in order to look up fields by field + * name enumerated value. + **/ + void CreateEnumMaps(); + + /** + * Read config that has been fetched from configserver. + * + * @return true(success)/false(fail) + * @param configId reference on server + **/ + bool ReadConfig(const vespa::config::search::SummaryConfig &cfg, const char *configId); + + /** + * Inspect a docsum blob and return the class id of the docsum + * contained within it. This method is useful if you want to know + * what it is before deciding whether to unpack it. + * + * @return docsum blob class id. + * @param buf docsum blob. + * @param buflen length of docsum blob. + **/ + uint32_t GetClassID(const char *buf, uint32_t buflen); + + /** + * Unpack docsum blob. The first n (0/8/16/32) bits are read from + * the data given and used to look up the appropriate result + * class. 
A GeneralResult object is created based on that + * class and told to unpack the rest of the docsum blob. If this + * operation succeeds, the GeneralResult object is + * returned. If it fails, a badurlresult object is returned + * instead. + * + * @return object representing the unpacked result. + * @param partition partition path for current hit. + * @param docid docid for current hit. + * @param metric relevance estimate for current hit. + * @param buf docsum blob. + * @param buflen length of docsum blob. + **/ + urlresult * + Unpack(uint32_t partition, + uint32_t docid, + HitRank metric, + const char *buf, + uint32_t buflen) const; +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.cpp b/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.cpp new file mode 100644 index 00000000000..5648702eb7f --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.cpp @@ -0,0 +1,266 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchcommon/common/undefinedvalues.h> +#include <vespa/searchsummary/docsummary/resultpacker.h> +#include <zlib.h> + +LOG_SETUP(".searchlib.docsummary.resultpacker"); + +namespace search { +namespace docsummary { + +void +ResultPacker::WarnType(ResType type) +{ + LOG(debug, + "ResultPacker: got '%s', expected '%s' " + "(fields are binary compatible)", + GetResTypeName(type), + GetResTypeName(_cfgEntry->_type)); +} + + +void +ResultPacker::SetFormatError(ResType type) +{ + _error = true; + + if (_cfgEntry != NULL) { + LOG(error, + "ResultPacker: format error: got '%s', expected '%s'", + GetResTypeName(type), + GetResTypeName(_cfgEntry->_type)); + } else { + LOG(error, + "ResultPacker: format error: " + "got '%s', no more fields expected", GetResTypeName(type)); + } +} + + +ResultPacker::ResultPacker(const ResultConfig *resConfig) + : _buf(32768), + _cbuf(32768), + _resConfig(resConfig), + _resClass(NULL), + _entryIdx(0), + _cfgEntry(NULL), + _error(true) +{ +} + + +ResultPacker::~ResultPacker() +{ +} + +void +ResultPacker::InitPlain() +{ + _buf.reset(); +} + +bool +ResultPacker::Init(uint32_t classID) +{ + _buf.reset(); + _resClass = (_resConfig != NULL) ? 
+ _resConfig->LookupResultClass(classID) : NULL; + _entryIdx = 0; + if (_resClass != NULL) { + uint32_t id = _resClass->GetClassID(); + _buf.append(&id, sizeof(id)); + _cfgEntry = _resClass->GetEntry(_entryIdx); + _error = false; + } else { + _cfgEntry = NULL; + _error = true; + + LOG(error, "ResultPacker: resultclass %d does not exist", classID); + } + + return !_error; +} + + +bool +ResultPacker::AddEmpty() +{ + if (!_error && _cfgEntry != NULL) { + switch (_cfgEntry->_type) { + case RES_INT: return AddInteger(search::attribute::getUndefined<int32_t>()); + case RES_SHORT: return AddShort(search::attribute::getUndefined<int16_t>()); + case RES_BYTE: return AddByte(search::attribute::getUndefined<int8_t>()); + case RES_FLOAT: return AddFloat(search::attribute::getUndefined<float>()); + case RES_DOUBLE: return AddDouble(search::attribute::getUndefined<double>()); + case RES_INT64: return AddInt64(search::attribute::getUndefined<int64_t>()); + case RES_STRING: return AddString(NULL, 0); + case RES_DATA: return AddData(NULL, 0); + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: return AddLongString(NULL, 0); + case RES_LONG_DATA: return AddLongData(NULL, 0); + } + } + return AddInteger(0); // to provoke error condition +} + + +bool +ResultPacker::AddByte(uint8_t value) +{ + if (CheckEntry(RES_BYTE)) + AddByteForce(value); + return !_error; +} + +void +ResultPacker::AddByteForce(uint8_t value) +{ + _buf.append(&value, sizeof(value)); +} + +bool +ResultPacker::AddShort(uint16_t value) +{ + if (CheckEntry(RES_SHORT)) + AddShortForce(value); + return !_error; +} + +void +ResultPacker::AddShortForce(uint16_t value) +{ + _buf.append(&value, sizeof(value)); +} + + +bool +ResultPacker::AddInteger(uint32_t value) +{ + if (CheckEntry(RES_INT)) + AddIntegerForce(value); + return !_error; +} + +void +ResultPacker::AddIntegerForce(uint32_t value) +{ + _buf.append(&value, sizeof(value)); +} + + +bool +ResultPacker::AddFloat(float value) +{ 
+ if (CheckEntry(RES_FLOAT)) + _buf.append(&value, sizeof(value)); + return !_error; +} + + +bool +ResultPacker::AddDouble(double value) +{ + if (CheckEntry(RES_DOUBLE)) + _buf.append(&value, sizeof(value)); + return !_error; +} + + +bool +ResultPacker::AddInt64(uint64_t value) +{ + if (CheckEntry(RES_INT64)) + _buf.append(&value, sizeof(value)); + return !_error; +} + + +bool +ResultPacker::AddString(const char *str, uint32_t slen) +{ + if (CheckEntry(RES_STRING)) + AddStringForce(str, slen); + return !_error; +} + +void +ResultPacker::AddStringForce(const char *str, uint32_t slen) +{ + uint16_t len = slen; + _buf.append(&len, sizeof(len)); + _buf.append(str, len); +} + + +bool +ResultPacker::AddData(const char *buf, uint32_t buflen) +{ + if (CheckEntry(RES_DATA)) { + uint16_t len = buflen; + _buf.append(&len, sizeof(len)); + _buf.append(buf, len); + } + return !_error; +} + + +bool +ResultPacker::AddLongString(const char *str, uint32_t slen) +{ + if (CheckEntry(RES_LONG_STRING)) { + _buf.append(&slen, sizeof(slen)); + _buf.append(str, slen); + } + return !_error; +} + + +bool +ResultPacker::AddLongData(const char *buf, uint32_t buflen) +{ + if (CheckEntry(RES_LONG_DATA)) { + _buf.append(&buflen, sizeof(buflen)); + _buf.append(buf, buflen); + } + return !_error; +} + + +bool +ResultPacker::GetDocsumBlob(const char **buf, uint32_t *buflen) +{ + if (!_error && + _entryIdx != _resClass->GetNumEntries()) + { + _error = true; + LOG(error, + "ResultPacker: format error: %d fields are missing", + _resClass->GetNumEntries() - _entryIdx); + } + if (_error) { + *buf = NULL; + *buflen = 0; + return false; + } else { + *buf = _buf.GetDrainPos(); + *buflen = _buf.GetUsedLen(); + return true; + } +} + +void +ResultPacker::GetDocsumBlobForce(const char **buf, uint32_t *buflen) +{ + *buf = _buf.GetDrainPos(); + *buflen = _buf.GetUsedLen(); +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.h 
b/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.h new file mode 100644 index 00000000000..25763fd06a8 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/resultpacker.h @@ -0,0 +1,271 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/searchsummary/docsummary/resultconfig.h> + +namespace search { +namespace docsummary { +/** + * An Object of this class may be used to create docsum blobs. A + * single blob is created by first indicating what result class the + * blob should conform to. After that, each docsum field is added with + * an individual method call. The blob may then be extracted by a + * final method call. Note that objects of this class may be re-used + * to create multiple blobs each. + **/ +class ResultPacker +{ +private: + ResultPacker(const ResultPacker &); + ResultPacker& operator=(const ResultPacker &); + + search::RawBuf _buf; // packing buffer + search::RawBuf _cbuf; // compression buffer + const ResultConfig *_resConfig; // result config + const ResultClass *_resClass; // result class of current blob + uint32_t _entryIdx; // current field index of current blob + const ResConfigEntry *_cfgEntry; // current field of current blob + bool _error; // error flag for current blob + + static const char *GetResTypeName(ResType type) + { return ResultConfig::GetResTypeName(type); } + + static bool IsBinaryCompatible(ResType a, ResType b) + { return ResultConfig::IsBinaryCompatible(a, b); } + + void WarnType(ResType type); + void SetFormatError(ResType type); + + bool CheckEntry(ResType type) + { + if (_error) + return false; + + bool rc = (_cfgEntry != NULL && + IsBinaryCompatible(_cfgEntry->_type, type)); + + if (rc) { + if (_cfgEntry->_type != type) { + WarnType(type); + } + _cfgEntry = 
_resClass->GetEntry(++_entryIdx); + } else { + SetFormatError(type); + } + + return rc; + } + +public: + /** + * Create a result packer based on the given result config. Note + * that the result config object is NOT handed over; it is the + * responsibility of the application to ensure that the lifetime of + * the result config object is longer than the lifetime of the + * created result packer object. + * + * @param resConfig result configuration. + **/ + explicit ResultPacker(const ResultConfig *resConfig); + ~ResultPacker(); + + + /** + * Start creating new docsum blob without result class. + * (Bypassing type-checks.) + **/ + void InitPlain(); + + /** + * Start creating a new docsum blob of the given result class. + * + * @return true(ok)/false(error). + * @param classID the id of the result class we want to create a + * docsum blob conforming to. + **/ + bool Init(uint32_t classID); + + /** + * Add empty field of appropriate type. + * + * @return true(ok)/false(error). + **/ + bool AddEmpty(); + + /** + * Add a 'byte' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). + * @param value byte value of field to add. + **/ + bool AddByte(uint8_t value); + + void AddByteForce(uint8_t value); + + /** + * Add a 'short' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). 
+ * @param value short value of field to add. + **/ + bool AddShort(uint16_t value); + + void AddShortForce(uint16_t value); + + + /** + * Add an 'integer' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). + * @param value integer value of field to add. + **/ + bool AddInteger(uint32_t value); + + void AddIntegerForce(uint32_t value); + + + /** + * Add a 'float' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). + * @param value float value of field to add. + **/ + bool AddFloat(float value); + + + /** + * Add a 'double' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). + * @param value double value of field to add. + **/ + bool AddDouble(double value); + + + /** + * Add an 'int64' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. 
This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. + * + * @return true(ok)/false(error). + * @param value int64 value of field to add. + **/ + bool AddInt64(uint64_t value); + + + /** + * Add a 'string' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. The maximum length + * of this field is 64kB. + * + * @return true(ok)/false(error). + * @param str pointer to string to add. + * @param slen length of string to add. + **/ + bool AddString(const char *str, uint32_t slen); + + void AddStringForce(const char *str, uint32_t slen); + + /** + * Add a 'data' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. The maximum length + * of this field is 64kB. + * + * @return true(ok)/false(error). + * @param buf pointer to data to add. + * @param buflen length of data to add. + **/ + bool AddData(const char *buf, uint32_t buflen); + + + /** + * Add a 'longstring' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. The maximum length + * of this field is 2GB. 
+ * + * @return true(ok)/false(error). + * @param str pointer to string to add. + * @param slen length of string to add. + **/ + bool AddLongString(const char *str, uint32_t slen); + + + /** + * Add a 'longdata' field to the docsum blob we are currently + * creating. Note that this method will fail if the type of the + * added field is not compatible with the field type sequence + * defined in the result class config. This method will also fail if + * an error condition is already detected. The only way to clear the + * error state is with another call to @ref Init. The maximum length + * of this field is 2GB. + * + * @return true(ok)/false(error). + * @param buf pointer to data to add. + * @param buflen length of data to add. + **/ + bool AddLongData(const char *buf, uint32_t buflen); + + + /** + * Obtain a pointer to, and the length of, the created docsum + * blob. This method will fail if an error was previously detected, + * or if any docsum fields were missing (too few fields were + * added). Note that calling the @ref Init method invalidates the + * obtained docsum blob. + * + * @return true(ok)/false(error). + * @param buf where to store the pointer to the docsum blob. + * @param buflen where to store the length of the docsum blob. + **/ + bool GetDocsumBlob(const char **buf, uint32_t *buflen); + + void GetDocsumBlobForce(const char **buf, uint32_t *buflen); +}; + +} +} + + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.cpp new file mode 100644 index 00000000000..8b180dc3d78 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.cpp @@ -0,0 +1,160 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.docsummary.summaryfeaturesdfw"); +#include <vespa/searchlib/common/featureset.h> +#include <vespa/searchlib/common/packets.h> +#include <vespa/searchsummary/docsummary/docsumformat.h> +#include "summaryfeaturesdfw.h" + +namespace search { +namespace docsummary { + + +SummaryFeaturesDFW::SummaryFeaturesDFW() : + _env(NULL) +{ +} + +SummaryFeaturesDFW::~SummaryFeaturesDFW() +{ +} + +void +SummaryFeaturesDFW::init(IDocsumEnvironment * env) +{ + _env = env; +} + +static vespalib::string _G_cached("vespa.summaryFeatures.cached"); +static vespalib::slime::Memory _M_cached("vespa.summaryFeatures.cached"); + +void +SummaryFeaturesDFW::insertField(uint32_t docid, + GeneralResult *, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target) +{ + if (state->_summaryFeatures.get() == 0) { + state->_callback.FillSummaryFeatures(state, _env); + if (state->_summaryFeatures.get() == 0) { // still no summary features to write + return; + } + } + const FeatureSet::StringVector &names = state->_summaryFeatures->getNames(); + const feature_t *values = state->_summaryFeatures->getFeaturesByDocId(docid); + if (type == RES_FEATUREDATA && values != NULL) { + vespalib::slime::Cursor& obj = target.insertObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + vespalib::slime::Memory name(names[i].c_str(), names[i].size()); + obj.setDouble(name, values[i]); + } + if (state->_summaryFeaturesCached) { + obj.setDouble(_M_cached, 1.0); + } else { + obj.setDouble(_M_cached, 0.0); + } + return; + } + vespalib::JSONStringer & json(state->_jsonStringer); + if (values != NULL) { + json.clear(); + json.beginObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + featureDump(json, names[i], values[i]); + } + json.appendKey(_G_cached); + if (state->_summaryFeaturesCached) { + json.appendDouble(1.0); + } else { + json.appendDouble(0.0); + } + json.endObject(); + vespalib::slime::Memory 
value(json.toString().c_str(), + json.toString().size()); + if (type == RES_STRING || type == RES_LONG_STRING) { + target.insertString(value); + } + if (type == RES_DATA || type == RES_LONG_DATA) { + target.insertData(value); + } + json.clear(); + } +} + +uint32_t +SummaryFeaturesDFW::WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target) +{ + (void) gres; + + if (state->_summaryFeatures.get() == 0) { + state->_callback.FillSummaryFeatures(state, _env); + if (state->_summaryFeatures.get() == 0) { // still no summary features to write + return DocsumFormat::addEmpty(type, *target); + } + } + + uint32_t written = 0; + + const FeatureSet::StringVector &names = state->_summaryFeatures->getNames(); + vespalib::JSONStringer & json(state->_jsonStringer); + const feature_t *values = state->_summaryFeatures->getFeaturesByDocId(docid); + if (values != NULL) { + json.clear(); + json.beginObject(); + for (uint32_t i = 0; i < names.size(); ++i) { + featureDump(json, names[i], values[i]); + } + json.appendKey(_G_cached); + if (state->_summaryFeaturesCached) { + json.appendDouble(1.0); + } else { + json.appendDouble(0.0); + } + json.endObject(); + + written += writeString(json.toString(), type, target); + json.clear(); + } else { + written += DocsumFormat::addEmpty(type, *target); + } + + return written; +} + +void FeaturesDFW::featureDump(vespalib::JSONStringer & json, const vespalib::stringref & name, double feature) +{ + json.appendKey(name); + if (std::isnan(feature) || std::isinf(feature)) { + json.appendNull(); + } else { + json.appendDouble(feature); + } +} + + +uint32_t +SummaryFeaturesDFW::writeString(const vespalib::stringref & str, ResType type, search::RawBuf * target) +{ + switch (type) { + case RES_STRING: + case RES_DATA: + return DocsumFormat::addShortData(*target, str.c_str(), str.size()); + case RES_FEATUREDATA: + case RES_LONG_STRING: + case RES_LONG_DATA: + return 
DocsumFormat::addLongData(*target, str.c_str(), str.size()); + default: + LOG(error, "unhandled type %u in writeString()", type); + return DocsumFormat::addEmpty(type, *target); + } +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.h b/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.h new file mode 100644 index 00000000000..c9a6c5d9d9a --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfeaturesdfw.h @@ -0,0 +1,48 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <map> +#include <string> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/vespalib/util/jsonwriter.h> + +namespace search { +namespace docsummary { + +class FeaturesDFW : public IDocsumFieldWriter +{ +protected: + void featureDump(vespalib::JSONStringer & json, const vespalib::stringref & name, double feature); +}; + +class SummaryFeaturesDFW : public FeaturesDFW +{ +private: + SummaryFeaturesDFW(const SummaryFeaturesDFW &); + SummaryFeaturesDFW & operator=(const SummaryFeaturesDFW &); + + IDocsumEnvironment * _env; + +public: + SummaryFeaturesDFW(); + virtual ~SummaryFeaturesDFW(); + void init(IDocsumEnvironment * env); + virtual bool IsGenerated() const { return true; } + virtual uint32_t WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); + + static uint32_t writeString(const vespalib::stringref & str, ResType type, search::RawBuf * target); +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp new file mode 100644 index 00000000000..dfb7b863133 --- /dev/null +++ 
b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp @@ -0,0 +1,94 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.docsummary.textextractordfw"); +#include "tokenizer.h" +#include "textextractordfw.h" + +namespace search { +namespace docsummary { + +TextExtractorDFW::TextExtractorDFW() : + _inputFieldEnum(-1) +{ +} + +bool +TextExtractorDFW::init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config) +{ + _inputFieldEnum = config.GetFieldNameEnum().Lookup(inputField.c_str()); + if (_inputFieldEnum == -1) { + LOG(warning, "Did not find input field '%s' as part of the docsum fields when initializing writer for field '%s'", + inputField.c_str(), fieldName.c_str()); + return false; + } + return true; +} + +void +TextExtractorDFW::insertField(uint32_t, + GeneralResult *gres, + GetDocsumsState *state, + ResType, + vespalib::slime::Inserter &target) +{ + vespalib::string extracted; + ResEntry * entry = gres->GetEntryFromEnumValue(_inputFieldEnum); + if (entry != NULL) { + const char * buf = NULL; + uint32_t buflen = 0; + entry->_resolve_field(&buf, &buflen, &state->_docSumFieldSpace); + // extract the text + Tokenizer tokenizer(buf, buflen); + while (tokenizer.hasMoreTokens()) { + Tokenizer::Token token = tokenizer.getNextToken(); + extracted.append(token.getText()); + } + } else { + LOG(warning, "Did not find input entry using field enum %d. 
Write an empty field", _inputFieldEnum); + } + target.insertString(vespalib::slime::Memory(extracted.c_str(), extracted.size())); +} + +uint32_t +TextExtractorDFW::WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target) +{ + (void) docid; + (void) type; + uint32_t slen = 0; + uint32_t begin = target->GetUsedLen(); + // write text length + target->append(&slen, sizeof(slen)); + + ResEntry * entry = gres->GetEntryFromEnumValue(_inputFieldEnum); + if (entry != NULL) { + const char * buf = NULL; + uint32_t buflen = 0; + entry->_resolve_field(&buf, &buflen, &state->_docSumFieldSpace); + // extract the text + Tokenizer tokenizer(buf, buflen); + while (tokenizer.hasMoreTokens()) { + Tokenizer::Token token = tokenizer.getNextToken(); + target->append(token.getText().c_str(), token.getText().size()); + } + } else { + LOG(warning, "Did not find input entry using field enum %d. Write an empty field", _inputFieldEnum); + } + + // calculate number of bytes written + uint32_t written = target->GetUsedLen() - begin; + // patch in correct text length + slen = written - sizeof(slen); + memcpy(target->GetWritableDrainPos(begin), &slen, sizeof(slen)); + + return written; +} + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h new file mode 100644 index 00000000000..f22d5b3daa4 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h @@ -0,0 +1,42 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> + +namespace search { +namespace docsummary { + +/** + * This is the docsum field writer used to extract the original text from a disk summary on the juniper format. 
+ **/ +class TextExtractorDFW : public IDocsumFieldWriter +{ +private: + TextExtractorDFW(const TextExtractorDFW &); + TextExtractorDFW & operator=(const TextExtractorDFW &); + + int _inputFieldEnum; + +public: + TextExtractorDFW(); + virtual ~TextExtractorDFW() {} + bool init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config); + // Inherit doc + virtual bool IsGenerated() const { return false; } + // Inherit doc + virtual uint32_t WriteField(uint32_t docid, + GeneralResult * gres, + GetDocsumsState * state, + ResType type, + search::RawBuf * target); + virtual void insertField(uint32_t docid, + GeneralResult *gres, + GetDocsumsState *state, + ResType type, + vespalib::slime::Inserter &target); +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp new file mode 100644 index 00000000000..61a0f8cdfdd --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp @@ -0,0 +1,112 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".summary.tokenizer"); +#include "tokenizer.h" + +namespace search { +namespace docsummary { + +Tokenizer::Token::Type +Tokenizer::getTokenType(ucs4_t ch) const +{ + if (Fast_UnicodeUtil::IsWordChar(ch)) { + return Token::WORD; + } else { + if (Fast_UnicodeUtil::IsTerminalPunctuationChar(ch)) { + return Token::PUNCTUATION; + } else { + return Token::NON_WORD; + } + } +} + +Tokenizer::Tokenizer(const char * buf, size_t len) : + _pos(buf), + _begin(buf), + _end(buf + len), + _tokenBegin(buf), + _type(Token::NOT_DEF), + _hasMoreTokens(_pos < _end) +{ +} + +void +Tokenizer::reset(const char * buf, size_t len) +{ + _pos = buf; + _begin = buf; + _end = buf + len; + _tokenBegin = buf; + _type = Token::NOT_DEF; + _hasMoreTokens = (_pos < _end); +} + +bool +Tokenizer::hasMoreTokens() +{ + return _hasMoreTokens; +} + +Tokenizer::Token +Tokenizer::getNextToken() +{ + const char * textBegin = _tokenBegin; + const char * textEnd = _pos; + const char * stemBegin = NULL; + const char * stemEnd = NULL; + const char * next = _pos; + bool insideAnnotation = false; + for (; _pos < _end; ) { + ucs4_t ch; + if ((unsigned const char)*next < 0x80) { + ch = *next++; + if (ch == 0x1F) { // unit separator + Token t(textBegin, textEnd, stemBegin, stemEnd, _type); + _pos = next; // advance to next char + _tokenBegin = next; // the next token begins at the next char + _type = Token::NOT_DEF; // reset the token type + if (_pos == _end) { // this is the last token + _hasMoreTokens = false; + } + return t; + } + } else { + ch = Fast_UnicodeUtil::GetUTF8CharNonAscii(next); // updates next to the next utf8 character + if (ch == 0xFFF9) { // anchor + insideAnnotation = true; + textBegin = next; + _type = Token::ANNOTATION; + } + } + if (!insideAnnotation) { + Token::Type tmpType = getTokenType(ch); + if (_type != Token::NOT_DEF && _type != tmpType) { // we found a new token type + Token t(textBegin, textEnd, stemBegin, 
stemEnd, _type); + _tokenBegin = _pos; // the next token begins at this char + _pos = next; // advance to next char + _type = tmpType; // remember the new token type + return t; + } + _type = tmpType; + textEnd = next; // advance to next char + } else { // inside annotation + if (ch == 0xFFFA) { // separator + textEnd = _pos; + stemBegin = next; + } else if (ch == 0xFFFB && stemBegin != NULL) { // terminator + stemEnd = _pos; + insideAnnotation = false; + } + } + + _pos = next; + } + LOG_ASSERT(_pos == _end); + _hasMoreTokens = false; + return Token(textBegin, _pos, _type); // return the last token +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h new file mode 100644 index 00000000000..efd07e16b68 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h @@ -0,0 +1,47 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/fastlib/text/unicodeutil.h> +#include "itokenizer.h" + +namespace search { +namespace docsummary { + +/** + * This class is used to tokenize an utf-8 text buffer into tokens of type + * WORD, NON_WORD, PUNCTUATION, and ANNOTATION. + * + * Functions in Fast_UnicodeUtil are used to determine word characters and terminal punctuation characters. + * The unit separator 0x1F is always treated as a token separator. The unit separator itself is not returned as a token. + * Interlinear annotation (0xFFF9 original 0xFFFA stemmed 0xFFFB) is used to specify the stemmed variant of a word. + * The annotation characters are not returned as part of a token. 
+ */ +class Tokenizer : public ITokenizer +{ +private: + const char * _pos; // the current position in the input buffer + const char * _begin; // the begin of input buffer + const char * _end; // the end of the input buffer + const char * _tokenBegin; // the start of the next token + Token::Type _type; // the type of the current position + bool _hasMoreTokens; // do we have more tokens + + Token::Type getTokenType(ucs4_t ch) const; + +public: + /** + * Creates a new tokenizer for the given utf-8 text buffer. + */ + Tokenizer(const char * buf, size_t len); + + // Inherit doc + virtual void reset(const char * buf, size_t len); + virtual size_t getBufferSize() const { return _end - _begin; } + virtual bool hasMoreTokens(); + virtual Token getNextToken(); +}; + +} +} + diff --git a/searchsummary/src/vespa/searchsummary/docsummary/urlresult.cpp b/searchsummary/src/vespa/searchsummary/docsummary/urlresult.cpp new file mode 100644 index 00000000000..92ebe07d457 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/urlresult.cpp @@ -0,0 +1,819 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 1998-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/searchsummary/docsummary/urlresult.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> +#include <zlib.h> + +LOG_SETUP(".searchlib.docsummary.urlresult"); + +namespace search { +namespace docsummary { + +urlresult::urlresult(uint32_t partition, uint32_t docid, HitRank metric) + : _partition(partition), + _docid(docid), + _metric(metric) +{} + + +urlresult::~urlresult() +{ +} + + +/*===============================================================*/ + + +badurlresult::badurlresult() + : urlresult(0, 0, 0) +{} + + +badurlresult::badurlresult(uint32_t partition, uint32_t docid, HitRank metric) + : urlresult(partition, docid, metric) +{} + + +badurlresult::~badurlresult() +{} + + +int +badurlresult::unpack(const char *buf, const size_t buflen) +{ + (void) buf; + (void) buflen; + LOG(warning, "badurlresult::unpack"); + return 0; +} + + +/*===============================================================*/ + + +void +GeneralResult::AllocEntries(uint32_t buflen, bool inplace) +{ + uint32_t cnt = _resClass->GetNumEntries(); + uint32_t needMem = (inplace) + ? 
cnt * sizeof(ResEntry) + : cnt * sizeof(ResEntry) + buflen + 1; + + if (cnt > 0) { + _entrycnt = cnt; + _entries = (ResEntry *) malloc(needMem); + assert(_entries != NULL); + if (inplace) { + _buf = NULL; + _bufEnd = NULL; + } else { + _buf = ((char *)_entries) + cnt * sizeof(ResEntry); + _bufEnd = _buf + buflen + 1; + } + memset(_entries, 0, cnt * sizeof(ResEntry)); + } else { + _entrycnt = 0; + _entries = NULL; + _buf = NULL; + _bufEnd = NULL; + } +} + + +void +GeneralResult::FreeEntries() +{ + uint32_t cnt = _entrycnt; + + // (_buf == NULL) <=> (_inplace_unpack() || (cnt == 0)) + if (_buf != NULL) { + for (uint32_t i = 0; i < cnt; i++) { + if (ResultConfig::IsVariableSize(_entries[i]._type) && + !InBuf(_entries[i]._stringval)) + delete [] (_entries[i]._stringval); + } + } + free(_entries); // free '_entries'/'_buf' chunk +} + + + +GeneralResult::GeneralResult(const ResultClass *resClass, + uint32_t partition, uint32_t docid, + HitRank metric) + : urlresult(partition, docid, metric), + _resClass(resClass), + _entrycnt(0), + _entries(NULL), + _buf(NULL), + _bufEnd(NULL) +{ +} + + +GeneralResult::~GeneralResult() +{ + FreeEntries(); +} + + +ResEntry * +GeneralResult::GetEntry(uint32_t idx) +{ + return (idx < _entrycnt) ? &_entries[idx] : NULL; +} + + +ResEntry * +GeneralResult::GetEntry(const char *name) +{ + int idx = _resClass->GetIndexFromName(name); + + return (idx >= 0 && (uint32_t)idx < _entrycnt) ? + &_entries[idx] : NULL; +} + + +ResEntry * +GeneralResult::GetEntryFromEnumValue(uint32_t value) +{ + int idx = _resClass->GetIndexFromEnumValue(value); + + return (idx >= 0 && (uint32_t)idx < _entrycnt) ? 
+ &_entries[idx] : NULL; +} + + +int +GeneralResult::unpack(const char *buf, const size_t buflen) +{ + bool rc = true; + const char *ebuf = buf + buflen; // Ref to first after buffer + const char *p = buf; // current position in buffer + + if (_entries != NULL) + FreeEntries(); + + AllocEntries(buflen); + + for (uint32_t i = 0; rc && i < _entrycnt; i++) { + const ResConfigEntry *entry = _resClass->GetEntry(i); + + switch (entry->_type) { + + case RES_INT: { + + if (p + sizeof(_entries[i]._intval) <= ebuf) { + + memcpy(&_entries[i]._intval, p, sizeof(_entries[i]._intval)); + _entries[i]._type = RES_INT; + p += sizeof(_entries[i]._intval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(..._intval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_SHORT: { + + uint16_t shortval; + if (p + sizeof(shortval) <= ebuf) { + + memcpy(&shortval, p, sizeof(shortval)); + _entries[i]._intval = (uint32_t)shortval; + _entries[i]._type = RES_INT; // type promotion + p += sizeof(shortval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(shortval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_BYTE: { + + uint8_t byteval; + if (p + sizeof(byteval) <= ebuf) { + + memcpy(&byteval, p, sizeof(byteval)); + _entries[i]._intval = (uint32_t)byteval; + _entries[i]._type = RES_INT; // type promotion + p += sizeof(byteval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(byteval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_FLOAT: { + + float floatval; + if (p + sizeof(floatval) <= ebuf) { + + memcpy(&floatval, p, sizeof(floatval)); + _entries[i]._doubleval = (double)floatval; + _entries[i]._type = RES_DOUBLE; // type promotion + p += sizeof(floatval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(floatval) > ebuf"); + LOG(error, 
"Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_DOUBLE: { + + if (p + sizeof(_entries[i]._doubleval) <= ebuf) { + + memcpy(&_entries[i]._doubleval, p, sizeof(_entries[i]._doubleval)); + _entries[i]._type = RES_DOUBLE; + p += sizeof(_entries[i]._doubleval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(..._doubleval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_INT64: { + + if (p + sizeof(_entries[i]._int64val) <= ebuf) { + + memcpy(&_entries[i]._int64val, p, sizeof(_entries[i]._int64val)); + _entries[i]._type = RES_INT64; + p += sizeof(_entries[i]._int64val); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(..._int64val) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_STRING: { + + uint16_t slen; + if (p + sizeof(slen) <= ebuf) { + + memcpy(&slen, p, sizeof(slen)); + p += sizeof(slen); + + if (p + slen <= ebuf) { + + _entries[i]._stringval = _buf + (p - buf); + memcpy(_entries[i]._stringval, p, slen); + _entries[i]._stringval[slen] = '\0'; + _entries[i]._stringlen = slen; + _entries[i]._type = RES_STRING; + p += slen; + + } else { + + LOG(debug, "GeneralResult::unpack: p + slen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(slen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_DATA: { + + uint16_t dlen; + if (p + sizeof(dlen) <= ebuf) { + + memcpy(&dlen, p, sizeof(dlen)); + p += sizeof(dlen); + + if (p + dlen <= ebuf) { + + _entries[i]._dataval = _buf + (p - buf); + memcpy(_entries[i]._dataval, p, dlen); + _entries[i]._dataval[dlen] = '\0'; // just in case. 
+ _entries[i]._datalen = dlen; + _entries[i]._type = RES_DATA; + p += dlen; + + } else { + + LOG(debug, "GeneralResult::unpack: p + dlen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(dlen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: { + + uint32_t lslen; + bool compressed; + if (p + sizeof(lslen) <= ebuf) { + + memcpy(&lslen, p, sizeof(lslen)); + p += sizeof(lslen); + + compressed = ((lslen & 0x80000000) != 0); + lslen &= 0x7fffffff; + + if (p + lslen <= ebuf) { + + if (compressed) { // COMPRESSED + uint32_t realLen = 0; + if (lslen >= sizeof(realLen)) + memcpy(&realLen, p, sizeof(realLen)); + else + LOG(warning, "Cannot uncompress docsum field %s; docsum field meta-data incomplete", + entry->_bindname.c_str()); + if (realLen > 0) { + _entries[i]._stringval = new char[realLen + 1]; + } + if (_entries[i]._stringval != NULL) { + uLongf rlen = realLen; + if ((uncompress((Bytef *)_entries[i]._stringval, &rlen, + (const Bytef *)(p + sizeof(realLen)), + lslen - sizeof(realLen)) == Z_OK) && + rlen == realLen) { + assert(rlen == realLen); + + // COMPRESSED LONG STRING FIELD OK + _entries[i]._stringval[realLen] = '\0'; + _entries[i]._stringlen = realLen; + + } else { + LOG(warning, "Cannot uncompress docsum field %s; decompression error", + entry->_bindname.c_str()); + delete [] _entries[i]._stringval; + _entries[i]._stringval = NULL; + } + } + // insert empty field if decompress failed + if (_entries[i]._stringval == NULL) { + _entries[i]._stringval = _buf + (p - buf); + _entries[i]._stringval[0] = '\0'; + _entries[i]._stringlen = 0; + } + + } else { // UNCOMPRESSED + + _entries[i]._stringval = _buf + (p - buf); + memcpy(_entries[i]._stringval, p, lslen); + _entries[i]._stringval[lslen] = '\0'; + _entries[i]._stringlen 
= lslen; + + } + _entries[i]._type = RES_STRING; // type normalization + p += lslen; + + } else { + + LOG(debug, "GeneralResult::unpack: p + lslen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(lslen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_LONG_DATA: { + + uint32_t ldlen; + bool compressed; + if (p + sizeof(ldlen) <= ebuf) { + + memcpy(&ldlen, p, sizeof(ldlen)); + p += sizeof(ldlen); + + compressed = ((ldlen & 0x80000000) != 0); + ldlen &= 0x7fffffff; + + if (p + ldlen <= ebuf) { + + if (compressed) { // COMPRESSED + uint32_t realLen = 0; + if (ldlen >= sizeof(realLen)) + memcpy(&realLen, p, sizeof(realLen)); + else + LOG(warning, "Cannot uncompress docsum field %s; docsum field meta-data incomplete", + entry->_bindname.c_str()); + if (realLen > 0) { + _entries[i]._dataval = new char [realLen + 1]; + } + if (_entries[i]._dataval != NULL) { + uLongf rlen = realLen; + if ((uncompress((Bytef *)_entries[i]._dataval, &rlen, + (const Bytef *)(p + sizeof(realLen)), + ldlen - sizeof(realLen)) == Z_OK) && + rlen == realLen) { + assert(rlen == realLen); + + // COMPRESSED LONG DATA FIELD OK + _entries[i]._dataval[realLen] = '\0'; + _entries[i]._datalen = realLen; + + } else { + LOG(warning, "Cannot uncompress docsum field %s; decompression error", + entry->_bindname.c_str()); + delete [] _entries[i]._dataval; + _entries[i]._dataval = NULL; + } + } + + // insert empty field if decompress failed + if (_entries[i]._dataval == NULL) { + _entries[i]._dataval = _buf + (p - buf); + _entries[i]._dataval[0] = '\0'; + _entries[i]._datalen = 0; + } + + } else { // UNCOMPRESSED + + _entries[i]._dataval = _buf + (p - buf); + memcpy(_entries[i]._dataval, p, ldlen); + _entries[i]._dataval[ldlen] = '\0'; // just in case + _entries[i]._datalen = ldlen; + + } + _entries[i]._type = RES_DATA; // type normalization + p 
+= ldlen; + + } else { + + LOG(debug, "GeneralResult::unpack: p + ldlen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(ldlen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + default: + LOG(warning, "GeneralResult::unpack: no such type:%d", entry->_type); + LOG(error, "Incorrect type in document summary, couldn't unpack"); + rc = false; + break; + } // END -- switch (entry->_type) { + } // END -- for (uint32_t i = 0; rc && i < _entrycnt; i++) { + + if (rc && p != ebuf) { + LOG(debug, "GeneralResult::unpack: p:%p != ebuf:%p", p, ebuf); + LOG(error, "Document summary too long, couldn't unpack."); + rc = false; + } + + if (rc) + return 0; // SUCCESS + + // clean up on failure + FreeEntries(); + _entrycnt = 0; + _entries = NULL; + _buf = NULL; + _bufEnd = NULL; + + return -1; // FAIL +} + + +bool +GeneralResult::_inplace_unpack(const char *buf, const size_t buflen) +{ + bool rc = true; + const char *ebuf = buf + buflen; // Ref to first after buffer + const char *p = buf; // current position in buffer + + if (_entries != NULL) + FreeEntries(); + + AllocEntries(buflen, true); + + for (uint32_t i = 0; rc && i < _entrycnt; i++) { + const ResConfigEntry *entry = _resClass->GetEntry(i); + + switch (entry->_type) { + + case RES_INT: { + + if (p + sizeof(_entries[i]._intval) <= ebuf) { + + memcpy(&_entries[i]._intval, p, sizeof(_entries[i]._intval)); + _entries[i]._type = RES_INT; + p += sizeof(_entries[i]._intval); + + } else { + + LOG(debug, + "GeneralResult::_inplace_unpack: p + sizeof(..._intval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_SHORT: { + + uint16_t shortval; + if (p + sizeof(shortval) <= ebuf) { + + memcpy(&shortval, p, sizeof(shortval)); + _entries[i]._intval = (uint32_t)shortval; + _entries[i]._type = RES_INT; // type promotion + 
p += sizeof(shortval); + + } else { + + LOG(debug, + "GeneralResult::_inplace_unpack: p + sizeof(shortval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_BYTE: { + + uint8_t byteval; + if (p + sizeof(byteval) <= ebuf) { + + memcpy(&byteval, p, sizeof(byteval)); + _entries[i]._intval = (uint32_t)byteval; + _entries[i]._type = RES_INT; // type promotion + p += sizeof(byteval); + + } else { + + LOG(debug, + "GeneralResult::_inplace_unpack: p + sizeof(byteval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_FLOAT: { + + float floatval; + if (p + sizeof(floatval) <= ebuf) { + + memcpy(&floatval, p, sizeof(floatval)); + _entries[i]._doubleval = (double)floatval; + _entries[i]._type = RES_DOUBLE; // type promotion + p += sizeof(floatval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(floatval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_DOUBLE: { + + if (p + sizeof(_entries[i]._doubleval) <= ebuf) { + + memcpy(&_entries[i]._doubleval, p, sizeof(_entries[i]._doubleval)); + _entries[i]._type = RES_DOUBLE; + p += sizeof(_entries[i]._doubleval); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(..._doubleval) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_INT64: { + + if (p + sizeof(_entries[i]._int64val) <= ebuf) { + + memcpy(&_entries[i]._int64val, p, sizeof(_entries[i]._int64val)); + _entries[i]._type = RES_INT64; + p += sizeof(_entries[i]._int64val); + + } else { + + LOG(debug, "GeneralResult::unpack: p + sizeof(..._int64val) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_STRING: { + + uint16_t slen; + if (p + sizeof(slen) <= ebuf) { + + memcpy(&slen, p, sizeof(slen)); + p += sizeof(slen); + + if (p + slen 
<= ebuf) { + + _entries[i]._stringval = const_cast<char *>(p); + _entries[i]._stringlen = slen; + _entries[i]._type = RES_STRING; + p += slen; + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + slen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + sizeof(slen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_DATA: { + + uint16_t dlen; + if (p + sizeof(dlen) <= ebuf) { + + memcpy(&dlen, p, sizeof(dlen)); + p += sizeof(dlen); + + if (p + dlen <= ebuf) { + + _entries[i]._dataval = const_cast<char *>(p); + _entries[i]._datalen = dlen; + _entries[i]._type = RES_DATA; + p += dlen; + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + dlen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + sizeof(dlen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_XMLSTRING: + case RES_JSONSTRING: + case RES_FEATUREDATA: + case RES_LONG_STRING: { + + uint32_t flen; + uint32_t lslen; + if (p + sizeof(flen) <= ebuf) { + + memcpy(&flen, p, sizeof(flen)); + p += sizeof(flen); + + lslen = flen & 0x7fffffff; + + if (p + lslen <= ebuf) { + + _entries[i]._stringval = const_cast<char *>(p); + _entries[i]._stringlen = flen; // with compression flag + _entries[i]._type = RES_STRING; // type normalization + p += lslen; + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + lslen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + sizeof(lslen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + case RES_LONG_DATA: { + + uint32_t flen; + uint32_t ldlen; + if (p + sizeof(flen) 
<= ebuf) { + + memcpy(&flen, p, sizeof(flen)); + p += sizeof(flen); + + ldlen = flen & 0x7fffffff; + + if (p + ldlen <= ebuf) { + + _entries[i]._dataval = const_cast<char *>(p); + _entries[i]._datalen = flen; // with compression flag + _entries[i]._type = RES_DATA; // type normalization + p += ldlen; + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + ldlen > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + + } else { + + LOG(debug, "GeneralResult::_inplace_unpack: p + sizeof(ldlen) > ebuf"); + LOG(error, "Document summary too short, couldn't unpack"); + rc = false; + } + break; + } + + default: + LOG(warning, + "GeneralResult::_inplace_unpack: no such type:%d", + entry->_type); + LOG(error, "Incorrect type in document summary, couldn't unpack"); + rc = false; + break; + } // END -- switch (entry->_type) { + } // END -- for (uint32_t i = 0; rc && i < _entrycnt; i++) { + + if (rc && p != ebuf) { + LOG(debug, "GeneralResult::_inplace_unpack: p:%p != ebuf:%p", p, ebuf); + LOG(error, "Document summary too long, couldn't unpack."); + rc = false; + } + + if (rc) + return true; // SUCCESS + + // clean up on failure + FreeEntries(); + _entrycnt = 0; + _entries = NULL; + _buf = NULL; + _bufEnd = NULL; + + return false; // FAIL +} + +} +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/urlresult.h b/searchsummary/src/vespa/searchsummary/docsummary/urlresult.h new file mode 100644 index 00000000000..e882a5a9ed8 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/urlresult.h @@ -0,0 +1,90 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+// Copyright (C) 2001-2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/searchsummary/docsummary/resultclass.h> +#include <vespa/searchsummary/docsummary/docsumstorevalue.h> + +namespace search { +namespace docsummary { + +class urlresult +{ +protected: + uint32_t _partition; + uint32_t _docid; + HitRank _metric; + +public: + urlresult(uint32_t partition, uint32_t docid, HitRank metric); + virtual ~urlresult(); + + virtual bool IsGeneral() const { return false; } + uint32_t GetPartition() const { return _partition; } + uint32_t GetDocID() const { return _docid; } + HitRank GetMetric() const { return _metric; } + virtual int unpack(const char *buf, const size_t buflen) = 0; +}; + + +class badurlresult : public urlresult +{ +public: + badurlresult(); + badurlresult(uint32_t partition, uint32_t docid, HitRank metric); + virtual ~badurlresult(); + + virtual int unpack(const char *buf, const size_t buflen); +}; + + +class GeneralResult : public urlresult +{ +private: + GeneralResult(const GeneralResult &); + GeneralResult& operator=(const GeneralResult &); + + const ResultClass *_resClass; + uint32_t _entrycnt; + ResEntry *_entries; + char *_buf; // allocated in same chunk as _entries + char *_bufEnd; // first byte after _buf + + bool InBuf(void *pt) + { + return ((char *)pt >= _buf && + (char *)pt < _bufEnd); + } + + void AllocEntries(uint32_t buflen, bool inplace = false); + void FreeEntries(); + + bool _inplace_unpack(const char *buf, const size_t buflen); + +public: + GeneralResult(const ResultClass *resClass, uint32_t partition, + uint32_t docid, HitRank metric); + ~GeneralResult(); + + const ResultClass *GetClass() const { return _resClass; } + ResEntry *GetEntry(uint32_t idx); + ResEntry *GetEntry(const char *name); + ResEntry *GetEntryFromEnumValue(uint32_t val); + virtual bool IsGeneral() const { return true; } + virtual int unpack(const char *buf, const size_t buflen); + + bool 
inplaceUnpack(const DocsumStoreValue &value) { + if (value.valid()) { + return _inplace_unpack(value.fieldsPt(), value.fieldsSz()); + } else { + return false; + } + } +}; + +} +} + + diff --git a/searchsummary/testrun/.gitignore b/searchsummary/testrun/.gitignore new file mode 100644 index 00000000000..559f57dccbe --- /dev/null +++ b/searchsummary/testrun/.gitignore @@ -0,0 +1,9 @@ +test-report.html +test-report.html.* +test.*.*.desc +test.*.*.file.* +test.*.*.files.html +test.*.*.log +tmp.* +/test.*.*.result +Makefile |