diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2022-05-15 00:40:43 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-15 00:40:43 +0200 |
commit | dacf557add1c6a3ffab036cdf2f7dfdf9750b22e (patch) | |
tree | 3a9dfff58b98898e2e28c0337925f4f04e5eaeb0 /streamingvisitors/src/vespa/vsm | |
parent | 2722ce9d1d1ec12d57ebd3833ce37b0958afb752 (diff) |
Revert "Collapse vsm into streamingvisitors"
Diffstat (limited to 'streamingvisitors/src/vespa/vsm')
72 files changed, 0 insertions, 5477 deletions
diff --git a/streamingvisitors/src/vespa/vsm/.gitignore b/streamingvisitors/src/vespa/vsm/.gitignore deleted file mode 100644 index 4c5f5d9ef7a..00000000000 --- a/streamingvisitors/src/vespa/vsm/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -.depend -Makefile -/libvsm.so.5.1 diff --git a/streamingvisitors/src/vespa/vsm/common/.gitignore b/streamingvisitors/src/vespa/vsm/common/.gitignore deleted file mode 100644 index 95bc02923a9..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.exe -*.ilk -*.pdb -.depend* -Makefile diff --git a/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt deleted file mode 100644 index 4570a9b581e..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_library(vsm_vsmcommon OBJECT - SOURCES - charbuffer.cpp - document.cpp - documenttypemapping.cpp - fieldmodifier.cpp - storagedocument.cpp - DEPENDS -) diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp b/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp deleted file mode 100644 index b8fbb5c8846..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "charbuffer.h" -#include <cstring> - -namespace vsm { - -CharBuffer::CharBuffer(size_t len) : - _buffer(len), - _pos(0) -{ } - -void -CharBuffer::put(const char * src, size_t n) -{ - if (n > getRemaining()) { - resize(_pos + (n * 2)); - } - char * dst = &_buffer[_pos]; - memcpy(dst, src, n); - _pos += n; -} - -void -CharBuffer::resize(size_t len) -{ - if (len > getLength()) { - _buffer.resize(len); - } -} - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.h b/streamingvisitors/src/vespa/vsm/common/charbuffer.h deleted file mode 100644 index 08618a9b973..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/charbuffer.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vector> -#include <memory> - -namespace vsm { - -/** - * Simple growable char buffer. - **/ -class CharBuffer -{ -private: - std::vector<char> _buffer; - size_t _pos; - -public: - typedef std::shared_ptr<CharBuffer> SP; - - /** - * Creates a char buffer with len bytes. - **/ - CharBuffer(size_t len = 0); - - /** - * Copies n bytes from the src array into the underlying buffer at the - * current position, and updates the position accordingly. - * Resizing will occur if needed. - **/ - void put(const char * src, size_t n); - - /** - * Resizes the buffer so that the new length becomes len. - * Resizing will not occur if len < current length. - **/ - void resize(size_t len); - - /** - * Resets the position to the beginning of the buffer. - **/ - void reset() { _pos = 0; } - - const char * getBuffer() const { return &_buffer[0]; } - size_t getLength() const { return _buffer.size(); } - size_t getPos() const { return _pos; } - size_t getRemaining() const { return getLength() - getPos(); } - void put(char c) { put(&c, 1); } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/docsum.h b/streamingvisitors/src/vespa/vsm/common/docsum.h deleted file mode 100644 index 49b84cb0783..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/docsum.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "document.h" - -namespace vsm { - -/** - Will represent a cache of the document summaries. -> Actual docsums will be - generated on the fly when requested. A document summary is accessed by its - documentId. -*/ - -class IDocSumCache -{ -public: - virtual const Document & getDocSum(const search::DocumentIdT & docId) const = 0; - virtual ~IDocSumCache() { } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/document.cpp b/streamingvisitors/src/vespa/vsm/common/document.cpp deleted file mode 100644 index a345c82ce2d..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/document.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "document.h" -#include <vespa/vespalib/stllike/asciistream.h> -#include <vespa/vespalib/stllike/hash_map.hpp> - -using search::DocumentIdT; -using search::TimeT; -using document::FieldValue; - -namespace vsm -{ - -vespalib::asciistream & operator << (vespalib::asciistream & os, const FieldRef & f) -{ - const char *s = f.data(); - os << f.size(); - if (s) { - os << s; // Better hope it's null terminated! - } - os << " : "; - return os; -} - -vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f) -{ - for (StringFieldIdTMapT::const_iterator it=f._map.begin(), mt=f._map.end(); it != mt; it++) { - os << it->first << " = " << it->second << '\n'; - } - return os; -} - -StringFieldIdTMap::StringFieldIdTMap() : - _map() -{ -} - -void StringFieldIdTMap::add(const vespalib::string & s, FieldIdT fieldId) -{ - _map[s] = fieldId; -} - -void StringFieldIdTMap::add(const vespalib::string & s) -{ - if (_map.find(s) == _map.end()) { - FieldIdT fieldId = _map.size(); - _map[s] = fieldId; - } -} - -FieldIdT StringFieldIdTMap::fieldNo(const vespalib::string & fName) const -{ - StringFieldIdTMapT::const_iterator found = _map.find(fName); - FieldIdT fNo((found != _map.end()) ? found->second : npos); - return fNo; -} - -size_t StringFieldIdTMap::highestFieldNo() const -{ - size_t maxFNo(0); - for (const auto & field : _map) { - if (field.second >= maxFNo) { - maxFNo = field.second + 1; - } - } - return maxFNo; -} - -Document::~Document() { } - -} - -VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::FieldIdTList); -VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::IndexFieldMapT); diff --git a/streamingvisitors/src/vespa/vsm/common/document.h b/streamingvisitors/src/vespa/vsm/common/document.h deleted file mode 100644 index 8c11d27072b..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/document.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/searchlib/query/base.h> -#include <vespa/document/fieldvalue/fieldvalue.h> -#include <vespa/vespalib/stllike/hash_map.h> -#include <map> - -namespace vespalib { - class asciistream; -} - -namespace vsm { - -/// Type to identify fields in documents. -typedef unsigned int FieldIdT; -/// A type to represent a list of FieldIds. -typedef std::vector<FieldIdT> FieldIdTList; -/// A type to represent all the fields contained in all the indexs. -typedef vespalib::hash_map<vespalib::string, FieldIdTList> IndexFieldMapT; -/// A type to represent all the fields contained in all the indexs in an all the document types. -typedef vespalib::hash_map<vespalib::string, IndexFieldMapT> DocumentTypeIndexFieldMapT; -/// A type to represent a map from fieldname to fieldid. -typedef std::map<vespalib::string, FieldIdT> StringFieldIdTMapT; - -class StringFieldIdTMap -{ - public: - enum { npos=0xFFFFFFFF }; - StringFieldIdTMap(); - FieldIdT fieldNo(const vespalib::string & fName) const; - void add(const vespalib::string & s); - void add(const vespalib::string & s, FieldIdT fNo); - const StringFieldIdTMapT & map() const { return _map; } - size_t highestFieldNo() const; - friend vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f); - private: - StringFieldIdTMapT _map; -}; - -typedef vespalib::stringref FieldRef; - -/** - This is the base class representing a document. It gives a document some - basic properties. A document is a collection of fields, together with a - document id and a time stamp. -*/ -class Document -{ - public: - Document(size_t maxFieldCount) : _docId(0), _fieldCount(maxFieldCount) { } - Document(search::DocumentIdT doc, size_t maxFieldCount) : _docId(doc), _fieldCount(maxFieldCount) { } - virtual ~Document(); - const search::DocumentIdT & getDocId() const { return _docId; } - size_t getFieldCount() const { return _fieldCount; } - void setDocId(const search::DocumentIdT & v) { _docId = v; } - virtual const document::FieldValue * getField(FieldIdT fId) const = 0; - /** - Returns true, if not possible to set. - */ - virtual bool setField(FieldIdT fId, document::FieldValue::UP fv) = 0; - private: - search::DocumentIdT _docId; - const size_t _fieldCount; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp deleted file mode 100644 index 7886c44b2e0..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "documenttypemapping.h" -#include <vespa/document/repo/documenttyperepo.h> -#include <vespa/document/datatype/documenttype.h> -#include <vespa/vespalib/stllike/hash_map.hpp> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.common.documenttypemapping"); - -namespace vsm { - -DocumentTypeMapping::DocumentTypeMapping() : - _fieldMap(), - _defaultDocumentTypeName(), - _defaultDocumentType(), - _documentTypeFreq() -{ } - -DocumentTypeMapping::~DocumentTypeMapping() { } - -namespace { - -vespalib::string getDocTypeId(const document::DocumentType & docType) -{ - vespalib::string typeId(docType.getName()); - typeId += "0"; // Hardcoded version (version not supported) - return typeId; -} - -} - -void DocumentTypeMapping::init(const vespalib::string & defaultDocumentType, - const StringFieldIdTMapT & fieldList, - const document::DocumentTypeRepo &repo) -{ - _defaultDocumentType = repo.getDocumentType(defaultDocumentType); - _defaultDocumentTypeName = getDocTypeId(*_defaultDocumentType); - LOG(debug, "Setting default document type to '%s'", - _defaultDocumentTypeName.c_str()); - buildFieldMap(_defaultDocumentType, fieldList, _defaultDocumentTypeName); -} - -bool DocumentTypeMapping::prepareBaseDoc(SharedFieldPathMap & map) const -{ - FieldPathMapMapT::const_iterator found = _fieldMap.find(_defaultDocumentTypeName); - if (found != _fieldMap.end()) { - map = std::make_shared<FieldPathMapT>(found->second); - LOG(debug, "Found FieldPathMap for default document type '%s' with %zd elements", - _defaultDocumentTypeName.c_str(), map->size()); - } else { - LOG(warning, "No FieldPathMap found for default document type '%s'. Using empty one", - _defaultDocumentTypeName.c_str()); - map = std::make_shared<FieldPathMapT>(); - } - return true; -} - -void DocumentTypeMapping::buildFieldMap( - const document::DocumentType *docTypePtr, - const StringFieldIdTMapT & fieldList, const vespalib::string & typeId) -{ - LOG(debug, "buildFieldMap: docType = '%s', fieldList.size = '%zd', typeId = '%s'", - docTypePtr->getName().c_str(), fieldList.size(), typeId.c_str()); - const document::DocumentType & docType = *docTypePtr; - size_t highestFNo(0); - for (StringFieldIdTMapT::const_iterator it = fieldList.begin(), mt = fieldList.end(); it != mt; it++) { - highestFNo = std::max(highestFNo, size_t(it->second)); - } - highestFNo++; - FieldPathMapT & fieldMap = _fieldMap[typeId]; - - fieldMap.resize(highestFNo); - - size_t validCount(0); - for (StringFieldIdTMapT::const_iterator it = fieldList.begin(), mt = fieldList.end(); it != mt; it++) { - vespalib::string fname = it->first; - LOG(debug, "Handling %s -> %d", fname.c_str(), it->second); - try { - if ((it->first[0] != '[') && (it->first != "summaryfeatures") && (it->first != "rankfeatures") && (it->first != "ranklog") && (it->first != "sddocname") && (it->first != "documentid")) { - FieldPath fieldPath; - docType.buildFieldPath(fieldPath, fname); - fieldMap[it->second] = std::move(fieldPath); - validCount++; - LOG(spam, "Found %s -> %d in document", fname.c_str(), it->second); - } - } catch (const std::exception & e) { - LOG(debug, "Could not get field info for '%s' in documenttype '%s' (id = '%s') : %s", - it->first.c_str(), docType.getName().c_str(), typeId.c_str(), e.what()); - } - } - _documentTypeFreq.insert(std::make_pair(validCount, docTypePtr)); -} - -const document::DocumentType & DocumentTypeMapping::getCurrentDocumentType() const -{ - if (_documentTypeFreq.empty()) { - throw std::runtime_error("No document type registered yet."); - } - return *_documentTypeFreq.rbegin()->second; -} - - -} diff --git a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h deleted file mode 100644 index 607b40cec47..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/common/storagedocument.h> - -namespace document { class DocumentTypeRepo; } - -namespace vsm -{ - -class DocumentTypeMapping -{ -public: - DocumentTypeMapping(); - ~DocumentTypeMapping(); - - /** - * Prepares the given document by sharing the field info map - * registered for that document type. - **/ - bool prepareBaseDoc(SharedFieldPathMap & doc) const; - - /** - * Builds a field info map for all registered document types. - **/ - void init(const vespalib::string & defaultDocumentType, - const StringFieldIdTMapT & fieldList, - const document::DocumentTypeRepo &repo); - - const document::DocumentType & getCurrentDocumentType() const; - const vespalib::string & getDefaultDocumentTypeName() const - { return _defaultDocumentTypeName; } - const document::DocumentType *getDefaultDocumentType() const - { return _defaultDocumentType; } - -private: - /** - * Builds a field info map for the given type id. This is a - * mapping from field id to field path and field value for all - * field names in the given list based on the given document type. - **/ - void buildFieldMap(const document::DocumentType *docType, - const StringFieldIdTMapT & fieldList, - const vespalib::string & typeId); - typedef vespalib::hash_map<vespalib::string, FieldPathMapT> FieldPathMapMapT; - typedef std::multimap<size_t, const document::DocumentType *> DocumentTypeUsage; - FieldPathMapMapT _fieldMap; - vespalib::string _defaultDocumentTypeName; - const document::DocumentType *_defaultDocumentType; - DocumentTypeUsage _documentTypeFreq; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp deleted file mode 100644 index b39afd83b5a..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "fieldmodifier.h" -#include <vespa/vespalib/stllike/hash_map.hpp> - -namespace vsm { - -FieldModifierMap::FieldModifierMap() : - _map() -{ } - -FieldModifierMap::~FieldModifierMap() { } - -FieldModifier * -FieldModifierMap::getModifier(FieldIdT fId) const -{ - FieldModifierMapT::const_iterator itr = _map.find(fId); - if (itr == _map.end()) { - return NULL; - } - return itr->second.get(); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h deleted file mode 100644 index 60e480fa237..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/document/fieldvalue/fieldvalue.h> -#include <vespa/vsm/common/document.h> - -namespace vsm { - -/** - * Interface for classes that want to modify a field value. - **/ -class FieldModifier -{ -public: - typedef std::unique_ptr<FieldModifier> UP; - - /** - * Modifies the given field value and returns a new one. - **/ - virtual document::FieldValue::UP modify(const document::FieldValue & fv) = 0; - - /** - * Modifies the given field value and returns a new one. - * Use the given field path to iterate the field value. - **/ - virtual document::FieldValue::UP modify(const document::FieldValue & fv, - const document::FieldPath & path) = 0; - - virtual ~FieldModifier() { } -}; - -typedef vespalib::hash_map<FieldIdT, FieldModifier::UP> FieldModifierMapT; - -/** - * This class wraps a map from field id to field modifier. - **/ -class FieldModifierMap -{ -private: - FieldModifierMapT _map; - -public: - FieldModifierMap(); - ~FieldModifierMap(); - FieldModifierMapT & map() { return _map; } - const FieldModifierMapT & map() const { return _map; } - - /** - * Returns the modifier associated with the given field id or NULL if not found. - * - * @param fId the field id to look up. - * @return the field modifier or NULL if not found. - **/ - FieldModifier * getModifier(FieldIdT fId) const; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp b/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp deleted file mode 100644 index a0d666268f5..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "storagedocument.h" -#include <vespa/document/fieldvalue/arrayfieldvalue.h> -#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.storagedocument"); - -using NestedIterator = document::FieldValue::PathRange; - -namespace vsm { - -StorageDocument::StorageDocument(document::Document::UP doc, const SharedFieldPathMap & fim, size_t fieldNoLimit) : - Document(fieldNoLimit), - _doc(std::move(doc)), - _fieldMap(fim), - _cachedFields(getFieldCount()), - _backedFields() -{ } - -StorageDocument::~StorageDocument() { } - -namespace { - FieldPath _emptyFieldPath; - StorageDocument::SubDocument _empySubDocument(NULL, _emptyFieldPath.getFullRange()); -} - -const StorageDocument::SubDocument & -StorageDocument::getComplexField(FieldIdT fId) const -{ - if (_cachedFields[fId].getFieldValue() == NULL) { - const FieldPath & fp = (*_fieldMap)[fId]; - if ( ! fp.empty() ) { - const document::StructuredFieldValue * sfv = _doc.get(); - NestedIterator nested = fp.getFullRange(); - const document::FieldPathEntry& fvInfo = nested.cur(); - bool ok = sfv->getValue(fvInfo.getFieldRef(), fvInfo.getFieldValueToSet()); - if (ok) { - SubDocument tmp(&fvInfo.getFieldValueToSet(), nested.next()); - _cachedFields[fId].swap(tmp); - } - } else { - LOG(debug, "Failed getting field fId %d.", fId); - return _empySubDocument; - } - } - return _cachedFields[fId]; -} - -void StorageDocument::saveCachedFields() const -{ - size_t m(_cachedFields.size()); - _backedFields.reserve(m); - for (size_t i(0); i < m; i++) { - if (_cachedFields[i].getFieldValue() != 0) { - _backedFields.emplace_back(document::FieldValue::UP(_cachedFields[i].getFieldValue()->clone())); - _cachedFields[i].setFieldValue(_backedFields.back().get()); - } - } -} - -const document::FieldValue * -StorageDocument::getField(FieldIdT fId) const -{ - return getComplexField(fId).getFieldValue(); -} - -bool StorageDocument::setField(FieldIdT fId, document::FieldValue::UP fv) -{ - bool ok(fId < _cachedFields.size()); - if (ok) { - const FieldPath & fp = (*_fieldMap)[fId]; - SubDocument tmp(fv.get(), NestedIterator(fp.end(), fp.end())); - _cachedFields[fId].swap(tmp); - _backedFields.emplace_back(std::move(fv)); - } - return ok; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.h b/streamingvisitors/src/vespa/vsm/common/storagedocument.h deleted file mode 100644 index a7f21cb052f..00000000000 --- a/streamingvisitors/src/vespa/vsm/common/storagedocument.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "document.h" -#include <vespa/document/fieldvalue/document.h> - -namespace vsm { - -typedef vespalib::CloneablePtr<document::FieldValue> FieldValueContainer; -typedef document::FieldPath FieldPath; // field path to navigate a field value -typedef std::vector<FieldPath> FieldPathMapT; // map from field id to field path -typedef std::shared_ptr<FieldPathMapT> SharedFieldPathMap; - -class StorageDocument : public Document { -public: - typedef std::unique_ptr<StorageDocument> UP; - - class SubDocument { - public: - SubDocument() : _fieldValue(nullptr) {} - SubDocument(document::FieldValue *fv, document::FieldValue::PathRange nested) : - _fieldValue(fv), - _range(nested) - { } - - const document::FieldValue *getFieldValue() const { return _fieldValue; } - void setFieldValue(document::FieldValue *fv) { _fieldValue = fv; } - const document::FieldValue::PathRange & getRange() const { return _range; } - void swap(SubDocument &rhs) { - std::swap(_fieldValue, rhs._fieldValue); - std::swap(_range, rhs._range); - } - private: - FieldPath::const_iterator begin() const; - FieldPath::const_iterator end() const; - document::FieldValue *_fieldValue; - document::FieldValue::PathRange _range; - }; -public: - StorageDocument(document::Document::UP doc, const SharedFieldPathMap &fim, size_t fieldNoLimit); - StorageDocument(const StorageDocument &) = delete; - StorageDocument & operator = (const StorageDocument &) = delete; - ~StorageDocument(); - - const document::Document &docDoc() const { return *_doc; } - bool valid() const { return _doc.get() != nullptr; } - const SubDocument &getComplexField(FieldIdT fId) const; - const document::FieldValue *getField(FieldIdT fId) const override; - bool setField(FieldIdT fId, document::FieldValue::UP fv) override ; - void saveCachedFields() const; -private: - document::Document::UP _doc; - SharedFieldPathMap _fieldMap; - mutable std::vector<SubDocument> _cachedFields; - mutable std::vector<document::FieldValue::UP> _backedFields; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/config/.gitignore b/streamingvisitors/src/vespa/vsm/config/.gitignore deleted file mode 100644 index d58390943e2..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.depend -Makefile -config-*.cpp -config-*.h diff --git a/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt deleted file mode 100644 index fea0bafe6b2..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_library(vsm_vconfig OBJECT - SOURCES - DEPENDS -) -vespa_generate_config(vsm_vconfig vsmfields.def) -install_config_definition(vsmfields.def vespa.config.search.vsm.vsmfields.def) -vespa_generate_config(vsm_vconfig vsm.def) -install_config_definition(vsm.def vespa.config.search.vsm.vsm.def) -vespa_generate_config(vsm_vconfig vsmsummary.def) -install_config_definition(vsmsummary.def vespa.config.search.vsm.vsmsummary.def) diff --git a/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h b/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h deleted file mode 100644 index 22033aee232..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/config/config-vsmfields.h> -#include <vespa/vsm/config/config-vsm.h> -#include <vespa/vsm/config/config-vsmsummary.h> -#include <vespa/vespalib/util/ptrholder.h> - -using vespa::config::search::vsm::VsmConfig; -using vespa::config::search::vsm::VsmsummaryConfig; -using vespa::config::search::vsm::VsmfieldsConfig; - -namespace vsm { - -typedef vespalib::PtrHolder<VsmfieldsConfig> VsmfieldsHolder; -typedef std::shared_ptr<VsmfieldsConfig> VsmfieldsHandle; - -typedef vespalib::PtrHolder<VsmConfig> VsmHolder; -typedef std::shared_ptr<VsmConfig> VsmHandle; - -typedef vespalib::PtrHolder<VsmsummaryConfig> FastS_VsmsummaryHolder; -typedef std::shared_ptr<VsmsummaryConfig> FastS_VsmsummaryHandle; - -} - diff --git a/streamingvisitors/src/vespa/vsm/config/vsm.def b/streamingvisitors/src/vespa/vsm/config/vsm.def deleted file mode 100644 index 1971f9e9574..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/vsm.def +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -namespace=vespa.config.search.vsm - -## The document model for the documents used as input for the VSM -doctype reference - -## Configuration for storage client used by VSM -storagecfg reference - -## Config defining what search method should be applied to different -## fields in the documents. It also contains a mapping from index name -## to a set of fields making up that index. -vsmfields reference diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def deleted file mode 100644 index 5e943c9274d..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -namespace=vespa.config.search.vsm - -## Level of verification applied to the documents received. -documentverificationlevel int default=0 - -## Set if one should ignore limit hits. -searchall int default=1 - -## The name of a field for which we are assigning a search method. -## The field name refers directly to a field in the document model. -fieldspec[].name string - -## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected. -fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS } default=AUTOUTF8 -fieldspec[].arg1 string default="" - -## Maximum number of chars to search per field. -fieldspec[].maxlength int default=1048576 - -## Type of the field -fieldspec[].fieldtype enum {ATTRIBUTE, INDEX} default=INDEX - -## The name of a documenttype for which we are assigning a set of indexes. -documenttype[].name string -## The name of an index of a documenttype for which we are assigning a set of fields. -documenttype[].index[].name string - -## The name of a field part of an index. -## The field name refers directly to a field in the document model. -documenttype[].index[].field[].name string diff --git a/streamingvisitors/src/vespa/vsm/config/vsmsummary.def b/streamingvisitors/src/vespa/vsm/config/vsmsummary.def deleted file mode 100644 index 5eb96624826..00000000000 --- a/streamingvisitors/src/vespa/vsm/config/vsmsummary.def +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -namespace=vespa.config.search.vsm - -## The name of the result class that should be generated for documents -## returned from the VSM. If this value is empty, the first found -## result class will be used. -outputclass string default="" - -## Mapping of field names between the result class and the document -## model. This value represents the name in the result class. Fields -## not mentioned here will get the identity mapping. -fieldmap[].summary string - -## Mapping of field names between the result class and the document -## model. This field vector represents the names in the document model -## that should be used as input when generating the summary field. -fieldmap[].document[].field string - -## This command specifies how the document fields should be combined -## when generating the summary field. -fieldmap[].command enum { NONE, FLATTENJUNIPER, FLATTENSPACE } default=NONE diff --git a/streamingvisitors/src/vespa/vsm/searcher/.gitignore b/streamingvisitors/src/vespa/vsm/searcher/.gitignore deleted file mode 100644 index 95bc02923a9..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.exe -*.ilk -*.pdb -.depend* -Makefile diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt deleted file mode 100644 index 0a2a9ec21d2..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(SSE2_FILES "fold.cpp") -else() - unset(SSE2_FILES) -endif() - -vespa_add_library(vsm_vsmsearcher OBJECT - SOURCES - boolfieldsearcher.cpp - fieldsearcher.cpp - floatfieldsearcher.cpp - ${SSE2_FILES} - futf8strchrfieldsearcher.cpp - geo_pos_field_searcher.cpp - intfieldsearcher.cpp - strchrfieldsearcher.cpp - utf8flexiblestringfieldsearcher.cpp - utf8strchrfieldsearcher.cpp - utf8stringfieldsearcherbase.cpp - utf8substringsearcher.cpp - utf8substringsnippetmodifier.cpp - utf8suffixstringfieldsearcher.cpp - utf8exactstringfieldsearcher.cpp - DEPENDS - vsm_vconfig -) diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp deleted file mode 100644 index 8c9b556e593..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "boolfieldsearcher.h" -#include <vespa/document/fieldvalue/boolfieldvalue.h> - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -namespace { -vespalib::stringref TRUE = "true"; -vespalib::stringref FALSE = "false"; -} - -std::unique_ptr<FieldSearcher> -BoolFieldSearcher::duplicate() const -{ - return std::make_unique<BoolFieldSearcher>(*this); -} - -BoolFieldSearcher::BoolFieldSearcher(FieldIdT fId) : - FieldSearcher(fId), - _terms() -{ } - -BoolFieldSearcher::~BoolFieldSearcher() = default; - -void BoolFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) -{ - _terms.clear(); - FieldSearcher::prepare(qtl, buf); - for (const QueryTerm * qt : qtl) { - if (TRUE == qt->getTerm()) { - _terms.push_back(true); - } else if (FALSE == qt->getTerm()) { - _terms.push_back(false); - } else { - int64_t low; - int64_t high; - bool valid = qt->getAsIntegerTerm(low, high); - _terms.push_back(valid && (low > 0)); - } - } -} - -void BoolFieldSearcher::onValue(const document::FieldValue & fv) -{ - for(size_t j=0, jm(_terms.size()); j < jm; j++) { - if (static_cast<const document::BoolFieldValue &>(fv).getValue() == _terms[j]) { - addHit(*_qtl[j], 0); - } - } - ++_words; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h deleted file mode 100644 index f6afef9e507..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearcher.h" - -namespace vsm { - -class BoolFieldSearcher : public FieldSearcher -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - BoolFieldSearcher(FieldIdT fId); - ~BoolFieldSearcher(); - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; - void onValue(const document::FieldValue & fv) override; -private: - std::vector<bool> _terms; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp deleted file mode 100644 index e69999b160e..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "fieldsearcher.h" -#include <vespa/vsm/vsm/fieldsearchspec.h> -#include <vespa/document/fieldvalue/arrayfieldvalue.h> -#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.searcher.fieldsearcher"); - -using search::byte; -using search::streaming::Query; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; -using search::v16qi; - -namespace vsm { - -class force -{ - public: - force() { FieldSearcher::init(); } -}; - -static force __forceInit; - -byte FieldSearcher::_foldLowCase[256]; -byte FieldSearcher::_wordChar[256]; - -FieldSearcherBase::FieldSearcherBase() : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) -{ -} - -FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) : - _qtl(), - _qtlFastBuffer(), - _qtlFastSize(0), - _qtlFast(nullptr) -{ - prepare(org._qtl); -} - -FieldSearcherBase::~FieldSearcherBase() -{ -} - -FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org) -{ - if (this != &org) { - prepare(org._qtl); - } - return *this; -} - -void FieldSearcherBase::prepare(const QueryTermList & qtl) -{ - _qtl = qtl; - _qtlFastBuffer.resize(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13); - _qtlFast = reinterpret_cast<v16qi *>(reinterpret_cast<unsigned long>(&_qtlFastBuffer[0]+15) & ~0xf); - _qtlFastSize = 0; - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - const QueryTerm & qt = **it; - memcpy(&_qtlFast[_qtlFastSize++], qt.getTerm(), std::min(size_t(16), qt.termLen())); - } -} - -FieldSearcher::FieldSearcher(const FieldIdT & fId, bool defaultPrefix) : - FieldSearcherBase(), - _field(fId), - _matchType(defaultPrefix ? PREFIX : REGULAR), - _maxFieldLength(0x100000), - _currentElementId(0), - _currentElementWeight(1), - _pureUsAsciiCount(0), - _pureUsAsciiFieldCount(0), - _anyUtf8Count(0), - _anyUtf8FieldCount(0), - _words(0), - _badUtf8Count(0), - _zeroCount(0) -{ - zeroStat(); -} - -FieldSearcher::~FieldSearcher() = default; - -bool FieldSearcher::search(const StorageDocument & doc) -{ - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - QueryTerm::FieldInfo & fInfo = qt.getFieldInfo(field()); - fInfo.setHitOffset(qt.getHitList().size()); - } - onSearch(doc); - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - QueryTerm::FieldInfo & fInfo = qt.getFieldInfo(field()); - fInfo.setHitCount(qt.getHitList().size() - fInfo.getHitOffset()); - fInfo.setFieldLength(_words); - } - _words = 0; - return true; -} - -void FieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & UNUSED_PARAM(buf)) -{ - FieldSearcherBase::prepare(qtl); - prepareFieldId(); -} - -size_t FieldSearcher::countWords(const FieldRef & f) -{ - size_t words = 0; - const char * n = f.data(); - const char * e = n + f.size(); - for( ; n < e; ++n) { - for (; isspace(*n) && (n<e); ++n); - const char * m = n; - for (; iswordchar(*n) && (n<e); ++n); - if (n > m) { - words++; - } - } - return words; -} - -void FieldSearcher::prepareFieldId() -{ - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - qt.resizeFieldId(field()); - } -} - -void FieldSearcher::addStat(const FieldSearcher & toAdd) -{ - _pureUsAsciiCount += toAdd._pureUsAsciiCount; - _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount; - _anyUtf8Count += toAdd._anyUtf8Count; - _anyUtf8FieldCount += toAdd._anyUtf8FieldCount; - _badUtf8Count += toAdd._badUtf8Count; - _zeroCount += toAdd._zeroCount; - for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] += toAdd._utf8Count[i]; } -} - -void FieldSearcher::zeroStat() -{ - _pureUsAsciiCount = 0; - _pureUsAsciiFieldCount = 0; - _anyUtf8Count = 0; - _anyUtf8FieldCount = 0; - _badUtf8Count = 0; - _zeroCount = 0; - for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] = 0; } -} - -void FieldSearcher::init() -{ - for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { - _foldLowCase[i] = 0; - _wordChar[i] = 0; - } - for (int i = 'A'; i <= 'Z'; i++) { - _wordChar[i] = 0xFF; - _foldLowCase[i] = i | 0x20; - } - for (int i = 'a'; i <= 'z'; i++) { - _wordChar[i] = 0xFF; - _foldLowCase[i] = i; - } - for (int i = '0'; i <= '9'; i++) { - _wordChar[i] = 0xFF; - _foldLowCase[i] = i; - } - for (int i = 0xC0; i <= 0xFF; i++) { - _wordChar[i] = 0xFF; - } - _wordChar[0xd7] = 0; - _wordChar[0xf7] = 0; - - if (1) /* _doAccentRemoval */ { - _foldLowCase[0xc0] = 'a'; - _foldLowCase[0xc1] = 'a'; - _foldLowCase[0xc2] = 'a'; - _foldLowCase[0xc3] = 'a'; // A tilde - _foldLowCase[0xc7] = 'c'; - _foldLowCase[0xc8] = 'e'; - _foldLowCase[0xc9] = 'e'; - _foldLowCase[0xca] = 'e'; - _foldLowCase[0xcb] = 'e'; - _foldLowCase[0xcc] = 'i'; // I grave - _foldLowCase[0xcd] = 'i'; - _foldLowCase[0xce] = 'i'; - _foldLowCase[0xcf] = 'i'; - _foldLowCase[0xd3] = 'o'; - _foldLowCase[0xd4] = 'o'; - _foldLowCase[0xda] = 'u'; - _foldLowCase[0xdb] = 'u'; - - _foldLowCase[0xe0] = 'a'; - _foldLowCase[0xe1] = 'a'; - _foldLowCase[0xe2] = 'a'; - _foldLowCase[0xe3] = 'a'; // a tilde - _foldLowCase[0xe7] = 'c'; - _foldLowCase[0xe8] = 'e'; - _foldLowCase[0xe9] = 'e'; - _foldLowCase[0xea] = 'e'; - _foldLowCase[0xeb] = 'e'; - _foldLowCase[0xec] = 'i'; // i grave - _foldLowCase[0xed] = 'i'; - _foldLowCase[0xee] = 'i'; - _foldLowCase[0xef] = 'i'; - _foldLowCase[0xf3] = 'o'; - _foldLowCase[0xf4] = 'o'; - _foldLowCase[0xfa] = 'u'; - _foldLowCase[0xfb] = 'u'; - } -} - -void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT & difm, const SharedSearcherBuf & searcherBuf, Query & query) -{ - QueryTermList qtl; - query.getLeafs(qtl); - vespalib::string tmp; - for (FieldIdTSearcherMap::iterator it = begin(), mt = end(); it != mt; it++) { - QueryTermList onlyInIndex; - FieldIdT fid = (*it)->field(); - for (QueryTermList::iterator qt = qtl.begin(), mqt = qtl.end(); qt != mqt; qt++) { - QueryTerm * q = *qt; - for (DocumentTypeIndexFieldMapT::const_iterator dt(difm.begin()), dmt(difm.end()); dt != dmt; dt++) { - const IndexFieldMapT & fim = dt->second; - IndexFieldMapT::const_iterator found = fim.find(FieldSearchSpecMap::stripNonFields(q->index())); - if (found != fim.end()) { - const FieldIdTList & index = found->second; - if ((find(index.begin(), index.end(), fid) != index.end()) && (find(onlyInIndex.begin(), onlyInIndex.end(), q) == onlyInIndex.end())) { - onlyInIndex.push_back(q); - } - } else { - LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.", q->index().c_str()); - } - } - } - /// Should perhaps do a unique on onlyInIndex - (*it)->prepare(onlyInIndex, searcherBuf); - if (logger.wants(ns_log::Logger::spam)) { - char tmpBuf[16]; - sprintf(tmpBuf,"%d", fid); - tmp += tmpBuf; - tmp += ", "; - } - } - LOG(debug, "Will search in %s", tmp.c_str()); -} - -bool FieldSearcher::onSearch(const StorageDocument & doc) -{ - bool retval(true); - size_t fNo(field()); - const StorageDocument::SubDocument & sub = doc.getComplexField(fNo); - if (sub.getFieldValue() != nullptr) { - IteratorHandler ih(*this); - sub.getFieldValue()->iterateNested(sub.getRange(), ih); - } - return retval; -} - -void -FieldSearcher::IteratorHandler::onPrimitive(uint32_t, const Content & c) -{ - LOG(spam, "onPrimitive: field value '%s'", c.getValue().toString().c_str()); - _searcher.setCurrentWeight(c.getWeight()); - _searcher.setCurrentElementId(getArrayIndex()); - _searcher.onValue(c.getValue()); -} - -void -FieldSearcher::IteratorHandler::onCollectionStart(const Content & c) -{ - const document::FieldValue & fv = c.getValue(); - LOG(spam, "onCollectionStart: field value '%s'", fv.toString().c_str()); - if (fv.isA(document::FieldValue::Type::ARRAY)) { - const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(fv); - LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size()); - } else if (fv.isA(document::FieldValue::Type::WSET)) { - const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv); - LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size()); - } -} - -void -FieldSearcher::IteratorHandler::onStructStart(const Content & c) -{ - LOG(spam, "onStructStart: field value '%s'", c.getValue().toString().c_str()); - _searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue())); -} - - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h deleted file mode 100644 index 5c2ef8fec28..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/document/fieldvalue/iteratorhandler.h> -#include <vespa/searchlib/query/streaming/query.h> -#include <vespa/vsm/common/document.h> -#include <vespa/vsm/common/storagedocument.h> - -namespace vsm { - -typedef size_t termcount_t; -typedef size_t termsize_t; - -#if defined(COLLECT_CHAR_STAT) - #define NEED_CHAR_STAT(a) { a; } -#else - #define NEED_CHAR_STAT(a) -#endif - -typedef ucs4_t cmptype_t; -typedef vespalib::Array<cmptype_t> SearcherBuf; -typedef std::shared_ptr<SearcherBuf> SharedSearcherBuf; -typedef std::vector<char> CharVector; - -class FieldSearcherBase -{ -protected: - search::streaming::QueryTermList _qtl; -private: - CharVector _qtlFastBuffer; -protected: - FieldSearcherBase(); - FieldSearcherBase(const FieldSearcherBase & org); - virtual ~FieldSearcherBase(void); - FieldSearcherBase & operator = (const FieldSearcherBase & org); - void prepare(const search::streaming::QueryTermList & qtl); - size_t _qtlFastSize; - search::v16qi *_qtlFast; -}; - -class FieldSearcher : public FieldSearcherBase -{ -public: - enum MatchType { - REGULAR, - PREFIX, - SUBSTRING, - SUFFIX, - EXACT - }; - - FieldSearcher(const FieldIdT & fId, bool defaultPrefix=false); - ~FieldSearcher() override; - virtual std::unique_ptr<FieldSearcher> duplicate() const = 0; - bool search(const StorageDocument & doc); - virtual void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf); - const FieldIdT & field() const { return _field; } - void field(const FieldIdT & v) { _field = v; prepareFieldId(); } - bool prefix() const { return _matchType == PREFIX; } - bool substring() const { return _matchType == SUBSTRING; } - bool suffix() const { return _matchType == SUFFIX; } - bool exact() const { return _matchType == EXACT; } - void setMatchType(MatchType mt) { _matchType = mt; } - static void init(); - static search::byte fold(search::byte c) { return _foldLowCase[c]; } - static search::byte iswordchar(search::byte c) { return _wordChar[c]; } - static search::byte isspace(search::byte c) { return ! iswordchar(c); } - static size_t countWords(const FieldRef & f); - unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; } - unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; } - unsigned anyUtf8Count() const { return _anyUtf8Count; } - unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; } - unsigned badUtf8Count() const { return _badUtf8Count; } - unsigned zeroCount() const { return _zeroCount; } - unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; } - const unsigned * utf8Count() const { return _utf8Count; } - int32_t getCurrentWeight() const { return _currentElementWeight; } - void addStat(const FieldSearcher & toAdd); - void zeroStat(); - FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } - size_t maxFieldLength() const { return _maxFieldLength; } - -private: - class IteratorHandler : public document::fieldvalue::IteratorHandler { - private: - FieldSearcher & _searcher; - - void onPrimitive(uint32_t fid, const Content & c) override; - void onCollectionStart(const Content & c) override; - void onStructStart(const Content & c) override; - - public: - IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} - }; - friend class IteratorHandler; // to allow calls to onValue(); - - void prepareFieldId(); - void setCurrentWeight(int32_t weight) { _currentElementWeight = weight; } - void setCurrentElementId(int32_t weight) { _currentElementId = weight; } - bool onSearch(const StorageDocument & doc); - virtual void onValue(const document::FieldValue & fv) = 0; - virtual void onStructValue(const document::StructFieldValue &) { } - FieldIdT _field; - MatchType _matchType; - unsigned _maxFieldLength; - uint32_t _currentElementId; - int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. - /// Number of bytes in blocks containing pure us-ascii - unsigned _pureUsAsciiCount; - /// Number of blocks containing pure us-ascii - unsigned _pureUsAsciiFieldCount; - /// Number of bytes in blocks containing any non us-ascii - unsigned _anyUtf8Count; - /// Number of blocks containing any non us-ascii - unsigned _anyUtf8FieldCount; -protected: - /// Number of terms searched. - unsigned _words; - /// Number of utf8 bytes by utf8 size. - unsigned _utf8Count[6]; - unsigned _badUtf8Count; - unsigned _zeroCount; -protected: - void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; } - void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; } - /** - * Adds a hit to the given query term. - * For each call to onValue() a batch of words are processed, and the position is local to this batch. - **/ - void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); - } -public: - static search::byte _foldLowCase[256]; - static search::byte _wordChar[256]; -}; - -typedef std::unique_ptr<FieldSearcher> FieldSearcherContainer; -typedef std::vector<FieldSearcherContainer> FieldIdTSearcherMapT; - -class FieldIdTSearcherMap : public FieldIdTSearcherMapT -{ -public: - void prepare(const DocumentTypeIndexFieldMapT & difm, const SharedSearcherBuf & searcherBuf, search::streaming::Query & query); -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp deleted file mode 100644 index 02d8bd8c12a..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "floatfieldsearcher.h" - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -FloatFieldSearcher::duplicate() const -{ - return std::make_unique<FloatFieldSearcher>(*this); -} - -std::unique_ptr<FieldSearcher> -DoubleFieldSearcher::duplicate() const -{ - return std::make_unique<DoubleFieldSearcher>(*this); -} - -template<typename T> -FloatFieldSearcherT<T>::FloatFieldSearcherT(FieldIdT fId) : - FieldSearcher(fId), - _floatTerm() -{} - -template<typename T> -FloatFieldSearcherT<T>::~FloatFieldSearcherT() {} - -template<typename T> -void FloatFieldSearcherT<T>::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) -{ - _floatTerm.clear(); - FieldSearcher::prepare(qtl, buf); - for (QueryTermList::const_iterator it=qtl.begin(); it < qtl.end(); it++) { - const QueryTerm * qt = *it; - size_t sz(qt->termLen()); - if (sz) { - double low; - double high; - bool valid = qt->getAsDoubleTerm(low, high); - _floatTerm.push_back(FloatInfo(low, high, valid)); - } - } -} - - -template<typename T> -void FloatFieldSearcherT<T>::onValue(const document::FieldValue & fv) -{ - for(size_t j=0, jm(_floatTerm.size()); j < jm; j++) { - const FloatInfo & ii = _floatTerm[j]; - if (ii.valid() && (ii.cmp(fv.getAsDouble()))) { - addHit(*_qtl[j], 0); - } - } - ++_words; -} - -template<typename T> -bool FloatFieldSearcherT<T>::FloatInfo::cmp(T key) const -{ - return (_lower <= key) && (key <= _upper); -} - -template class FloatFieldSearcherT<float>; -template class FloatFieldSearcherT<double>; - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h deleted file mode 100644 index 98018fbf4a3..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearcher.h" - -namespace vsm { - -template <typename T> -class FloatFieldSearcherT : public FieldSearcher -{ -public: - FloatFieldSearcherT(FieldIdT fId=0); - ~FloatFieldSearcherT(); - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; - void onValue(const document::FieldValue & fv) override; -protected: - class FloatInfo - { - public: - FloatInfo(T low, T high, bool v) : _lower(low), _upper(high), _valid(v) { if (low > high) { _lower = high; _upper = low; } } - bool cmp(T key) const; - bool valid() const { return _valid; } - void setValid(bool v) { _valid = v; } - T getLow() const { return _lower; } - T getHigh() const { return _upper; } - private: - T _lower; - T _upper; - bool _valid; - }; - typedef std::vector<FloatInfo> FloatInfoListT; - FloatInfoListT _floatTerm; -}; - -typedef FloatFieldSearcherT<float> FloatFieldSearcherTF; -typedef FloatFieldSearcherT<double> FloatFieldSearcherTD; - -class FloatFieldSearcher : public FloatFieldSearcherTF -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { } -}; - -class DoubleFieldSearcher : public FloatFieldSearcherTD -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.cpp b/streamingvisitors/src/vespa/vsm/searcher/fold.cpp deleted file mode 100644 index bd2392d3ad6..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/fold.cpp +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// -#include "fold.h" - -namespace vsm { - -const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg) -{ - typedef char v16qi __attribute__ ((__vector_size__(16))); - typedef long long v2di __attribute__ ((__vector_size__(16))); - static v16qi _G_0 = { '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 }; - static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' }; - static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 }; - static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' }; - static v16qi _G_8bit = { (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, - (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4 }; - static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL }; - const v16qi *toFold = reinterpret_cast<const v16qi *>(toFoldOrg); - v2di * folded = reinterpret_cast<v2di *>(foldedOrg); - size_t i=0; - for (size_t m=sz/16; i < m; i++) - { -#ifndef __INTEL_COMPILER - int nonAscii = __builtin_ia32_pmovmskb128(toFold[i]); - if (nonAscii) - { -#ifdef __clang__ - v16qi non8Mask = _G_8bit > toFold[i]; -#else - v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, toFold[i]); -#endif - int non8bit = __builtin_ia32_pmovmskb128(non8Mask); - if (non8bit) - { - break; - } - break; - } -#ifdef __clang__ - v16qi _0 = toFold[i] > _G_0; - v16qi _z = toFold[i] > _G_z; - v2di _0_z = v2di(_0) ^ v2di(_z); - v2di toLow = _0_z & v2di(toFold[i]); - v16qi low = v16qi(toLow | _G_lowCase); - _0 = low > _G_0; - v16qi _9 = low > _G_9; - v16qi _a = low > _G_a; - _z = low > _G_z; - v2di _0_9_m = v2di(_0) ^ v2di(_9); - v2di _a_z_m = v2di(_a) ^ v2di(_z); - v2di _0_9 = _0_9_m & v2di(low); - v2di _a_z = _a_z_m & v2di(low); - folded[i] = _0_9 | _a_z; -#else - v16qi _0 = __builtin_ia32_pcmpgtb128(toFold[i], _G_0); - v16qi _z = __builtin_ia32_pcmpgtb128(toFold[i], _G_z); - v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z)); - v2di toLow = __builtin_ia32_pand128(_0_z, v2di(toFold[i])); - v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase)); - _0 = __builtin_ia32_pcmpgtb128(low, _G_0); - v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9); - v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a); - _z = __builtin_ia32_pcmpgtb128(low, _G_z); - v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9)); - v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z)); - v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); - v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); - folded[i] = __builtin_ia32_por128(_0_9, _a_z); -#endif -#else -# warning "Intel's icc compiler does not like __builtin_ia32_pxor128" - LOG_ABORT("should not be reached"); -#endif - } - return toFoldOrg+i*16; -} - -const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg) -{ - typedef char v16qi __attribute__ ((__vector_size__(16))); - typedef long long v2di __attribute__ ((__vector_size__(16))); - static v16qi _G_0 = { '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 }; - static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' }; - static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 }; - static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' }; - static v16qi _G_8bit = { (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, - (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4 }; - static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL }; - v2di * folded = reinterpret_cast<v2di *>(foldedOrg); - size_t i=0; - for (size_t m=sz/16; i < m; i++) - { -#ifndef __INTEL_COMPILER -#ifdef __clang__ - v16qi current = __builtin_ia32_lddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); -#else - v16qi current = __builtin_ia32_loaddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); -#endif - int nonAscii = __builtin_ia32_pmovmskb128(current); - if (nonAscii) - { -#ifdef __clang__ - v16qi non8Mask = _G_8bit > current; -#else - v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, current); -#endif - int non8bit = __builtin_ia32_pmovmskb128(non8Mask); - if (non8bit) - { - break; - } - break; - } -#ifdef __clang__ - v16qi _0 = current > _G_0; - v16qi _z = current > _G_z; - v2di _0_z = v2di(_0) ^ v2di(_z); - v2di toLow = _0_z & v2di(current); - v16qi low = v16qi(toLow | _G_lowCase); - _0 = low > _G_0; - v16qi _9 = low > _G_9; - v16qi _a = low > _G_a; - _z = low > _G_z; - v2di _0_9_m = v2di(_0) ^ v2di(_9); - v2di _a_z_m = v2di(_a) ^ v2di(_z); - v2di _0_9 = _0_9_m & v2di(low); - v2di _a_z = _a_z_m & v2di(low); - folded[i] = _0_9 | _a_z; -#else - v16qi _0 = __builtin_ia32_pcmpgtb128(current, _G_0); - v16qi _z = __builtin_ia32_pcmpgtb128(current, _G_z); - v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z)); - v2di toLow = __builtin_ia32_pand128(_0_z, v2di(current)); - v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase)); - _0 = __builtin_ia32_pcmpgtb128(low, _G_0); - v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9); - v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a); - _z = __builtin_ia32_pcmpgtb128(low, _G_z); - v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9)); - v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z)); - v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); - v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); - folded[i] = __builtin_ia32_por128(_0_9, _a_z); -#endif -#else -# warning "Intel's icc compiler does not like __builtin_ia32_pxor128" - LOG_ABORT("should not be reached"); -#endif - } - return toFoldOrg+i*16; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.h b/streamingvisitors/src/vespa/vsm/searcher/fold.h deleted file mode 100644 index 578b883484f..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/fold.h +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/common/document.h> - -namespace vsm { - -const search::byte * sse2_foldaa(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg); -const search::byte * sse2_foldua(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg); - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp deleted file mode 100644 index fc5d77de419..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "futf8strchrfieldsearcher.h" -#ifdef __x86_64__ -#include "fold.h" -#endif -#include <vespa/vespalib/util/size_literals.h> - -using search::byte; -using search::streaming::QueryTerm; -using search::v16qi; -using vespalib::Optimized; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -FUTF8StrChrFieldSearcher::duplicate() const -{ - return std::make_unique<FUTF8StrChrFieldSearcher>(*this); -} - -FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher() - : UTF8StrChrFieldSearcher(), - _folded(4_Ki) -{ } -FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId) - : UTF8StrChrFieldSearcher(fId), - _folded(4_Ki) -{ } -FUTF8StrChrFieldSearcher::~FUTF8StrChrFieldSearcher() {} - -bool -FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded) -{ - bool retval(true); - for(size_t i=0; i < sz; i++) { - byte c = toFold[i]; - if (c>=128) { retval = false; break; } - folded[i] = FieldSearcher::_foldLowCase[c]; - } - return retval; -} - -bool -FUTF8StrChrFieldSearcher::lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart) -{ - unalignedStart = (size_t(toFold) & 0xF); -#ifdef __x86_64__ - bool retval(true); - size_t unalignedsz = std::min(sz, (16 - unalignedStart) & 0xF); - - size_t foldedUnaligned = (size_t(folded) & 0xF); - unalignedStart = (foldedUnaligned < unalignedStart) ? (unalignedStart-foldedUnaligned) : unalignedStart + 16 - foldedUnaligned; - size_t alignedStart = unalignedStart+unalignedsz; - - size_t alignedsz = sz - unalignedsz; - size_t alignsz16 = alignedsz & 0xFFFFFFF0; - size_t rest = alignedsz - alignsz16; - - if (unalignedStart) { - retval = ansiFold(toFold, unalignedsz, folded + unalignedStart); - } - if (alignsz16 && retval) { - const byte * end = sse2_foldaa(reinterpret_cast<const byte *>(toFold+unalignedsz), alignsz16, reinterpret_cast<byte *>(folded+alignedStart)); - retval = (end == reinterpret_cast<const byte *>(toFold+unalignedsz+alignsz16)); - } - if(rest && retval) { - retval = ansiFold(toFold + unalignedsz + alignsz16, rest, folded+alignedStart+alignsz16); - } - return retval; -#else - return ansiFold(toFold, sz, folded + unalignedStart); -#endif -} - -bool -FUTF8StrChrFieldSearcher::lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart) -{ - alignedStart = 0xF - (size_t(folded + 0xF) % 0x10); -#ifdef __x86_64__ - bool retval(true); - - size_t alignsz16 = sz & 0xFFFFFFF0; - size_t rest = sz - alignsz16; - - if (alignsz16) { - const byte * end = sse2_foldua(reinterpret_cast<const byte *>(toFold), alignsz16, reinterpret_cast<byte *>(folded+alignedStart)); - retval = (end == reinterpret_cast<const byte *>(toFold+alignsz16)); - } - if(rest && retval) { - retval = ansiFold(toFold + alignsz16, rest, folded+alignedStart+alignsz16); - } - return retval; -#else - return ansiFold(toFold, sz, folded + alignedStart); -#endif -} - -namespace { - -#ifdef __x86_64__ -inline const char * advance(const char * n, const v16qi zero) -{ - uint32_t charMap = 0; - unsigned zeroCountSum = 0; - do { // find first '\0' character (the end of the word) -#ifndef __INTEL_COMPILER -#ifdef __clang__ - v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); - v16qi tmp0 = tmpCurrent == zero; -#else - v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); - v16qi tmp0 = __builtin_ia32_pcmpeqb128(tmpCurrent, reinterpret_cast<v16qi>(zero)); -#endif - charMap = __builtin_ia32_pmovmskb128(tmp0); // 1 in charMap equals to '\0' in input buffer -#else -# warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" - LOG_ABORT("should not be reached"); -#endif - zeroCountSum += 16; - } while (!charMap); - int charCount = Optimized::lsbIdx(charMap); // number of word characters in last 16 bytes - uint32_t zeroMap = ((~charMap) & 0xffff) >> charCount; - - int zeroCounter = Optimized::lsbIdx(zeroMap); // number of non-characters ('\0') in last 16 bytes - int sum = zeroCountSum - 16 + charCount + zeroCounter; - if (!zeroMap) { // only '\0' in last 16 bytes (no new word found) - do { // find first word character (the next word) -#ifndef __INTEL_COMPILER -#ifdef __clang__ - v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); - tmpCurrent = tmpCurrent > zero; -#else - v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); - tmpCurrent = __builtin_ia32_pcmpgtb128(tmpCurrent, reinterpret_cast<v16qi>(zero)); -#endif - zeroMap = __builtin_ia32_pmovmskb128(tmpCurrent); // 1 in zeroMap equals to word character in input buffer -#else -# warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" - LOG_ABORT("should not be reached"); -#endif - zeroCountSum += 16; - } while(!zeroMap); - zeroCounter = Optimized::lsbIdx(zeroMap); - sum = zeroCountSum - 16 + zeroCounter; - } - return n + sum; -} -#else -inline const char* advance(const char* n) -{ - const char* p = n; - const char* zero = static_cast<const char *>(memchr(p, 0, 64_Ki)); - while (zero == nullptr) { - p += 64_Ki; - zero = static_cast<const char *>(memchr(p, 0, 64_Ki)); - } - p = zero; - while (*p == '\0') { - ++p; - } - return p; -} -#endif - -} - -size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, QueryTerm & qt) -{ -#ifdef __x86_64__ - const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -#endif - termcount_t words(0); - const char * term; - termsize_t tsz = qt.term(term); - const char *et=term+tsz; - const char * n = folded; - const char *e = n + sz; - - while (!*n) n++; - while (true) { - if (n>=e) break; - -#if 0 - v16qi current = __builtin_ia32_loaddqu(n); - current = __builtin_ia32_pcmpeqb128(current, _qtlFast[0]); - unsigned eqMap = __builtin_ia32_pmovmskb128(current); - unsigned neqMap = ~eqMap; - unsigned numEq = Optimized::lsbIdx(neqMap); - /* if (eqMap)*/ { - if (numEq >= 16) { - const char *tt = term+16; - const char *p = n+16; - while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; } - } - if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) { - addHit(qt, words); - } - } -#else - const char *tt = term; - while ((tt < et) && (*tt == *n)) { tt++; n++; } - if ((tt == et) && (prefix() || qt.isPrefix() || !*n)) { - addHit(qt, words); - } -#endif - words++; -#ifdef __x86_64__ - n = advance(n, _G_zero); -#else - n = advance(n); -#endif - } - return words; -} - -size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, size_t mintsz, QueryTerm ** qtl, size_t qtlSize) -{ - (void) mintsz; -#ifdef __x86_64__ - const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -#endif - termcount_t words(0); - const char * n = folded; - const char *e = n + sz; - while (!*n) n++; - for( ; ; ) { - if (n>=e) break; -#if 0 - v16qi current = __builtin_ia32_loaddqu(n); - for(size_t i=0; i < qtlSize; i++) { - v16qi tmpEq = __builtin_ia32_pcmpeqb128(current, _qtlFast[i]); - unsigned eqMap = __builtin_ia32_pmovmskb128(tmpEq); - /* if (eqMap) */ { - QueryTerm & qt = *qtl[i]; - unsigned neqMap = ~eqMap; - unsigned numEq = Optimized::lsbIdx(neqMap); - termsize_t tsz = qt.termLen(); - if (numEq >= 16) { - const char *tt = qt.term() + 16; - const char *et=tt+tsz; - const char *p = n+16; - while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; } - } - if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) { - addHit(qt, words); - } - } - } -#else - for(QueryTerm ** it=qtl, ** mt=qtl+qtlSize; it != mt; it++) { - QueryTerm & qt = **it; - const char * term; - termsize_t tsz = qt.term(term); - - const char *et=term+tsz; - const char *fnt; - for (fnt = n; (term < et) && (*term == *fnt); term++, fnt++); - if ((term == et) && (prefix() || qt.isPrefix() || !*fnt)) { - addHit(qt, words); - } - } -#endif - words++; -#ifdef __x86_64__ - n = advance(n, _G_zero); -#else - n = advance(n); -#endif - } - return words; -} - -size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - _folded.reserve(f.size()+16*3); //Enable fulle xmm0 store - size_t unalignedStart(0); - bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart); - if (ascii7Bit) { - char * folded = &_folded[unalignedStart]; - /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge. - folded[f.size()] = 0; - folded[f.size()+1] = 0x01; - memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values - return match(folded, f.size(), qt); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); - } else { - return UTF8StrChrFieldSearcher::matchTerm(f, qt); - } -} - -size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - _folded.reserve(f.size()+16*3); //Enable fulle xmm0 store - size_t unalignedStart(0); - bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart); - if (ascii7Bit) { - char * folded = &_folded[unalignedStart]; - /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge. - folded[f.size()] = 0; - folded[f.size()+1] = 0x01; - memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values - return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size()); - NEED_CHAR_STAT(addPureUsAsciiField(f.size())); - } else { - return UTF8StrChrFieldSearcher::matchTerms(f, mintsz); - } -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h deleted file mode 100644 index 900ab4c9120..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "utf8strchrfieldsearcher.h" - -namespace vsm { - -class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - FUTF8StrChrFieldSearcher(); - FUTF8StrChrFieldSearcher(FieldIdT fId); - ~FUTF8StrChrFieldSearcher(); - static bool ansiFold(const char * toFold, size_t sz, char * folded); - static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart); - static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart); - private: - size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef&, const size_t shortestTerm) override; - virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt); - size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize); - std::vector<char> _folded; -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp deleted file mode 100644 index db93bda7778..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "geo_pos_field_searcher.h" -#include <vespa/document/fieldvalue/arrayfieldvalue.h> -#include <vespa/document/fieldvalue/structfieldvalue.h> -#include <vespa/searchlib/common/geo_location_parser.h> -#include <vespa/vespalib/util/issue.h> -#include <vespa/vespalib/util/exception.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.searcher.geo_pos_field_searcher"); - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; -using search::common::GeoLocation; -using search::common::GeoLocationParser; - -namespace vsm { - -std::unique_ptr<FieldSearcher> GeoPosFieldSearcher::duplicate() const { - return std::make_unique<GeoPosFieldSearcher>(*this); -} - -GeoPosFieldSearcher::GeoPosFieldSearcher(FieldIdT fId) : - FieldSearcher(fId), - _geoPosTerm() -{} - -GeoPosFieldSearcher::~GeoPosFieldSearcher() {} - -void GeoPosFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) { - _geoPosTerm.clear(); - FieldSearcher::prepare(qtl, buf); - for (const QueryTerm * qt : qtl) { - const vespalib::string & str = qt->getTermString(); - GeoLocationParser parser; - bool valid = parser.parseNoField(str); - if (! valid) { - vespalib::Issue::report("invalid position in term: %s", str.c_str()); - } - _geoPosTerm.emplace_back(parser.getGeoLocation()); - } -} - -void GeoPosFieldSearcher::onValue(const document::FieldValue & fv) { - LOG(spam, "ignore field value '%s'", fv.toString().c_str()); -} - -void GeoPosFieldSearcher::onStructValue(const document::StructFieldValue & fv) { - size_t num_terms = _geoPosTerm.size(); - for (size_t j = 0; j < num_terms; ++j) { - const GeoPosInfo & gpi = _geoPosTerm[j]; - if (gpi.valid() && gpi.cmp(fv)) { - addHit(*_qtl[j], 0); - } - } - ++_words; -} - -bool GeoPosFieldSearcher::GeoPosInfo::cmp(const document::StructFieldValue & sfv) const { - try { - auto xv = sfv.getValue("x"); - auto yv = sfv.getValue("y"); - if (xv && yv) { - int32_t x = xv->getAsInt(); - int32_t y = yv->getAsInt(); - GeoLocation::Point p{x,y}; - if (inside_limit(p)) { - return true; - } - } - } catch (const vespalib::Exception &e) { - vespalib::Issue::report("bad fieldvalue for GeoPosFieldSearcher: %s", e.getMessage().c_str()); - } - return false; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h deleted file mode 100644 index ef1c5b5a1c4..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearcher.h" -#include <vespa/searchlib/common/geo_location.h> - -namespace vsm { - -class GeoPosFieldSearcher : public FieldSearcher { -public: - GeoPosFieldSearcher(FieldIdT fId=0); - ~GeoPosFieldSearcher(); - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; - void onValue(const document::FieldValue & fv) override; - void onStructValue(const document::StructFieldValue & fv) override; - std::unique_ptr<FieldSearcher> duplicate() const override; -protected: - using GeoLocation = search::common::GeoLocation; - class GeoPosInfo : public GeoLocation { - public: - GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} - bool cmp(const document::StructFieldValue & fv) const; - }; - typedef std::vector<GeoPosInfo> GeoPosInfoListT; - GeoPosInfoListT _geoPosTerm; -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp deleted file mode 100644 index 8cfb8e6df14..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "intfieldsearcher.h" - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -IntFieldSearcher::duplicate() const -{ - return std::make_unique<IntFieldSearcher>(*this); -} - -IntFieldSearcher::IntFieldSearcher(FieldIdT fId) : - FieldSearcher(fId), - _intTerm() -{ } - -IntFieldSearcher::~IntFieldSearcher() = default; - -void IntFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) -{ - _intTerm.clear(); - FieldSearcher::prepare(qtl, buf); - for (QueryTermList::const_iterator it=qtl.begin(); it < qtl.end(); it++) { - const QueryTerm * qt = *it; - size_t sz(qt->termLen()); - if (sz) { - int64_t low; - int64_t high; - bool valid = qt->getAsIntegerTerm(low, high); - _intTerm.push_back(IntInfo(low, high, valid)); - } - } -} - -void IntFieldSearcher::onValue(const document::FieldValue & fv) -{ - for(size_t j=0, jm(_intTerm.size()); j < jm; j++) { - const IntInfo & ii = _intTerm[j]; - if (ii.valid() && (ii.cmp(fv.getAsLong()))) { - addHit(*_qtl[j], 0); - } - } - ++_words; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h deleted file mode 100644 index a2b17a87f4b..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearcher.h" - -namespace vsm { - -class IntFieldSearcher : public FieldSearcher -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - IntFieldSearcher(FieldIdT fId=0); - ~IntFieldSearcher(); - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; - void onValue(const document::FieldValue & fv) override; -protected: - class IntInfo - { - public: - IntInfo(int64_t low, int64_t high, bool v) : _lower(low), _upper(high), _valid(v) { if (low > high) { _lower = high; _upper = low; } } - bool cmp(int64_t key) const { return (_lower <= key) && (key <= _upper); } - bool valid() const { return _valid; } - private: - int64_t _lower; - int64_t _upper; - bool _valid; - }; - typedef std::vector<IntInfo> IntInfoListT; - IntInfoListT _intTerm; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp deleted file mode 100644 index 1c4ff78ff4a..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "strchrfieldsearcher.h" -#include <vespa/document/fieldvalue/stringfieldvalue.h> - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -void StrChrFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) -{ - FieldSearcher::prepare(qtl, buf); -} - -void StrChrFieldSearcher::onValue(const document::FieldValue & fv) -{ - const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv); - vespalib::stringref val = sfv.getValueRef(); - FieldRef fr(val.data(), std::min(maxFieldLength(), val.size())); - matchDoc(fr); -} - -bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) -{ - bool retval(true); - if (_qtl.size() > 1) { - size_t mintsz = shortestTerm(); - if (fieldRef.size() >= mintsz) { - _words += matchTerms(fieldRef, mintsz); - } else { - _words += countWords(fieldRef); - } - } else { - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - if (fieldRef.size() >= qt.termLen()) { - _words += matchTerm(fieldRef, qt); - } else { - _words += countWords(fieldRef); - } - } - } - return retval; -} - -size_t StrChrFieldSearcher::shortestTerm() const -{ - size_t mintsz(_qtl.front()->termLen()); - for(QueryTermList::const_iterator it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) { - const QueryTerm & qt = **it; - mintsz = std::min(mintsz, qt.termLen()); - } - return mintsz; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h deleted file mode 100644 index 0155c79cddf..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearcher.h" - -namespace vsm { - -class StrChrFieldSearcher : public FieldSearcher -{ -public: - StrChrFieldSearcher() : FieldSearcher(0) { } - StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } - void onValue(const document::FieldValue & fv) override; - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; -private: - size_t shortestTerm() const; - bool matchDoc(const FieldRef & field); - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0; -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp deleted file mode 100644 index 977602a691c..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "utf8exactstringfieldsearcher.h" - -using search::byte; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8ExactStringFieldSearcher::duplicate() const -{ - return std::make_unique<UTF8ExactStringFieldSearcher>(*this); -} - -size_t -UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - (void) mintsz; - for (QueryTermList::iterator it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) { - QueryTerm & qt = **it; - matchTermExact(f, qt); - } - return 1; -} - -size_t -UTF8ExactStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - return matchTermExact(f, qt); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h deleted file mode 100644 index 744974a6cf6..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> - -namespace vsm -{ - -/** - * This class does suffix utf8 searches. - **/ -class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase -{ -protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; - -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp deleted file mode 100644 index 9aef99f9fa1..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "utf8flexiblestringfieldsearcher.h" - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher"); - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8FlexibleStringFieldSearcher::duplicate() const -{ - return std::make_unique<UTF8FlexibleStringFieldSearcher>(*this); -} - -size_t -UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - (void) mintsz; - size_t words = 0; - for (QueryTermList::iterator it = _qtl.begin(); it != _qtl.end(); ++it) { - words = matchTerm(f, **it); - } - return words; -} - -size_t -UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - if (qt.isPrefix()) { - LOG(debug, "Use prefix match for prefix term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermRegular(f, qt); - } else if (qt.isSubstring()) { - LOG(debug, "Use substring match for substring term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermSubstring(f, qt); - } else if (qt.isSuffix()) { - LOG(debug, "Use suffix match for suffix term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermSuffix(f, qt); - } else if (qt.isExactstring()) { - LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermExact(f, qt); - } else { - if (substring()) { - LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermSubstring(f, qt); - } else if (suffix()) { - LOG(debug, "Use suffix match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermSuffix(f, qt); - } else if (exact()) { - LOG(debug, "Use exact match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermExact(f, qt); - } else { - LOG(debug, "Use regular/prefix match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); - return matchTermRegular(f, qt); - } - } -} - -UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() : - UTF8StringFieldSearcherBase() -{ } - -UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) : - UTF8StringFieldSearcherBase(fId) -{ } - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h deleted file mode 100644 index 63931af0036..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> - -namespace vsm -{ - -/** - * This class does utf8 searches based on the query term type. - * It will choose between regular search strategy (including prefix) and substring search strategy. - **/ -class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase -{ -private: - /** - * Tries to match the given query term against the content of the given field reference. - * Search strategy is choosen based on the query term type. - **/ - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - - /** - * Tries to match each query term in the underlying query against the content of the given field reference. - * Search strategy is choosen based on the query term type. - **/ - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; - -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8FlexibleStringFieldSearcher(); - UTF8FlexibleStringFieldSearcher(FieldIdT fId); -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp deleted file mode 100644 index 0d93009655c..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "utf8strchrfieldsearcher.h" - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; -using search::byte; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8StrChrFieldSearcher::duplicate() const -{ - return std::make_unique<UTF8StrChrFieldSearcher>(*this); -} - -size_t -UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - (void) mintsz; - termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - const byte * e = n + f.size(); - if (f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - const cmptype_t * term; - termsize_t tsz = qt.term(term); - if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { - const cmptype_t *tt=term, *et=term+tsz; - for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); - if (tt == et) { - addHit(qt, words); - } - } - } - words++; - } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); - return words; -} - -size_t -UTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - return matchTermRegular(f, qt); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h deleted file mode 100644 index 1687a1a18c0..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "utf8stringfieldsearcherbase.h" - -namespace vsm { - -/** - * This class does normal utf8 searches. - * This class uses an highly optimized version of the tokenize method in fastlib. - **/ -class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } - -protected: - size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp deleted file mode 100644 index 148cdf2c0c3..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "utf8stringfieldsearcherbase.h" -#include <cassert> - -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; -using search::byte; - -namespace vsm { - -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(_isWord[c], false)) { - *q++ = _foldCase[c]; - c = 0; - } else { - c = *p; - } - } else { - const byte * oldP(p); - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - _utf8Count[p-oldP-1]++; - const char *repl = ReplacementString(c); - if (repl != NULL) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = ToFold(c); - *q++ = c; - } - break; - } else { - if (c == _BadUTF8Char) { - _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!_isWord[c], false)) { - c = 0; - } else { - *q++ = _foldCase[c]; - c = *p; - } - } else { - const byte * oldP(p); - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - _utf8Count[p-oldP-1]++; - const char *repl = ReplacementString(c); - if (repl != NULL) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = ToFold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == _BadUTF8Char) { - _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; - } - break; - } - } - } - *q = 0; - tokenlen = q - dstbuf; - return p; -} - -size_t -UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) -{ - termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - // __builtin_prefetch(n, 0, 0); - const cmptype_t * term; - termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); - if ( f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); - if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { - const cmptype_t *tt=term, *et=term+tsz; - for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); - if (tt == et) { - addHit(qt, words); - } - } - words++; - } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); - return words; -} - -size_t -UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt) -{ - const byte * n = reinterpret_cast<const byte *> (f.data()); - const cmptype_t * term; - termsize_t tsz = qt.term(term); - const cmptype_t * eterm = term+tsz; - const byte * e = n + f.size(); - if (tsz <= f.size()) { - bool equal(true); - for (; equal && (n < e) && (term < eterm); term++) { - if (*term < 0x80) { - equal = (*term == _foldCase[*n++]); - } else { - cmptype_t c = ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n)); - equal = (*term == c); - } - } - if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) { - addHit(qt,0); - } - } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); - return 1; -} - -size_t -UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & qt) -{ - if (qt.termLen() == 0) { return 0; } - const byte * n = reinterpret_cast<const byte *> (f.data()); - const cmptype_t * term; - termsize_t tsz = qt.term(term); - if ( f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * fntemp = &(*_buf.get())[0]; - BufferWrapper wrapper(fntemp); - size_t fl = skipSeparators(n, f.size(), wrapper); - const cmptype_t * fn(fntemp); - const cmptype_t * fe = fn + fl; - const cmptype_t * fre = fe - tsz; - termcount_t words(0); - for(words = 0; fn <= fre; ) { - const cmptype_t *tt=term, *et=term+tsz, *fnt=fn; - for (; (tt < et) && (*tt == *fnt); tt++, fnt++); - if (tt == et) { - fn = fnt; - addHit(qt, words); - } else { - if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) { - words++; - for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn) ; fn++ ); - } - } - } - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); - return words + 1; // we must also count the last word -} - -size_t -UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) -{ - termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); - const cmptype_t * term; - termsize_t tsz = qt.term(term); - if (f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); - if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { - addHit(qt, words); - } - words++; - } - return words; -} - -UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() : - StrChrFieldSearcher(), - Fast_NormalizeWordFolder(), - Fast_UnicodeUtil() -{ -} - -UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) : - StrChrFieldSearcher(fId), - Fast_NormalizeWordFolder(), - Fast_UnicodeUtil() -{ -} - -UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() {} - -void -UTF8StringFieldSearcherBase::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) -{ - StrChrFieldSearcher::prepare(qtl, buf); - _buf = buf; -} - -bool -UTF8StringFieldSearcherBase::matchTermSuffix(const cmptype_t * term, size_t termlen, - const cmptype_t * word, size_t wordlen) -{ - if ((termlen <= wordlen)) { - const cmptype_t * titr = term + termlen - 1; - const cmptype_t * witr = word + wordlen - 1; - bool hit = true; - // traverse the term and the word back to front - for (; titr >= term; --titr, --witr) { - if (*titr != *witr) { - hit = false; - break; - } - } - return hit; - } - return false; -} - -bool -UTF8StringFieldSearcherBase::isSeparatorCharacter(ucs4_t c) -{ - return ((c < 0x20) && (c != '\n') && (c != '\t')); -} - -template <typename T> -size_t -UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T & dstbuf) { - const search::byte * e(p+sz); - const search::byte * b(p); - - for(; p < e; ) { - ucs4_t c(*p); - const search::byte * oldP(p); - if (c < 128) { - p++; - if (!isSeparatorCharacter(c)) { - dstbuf.onCharacter(_foldCase[c], (oldP - b)); - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - const char *repl = ReplacementString(c); - if (repl != NULL) { - size_t repllen = strlen(repl); - if (repllen > 0) { - ucs4_t * buf = dstbuf.getBuf(); - ucs4_t * newBuf = Fast_UnicodeUtil::ucs4copy(buf, repl); - if (dstbuf.hasOffsets()) { - for (; buf < newBuf; ++buf) { - dstbuf.incBuf(1); - dstbuf.onOffset(oldP - b); - } - } else { - dstbuf.incBuf(newBuf - buf); - } - } - } else { - c = ToFold(c); - dstbuf.onCharacter(c, (oldP - b)); - } - if (c == _BadUTF8Char) { - _badUtf8Count++; - } else { - _utf8Count[p-oldP-1]++; - } - } - } - assert(dstbuf.valid()); - return dstbuf.size(); -} - -template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::BufferWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::BufferWrapper&); -template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::OffsetWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::OffsetWrapper&); - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h deleted file mode 100644 index f540a7ac457..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "strchrfieldsearcher.h" -#include <vespa/fastlib/text/normwordfolder.h> - -namespace vsm { - -/** - * This class is the base class for all utf8 string searchers. - * It contains utility functions used by the other searchers. - * As normal the prepare method is called - * after the query is built. A SharedSearcherBuf is used given to it. This is a - * buffer that is shared among all searchers that are run in the same context. - * Reuse of this buffer ensures better cache hit ratio because this is just a - * scratchpad for tokenizing. It will grow till the max size and stay there. - **/ -class UTF8StringFieldSearcherBase : public StrChrFieldSearcher, protected Fast_NormalizeWordFolder, public Fast_UnicodeUtil -{ -public: - /** - * Template class that wraps an ucs4 buffer. - * Used when invoking skipSeparators() during substring matching. - **/ - class BufferWrapper - { - protected: - ucs4_t * _bbuf; - ucs4_t * _cbuf; - - public: - BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } - BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } - void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } - void onOffset(size_t) { } - void incBuf(size_t inc) { _cbuf += inc; } - ucs4_t * getBuf() { return _cbuf; } - bool valid() { return true; } - size_t size() { return (_cbuf - _bbuf); } - bool hasOffsets() { return false; } - }; - - /** - * Template class that wraps an offset buffer in addition to an ucs4 buffer. - * The offset buffer contains offsets into the original utf8 buffer. - **/ - class OffsetWrapper : public BufferWrapper - { - private: - size_t * _boff; - size_t * _coff; - - public: - OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} - void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } - void onOffset(size_t of) { *_coff++ = of; } - bool valid() { return (size() == (size_t)(_coff - _boff)); } - bool hasOffsets() { return true; } - }; - -protected: - SharedSearcherBuf _buf; - - const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); - - /** - * Matches the given query term against the words in the given field reference - * using exact or prefix match strategy. - * - * @param f the field reference to match against. - * @param qt the query term trying to match. - * @return the number of words in the field ref. - **/ - size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt); - - /** - * Matches the given query term against the characters in the given field reference - * using substring match strategy. - * - * @param f the field reference to match against. - * @param qt the query term trying to match. - * @return the number of words in the field ref. - **/ - size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt); - - /** - * Matches the given query term against the words in the given field reference - * using suffix match strategy. - * - * @param f the field reference to match against. - * @param qt the query term trying to match. - * @return the number of words in the field ref. - **/ - size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt); - - /** - * Matches the given query term against the words in the given field reference - * using exact match strategy. - * - * @param f the field reference to match against. - * @param qt the query term trying to match. - * @return the number of words in the field ref. - **/ - size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); - -public: - UTF8StringFieldSearcherBase(); - UTF8StringFieldSearcherBase(FieldIdT fId); - ~UTF8StringFieldSearcherBase(); - void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; - /** - * Matches the given query term against the given word using suffix match strategy. - * - * @param term the buffer with the term. - * @param termLen the length of the term. - * @param word the buffer with the word. - * @param wordlen the length of the word. - * @return true if the term matches the word. - **/ - static bool matchTermSuffix(const cmptype_t * term, size_t termlen, - const cmptype_t * word, size_t wordlen); - - /** - * Checks whether the given character is a separator character. - **/ - static bool isSeparatorCharacter(ucs4_t); - - /** - * Transforms the given utf8 array into an array of ucs4 characters. - * Folding is performed. Separator characters are skipped. - **/ - template <typename T> - size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf); - -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp deleted file mode 100644 index fd327d3a3df..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include <vespa/vsm/searcher/utf8substringsearcher.h> - -using search::byte; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8SubStringFieldSearcher::duplicate() const -{ - return std::make_unique<UTF8SubStringFieldSearcher>(*this); -} - -size_t -UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - const byte * n = reinterpret_cast<const byte *> (f.data()); - if ( f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * fntemp = &(*_buf.get())[0]; - BufferWrapper wrapper(fntemp); - size_t fl = skipSeparators(n, f.size(), wrapper); - const cmptype_t * fn(fntemp); - const cmptype_t * fe = fn + fl; - const cmptype_t * fre = fe - mintsz; - termcount_t words(0); - for(words = 0; fn <= fre; ) { - for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { - QueryTerm & qt = **it; - const cmptype_t * term; - termsize_t tsz = qt.term(term); - - const cmptype_t *tt=term, *et=term+tsz, *fnt=fn; - for (; (tt < et) && (*tt == *fnt); tt++, fnt++); - if (tt == et) { - addHit(qt, words); - } - } - if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) { - words++; - for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ ); - } - } - - NEED_CHAR_STAT(addAnyUtf8Field(f.size())); - return words + 1; // we must also count the last word -} - -size_t -UTF8SubStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - return matchTermSubstring(f, qt); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h deleted file mode 100644 index 1c463c28847..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h> - -namespace vsm { - -/** - * This class does substring utf8 searches. - **/ -class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase -{ -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } -protected: - size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp deleted file mode 100644 index be02a58cfda..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "utf8substringsnippetmodifier.h" -#include <cassert> - -using search::byte; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8SubstringSnippetModifier::duplicate() const -{ - return std::make_unique<UTF8SubstringSnippetModifier>(*this); -} - -size_t -UTF8SubstringSnippetModifier::matchTerms(const FieldRef & f, const size_t mintsz) -{ - _modified->reset(); - _readPtr = f.data(); - const byte * src = reinterpret_cast<const byte *> (f.data()); - // resize ucs4 buffer - if (f.size() >= _buf->size()) { - _buf->resize(f.size() + 1); - } - // resize offset buffers - if (f.size() >= _offsets->size()) { - _offsets->resize(f.size() + 1); - } - // resize modified buffer - if (f.size() + 16 > _modified->getLength()) { - _modified->resize(f.size() + 16); // make room for some unit separators - } - cmptype_t * dbegin = &(*_buf.get())[0]; - OffsetWrapper wrapper(dbegin, &(*_offsets)[0]); - size_t numchars = skipSeparators(src, f.size(), wrapper); - const cmptype_t * ditr = dbegin; - const cmptype_t * dend = ditr + numchars; - const cmptype_t * drend = dend - mintsz; - termcount_t words = 0; - for(; ditr <= drend; ) { - for (QueryTermList::iterator itr = _qtl.begin(); itr != _qtl.end(); ++itr) { - QueryTerm & qt = **itr; - const cmptype_t * term; - termsize_t tsz = qt.term(term); - - const cmptype_t * titr = term; - const cmptype_t * tend = term + tsz; - const cmptype_t * dtmp = ditr; - for (; (titr < tend) && (*titr == *dtmp); ++titr, ++dtmp); - if (titr == tend) { - const char * mbegin = f.data() + (*_offsets)[ditr - dbegin]; - const char * mend = f.data() + ((dtmp < dend) ? ((*_offsets)[dtmp - dbegin]) : f.size()); - if (_readPtr <= mbegin) { - // We will only copy from the field ref once. - // If we have overlapping matches only the first one will be considered. - insertSeparators(mbegin, mend); - } - addHit(qt, words); - } - } - if ( ! Fast_UnicodeUtil::IsWordChar(*ditr++) ) { - words++; - for(; (ditr < drend) && ! Fast_UnicodeUtil::IsWordChar(*ditr) ; ++ditr ); - } - } - assert(_readPtr <= (f.data() + f.size())); - // copy remaining - size_t toCopy = f.size() - (_readPtr - f.data()); - copyToModified(toCopy); - - return words + 1; // we must also count the last word -} - -size_t -UTF8SubstringSnippetModifier::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - const cmptype_t * term; - termsize_t tsz = qt.term(term); - return matchTerms(f, tsz); -} - -void -UTF8SubstringSnippetModifier::copyToModified(size_t n, bool skipSep) -{ - if (n == 0) { - return; - } - if (skipSep) { - for (const char * readEnd = _readPtr + n; _readPtr < readEnd; ++_readPtr) { - if (!isSeparatorCharacter(*_readPtr)) { - _modified->put(*_readPtr); - } - } - } else { - _modified->put(_readPtr, n); - _readPtr += n; - } -} - -void -UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * mend) -{ - copyToModified(mbegin - _readPtr); - _modified->put(_unitSep); - // skip separators such that the match is not splitted. - copyToModified((mend - mbegin), true); - _modified->put(_unitSep); -} - -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() : - UTF8StringFieldSearcherBase(), - _modified(new CharBuffer(32)), - _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), - _unitSep('\x1F') -{ -} - -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) : - UTF8StringFieldSearcherBase(fId), - _modified(new CharBuffer(32)), - _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), - _unitSep('\x1F') -{ -} - -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId, - const CharBuffer::SP & modBuf, - const SharedOffsetBuffer & offBuf) : - UTF8StringFieldSearcherBase(fId), - _modified(modBuf), - _offsets(offBuf), - _readPtr(NULL), - _unitSep('\x1F') -{ -} - -UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {} - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h deleted file mode 100644 index 0127a7f2827..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "utf8stringfieldsearcherbase.h" -#include <vespa/vsm/common/charbuffer.h> - -namespace vsm { - -typedef std::shared_ptr<std::vector<size_t> > SharedOffsetBuffer; - -/** - * This class does substring searches the same way as UTF8SubStringFieldSearcher. - * While matching the query term(s) against the field reference it builds a modified - * buffer based on the field reference where the only difference is that unit separators - * are inserted before and after a match. These extra unit separators make it possible - * to highlight a substring match when later generating snippets. - **/ -class UTF8SubstringSnippetModifier : public UTF8StringFieldSearcherBase -{ -private: - CharBuffer::SP _modified; // buffer to write the modified field value - SharedOffsetBuffer _offsets; // for each character in _buf we have an offset into the utf8 buffer (field reference) - const char * _readPtr; // buffer to read from (field reference) - char _unitSep; // the unit separator character to use - - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; - - /** - * Copies n bytes from the field reference to the modified buffer and updates the read pointer. - * Separator characters from the field reference can be skipped. - * This is to avoid that a match is splitted by separator characters from the original field reference. - * - * @param n the number of bytes to copy. - * @param skipSep whether we should skip separator characters from the field reference. - **/ - void copyToModified(size_t n, bool skipSep = false); - - /** - * Copies from the field reference to the modified buffer and inserts unit separators for a match - * starting at mbegin (in the field reference) and ending at mend (in the field reference). - * A unit separator is inserted before and after the match. - * - * @param mbegin the beginning of the match. - * @param mend the end of the match. - **/ - void insertSeparators(const char * mbegin, const char * mend); - -public: - typedef std::shared_ptr<UTF8SubstringSnippetModifier> SP; - - std::unique_ptr<FieldSearcher> duplicate() const override; - - UTF8SubstringSnippetModifier(); - UTF8SubstringSnippetModifier(FieldIdT fId); - ~UTF8SubstringSnippetModifier(); - - /** - * Creates a new instance. - * - * @param fId the field id to operate on. - * @param modBuf the shared buffer used to store the modified field value. - * @param offBuf the shared buffer used to store the offsets into the field reference. - **/ - UTF8SubstringSnippetModifier(FieldIdT fId, const CharBuffer::SP & modBuf, const SharedOffsetBuffer & offBuf); - - const CharBuffer & getModifiedBuf() const { return *_modified; } - const search::streaming::QueryTermList & getQueryTerms() const { return _qtl; } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp deleted file mode 100644 index 3495d46b85b..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "utf8suffixstringfieldsearcher.h" - -using search::byte; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; - -namespace vsm { - -std::unique_ptr<FieldSearcher> -UTF8SuffixStringFieldSearcher::duplicate() const -{ - return std::make_unique<UTF8SuffixStringFieldSearcher>(*this); -} - -size_t -UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) -{ - (void) mintsz; - termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); - if (f.size() >= _buf->size()) { - _buf->reserve(f.size() + 1); - } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); - for (QueryTermList::iterator it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) { - QueryTerm & qt = **it; - const cmptype_t * term; - termsize_t tsz = qt.term(term); - if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { - addHit(qt, words); - } - } - words++; - } - return words; -} - -size_t -UTF8SuffixStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) -{ - return matchTermSuffix(f, qt); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h deleted file mode 100644 index 0640ac22da5..00000000000 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> - -namespace vsm -{ - -/** - * This class does suffix utf8 searches. - **/ -class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase -{ -protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; - -public: - std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/.gitignore b/streamingvisitors/src/vespa/vsm/vsm/.gitignore deleted file mode 100644 index 95bc02923a9..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.exe -*.ilk -*.pdb -.depend* -Makefile diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt deleted file mode 100644 index adc00b341a3..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_library(vsm_vsmbase OBJECT - SOURCES - docsumfieldspec.cpp - docsumfilter.cpp - fieldsearchspec.cpp - flattendocsumwriter.cpp - slimefieldwriter.cpp - snippetmodifier.cpp - vsm-adapter.cpp - docsumconfig.cpp - DEPENDS - vsm_vconfig -) diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp deleted file mode 100644 index 656e9eed132..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include <vespa/vsm/vsm/docsumconfig.h> -#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> -#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h> -#include <vespa/searchlib/common/matching_elements_fields.h> -#include <vespa/vsm/config/config-vsmfields.h> -#include <vespa/vsm/config/config-vsmsummary.h> - -using search::MatchingElementsFields; -using search::docsummary::IDocsumFieldWriter; -using search::docsummary::EmptyDFW; -using search::docsummary::MatchedElementsFilterDFW; -using search::docsummary::ResultConfig; -using vespa::config::search::vsm::VsmfieldsConfig; -using vespa::config::search::vsm::VsmsummaryConfig; - -namespace vsm { - -namespace { - -void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_config, const vespalib::string& field_name) -{ - vespalib::string prefix = field_name + "."; - for (const auto& spec : fields_config.fieldspec) { - if (spec.name.substr(0, prefix.size()) == prefix) { - fields.add_mapping(field_name, spec.name); - } - if (spec.name == field_name) { - fields.add_field(field_name); - } - } -} - -} - -DynamicDocsumConfig::DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env, search::docsummary::DynamicDocsumWriter* writer, std::shared_ptr<VsmfieldsConfig> vsm_fields_config) - : Parent(env, writer), - _vsm_fields_config(std::move(vsm_fields_config)) -{ -} - -IDocsumFieldWriter::UP -DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string & overrideName, const string & argument, bool & rc, std::shared_ptr<search::MatchingElementsFields> matching_elems_fields) -{ - IDocsumFieldWriter::UP fieldWriter; - if ((overrideName == "staticrank") || - (overrideName == "ranklog") || - (overrideName == "label") || - (overrideName == "project") || - (overrideName == "positions") || - (overrideName == "absdist") || - (overrideName == "subproject")) - { - fieldWriter = std::make_unique<EmptyDFW>(); - rc = true; - } else if ((overrideName == "attribute") || - (overrideName == "attributecombiner") || - (overrideName == "geopos")) { - rc = true; - } else if ((overrideName == "matchedattributeelementsfilter") || - (overrideName == "matchedelementsfilter")) { - string source_field = argument.empty() ? fieldName : argument; - const ResultConfig& resultConfig = getResultConfig(); - int source_field_enum = resultConfig.GetFieldNameEnum().Lookup(source_field.c_str()); - populate_fields(*matching_elems_fields, *_vsm_fields_config, source_field); - fieldWriter = MatchedElementsFilterDFW::create(source_field, source_field_enum, matching_elems_fields); - rc = static_cast<bool>(fieldWriter); - } else { - fieldWriter = search::docsummary::DynamicDocsumConfig::createFieldWriter(fieldName, overrideName, argument, rc, matching_elems_fields); - } - return fieldWriter; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h deleted file mode 100644 index 11010c04e90..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <vespa/searchsummary/docsummary/docsumconfig.h> - -namespace vespa::config::search::vsm { -namespace internal { class InternalVsmfieldsType; } -typedef const internal::InternalVsmfieldsType VsmfieldsConfig; -} -namespace vsm { - -class DynamicDocsumConfig : public search::docsummary::DynamicDocsumConfig -{ -public: - using Parent = search::docsummary::DynamicDocsumConfig; - using VsmfieldsConfig = vespa::config::search::vsm::VsmfieldsConfig; -private: - std::shared_ptr<VsmfieldsConfig> _vsm_fields_config; -public: - DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env, search::docsummary::DynamicDocsumWriter* writer, std::shared_ptr<VsmfieldsConfig> vsm_fields_config); -private: - std::unique_ptr<search::docsummary::IDocsumFieldWriter> - createFieldWriter(const string & fieldName, const string & overrideName, - const string & cf, bool & rc, std::shared_ptr<search::MatchingElementsFields> matching_elems_fields) override; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp deleted file mode 100644 index 936aaaa2091..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "docsumfieldspec.h" - -namespace vsm { - -DocsumFieldSpec::FieldIdentifier::FieldIdentifier() : - _id(StringFieldIdTMap::npos), - _path() -{ } - -DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdT id, FieldPath path) : - _id(id), - _path(std::move(path)) -{ } - -DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdentifier &&) noexcept = default; -DocsumFieldSpec::FieldIdentifier & DocsumFieldSpec::FieldIdentifier::operator=(FieldIdentifier &&) noexcept = default; -DocsumFieldSpec::FieldIdentifier::~FieldIdentifier() = default; - -DocsumFieldSpec::DocsumFieldSpec() : - _resultType(search::docsummary::RES_INT), - _command(VsmsummaryConfig::Fieldmap::Command::NONE), - _outputField(), - _inputFields() -{ } - -DocsumFieldSpec::DocsumFieldSpec(search::docsummary::ResType resultType, - VsmsummaryConfig::Fieldmap::Command command) : - _resultType(resultType), - _command(command), - _outputField(), - _inputFields() -{ } - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h deleted file mode 100644 index db6ee9fa223..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/searchsummary/docsummary/resultclass.h> -#include <vespa/vsm/common/document.h> -#include <vespa/vsm/common/storagedocument.h> -#include <vespa/vsm/config/vsm-cfif.h> - -namespace vsm { - -/** - * This class contains the specifications for how to generate a summary field. - **/ -class DocsumFieldSpec { -public: - /** - * This class contains a field id and a field path (to navigate a field value). - **/ - class FieldIdentifier { - private: - FieldIdT _id; - FieldPath _path; - - public: - FieldIdentifier(); - FieldIdentifier(FieldIdT id, FieldPath path); - FieldIdentifier(FieldIdentifier &&) noexcept; - FieldIdentifier & operator=(FieldIdentifier &&) noexcept; - FieldIdentifier(const FieldIdentifier &) = delete; - FieldIdentifier & operator=(const FieldIdentifier &) = delete; - ~FieldIdentifier(); - FieldIdT getId() const { return _id; } - const FieldPath & getPath() const { return _path; } - }; - - typedef std::vector<FieldIdentifier> FieldIdentifierVector; - -private: - search::docsummary::ResType _resultType; - VsmsummaryConfig::Fieldmap::Command _command; - FieldIdentifier _outputField; - FieldIdentifierVector _inputFields; - -public: - DocsumFieldSpec(); - DocsumFieldSpec(search::docsummary::ResType resultType, VsmsummaryConfig::Fieldmap::Command command); - - /** - * Returns the result type for the summary field. - **/ - search::docsummary::ResType getResultType() const { return _resultType; } - - /** - * Returns the command specifying how to transform input fields into output summary field. - **/ - VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; } - - /** - * Returns whether the input field and output field are identical. - **/ - bool hasIdentityMapping() const { - return _inputFields.size() == 1 && _outputField.getId() == _inputFields[0].getId(); - } - - const FieldIdentifier & getOutputField() const { return _outputField; } - void setOutputField(FieldIdentifier outputField) { _outputField = std::move(outputField); } - const FieldIdentifierVector & getInputFields() const { return _inputFields; } - FieldIdentifierVector & getInputFields() { return _inputFields; } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp deleted file mode 100644 index 70759feb41c..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "docsumfilter.h" -#include "slimefieldwriter.h" -#include <vespa/searchsummary/docsummary/summaryfieldconverter.h> -#include <vespa/document/base/exceptions.h> -#include <vespa/document/fieldvalue/iteratorhandler.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.docsumfilter"); - -using namespace search::docsummary; - - -namespace { - -class Handler : public document::fieldvalue::IteratorHandler { -public: -}; - -struct IntResultHandler : public Handler { - int32_t value; - IntResultHandler() : value(0) {} - void onPrimitive(uint32_t, const Content & c) override { - value = c.getValue().getAsInt(); - } -}; - -struct LongResultHandler : public Handler { - int64_t value; - LongResultHandler() : value(0) {} - void onPrimitive(uint32_t, const Content & c) override { - value = c.getValue().getAsLong(); - } -}; - -struct FloatResultHandler : public Handler { - float value; - FloatResultHandler() : value(0) {} - void onPrimitive(uint32_t, const Content & c) override { - value = c.getValue().getAsFloat(); - } -}; - -struct DoubleResultHandler : public Handler { - double value; - DoubleResultHandler() : value(0) {} - void onPrimitive(uint32_t, const Content & c) override { - value = c.getValue().getAsDouble(); - } -}; - -class StringResultHandler : public Handler { -private: - ResType _type; - ResultPacker & _packer; - void addToPacker(const char * buf, size_t len) { - switch (_type) { - case RES_STRING: - _packer.AddString(buf, len); - break; - case RES_LONG_STRING: - _packer.AddLongString(buf, len); - break; - default: - break; - } - } - -public: - StringResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {} - void onPrimitive(uint32_t, const Content & c) override { - const document::FieldValue & fv = c.getValue(); - if (fv.isLiteral()) { - const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); - vespalib::stringref s = lfv.getValueRef(); - addToPacker(s.data(), s.size()); - } else { - vespalib::string s = fv.toString(); - addToPacker(s.c_str(), s.size()); - } - } -}; - -class RawResultHandler : public Handler { -private: - ResType _type; - ResultPacker & _packer; - -public: - RawResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {} - void onPrimitive(uint32_t, const Content & c) override { - const document::FieldValue & fv = c.getValue(); - try { - std::pair<const char *, size_t> buf = fv.getAsRaw(); - if (buf.first != nullptr) { - switch (_type) { - case RES_DATA: - _packer.AddData(buf.first, buf.second); - break; - case RES_LONG_DATA: - _packer.AddLongData(buf.first, buf.second); - break; - default: - break; - } - } - } catch (document::InvalidDataTypeConversionException & e) { - LOG(warning, "RawResultHandler: Could not get field value '%s' as raw. Skipping writing this field", fv.toString().c_str()); - _packer.AddEmpty(); - } - } -}; - - -} - - -namespace vsm { - -FieldPath -copyPathButFirst(const FieldPath & rhs) { - // skip the element that correspond to the start field value - FieldPath path; - if ( ! rhs.empty()) { - for (auto it = rhs.begin() + 1; it != rhs.end(); ++it) { - path.push_back(std::make_unique<document::FieldPathEntry>(**it)); - } - } - return path; -} - -void -DocsumFilter::prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec, - const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap) -{ - { // setup output field - const vespalib::string & name = toolsSpec.getOutputName(); - LOG(debug, "prepareFieldSpec: output field name '%s'", name.c_str()); - FieldIdT field = fieldMap.fieldNo(name); - if (field != FieldMap::npos) { - if (field < fieldPathMap.size()) { - spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field]))); - } else { - LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field); - spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, FieldPath())); - } - } else { - LOG(warning, "Could not find output summary field '%s'", name.c_str()); - } - } - // setup input fields - for (size_t i = 0; i < toolsSpec.getInputNames().size(); ++i) { - const vespalib::string & name = toolsSpec.getInputNames()[i]; - LOG(debug, "prepareFieldSpec: input field name '%s'", name.c_str()); - FieldIdT field = fieldMap.fieldNo(name); - if (field != FieldMap::npos) { - if (field < fieldPathMap.size()) { - LOG(debug, "field %u < map size %zu", field, fieldPathMap.size()); - spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field]))); - } else { - LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field); - spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, FieldPath())); - } - if (_highestFieldNo <= field) { - _highestFieldNo = field + 1; - } - } else { - LOG(warning, "Could not find input summary field '%s'", name.c_str()); - } - } -} - -const document::FieldValue * -DocsumFilter::getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId, - VsmsummaryConfig::Fieldmap::Command command, - const Document & docsum, bool & modified) -{ - FieldIdT fId = fieldId.getId(); - const document::FieldValue * fv = docsum.getField(fId); - if (fv == nullptr) { - return nullptr; - } - switch (command) { - case VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER: - if (_snippetModifiers != nullptr) { - FieldModifier * mod = _snippetModifiers->getModifier(fId); - if (mod != nullptr) { - _cachedValue = mod->modify(*fv, fieldId.getPath()); - modified = true; - return _cachedValue.get(); - } - } - [[fallthrough]]; - default: - return fv; - } -} - - -DocsumFilter::DocsumFilter(const DocsumToolsPtr &tools, const IDocSumCache & docsumCache) : - _docsumCache(&docsumCache), - _tools(tools), - _fields(), - _highestFieldNo(0), - _packer(tools ? tools->getResultConfig() : nullptr), - _flattenWriter(), - _snippetModifiers(nullptr), - _cachedValue(), - _emptyFieldPath() -{ } - -DocsumFilter::~DocsumFilter() =default; - -void DocsumFilter::init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap) -{ - if (_tools.get()) { - const ResultClass *resClass = _tools->getResultClass(); - const std::vector<DocsumTools::FieldSpec> & inputSpecs = _tools->getFieldSpecs(); - if (resClass != nullptr) { - uint32_t entryCnt = resClass->GetNumEntries(); - assert(entryCnt == inputSpecs.size()); - for (uint32_t i = 0; i < entryCnt; ++i) { - const ResConfigEntry &entry = *resClass->GetEntry(i); - const DocsumTools::FieldSpec & toolsSpec = inputSpecs[i]; - _fields.push_back(DocsumFieldSpec(entry._type, toolsSpec.getCommand())); - LOG(debug, "About to prepare field spec for summary field '%s'", entry._bindname.c_str()); - prepareFieldSpec(_fields.back(), toolsSpec, fieldMap, fieldPathMap); - } - assert(entryCnt == _fields.size()); - } - } -} - -uint32_t -DocsumFilter::getNumDocs() const -{ - return std::numeric_limits<uint32_t>::max(); -} - -void -DocsumFilter::writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, ResultPacker & packer) -{ - switch (type) { - case RES_INT: { - IntResultHandler rh; - fv.iterateNested(path, rh); - uint32_t val = rh.value; - packer.AddInteger(val); - break; } - case RES_SHORT: { - IntResultHandler rh; - fv.iterateNested(path, rh); - uint16_t val = rh.value; - packer.AddShort(val); - break; } - case RES_BYTE: { - IntResultHandler rh; - fv.iterateNested(path, rh); - uint8_t val = rh.value; - packer.AddByte(val); - break; } - case RES_BOOL: { - IntResultHandler rh; - fv.iterateNested(path, rh); - uint8_t val = rh.value; - packer.AddByte(val); - break; } - case RES_FLOAT: { - FloatResultHandler rh; - fv.iterateNested(path, rh); - float val = rh.value; - packer.AddFloat(val); - break; } - case RES_DOUBLE: { - DoubleResultHandler rh; - fv.iterateNested(path, rh); - double val = rh.value; - packer.AddDouble(val); - break; } - case RES_INT64: { - LongResultHandler rh; - fv.iterateNested(path, rh); - uint64_t val = rh.value; - packer.AddInt64(val); - break; } - case RES_STRING: - case RES_LONG_STRING: - { - StringResultHandler rh(type, packer); - // the string result handler adds the result to the packer - fv.iterateNested(path, rh); - } - break; - case RES_DATA: - case RES_LONG_DATA: - { - RawResultHandler rh(type, packer); - // the raw result handler adds the result to the packer - fv.iterateNested(path, rh); - } - break; - default: - LOG(warning, "Unknown docsum field type: %s", ResultConfig::GetResTypeName(type)); - packer.AddEmpty(); // unhandled output type - break; - } -} - - -void -DocsumFilter::writeSlimeField(const DocsumFieldSpec & fieldSpec, - const Document & docsum, - ResultPacker & packer) -{ - if (fieldSpec.getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { - const DocsumFieldSpec::FieldIdentifier & fieldId = fieldSpec.getOutputField(); - const document::FieldValue * fv = docsum.getField(fieldId.getId()); - if (fv != nullptr) { - LOG(debug, "writeSlimeField: About to write field '%d' as Slime: field value = '%s'", - fieldId.getId(), fv->toString().c_str()); - SlimeFieldWriter writer; - if (! fieldSpec.hasIdentityMapping()) { - writer.setInputFields(fieldSpec.getInputFields()); - } - writer.convert(*fv); - const vespalib::stringref out = writer.out(); - packer.AddLongString(out.data(), out.size()); - } else { - LOG(debug, "writeSlimeField: Field value not set for field '%d'", fieldId.getId()); - packer.AddEmpty(); - } - } else { - LOG(debug, "writeSlimeField: Cannot handle this command"); - packer.AddEmpty(); - } -} - -void -DocsumFilter::writeFlattenField(const DocsumFieldSpec & fieldSpec, - const Document & docsum, - ResultPacker & packer) -{ - if (fieldSpec.getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { - LOG(debug, "writeFlattenField: Cannot handle command NONE"); - packer.AddEmpty(); - return; - } - - if (fieldSpec.getResultType() != RES_LONG_STRING && - fieldSpec.getResultType() != RES_STRING) - { - LOG(debug, "writeFlattenField: Can only handle result types STRING and LONG_STRING"); - packer.AddEmpty(); - return; - } - - switch (fieldSpec.getCommand()) { - case VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER: - _flattenWriter.setSeparator("\x1E"); // record separator (same as juniper uses) - break; - default: - break; - } - const DocsumFieldSpec::FieldIdentifierVector & inputFields = fieldSpec.getInputFields(); - for (size_t i = 0; i < inputFields.size(); ++i) { - const DocsumFieldSpec::FieldIdentifier & fieldId = inputFields[i]; - bool modified = false; - const document::FieldValue * fv = getFieldValue(fieldId, fieldSpec.getCommand(), docsum, modified); - if (fv != nullptr) { - LOG(debug, "writeFlattenField: About to flatten field '%d' with field value (%s) '%s'", - fieldId.getId(), modified ? "modified" : "original", fv->toString().c_str()); - if (modified) { - fv->iterateNested(_emptyFieldPath, _flattenWriter); - } else { - fv->iterateNested(fieldId.getPath(), _flattenWriter); - } - } else { - LOG(debug, "writeFlattenField: Field value not set for field '%d'", fieldId.getId()); - } - } - - const CharBuffer & buf = _flattenWriter.getResult(); - switch (fieldSpec.getResultType()) { - case RES_STRING: - packer.AddString(buf.getBuffer(), buf.getPos()); - break; - case RES_LONG_STRING: - packer.AddLongString(buf.getBuffer(), buf.getPos()); - break; - default: - break; - } - _flattenWriter.clear(); -} - - -void -DocsumFilter::writeEmpty(ResType type, ResultPacker & packer) -{ - // use the 'notdefined' values when writing numeric values - switch (type) { - case RES_INT: - packer.AddInteger(std::numeric_limits<int32_t>::min()); - break; - case RES_SHORT: - packer.AddShort(std::numeric_limits<int16_t>::min()); - break; - case RES_BYTE: - packer.AddByte(0); // byte fields are unsigned so we have no 'notdefined' value. - break; - case RES_FLOAT: - packer.AddFloat(std::numeric_limits<float>::quiet_NaN()); - break; - case RES_DOUBLE: - packer.AddDouble(std::numeric_limits<double>::quiet_NaN()); - break; - case RES_INT64: - packer.AddInt64(std::numeric_limits<int64_t>::min()); - break; - default: - packer.AddEmpty(); - break; - } -} - -uint32_t -DocsumFilter::getSummaryClassId() const -{ - return _tools->getResultClass() ? _tools->getResultClass()->GetClassID() : ResultConfig::NoClassID(); -} - -DocsumStoreValue -DocsumFilter::getMappedDocsum(uint32_t id) -{ - const ResultClass *resClass = _tools->getResultClass(); - if (resClass == nullptr) { - return DocsumStoreValue(nullptr, 0); - } - - const Document & doc = _docsumCache->getDocSum(id); - - _packer.Init(resClass->GetClassID()); - for (FieldSpecList::iterator it(_fields.begin()), end = _fields.end(); it != end; ++it) { - ResType type = it->getResultType(); - if (type == RES_JSONSTRING) { - // this really means 'structured data' - writeSlimeField(*it, doc, _packer); - } else { - if (it->getInputFields().size() == 1 && it->getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { - const DocsumFieldSpec::FieldIdentifier & fieldId = it->getInputFields()[0]; - const document::FieldValue * field = doc.getField(fieldId.getId()); - if (field != nullptr) { - writeField(*field, fieldId.getPath(), type, _packer); - } else { - writeEmpty(type, _packer); // void input - } - } else if (it->getInputFields().size() == 0 && it->getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { - LOG(spam, "0 inputfields for output field %u", it->getOutputField().getId()); - writeEmpty(type, _packer); // no input - } else { - writeFlattenField(*it, doc, _packer); - } - } - } - - const char *buf; - uint32_t buflen; - bool ok = _packer.GetDocsumBlob(&buf, &buflen); - if (ok) { - return DocsumStoreValue(buf, buflen); - } else { - return DocsumStoreValue(nullptr, 0); - } -} - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h deleted file mode 100644 index e6f7ae3e6fe..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <vespa/vsm/common/docsum.h> -#include <vespa/vsm/common/fieldmodifier.h> -#include <vespa/vsm/vsm/docsumfieldspec.h> -#include <vespa/vsm/vsm/fieldsearchspec.h> -#include <vespa/vsm/vsm/flattendocsumwriter.h> -#include <vespa/vsm/vsm/vsm-adapter.h> -#include <vespa/searchsummary/docsummary/resultpacker.h> -#include <vespa/searchsummary/docsummary/docsumstore.h> - -using search::docsummary::IDocsumStore; -using search::docsummary::DocsumStoreValue; -using search::docsummary::ResType; -using search::docsummary::ResultPacker; - -namespace vsm { - -/** - * This class implements the IDocsumStore interface such that docsum blobs - * can be fetched based on local document id. The docsum blobs are generated - * on the fly when requested. - **/ -class DocsumFilter : public IDocsumStore -{ -private: - typedef std::vector<DocsumFieldSpec> FieldSpecList; // list of summary field specs - typedef std::vector<vespalib::string> StringList; - typedef StringFieldIdTMap FieldMap; - - const IDocSumCache * _docsumCache; - DocsumToolsPtr _tools; - FieldSpecList _fields; // list of summary fields to generate - size_t _highestFieldNo; - ResultPacker _packer; - FlattenDocsumWriter _flattenWriter; - const FieldModifierMap * _snippetModifiers; - document::FieldValue::UP _cachedValue; - document::FieldPath _emptyFieldPath; - - DocsumFilter(const DocsumFilter &); - DocsumFilter &operator=(const DocsumFilter &); - void prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec, - const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap); - const document::FieldValue * getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId, - VsmsummaryConfig::Fieldmap::Command command, - const Document & docsum, bool & modified); - void writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, ResultPacker & packer); - void writeSlimeField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer); - void writeFlattenField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer); - void writeEmpty(ResType type, ResultPacker & packer); - -public: - DocsumFilter(const DocsumToolsPtr & tools, const IDocSumCache & docsumCache); - ~DocsumFilter() override; - const DocsumToolsPtr & getTools() const { return _tools; } - - /** - * Initializes this docsum filter using the given field map and field path map. - * The field map is used to map from field name to field id. - * The field path map is used to retrieve the field path for each input field. - * - * @param fieldMap maps from field name -> field id - * @param fieldPathMap maps from field id -> field path - **/ - void init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap); - - /** - * Sets the snippet modifiers to use when writing string fields used as input to snippet generation. - **/ - void setSnippetModifiers(const FieldModifierMap & modifiers) { _snippetModifiers = &modifiers; } - - /** - * Returns the highest field id + 1 among all fields in the field spec list. - **/ - size_t getHighestFieldNo() const { return _highestFieldNo; } - - - void setDocSumStore(const IDocSumCache & docsumCache) { _docsumCache = &docsumCache; } - - // Inherit doc from IDocsumStore - DocsumStoreValue getMappedDocsum(uint32_t id) override; - uint32_t getNumDocs() const override; - uint32_t getSummaryClassId() const override; -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp deleted file mode 100644 index 7043e63ec87..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "fieldsearchspec.h" -#include <vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h> -#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h> -#include <vespa/vsm/searcher/utf8substringsearcher.h> -#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> -#include <vespa/vsm/searcher/utf8exactstringfieldsearcher.h> -#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> -#include <vespa/vsm/searcher/intfieldsearcher.h> -#include <vespa/vsm/searcher/boolfieldsearcher.h> -#include <vespa/vsm/searcher/floatfieldsearcher.h> -#include <vespa/vsm/searcher/geo_pos_field_searcher.h> -#include <vespa/vespalib/stllike/asciistream.h> -#include <regex> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.fieldsearchspec"); - -#define DEBUGMASK 0x01 - -using search::streaming::ConstQueryTermList; -using search::streaming::Query; -using search::streaming::QueryTerm; - -namespace vsm { - -namespace { - -void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { - if (arg1 == "prefix") { - searcher->setMatchType(FieldSearcher::PREFIX); - } else if (arg1 == "substring") { - searcher->setMatchType(FieldSearcher::SUBSTRING); - } else if (arg1 == "suffix") { - searcher->setMatchType(FieldSearcher::SUFFIX); - } else if (arg1 == "exact") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "word") { - searcher->setMatchType(FieldSearcher::EXACT); - } -} - -} - -FieldSearchSpec::FieldSearchSpec() : - _id(0), - _name(), - _maxLength(0x100000), - _searcher(), - _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), - _arg1(), - _reconfigured(false) -{ -} -FieldSearchSpec::~FieldSearchSpec() = default; - -FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; -FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; - -FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, - VsmfieldsConfig::Fieldspec::Searchmethod searchDef, - const vespalib::string & arg1, size_t maxLength_) : - _id(fid), - _name(fname), - _maxLength(maxLength_), - _searcher(), - _searchMethod(searchDef), - _arg1(arg1), - _reconfigured(false) -{ - switch(searchDef) { - default: - LOG(warning, "Unknown searchdef = %d. Defaulting to AUTOUTF8", static_cast<int>(searchDef)); - [[fallthrough]]; - case VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8: - case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: - case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: - case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - if (arg1 == "substring") { - _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); - } else if (arg1 == "suffix") { - _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (arg1 == "exact") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (arg1 == "word") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { - _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); - } else { - _searcher = std::make_unique<FUTF8StrChrFieldSearcher>(fid); - } - break; - case VsmfieldsConfig::Fieldspec::Searchmethod::BOOL: - _searcher = std::make_unique<BoolFieldSearcher>(fid); - break; - case VsmfieldsConfig::Fieldspec::Searchmethod::INT8: - case VsmfieldsConfig::Fieldspec::Searchmethod::INT16: - case VsmfieldsConfig::Fieldspec::Searchmethod::INT32: - case VsmfieldsConfig::Fieldspec::Searchmethod::INT64: - _searcher = std::make_unique<IntFieldSearcher>(fid); - break; - case VsmfieldsConfig::Fieldspec::Searchmethod::FLOAT: - _searcher = std::make_unique<FloatFieldSearcher>(fid); - break; - case VsmfieldsConfig::Fieldspec::Searchmethod::DOUBLE: - _searcher = std::make_unique<DoubleFieldSearcher>(fid); - break; - case VsmfieldsConfig::Fieldspec::Searchmethod::GEOPOS: - _searcher = std::make_unique<GeoPosFieldSearcher>(fid); - break; - } - if (_searcher) { - setMatchType(_searcher, arg1); - _searcher->maxFieldLength(maxLength()); - } -} - -void -FieldSearchSpec::reconfig(const QueryTerm & term) -{ - if (_reconfigured) { - return; - } - switch (_searchMethod) { - case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: - case VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8: - case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: - if ((term.isSubstring() && _arg1 != "substring") || - (term.isSuffix() && _arg1 != "suffix") || - (term.isExactstring() && _arg1 != "exact") || - (term.isPrefix() && _arg1 == "suffix")) - { - _searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id()); - // preserve the basic match property of the searcher - setMatchType(_searcher, _arg1); - LOG(debug, "Reconfigured to use UTF8FlexibleStringFieldSearcher (%s) for field '%s' with id '%d'", - _searcher->prefix() ? "prefix" : "regular", name().c_str(), id()); - _reconfigured = true; - } - break; - default: - break; - } -} - -vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f) -{ - os << f._id << ' ' << f._name << ' '; - if ( ! f._searcher) { - os << " No searcher defined.\n"; - } - return os; -} - -FieldSearchSpecMap::FieldSearchSpecMap() = default; - -FieldSearchSpecMap::~FieldSearchSpecMap() = default; - -namespace { - const std::string _G_empty(""); - const std::string _G_value(".value"); - const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); - const std::regex _G_map2("\\{\".*\"\\}"); - const std::regex _G_array("\\[[0-9]+\\]"); -} - -vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex) -{ - if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { - std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); - index = std::regex_replace(index, _G_map2, _G_value); - index = std::regex_replace(index, _G_array, _G_empty); - return index; - } - return rawIndex; -} - -bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const -{ - bool retval(true); - ConstQueryTermList qtl; - query.getLeafs(qtl); - - for (const auto & term : qtl) { - for (const auto & dtm : documentTypeMap()) { - const IndexFieldMapT & fim = dtm.second; - vespalib::string rawIndex(term->index()); - vespalib::string index(stripNonFields(rawIndex)); - IndexFieldMapT::const_iterator fIt = fim.find(index); - if (fIt != fim.end()) { - for(FieldIdT fid : fIt->second) { - const FieldSearchSpec & spec = specMap().find(fid)->second; - LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str()); - if ((rawIndex != index) && (spec.name().find(index) == 0)) { - vespalib::string modIndex(rawIndex); - modIndex.append(spec.name().substr(index.size())); - fieldsInQuery.add(modIndex, spec.id()); - } else { - fieldsInQuery.add(spec.name(),spec.id()); - } - } - } else { - LOG(warning, "No valid indexes registered for index %s", term->index().c_str()); - retval = false; - } - } - } - return retval; -} - -void FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded) -{ - for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) { - LOG(debug, "otherFieldsNeeded[%zd] = '%s'", i, otherFieldsNeeded[i].c_str()); - _nameIdMap.add(otherFieldsNeeded[i]); - } -} - -namespace { - -FieldIdTList -buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearchSpecMapT & specMap, - const VsmfieldsConfig::Documenttype::IndexVector & indexes) -{ - LOG(spam, "Index %s with %zd fields", ci.name.c_str(), ci.field.size()); - FieldIdTList ifm; - for (const VsmfieldsConfig::Documenttype::Index::Field & cf : ci.field) { - LOG(spam, "Parsing field %s", cf.name.c_str()); - auto foundIndex = std::find_if(indexes.begin(), indexes.end(), - [&cf](const auto & v) { return v.name == cf.name;}); - if ((foundIndex != indexes.end()) && (cf.name != ci.name)) { - FieldIdTList sub = buildFieldSet(*foundIndex, specMap, indexes); - ifm.insert(ifm.end(), sub.begin(), sub.end()); - } else { - auto foundField = std::find_if(specMap.begin(), specMap.end(), - [&cf](const auto & v) { return v.second.name() == cf.name;} ); - if (foundField != specMap.end()) { - ifm.push_back(foundField->second.id()); - } else { - LOG(warning, "Field %s not defined. Ignoring....", cf.name.c_str()); - } - } - } - return ifm; -} - -} - -bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) -{ - bool retval(true); - LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); - for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { - LOG(spam, "Parsing %s", cfs.name.c_str()); - FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); - _specMap[fieldId] = std::move(fss); - _nameIdMap.add(cfs.name, fieldId); - LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); - } - - LOG(spam, "Parsing %zd document types", conf->documenttype.size()); - for(const VsmfieldsConfig::Documenttype & di : conf->documenttype) { - IndexFieldMapT indexMapp; - LOG(spam, "Parsing document type %s with %zd indexes", di.name.c_str(), di.index.size()); - for(const VsmfieldsConfig::Documenttype::Index & ci : di.index) { - indexMapp[ci.name] = buildFieldSet(ci, specMap(), di.index); - } - _documentTypeMap[di.name] = indexMapp; - } - return retval; -} - -void -FieldSearchSpecMap::reconfigFromQuery(const Query & query) -{ - ConstQueryTermList qtl; - query.getLeafs(qtl); - - for (const auto & termA : qtl) { - for (const auto & ifm : documentTypeMap()) { - IndexFieldMapT::const_iterator itc = ifm.second.find(termA->index()); - if (itc != ifm.second.end()) { - for (FieldIdT fid : itc->second) { - FieldSearchSpec & spec = _specMap.find(fid)->second; - spec.reconfig(*termA); - } - } - } - } -} - -bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b) -{ - return a->field() < b->field(); -} - -void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) -{ - fieldSearcherMap.clear(); - for (const auto & entry : fieldsInQuery) { - FieldIdT fId = entry.second; - const FieldSearchSpec & spec = specMap().find(fId)->second; - fieldSearcherMap.emplace_back(spec.searcher().duplicate()); - } - std::sort(fieldSearcherMap.begin(), fieldSearcherMap.end(), lesserField); -} - - -vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df) -{ - os << "DocumentTypeMap = \n"; - for (const auto & dtm : df.documentTypeMap()) { - os << "DocType = " << dtm.first << "\n"; - os << "IndexMap = \n"; - for (const auto &index : dtm.second) { - os << index.first << ": "; - for (FieldIdT fid : index.second) { - os << fid << ' '; - } - os << '\n'; - } - } - os << "SpecMap = \n"; - for (const auto & entry : df.specMap()) { - os << entry.first << " = " << entry.second << '\n'; - } - os << "NameIdMap = \n" << df.nameIdMap(); - return os; -} - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h deleted file mode 100644 index 7b78a8634e0..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/vsm/searcher/fieldsearcher.h> -#include <vespa/vsm/config/vsm-cfif.h> - -namespace vsm { - -class FieldSearchSpec -{ -public: - FieldSearchSpec(); - FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, - VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, - const vespalib::string & arg1, size_t maxLength); - ~FieldSearchSpec(); - FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; - FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; - const FieldSearcher & searcher() const { return *_searcher; } - const vespalib::string & name() const { return _name; } - FieldIdT id() const { return _id; } - bool valid() const { return static_cast<bool>(_searcher); } - size_t maxLength() const { return _maxLength; } - - /** - * Reconfigures the field searcher based on information in the given query term. - **/ - void reconfig(const search::streaming::QueryTerm & term); - - friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f); - -private: - FieldIdT _id; - vespalib::string _name; - size_t _maxLength; - FieldSearcherContainer _searcher; - VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; - vespalib::string _arg1; - bool _reconfigured; -}; - -typedef std::map<FieldIdT, FieldSearchSpec> FieldSearchSpecMapT; - -class FieldSearchSpecMap -{ -public: - FieldSearchSpecMap(); - ~FieldSearchSpecMap(); - - /** - * Iterates over all fields in the vsmfields config and creates a mapping from field id to FieldSearchSpec objects - * and a mapping from field name to field id. It then iterates over all document types and index names - * and creates a mapping from index name to list of field ids for each document type. - **/ - bool buildFromConfig(const VsmfieldsHandle & conf); - - /** - * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. - **/ - void buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded); - - /** - * Reconfigures some of the field searchers based on information in the given query. - **/ - void reconfigFromQuery(const search::streaming::Query & query); - - /** - * Adds a [field name, field id] entry to the given mapping for each field name used in the given query. - * This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs. - **/ - bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const; - - /** - * Adds a [field name, field id] entry to the given mapping for each field name in the given vector. - **/ - void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const; - - /** - * Adds a FieldSearcher object to the given field searcher map for each field name in the other map. - **/ - void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap); - - const FieldSearchSpecMapT & specMap() const { return _specMap; } - //const IndexFieldMapT & indexMap() const { return _documentTypeMap.begin()->second; } - const DocumentTypeIndexFieldMapT & documentTypeMap() const { return _documentTypeMap; } - const StringFieldIdTMap & nameIdMap() const { return _nameIdMap; } - friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f); - - static vespalib::string stripNonFields(const vespalib::string & rawIndex); - -private: - FieldSearchSpecMapT _specMap; // mapping from field id to field search spec - DocumentTypeIndexFieldMapT _documentTypeMap; // mapping from index name to field id list for each document type - StringFieldIdTMap _nameIdMap; // mapping from field name to field id -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp deleted file mode 100644 index 06b652d85e6..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "flattendocsumwriter.h" -#include <vespa/document/fieldvalue/fieldvalues.h> - -namespace vsm { - -void -FlattenDocsumWriter::considerSeparator() -{ - if (_useSeparator) { - _output.put(_separator.c_str(), _separator.size()); - } -} - -void -FlattenDocsumWriter::onPrimitive(uint32_t, const Content & c) -{ - considerSeparator(); - const document::FieldValue & fv = c.getValue(); - if (fv.isLiteral()) { - const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); - vespalib::stringref value = lfv.getValueRef(); - _output.put(value.data(), value.size()); - } else if (fv.isNumeric() || - fv.isA(document::FieldValue::Type::BOOL)) - { - vespalib::string value = fv.getAsString(); - _output.put(value.data(), value.size()); - } else { - vespalib::string value = fv.toString(); - _output.put(value.data(), value.size()); - } - _useSeparator = true; -} - -FlattenDocsumWriter::FlattenDocsumWriter(const vespalib::string & separator) : - _output(32), - _separator(separator), - _useSeparator(false) -{ } - -FlattenDocsumWriter::~FlattenDocsumWriter() = default; - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h deleted file mode 100644 index 47c6f1e75d0..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include <vespa/document/fieldvalue/fieldvalue.h> -#include <vespa/document/fieldvalue/iteratorhandler.h> -#include <vespa/vsm/common/charbuffer.h> - -namespace vsm { - -/** - * This class is used to flatten out and write a complex field value. - * A separator string is inserted between primitive field values. - **/ -class FlattenDocsumWriter : public document::fieldvalue::IteratorHandler { -private: - CharBuffer _output; - vespalib::string _separator; - bool _useSeparator; - - void considerSeparator(); - void onPrimitive(uint32_t, const Content & c) override; - -public: - FlattenDocsumWriter(const vespalib::string & separator = " "); - ~FlattenDocsumWriter(); - void setSeparator(const vespalib::string & separator) { _separator = separator; } - const CharBuffer & getResult() const { return _output; } - void clear() { - _output.reset(); - _separator = " "; - _useSeparator = false; - } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h b/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h deleted file mode 100644 index a35cea40cec..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <memory> - -namespace search { -class MatchingElements; -class MatchingElementsFields; -} - -namespace vsm { - -/* - * Interface class for filling matching elements structure for - * streaming search. - */ -class IMatchingElementsFiller { -public: - virtual std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) = 0; - virtual ~IMatchingElementsFiller() = default; -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp deleted file mode 100644 index 5bc5798fb9d..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "slimefieldwriter.h" -#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h> -#include <vespa/vespalib/stllike/asciistream.h> -#include <vespa/vespalib/util/size_literals.h> -#include <vespa/searchsummary/docsummary/resultconfig.h> -#include <vespa/document/datatype/positiondatatype.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.slimefieldwriter"); - -namespace { - -vespalib::string -toString(const vsm::FieldPath & fieldPath) -{ - vespalib::asciistream oss; - for (size_t i = 0; i < fieldPath.size(); ++i) { - if (i > 0) { - oss << "."; - } - oss << fieldPath[i].getName(); - } - return oss.str(); -} - -vespalib::string -toString(const std::vector<vespalib::string> & fieldPath) -{ - vespalib::asciistream oss; - for (size_t i = 0; i < fieldPath.size(); ++i) { - if (i > 0) { - oss << "."; - } - oss << fieldPath[i]; - } - return oss.str(); -} - -} // namespace <unnamed> - -using namespace vespalib::slime::convenience; - - -namespace vsm { - -void -SlimeFieldWriter::traverseRecursive(const document::FieldValue & fv, Inserter &inserter) -{ - LOG(debug, "traverseRecursive: class(%s), fieldValue(%s), currentPath(%s)", - fv.className(), fv.toString().c_str(), toString(_currPath).c_str()); - - if (fv.isCollection()) { - const document::CollectionFieldValue & cfv = static_cast<const document::CollectionFieldValue &>(fv); - if (cfv.isA(document::FieldValue::Type::ARRAY)) { - const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(cfv); - Cursor &a = inserter.insertArray(); - for (size_t i = 0; i < afv.size(); ++i) { - const document::FieldValue & nfv = afv[i]; - ArrayInserter ai(a); - traverseRecursive(nfv, ai); - } - } else { - assert(cfv.isA(document::FieldValue::Type::WSET)); - const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(cfv); - Cursor &a = inserter.insertArray(); - Symbol isym = a.resolve("item"); - Symbol wsym = a.resolve("weight"); - for (const auto &entry : wsfv) { - Cursor &o = a.addObject(); - const document::FieldValue & nfv = *entry.first; - ObjectSymbolInserter oi(o, isym); - traverseRecursive(nfv, oi); - int weight = static_cast<const document::IntFieldValue &>(*entry.second).getValue(); - o.setLong(wsym, weight); - } - } - } else if (fv.isA(document::FieldValue::Type::MAP)) { - const document::MapFieldValue & mfv = static_cast<const document::MapFieldValue &>(fv); - Cursor &a = inserter.insertArray(); - Symbol keysym = a.resolve("key"); - Symbol valsym = a.resolve("value"); - for (const auto &entry : mfv) { - Cursor &o = a.addObject(); - ObjectSymbolInserter ki(o, keysym); - traverseRecursive(*entry.first, ki); - _currPath.push_back("value"); - ObjectSymbolInserter vi(o, valsym); - traverseRecursive(*entry.second, vi); - _currPath.pop_back(); - } - } else if (fv.isStructured()) { - const document::StructuredFieldValue & sfv = static_cast<const document::StructuredFieldValue &>(fv); - Cursor &o = inserter.insertObject(); - if (sfv.getDataType() == &document::PositionDataType::getInstance() - && search::docsummary::ResultConfig::wantedV8geoPositions()) - { - bool ok = true; - try { - int x = std::numeric_limits<int>::min(); - int y = std::numeric_limits<int>::min(); - for (const document::Field & entry : sfv) { - document::FieldValue::UP fval(sfv.getValue(entry)); - if (entry.getName() == "x") { - x = fval->getAsInt(); - } else if (entry.getName() == "y") { - y = fval->getAsInt(); - } else { - ok = false; - } - } - if (x == std::numeric_limits<int>::min()) ok = false; - if (y == std::numeric_limits<int>::min()) ok = false; - if (ok) { - o.setDouble("lat", double(y) / 1.0e6); - o.setDouble("lng", double(x) / 1.0e6); - return; - } - } catch (std::exception &e) { - (void)e; - // fallback to code below - } - } - for (const document::Field & entry : sfv) { - if (explorePath(entry.getName())) { - _currPath.push_back(entry.getName()); - Memory keymem(entry.getName()); - ObjectInserter oi(o, keymem); - document::FieldValue::UP fval(sfv.getValue(entry)); - traverseRecursive(*fval, oi); - _currPath.pop_back(); - } - } - } else { - if (fv.isLiteral()) { - const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); - inserter.insertString(lfv.getValueRef()); - } else if (fv.isNumeric()) { - switch (fv.getDataType()->getId()) { - case document::DataType::T_BYTE: - case document::DataType::T_SHORT: - case document::DataType::T_INT: - case document::DataType::T_LONG: - inserter.insertLong(fv.getAsLong()); - break; - case document::DataType::T_DOUBLE: - inserter.insertDouble(fv.getAsDouble()); - break; - case document::DataType::T_FLOAT: - inserter.insertDouble(fv.getAsFloat()); - break; - default: - inserter.insertString(fv.getAsString()); - } - } else if (fv.isA(document::FieldValue::Type::BOOL)) { - const auto & bfv = static_cast<const document::BoolFieldValue &>(fv); - inserter.insertBool(bfv.getValue()); - } else { - inserter.insertString(fv.toString()); - } - } -} - -bool -SlimeFieldWriter::explorePath(vespalib::stringref candidate) -{ - if (_inputFields == nullptr) { - return true; - } - // find out if we should explore the current path - for (size_t i = 0; i < _inputFields->size(); ++i) { - const FieldPath & fp = (*_inputFields)[i].getPath(); - if (_currPath.size() <= fp.size()) { - bool equal = true; - for (size_t j = 0; j < _currPath.size() && equal; ++j) { - equal = (fp[j].getName() == _currPath[j]); - } - if (equal) { - if (_currPath.size() == fp.size()) { - return true; - } else if (fp[_currPath.size()].getName() == candidate) { - // the current path matches one of the input field paths - return true; - } - } - } - } - return false; -} - -SlimeFieldWriter::SlimeFieldWriter() : - _rbuf(4_Ki), - _slime(), - _inputFields(nullptr), - _currPath() -{ -} - -SlimeFieldWriter::~SlimeFieldWriter() = default; - -void -SlimeFieldWriter::convert(const document::FieldValue & fv) -{ - if (LOG_WOULD_LOG(debug)) { - if (_inputFields != nullptr) { - for (size_t i = 0; i < _inputFields->size(); ++i) { - LOG(debug, "write: input field path [%zd] '%s'", i, toString((*_inputFields)[i].getPath()).c_str()); - } - } else { - LOG(debug, "write: no input fields"); - } - } - SlimeInserter inserter(_slime); - traverseRecursive(fv, inserter); - search::SlimeOutputRawBufAdapter adapter(_rbuf); - vespalib::slime::BinaryFormat::encode(_slime, adapter); -} - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h deleted file mode 100644 index b5adac8985f..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "docsumfieldspec.h" -#include <vespa/vsm/common/storagedocument.h> -#include <vespa/document/fieldvalue/fieldvalues.h> -#include <vespa/vespalib/data/slime/slime.h> -#include <vespa/searchlib/util/rawbuf.h> - -namespace vsm { - -/** - * This class is used to write a field value as slime binary data. - * If only a subset of the field value should be written this subset - * is specified using the setInputFields() function. - **/ -class SlimeFieldWriter -{ -private: - search::RawBuf _rbuf; - vespalib::Slime _slime; - const DocsumFieldSpec::FieldIdentifierVector * _inputFields; - std::vector<vespalib::string> _currPath; - - void traverseRecursive(const document::FieldValue & fv, vespalib::slime::Inserter & inserter); - bool explorePath(vespalib::stringref candidate); - -public: - SlimeFieldWriter(); - ~SlimeFieldWriter(); - - - /** - * Specifies the subset of the field value that should be written. - **/ - void setInputFields(const DocsumFieldSpec::FieldIdentifierVector & inputFields) { _inputFields = &inputFields; } - - /** - * Convert the given field value - **/ - void convert(const document::FieldValue & fv); - - /** - * Return a reference to the output binary data - **/ - vespalib::stringref out() const { - return vespalib::stringref(_rbuf.GetDrainPos(), _rbuf.GetUsedLen()); - } - - void clear() { - _rbuf.Reuse(); - _inputFields = nullptr; - _currPath.clear(); - } -}; - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp deleted file mode 100644 index 127302311f9..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "snippetmodifier.h" -#include <vespa/document/fieldvalue/stringfieldvalue.h> -#include <vespa/vespalib/stllike/hash_map.hpp> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.snippetmodifier"); - -using namespace document; -using search::streaming::QueryTerm; -using search::streaming::QueryTermList; -typedef vespalib::hash_map<vsm::FieldIdT, QueryTermList> FieldQueryTermMap; - -namespace { - -void -addIfNotPresent(FieldQueryTermMap & map, vsm::FieldIdT fId, QueryTerm * qt) -{ - FieldQueryTermMap::iterator itr = map.find(fId); - if (itr != map.end()) { - QueryTermList & qtl = itr->second; - if (std::find(qtl.begin(), qtl.end(), qt) == qtl.end()) { - qtl.push_back(qt); - } - } else { - map[fId].push_back(qt); - } -} - -} - -namespace vsm { - -void -SnippetModifier::considerSeparator() -{ - if (_useSep) { - _valueBuf->put(_groupSep); - } -} - -void -SnippetModifier::onPrimitive(uint32_t, const Content & c) -{ - considerSeparator(); - _searcher->onValue(c.getValue()); - _valueBuf->put(_searcher->getModifiedBuf().getBuffer(), _searcher->getModifiedBuf().getPos()); - _useSep = true; -} - -void -SnippetModifier::reset() -{ - _valueBuf->reset(); - _useSep = false; -} - - -SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher) : - _searcher(searcher), - _valueBuf(new CharBuffer(32)), - _groupSep('\x1E'), - _useSep(false), - _empty() -{ -} - -SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf) : - _searcher(searcher), - _valueBuf(valueBuf), - _groupSep('\x1E'), - _useSep(false), - _empty() -{ -} - -SnippetModifier::~SnippetModifier() {} - -FieldValue::UP -SnippetModifier::modify(const FieldValue & fv, const document::FieldPath & path) -{ - reset(); - fv.iterateNested(path, *this); - return FieldValue::UP(new StringFieldValue(vespalib::string(_valueBuf->getBuffer(), _valueBuf->getPos()))); -} - - -SnippetModifierManager::SnippetModifierManager() : - _modifiers(), - _searchBuf(new SearcherBuf(64)), - _searchModifyBuf(new CharBuffer(64)), - _searchOffsetBuf(new std::vector<size_t>(64)), - _modifierBuf(new CharBuffer(128)) -{ -} - -SnippetModifierManager::~SnippetModifierManager() {} - -void -SnippetModifierManager::setup(const QueryTermList & queryTerms, - const FieldSearchSpecMapT & specMap, - const IndexFieldMapT & indexMap) -{ - FieldQueryTermMap fqtm; - - // setup modifiers - for (QueryTermList::const_iterator i = queryTerms.begin(); i != queryTerms.end(); ++i) { - QueryTerm * qt = *i; - IndexFieldMapT::const_iterator j = indexMap.find(qt->index()); - if (j != indexMap.end()) { - for (FieldIdTList::const_iterator k = j->second.begin(); k != j->second.end(); ++k) { - FieldIdT fId = *k; - const FieldSearchSpec & spec = specMap.find(fId)->second; - if (spec.searcher().substring() || qt->isSubstring()) { // we need a modifier for this field id - addIfNotPresent(fqtm, fId, qt); - if (_modifiers.getModifier(fId) == NULL) { - LOG(debug, "Create snippet modifier for field id '%u'", fId); - UTF8SubstringSnippetModifier::SP searcher - (new UTF8SubstringSnippetModifier(fId, _searchModifyBuf, _searchOffsetBuf)); - _modifiers.map()[fId] = std::make_unique<SnippetModifier>(searcher, _modifierBuf); - } - } - } - } - } - - // prepare modifiers - for (auto & entry : _modifiers.map()) { - FieldIdT fId = entry.first; - SnippetModifier & smod = static_cast<SnippetModifier &>(*entry.second); - smod.getSearcher()->prepare(fqtm[fId], _searchBuf); - } -} - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h deleted file mode 100644 index 4718ab8783a..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#pragma once - -#include "fieldsearchspec.h" -#include <vespa/vsm/common/charbuffer.h> -#include <vespa/vsm/common/document.h> -#include <vespa/vsm/common/fieldmodifier.h> -#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h> -#include <vespa/document/fieldvalue/fieldvalue.h> -#include <vespa/document/fieldvalue/iteratorhandler.h> - -namespace vsm { - -/** - * This class is responsible for modifying field values where we have substring search and that are used - * as input to snippet generation. - * - * The class implements the FieldModifier interface to modify field values, and the IteratorHandler interface - * to traverse complex field values. Primitive field values are passed to the underlying searcher that is - * responsible for modifying the field value by inserting unit separators before and after matches. - * A group separator is inserted between primitive field values the same way as done by FlattenDocsumWriter. - **/ -class SnippetModifier : public FieldModifier, public document::fieldvalue::IteratorHandler -{ -private: - UTF8SubstringSnippetModifier::SP _searcher; - CharBuffer::SP _valueBuf; // buffer to store the final modified field value - char _groupSep; - bool _useSep; - document::FieldPath _empty; - - void considerSeparator(); - // Inherrit doc from document::FieldValue::IteratorHandler - void onPrimitive(uint32_t, const Content & c) override; - void reset(); - -public: - /** - * Creates a new instance. - * - * @param searcher the searcher used to modify primitive field values. - **/ - SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher); - - /** - * Creates a new instance. - * - * @param searcher the searcher used to modify primitive field values. - * @param valueBuf the shared buffer used to store the final modified field value. - **/ - SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf); - - ~SnippetModifier(); - - /** - * Modifies the complete given field value. - **/ - document::FieldValue::UP modify(const document::FieldValue & fv) override { - return modify(fv, _empty); - } - - /** - * Modifies the given field value by passing all primitive field values to the searcher and - * inserting group separators between them. A string field value is returned. - * The iterating of the field value is limited by the given field path. - * - * @param fv the field value to modify. - * @param path the field path used to iterate the field value. - * @return the new modified field value. - **/ - document::FieldValue::UP modify(const document::FieldValue & fv, - const document::FieldPath & path) override; - - const CharBuffer & getValueBuf() const { return *_valueBuf; } - const UTF8SubstringSnippetModifier::SP & getSearcher() const { return _searcher; } -}; - -/** - * This class manages a set of snippet modifiers. - * The modifiers are instantiated and prepared in the setup function. - * This class also holds shared buffers that are used by the modifiers. - **/ -class SnippetModifierManager -{ -private: - FieldModifierMap _modifiers; - SharedSearcherBuf _searchBuf; - CharBuffer::SP _searchModifyBuf; - SharedOffsetBuffer _searchOffsetBuf; - CharBuffer::SP _modifierBuf; - -public: - SnippetModifierManager(); - ~SnippetModifierManager(); - - /** - * Setups snippet modifiers for all fields where we have substring search. - * - * @param queryTerms the query terms to take into consideration. - * @param specMap mapping from field id to search spec objects. - * @param fieldMap mapping from index (used in the query) to a list of field ids. - **/ - void setup(const search::streaming::QueryTermList & queryTerms, - const FieldSearchSpecMapT & specMap, const IndexFieldMapT & fieldMap); - - const FieldModifierMap & getModifiers() const { return _modifiers; } -}; - -} - diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp deleted file mode 100644 index 5507532d4f3..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "vsm-adapter.hpp" -#include "docsumconfig.h" -#include "i_matching_elements_filler.h" -#include <vespa/searchlib/common/matching_elements.h> - -#include <vespa/log/log.h> -LOG_SETUP(".vsm.vsm-adapter"); - -using search::docsummary::ResConfigEntry; -using search::docsummary::KeywordExtractor; -using search::MatchingElements; -using config::ConfigSnapshot; - -namespace vsm { - -GetDocsumsStateCallback::GetDocsumsStateCallback() : - _summaryFeatures(), - _rankFeatures(), - _matching_elements_filler() -{ } - -void GetDocsumsStateCallback::FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env) -{ - (void) env; - if (_summaryFeatures) { // set the summary features to write to the docsum - state->_summaryFeatures = _summaryFeatures; - state->_summaryFeaturesCached = true; - } -} - -void GetDocsumsStateCallback::FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env) -{ - (void) env; - if (_rankFeatures) { // set the rank features to write to the docsum - state->_rankFeatures = _rankFeatures; - } -} - -void GetDocsumsStateCallback::FillDocumentLocations(GetDocsumsState *state, IDocsumEnvironment * env) -{ - (void) state; - (void) env; -} - -std::unique_ptr<MatchingElements> -GetDocsumsStateCallback::fill_matching_elements(const search::MatchingElementsFields& fields) -{ - if (_matching_elements_filler) { - return _matching_elements_filler->fill_matching_elements(fields); - } - return std::make_unique<MatchingElements>(); -} - -void -GetDocsumsStateCallback::set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler) -{ - _matching_elements_filler = std::move(matching_elements_filler); -} - -GetDocsumsStateCallback::~GetDocsumsStateCallback() = default; - -DocsumTools::FieldSpec::FieldSpec() : - _outputName(), - _inputNames(), - _command(VsmsummaryConfig::Fieldmap::Command::NONE) -{ } - -DocsumTools::FieldSpec::~FieldSpec() = default; - -DocsumTools::DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer) : - _writer(std::move(writer)), - _juniper(), - _resultClass(), - _fieldSpecs() -{ } - - -DocsumTools::~DocsumTools() = default; - -bool -DocsumTools::obtainFieldNames(const FastS_VsmsummaryHandle &cfg) -{ - uint32_t defaultSummaryId = getResultConfig()->LookupResultClassId(cfg->outputclass); - _resultClass = getResultConfig()->LookupResultClass(defaultSummaryId); - if (_resultClass != NULL) { - for (uint32_t i = 0; i < _resultClass->GetNumEntries(); ++i) { - const ResConfigEntry * entry = _resultClass->GetEntry(i); - _fieldSpecs.push_back(FieldSpec()); - _fieldSpecs.back().setOutputName(entry->_bindname); - bool found = false; - if (cfg) { - // check if we have this summary field in the vsmsummary config - for (uint32_t j = 0; j < cfg->fieldmap.size() && !found; ++j) { - if (entry->_bindname == cfg->fieldmap[j].summary.c_str()) { - for (uint32_t k = 0; k < cfg->fieldmap[j].document.size(); ++k) { - _fieldSpecs.back().getInputNames().push_back(cfg->fieldmap[j].document[k].field); - } - _fieldSpecs.back().setCommand(cfg->fieldmap[j].command); - found = true; - } - } - } - if (!found) { - // use yourself as input - _fieldSpecs.back().getInputNames().push_back(entry->_bindname); - } - } - } else { - LOG(warning, "could not locate result class: '%s'", cfg->outputclass.c_str()); - } - return true; -} - -void -VSMAdapter::configure(const VSMConfigSnapshot & snapshot) -{ - std::lock_guard guard(_lock); - LOG(debug, "(re-)configure VSM (docsum tools)"); - - std::shared_ptr<SummaryConfig> summary(snapshot.getConfig<SummaryConfig>()); - std::shared_ptr<SummarymapConfig> summaryMap(snapshot.getConfig<SummarymapConfig>()); - std::shared_ptr<VsmsummaryConfig> vsmSummary(snapshot.getConfig<VsmsummaryConfig>()); - std::shared_ptr<JuniperrcConfig> juniperrc(snapshot.getConfig<JuniperrcConfig>()); - - _fieldsCfg.set(snapshot.getConfig<VsmfieldsConfig>().release()); - _fieldsCfg.latch(); - - LOG(debug, "configureFields(): Size of cfg fieldspec: %zd", _fieldsCfg.get()->fieldspec.size()); // UlfC: debugging - LOG(debug, "configureFields(): Size of cfg documenttype: %zd", _fieldsCfg.get()->documenttype.size()); // UlfC: debugging - LOG(debug, "configureSummary(): Size of cfg classes: %zd", summary->classes.size()); // UlfC: debugging - LOG(debug, "configureSummaryMap(): Size of cfg override: %zd", summaryMap->override.size()); // UlfC: debugging - LOG(debug, "configureVsmSummary(): Size of cfg fieldmap: %zd", vsmSummary->fieldmap.size()); // UlfC: debugging - LOG(debug, "configureVsmSummary(): outputclass='%s'", vsmSummary->outputclass.c_str()); // UlfC: debugging - - // init result config - std::unique_ptr<ResultConfig> resCfg(new ResultConfig()); - if ( ! resCfg->ReadConfig(*summary.get(), _configId.c_str())) { - throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed due to bad summary config"); - } - - // init keyword extractor - auto kwExtractor = std::make_unique<KeywordExtractor>(nullptr); - kwExtractor->AddLegalIndexSpec(_highlightindexes.c_str()); - vespalib::string spec = kwExtractor->GetLegalIndexSpec(); - LOG(debug, "index highlight spec: '%s'", spec.c_str()); - - // create dynamic docsum writer - auto writer = std::make_unique<DynamicDocsumWriter>(resCfg.release(), kwExtractor.release()); - - // configure juniper (used when configuring DynamicDocsumConfig) - _juniperProps = std::make_unique<JuniperProperties>(*juniperrc); - auto juniper = std::make_unique<juniper::Juniper>(_juniperProps.get(), &_wordFolder); - - // create new docsum tools - auto docsumTools = std::make_unique<DocsumTools>(std::move(writer)); - docsumTools->setJuniper(std::move(juniper)); - - // configure dynamic docsum writer - DynamicDocsumConfig dynDocsumConfig(docsumTools.get(), docsumTools->getDocsumWriter(), _fieldsCfg.get()); - dynDocsumConfig.configure(*summaryMap.get()); - - // configure new docsum tools - if (docsumTools->obtainFieldNames(vsmSummary)) { - // latch new docsum tools into production - _docsumTools.set(docsumTools.release()); - _docsumTools.latch(); - } else { - throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed"); - } -} - -VSMConfigSnapshot::VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot) - : _configId(configId), - _snapshot(std::make_unique<config::ConfigSnapshot>(snapshot)) -{ } -VSMConfigSnapshot::~VSMConfigSnapshot() = default; - -VSMAdapter::VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder) - : _highlightindexes(highlightindexes), - _configId(configId), - _wordFolder(wordFolder), - _fieldsCfg(), - _docsumTools(), - _juniperProps(), - _lock() -{ -} - - -VSMAdapter::~VSMAdapter() = default; - -} diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h deleted file mode 100644 index 6484269353b..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <vespa/searchlib/query/base.h> -#include <vespa/vsm/config/vsm-cfif.h> -#include <vespa/config-summary.h> -#include <vespa/config-summarymap.h> -#include <vespa/searchlib/common/featureset.h> -#include <vespa/searchsummary/docsummary/docsumwriter.h> -#include <vespa/searchsummary/docsummary/docsumstate.h> -#include <vespa/searchsummary/docsummary/idocsumenvironment.h> -#include <vespa/juniper/rpinterface.h> - -using search::docsummary::ResultConfig; -using search::docsummary::ResultClass; -using search::docsummary::IDocsumWriter; -using search::docsummary::DynamicDocsumWriter; -using search::docsummary::GetDocsumsState; -using search::docsummary::IDocsumEnvironment; -using search::docsummary::JuniperProperties; - -using vespa::config::search::SummaryConfig; -using vespa::config::search::SummarymapConfig; -using vespa::config::search::summary::JuniperrcConfig; - -namespace config { class ConfigSnapshot; } -namespace vsm { - -class IMatchingElementsFiller; - -class GetDocsumsStateCallback : public search::docsummary::GetDocsumsStateCallback -{ -private: - search::FeatureSet::SP _summaryFeatures; - search::FeatureSet::SP _rankFeatures; - std::unique_ptr<IMatchingElementsFiller> _matching_elements_filler; - -public: - GetDocsumsStateCallback(); - void FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override; - void FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override; - virtual void FillDocumentLocations(GetDocsumsState * state, IDocsumEnvironment * env); - virtual std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) override; - void setSummaryFeatures(const search::FeatureSet::SP & sf) { _summaryFeatures = sf; } - void setRankFeatures(const search::FeatureSet::SP & rf) { _rankFeatures = rf; } - void set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler); - ~GetDocsumsStateCallback(); -}; - -class DocsumTools : public IDocsumEnvironment -{ -public: - class FieldSpec { - private: - vespalib::string _outputName; - std::vector<vespalib::string> _inputNames; - VsmsummaryConfig::Fieldmap::Command _command; - - public: - FieldSpec(); - ~FieldSpec(); - const vespalib::string & getOutputName() const { return _outputName; } - void setOutputName(const vespalib::string & name) { _outputName = name; } - const std::vector<vespalib::string> & getInputNames() const { return _inputNames; } - std::vector<vespalib::string> & getInputNames() { return _inputNames; } - VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; } - void setCommand(VsmsummaryConfig::Fieldmap::Command command) { _command = command; } - }; - -private: - std::unique_ptr<DynamicDocsumWriter> _writer; - std::unique_ptr<juniper::Juniper> _juniper; - const ResultClass * _resultClass; - std::vector<FieldSpec> _fieldSpecs; - DocsumTools(const DocsumTools &); - DocsumTools &operator=(const DocsumTools &); - -public: - DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer); - ~DocsumTools(); - void setJuniper(std::unique_ptr<juniper::Juniper> juniper) { _juniper = std::move(juniper); } - ResultConfig *getResultConfig() const { return _writer->GetResultConfig(); } - DynamicDocsumWriter *getDocsumWriter() const { return _writer.get(); } - const ResultClass *getResultClass() const { return _resultClass; } - const std::vector<FieldSpec> & getFieldSpecs() const { return _fieldSpecs; } - bool obtainFieldNames(const FastS_VsmsummaryHandle &cfg); - - // inherit doc from IDocsumEnvironment - search::IAttributeManager * getAttributeManager() override { return NULL; } - vespalib::string lookupIndex(const vespalib::string&) const override { return ""; } - juniper::Juniper * getJuniper() override { return _juniper.get(); } -}; - -typedef std::shared_ptr<DocsumTools> DocsumToolsPtr; - -class VSMConfigSnapshot { -private: - const vespalib::string _configId; - std::unique_ptr<const config::ConfigSnapshot> _snapshot; -public: - VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot); - ~VSMConfigSnapshot(); - template <typename ConfigType> - std::unique_ptr<ConfigType> getConfig() const; -}; - -class VSMAdapter -{ -public: - VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder); - virtual ~VSMAdapter(); - - VsmfieldsHandle getFieldsConfig() const { return _fieldsCfg.get(); } - DocsumToolsPtr getDocsumTools() const { return _docsumTools.get(); } - void configure(const VSMConfigSnapshot & snapshot); -private: - vespalib::string _highlightindexes; - const vespalib::string _configId; - Fast_WordFolder & _wordFolder; - vespalib::PtrHolder<VsmfieldsConfig> _fieldsCfg; - vespalib::PtrHolder<DocsumTools> _docsumTools; - std::unique_ptr<JuniperProperties> _juniperProps; - - std::mutex _lock; - - VSMAdapter(const VSMAdapter &); - VSMAdapter &operator=(const VSMAdapter &); -}; - -} // namespace vsm - diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp deleted file mode 100644 index f071dbb2015..00000000000 --- a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "vsm-adapter.h" -#include <vespa/config/retriever/configsnapshot.hpp> - -namespace vsm { - -template <typename ConfigType> -std::unique_ptr<ConfigType> -VSMConfigSnapshot::getConfig() const -{ - return _snapshot->getConfig<ConfigType>(_configId); -} - -} // namespace vsm - |