diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-07-14 11:52:46 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-07-14 11:52:46 +0200 |
commit | 407cb10b3dbad7a3f072ea4e0352430df63aef08 (patch) | |
tree | e67fd7b943d55f0c3aa20a5c79ba3964d5cadcc1 | |
parent | 439da54cb6068d6097fc65bdd8e5d0e6d108d81a (diff) |
Remove unused TextExtractorDFW.
8 files changed, 1 insertions, 316 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp index 12e80fe7402..d7f46f5edcc 100644 --- a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp +++ b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp @@ -122,7 +122,7 @@ SummarySetup(const vespalib::string & baseDir, const DocTypeName & docTypeName, DynamicDocsumConfig dynCfg(this, _docsumWriter.get()); dynCfg.configure(summarymapCfg); for (const auto & o : summarymapCfg.override) { - if (o.command == "dynamicteaser" || o.command == "textextractor") { + if (o.command == "dynamicteaser") { vespalib::string markupField = o.arguments; if (markupField.empty()) continue; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index e262482cddb..6f89e95c641 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -35,6 +35,4 @@ vespa_add_library(searchsummary_docsummary OBJECT struct_map_attribute_combiner_dfw.cpp summaryfeaturesdfw.cpp summaryfieldconverter.cpp - textextractordfw.cpp - tokenizer.cpp ) diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp index fea11923858..c61ef4a0330 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp @@ -12,7 +12,6 @@ #include "matched_elements_filter_dfw.h" #include "positionsdfw.h" #include "rankfeaturesdfw.h" -#include "textextractordfw.h" #include "summaryfeaturesdfw.h" #include <vespa/searchlib/common/matching_elements_fields.h> #include <vespa/vespalib/util/stringfmt.h> @@ -43,14 +42,6 @@ DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string & } else { throw IllegalArgumentException("Missing argument"); } - } else if (overrideName == "textextractor") { - if ( ! argument.empty() ) { - TextExtractorDFW * fw = new TextExtractorDFW(); - fieldWriter.reset(fw); - rc = fw->init(fieldName, argument, resultConfig); - } else { - throw IllegalArgumentException("Missing argument"); - } } else if (overrideName == "summaryfeatures") { fieldWriter = std::make_unique<SummaryFeaturesDFW>(getEnvironment()); rc = true; diff --git a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h deleted file mode 100644 index 83da9e4da15..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include <vespa/vespalib/stllike/string.h> - -namespace search::docsummary { - -/** - * Interface for a tokenizer. - */ -class ITokenizer -{ -public: - /** - * Representation of a token with type and text and optional stemmed variant. - */ - class Token - { - public: - enum Type { - WORD, // Fast_UnicodeUtil::IsWordChar() returns true - NON_WORD, // Fast_UnicodeUtil::IsWordChar() returns false - PUNCTUATION, // Fast_UnicodeUtil::IsTerminalPunctuationChar() returns true - ANNOTATION, // Interlinear annotation - NOT_DEF - }; - private: - vespalib::stringref _text; - vespalib::stringref _stem; - Type _type; - - public: - Token(const char * textBegin, const char * textEnd, Type type) : - _text(textBegin, textEnd - textBegin), _stem(), _type(type) {} - Token(const char * textBegin, const char * textEnd, const char * stemBegin, const char * stemEnd, Type type) : - _text(textBegin, textEnd - textBegin), _stem(stemBegin, stemEnd - stemBegin), _type(type) {} - vespalib::stringref getText() const { return _text; } - vespalib::stringref getStem() const { return _stem; } - bool hasStem() const { return _stem.data() != NULL; } - Type getType() const { return _type; } - }; - - virtual ~ITokenizer() {} - - /** - * Reset the tokenizer using the given buffer. - */ - virtual void reset(const char * buf, size_t len) = 0; - - /** - * Returns the size of the underlying buffer. - */ - virtual size_t getBufferSize() const = 0; - - /** - * Returns true if the text buffer has more tokens. - */ - virtual bool hasMoreTokens() = 0; - - /** - * Returns the next token from the text buffer. - */ - virtual Token getNextToken() = 0; -}; - -} - diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp deleted file mode 100644 index dc6d9524ee4..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "textextractordfw.h" -#include "docsumstate.h" -#include "general_result.h" -#include "tokenizer.h" -#include "resultconfig.h" -#include <vespa/vespalib/data/slime/inserter.h> - -#include <vespa/log/log.h> -LOG_SETUP(".searchlib.docsummary.textextractordfw"); - -namespace search::docsummary { - -TextExtractorDFW::TextExtractorDFW() : - _inputFieldEnum(-1) -{ -} - -bool -TextExtractorDFW::init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config) -{ - _inputFieldEnum = config.GetFieldNameEnum().Lookup(inputField.c_str()); - if (_inputFieldEnum == -1) { - LOG(warning, "Did not find input field '%s' as part of the docsum fields when initializing writer for field '%s'", - inputField.c_str(), fieldName.c_str()); - return false; - } - return true; -} - -void -TextExtractorDFW::insertField(uint32_t, GeneralResult *gres, GetDocsumsState *, ResType, - vespalib::slime::Inserter &target) -{ - vespalib::string extracted; - ResEntry * entry = gres->GetPresentEntryFromEnumValue(_inputFieldEnum); - if (entry != nullptr) { - const char * buf = nullptr; - uint32_t buflen = 0; - entry->_resolve_field(&buf, &buflen); - // extract the text - Tokenizer tokenizer(buf, buflen); - while (tokenizer.hasMoreTokens()) { - Tokenizer::Token token = tokenizer.getNextToken(); - extracted.append(token.getText()); - } - } else { - LOG(warning, "Did not find input entry using field enum %d. Write an empty field", _inputFieldEnum); - } - target.insertString(vespalib::Memory(extracted.c_str(), extracted.size())); -} - -} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h deleted file mode 100644 index 3bce2ae5cd7..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "docsum_field_writer.h" - -namespace search::docsummary { - -class ResultConfig; - -/** - * This is the docsum field writer used to extract the original text from a disk summary on the juniper format. - **/ -class TextExtractorDFW : public DocsumFieldWriter -{ -private: - TextExtractorDFW(const TextExtractorDFW &); - TextExtractorDFW & operator=(const TextExtractorDFW &); - - int _inputFieldEnum; - -public: - TextExtractorDFW(); - ~TextExtractorDFW() override = default; - bool init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config); - bool IsGenerated() const override { return false; } - void insertField(uint32_t docid, GeneralResult *gres, GetDocsumsState *state, - ResType type, vespalib::slime::Inserter &target) override; -}; - -} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp deleted file mode 100644 index 44dbedb4fe9..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "tokenizer.h" -#include <cassert> - -namespace search::docsummary { - -Tokenizer::Token::Type -Tokenizer::getTokenType(ucs4_t ch) const -{ - if (Fast_UnicodeUtil::IsWordChar(ch)) { - return Token::WORD; - } else { - if (Fast_UnicodeUtil::IsTerminalPunctuationChar(ch)) { - return Token::PUNCTUATION; - } else { - return Token::NON_WORD; - } - } -} - -Tokenizer::Tokenizer(const char * buf, size_t len) : - _pos(buf), - _begin(buf), - _end(buf + len), - _tokenBegin(buf), - _type(Token::NOT_DEF), - _hasMoreTokens(_pos < _end) -{ -} - -void -Tokenizer::reset(const char * buf, size_t len) -{ - _pos = buf; - _begin = buf; - _end = buf + len; - _tokenBegin = buf; - _type = Token::NOT_DEF; - _hasMoreTokens = (_pos < _end); -} - -bool -Tokenizer::hasMoreTokens() -{ - return _hasMoreTokens; -} - -Tokenizer::Token -Tokenizer::getNextToken() -{ - const char * textBegin = _tokenBegin; - const char * textEnd = _pos; - const char * stemBegin = NULL; - const char * stemEnd = NULL; - const char * next = _pos; - bool insideAnnotation = false; - for (; _pos < _end; ) { - ucs4_t ch; - if (static_cast<unsigned char>(*next) < 0x80) { - ch = *next++; - if (ch == 0x1F) { // unit separator - Token t(textBegin, textEnd, stemBegin, stemEnd, _type); - _pos = next; // advance to next char - _tokenBegin = next; // the next token begins at the next char - _type = Token::NOT_DEF; // reset the token type - if (_pos == _end) { // this is the last token - _hasMoreTokens = false; - } - return t; - } - } else { - ch = Fast_UnicodeUtil::GetUTF8CharNonAscii(next); // updates next to the next utf8 character - if (ch == 0xFFF9) { // anchor - insideAnnotation = true; - textBegin = next; - _type = Token::ANNOTATION; - } - } - if (!insideAnnotation) { - Token::Type tmpType = getTokenType(ch); - if (_type != Token::NOT_DEF && _type != tmpType) { // we found a new token type - Token t(textBegin, textEnd, stemBegin, stemEnd, _type); - _tokenBegin = _pos; // the next token begins at this char - _pos = next; // advance to next char - _type = tmpType; // remember the new token type - return t; - } - _type = tmpType; - textEnd = next; // advance to next char - } else { // inside annotation - if (ch == 0xFFFA) { // separator - textEnd = _pos; - stemBegin = next; - } else if (ch == 0xFFFB && stemBegin != NULL) { // terminator - stemEnd = _pos; - insideAnnotation = false; - } - } - - _pos = next; - } - assert(_pos == _end); - _hasMoreTokens = false; - return Token(textBegin, _pos, _type); // return the last token -} - -} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h deleted file mode 100644 index 77acc97aec9..00000000000 --- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "itokenizer.h" -#include <vespa/fastlib/text/unicodeutil.h> - -namespace search::docsummary { - -/** - * This class is used to tokenize an utf-8 text buffer into tokens of type - * WORD, NON_WORD, PUNCTUATION, and ANNOTATION. - * - * Functions in Fast_UnicodeUtil are used to determine word characters and terminal punctuation characters. - * The unit separator 0x1F is always treated as a token separator. The unit separator itself is not returned as a token. - * Interlinear annotation (0xFFF9 original 0xFFFA stemmed 0xFFFB) is used to specify the stemmed variant of a word. - * The annotation characters are not returned as part of a token. - */ -class Tokenizer : public ITokenizer -{ -private: - const char * _pos; // the current position in the input buffer - const char * _begin; // the begin of input buffer - const char * _end; // the end of the input buffer - const char * _tokenBegin; // the start of the next token - Token::Type _type; // the type of the current position - bool _hasMoreTokens; // do we have more tokens - - Token::Type getTokenType(ucs4_t ch) const; - -public: - /** - * Creates a new tokenizer for the given utf-8 text buffer. - */ - Tokenizer(const char * buf, size_t len); - - void reset(const char * buf, size_t len) override; - size_t getBufferSize() const override { return _end - _begin; } - bool hasMoreTokens() override; - Token getNextToken() override; -}; - -} |