aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2022-07-14 11:52:46 +0200
committer Tor Egge <Tor.Egge@online.no> 2022-07-14 11:52:46 +0200
commit 407cb10b3dbad7a3f072ea4e0352430df63aef08 (patch)
tree e67fd7b943d55f0c3aa20a5c79ba3964d5cadcc1
parent 439da54cb6068d6097fc65bdd8e5d0e6d108d81a (diff)
Remove unused TextExtractorDFW.
-rw-r--r-- searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp 2
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt 2
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp 9
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h 68
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp 54
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h 31
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp 108
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h 43
8 files changed, 1 insertions, 316 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
index 12e80fe7402..d7f46f5edcc 100644
--- a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
+++ b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
@@ -122,7 +122,7 @@ SummarySetup(const vespalib::string & baseDir, const DocTypeName & docTypeName,
DynamicDocsumConfig dynCfg(this, _docsumWriter.get());
dynCfg.configure(summarymapCfg);
for (const auto & o : summarymapCfg.override) {
- if (o.command == "dynamicteaser" || o.command == "textextractor") {
+ if (o.command == "dynamicteaser") {
vespalib::string markupField = o.arguments;
if (markupField.empty())
continue;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index e262482cddb..6f89e95c641 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -35,6 +35,4 @@ vespa_add_library(searchsummary_docsummary OBJECT
struct_map_attribute_combiner_dfw.cpp
summaryfeaturesdfw.cpp
summaryfieldconverter.cpp
- textextractordfw.cpp
- tokenizer.cpp
)
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
index fea11923858..c61ef4a0330 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
@@ -12,7 +12,6 @@
#include "matched_elements_filter_dfw.h"
#include "positionsdfw.h"
#include "rankfeaturesdfw.h"
-#include "textextractordfw.h"
#include "summaryfeaturesdfw.h"
#include <vespa/searchlib/common/matching_elements_fields.h>
#include <vespa/vespalib/util/stringfmt.h>
@@ -43,14 +42,6 @@ DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string &
} else {
throw IllegalArgumentException("Missing argument");
}
- } else if (overrideName == "textextractor") {
- if ( ! argument.empty() ) {
- TextExtractorDFW * fw = new TextExtractorDFW();
- fieldWriter.reset(fw);
- rc = fw->init(fieldName, argument, resultConfig);
- } else {
- throw IllegalArgumentException("Missing argument");
- }
} else if (overrideName == "summaryfeatures") {
fieldWriter = std::make_unique<SummaryFeaturesDFW>(getEnvironment());
rc = true;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h
deleted file mode 100644
index 83da9e4da15..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include <vespa/vespalib/stllike/string.h>
-
-namespace search::docsummary {
-
-/**
- * Interface for a tokenizer.
- */
-class ITokenizer
-{
-public:
- /**
- * Representation of a token with type and text and optional stemmed variant.
- */
- class Token
- {
- public:
- enum Type {
- WORD, // Fast_UnicodeUtil::IsWordChar() returns true
- NON_WORD, // Fast_UnicodeUtil::IsWordChar() returns false
- PUNCTUATION, // Fast_UnicodeUtil::IsTerminalPunctuationChar() returns true
- ANNOTATION, // Interlinear annotation
- NOT_DEF
- };
- private:
- vespalib::stringref _text;
- vespalib::stringref _stem;
- Type _type;
-
- public:
- Token(const char * textBegin, const char * textEnd, Type type) :
- _text(textBegin, textEnd - textBegin), _stem(), _type(type) {}
- Token(const char * textBegin, const char * textEnd, const char * stemBegin, const char * stemEnd, Type type) :
- _text(textBegin, textEnd - textBegin), _stem(stemBegin, stemEnd - stemBegin), _type(type) {}
- vespalib::stringref getText() const { return _text; }
- vespalib::stringref getStem() const { return _stem; }
- bool hasStem() const { return _stem.data() != NULL; }
- Type getType() const { return _type; }
- };
-
- virtual ~ITokenizer() {}
-
- /**
- * Reset the tokenizer using the given buffer.
- */
- virtual void reset(const char * buf, size_t len) = 0;
-
- /**
- * Returns the size of the underlying buffer.
- */
- virtual size_t getBufferSize() const = 0;
-
- /**
- * Returns true if the text buffer has more tokens.
- */
- virtual bool hasMoreTokens() = 0;
-
- /**
- * Returns the next token from the text buffer.
- */
- virtual Token getNextToken() = 0;
-};
-
-}
-
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp
deleted file mode 100644
index dc6d9524ee4..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "textextractordfw.h"
-#include "docsumstate.h"
-#include "general_result.h"
-#include "tokenizer.h"
-#include "resultconfig.h"
-#include <vespa/vespalib/data/slime/inserter.h>
-
-#include <vespa/log/log.h>
-LOG_SETUP(".searchlib.docsummary.textextractordfw");
-
-namespace search::docsummary {
-
-TextExtractorDFW::TextExtractorDFW() :
- _inputFieldEnum(-1)
-{
-}
-
-bool
-TextExtractorDFW::init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config)
-{
- _inputFieldEnum = config.GetFieldNameEnum().Lookup(inputField.c_str());
- if (_inputFieldEnum == -1) {
- LOG(warning, "Did not find input field '%s' as part of the docsum fields when initializing writer for field '%s'",
- inputField.c_str(), fieldName.c_str());
- return false;
- }
- return true;
-}
-
-void
-TextExtractorDFW::insertField(uint32_t, GeneralResult *gres, GetDocsumsState *, ResType,
- vespalib::slime::Inserter &target)
-{
- vespalib::string extracted;
- ResEntry * entry = gres->GetPresentEntryFromEnumValue(_inputFieldEnum);
- if (entry != nullptr) {
- const char * buf = nullptr;
- uint32_t buflen = 0;
- entry->_resolve_field(&buf, &buflen);
- // extract the text
- Tokenizer tokenizer(buf, buflen);
- while (tokenizer.hasMoreTokens()) {
- Tokenizer::Token token = tokenizer.getNextToken();
- extracted.append(token.getText());
- }
- } else {
- LOG(warning, "Did not find input entry using field enum %d. Write an empty field", _inputFieldEnum);
- }
- target.insertString(vespalib::Memory(extracted.c_str(), extracted.size()));
-}
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h
deleted file mode 100644
index 3bce2ae5cd7..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include "docsum_field_writer.h"
-
-namespace search::docsummary {
-
-class ResultConfig;
-
-/**
- * This is the docsum field writer used to extract the original text from a disk summary on the juniper format.
- **/
-class TextExtractorDFW : public DocsumFieldWriter
-{
-private:
- TextExtractorDFW(const TextExtractorDFW &);
- TextExtractorDFW & operator=(const TextExtractorDFW &);
-
- int _inputFieldEnum;
-
-public:
- TextExtractorDFW();
- ~TextExtractorDFW() override = default;
- bool init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config);
- bool IsGenerated() const override { return false; }
- void insertField(uint32_t docid, GeneralResult *gres, GetDocsumsState *state,
- ResType type, vespalib::slime::Inserter &target) override;
-};
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp
deleted file mode 100644
index 44dbedb4fe9..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "tokenizer.h"
-#include <cassert>
-
-namespace search::docsummary {
-
-Tokenizer::Token::Type
-Tokenizer::getTokenType(ucs4_t ch) const
-{
- if (Fast_UnicodeUtil::IsWordChar(ch)) {
- return Token::WORD;
- } else {
- if (Fast_UnicodeUtil::IsTerminalPunctuationChar(ch)) {
- return Token::PUNCTUATION;
- } else {
- return Token::NON_WORD;
- }
- }
-}
-
-Tokenizer::Tokenizer(const char * buf, size_t len) :
- _pos(buf),
- _begin(buf),
- _end(buf + len),
- _tokenBegin(buf),
- _type(Token::NOT_DEF),
- _hasMoreTokens(_pos < _end)
-{
-}
-
-void
-Tokenizer::reset(const char * buf, size_t len)
-{
- _pos = buf;
- _begin = buf;
- _end = buf + len;
- _tokenBegin = buf;
- _type = Token::NOT_DEF;
- _hasMoreTokens = (_pos < _end);
-}
-
-bool
-Tokenizer::hasMoreTokens()
-{
- return _hasMoreTokens;
-}
-
-Tokenizer::Token
-Tokenizer::getNextToken()
-{
- const char * textBegin = _tokenBegin;
- const char * textEnd = _pos;
- const char * stemBegin = NULL;
- const char * stemEnd = NULL;
- const char * next = _pos;
- bool insideAnnotation = false;
- for (; _pos < _end; ) {
- ucs4_t ch;
- if (static_cast<unsigned char>(*next) < 0x80) {
- ch = *next++;
- if (ch == 0x1F) { // unit separator
- Token t(textBegin, textEnd, stemBegin, stemEnd, _type);
- _pos = next; // advance to next char
- _tokenBegin = next; // the next token begins at the next char
- _type = Token::NOT_DEF; // reset the token type
- if (_pos == _end) { // this is the last token
- _hasMoreTokens = false;
- }
- return t;
- }
- } else {
- ch = Fast_UnicodeUtil::GetUTF8CharNonAscii(next); // updates next to the next utf8 character
- if (ch == 0xFFF9) { // anchor
- insideAnnotation = true;
- textBegin = next;
- _type = Token::ANNOTATION;
- }
- }
- if (!insideAnnotation) {
- Token::Type tmpType = getTokenType(ch);
- if (_type != Token::NOT_DEF && _type != tmpType) { // we found a new token type
- Token t(textBegin, textEnd, stemBegin, stemEnd, _type);
- _tokenBegin = _pos; // the next token begins at this char
- _pos = next; // advance to next char
- _type = tmpType; // remember the new token type
- return t;
- }
- _type = tmpType;
- textEnd = next; // advance to next char
- } else { // inside annotation
- if (ch == 0xFFFA) { // separator
- textEnd = _pos;
- stemBegin = next;
- } else if (ch == 0xFFFB && stemBegin != NULL) { // terminator
- stemEnd = _pos;
- insideAnnotation = false;
- }
- }
-
- _pos = next;
- }
- assert(_pos == _end);
- _hasMoreTokens = false;
- return Token(textBegin, _pos, _type); // return the last token
-}
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h
deleted file mode 100644
index 77acc97aec9..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include "itokenizer.h"
-#include <vespa/fastlib/text/unicodeutil.h>
-
-namespace search::docsummary {
-
-/**
- * This class is used to tokenize an utf-8 text buffer into tokens of type
- * WORD, NON_WORD, PUNCTUATION, and ANNOTATION.
- *
- * Functions in Fast_UnicodeUtil are used to determine word characters and terminal punctuation characters.
- * The unit separator 0x1F is always treated as a token separator. The unit separator itself is not returned as a token.
- * Interlinear annotation (0xFFF9 original 0xFFFA stemmed 0xFFFB) is used to specify the stemmed variant of a word.
- * The annotation characters are not returned as part of a token.
- */
-class Tokenizer : public ITokenizer
-{
-private:
- const char * _pos; // the current position in the input buffer
- const char * _begin; // the begin of input buffer
- const char * _end; // the end of the input buffer
- const char * _tokenBegin; // the start of the next token
- Token::Type _type; // the type of the current position
- bool _hasMoreTokens; // do we have more tokens
-
- Token::Type getTokenType(ucs4_t ch) const;
-
-public:
- /**
- * Creates a new tokenizer for the given utf-8 text buffer.
- */
- Tokenizer(const char * buf, size_t len);
-
- void reset(const char * buf, size_t len) override;
- size_t getBufferSize() const override { return _end - _begin; }
- bool hasMoreTokens() override;
- Token getNextToken() override;
-};
-
-}