Remove unused TextExtractorDFW.

author: Tor Egge <Tor.Egge@online.no> 2022-07-14 11:52:46 +0200
committer: Tor Egge <Tor.Egge@online.no> 2022-07-14 11:52:46 +0200
commit: 407cb10b3dbad7a3f072ea4e0352430df63aef08 (patch)
tree: e67fd7b943d55f0c3aa20a5c79ba3964d5cadcc1
parent: 439da54cb6068d6097fc65bdd8e5d0e6d108d81a (diff)
8 files changed, 1 insertion, 316 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
index 12e80fe7402..d7f46f5edcc 100644
--- a/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
+++ b/searchcore/src/vespa/searchcore/proton/docsummary/summarymanager.cpp
@@ -122,7 +122,7 @@ SummarySetup(const vespalib::string & baseDir, const DocTypeName & docTypeName,
     DynamicDocsumConfig dynCfg(this, _docsumWriter.get());
     dynCfg.configure(summarymapCfg);
     for (const auto & o : summarymapCfg.override) {
-        if (o.command == "dynamicteaser" || o.command == "textextractor") {
+        if (o.command == "dynamicteaser") {
             vespalib::string markupField = o.arguments;
             if (markupField.empty())
                 continue;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
index e262482cddb..6f89e95c641 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
+++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt
@@ -35,6 +35,4 @@ vespa_add_library(searchsummary_docsummary OBJECT
     struct_map_attribute_combiner_dfw.cpp
     summaryfeaturesdfw.cpp
     summaryfieldconverter.cpp
-    textextractordfw.cpp
-    tokenizer.cpp
 )
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
index fea11923858..c61ef4a0330 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/docsumconfig.cpp
@@ -12,7 +12,6 @@
 #include "matched_elements_filter_dfw.h"
 #include "positionsdfw.h"
 #include "rankfeaturesdfw.h"
-#include "textextractordfw.h"
 #include "summaryfeaturesdfw.h"
 #include <vespa/searchlib/common/matching_elements_fields.h>
 #include <vespa/vespalib/util/stringfmt.h>
@@ -43,14 +42,6 @@ DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string &
         } else {
             throw IllegalArgumentException("Missing argument");
         }
-    } else if (overrideName == "textextractor") {
-        if ( ! argument.empty() ) {
-            TextExtractorDFW * fw = new TextExtractorDFW();
-            fieldWriter.reset(fw);
-            rc = fw->init(fieldName, argument, resultConfig);
-        } else {
-            throw IllegalArgumentException("Missing argument");
-        }
     } else if (overrideName == "summaryfeatures") {
         fieldWriter = std::make_unique<SummaryFeaturesDFW>(getEnvironment());
         rc = true;
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h
deleted file mode 100644
index 83da9e4da15..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/itokenizer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include <vespa/vespalib/stllike/string.h>
-
-namespace search::docsummary {
-
-/**
- * Interface for a tokenizer.
- */
-class ITokenizer
-{
-public:
-    /**
-     * Representation of a token with type and text and optional stemmed variant.
-     */
-    class Token
-    {
-    public:
-        enum Type {
-            WORD,        // Fast_UnicodeUtil::IsWordChar() returns true
-            NON_WORD,    // Fast_UnicodeUtil::IsWordChar() returns false
-            PUNCTUATION, // Fast_UnicodeUtil::IsTerminalPunctuationChar() returns true
-            ANNOTATION,  // Interlinear annotation
-            NOT_DEF
-        };
-    private:
-        vespalib::stringref _text;
-        vespalib::stringref _stem;
-        Type                _type;
-
-    public:
-        Token(const char * textBegin, const char * textEnd, Type type) :
-            _text(textBegin, textEnd - textBegin), _stem(), _type(type) {}
-        Token(const char * textBegin, const char * textEnd, const char * stemBegin, const char * stemEnd, Type type) :
-            _text(textBegin, textEnd - textBegin), _stem(stemBegin, stemEnd - stemBegin), _type(type) {}
-        vespalib::stringref getText() const { return _text; }
-        vespalib::stringref getStem() const { return _stem; }
-        bool hasStem() const { return _stem.data() != NULL; }
-        Type getType() const { return _type; }
-    };
-
-    virtual ~ITokenizer() {}
-
-    /**
-     * Reset the tokenizer using the given buffer.
-     */
-    virtual void reset(const char * buf, size_t len) = 0;
-
-    /**
-     * Returns the size of the underlying buffer.
-     */
-    virtual size_t getBufferSize() const = 0;
-
-    /**
-     * Returns true if the text buffer has more tokens.
-     */
-    virtual bool hasMoreTokens() = 0;
-
-    /**
-     * Returns the next token from the text buffer.
-     */
-    virtual Token getNextToken() = 0;
-};
-
-}
-
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp
deleted file mode 100644
index dc6d9524ee4..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "textextractordfw.h"
-#include "docsumstate.h"
-#include "general_result.h"
-#include "tokenizer.h"
-#include "resultconfig.h"
-#include <vespa/vespalib/data/slime/inserter.h>
-
-#include <vespa/log/log.h>
-LOG_SETUP(".searchlib.docsummary.textextractordfw");
-
-namespace search::docsummary {
-
-TextExtractorDFW::TextExtractorDFW() :
-    _inputFieldEnum(-1)
-{
-}
-
-bool
-TextExtractorDFW::init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config)
-{
-    _inputFieldEnum = config.GetFieldNameEnum().Lookup(inputField.c_str());
-    if (_inputFieldEnum == -1) {
-        LOG(warning, "Did not find input field '%s' as part of the docsum fields when initializing writer for field '%s'",
-            inputField.c_str(), fieldName.c_str());
-        return false;
-    }
-    return true;
-}
-
-void
-TextExtractorDFW::insertField(uint32_t, GeneralResult *gres, GetDocsumsState *, ResType,
-                              vespalib::slime::Inserter &target)
-{
-    vespalib::string extracted;
-    ResEntry * entry = gres->GetPresentEntryFromEnumValue(_inputFieldEnum);
-    if (entry != nullptr) {
-        const char * buf = nullptr;
-        uint32_t buflen = 0;
-        entry->_resolve_field(&buf, &buflen);
-        // extract the text
-        Tokenizer tokenizer(buf, buflen);
-        while (tokenizer.hasMoreTokens()) {
-            Tokenizer::Token token = tokenizer.getNextToken();
-            extracted.append(token.getText());
-        }
-    } else {
-        LOG(warning, "Did not find input entry using field enum %d. Write an empty field", _inputFieldEnum);
-    }
-    target.insertString(vespalib::Memory(extracted.c_str(), extracted.size()));
-}
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h b/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h
deleted file mode 100644
index 3bce2ae5cd7..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/textextractordfw.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include "docsum_field_writer.h"
-
-namespace search::docsummary {
-
-class ResultConfig;
-
-/**
- * This is the docsum field writer used to extract the original text from a disk summary on the juniper format.
- **/
-class TextExtractorDFW : public DocsumFieldWriter
-{
-private:
-    TextExtractorDFW(const TextExtractorDFW &);
-    TextExtractorDFW & operator=(const TextExtractorDFW &);
-
-    int _inputFieldEnum;
-
-public:
-    TextExtractorDFW();
-    ~TextExtractorDFW() override = default;
-    bool init(const vespalib::string & fieldName, const vespalib::string & inputField, const ResultConfig & config);
-    bool IsGenerated() const override { return false; }
-    void insertField(uint32_t docid, GeneralResult *gres, GetDocsumsState *state,
-                     ResType type, vespalib::slime::Inserter &target) override;
-};
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp
deleted file mode 100644
index 44dbedb4fe9..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "tokenizer.h"
-#include <cassert>
-
-namespace search::docsummary {
-
-Tokenizer::Token::Type
-Tokenizer::getTokenType(ucs4_t ch) const
-{
-    if (Fast_UnicodeUtil::IsWordChar(ch)) {
-        return Token::WORD;
-    } else {
-        if (Fast_UnicodeUtil::IsTerminalPunctuationChar(ch)) {
-            return Token::PUNCTUATION;
-        } else {
-            return Token::NON_WORD;
-        }
-    }
-}
-
-Tokenizer::Tokenizer(const char * buf, size_t len) :
-    _pos(buf),
-    _begin(buf),
-    _end(buf + len),
-    _tokenBegin(buf),
-    _type(Token::NOT_DEF),
-    _hasMoreTokens(_pos < _end)
-{
-}
-
-void
-Tokenizer::reset(const char * buf, size_t len)
-{
-    _pos = buf;
-    _begin = buf;
-    _end = buf + len;
-    _tokenBegin = buf;
-    _type = Token::NOT_DEF;
-    _hasMoreTokens = (_pos < _end);
-}
-
-bool
-Tokenizer::hasMoreTokens()
-{
-    return _hasMoreTokens;
-}
-
-Tokenizer::Token
-Tokenizer::getNextToken()
-{
-    const char * textBegin = _tokenBegin;
-    const char * textEnd = _pos;
-    const char * stemBegin = NULL;
-    const char * stemEnd = NULL;
-    const char * next = _pos;
-    bool insideAnnotation = false;
-    for (; _pos < _end; ) {
-        ucs4_t ch;
-        if (static_cast<unsigned char>(*next) < 0x80) {
-            ch = *next++;
-            if (ch == 0x1F) { // unit separator
-                Token t(textBegin, textEnd, stemBegin, stemEnd, _type);
-                _pos = next; // advance to next char
-                _tokenBegin = next; // the next token begins at the next char
-                _type = Token::NOT_DEF; // reset the token type
-                if (_pos == _end) { // this is the last token
-                    _hasMoreTokens = false;
-                }
-                return t;
-            }
-        } else {
-            ch = Fast_UnicodeUtil::GetUTF8CharNonAscii(next); // updates next to the next utf8 character
-            if (ch == 0xFFF9) { // anchor
-                insideAnnotation = true;
-                textBegin = next;
-                _type = Token::ANNOTATION;
-            }
-        }
-        if (!insideAnnotation) {
-            Token::Type tmpType = getTokenType(ch);
-            if (_type != Token::NOT_DEF && _type != tmpType) { // we found a new token type
-                Token t(textBegin, textEnd, stemBegin, stemEnd, _type);
-                _tokenBegin = _pos; // the next token begins at this char
-                _pos = next; // advance to next char
-                _type = tmpType; // remember the new token type
-                return t;
-            }
-            _type = tmpType;
-            textEnd = next; // advance to next char
-        } else { // inside annotation
-            if (ch == 0xFFFA) { // separator
-                textEnd = _pos;
-                stemBegin = next;
-            } else if (ch == 0xFFFB && stemBegin != NULL) { // terminator
-                stemEnd = _pos;
-                insideAnnotation = false;
-            }
-        }
-
-        _pos = next;
-    }
-    assert(_pos == _end);
-    _hasMoreTokens = false;
-    return Token(textBegin, _pos, _type); // return the last token
-}
-
-}
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h b/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h
deleted file mode 100644
index 77acc97aec9..00000000000
--- a/searchsummary/src/vespa/searchsummary/docsummary/tokenizer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include "itokenizer.h"
-#include <vespa/fastlib/text/unicodeutil.h>
-
-namespace search::docsummary {
-
-/**
- * This class is used to tokenize an utf-8 text buffer into tokens of type
- * WORD, NON_WORD, PUNCTUATION, and ANNOTATION.
- *
- * Functions in Fast_UnicodeUtil are used to determine word characters and terminal punctuation characters.
- * The unit separator 0x1F is always treated as a token separator. The unit separator itself is not returned as a token.
- * Interlinear annotation (0xFFF9 original 0xFFFA stemmed 0xFFFB) is used to specify the stemmed variant of a word.
- * The annotation characters are not returned as part of a token.
- */
-class Tokenizer : public ITokenizer
-{
-private:
-    const char * _pos;   // the current position in the input buffer
-    const char * _begin; // the begin of input buffer
-    const char * _end;   // the end of the input buffer
-    const char * _tokenBegin; // the start of the next token
-    Token::Type  _type;  // the type of the current position
-    bool         _hasMoreTokens; // do we have more tokens
-
-    Token::Type getTokenType(ucs4_t ch) const;
-
-public:
-    /**
-     * Creates a new tokenizer for the given utf-8 text buffer.
-     */
-    Tokenizer(const char * buf, size_t len);
-
-    void reset(const char * buf, size_t len) override;
-    size_t getBufferSize() const override { return _end - _begin; }
-    bool hasMoreTokens() override;
-    Token getNextToken() override;
-};
-
-}
author	Tor Egge <Tor.Egge@online.no>	2022-07-14 11:52:46 +0200
committer	Tor Egge <Tor.Egge@online.no>	2022-07-14 11:52:46 +0200
commit	407cb10b3dbad7a3f072ea4e0352430df63aef08 (patch)
tree	e67fd7b943d55f0c3aa20a5c79ba3964d5cadcc1
parent	439da54cb6068d6097fc65bdd8e5d0e6d108d81a (diff)