// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 *****************************************************************************
 * @author Bård Kvalheim
 * @date Creation date: 2001-12-07
 *
 * A configurable tokenizer template that accepts two predicates: One to
 * determine separator symbols and one to determine punctuation symbols. A
 * typedef is defined that uses isspace/1 and ispunct/1.
 *
 * This tokenizer does not alter the text, and does not copy it.
 *
 * This tokenizer is not meant to be used as a real tokenizer for all
 * languages. It is only a fast and simple latin tokenizer, intended for
 * very basic applications.
 *
 * The tokens are returned as (char *, char *, bool) triples. The two
 * first elements delimit the token string, while the third element is
 * true if the token is a punctuation symbol.
 *
 * If the last character in the input text is a punctuation symbol, the last
 * token is the following:
 *
 * text = " something bl bla ."
 *
 * token.first       -> .
 * token.second      -> \0
 * token._punctuation = true;
 *
 * In other words, token.second can point to the terminating '\0' in the input
 * text.
 *
 *****************************************************************************/

#pragma once

#include <cctype>
#include <cstddef>

/**
 *****************************************************************************
 * A simple tokenizer. See description above.
 *
 * IsSeparator and IsPunctuation are default-constructible function objects
 * callable with a single char and returning something convertible to bool.
 *
 * @class Fast_LatinTokenizer
 * @author Bård Kvalheim
 * @date Creation date: 2001-12-07
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
class Fast_LatinTokenizer {
private:
    // Not copyable: declared private and intentionally left undefined.
    Fast_LatinTokenizer(const Fast_LatinTokenizer &);
    Fast_LatinTokenizer& operator=(const Fast_LatinTokenizer &);

public:

    /** Helper class. A half-open [first, second) range into the text buffer. */
    class Fast_Token {
    public:

        /** Member variables. */
        char *first;        // Points to start of token. Named 'first' for std::pair compatibility.
        char *second;       // Points to end of token. Named 'second' for std::pair compatibility.
        bool _punctuation;  // Is the token a punctuation symbol?

        /** Constructors. */
        Fast_Token(char *begin, char *end, bool punctuation) : first(begin), second(end), _punctuation(punctuation) {}
        Fast_Token() : first(NULL), second(NULL), _punctuation(false) {}
        Fast_Token(const Fast_Token &other)
            : first(other.first),
              second(other.second),
              _punctuation(other._punctuation)
        {
        }
        Fast_Token& operator=(const Fast_Token &other)
        {
            first = other.first;
            second = other.second;
            _punctuation = other._punctuation;
            return *this;
        }
    };

    /** Constructors/destructor. */
    Fast_LatinTokenizer();
    explicit Fast_LatinTokenizer(char *text);
    Fast_LatinTokenizer(char *text, size_t length);
    virtual ~Fast_LatinTokenizer();

    /** Constructors, sort of. */
    void SetNewText(char *text);
    void SetNewText(char *text, size_t length);

    /** Are there any more tokens left? */
    bool MoreTokens();

    /** Return next token. */
    Fast_Token GetNextToken();

    /** Return text buffer. */
    char *GetOriginalText();

    /** Observers in case we need to perform some action specific
     * to the IsSeparator or IsPunctuation implementations
     * (such as extra initialization or statistics gathering or...)
     */
    IsPunctuation& GetIsPunctuation() { return _isPunctuation; }
    IsSeparator& GetIsSeparator() { return _isSeparator; }

private:

    /** Member variables. */
    char *_org;                    // Holds the original text buffer.
    char *_next;                   // Points to the current buffer position.
    char *_end;                    // Points to the end of the buffer; NULL means '\0' terminated.
    bool _moreTokens;              // More text to process?
    IsSeparator _isSeparator;      // Separator symbol predicate.
    IsPunctuation _isPunctuation;  // Punctuation symbol predicate.

    /** Helper methods. */
    void SkipBlanks();
};

/**
 *****************************************************************************
 * Default constructor. The tokenizer has no text until SetNewText is called.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer()
    : _org(NULL),
      _next(NULL),
      _end(NULL),
      _moreTokens(false),
      _isSeparator(),
      _isPunctuation()
{
}

/**
 *****************************************************************************
 * Constructor. Accepts a '\0' terminated text buffer.
 *
 * @param text
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer(char *text)
    : _org(NULL),
      _next(NULL),
      _end(NULL),
      _moreTokens(false),
      _isSeparator(),
      _isPunctuation()
{
    SetNewText(text);
}

/**
 *****************************************************************************
 * Constructor. Accepts a text buffer and the buffer length
 *
 * @param text
 * @param length
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer(char *text, size_t length)
    : _org(NULL),
      _next(NULL),
      _end(NULL),
      _moreTokens(false),
      _isSeparator(),
      _isPunctuation()
{
    SetNewText(text, length);
}

/**
 *****************************************************************************
 * Destructor. The text buffer is not owned and is left untouched.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::~Fast_LatinTokenizer()
{
}

/**
 *****************************************************************************
 * Sets a new '\0' terminated string. A NULL text leaves the tokenizer
 * without any tokens.
 *
 * @param text
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SetNewText(char *text)
{
    _org = text;
    _next = text;
    _moreTokens = (text != NULL);
    _end = NULL;  // NULL end marks the '\0' terminated mode.
}

/**
 *****************************************************************************
 * Sets a new string, given the text buffer and its length. A NULL text
 * leaves the tokenizer without any tokens.
 *
 * @param text
 * @param length
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SetNewText(char *text, size_t length)
{
    _org = text;
    _next = text;
    _moreTokens = (text != NULL);
    _end = (_next ? _next + length : NULL);
}

/**
 *****************************************************************************
 * Skips all blanks and flags if there are more tokens.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SkipBlanks()
{
    if (!_moreTokens) return;

    // Initialized with '\0' terminated buffer?
    if (_end == NULL) {
        while (*_next != '\0' && _isSeparator(*_next)) {
            ++_next;
        }
        if (*_next == '\0') {
            _moreTokens = false;
        }
    }
    // Initialized with specified buffer length.
    else {
        while (_next != _end && _isSeparator(*_next)) {
            ++_next;
        }
        if (_next == _end) {
            _moreTokens = false;
        }
    }
}

/**
 *****************************************************************************
 * Returns true if there are more tokens left in the text buffer.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
bool
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::MoreTokens()
{
    SkipBlanks();
    return _moreTokens;
}

/**
 *****************************************************************************
 * Returns the next token as a Fast_Token.
 *
 * Leading separators are skipped and are never part of the token. A single
 * punctuation symbol is returned as a token of its own with _punctuation
 * set. When no text is set or the buffer is exhausted, an empty token
 * (first == second, _punctuation == false) at the current position is
 * returned and the buffer is not read.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
typename Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_Token
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::GetNextToken()
{
    // Skip all blanks and flag if there are no more tokens.
    SkipBlanks();

    // Nothing to tokenize: _next is NULL or sits at the end of the buffer,
    // so it must not be dereferenced (it may be one past a sized buffer).
    if (!_moreTokens) {
        return Fast_Token(_next, _next, false);
    }

    // The token starts after the blanks, not at the pre-skip position.
    char *start = _next;

    // Initialized with '\0' terminated buffer? Find the next blank or punctuation.
    if (_end == NULL) {
        while (*_next != '\0' && !_isSeparator(*_next) && !_isPunctuation(*_next)) {
            ++_next;
        }
    // Initialized with specified buffer length.
    } else {
        while (_next != _end && !_isSeparator(*_next) && !_isPunctuation(*_next)) {
            ++_next;
        }
    }

    // Check if this token is a punctuation symbol, and generate token.
    // No progress means the scan stopped on a punctuation symbol; consume it.
    bool isPunctuation = ((_next == start) && _isPunctuation(*start));
    if (isPunctuation) {
        ++_next;
    }

    return Fast_Token(start, _next, isPunctuation);
}

/**
 *****************************************************************************
 * Returns the original text buffer.
 *
 * @author Bård Kvalheim
 *****************************************************************************/
template <typename IsSeparator, typename IsPunctuation>
char *
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::GetOriginalText()
{
    return _org;
}

/**
 *****************************************************************************
 * Helper class.
 *
 * When using isspace/1, ensure that the argument is cast to unsigned char to
 * avoid problems with sign extension. See system documentation for details.
 *
 * @class Fast_IsSpace
 * @author Bård Kvalheim
 * @date Creation date: 2001-12-07
 *****************************************************************************/
struct Fast_IsSpace {
    bool operator()(char c) { return (isspace(static_cast<unsigned char>(c)) != 0); }
};

/**
 *****************************************************************************
 * Helper class.
 *
 * When using ispunct/1, ensure that the argument is cast to unsigned char to
 * avoid problems with sign extension. See system documentation for details.
 *
 * @class Fast_IsPunctuation
 * @author Bård Kvalheim
 * @date Creation date: 2001-12-07
 *****************************************************************************/
struct Fast_IsPunctuation {
    bool operator()(char c) { return (ispunct(static_cast<unsigned char>(c)) != 0); }
};

/**
 *****************************************************************************
 * A simple tokenizer. See description above.
 *
 * @class Fast_SimpleLatinTokenizer
 * @author Bård Kvalheim
 * @date Creation date: 2001-12-07
 *****************************************************************************/
typedef Fast_LatinTokenizer<Fast_IsSpace, Fast_IsPunctuation> Fast_SimpleLatinTokenizer;