// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. /** * @author Peter Boros * @date 2004/08/20 * @version $Id$ * @file ngram.cpp * @brief n-gram class for tokenized text. */ #include "ngram.h" #include "wordchartokenizer.h" #include #include namespace fsa { // {{{ NGram::NGram() NGram::NGram(const char *text, unsigned int from, int length) : _tokens() { append(text,from,length); } NGram::NGram(const char *text, Tokenizer &tokenizer, unsigned int from, int length) : _tokens() { append(text,tokenizer,from,length); } NGram::NGram(const NGram &g, unsigned int from, int length) : _tokens() { append(g,from,length); } NGram::NGram(const NGram &g, const Selector &select) : _tokens() { append(g,select); } NGram::NGram(const NGram &g, const Permuter &p, unsigned int id) : _tokens() { append(g,p,id); } NGram::NGram(const std::string &s, unsigned int from, int length) : _tokens() { append(s,from,length); } NGram::NGram(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) : _tokens() { append(s,tokenizer,from,length); } // }}} // {{{ NGram::set() void NGram::set(const char *text, unsigned int from, int length) { clear(); append(text,from,length); } void NGram::set(const char *text, Tokenizer &tokenizer, unsigned int from, int length) { clear(); append(text,tokenizer,from,length); } void NGram::set(const NGram &g, unsigned int from, int length) { if(this==&g){ set(NGram(g),from,length); } else{ clear(); append(g,from,length); } } void NGram::set(const NGram &g, const Selector &select) { if(this==&g){ set(NGram(g),select); } else{ clear(); append(g,select); } } void NGram::set(const NGram &g, const Permuter &p, unsigned int id) { if(this==&g){ set(NGram(g),p,id); } else{ clear(); append(g,p,id); } } void NGram::set(const std::string &s, unsigned int from, int length) { clear(); append(s,from,length); } void NGram::set(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) { clear(); append(s,tokenizer,from,length); } // }}} // {{{ NGram::setOne() void NGram::setOne(const std::string &s) { clear(); appendOne(s); } // }}} // {{{ NGram::append() void NGram::append(const char *text, unsigned int from, int length) { WordCharTokenizer tokenizer; append(text,tokenizer,from,length); } void NGram::append(const char *text, Tokenizer &tokenizer, unsigned int from, int length) { append(std::string(text),tokenizer,from,length); } void NGram::append(const NGram &g, unsigned int from, int length) { if(this==&g){ append(NGram(g),from,length); return; } if(length<0 || from+length>g._tokens.size()) length=g._tokens.size()-from; if(length>0){ for(unsigned int i=from; i0 && perm[i]<=(int)g._tokens.size()){ _tokens.push_back(g._tokens[perm[i]-1]); } } } void NGram::append(const std::string &s, unsigned int from, int length) { WordCharTokenizer tokenizer; append(s,tokenizer,from,length); } void NGram::append(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) { tokenizer.init(s); unsigned int i=0; while(i::iterator pos; pos = std::unique(_tokens.begin(),_tokens.end()); _tokens.erase(pos,_tokens.end()); return _tokens.size(); } // }}} // {{{ NGram::join() std::string NGram::join(const std::string &separator, unsigned int from, int length) const { unsigned int to = _tokens.size(); if(length!=-1 && from+lengthfrom) dest=_tokens[from]; for(unsigned i=from+1;i0) out<<" "; out<