diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/queryproc |
Publish
Diffstat (limited to 'fsa/queryproc')
-rw-r--r-- | fsa/queryproc/.gitignore | 9 | ||||
-rw-r--r-- | fsa/queryproc/count_plain_grams.cpp | 89 | ||||
-rw-r--r-- | fsa/queryproc/count_sorted_grams.cpp | 78 | ||||
-rw-r--r-- | fsa/queryproc/p2s_ratio.cpp | 59 | ||||
-rw-r--r-- | fsa/queryproc/permute_query.cpp | 110 | ||||
-rw-r--r-- | fsa/queryproc/sort_grams.cpp | 29 |
6 files changed, 374 insertions, 0 deletions
diff --git a/fsa/queryproc/.gitignore b/fsa/queryproc/.gitignore
new file mode 100644
index 00000000000..a073ef1dd72
--- /dev/null
+++ b/fsa/queryproc/.gitignore
@@ -0,0 +1,9 @@
+.deps
+.libs
+Makefile
+Makefile.in
+count_plain_grams
+count_sorted_grams
+p2s_ratio
+permute_query
+sort_grams
diff --git a/fsa/queryproc/count_plain_grams.cpp b/fsa/queryproc/count_plain_grams.cpp
new file mode 100644
index 00000000000..197c958149b
--- /dev/null
+++ b/fsa/queryproc/count_plain_grams.cpp
@@ -0,0 +1,89 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+// Returns C(q,2)+...+C(q,mg), the number of ways to pick 2..mg of q terms.
+// Not called by this tool; mg is clamped to q because the unsigned q-i below would wrap.
+unsigned int gram_count(unsigned int mg, unsigned int q)
+{
+  unsigned int i,j,c1,c2,ct=0;
+
+  if(mg>q) mg=q;
+  for(i=2;i<=mg;i++){
+    // C(q,i) = (m+1)*...*q / (q-m)!  where m = max(i,q-i)
+    c1=1;c2=1;
+    for(j=(i>q-i)?(i+1):(q-i+1);j<=q;j++){
+      c1*=j;
+      c2*=(q-j)+1;
+    }
+    ct+=c1/c2;
+  }
+  return ct;
+}
+
+// Reads query lines from stdin; the first token is parsed as the query's count
+// and the rest (presumably, per the NGram::set arguments) are its terms. Every
+// run of 2..MAXGRAM consecutive terms that ends in a final FSA state is credited
+// with that count once per query. Prints "<gram>\t<total>" lines to stdout.
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  Permuter p;   // NOTE(review): unused here; drop once the ctor is confirmed side-effect free
+  NGram freq_s,query;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  Selector s;   // NOTE(review): unused here as well
+  std::string qstr;
+
+  if(argc!=2){
+    std::cerr << "usage: " << argv[0] << " fsa_file" << std::endl;
+    exit(1);
+  }
+
+  FSA fsa(argv[1]);
+  FSA::State state(fsa);
+  std::map<std::string,unsigned int> grams,gq;
+  std::map<std::string,unsigned int>::iterator it;
+
+  // Loop on getline itself: testing eof() before reading runs one extra pass on an empty line.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,1,-1);
+    const unsigned int qlen = query.length();
+    if(qlen<2 || qlen>MAXQUERY)
+      continue;
+    freq_s.set(qstr,tokenizer,0,1);
+    const unsigned int freq = atoi(freq_s[0].c_str());
+    gq.clear();   // grams of this query only, so each is counted once per query
+    for(unsigned int i=0;i<qlen-1;i++){
+      for(unsigned int j=2;j<=MAXGRAM&&i+j<=qlen;j++){
+        state.startWord(query[i]);
+        for(unsigned int k=1;state.isValid()&&k<j;k++){
+          state.deltaWord(query[i+k]);
+        }
+        if(state.isFinal()){
+          gq[query.join(" ",i,j)]=freq;
+        }
+      }
+    }
+    for(it=gq.begin();it!=gq.end();++it)
+      grams[it->first]+=it->second;   // operator[] value-initializes new keys to 0
+  }
+
+  for(it=grams.begin();it!=grams.end();++it)
+    std::cout << it->first << '\t' << it->second << '\n';
+
+  return 0;
+}
diff --git a/fsa/queryproc/count_sorted_grams.cpp b/fsa/queryproc/count_sorted_grams.cpp
new file mode 100644
index 00000000000..58be4548fac
--- /dev/null
+++ b/fsa/queryproc/count_sorted_grams.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+// Reads query lines from stdin (first token parsed as the query's count, the
+// rest presumably its terms), calls sort() and uniq() on the terms, and credits
+// the count to every selection of 2..MAXGRAM of them whose walk through the FSA
+// ends in a final state. Prints "<gram>\t<total>" lines to stdout.
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  Permuter p;   // NOTE(review): only static members are used; drop once the ctor is confirmed side-effect free
+  NGram freq_s,query,gram;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  Selector s;
+  std::string qstr;
+
+  if(argc!=2){
+    std::cerr << "usage: " << argv[0] << " sorted_fsa_file" << std::endl;
+    exit(1);
+  }
+
+  FSA fsa(argv[1]);
+  FSA::State state(fsa);
+  std::map<std::string,unsigned int> grams;
+  std::map<std::string,unsigned int>::iterator grams_it;
+
+  // Loop on getline itself: testing eof() before reading runs one extra pass on an empty line.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,1,-1);
+    unsigned int qlen = query.length();
+    if(qlen<2 || qlen>MAXQUERY)
+      continue;
+    freq_s.set(qstr,tokenizer,0,1);
+    const unsigned int freq = atoi(freq_s[0].c_str());
+    query.sort();
+    qlen = query.uniq();   // uniq()'s return value is the term count used from here on
+    const unsigned int glen = qlen<MAXGRAM?qlen:MAXGRAM;
+    for(unsigned int n=2;n<=glen;n++){
+      // c presumably encodes one n-of-qlen term selection; the walk stops when it becomes 0
+      unsigned int c=Permuter::firstComb(n,qlen);
+      while(c>0){
+        s.clear();
+        s.set(c);
+        gram.set(query,s);
+        // Feed the selected terms to the automaton one word at a time.
+        state.startWord(gram[0]);
+        for(unsigned int i=1;state.isValid()&&i<gram.size();i++){
+          state.deltaWord(gram[i]);
+        }
+        if(state.isFinal()){
+          grams[gram.join(" ")]+=freq;   // operator[] value-initializes new keys to 0
+        }
+        c=Permuter::nextComb(c,qlen);
+      }
+    }
+  }
+
+  // std::map iterates in key order, so the output is sorted by gram string.
+  for(grams_it=grams.begin();grams_it!=grams.end();++grams_it)
+    std::cout << grams_it->first << '\t' << grams_it->second << '\n';
+
+  return 0;
+}
diff --git a/fsa/queryproc/p2s_ratio.cpp b/fsa/queryproc/p2s_ratio.cpp
new file mode 100644
index 00000000000..cbc61c45d53
--- /dev/null
+++ b/fsa/queryproc/p2s_ratio.cpp
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <cstdlib>
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "ngram.h"
+#include "base64.h"
+
+using namespace fsa;
+
+// For each gram of two or more terms read from stdin, walks it as written through
+// the first FSA and, after sort() and uniq(), through the second. When both walks
+// end in a final state, prints "<gram>\t<c1>,<c2>,<c1/c2>", where c1 and c2 are
+// the unsigned ints read from the two states' data().
+int main(int argc, char **argv)
+{
+  // NOTE(review): p, freq_s and s are unused; drop them once their ctors are confirmed side-effect free.
+  Permuter p;
+  NGram freq_s,gram,sorted_gram;
+  Selector s(10);
+  std::string gstr;
+
+  if(argc!=3){
+    std::cerr << "usage: " << argv[0] << " plain_count_fsa_file sorted_count_fsa_file" << std::endl;
+    exit(1);
+  }
+
+  FSA plain_fsa(argv[1]);
+  FSA sorted_fsa(argv[2]);
+  FSA::State state1(plain_fsa),state2(sorted_fsa);
+
+  while(std::getline(std::cin,gstr)){   // not eof()-first: that ran one extra pass on an empty line
+    gram.set(gstr);
+    if(gram.length()>1){
+      sorted_gram.set(gram);
+      sorted_gram.sort();
+      sorted_gram.uniq();
+      state1.startWord(gram[0]);
+      for(unsigned int i=1;state1.isValid()&&i<gram.length();i++){
+        state1.deltaWord(gram[i]);
+      }
+      state2.startWord(sorted_gram[0]);
+      for(unsigned int i=1;state2.isValid()&&i<sorted_gram.length();i++){
+        state2.deltaWord(sorted_gram[i]);
+      }
+      if(state1.isFinal() && state2.isFinal()){
+        const unsigned int c1=*((unsigned int*)state1.data());   // NOTE(review): assumes data() holds an aligned unsigned int
+        const unsigned int c2=*((unsigned int*)state2.data());
+        std::cout << gram << "\t" << c1 << "," << c2 << "," << (double)c1/(double)c2 << std::endl;
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/fsa/queryproc/permute_query.cpp b/fsa/queryproc/permute_query.cpp
new file mode 100644
index 00000000000..7645e864a44
--- /dev/null
+++ b/fsa/queryproc/permute_query.cpp
@@ -0,0 +1,110 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+
+// For g++ older than 3.1, define std::fixed as an empty string so streaming it below compiles.
+#if (__GNUG__ <3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1))
+namespace std {
+const char *fixed = "";
+}
+#endif
+
+using namespace fsa;
+
+// Returns C(q,2)+...+C(q,mg), the number of ways to pick 2..mg of q terms.
+// mg is clamped to q because the unsigned q-i below would wrap for i>q.
+unsigned int gram_count(unsigned int mg, unsigned int q)
+{
+  unsigned int i,j,c1,c2,ct=0;
+
+  if(mg>q) mg=q;
+  for(i=2;i<=mg;i++){
+    // C(q,i) = (m+1)*...*q / (q-m)!  where m = max(i,q-i)
+    c1=1;c2=1;
+    for(j=(i>q-i)?(i+1):(q-i+1);j<=q;j++){
+      c1*=j;
+      c2*=(q-j)+1;
+    }
+    ct+=c1/c2;
+  }
+  return ct;
+}
+
+// Reads queries from stdin and, for each one with 2..MAXQUERY terms, prints the
+// query followed by every selection of 2..MAXGRAM of its terms after sort() and
+// uniq(). Finally writes query-length statistics to stderr.
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  Permuter p;   // NOTE(review): only static members are used; drop once the ctor is confirmed side-effect free
+  NGram query,gram;
+  Selector s;
+  std::string qstr;
+  const bool verbose=true;
+  int stats[MAXQUERY+1] = {0};   // [0]: fewer than 2 terms, [1]: more than MAXQUERY, [n]: n terms
+
+  // Loop on getline itself: an eof()-first loop ran one extra pass on an empty line and counted it in stats[0].
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,1);
+    unsigned int qlen = query.length();
+    if(2<=qlen && qlen<=MAXQUERY){
+      stats[qlen]++;   // counted by the length before uniq()
+      std::cout << "QUERY: " << query << std::endl;
+      query.sort();
+      qlen = query.uniq();
+      const unsigned int glen=qlen<MAXGRAM?qlen:MAXGRAM;
+      for(unsigned int n=2;n<=glen;n++){
+        unsigned int c=Permuter::firstComb(n,qlen);
+        while(c>0){
+          s.clear();
+          s.set(c);
+          gram.set(query,s);
+          std::cout << " " << gram << std::endl;
+          c=Permuter::nextComb(c,qlen);
+        }
+      }
+    }
+    else if(qlen<2)
+      stats[0]++;
+    else
+      stats[1]++;
+  }
+
+  if(verbose){
+    double total=0.0,ctotal=0.0;
+    for(unsigned int i=0;i<=MAXQUERY;i++)
+      total+=stats[i];
+    const double denom = total>0.0 ? total : 1.0;   // no input: report 0 rather than NaN
+    std::cerr << std::fixed << std::setprecision(4) << std::endl;
+    std::cerr << "Statistics:" << std::endl;
+    std::cerr << std::endl;
+    std::cerr << " Empty or single term: " <<
+      std::setw(12) << stats[0] << " " <<
+      std::setw(7) << double(stats[0])*100.0/denom << "%" << std::endl;
+    std::cerr << " Too long: " <<
+      std::setw(12) << stats[1] << " " <<
+      std::setw(7) << double(stats[1])*100.0/denom << "%" << std::endl;
+    for(unsigned int i=2;i<=MAXQUERY;i++){
+      std::cerr << " Length " << std::setw(2) << i << " (grams " << std::setw(3) <<
+        gram_count(i<MAXGRAM?i:MAXGRAM,i) << "): " <<
+        std::setw(12) << stats[i] << " " <<
+        std::setw(7) << double(stats[i])*100.0/denom << "%" << std::endl;
+      ctotal+=stats[i]*gram_count(i<MAXGRAM?i:MAXGRAM,i);
+    }
+    std::cerr << " Total: " <<
+      std::setw(12) << std::setprecision(0) << total << std::endl;
+    std::cerr << std::endl;
+    std::cerr << "Average number of grams per query: " <<
+      std::setprecision(2) << ctotal/denom << std::endl;
+    std::cerr << std::endl;
+  }
+
+  return 0;
+}
diff --git a/fsa/queryproc/sort_grams.cpp b/fsa/queryproc/sort_grams.cpp
new file mode 100644
index 00000000000..427dba129ff
--- /dev/null
+++ b/fsa/queryproc/sort_grams.cpp
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+
+#include "permuter.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+// Filter: tokenizes each stdin line into an NGram, calls sort() and uniq() on it
+// (presumably ordering the terms and dropping duplicates) and prints the result.
+int main(int argc, char **argv)
+{
+  NGram query;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  std::string qstr;
+
+  // Loop on getline itself: an eof()-first loop ran one extra pass and printed a spurious last line.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,0,-1);
+    query.sort();
+    query.uniq();
+    std::cout << query << std::endl;
+  }
+
+  return 0;
+}