aboutsummaryrefslogtreecommitdiffstats
path: root/fsa/queryproc
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/queryproc
Publish
Diffstat (limited to 'fsa/queryproc')
-rw-r--r-- fsa/queryproc/.gitignore 9
-rw-r--r-- fsa/queryproc/count_plain_grams.cpp 89
-rw-r--r-- fsa/queryproc/count_sorted_grams.cpp 78
-rw-r--r-- fsa/queryproc/p2s_ratio.cpp 59
-rw-r--r-- fsa/queryproc/permute_query.cpp 110
-rw-r--r-- fsa/queryproc/sort_grams.cpp 29
6 files changed, 374 insertions, 0 deletions
diff --git a/fsa/queryproc/.gitignore b/fsa/queryproc/.gitignore
new file mode 100644
index 00000000000..a073ef1dd72
--- /dev/null
+++ b/fsa/queryproc/.gitignore
@@ -0,0 +1,9 @@
+.deps
+.libs
+Makefile
+Makefile.in
+count_plain_grams
+count_sorted_grams
+p2s_ratio
+permute_query
+sort_grams
diff --git a/fsa/queryproc/count_plain_grams.cpp b/fsa/queryproc/count_plain_grams.cpp
new file mode 100644
index 00000000000..197c958149b
--- /dev/null
+++ b/fsa/queryproc/count_plain_grams.cpp
@@ -0,0 +1,89 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+/**
+ * Number of term combinations of size 2..mg that can be selected from a
+ * query of q terms, i.e. the sum of the binomial coefficients C(q,i) for
+ * i = 2..min(mg,q).
+ *
+ * @param mg largest combination size to count; sizes above q contribute nothing.
+ * @param q  number of terms in the query.
+ * @return   the total number of combinations; 0 if mg<2 or q<2.
+ */
+unsigned int gram_count(unsigned int mg, unsigned int q)
+{
+  // A combination cannot hold more terms than the query has. Clamping also
+  // keeps the unsigned expression q-i+1 below from wrapping around.
+  const unsigned int limit = (mg<q) ? mg : q;
+  unsigned long long binom = q;   // C(q,1)
+  unsigned int total = 0;
+
+  for(unsigned int i=2;i<=limit;i++){
+    // C(q,i) = C(q,i-1)*(q-i+1)/i. The product is always divisible by i, so
+    // the division is exact and no factorial-sized intermediates are needed.
+    binom = binom*(q-i+1)/i;
+    total += static_cast<unsigned int>(binom);
+  }
+  return total;
+}
+
+/**
+ * Sums, per gram, the frequencies of the input queries that contain it.
+ *
+ * Usage: count_plain_grams fsa_file
+ *
+ * Reads one query per line from stdin. The line is tokenized twice with the
+ * same tokenizer: once for the query terms and once for the frequency field
+ * (presumably token 0 is the frequency and tokens 1.. are the terms --
+ * TODO confirm against NGram::set). For queries of 2..MAXQUERY terms, every
+ * run of 2..MAXGRAM consecutive terms that the FSA accepts as a final state
+ * is credited with the query frequency, at most once per query. The totals
+ * are written to stdout as "gram<TAB>count", in std::map key order.
+ *
+ * @return 0 on success, 1 on wrong usage.
+ */
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  if(argc!=2){
+    std::cerr << "usage: " << argv[0] << " fsa_file" << std::endl;
+    return 1;
+  }
+
+  typedef std::map<std::string,unsigned int> GramCounts;
+
+  NGram freq_s,query;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  FSA fsa(argv[1]);
+  FSA::State state(fsa);
+  GramCounts grams;   // gram -> frequency summed over all queries
+  GramCounts gq;      // grams of the current query; the map dedups repeats
+  std::string qstr;
+
+  // Testing the getline result ends the loop on EOF and on stream errors, and
+  // never processes a line that was not actually read.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,1,-1);
+    const unsigned int qlen = query.length();
+    if(qlen<2 || qlen>MAXQUERY)
+      continue;
+
+    freq_s.set(qstr,tokenizer,0,1);
+    const unsigned int freq = atoi(freq_s[0].c_str());
+
+    gq.clear();
+    for(unsigned int i=0;i<qlen-1;i++){
+      for(unsigned int j=2;j<=MAXGRAM&&i+j<=qlen;j++){
+        // Walk the FSA over the j terms starting at position i.
+        state.startWord(query[i]);
+        for(unsigned int k=1;state.isValid()&&k<j;k++){
+          state.deltaWord(query[i+k]);
+        }
+        if(state.isFinal()){
+          gq[query.join(" ",i,j)]=freq;
+        }
+      }
+    }
+
+    // operator[] value-initializes a missing entry to 0, so += handles both
+    // new and already known grams.
+    for(GramCounts::const_iterator it=gq.begin();it!=gq.end();++it)
+      grams[it->first]+=it->second;
+  }
+
+  for(GramCounts::const_iterator it=grams.begin();it!=grams.end();++it)
+    std::cout << it->first << '\t' << it->second << std::endl;
+
+  return 0;
+}
diff --git a/fsa/queryproc/count_sorted_grams.cpp b/fsa/queryproc/count_sorted_grams.cpp
new file mode 100644
index 00000000000..58be4548fac
--- /dev/null
+++ b/fsa/queryproc/count_sorted_grams.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+/**
+ * Sums, per sorted gram, the frequencies of the input queries that contain it.
+ *
+ * Usage: count_sorted_grams sorted_fsa_file
+ *
+ * Reads one query per line from stdin. The line is tokenized twice with the
+ * same tokenizer: once for the query terms and once for the frequency field
+ * (presumably token 0 is the frequency and tokens 1.. are the terms --
+ * TODO confirm against NGram::set). Queries of 2..MAXQUERY terms are sorted
+ * and de-duplicated; every combination of 2..MAXGRAM of the remaining terms
+ * that the FSA accepts as a final state is credited with the query frequency.
+ * The totals are written to stdout as "gram<TAB>count", in std::map key order.
+ *
+ * @return 0 on success, 1 on wrong usage.
+ */
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  if(argc!=2){
+    std::cerr << "usage: " << argv[0] << " sorted_fsa_file" << std::endl;
+    return 1;
+  }
+
+  typedef std::map<std::string,unsigned int> GramCounts;
+
+  NGram freq_s,query,gram;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  Selector s;
+  FSA fsa(argv[1]);
+  FSA::State state(fsa);
+  GramCounts grams;
+  std::string qstr;
+
+  // Testing the getline result ends the loop on EOF and on stream errors, and
+  // never processes a line that was not actually read.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,1,-1);
+    unsigned int qlen = query.length();
+    if(qlen<2 || qlen>MAXQUERY)
+      continue;
+
+    freq_s.set(qstr,tokenizer,0,1);
+    const unsigned int freq = atoi(freq_s[0].c_str());
+
+    query.sort();
+    qlen = query.uniq();   // from here on qlen is the length after uniq()
+    const unsigned int glen = qlen<MAXGRAM?qlen:MAXGRAM;
+
+    for(unsigned int n=2;n<=glen;n++){
+      // The loop ends when the combination value becomes 0, as in the
+      // original while(c>0) form.
+      for(unsigned int c=Permuter::firstComb(n,qlen);c>0;c=Permuter::nextComb(c,qlen)){
+        s.clear();
+        s.set(c);
+        gram.set(query,s);
+        state.startWord(gram[0]);
+        for(unsigned int i=1;state.isValid()&&i<gram.size();i++){
+          state.deltaWord(gram[i]);
+        }
+        if(state.isFinal()){
+          const std::string gram_str = gram.join(" ");
+          // operator[] value-initializes a missing entry to 0.
+          grams[gram_str]+=freq;
+        }
+      }
+    }
+  }
+
+  for(GramCounts::const_iterator it=grams.begin();it!=grams.end();++it)
+    std::cout << it->first << '\t' << it->second << std::endl;
+
+  return 0;
+}
diff --git a/fsa/queryproc/p2s_ratio.cpp b/fsa/queryproc/p2s_ratio.cpp
new file mode 100644
index 00000000000..cbc61c45d53
--- /dev/null
+++ b/fsa/queryproc/p2s_ratio.cpp
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "ngram.h"
+#include "base64.h"
+
+using namespace fsa;
+
+/**
+ * Prints the plain count, the sorted count and their ratio for each gram.
+ *
+ * Usage: p2s_ratio plain_count_fsa_file sorted_count_fsa_file
+ *
+ * Reads one gram per line from stdin. Grams of more than one term are looked
+ * up as given in the first FSA, and in sorted, de-duplicated form in the
+ * second FSA. When both lookups end in a final state, an unsigned int is read
+ * from each state's data and the line "gram<TAB>c1,c2,c1/c2" is written to
+ * stdout.
+ *
+ * @return 0 on success, 1 on wrong usage.
+ */
+int main(int argc, char **argv)
+{
+  if(argc!=3){
+    std::cerr << "usage: " << argv[0] << " plain_count_fsa_file sorted_count_fsa_file" << std::endl;
+    return 1;
+  }
+
+  NGram gram,sorted_gram;
+  std::string gstr;
+
+  FSA plain_fsa(argv[1]);
+  FSA sorted_fsa(argv[2]);
+  FSA::State state1(plain_fsa),state2(sorted_fsa);
+
+  // Testing the getline result ends the loop on EOF and on stream errors, and
+  // never processes a line that was not actually read.
+  while(std::getline(std::cin,gstr)){
+    gram.set(gstr);
+    if(gram.length()<=1)
+      continue;
+
+    sorted_gram.set(gram);
+    sorted_gram.sort();
+    sorted_gram.uniq();
+
+    state1.startWord(gram[0]);
+    for(unsigned int i=1;state1.isValid()&&i<gram.length();i++){
+      state1.deltaWord(gram[i]);
+    }
+    state2.startWord(sorted_gram[0]);
+    for(unsigned int i=1;state2.isValid()&&i<sorted_gram.length();i++){
+      state2.deltaWord(sorted_gram[i]);
+    }
+
+    if(state1.isFinal() && state2.isFinal()){
+      // NOTE(review): assumes data() points at a readable, suitably aligned
+      // unsigned int holding the count -- confirm against the FSA data format.
+      const unsigned int c1=*reinterpret_cast<const unsigned int*>(state1.data());
+      const unsigned int c2=*reinterpret_cast<const unsigned int*>(state2.data());
+      // NOTE(review): c2 == 0 is not guarded; the ratio is then a floating
+      // point division by zero, exactly as before this change.
+      std::cout << gram << "\t" << c1 << "," << c2 << "," << (double)c1/(double)c2 << std::endl;
+    }
+  }
+
+  return 0;
+}
diff --git a/fsa/queryproc/permute_query.cpp b/fsa/queryproc/permute_query.cpp
new file mode 100644
index 00000000000..7645e864a44
--- /dev/null
+++ b/fsa/queryproc/permute_query.cpp
@@ -0,0 +1,110 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+
+// Compatibility shim for g++ older than 3.1: defines std::fixed as an empty
+// C string so that streaming std::fixed to std::cerr in main() still compiles
+// there (presumably those library versions lacked the manipulator -- TODO
+// confirm). NOTE(review): adding names to namespace std is undefined behavior
+// in standard C++; this is only compiled for the old compilers selected below.
+#if (__GNUG__ <3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1))
+namespace std {
+const char *fixed = "";
+}
+#endif
+
+using namespace fsa;
+
+/**
+ * Number of term combinations of size 2..mg that can be selected from a
+ * query of q terms, i.e. the sum of the binomial coefficients C(q,i) for
+ * i = 2..min(mg,q).
+ *
+ * @param mg largest combination size to count; sizes above q contribute nothing.
+ * @param q  number of terms in the query.
+ * @return   the total number of combinations; 0 if mg<2 or q<2.
+ */
+unsigned int gram_count(unsigned int mg, unsigned int q)
+{
+  // A combination cannot hold more terms than the query has. Clamping also
+  // keeps the unsigned expression q-i+1 below from wrapping around.
+  const unsigned int limit = (mg<q) ? mg : q;
+  unsigned long long binom = q;   // C(q,1)
+  unsigned int total = 0;
+
+  for(unsigned int i=2;i<=limit;i++){
+    // C(q,i) = C(q,i-1)*(q-i+1)/i. The product is always divisible by i, so
+    // the division is exact and no factorial-sized intermediates are needed.
+    binom = binom*(q-i+1)/i;
+    total += static_cast<unsigned int>(binom);
+  }
+  return total;
+}
+
+/** Share of count in total as a percentage; 0 when nothing was counted. */
+static double percent_of(int count, double total)
+{
+  return total>0.0 ? double(count)*100.0/total : 0.0;
+}
+
+/**
+ * Prints every term combination generated for each query read from stdin.
+ *
+ * Reads one query per line. Each query of 2..MAXQUERY terms is echoed to
+ * stdout as "QUERY: ...", then sorted and de-duplicated, and every
+ * combination of 2..MAXGRAM of the remaining terms is printed on its own
+ * line. Query-length statistics are written to stderr at the end: stats[0]
+ * counts queries with fewer than two terms, stats[1] counts queries longer
+ * than MAXQUERY, and stats[n] counts queries of n terms.
+ *
+ * @return always 0.
+ */
+int main(int argc, char **argv)
+{
+  const unsigned int MAXQUERY = 10;
+  const unsigned int MAXGRAM = 6;
+
+  NGram query,gram;
+  Selector s;
+  std::string qstr;
+  const bool verbose=true;
+  int stats[MAXQUERY+1];
+
+  for(unsigned int i=0;i<=MAXQUERY;i++)
+    stats[i]=0;
+
+  // Testing the getline result keeps the failed read at end of input from
+  // being counted as an extra empty query, and also stops on stream errors.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,1);
+    unsigned int qlen = query.length();
+    if(qlen<2){
+      stats[0]++;
+      continue;
+    }
+    if(qlen>MAXQUERY){
+      stats[1]++;
+      continue;
+    }
+
+    stats[qlen]++;   // counted by the length before de-duplication
+    std::cout << "QUERY: " << query << std::endl;
+    query.sort();
+    qlen = query.uniq();
+    const unsigned int glen=qlen<MAXGRAM?qlen:MAXGRAM;
+    for(unsigned int n=2;n<=glen;n++){
+      // The loop ends when the combination value becomes 0, as in the
+      // original while(c>0) form.
+      for(unsigned int c=Permuter::firstComb(n,qlen);c>0;c=Permuter::nextComb(c,qlen)){
+        s.clear();
+        s.set(c);
+        gram.set(query,s);
+        std::cout << " " << gram << std::endl;
+      }
+    }
+  }
+
+  if(verbose){
+    double total=0.0;
+    double ctotal=0.0;
+    for(unsigned int i=0;i<=MAXQUERY;i++)
+      total+=stats[i];
+
+    std::cerr << std::fixed << std::setprecision(4) << std::endl;
+    std::cerr << "Statistics:" << std::endl;
+    std::cerr << std::endl;
+    std::cerr << " Empty or single term: " <<
+      std::setw(12) << stats[0] << " " <<
+      std::setw(7) << percent_of(stats[0],total) << "%" << std::endl;
+    std::cerr << " Too long: " <<
+      std::setw(12) << stats[1] << " " <<
+      std::setw(7) << percent_of(stats[1],total) << "%" << std::endl;
+    for(unsigned int i=2;i<=MAXQUERY;i++){
+      const unsigned int ngrams = gram_count(i<MAXGRAM?i:MAXGRAM,i);
+      std::cerr << " Length " << std::setw(2) << i << " (grams " << std::setw(3) <<
+        ngrams << "): " <<
+        std::setw(12) << stats[i] << " " <<
+        std::setw(7) << percent_of(stats[i],total) << "%" << std::endl;
+      ctotal+=double(stats[i])*ngrams;
+    }
+    std::cerr << " Total: " <<
+      std::setw(12) << std::setprecision(0) << total << std::endl;
+    std::cerr << std::endl;
+    // With no input at all there is no meaningful average; report 0 rather
+    // than dividing by zero.
+    std::cerr << "Average number of grams per query: " <<
+      std::setprecision(2) << (total>0.0 ? ctotal/total : 0.0) << std::endl;
+    std::cerr << std::endl;
+  }
+
+  return 0;
+}
diff --git a/fsa/queryproc/sort_grams.cpp b/fsa/queryproc/sort_grams.cpp
new file mode 100644
index 00000000000..427dba129ff
--- /dev/null
+++ b/fsa/queryproc/sort_grams.cpp
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+
+#include "permuter.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+/**
+ * Rewrites each input line as a sorted, de-duplicated gram.
+ *
+ * Reads lines from stdin, tokenizes each one, applies sort() and uniq() to
+ * the resulting NGram and writes it to stdout, one output line per input
+ * line.
+ *
+ * @return always 0.
+ */
+int main(int argc, char **argv)
+{
+  NGram query;
+  WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+  std::string qstr;
+
+  // Testing the getline result keeps the failed read at end of input from
+  // producing a spurious empty output line, and also stops on stream errors.
+  while(std::getline(std::cin,qstr)){
+    query.set(qstr,tokenizer,0,-1);
+    query.sort();
+    query.uniq();
+    std::cout << query << std::endl;
+  }
+
+  return 0;
+}