summaryrefslogtreecommitdiffstats
path: root/fsa/queryproc/count_sorted_grams.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'fsa/queryproc/count_sorted_grams.cpp')
-rw-r--r--fsa/queryproc/count_sorted_grams.cpp78
1 files changed, 78 insertions, 0 deletions
diff --git a/fsa/queryproc/count_sorted_grams.cpp b/fsa/queryproc/count_sorted_grams.cpp
new file mode 100644
index 00000000000..58be4548fac
--- /dev/null
+++ b/fsa/queryproc/count_sorted_grams.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <iostream>
+#include <iomanip>
+#include <map>
+#include <string>
+
+#include "fsa.h"
+#include "permuter.h"
+#include "selector.h"
+#include "ngram.h"
+#include "base64.h"
+#include "wordchartokenizer.h"
+
+using namespace fsa;
+
+int main(int argc, char **argv)
+{
+ const unsigned int MAXQUERY = 10;
+ const unsigned int MAXGRAM = 6;
+
+ Permuter p;
+ NGram freq_s,query,gram;
+ WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_WHITESPACEONLY);
+ unsigned int freq;
+ Selector s;
+ std::string qstr;
+ unsigned int qlen,glen;
+
+ if(argc!=2){
+ std::cerr << "usage: " << argv[0] << " sorted_fsa_file" << std::endl;
+ exit(1);
+ }
+
+ FSA fsa(argv[1]);
+ FSA::State state(fsa);
+ std::map<std::string,unsigned int> grams;
+ std::map<std::string,unsigned int>::iterator grams_it;
+ std::string gram_str;
+
+ while(!std::cin.eof()){
+ getline(std::cin,qstr);
+ query.set(qstr,tokenizer,1,-1);
+ qlen = query.length();
+ if(2<=qlen && qlen<=MAXQUERY){
+ freq_s.set(qstr,tokenizer,0,1);
+ freq=atoi(freq_s[0].c_str());
+ query.sort();
+ qlen = query.uniq();
+ unsigned int glen=qlen<MAXGRAM?qlen:MAXGRAM;
+ for(unsigned int n=2;n<=glen;n++){
+ unsigned int c=Permuter::firstComb(n,qlen);
+ while(c>0){
+ s.clear();
+ s.set(c);
+ gram.set(query,s);
+ state.startWord(gram[0]);
+ for(unsigned int i=1;state.isValid()&&i<gram.size();i++){
+ state.deltaWord(gram[i]);
+ }
+ if(state.isFinal()){
+ gram_str = gram.join(" ");
+ grams_it=grams.find(gram_str);
+ if(grams_it!=grams.end())
+ grams[gram_str]=grams_it->second+freq;
+ else
+ grams[gram_str]=freq;
+ }
+ c=Permuter::nextComb(c,qlen);
+ }
+ }
+ }
+ }
+
+ for(grams_it=grams.begin();grams_it!=grams.end();++grams_it)
+ std::cout << grams_it->first << '\t' << grams_it->second << std::endl;
+
+ return 0;
+}