From 72231250ed81e10d66bfe70701e64fa5fe50f712 Mon Sep 17 00:00:00 2001
From: Jon Bratseth
Date: Wed, 15 Jun 2016 23:09:44 +0200
Subject: Publish

---
 fsa/src/util/cn_txt2xml | 625 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 625 insertions(+)
 create mode 100755 fsa/src/util/cn_txt2xml

diff --git a/fsa/src/util/cn_txt2xml b/fsa/src/util/cn_txt2xml
new file mode 100755
index 00000000000..9c439879af9
--- /dev/null
+++ b/fsa/src/util/cn_txt2xml
@@ -0,0 +1,625 @@
+#!/usr/bin/perl
+
+use strict;
+
+use FSA;
+use BerkeleyDB;
+use Getopt::Long;
+use Pod::Usage;
+
+#
+# Process command line options.
+#
+
+my $do_qfreq = 1;
+my $do_sqfreq = 1;
+my $do_ext = 1;
+my $do_assoc = 1;
+my $do_cat = 1;
+my $do_fsa = 1;
+my $help = 0;
+my $man = 0;
+my $verbose = 0;
+my $stopwords_file = '';
+my $output_file = '';
+
+my $result = GetOptions('qfreq|q!' => \$do_qfreq,
+                        'sqfreq|s!' => \$do_sqfreq,
+                        'ext|e!' => \$do_ext,
+                        'assoc|a!' => \$do_assoc,
+                        'cat|c!' => \$do_cat,
+                        'fsa|f!' => \$do_fsa,
+                        'help|h' => \$help,
+                        'man|m' => \$man,
+                        'verbose|v' => \$verbose,
+                        'stopwords|w:s' => \$stopwords_file,
+                        'output-file|o=s' => \$output_file,
+                        );
+
+pod2usage(1) if $help;
+pod2usage(-verbose => 2) if $man;
+
+#
+# Domain is a required parameter.
+#
+
+my $domain = shift || die "need domain";
+
+#
+# Some constants for setting limits etc.
+#
+
+my $MAX_UNIT_LENGTH = 8;
+my $MAX_QUERY_LENGTH = 10;
+
+#
+# Declare arrays to store concept net data.
+#
+
+my @unit = ();
+my @unit_f = ();
+my @unit_qf = ();
+my @unit_qfc = ();
+my @unit_qfs = ();
+my @ext = ();
+my @assoc = ();
+my @cats = ();
+my @ucats = ();
+my @st_map = ();
+
+#
+# Some other global variables
+#
+
+my %stopwords = ();
+my %stopMap = ();
+
+my ($total,$count);
+
+my ($fsa,$sfsa);
+
+#***********************************************************
+#
+# Functions
+#
+#***********************************************************
+
+
+# Print a printf-style message to STDERR when --verbose is on.
+sub msg($@){
+    my $format = shift;
+    if($verbose){
+        printf STDERR $format,@_;
+    }
+}
+
+# Display a progress line every 1000 items; $done marks the final update.
+sub progress($$$$){
+    my ($msg,$cnt,$total,$done) = @_;
+
+    if($done){
+        if($total>0){
+            msg("\r%s ... %d of %d (%.2f%%) ... done.\n",$msg,$cnt,$total,100.0*$cnt/$total);
+        }
+        else {
+            msg("\r%s ... %d ... done.\n",$msg,$cnt);
+        }
+    }
+    elsif($cnt%1000==0){
+        if($total>0){
+            msg("\r%s ... %d of %d (%.2f%%)",$msg,$cnt,$total,100.0*$cnt/$total);
+        }
+        else {
+            msg("\r%s ... %d",$msg,$cnt);
+        }
+    }
+}
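+
+# Input files (formats inferred from the parsing code below; all
+# fields are tab-separated, with the frequency first):
+#   ${domain}_unit.txt    frequency, unit
+#   ${domain}_ext.txt     frequency, unit, extension
+#   ${domain}_assoc.txt   frequency, unit, associated unit
+#   complete.txt.gz       frequency, raw query (gzip-compressed)
+#   uCat.db               BerkeleyDB btree mapping a category name to
+#                         tab-separated "term,frequency" pairs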
+
+# Look up a unit in the FSA; returns the unit's perfect-hash index and
+# its stored numerical data (the frequency), or (-1,0) if not found.
+sub lookup($$){
+    my $fsa = shift;
+    my $u = shift;
+    my $st = FSA::State->new($fsa);
+
+    $st->start();
+    $st->delta($u);
+    if($st->isFinal()){
+        return ($st->hash(),$st->nData());
+    }
+    else {
+        return (-1,0);
+    }
+}
+
+# Sum frequencies for duplicate indices in an (index,frequency) pair
+# list and return the pairs sorted by decreasing frequency.
+sub aggregate(\@){
+    my $aref = shift;
+    my %hash = ();
+    my $i;
+    for($i=0;$i<$#{$aref}+1;$i+=2){
+        $hash{$$aref[$i]} += $$aref[$i+1];
+    }
+    my @res;
+    foreach $i (sort {$hash{$b} <=> $hash{$a}} keys %hash){
+        push(@res,$i,$hash{$i});
+    }
+    return @res;
+}
+
+# First bitmask with $n bits set, for enumerating $n-of-$m subsets.
+sub firstComb($$){
+    my $n = shift;
+    my $m = shift;
+
+    if($n==0 || $n>31 || $m==0 || $m>31 || $n>$m){
+        return 0;
+    }
+
+    return (1<<$n)-1;
+}
+
+# Next bitmask with the same number of bits set, or 0 when exhausted.
+sub nextComb($$){
+    my $c = shift;
+    my $m = shift;
+
+    if($c==0 || $m==0 || $m>31){
+        return 0;
+    }
+
+    my $x = $c;
+    my $limit = 1<<$m;
+    my ($mask,$mask1,$mask2);
+
+    if($x&1){
+        $mask=2;
+        while($x&$mask){
+            $mask<<=1;
+        }
+        $x^=($mask+($mask>>1));
+    }
+    else{
+        $mask=2;
+        while(!($x&$mask)){
+            $mask<<=1;
+        }
+        $mask1=$mask2=0;
+        while($x&$mask){
+            $mask1<<=1;
+            $mask1++;
+            $mask2+=$mask;
+            $mask<<=1;
+        }
+        $mask1>>=1;
+        $x^=($mask+($mask1^$mask2));
+    }
+
+    return ($x<$limit)?$x:0;
+}
+
+# Select the elements of @$aref whose bit is set in bitmask $c.
+sub selectComb($\@){
+    my $c = shift;
+    my $aref = shift;
+
+    my @res;
+    my $i = 0;
+    while($c>0 && $i<=$#$aref){
+        if($c&1){
+            push(@res,$$aref[$i]);
+        }
+        $c>>=1;
+        $i++;
+    }
+    return @res;
+}
+
+# Sort the words of a unit alphabetically and drop duplicates.
+sub sortGrams($){
+    my $in = shift;
+    my @grams = split(/\s+/,$in);
+
+    if($#grams<1){
+        return $in;
+    }
+
+    my @sorted_grams = sort(@grams);
+    my $i=1;
+    while($i<=$#sorted_grams){
+        if($sorted_grams[$i] eq $sorted_grams[$i-1]){
+            splice(@sorted_grams,$i,1);
+        }
+        else{
+            $i++;
+        }
+    }
+    return join(" ",@sorted_grams);
+}
+
+# Strip leading and trailing stopwords from a unit; results are cached
+# in %stopMap.
+sub cleanStop($){
+    my $unit = shift;
+    if($stopwords_file ne ''){
+        if(!defined($stopMap{$unit})){
+            my @words = split(/\s+/,$unit);
+            while ((@words) && ($stopwords{$words[0]})) {
+                shift(@words);
+            }
+            while ((@words) && ($stopwords{$words[$#words]})) {
+                pop(@words);
+            }
+            $stopMap{$unit} = join(' ', @words);
+        }
+        return $stopMap{$unit};
+    }
+    return($unit);
+}
+
+
+#***********************************************************
+#
+# Main program.
+#
+#***********************************************************
+
+
+#
+# Configure stopwords list
+#
+
+if($stopwords_file ne ''){
+    msg("configuring stopwords ... ");
+    open(STOPFILE, $stopwords_file) or die "error opening stopwords file '$stopwords_file': $!\n\t";
+    while(<STOPFILE>){
+        chomp;
+        $stopwords{$_}=1;
+    }
+    close(STOPFILE);
+    msg("done.\n");
+}
+
+#
+# Build plain FSA with perfect hash and frequencies,
+# and compact FSA with perfect hash only.
+#
+if($do_fsa){
+    msg("building plain fsa ... ");
+    my %units_t = ();
+    open(U,"${domain}_unit.txt");
+    while(<U>){
+        chomp;
+        my ($f,$u) = split(/\t/);
+        my $uns = cleanStop($u);
+        if($uns ne ""){
+            $units_t{$uns}+=$f;
+        }
+    }
+    close(U);
+    open(F1,"| makefsa -vnp ${domain}.plain.fsa");
+    open(F2,"| makefsa -ep ${domain}.fsa");
+    foreach my $u (sort keys %units_t){
+        print F1 "$u\t$units_t{$u}\n";
+        print F2 "$u\n";
+    }
+    close(F1);
+    close(F2);
+    %units_t = ();
+    msg("done.\n");
+}
+
+#
+# Open plain FSA.
+#
+
+$fsa = FSA->new("${domain}.plain.fsa");
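+
+# The firstComb/nextComb/selectComb helpers defined earlier drive the
+# any-order ("sorted") query matching below: they enumerate every
+# n-word subset of an m-word query as a bitmask, e.g. firstComb(2,3)
+# yields 011, and nextComb then steps 011 -> 101 -> 110 -> 0
+# (exhausted); selectComb picks the words whose bits are set.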
+
+#
+# Read units.
+#
+
+$total = 0 + `wc -l ${domain}_unit.txt`;
+$count = 0;
+open(U,"${domain}_unit.txt");
+while(<U>){
+    $count++; progress("reading units",$count,$total,0);
+    chomp;
+    my ($f,$u) = split(/\t/);
+    my $uns = cleanStop($u);
+    if($uns ne ""){
+        my ($idx,$frq) = lookup($fsa,$uns);
+        if($idx>=0){
+            $unit[$idx] = $uns;
+            $unit_f[$idx] = $frq;
+        }
+    }
+}
+close(U);
+progress("reading units",$count,$total,1);
+
+
+#
+# Build term-sorted FSA for counting query frequencies.
+#
+
+if($do_qfreq || $do_sqfreq){
+    msg("building fsa for query frequencies ... ");
+    my %units_st = ();
+    for(my $i=0;$i<=$#unit;$i++){
+        my $uns = sortGrams($unit[$i]);
+        if(defined($units_st{$uns})){
+            $units_st{$uns}.=",$i";
+        }
+        else{
+            $units_st{$uns}="$i";
+        }
+    }
+    open(F,"| makefsa -vep ${domain}.sorted.fsa");
+    my $i=0;
+    foreach my $u (sort keys %units_st){
+        $st_map[$i]=$units_st{$u};
+        print F "$u\n";
+        $i++;
+    }
+    close(F);
+    %units_st = ();
+    msg("done.\n");
+
+    #
+    # Open term-sorted FSA.
+    #
+
+    $sfsa = FSA->new("${domain}.sorted.fsa");
+
+    #
+    # Read complete query file for query frequencies.
+    #
+
+    $total = 0 + `zcat complete.txt.gz | wc -l`;
+    $count = 0;
+    open(C,"zcat complete.txt.gz|") or die "ERROR opening pipe: \"zcat complete.txt.gz|\"\n";
+    while(<C>){
+        $count++; progress("processing raw query file for query frequencies",$count,$total,0);
+        chomp;
+        my ($frq,$query) = split(/\t/);
+
+        #
+        # Complete query match.
+        #
+        my ($idx,$f) = lookup($fsa,$query);
+        if($idx>=0){
+            $unit_qfc[$idx] += $frq;
+        }
+
+        #
+        # Partial query match; each unit is counted at most once per query.
+        #
+        my @qgrams = split(/\s+/,$query);
+        my $st = FSA::State->new($fsa);
+        my %frq_add = ();
+        for(my $i=0;$i<=$#qgrams;$i++){
+            $st->start();
+            $st->delta($qgrams[$i]);
+            if($st->isFinal()){
+                $frq_add{$st->hash()} = 1;
+            }
+            for(my $j=$i+1;$st->isValid()&&$j<=$#qgrams;$j++){
+                $st->delta(" ");
+                $st->delta($qgrams[$j]);
+                if($st->isFinal()){
+                    $frq_add{$st->hash()} = 1;
+                }
+            }
+        }
+        foreach my $a (keys %frq_add){
+            $unit_qf[$a] += $frq;
+        }
+
+        if($do_sqfreq){
+            #
+            # Partial query match in any order.
+            #
+            my $squery = sortGrams($query);
+            my @sqgrams = split(/\s+/,$squery);
+            my $sst = FSA::State->new($sfsa);
+            %frq_add = ();
+            my $qlen=$#sqgrams+1;
+            if($qlen>$MAX_QUERY_LENGTH){
+                $qlen=$MAX_QUERY_LENGTH;
+            }
+            for(my $i=1;$i<=$qlen && $i<=$MAX_UNIT_LENGTH; $i++){
+                for(my $c=firstComb($i,$qlen);$c>0;$c=nextComb($c,$qlen)){
+                    $sst->start();
+                    my $tmp=join(" ",selectComb($c,@sqgrams));
+                    $sst->delta($tmp);
+                    if($sst->isFinal()){
+                        my @to_add = split(/,/,$st_map[$sst->hash()]);
+                        foreach my $a (@to_add){
+                            $frq_add{$a} = 1;
+                        }
+                    }
+                }
+            }
+            foreach my $a (keys %frq_add){
+                $unit_qfs[$a] += $frq;
+            }
+        }
+    }
+    close(C);
+    progress("processing raw query file for query frequencies",$count,$total,1);
+}
+
+#
+# Read extensions.
+#
+if($do_ext){
+    $total = 0 + `wc -l ${domain}_ext.txt`;
+    $count = 0;
+    open(E,"${domain}_ext.txt");
+    while(<E>){
+        $count++; progress("reading extensions",$count,$total,0);
+        chomp;
+        my ($f,$u1,$u2) = split(/\t/);
+        my $uns1 = cleanStop($u1);
+        my $uns2 = cleanStop($u2);
+        if($uns1 ne "" && $uns2 ne "" && $uns1 ne $uns2){
+            # look up the stopword-cleaned forms; the FSA holds cleaned units
+            my ($idx1,$frq1) = lookup($fsa,$uns1);
+            my ($idx2,$frq2) = lookup($fsa,$uns2);
+            if($idx1>=0 && $idx2>=0){
+                $ext[$idx1] .= "$idx2,$f ";
+            }
+        }
+    }
+    close(E);
+    progress("reading extensions",$count,$total,1);
+}
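+
+# Note: @ext (and @assoc below) hold adjacency lists as flat strings
+# of "index,frequency " pairs, e.g. "42,7 105,3 " (indices here are
+# illustrative); the trailing space is chop()ed off and the pairs are
+# recovered with split(/[ ,]/) when the XML is written.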
+
+#
+# Read associations.
+#
+if($do_assoc){
+    $total = 0 + `wc -l ${domain}_assoc.txt`;
+    $count = 0;
+    open(A,"${domain}_assoc.txt");
+    while(<A>){
+        $count++; progress("reading associations",$count,$total,0);
+        chomp;
+        my ($f,$u1,$u2) = split(/\t/);
+        my $uns1 = cleanStop($u1);
+        my $uns2 = cleanStop($u2);
+        if($uns1 ne "" && $uns2 ne "" && $uns1 ne $uns2){
+            my ($idx1,$frq1) = lookup($fsa,$uns1);
+            my ($idx2,$frq2) = lookup($fsa,$uns2);
+            if($idx1>=0 && $idx2>=0){
+                $assoc[$idx1] .= "$idx2,$f ";
+                $assoc[$idx2] .= "$idx1,$f ";
+            }
+        }
+    }
+    close(A);
+    progress("reading associations",$count,$total,1);
+}
+
+#
+# Read categories.
+#
+
+if($do_cat){
+    tie my %hash, 'BerkeleyDB::Btree', -Filename => "uCat.db";
+
+    $total = scalar(keys %hash);
+    $count = 0;
+    my $cid = 0;
+    foreach my $c (sort keys %hash){
+        $count++; progress("reading categories",$count,$total,0);
+        if($c ne "Misc" && $c ne "zzz_uncategorized_catchall"){
+            $cats[$cid] = $c;
+            my (@ucs) = split(/\t/,$hash{$c});
+            foreach my $u (@ucs){
+                my ($t,$f) = split(/,/,$u);
+                my ($idx,$frq) = lookup($fsa,cleanStop($t));
+                if($idx>=0){
+                    if(defined($ucats[$idx])){
+                        if(!($ucats[$idx]=~/\b$cid\b/)){
+                            $ucats[$idx] .= ",$cid";
+                        }
+                    }
+                    else{
+                        $ucats[$idx] = "$cid";
+                    }
+                }
+            }
+            $cid++;
+        }
+    }
+    progress("reading categories",$count,$total,1);
+    untie %hash;
+}
+
+
+#
+# Write XML output.
+#
+$count=0;
+$total=$#unit+1;
+
+if($output_file eq ""){
+    $output_file = "${domain}.xml";
+}
+open(X,">$output_file");
+print X "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
+print X "<cn domain=\"$domain\">\n";
+for(my $i=0;$i<=$#unit;$i++){
+    $count++; progress("writing xml",$count,$total,0);
+    print X " <unit f=\"".(0+$unit_f[$i])."\" qf=\"".(0+$unit_qf[$i])."\" qfc=\"".(0+$unit_qfc[$i])."\" qfs=\"".(0+$unit_qfs[$i])."\">\n";
+    print X "  <name>" . $unit[$i] . "</name>\n";
+    print X "  <extensions>\n";
+    if(defined($ext[$i]) && $ext[$i] ne ""){
+        chop($ext[$i]);
+        my @us = split(/[ ,]/,$ext[$i]);
+        for(my $j=0;$j<$#us+1;$j+=2){
+            print X "   <ext f=\"".$us[$j+1]."\">".$unit[$us[$j]]."</ext>\n";
+        }
+    }
+    print X "  </extensions>\n";
+    print X "  <associations>\n";
+    if(defined($assoc[$i]) && $assoc[$i] ne ""){
+        chop($assoc[$i]);
+        my @usr = split(/[ ,]/,$assoc[$i]);
+        my (@us) = aggregate(@usr);
+        for(my $j=0;$j<$#us+1;$j+=2){
+            print X "   <assoc f=\"".$us[$j+1]."\">".$unit[$us[$j]]."</assoc>\n";
+        }
+    }
+    print X "  </associations>\n";
+    print X "  <categories>\n";
+    if(defined($ucats[$i]) && $ucats[$i] ne ""){
+        my @ucs = split(/,/,$ucats[$i]);
+        foreach my $c (@ucs){
+            print X "   <cat>$cats[$c]</cat>\n";
+        }
+    }
+    print X "  </categories>\n";
+    print X " </unit>\n";
+}
+progress("writing xml",$count,$total,1);
+print X "</cn>\n";
+close(X);
+
+__END__
+
+=head1 NAME
+
+cn_txt2xml - Convert a concept network to a single XML file.
+
+=head1 SYNOPSIS
+
+cn_txt2xml [options] domain
+
+Options:
+
+  --[no]qfreq, -[no]q        [do not] retrieve query frequencies
+  --[no]sqfreq, -[no]s       [do not] retrieve term-sorted query frequencies
+  --[no]ext, -[no]e          [do not] process extensions
+  --[no]assoc, -[no]a        [do not] process associations
+  --[no]cat, -[no]c          [do not] process categories
+  --[no]fsa, -[no]f          [do not] build fsa
+  --stopwords=FILE, -w FILE  use the given stopwords file
+  --output-file=FILE, -o FILE  write output to FILE (default: domain.xml)
+  --verbose, -v              be verbose
+  --help, -h                 brief help message
+  --man, -m                  full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<-help>
+
+Prints a brief help message and exits.
+
+=item B<-man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<cn_txt2xml> will convert a concept network to a single XML file.
+
+=cut