diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/src/apps |
Publish
Diffstat (limited to 'fsa/src/apps')
-rw-r--r-- | fsa/src/apps/.gitignore | 3 | ||||
-rw-r--r-- | fsa/src/apps/fsadump/.gitignore | 1 | ||||
-rw-r--r-- | fsa/src/apps/fsadump/CMakeLists.txt | 9 | ||||
-rw-r--r-- | fsa/src/apps/fsadump/fsadump.cpp | 186 | ||||
-rw-r--r-- | fsa/src/apps/fsainfo/.gitignore | 1 | ||||
-rw-r--r-- | fsa/src/apps/fsainfo/CMakeLists.txt | 9 | ||||
-rw-r--r-- | fsa/src/apps/fsainfo/fsainfo.cpp | 124 | ||||
-rw-r--r-- | fsa/src/apps/makefsa/.gitignore | 1 | ||||
-rw-r--r-- | fsa/src/apps/makefsa/CMakeLists.txt | 9 | ||||
-rw-r--r-- | fsa/src/apps/makefsa/makefsa.cpp | 295 |
10 files changed, 638 insertions, 0 deletions
diff --git a/fsa/src/apps/.gitignore b/fsa/src/apps/.gitignore new file mode 100644 index 00000000000..85c014ca23b --- /dev/null +++ b/fsa/src/apps/.gitignore @@ -0,0 +1,3 @@ +Makefile +.depend +vespa-*-* diff --git a/fsa/src/apps/fsadump/.gitignore b/fsa/src/apps/fsadump/.gitignore new file mode 100644 index 00000000000..36c86d6022c --- /dev/null +++ b/fsa/src/apps/fsadump/.gitignore @@ -0,0 +1 @@ +fsadump diff --git a/fsa/src/apps/fsadump/CMakeLists.txt b/fsa/src/apps/fsadump/CMakeLists.txt new file mode 100644 index 00000000000..069bdfb379b --- /dev/null +++ b/fsa/src/apps/fsadump/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_fsadump_app + SOURCES + fsadump.cpp + OUTPUT_NAME fsadump + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/fsadump/fsadump.cpp b/fsa/src/apps/fsadump/fsadump.cpp new file mode 100644 index 00000000000..a713b5dd30f --- /dev/null +++ b/fsa/src/apps/fsadump/fsadump.cpp @@ -0,0 +1,186 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/base64.h> +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> + +using namespace fsa; + +enum FSA_Input_Format { + OUTPUT_UNDEF, + OUTPUT_TEXT, + OUTPUT_TEXT_EMPTY, + OUTPUT_TEXT_NUM, + OUTPUT_BINARY, + OUTPUT_BINARY_RAW, + OUTPUT_PHASH, + OUTPUT_DOT +}; + +void error(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } +} + +void usage(const char *name, const char *errormsg = NULL) +{ + error(name,errormsg); + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] fsafile\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -b use binary output format with Base64 encoded info\n"); + fprintf(stderr," -B use binary output format with raw info\n"); + fprintf(stderr," -e use text output format with no info (default)\n"); + fprintf(stderr," -n use text output format with (unsigned) numerical info\n"); + fprintf(stderr," -t use text input format\n"); + fprintf(stderr," -p use perfect hash value instead of meta info (text output)\n"); + fprintf(stderr," -d output dot format\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); +} + +void version() +{ + std::cout << "fsadump " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." << FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + +int main(int argc, char** argv) +{ + FSA_Input_Format format = OUTPUT_UNDEF; + const char *input_file; + + char opt; + extern int optind; + + while((opt=getopt(argc,argv,"ebBhntpdV")) != -1){ + switch(opt){ + case 'b': + format = OUTPUT_BINARY; + break; + case 'B': + format = OUTPUT_BINARY_RAW; + break; + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case 't': + format = OUTPUT_TEXT; + break; + case 'n': + format = OUTPUT_TEXT_NUM; + break; + case 'e': + format = OUTPUT_TEXT_EMPTY; + break; + case 'p': + format = OUTPUT_PHASH; + break; + case 'd': + format = OUTPUT_DOT; + break; + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(optind!=argc-1){ + usage(argv[0],"required parameter(s) missing"); + exit(1); + } + + if(format==OUTPUT_UNDEF) // use default format (warning?) + format=OUTPUT_TEXT_EMPTY; + + input_file = argv[optind]; + + FSA fsa(input_file); + + if(!fsa.isOk()){ + std::cerr << "Failed to open fsa file (" << input_file << ")" << std::endl; + exit(1); + } + + std::string meta,temp; + uint32_t num_meta; + uint32_t lines=0; + + if(format!=OUTPUT_DOT){ + + for(FSA::iterator it(fsa); it!=fsa.end(); ++it){ + + switch(format){ + case OUTPUT_BINARY: + temp.assign((const char *)(it->data()),it->dataSize()); + Base64::encode(temp,meta); + std::cout << it->str() << '\0' << meta << '\0'; + break; + case OUTPUT_BINARY_RAW: + meta.assign((const char *)(it->data()),it->dataSize()); + std::cout << it->str() << '\0' << meta << '\0'; + break; + case OUTPUT_TEXT: + meta.assign((const char *)(it->data()),it->dataSize()); + if(meta.size()>0 && meta[meta.size()-1]==0){ + meta.resize(meta.size()-1); + } + std::cout << it->str() << '\t' << meta << '\n'; + break; + case OUTPUT_TEXT_NUM: + switch(it->dataSize()){ + case 1: + num_meta = *((const uint8_t*)it->data()); + break; + case 2: + case 3: + num_meta = *((const uint16_t*)it->data()); + break; + case 4: + default: + num_meta = *((const uint32_t*)it->data()); + break; + } + std::cout << it->str() << '\t' << num_meta << '\n'; + break; + case OUTPUT_PHASH: + std::cout << it->str() << '\t' << lines << '\n'; + break; + case OUTPUT_TEXT_EMPTY: + std::cout << it->str() << '\n'; + break; + default: + assert(0); + break; + } + + ++lines; + } + } + + else { + fsa.printDot(); + } + + return 0; +} diff --git a/fsa/src/apps/fsainfo/.gitignore b/fsa/src/apps/fsainfo/.gitignore new file mode 100644 index 00000000000..fc50ebfe566 --- /dev/null +++ b/fsa/src/apps/fsainfo/.gitignore @@ -0,0 +1 @@ +fsainfo diff --git a/fsa/src/apps/fsainfo/CMakeLists.txt b/fsa/src/apps/fsainfo/CMakeLists.txt new file mode 100644 index 00000000000..c16332ed20b --- /dev/null +++ b/fsa/src/apps/fsainfo/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_fsainfo_app + SOURCES + fsainfo.cpp + OUTPUT_NAME fsainfo + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/fsainfo/fsainfo.cpp b/fsa/src/apps/fsainfo/fsainfo.cpp new file mode 100644 index 00000000000..efbe6075331 --- /dev/null +++ b/fsa/src/apps/fsainfo/fsainfo.cpp @@ -0,0 +1,124 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/fsa.h> + +using namespace fsa; + +void usage(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] fsa\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); +} + +void version() +{ + std::cout << "fsainfo " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." << FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + +int main(int argc, char** argv) +{ + const char *fsa_file; + + char opt; + extern int optind; + + while((opt=getopt(argc,argv,"hV")) != -1){ + switch(opt){ + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(optind!=argc-1){ + usage(argv[0],"required parameter fsa is missing"); + exit(1); + } + + fsa_file = argv[optind]; + + + + FSA::Header header; + + size_t r; + + int fd = ::open(fsa_file,O_RDONLY); + if(fd<0){ + std::cerr << "Failed to open fsa file (" << fsa_file << ")" << std::endl; + return 1; + } + else{ + r=::read(fd,&header,sizeof(header)); + ::close(fd); + if(r<sizeof(header) || header._magic!=FSA::MAGIC){ + std::cout << "Unrecognized file format (" << fsa_file << ")\n"; + } + else if(header._version<1000){ + std::cout << "Obsolete fsa file (" << fsa_file << ")\n"; + } + else { + std::cout << "Information about " << fsa_file << ":\n"; + std::cout << " Header size: " << sizeof(header) << " bytes" <<std::endl; + std::cout << " Magic: " << header._magic << std::endl; + std::cout << " Version: " << header._version/1000000 << "." + << (header._version%1000000)/1000 << "." + << header._version%1000 << std::endl; + std::cout << " Serial number: " << header._serial << std::endl; + std::cout << " Checksum: " << header._checksum << std::endl; + std::cout << " FSA size: " << header._size << " cells" <<std::endl; + std::cout << " " << header._size*(sizeof(unsigned char)+sizeof(unsigned int)) + << " bytes" <<std::endl; + std::cout << " Start state: " << header._start << std::endl; + std::cout << " Data size: " << header._data_size << " bytes" << std::endl; + std::cout << " Data item type: " << (header._data_type==FSA::DATA_FIXED? + "fixed size":"variable size") << std::endl; + if(header._data_type==FSA::DATA_FIXED) + std::cout << " Fixed item size: " << header._fixed_data_size << std::endl; + std::cout << " Perfect hash: " << (header._has_perfect_hash? + "yes":"no") << std::endl; + if(header._has_perfect_hash) + std::cout << " Perfect hash size: " << header._size*sizeof(unsigned int) << " bytes" << std::endl; + std::cout << " Total size: " + << (header._size*(sizeof(unsigned char)+ + sizeof(unsigned int)*(header._has_perfect_hash?2:1)) + + header._data_size + + sizeof(header)) + << " bytes" << std::endl; + std::cout << " Trying to load FSA ... " << std::flush; + + FSA fsa(fsa_file); + std::cout << (fsa.version()==header._version ? "succeeded.":"failed.") << std::endl; + } + } + + return 0; +} diff --git a/fsa/src/apps/makefsa/.gitignore b/fsa/src/apps/makefsa/.gitignore new file mode 100644 index 00000000000..1ea7393bec3 --- /dev/null +++ b/fsa/src/apps/makefsa/.gitignore @@ -0,0 +1 @@ +makefsa diff --git a/fsa/src/apps/makefsa/CMakeLists.txt b/fsa/src/apps/makefsa/CMakeLists.txt new file mode 100644 index 00000000000..80002338479 --- /dev/null +++ b/fsa/src/apps/makefsa/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_makefsa_app + SOURCES + makefsa.cpp + OUTPUT_NAME makefsa + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/makefsa/makefsa.cpp b/fsa/src/apps/makefsa/makefsa.cpp new file mode 100644 index 00000000000..b27485a851e --- /dev/null +++ b/fsa/src/apps/makefsa/makefsa.cpp @@ -0,0 +1,295 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/base64.h> +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> + +using namespace fsa; + +enum FSA_Input_Format { + INPUT_UNDEF, + INPUT_TEXT, + INPUT_TEXT_EMPTY, + INPUT_TEXT_NUM, + INPUT_BINARY, + INPUT_BINARY_RAW }; + +void usage(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] [input_file] output_file\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -b use binary input format with Base64 encoded info\n"); + fprintf(stderr," -B use binary input format with raw\n"); + fprintf(stderr," -e use text input format with no info (default)\n"); + fprintf(stderr," -n use text input format with (unsigned) numerical info\n"); + fprintf(stderr," -s bytes data size for numerical info: 1,2 or 4(default)\n"); + fprintf(stderr," -z bytes data size for binary info (-B) (0 means NUL terminated)\n"); + fprintf(stderr," -t use text input format\n"); + fprintf(stderr," -p build automaton with perfect hash\n"); + fprintf(stderr," -i ignore info string, regardless of input format\n"); + fprintf(stderr," -S serial serial number\n"); + fprintf(stderr," -v be verbose\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); + fprintf(stderr," If input_file is not specified, standard input is used.\n"); +} + +void version() +{ + std::cout << "makefsa " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." << FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + + +int main(int argc, char** argv) +{ + FSA_Input_Format format = INPUT_UNDEF; + unsigned int num_size = 4; + unsigned int info_size_binary = 0; + bool build_phash = false; + const char *input_file; + const char *output_file; + uint32_t serial = 0; + bool ignore_info = false; + bool verbose = false; + unsigned int lines=0,count = 0; + + char opt; + extern char *optarg; + extern int optind; + + while((opt=getopt(argc,argv,"ebBhns:z:tpS:ivV")) != -1){ + switch(opt){ + case 'b': + format = INPUT_BINARY; + break; + case 'B': + format = INPUT_BINARY_RAW; + break; + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case 't': + format = INPUT_TEXT; + break; + case 'n': + format = INPUT_TEXT_NUM; + break; + case 's': + num_size = strtoul(optarg,NULL,0); + if(num_size!=1 && num_size!=2 && num_size!=4){ + usage(argv[0],"invalid numerical info size (-s)"); + exit(1); + } + break; + case 'z': + info_size_binary = strtoul(optarg,NULL,0); + break; + case 'S': + serial = strtoul(optarg,NULL,0); + break; + case 'e': + format = INPUT_TEXT_EMPTY; + break; + case 'p': + build_phash = true; + break; + case 'i': + ignore_info = true; + break; + case 'v': + verbose = true; + break; + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(format==INPUT_UNDEF) // use default format (warning?) + format=INPUT_TEXT_EMPTY; + + if(optind+2==argc){ + input_file = argv[optind]; + output_file = argv[optind+1]; + } + else if(optind+1==argc){ + input_file = NULL; + output_file = argv[optind]; + } + else{ + usage(argv[0],"required parameter(s) missing"); + exit(1); + } + + Automaton automaton; + + std::string input,last_input,meta,temp; + union{ + uint8_t u1; + uint16_t u2; + uint32_t u4; + } num_meta; + std::ifstream infile; + std::istream *in; + char binary_info[info_size_binary]; + size_t split; + bool empty_meta_str = false; + + if(verbose) version(); + + if(verbose) std::cerr << "Initializing automaton ..."; + automaton.init(); + if(verbose) std::cerr << " done." << std::endl; + + if(input_file!=NULL){ + infile.open(input_file); + if (infile.fail()) { + std::cerr << "Error: Could not open file \"" << input_file << "\"\n"; + return(1); + } + in=&infile; + } + else{ + in=&std::cin; + } + if(verbose) std::cerr << "Inserting lines ..."; + while(!in->eof()){ + switch(format){ + case INPUT_BINARY: + getline(*in,input,'\0'); + getline(*in,temp,'\0'); + Base64::decode(temp,meta); + break; + case INPUT_BINARY_RAW: + getline(*in,input,'\0'); + if (info_size_binary) { + in->read(binary_info, info_size_binary); + meta.assign(binary_info, info_size_binary); + } + else + getline(*in,meta,'\0'); + break; + case INPUT_TEXT: + getline(*in,temp,'\n'); + split = temp.find_first_of('\t'); + input = temp.substr(0, split); + if (split == std::string::npos) { + empty_meta_str = true; + break; + } + meta = temp.substr(split + 1); + meta+='\0'; + break; + case INPUT_TEXT_NUM: + getline(*in,temp,'\n'); + split = temp.find_first_of('\t'); + input = temp.substr(0, split); + if (split == std::string::npos) { + empty_meta_str = true; + break; + } + temp = temp.substr(split + 1); + switch(num_size){ + case 1: + num_meta.u1=strtoul(temp.c_str(),NULL,0); + meta.assign((const char*)&num_meta,1); + break; + case 2: + num_meta.u2=strtoul(temp.c_str(),NULL,0); + meta.assign((const char*)&num_meta,2); + break; + case 4: + default: + num_meta.u4=strtoul(temp.c_str(),NULL,0); + meta.assign((const char*)&num_meta,4); + break; + } + break; + case INPUT_TEXT_EMPTY: + getline(*in,input,'\n'); + break; + case INPUT_UNDEF: + assert(0); + break; + } + + ++lines; + + if(input.length()>0){ + if(last_input>input){ + std::cerr << "warning: ignoring unsorted line " << lines << ", \"" << input << "\"\n"; + } + else if(last_input==input){ + std::cerr << "warning: ignoring duplicate line " << lines << ", \"" << input << "\"\n"; + } + else if(empty_meta_str) { + std::cerr << "warning: ignoring line " << lines << ", \"" << input << "\" with missing meta info\n"; + } + else{ + if(format==INPUT_TEXT_EMPTY || ignore_info){ + automaton.insertSortedString(input); + } + else{ + automaton.insertSortedString(input,meta); + } + if(verbose){ + ++count; + if(count%1000==0) + std::cerr << "\rInserting lines ... (inserted " << count << " lines)"; + } + } + last_input=input; + } + empty_meta_str = false; + } + if(verbose) std::cerr << "\rInserting lines ... (inserted " << count << "/" << (lines-1) << " lines) ... done.\n"; + if(input_file!=NULL){ + infile.close(); + } + + + if(verbose) std::cerr << "Finalizing ..."; + automaton.finalize(); + if(verbose) std::cerr << " done." << std::endl; + + + if(build_phash){ + if(verbose) std::cerr << "Adding perfect hash ..."; + automaton.addPerfectHash(); + if(verbose) std::cerr << " done." << std::endl; + } + + + if(verbose) std::cerr << "Writing fsa file ..."; + if (!automaton.write(output_file,serial)) { + std::cerr << "Failed to write fsa file '" << std::string(output_file) << "'. Please check write permissions" << std::endl; + return 1; + } + if(verbose) std::cerr << " done." << std::endl; + + + return 0; +} |