diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/src |
Publish
Diffstat (limited to 'fsa/src')
124 files changed, 22292 insertions, 0 deletions
diff --git a/fsa/src/.gitignore b/fsa/src/.gitignore new file mode 100644 index 00000000000..65ad4d24f75 --- /dev/null +++ b/fsa/src/.gitignore @@ -0,0 +1,6 @@ +.depend +*_test +test.out +/Makefile.ini +/config_command.sh +/fsa.mak diff --git a/fsa/src/alltest/.gitignore b/fsa/src/alltest/.gitignore new file mode 100644 index 00000000000..c950caba857 --- /dev/null +++ b/fsa/src/alltest/.gitignore @@ -0,0 +1,15 @@ +Makefile +.depend +__testfsa__.__fsa__ +fsa_conceptnet_test_app +fsa_detector_test_app +fsa_fsa_create_test_app +fsa_fsa_perf_test_app +fsa_fsa_test_app +fsa_fsamanager_test_app +fsa_lookup_test_app +fsa_ngram_test_app +fsa_segmenter_test_app +fsa_vectorizer_perf_test_app +fsa_vectorizer_test_app +*.output diff --git a/fsa/src/alltest/CMakeLists.txt b/fsa/src/alltest/CMakeLists.txt new file mode 100644 index 00000000000..d82ca400405 --- /dev/null +++ b/fsa/src/alltest/CMakeLists.txt @@ -0,0 +1,70 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(fsa_conceptnet_test_app + SOURCES + conceptnet_test.cpp + DEPENDS + fsamanagers + fsa +) +vespa_add_executable(fsa_detector_test_app + SOURCES + detector_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_test_app + SOURCES + fsa_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_create_test_app + SOURCES + fsa_create_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_perf_test_app + SOURCES + fsa_perftest.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsamanager_test_app + SOURCES + fsamanager_test.cpp + DEPENDS + fsamanagers + fsa +) +vespa_add_executable(fsa_lookup_test_app + SOURCES + lookup_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_ngram_test_app + SOURCES + ngram_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_segmenter_test_app + SOURCES + segmenter_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_vectorizer_test_app + SOURCES + vectorizer_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_vectorizer_perf_test_app + SOURCES + vectorizer_perftest.cpp + DEPENDS + fsa +) +vespa_add_test(NAME fsa_vectorizer_perf_test_app NO_VALGRIND COMMAND sh alltest.sh) diff --git a/fsa/src/alltest/alltest.sh b/fsa/src/alltest/alltest.sh new file mode 100755 index 00000000000..37274721e25 --- /dev/null +++ b/fsa/src/alltest/alltest.sh @@ -0,0 +1,11 @@ +#!/bin/bash +./detector_test.sh +./fsa_test.sh +./fsa_fsa_create_test_app +./fsa_fsa_perf_test_app +./fsa_fsamanager_test_app . __testfsa__.__fsa__ +./lookup_test.sh +./ngram_test.sh +./segmenter_test.sh +./vectorizer_test.sh +./fsa_vectorizer_perf_test_app diff --git a/fsa/src/alltest/conceptnet_test.cpp b/fsa/src/alltest/conceptnet_test.cpp new file mode 100644 index 00000000000..38c020aa511 --- /dev/null +++ b/fsa/src/alltest/conceptnet_test.cpp @@ -0,0 +1,80 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> + +#include <vespa/fsa/conceptnet.h> +#include <vespa/fsamanagers/conceptnetmanager.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + char opt; + //extern char *optarg; + extern int optind; + + bool do_ext = false, do_assoc = false, do_cat = false; + + while((opt=getopt(argc,argv,"aec")) != -1){ + switch(opt){ + case 'a': + do_assoc = true; + break; + case 'e': + do_ext = true; + break; + case 'c': + do_cat = true; + break; + case '?': + fprintf(stderr,"conceptnet_test: unrecognized option"); + exit(1); + } + } + + if(optind>=argc){ + fprintf(stderr,"usage: conceptnet_test [-aec] DOMAIN [UNIT ...]\n"); + exit(1); + } + + std::string domain = argv[optind]; + + if(!ConceptNetManager::instance().load(domain, + domain + ".fsa", + domain + ".dat")){ + fprintf(stderr,"failed to load concept net %s\n",domain.c_str()); + exit(1); + } + + ConceptNet::Handle* cn = ConceptNetManager::instance().get(domain); + + if(cn!=NULL){ + for(int i=optind+1;i<argc;i++){ + int idx = (*cn)->lookup(argv[i]); + printf("%s(%d) : (%d,%d,%d,%d) (%f,%f)\n",argv[i],idx, + (*cn)->frq(idx),(*cn)->cFrq(idx),(*cn)->qFrq(idx),(*cn)->sFrq(idx), + (*cn)->score(idx),(*cn)->strength(idx)); + if(do_ext){ + for(int e = 0; e<(*cn)->numExt(idx); e++){ + printf(" %s, %d\n",(*cn)->lookup((*cn)->ext(idx,e)),(*cn)->extFrq(idx,e)); + } + } + if(do_assoc){ + for(int a = 0; a<(*cn)->numAssoc(idx); a++){ + printf(" %s, %d\n",(*cn)->lookup((*cn)->assoc(idx,a)),(*cn)->assocFrq(idx,a)); + } + } + if(do_cat){ + for(int c = 0; c<(*cn)->numCat(idx); c++){ + printf(" %s\n",(*cn)->catName((*cn)->cat(idx,c))); + } + } + } + } + else { + fprintf(stderr,"failed to load concept net %s\n",domain.c_str()); + exit(1); + } + +} diff --git a/fsa/src/alltest/conceptnet_test.out b/fsa/src/alltest/conceptnet_test.out new file mode 100644 index 00000000000..9f3570cebf1 --- /dev/null +++ b/fsa/src/alltest/conceptnet_test.out @@ -0,0 +1,4 @@ +new york(841954) : 
(-1,-1,-1,-1) (-1.000000,-1.000000) +sunnyvale(1139231) : (-1,-1,-1,-1) (-1.000000,-1.000000) +gibson(479780) : (-1,-1,-1,-1) (-1.000000,-1.000000) +metallica(770993) : (-1,-1,-1,-1) (-1.000000,-1.000000) diff --git a/fsa/src/alltest/detector_test.cpp b/fsa/src/alltest/detector_test.cpp new file mode 100644 index 00000000000..1942c4ba7a6 --- /dev/null +++ b/fsa/src/alltest/detector_test.cpp @@ -0,0 +1,50 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizertest.cpp + * @brief Test for the vectorizer class + * + */ + +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/detector.h> +#include <vespa/fsa/ngram.h> + +using namespace fsa; + +class MyHits : public Detector::Hits{ +public: + MyHits() {}; + ~MyHits() {}; + + void add(const NGram &text, + unsigned int from, int length, + const FSA::State &) + { + std::cout << "detected: [" << from << "," << from+length-1 << "], '" + << text.join(" ",from,length) << "'\n"; + } +}; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? 
argv[1] : "__testfsa__.__fsa__"); + + Detector d(dict); + MyHits h; + + std::string text; + while(!std::cin.eof()){ + getline(std::cin,text); + + d.detect(text,h); + } + + return 0; +} diff --git a/fsa/src/alltest/detector_test.out b/fsa/src/alltest/detector_test.out new file mode 100644 index 00000000000..c5dbbdd08f1 --- /dev/null +++ b/fsa/src/alltest/detector_test.out @@ -0,0 +1,26 @@ +detected: [0,0], 'apple' +detected: [0,0], 'apricot' +detected: [0,0], 'artichoke' +detected: [0,0], 'banana' +detected: [0,0], 'cabbage' +detected: [0,0], 'carrot' +detected: [0,0], 'cherry' +detected: [0,0], 'chili' +detected: [0,0], 'cucumber' +detected: [0,0], 'eggplant' +detected: [0,0], 'grapes' +detected: [0,0], 'lettuce' +detected: [0,0], 'onion' +detected: [0,0], 'paprika' +detected: [0,1], 'passion fruit' +detected: [0,0], 'pea' +detected: [0,0], 'peach' +detected: [0,0], 'pear' +detected: [0,0], 'pineapple' +detected: [0,0], 'plum' +detected: [0,0], 'potato' +detected: [0,0], 'pumpkin' +detected: [0,1], 'sour cherry' +detected: [1,1], 'cherry' +detected: [0,0], 'squash' +detected: [0,0], 'tomato' diff --git a/fsa/src/alltest/detector_test.sh b/fsa/src/alltest/detector_test.sh new file mode 100755 index 00000000000..dd6f650a35c --- /dev/null +++ b/fsa/src/alltest/detector_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_detector_test_app < testinput.txt > detector_test.output +diff detector_test.output detector_test.out diff --git a/fsa/src/alltest/fsa_create_test.cpp b/fsa/src/alltest/fsa_create_test.cpp new file mode 100644 index 00000000000..c72ea900aad --- /dev/null +++ b/fsa/src/alltest/fsa_create_test.cpp @@ -0,0 +1,94 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <iostream> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int, char**) +{ + + Automaton *aut = new Automaton; + + Blob fruit("Fruit"), veggie("Vegetable"), city("City"); + + TimeStamp t; + + aut->init(); + + aut->insertSortedString("Cupertino",city); + aut->insertSortedString("Foster City",city); + aut->insertSortedString("Los Altos",city); + aut->insertSortedString("Menlo Park",city); + aut->insertSortedString("Mountain View",city); + aut->insertSortedString("Palo Alto",city); + aut->insertSortedString("San Francisco",city); + aut->insertSortedString("San Jose",city); + aut->insertSortedString("Santa Clara",city); + aut->insertSortedString("Saratoga",city); + aut->insertSortedString("Sunnyvale",city); + aut->insertSortedString("apple",fruit); + aut->insertSortedString("apricot",fruit); + aut->insertSortedString("artichoke",veggie); + aut->insertSortedString("banana",fruit); + aut->insertSortedString("cabbage",veggie); + aut->insertSortedString("carrot",veggie); + aut->insertSortedString("cherry",fruit); + aut->insertSortedString("chili",veggie); + aut->insertSortedString("cucumber",veggie); + aut->insertSortedString("eggplant",veggie); + aut->insertSortedString("grapes",fruit); + aut->insertSortedString("lettuce",veggie); + aut->insertSortedString("onion",veggie); + aut->insertSortedString("paprika",veggie); + aut->insertSortedString("passion fruit",fruit); + aut->insertSortedString("pea",veggie); + aut->insertSortedString("peach",fruit); + aut->insertSortedString("pear",fruit); + aut->insertSortedString("pineapple",fruit); + aut->insertSortedString("plum",fruit); + aut->insertSortedString("potato",veggie); + aut->insertSortedString("pumpkin",veggie); + aut->insertSortedString("sour cherry",fruit); + aut->insertSortedString("squash",veggie); + aut->insertSortedString("tomato",veggie); + + aut->finalize(); + + double d1 = t.elapsed(); + + aut->addPerfectHash(); + + 
double d2 = t.elapsed(); + + aut->write("__testfsa__.__fsa__"); + + double d3 = t.elapsed(); + + FSA *fsa = aut->getFSA(); + + double d4 = t.elapsed(); + + std::cout << "Automoaton build finished (" << 1000*d1 << "ms," << 1000*(d2-d1) << "ms)" + << ", fsa retrieval (" << 1000*(d4-d3) << "ms) " << ((fsa==NULL)?"failed":"succeded") << ".\n"; + + if(fsa!=NULL){ + FSA::State fs(*fsa); + const unsigned char *pb = fs.lookup("cucumber"); + std::cout << "Lookup(\"cucumber\") -> "; + if(pb!=NULL){ + std::cout << "\"" << pb << "\""; + } + else{ + std::cout << "not found."; + } + std::cout << "\n"; + } + + delete aut; + delete fsa; + + return 0; +} diff --git a/fsa/src/alltest/fsa_perftest.cpp b/fsa/src/alltest/fsa_perftest.cpp new file mode 100644 index 00000000000..90d2c042b07 --- /dev/null +++ b/fsa/src/alltest/fsa_perftest.cpp @@ -0,0 +1,77 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <stdlib.h> +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int, char**) +{ + FSA f("__testfsa__.__fsa__"); + FSA::State s(f); + FSA::HashedState hs(f); + FSA::MemoryState ms(f); + FSA::HashedMemoryState hms(f); + FSA::CounterState cs(f); + std::string input("cucumber"); + unsigned int count=10000000,i; + + std::cout << "Number of lookups: " << count << std::endl; + std::cout << "Input string length: " << input.length() << std::endl; + std::cout << std::endl; + + TimeStamp t; + double t0,t1; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + s.start(); + s.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "State: " << t1*1000 << " ms" << "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + hs.start(); + hs.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "HashedState: " << t1*1000 << " ms"<< "\t" + << (unsigned 
int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + ms.start(); + ms.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "MemoryState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + hms.start(); + hms.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "HashedMemoryState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + cs.start(); + cs.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "CounterState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + return 0; +} diff --git a/fsa/src/alltest/fsa_test.cpp b/fsa/src/alltest/fsa_test.cpp new file mode 100644 index 00000000000..5bc95f20430 --- /dev/null +++ b/fsa/src/alltest/fsa_test.cpp @@ -0,0 +1,114 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdio.h> +#include <string> + +#include <vespa/fsa/fsa.h> + +using namespace fsa; + +int main(int, char**) +{ + FSA *f = new FSA("__testfsa__.__fsa__", FILE_ACCESS_MMAP); + FSA::State *fs = new FSA::State(*f); + + std::string s("cucu"); + fs->start(s); + fs->delta('m'); + fs->delta("ber"); + if(fs->isFinal()){ + printf("start/delta test: string(\"cucu\")+'m'+\"ber\" is accepted\n"); + printf(" data size: %d\n",fs->dataSize()); + printf(" data string: \"%-*.*s\"\n",fs->dataSize(),fs->dataSize(),fs->data()); + } + else { + printf("start/delta test failed.\n"); + } + + const unsigned char *pb = fs->lookup("cucumber"); + if(pb!=NULL){ + printf("lookup test: \"cucumber\" -> \"%s\"\n",pb); + } + else{ + printf("lookup test: \"cucumber\" not found.\n"); + } + + + FSA::HashedState *fs1 = new FSA::HashedState(*f); + + + fs1->delta("pe"); + + FSA::HashedState *fs2 = new FSA::HashedState(*fs1); + FSA::HashedState *fs3 = new FSA::HashedState(*fs1); + + + + fs1->delta("a"); + fs2->delta("ach"); + fs3->delta("ar"); + + if(fs1->isFinal() && fs2->isFinal()){ + printf("copy hashed state test:\n"); + printf(" \"pe\"+\"a\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs1->hash(),fs1->dataSize(),fs1->dataSize(),fs1->dataSize(),fs1->data()); + printf(" \"pe\"+\"ach\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs2->hash(),fs2->dataSize(),fs2->dataSize(),fs2->dataSize(),fs2->data()); + printf(" \"pe\"+\"ar\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs3->hash(),fs3->dataSize(),fs3->dataSize(),fs3->dataSize(),fs3->data()); + + } + else { + printf("copy hashed state test failed.\n"); + } + + printf("revLookup test:\n"); + unsigned int i=0; + std::string res; + while(i<100){ + res=fs2->revLookup(i); + if(res.size()==0) + break; + fs2->lookup(res); + printf(" %d -> %s -> %d\n",i,res.c_str(),fs2->hash()); + i++; + } + + printf("iterator test:\n"); + fs1->start('p'); + printf(" possible continuations 
from \"p\":\n"); + for(FSA::iterator it(*fs1); it!=fs1->end(); ++it){ + printf(" \"p\" + \"%s\"\n",it->str().c_str()); + } + + delete fs; + delete fs1; + delete fs2; + delete fs3; + + + printf("counter/memory state test\n"); + FSA::CounterState *cs = new FSA::CounterState(*f); + FSA::MemoryState *ms = new FSA::MemoryState(*f); + + cs->start("cucu"); + ms->start("cucu"); + printf(" \"cucu\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + cs->start("cucumber"); + ms->start("cucumber"); + printf(" \"cucumber\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + cs->start("cucumber slumber"); + ms->start("cucumber slumber"); + printf(" \"cucumber slumber\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + delete cs; + delete ms; + delete f; + + return 0; +} diff --git a/fsa/src/alltest/fsa_test.out b/fsa/src/alltest/fsa_test.out new file mode 100644 index 00000000000..b9c96e5b795 --- /dev/null +++ b/fsa/src/alltest/fsa_test.out @@ -0,0 +1,60 @@ +start/delta test: string("cucu")+'m'+"ber" is accepted + data size: 10 + data string: "Vegetable " +lookup test: "cucumber" -> "Vegetable" +copy hashed state test: + "pe"+"a": hash=26, data_size=10, data string="Vegetable " + "pe"+"ach": hash=27, data_size=6, data string="Fruit " + "pe"+"ar": hash=28, data_size=6, data string="Fruit " +revLookup test: + 0 -> Cupertino -> 0 + 1 -> Foster City -> 1 + 2 -> Los Altos -> 2 + 3 -> Menlo Park -> 3 + 4 -> Mountain View -> 4 + 5 -> Palo Alto -> 5 + 6 -> San Francisco -> 6 + 7 -> San Jose -> 7 + 8 -> Santa Clara -> 8 + 9 -> Saratoga -> 9 + 10 -> Sunnyvale -> 10 + 11 -> apple -> 11 + 12 -> apricot -> 12 + 13 -> artichoke -> 13 + 14 -> banana -> 14 + 15 -> cabbage -> 15 + 16 -> carrot -> 16 + 17 -> cherry -> 17 + 18 -> chili -> 18 + 19 -> cucumber -> 19 + 20 -> eggplant -> 20 + 21 -> grapes -> 21 + 22 -> lettuce -> 22 + 23 -> onion -> 23 + 24 -> paprika -> 24 + 25 -> passion fruit -> 25 + 26 -> pea -> 26 + 27 -> peach -> 27 + 28 -> pear -> 28 + 29 -> pineapple -> 29 + 30 -> plum 
-> 30 + 31 -> potato -> 31 + 32 -> pumpkin -> 32 + 33 -> sour cherry -> 33 + 34 -> squash -> 34 + 35 -> tomato -> 35 +iterator test: + possible continuations from "p": + "p" + "aprika" + "p" + "assion fruit" + "p" + "ea" + "p" + "each" + "p" + "ear" + "p" + "ineapple" + "p" + "lum" + "p" + "otato" + "p" + "umpkin" +counter/memory state test + "cucu" -> cucu:4 + "cucumber" -> cucumber:8 + "cucumber slumber" -> cucumber:8 diff --git a/fsa/src/alltest/fsa_test.sh b/fsa/src/alltest/fsa_test.sh new file mode 100755 index 00000000000..497fd291c4d --- /dev/null +++ b/fsa/src/alltest/fsa_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_fsa_test_app > fsa_test.output +diff fsa_test.output fsa_test.out diff --git a/fsa/src/alltest/fsamanager_test.cpp b/fsa/src/alltest/fsamanager_test.cpp new file mode 100644 index 00000000000..7ca4a2d8e8a --- /dev/null +++ b/fsa/src/alltest/fsamanager_test.cpp @@ -0,0 +1,25 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fsa/fsa.h> +#include <vespa/fsamanagers/fsamanager.h> + +#include <iostream> +#include <string> +#include <stdlib.h> + +using namespace fsa; + +int main(int argc, char** argv) +{ + if(argc<3){ + std::cerr << "usage: fsamanager_test cache_dir fsa_file_or_url [fsa_file_or_url ...]\n"; + exit(1); + } + + FSAManager::instance().setCacheDir(argv[1]); + + for(int i=2;i<argc;i++){ + std::cerr << "Loading " << argv[i] << " ... "; + std::cerr << (FSAManager::instance().load(argv[i],argv[i]) ? "ok":"failed") << "\n"; + } + +} diff --git a/fsa/src/alltest/lookup_test.cpp b/fsa/src/alltest/lookup_test.cpp new file mode 100644 index 00000000000..6ff4e3063d4 --- /dev/null +++ b/fsa/src/alltest/lookup_test.cpp @@ -0,0 +1,49 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <stdlib.h> +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> + +#if (__GNUG__ <3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1)) +namespace std { +const char *left = ""; +} +#endif + +using namespace fsa; + +int main(int argc, char** argv) +{ + + if(argc!=2){ + std::cerr << "usage: lookup_test fsafile <input >output" << std::endl; + exit(1); + } + + FSA f(argv[1]); + FSA::HashedState fs(f); + std::string input; + + while(!std::cin.eof()){ + getline(std::cin,input); + + if(input.size()>0){ + fs.start(input); + if(fs.isFinal()){ + std::cout << "'" << input << "'" << " is accepted, hash value: " << fs.hash() + << ", data size: " << fs.dataSize() + << ", data string: \"" + << std::setw(fs.dataSize()) << std::left << fs.data() + << "\"" << std::endl; + } + else{ + std::cout << "'" << input << "'" << " is not accepted." << std::endl; + } + } + } + + return 0; +} diff --git a/fsa/src/alltest/lookup_test.out b/fsa/src/alltest/lookup_test.out new file mode 100644 index 00000000000..b7dd9b4da4b --- /dev/null +++ b/fsa/src/alltest/lookup_test.out @@ -0,0 +1,41 @@ +'Cupertino' is accepted, hash value: 0, data size: 5, data string: "City " +'Foster City' is accepted, hash value: 1, data size: 5, data string: "City " +'Los Altos' is accepted, hash value: 2, data size: 5, data string: "City " +'Menlo Park' is accepted, hash value: 3, data size: 5, data string: "City " +'Mountain View' is accepted, hash value: 4, data size: 5, data string: "City " +'Palo Alto' is accepted, hash value: 5, data size: 5, data string: "City " +'San Francisco' is accepted, hash value: 6, data size: 5, data string: "City " +'San Jose' is accepted, hash value: 7, data size: 5, data string: "City " +'Santa Clara' is accepted, hash value: 8, data size: 5, data string: "City " +'Saratoga' is accepted, hash value: 9, data size: 5, data string: "City " +'Sunnyvale' is accepted, hash value: 10, data size: 5, data string: "City " +'apple' is accepted, hash value: 
11, data size: 6, data string: "Fruit " +'apricot' is accepted, hash value: 12, data size: 6, data string: "Fruit " +'artichoke' is accepted, hash value: 13, data size: 10, data string: "Vegetable " +'banana' is accepted, hash value: 14, data size: 6, data string: "Fruit " +'cabbage' is accepted, hash value: 15, data size: 10, data string: "Vegetable " +'carrot' is accepted, hash value: 16, data size: 10, data string: "Vegetable " +'cherry' is accepted, hash value: 17, data size: 6, data string: "Fruit " +'chili' is accepted, hash value: 18, data size: 10, data string: "Vegetable " +'cucumber' is accepted, hash value: 19, data size: 10, data string: "Vegetable " +'eggplant' is accepted, hash value: 20, data size: 10, data string: "Vegetable " +'grapes' is accepted, hash value: 21, data size: 6, data string: "Fruit " +'lettuce' is accepted, hash value: 22, data size: 10, data string: "Vegetable " +'onion' is accepted, hash value: 23, data size: 10, data string: "Vegetable " +'paprika' is accepted, hash value: 24, data size: 10, data string: "Vegetable " +'passion fruit' is accepted, hash value: 25, data size: 6, data string: "Fruit " +'pea' is accepted, hash value: 26, data size: 10, data string: "Vegetable " +'peach' is accepted, hash value: 27, data size: 6, data string: "Fruit " +'pear' is accepted, hash value: 28, data size: 6, data string: "Fruit " +'pineapple' is accepted, hash value: 29, data size: 6, data string: "Fruit " +'plum' is accepted, hash value: 30, data size: 6, data string: "Fruit " +'potato' is accepted, hash value: 31, data size: 10, data string: "Vegetable " +'pumpkin' is accepted, hash value: 32, data size: 10, data string: "Vegetable " +'sour cherry' is accepted, hash value: 33, data size: 6, data string: "Fruit " +'squash' is accepted, hash value: 34, data size: 10, data string: "Vegetable " +'tomato' is accepted, hash value: 35, data size: 10, data string: "Vegetable " +'alpha' is not accepted. +'beta' is not accepted. 
+'gamma' is not accepted. +'delta' is not accepted. +'epsilon' is not accepted. diff --git a/fsa/src/alltest/lookup_test.sh b/fsa/src/alltest/lookup_test.sh new file mode 100755 index 00000000000..394baecc78a --- /dev/null +++ b/fsa/src/alltest/lookup_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_lookup_test_app __testfsa__.__fsa__ < testinput.txt > lookup_test.output +diff lookup_test.output lookup_test.out diff --git a/fsa/src/alltest/ngram_test.cpp b/fsa/src/alltest/ngram_test.cpp new file mode 100644 index 00000000000..7f0be7769e1 --- /dev/null +++ b/fsa/src/alltest/ngram_test.cpp @@ -0,0 +1,57 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <iostream> + +#include <vespa/fsa/permuter.h> +#include <vespa/fsa/selector.h> +#include <vespa/fsa/ngram.h> +#include <vespa/fsa/base64.h> +#include <vespa/fsa/wordchartokenizer.h> + +using namespace fsa; + +int main(int, char **) +{ + Permuter p; + + NGram q1("a b c d e f"), q2(q1,p,10), q3(q2,p,13); + + Selector s; + + std::string s1("this is a test"), s2; + + Base64::encode(s1,s2); + std::cout << "'" << s1 << "'" << std::endl; + std::cout << "'" << s2 << "'" << std::endl; + Base64::decode(s2,s1); + std::cout << "'" << s1 << "'" << std::endl; + + + std::cout << q1 << std::endl; + std::cout << q2 << std::endl; + std::cout << q3 << std::endl; + + q2.sort(); + std::cout << q2 << std::endl; + q2.reverse(); + std::cout << q2 << std::endl; + + std::cout << std::hex; + for(unsigned int n=1;n<=6;n++){ + unsigned int c=Permuter::firstComb(n,6); + while(c>0){ + s.clear(); + s.set(c); + q2.set(q1,s); + std::cout << c << ": " << q2 << std::endl; + c=Permuter::nextComb(c,6); + } + } + std::cout << std::dec; + + WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_SMART,"PUNCT"); + + NGram q4("test, wordchar tokenizer. 
does it work?",tokenizer); + + std::cout << q4.join(" -|- ") << std::endl; + +} diff --git a/fsa/src/alltest/ngram_test.out b/fsa/src/alltest/ngram_test.out new file mode 100644 index 00000000000..d826e3173dd --- /dev/null +++ b/fsa/src/alltest/ngram_test.out @@ -0,0 +1,72 @@ +'this is a test' +'dGhpcyBpcyBhIHRlc3Q=' +'this is a test' +a b c d e f +b d a c e f +a b c d e f +a b c d e f +f e d c b a +1: a +2: b +4: c +8: d +10: e +20: f +3: a b +5: a c +6: b c +9: a d +a: b d +c: c d +11: a e +12: b e +14: c e +18: d e +21: a f +22: b f +24: c f +28: d f +30: e f +7: a b c +b: a b d +d: a c d +e: b c d +13: a b e +15: a c e +16: b c e +19: a d e +1a: b d e +1c: c d e +23: a b f +25: a c f +26: b c f +29: a d f +2a: b d f +2c: c d f +31: a e f +32: b e f +34: c e f +38: d e f +f: a b c d +17: a b c e +1b: a b d e +1d: a c d e +1e: b c d e +27: a b c f +2b: a b d f +2d: a c d f +2e: b c d f +33: a b e f +35: a c e f +36: b c e f +39: a d e f +3a: b d e f +3c: c d e f +1f: a b c d e +2f: a b c d f +37: a b c e f +3b: a b d e f +3d: a c d e f +3e: b c d e f +3f: a b c d e f +test -|- PUNCT -|- wordchar -|- tokenizer -|- PUNCT -|- does -|- it -|- work diff --git a/fsa/src/alltest/ngram_test.sh b/fsa/src/alltest/ngram_test.sh new file mode 100755 index 00000000000..85559d6e391 --- /dev/null +++ b/fsa/src/alltest/ngram_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_ngram_test_app > ngram_test.output +diff ngram_test.output ngram_test.out diff --git a/fsa/src/alltest/segmenter_test.cpp b/fsa/src/alltest/segmenter_test.cpp new file mode 100644 index 00000000000..3b80fe3390e --- /dev/null +++ b/fsa/src/alltest/segmenter_test.cpp @@ -0,0 +1,74 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file segmenter_test.cpp + * @brief Test for the Segmenter class + * + */ + +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/segmenter.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Segmenter segmenter(dict); + Segmenter::Segments segments; + const Segmenter::Segmentation *segmentation; + + std::string text; + while(!std::cin.eof()){ + getline(std::cin,text); + + if(text.size()>3){ + + segmenter.segment(text,segments); + + std::cout << "List of all segments:" << std::endl; + for(unsigned int i=0; i<segments.size(); i++){ + std::cout << " " + << segments.sgm(i) << ":" << segments.conn(i) << " [" + << segments.beg(i) << "," << segments.end(i)-1 << "]" + << std::endl; + } + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_WEIGHTED); + + std::cout << "Weighted segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_RIGHTMOST_LONGEST); + + std::cout << "Rightmost-longest segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_LEFTMOST_LONGEST); + + std::cout << "Lefttmost-longest segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + } + + } + + return 0; +} diff --git a/fsa/src/alltest/segmenter_test.out b/fsa/src/alltest/segmenter_test.out new file mode 100644 index 00000000000..d8c42cfacce --- 
/dev/null +++ b/fsa/src/alltest/segmenter_test.out @@ -0,0 +1,332 @@ +List of all segments: + cupertino:0 [0,0] +Weighted segmentation: + (cupertino) +Rightmost-longest segmentation: + (cupertino) +Lefttmost-longest segmentation: + (cupertino) +List of all segments: + foster:0 [0,0] + city:0 [1,1] +Weighted segmentation: + (foster)(city) +Rightmost-longest segmentation: + (foster)(city) +Lefttmost-longest segmentation: + (foster)(city) +List of all segments: + los:0 [0,0] + altos:0 [1,1] +Weighted segmentation: + (los)(altos) +Rightmost-longest segmentation: + (los)(altos) +Lefttmost-longest segmentation: + (los)(altos) +List of all segments: + menlo:0 [0,0] + park:0 [1,1] +Weighted segmentation: + (menlo)(park) +Rightmost-longest segmentation: + (menlo)(park) +Lefttmost-longest segmentation: + (menlo)(park) +List of all segments: + mountain:0 [0,0] + view:0 [1,1] +Weighted segmentation: + (mountain)(view) +Rightmost-longest segmentation: + (mountain)(view) +Lefttmost-longest segmentation: + (mountain)(view) +List of all segments: + palo:0 [0,0] + alto:0 [1,1] +Weighted segmentation: + (palo)(alto) +Rightmost-longest segmentation: + (palo)(alto) +Lefttmost-longest segmentation: + (palo)(alto) +List of all segments: + san:0 [0,0] + francisco:0 [1,1] +Weighted segmentation: + (san)(francisco) +Rightmost-longest segmentation: + (san)(francisco) +Lefttmost-longest segmentation: + (san)(francisco) +List of all segments: + san:0 [0,0] + jose:0 [1,1] +Weighted segmentation: + (san)(jose) +Rightmost-longest segmentation: + (san)(jose) +Lefttmost-longest segmentation: + (san)(jose) +List of all segments: + santa:0 [0,0] + clara:0 [1,1] +Weighted segmentation: + (santa)(clara) +Rightmost-longest segmentation: + (santa)(clara) +Lefttmost-longest segmentation: + (santa)(clara) +List of all segments: + saratoga:0 [0,0] +Weighted segmentation: + (saratoga) +Rightmost-longest segmentation: + (saratoga) +Lefttmost-longest segmentation: + (saratoga) +List of all segments: + 
sunnyvale:0 [0,0] +Weighted segmentation: + (sunnyvale) +Rightmost-longest segmentation: + (sunnyvale) +Lefttmost-longest segmentation: + (sunnyvale) +List of all segments: + apple:1769304646 [0,0] +Weighted segmentation: + (apple) +Rightmost-longest segmentation: + (apple) +Lefttmost-longest segmentation: + (apple) +List of all segments: + apricot:1769304646 [0,0] +Weighted segmentation: + (apricot) +Rightmost-longest segmentation: + (apricot) +Lefttmost-longest segmentation: + (apricot) +List of all segments: + artichoke:1701274966 [0,0] +Weighted segmentation: + (artichoke) +Rightmost-longest segmentation: + (artichoke) +Lefttmost-longest segmentation: + (artichoke) +List of all segments: + banana:1769304646 [0,0] +Weighted segmentation: + (banana) +Rightmost-longest segmentation: + (banana) +Lefttmost-longest segmentation: + (banana) +List of all segments: + cabbage:1701274966 [0,0] +Weighted segmentation: + (cabbage) +Rightmost-longest segmentation: + (cabbage) +Lefttmost-longest segmentation: + (cabbage) +List of all segments: + carrot:1701274966 [0,0] +Weighted segmentation: + (carrot) +Rightmost-longest segmentation: + (carrot) +Lefttmost-longest segmentation: + (carrot) +List of all segments: + cherry:1769304646 [0,0] +Weighted segmentation: + (cherry) +Rightmost-longest segmentation: + (cherry) +Lefttmost-longest segmentation: + (cherry) +List of all segments: + chili:1701274966 [0,0] +Weighted segmentation: + (chili) +Rightmost-longest segmentation: + (chili) +Lefttmost-longest segmentation: + (chili) +List of all segments: + cucumber:1701274966 [0,0] +Weighted segmentation: + (cucumber) +Rightmost-longest segmentation: + (cucumber) +Lefttmost-longest segmentation: + (cucumber) +List of all segments: + eggplant:1701274966 [0,0] +Weighted segmentation: + (eggplant) +Rightmost-longest segmentation: + (eggplant) +Lefttmost-longest segmentation: + (eggplant) +List of all segments: + grapes:1769304646 [0,0] +Weighted segmentation: + (grapes) 
+Rightmost-longest segmentation: + (grapes) +Lefttmost-longest segmentation: + (grapes) +List of all segments: + lettuce:1701274966 [0,0] +Weighted segmentation: + (lettuce) +Rightmost-longest segmentation: + (lettuce) +Lefttmost-longest segmentation: + (lettuce) +List of all segments: + onion:1701274966 [0,0] +Weighted segmentation: + (onion) +Rightmost-longest segmentation: + (onion) +Lefttmost-longest segmentation: + (onion) +List of all segments: + paprika:1701274966 [0,0] +Weighted segmentation: + (paprika) +Rightmost-longest segmentation: + (paprika) +Lefttmost-longest segmentation: + (paprika) +List of all segments: + passion:0 [0,0] + fruit:0 [1,1] + passion fruit:1769304646 [0,1] +Weighted segmentation: + (passion fruit) +Rightmost-longest segmentation: + (passion fruit) +Lefttmost-longest segmentation: + (passion fruit) +List of all segments: + peach:1769304646 [0,0] +Weighted segmentation: + (peach) +Rightmost-longest segmentation: + (peach) +Lefttmost-longest segmentation: + (peach) +List of all segments: + pear:1769304646 [0,0] +Weighted segmentation: + (pear) +Rightmost-longest segmentation: + (pear) +Lefttmost-longest segmentation: + (pear) +List of all segments: + pineapple:1769304646 [0,0] +Weighted segmentation: + (pineapple) +Rightmost-longest segmentation: + (pineapple) +Lefttmost-longest segmentation: + (pineapple) +List of all segments: + plum:1769304646 [0,0] +Weighted segmentation: + (plum) +Rightmost-longest segmentation: + (plum) +Lefttmost-longest segmentation: + (plum) +List of all segments: + potato:1701274966 [0,0] +Weighted segmentation: + (potato) +Rightmost-longest segmentation: + (potato) +Lefttmost-longest segmentation: + (potato) +List of all segments: + pumpkin:1701274966 [0,0] +Weighted segmentation: + (pumpkin) +Rightmost-longest segmentation: + (pumpkin) +Lefttmost-longest segmentation: + (pumpkin) +List of all segments: + sour:0 [0,0] + cherry:1769304646 [1,1] + sour cherry:1769304646 [0,1] +Weighted segmentation: + (sour 
cherry) +Rightmost-longest segmentation: + (sour cherry) +Lefttmost-longest segmentation: + (sour cherry) +List of all segments: + squash:1701274966 [0,0] +Weighted segmentation: + (squash) +Rightmost-longest segmentation: + (squash) +Lefttmost-longest segmentation: + (squash) +List of all segments: + tomato:1701274966 [0,0] +Weighted segmentation: + (tomato) +Rightmost-longest segmentation: + (tomato) +Lefttmost-longest segmentation: + (tomato) +List of all segments: + alpha:0 [0,0] +Weighted segmentation: + (alpha) +Rightmost-longest segmentation: + (alpha) +Lefttmost-longest segmentation: + (alpha) +List of all segments: + beta:0 [0,0] +Weighted segmentation: + (beta) +Rightmost-longest segmentation: + (beta) +Lefttmost-longest segmentation: + (beta) +List of all segments: + gamma:0 [0,0] +Weighted segmentation: + (gamma) +Rightmost-longest segmentation: + (gamma) +Lefttmost-longest segmentation: + (gamma) +List of all segments: + delta:0 [0,0] +Weighted segmentation: + (delta) +Rightmost-longest segmentation: + (delta) +Lefttmost-longest segmentation: + (delta) +List of all segments: + epsilon:0 [0,0] +Weighted segmentation: + (epsilon) +Rightmost-longest segmentation: + (epsilon) +Lefttmost-longest segmentation: + (epsilon) diff --git a/fsa/src/alltest/segmenter_test.sh b/fsa/src/alltest/segmenter_test.sh new file mode 100755 index 00000000000..d36a6d10057 --- /dev/null +++ b/fsa/src/alltest/segmenter_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_segmenter_test_app < testinput.txt > segmenter_test.output +diff segmenter_test.output segmenter_test.out diff --git a/fsa/src/alltest/testinput.txt b/fsa/src/alltest/testinput.txt new file mode 100644 index 00000000000..fa4afece710 --- /dev/null +++ b/fsa/src/alltest/testinput.txt @@ -0,0 +1,41 @@ +Cupertino +Foster City +Los Altos +Menlo Park +Mountain View +Palo Alto +San Francisco +San Jose +Santa Clara +Saratoga +Sunnyvale +apple +apricot +artichoke +banana +cabbage +carrot +cherry +chili +cucumber +eggplant 
+grapes +lettuce +onion +paprika +passion fruit +pea +peach +pear +pineapple +plum +potato +pumpkin +sour cherry +squash +tomato +alpha +beta +gamma +delta +epsilon diff --git a/fsa/src/alltest/vectorizer_perftest.cpp b/fsa/src/alltest/vectorizer_perftest.cpp new file mode 100644 index 00000000000..582652ec66d --- /dev/null +++ b/fsa/src/alltest/vectorizer_perftest.cpp @@ -0,0 +1,95 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizer_perftest.cpp + * @brief Performance test for the vectorizer class + * + */ + +#include <string> +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/vectorizer.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Vectorizer v(dict); + Vectorizer::TermVector tv; + + + std::string text = + "belfast northern ireland protestant extremists crashed a forklift " + "truck into a belfast pub packed with catholics early friday and tossed " + "gasoline bombs into the building on a road on the front line of " + "tensions between the two communities " + "no one was hurt in the attack police said, though the forklift came " + "crashing through a window just above a bench where a patron had been " + "sitting seconds earlier the bar s owner sean conlon said " + "the customer had just gotten up to go to the toilet so it s really " + "just by the grace of god still he s here today at all conlon said " + "a protestant gang used the stolen vehicle to smash down a heavy metal " + "security grill on a window at around 12 45 a m then to toss three " + "gasoline bombs inside the pub on the crumlin road an especially " + "polarized part of north belfast where catholic protestant tensions " + "have repeatedly flared " + "no group claimed responsibility for the attack on the thirty two " + "degrees north pub a 
catholic frequented bar across the street from a " + "hard line protestant district but catholic leaders blamed the largest " + "illegal protestant group the ulster defense association " + "firefighters quickly doused the flames caused by the gasoline " + "bombs the forklift remained wedged into the pub friday afternoon as " + "engineers and architects discussed whether the newly refurbished pub " + "would have to be partly demolished " + "the uda is supposed to be observing a cease fire in support of " + "northern ireland s 1998 peace accord but britain no longer recognizes " + "the validity of the uda truce because the anti catholic group has " + "violated it so often " + "the crumlin road area of north belfast has suffered some of northern " + "ireland s most graphic sectarian trouble in recent years while both " + "sides complain of suffering harassment and stone throwing protestants " + "in particular accuse the expanding catholic community of seeking to " + "force them from the area a charge the catholics deny. 
" + "protestant mobs in 2001 and 2002 blocked catholics from taking their " + "children to the local catholic elementary school which is in the " + "predominantly protestant part of the area " + "on july 12 hundreds of catholics from the area s ardoyne district " + "swarmed over police and british soldiers protecting a protestant " + "parade that had just passed down crumlin road dozens were wounded " + "demographic tensions lie at the heart of the northern ireland " + "conflict which was founded 84 years ago as a british territory with a " + "70 percent protestant majority the most recent census in 2001 put the " + "sectarian split at nearer 55 percent protestant and 45 percent " + "catholic and confirmed that belfast now has a catholic majority"; + + NGram tokenized_text(text); + + TimeStamp t; + double t0,t1; + unsigned int count=1000; + + std::cout << "Number of iterations: " << count << std::endl; + std::cout << "Input string length: " << text.length() << std::endl; + std::cout << "Number of input tokens: " << tokenized_text.length() << std::endl; + std::cout << std::endl; + + t0=t.elapsed(); + for(unsigned int i=0; i<count; ++i){ + v.vectorize(tokenized_text,tv); + } + t1=t.elapsed()-t0; + std::cout << "Vectorizer performance: \t" << t1 << " sec" << "\t\t" + << count/t1 << " document/sec" << std::endl; + for(unsigned int i=0; i<tv.size(); i++){ + std::cout << tv[i].term() << ", " << tv[i].weight() << std::endl; + } + + return 0; +} diff --git a/fsa/src/alltest/vectorizer_test.cpp b/fsa/src/alltest/vectorizer_test.cpp new file mode 100644 index 00000000000..e3bcf236455 --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.cpp @@ -0,0 +1,40 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizer_test.cpp + * @brief Test for the vectorizer class + * + */ + +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/vectorizer.h> + +using namespace fsa; + +int main(int argc, char **argv) /* Reads text lines from standard input, vectorizes each one against the dictionary FSA (argv[1] if given, otherwise __testfsa__.__fsa__) and prints every resulting term and weight, one per line. Always returns 0. */ +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Vectorizer v(dict); + Vectorizer::TermVector tv; + + std::string text; + NGram tokenized_text; + + while(!std::cin.eof()){ /* NOTE(review): eof() only becomes true after a read has hit end-of-file, so when the input ends with a newline the body presumably runs one extra time with an empty line - confirm this is harmless */ + getline(std::cin,text); + + tokenized_text.set(text); + v.vectorize(tokenized_text,tv); + + for(unsigned int i=0; i<tv.size(); i++){ + std::cout << tv[i].term() << ", " << tv[i].weight() << std::endl; + } + } + + return 0; +} diff --git a/fsa/src/alltest/vectorizer_test.out b/fsa/src/alltest/vectorizer_test.out new file mode 100644 index 00000000000..aa30421a2bf --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.out @@ -0,0 +1,26 @@ +apple, 0 +apricot, 0 +artichoke, 0 +banana, 0 +cabbage, 0 +carrot, 0 +cherry, 0 +chili, 0 +cucumber, 0 +eggplant, 0 +grapes, 0 +lettuce, 0 +onion, 0 +paprika, 0 +passion fruit, 0 +pea, 0 +peach, 0 +pear, 0 +pineapple, 0 +plum, 0 +potato, 0 +pumpkin, 0 +cherry, 0 +sour cherry, 0 +squash, 0 +tomato, 0 diff --git a/fsa/src/alltest/vectorizer_test.sh b/fsa/src/alltest/vectorizer_test.sh new file mode 100755 index 00000000000..03d794fc6e8 --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_vectorizer_test_app < testinput.txt > vectorizer_test.output +diff vectorizer_test.output vectorizer_test.out diff --git a/fsa/src/apps/.gitignore b/fsa/src/apps/.gitignore new file mode 100644 index 00000000000..85c014ca23b --- /dev/null +++ b/fsa/src/apps/.gitignore @@ -0,0 +1,3 @@ +Makefile +.depend +vespa-*-* diff --git a/fsa/src/apps/fsadump/.gitignore b/fsa/src/apps/fsadump/.gitignore new file mode 100644 index 00000000000..36c86d6022c --- /dev/null +++ b/fsa/src/apps/fsadump/.gitignore @@ -0,0 +1 @@ +fsadump diff --git 
a/fsa/src/apps/fsadump/CMakeLists.txt b/fsa/src/apps/fsadump/CMakeLists.txt new file mode 100644 index 00000000000..069bdfb379b --- /dev/null +++ b/fsa/src/apps/fsadump/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_fsadump_app + SOURCES + fsadump.cpp + OUTPUT_NAME fsadump + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/fsadump/fsadump.cpp b/fsa/src/apps/fsadump/fsadump.cpp new file mode 100644 index 00000000000..a713b5dd30f --- /dev/null +++ b/fsa/src/apps/fsadump/fsadump.cpp @@ -0,0 +1,186 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/base64.h> +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> + +using namespace fsa; + +enum FSA_Input_Format { + OUTPUT_UNDEF, + OUTPUT_TEXT, + OUTPUT_TEXT_EMPTY, + OUTPUT_TEXT_NUM, + OUTPUT_BINARY, + OUTPUT_BINARY_RAW, + OUTPUT_PHASH, + OUTPUT_DOT +}; + +void error(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } +} + +void usage(const char *name, const char *errormsg = NULL) +{ + error(name,errormsg); + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] fsafile\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -b use binary output format with Base64 encoded info\n"); + fprintf(stderr," -B use binary output format with raw info\n"); + fprintf(stderr," -e use text output format with no info (default)\n"); + fprintf(stderr," -n use text output format with (unsigned) numerical info\n"); + fprintf(stderr," -t use text input format\n"); + fprintf(stderr," -p use perfect hash value instead of meta info 
(text output)\n"); + fprintf(stderr," -d output dot format\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); +} + +void version() +{ + std::cout << "fsadump " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." << FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + +int main(int argc, char** argv) +{ + FSA_Input_Format format = OUTPUT_UNDEF; + const char *input_file; + + char opt; + extern int optind; + + while((opt=getopt(argc,argv,"ebBhntpdV")) != -1){ + switch(opt){ + case 'b': + format = OUTPUT_BINARY; + break; + case 'B': + format = OUTPUT_BINARY_RAW; + break; + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case 't': + format = OUTPUT_TEXT; + break; + case 'n': + format = OUTPUT_TEXT_NUM; + break; + case 'e': + format = OUTPUT_TEXT_EMPTY; + break; + case 'p': + format = OUTPUT_PHASH; + break; + case 'd': + format = OUTPUT_DOT; + break; + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(optind!=argc-1){ + usage(argv[0],"required parameter(s) missing"); + exit(1); + } + + if(format==OUTPUT_UNDEF) // use default format (warning?) 
+ format=OUTPUT_TEXT_EMPTY; + + input_file = argv[optind]; + + FSA fsa(input_file); + + if(!fsa.isOk()){ + std::cerr << "Failed to open fsa file (" << input_file << ")" << std::endl; + exit(1); + } + + std::string meta,temp; + uint32_t num_meta; + uint32_t lines=0; + + if(format!=OUTPUT_DOT){ + + for(FSA::iterator it(fsa); it!=fsa.end(); ++it){ + + switch(format){ + case OUTPUT_BINARY: + temp.assign((const char *)(it->data()),it->dataSize()); + Base64::encode(temp,meta); + std::cout << it->str() << '\0' << meta << '\0'; + break; + case OUTPUT_BINARY_RAW: + meta.assign((const char *)(it->data()),it->dataSize()); + std::cout << it->str() << '\0' << meta << '\0'; + break; + case OUTPUT_TEXT: + meta.assign((const char *)(it->data()),it->dataSize()); + if(meta.size()>0 && meta[meta.size()-1]==0){ + meta.resize(meta.size()-1); + } + std::cout << it->str() << '\t' << meta << '\n'; + break; + case OUTPUT_TEXT_NUM: + switch(it->dataSize()){ + case 1: + num_meta = *((const uint8_t*)it->data()); + break; + case 2: + case 3: + num_meta = *((const uint16_t*)it->data()); + break; + case 4: + default: + num_meta = *((const uint32_t*)it->data()); + break; + } + std::cout << it->str() << '\t' << num_meta << '\n'; + break; + case OUTPUT_PHASH: + std::cout << it->str() << '\t' << lines << '\n'; + break; + case OUTPUT_TEXT_EMPTY: + std::cout << it->str() << '\n'; + break; + default: + assert(0); + break; + } + + ++lines; + } + } + + else { + fsa.printDot(); + } + + return 0; +} diff --git a/fsa/src/apps/fsainfo/.gitignore b/fsa/src/apps/fsainfo/.gitignore new file mode 100644 index 00000000000..fc50ebfe566 --- /dev/null +++ b/fsa/src/apps/fsainfo/.gitignore @@ -0,0 +1 @@ +fsainfo diff --git a/fsa/src/apps/fsainfo/CMakeLists.txt b/fsa/src/apps/fsainfo/CMakeLists.txt new file mode 100644 index 00000000000..c16332ed20b --- /dev/null +++ b/fsa/src/apps/fsainfo/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +vespa_add_executable(fsa_fsainfo_app + SOURCES + fsainfo.cpp + OUTPUT_NAME fsainfo + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/fsainfo/fsainfo.cpp b/fsa/src/apps/fsainfo/fsainfo.cpp new file mode 100644 index 00000000000..efbe6075331 --- /dev/null +++ b/fsa/src/apps/fsainfo/fsainfo.cpp @@ -0,0 +1,124 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/fsa.h> + +using namespace fsa; + +void usage(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] fsa\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); +} + +void version() +{ + std::cout << "fsainfo " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." 
<< FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + +int main(int argc, char** argv) /* Prints the header fields of the fsa file given as the single positional argument to standard output and then tries to load it with the FSA class. Returns 1 if the file cannot be opened and 0 otherwise (also for unrecognized or obsolete files); exits with status 1 on a usage error. */ +{ + const char *fsa_file; + + int opt; /* getopt() returns int; stored in a char it can never compare equal to -1 where char is unsigned, and the loop below would not terminate */ + extern int optind; + + while((opt=getopt(argc,argv,"hV")) != -1){ + switch(opt){ + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(optind!=argc-1){ + usage(argv[0],"required parameter fsa is missing"); + exit(1); + } + + fsa_file = argv[optind]; + + + + FSA::Header header; + + ssize_t r; /* read() returns ssize_t; -1 reports an error and must not be mistaken for a large byte count */ + + int fd = ::open(fsa_file,O_RDONLY); + if(fd<0){ + std::cerr << "Failed to open fsa file (" << fsa_file << ")" << std::endl; + return 1; + } + else{ + r=::read(fd,&header,sizeof(header)); + ::close(fd); + if(r<0 || static_cast<size_t>(r)<sizeof(header) || header._magic!=FSA::MAGIC){ + std::cout << "Unrecognized file format (" << fsa_file << ")\n"; + } + else if(header._version<1000){ + std::cout << "Obsolete fsa file (" << fsa_file << ")\n"; + } + else { + std::cout << "Information about " << fsa_file << ":\n"; + std::cout << " Header size: " << sizeof(header) << " bytes" <<std::endl; + std::cout << " Magic: " << header._magic << std::endl; + std::cout << " Version: " << header._version/1000000 << "." + << (header._version%1000000)/1000 << "." + << header._version%1000 << std::endl; + std::cout << " Serial number: " << header._serial << std::endl; + std::cout << " Checksum: " << header._checksum << std::endl; + std::cout << " FSA size: " << header._size << " cells" <<std::endl; + std::cout << " " << header._size*(sizeof(unsigned char)+sizeof(unsigned int)) + << " bytes" <<std::endl; + std::cout << " Start state: " << header._start << std::endl; + std::cout << " Data size: " << header._data_size << " bytes" << std::endl; + std::cout << " Data item type: " << (header._data_type==FSA::DATA_FIXED? 
+ "fixed size":"variable size") << std::endl; + if(header._data_type==FSA::DATA_FIXED) + std::cout << " Fixed item size: " << header._fixed_data_size << std::endl; + std::cout << " Perfect hash: " << (header._has_perfect_hash? + "yes":"no") << std::endl; + if(header._has_perfect_hash) + std::cout << " Perfect hash size: " << header._size*sizeof(unsigned int) << " bytes" << std::endl; + std::cout << " Total size: " + << (header._size*(sizeof(unsigned char)+ + sizeof(unsigned int)*(header._has_perfect_hash?2:1)) + + header._data_size + + sizeof(header)) + << " bytes" << std::endl; + std::cout << " Trying to load FSA ... " << std::flush; + + FSA fsa(fsa_file); + std::cout << (fsa.version()==header._version ? "succeeded.":"failed.") << std::endl; + } + } + + return 0; +} diff --git a/fsa/src/apps/makefsa/.gitignore b/fsa/src/apps/makefsa/.gitignore new file mode 100644 index 00000000000..1ea7393bec3 --- /dev/null +++ b/fsa/src/apps/makefsa/.gitignore @@ -0,0 +1 @@ +makefsa diff --git a/fsa/src/apps/makefsa/CMakeLists.txt b/fsa/src/apps/makefsa/CMakeLists.txt new file mode 100644 index 00000000000..80002338479 --- /dev/null +++ b/fsa/src/apps/makefsa/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_makefsa_app + SOURCES + makefsa.cpp + OUTPUT_NAME makefsa + INSTALL bin + DEPENDS + fsa +) diff --git a/fsa/src/apps/makefsa/makefsa.cpp b/fsa/src/apps/makefsa/makefsa.cpp new file mode 100644 index 00000000000..b27485a851e --- /dev/null +++ b/fsa/src/apps/makefsa/makefsa.cpp @@ -0,0 +1,295 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <inttypes.h> + +#include <iostream> +#include <fstream> + +#include <vespa/fsa/base64.h> +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> + +using namespace fsa; + +enum FSA_Input_Format { + INPUT_UNDEF, + INPUT_TEXT, + INPUT_TEXT_EMPTY, + INPUT_TEXT_NUM, + INPUT_BINARY, + INPUT_BINARY_RAW }; + +void usage(const char *name, const char *errormsg = NULL) +{ + if(errormsg!=NULL){ + fprintf(stderr,"%s: %s\n",name,errormsg); + } + fprintf(stderr,"usage:\n"); + fprintf(stderr," %s [OPTIONS] [input_file] output_file\n",name); + fprintf(stderr,"\n"); + fprintf(stderr," Valid options are:\n"); + fprintf(stderr," -h display this help\n"); + fprintf(stderr," -b use binary input format with Base64 encoded info\n"); + fprintf(stderr," -B use binary input format with raw\n"); + fprintf(stderr," -e use text input format with no info (default)\n"); + fprintf(stderr," -n use text input format with (unsigned) numerical info\n"); + fprintf(stderr," -s bytes data size for numerical info: 1,2 or 4(default)\n"); + fprintf(stderr," -z bytes data size for binary info (-B) (0 means NUL terminated)\n"); + fprintf(stderr," -t use text input format\n"); + fprintf(stderr," -p build automaton with perfect hash\n"); + fprintf(stderr," -i ignore info string, regardless of input format\n"); + fprintf(stderr," -S serial serial number\n"); + fprintf(stderr," -v be verbose\n"); + fprintf(stderr," -V display version number\n"); + fprintf(stderr,"\n"); + fprintf(stderr," If input_file is not specified, standard input is used.\n"); +} + +void version() +{ + std::cout << "makefsa " + << FSA::VER/1000000 << "." << (FSA::VER/1000)%1000 << "." << FSA::VER%1000; + if(FSA::VER != FSA::libVER()){ + std::cout << " (library " + << FSA::libVER()/1000000 << "." << (FSA::libVER()/1000)%1000 << "." 
<< FSA::libVER()%1000 + << ")"; + } + std::cout << std::endl; +} + + +int main(int argc, char** argv) /* Builds an automaton from the sorted entries read from input_file (or standard input when only output_file is given) and writes it to output_file. Unsorted, duplicate and meta-less lines are skipped with a warning. Returns 0 on success and 1 when the input cannot be opened or the fsa cannot be written; exits with status 1 on a usage error. */ +{ + FSA_Input_Format format = INPUT_UNDEF; + unsigned int num_size = 4; + unsigned int info_size_binary = 0; + bool build_phash = false; + const char *input_file; + const char *output_file; + uint32_t serial = 0; + bool ignore_info = false; + bool verbose = false; + unsigned int lines=0,count = 0; + + int opt; /* getopt() returns int; stored in a char it can never compare equal to -1 where char is unsigned, and the loop below would not terminate */ + extern char *optarg; + extern int optind; + + while((opt=getopt(argc,argv,"ebBhns:z:tpS:ivV")) != -1){ + switch(opt){ + case 'b': + format = INPUT_BINARY; + break; + case 'B': + format = INPUT_BINARY_RAW; + break; + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(); + exit(0); + case 't': + format = INPUT_TEXT; + break; + case 'n': + format = INPUT_TEXT_NUM; + break; + case 's': + num_size = strtoul(optarg,NULL,0); + if(num_size!=1 && num_size!=2 && num_size!=4){ + usage(argv[0],"invalid numerical info size (-s)"); + exit(1); + } + break; + case 'z': + info_size_binary = strtoul(optarg,NULL,0); + break; + case 'S': + serial = strtoul(optarg,NULL,0); + break; + case 'e': + format = INPUT_TEXT_EMPTY; + break; + case 'p': + build_phash = true; + break; + case 'i': + ignore_info = true; + break; + case 'v': + verbose = true; + break; + case '?': + usage(argv[0],"unrecognized option"); + exit(1); + } + } + + if(format==INPUT_UNDEF) // use default format (warning?) 
+ format=INPUT_TEXT_EMPTY; + + if(optind+2==argc){ + input_file = argv[optind]; + output_file = argv[optind+1]; + } + else if(optind+1==argc){ + input_file = NULL; + output_file = argv[optind]; + } + else{ + usage(argv[0],"required parameter(s) missing"); + exit(1); + } + + Automaton automaton; + + std::string input,last_input,meta,temp; + union{ + uint8_t u1; + uint16_t u2; + uint32_t u4; + } num_meta; + std::ifstream infile; + std::istream *in; + std::string binary_info(info_size_binary,'\0'); /* buffer for fixed-size raw info; replaces a variable-length array, which is not standard C++ and was sized from the -z argument */ + size_t split; + bool empty_meta_str = false; + + if(verbose) version(); + + if(verbose) std::cerr << "Initializing automaton ..."; + automaton.init(); + if(verbose) std::cerr << " done." << std::endl; + + if(input_file!=NULL){ + infile.open(input_file); + if (infile.fail()) { + std::cerr << "Error: Could not open file \"" << input_file << "\"\n"; + return(1); + } + in=&infile; + } + else{ + in=&std::cin; + } + if(verbose) std::cerr << "Inserting lines ..."; + while(!in->eof()){ + switch(format){ + case INPUT_BINARY: + getline(*in,input,'\0'); + getline(*in,temp,'\0'); + Base64::decode(temp,meta); + break; + case INPUT_BINARY_RAW: + getline(*in,input,'\0'); + if (info_size_binary) { + in->read(&binary_info[0], info_size_binary); + meta.assign(binary_info.data(), info_size_binary); + } + else + getline(*in,meta,'\0'); + break; + case INPUT_TEXT: + getline(*in,temp,'\n'); + split = temp.find_first_of('\t'); + input = temp.substr(0, split); + if (split == std::string::npos) { + empty_meta_str = true; + break; + } + meta = temp.substr(split + 1); + meta+='\0'; + break; + case INPUT_TEXT_NUM: + getline(*in,temp,'\n'); + split = temp.find_first_of('\t'); + input = temp.substr(0, split); + if (split == std::string::npos) { + empty_meta_str = true; + break; + } + temp = temp.substr(split + 1); + switch(num_size){ + case 1: + num_meta.u1=strtoul(temp.c_str(),NULL,0); + meta.assign((const char*)&num_meta,1); + break; + case 2: + num_meta.u2=strtoul(temp.c_str(),NULL,0); + meta.assign((const 
char*)&num_meta,2); + break; + case 4: + default: + num_meta.u4=strtoul(temp.c_str(),NULL,0); + meta.assign((const char*)&num_meta,4); + break; + } + break; + case INPUT_TEXT_EMPTY: + getline(*in,input,'\n'); + break; + case INPUT_UNDEF: + assert(0); + break; + } + + ++lines; + + if(input.length()>0){ + if(last_input>input){ + std::cerr << "warning: ignoring unsorted line " << lines << ", \"" << input << "\"\n"; + } + else if(last_input==input){ + std::cerr << "warning: ignoring duplicate line " << lines << ", \"" << input << "\"\n"; + } + else if(empty_meta_str) { + std::cerr << "warning: ignoring line " << lines << ", \"" << input << "\" with missing meta info\n"; + } + else{ + if(format==INPUT_TEXT_EMPTY || ignore_info){ + automaton.insertSortedString(input); + } + else{ + automaton.insertSortedString(input,meta); + } + last_input=input; /* remember only lines that were actually inserted, so that every later line is compared with the last key in the automaton and not with an ignored line */ + if(verbose){ + ++count; + if(count%1000==0) + std::cerr << "\rInserting lines ... (inserted " << count << " lines)"; + } + } + } + empty_meta_str = false; + } + if(verbose) std::cerr << "\rInserting lines ... (inserted " << count << "/" << (lines-1) << " lines) ... done.\n"; + if(input_file!=NULL){ + infile.close(); + } + + + if(verbose) std::cerr << "Finalizing ..."; + automaton.finalize(); + if(verbose) std::cerr << " done." << std::endl; + + + if(build_phash){ + if(verbose) std::cerr << "Adding perfect hash ..."; + automaton.addPerfectHash(); + if(verbose) std::cerr << " done." << std::endl; + } + + + if(verbose) std::cerr << "Writing fsa file ..."; + if (!automaton.write(output_file,serial)) { + std::cerr << "Failed to write fsa file '" << std::string(output_file) << "'. Please check write permissions" << std::endl; + return 1; + } + if(verbose) std::cerr << " done." 
<< std::endl; + + + return 0; +} diff --git a/fsa/src/libfsa/.gitignore b/fsa/src/libfsa/.gitignore new file mode 100644 index 00000000000..9fb98574200 --- /dev/null +++ b/fsa/src/libfsa/.gitignore @@ -0,0 +1,6 @@ +*.la +*.lo +.deps +.libs +Makefile +Makefile.in diff --git a/fsa/src/libfsa/automaton-alternate.h b/fsa/src/libfsa/automaton-alternate.h new file mode 100644 index 00000000000..20cc8f933eb --- /dev/null +++ b/fsa/src/libfsa/automaton-alternate.h @@ -0,0 +1,998 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @file automaton.h + * @brief Definition of the classes used for %FSA (%Finite %State %Automaton) construction + * + */ + +#pragma once + +#include <map> +#include <list> +#include <string> +#include <vector> +#include <assert.h> +#include <sys/mman.h> // for mmap() etc + +#include "blob.h" +#include "fsa.h" + +namespace fsa { + + +// {{{ Automaton +/** + * @class Automaton + * @brief %FSA (%Finite %State %Automaton) construction class. + * + * The Automaton class provides the methods and data structures needed + * for constructing a %Finite %State %Automaton from input strings. (The + * current implementation requires the input to be sorted, this + * requirement may be relaxed in future releases.) + * + * The constructed %FSA, when stored in a compact representation, can + * be used for lookups, etc. via the FSA class. The compact %FSA can + * not be modified anymore. + */ +class Automaton { + +public: + /** + * Empty data item for final states without assigned data. Contains + * a zero terminated empty string. + */ + static const Blob EMPTY_BLOB; + +private: + + class State; + + // {{{ Automaton::Transition + /** + * @struct Transition + * @brief Struct for storing a single transition. + * + * A transition consists of an input symbol and a new state. + */ + struct Transition { + symbol_t _symbol; /**< Input symbol. 
*/ + State *_state; /**< New state. */ + }; + // }}} + + // {{{ Automaton::TransitionList + /** + * @class TransitionList + * @brief Class representing all transitions from a state. + * + * This class is used for the internal representation of the + * automaton. A state can be represented by the list of all + * possible transitions from that state. Two states are + * equivalent, if both are final (with the same meta info) or both + * are not final, and their transition list matches, that is they + * have the same number of out-transitions, these correspond to the + * same set of input symbols, and for each of these symbols the new + * states are equal. In the internal representation, final states + * are implemented by means of a special transition, so transition + * list equivalence implies state equivalence. + */ + class TransitionList { + + friend class State; + + private: + Transition* _trans; /**< Transition array. */ + unsigned int _size; /**< Used size. */ + + public: + /** + * @brief Constructor. + * + * Default constructor, creates an empty transition list. + */ + TransitionList() : _trans(NULL), _size(0) {}; + + /** + * @brief Destructor. + */ + ~TransitionList() + { if(_trans!=NULL) free(_trans); } + + /** + * @brief Copy constructor. + * + * @param tl Reference to transition list object. + * @note No copy-assignment operator is declared for this class, so assigning one list to another would leave both sharing a single _trans array. */ + TransitionList(const TransitionList& tl) : _trans(NULL), _size(tl._size) + { + if(_size>0){ + _trans = (Transition*)malloc(_size*sizeof(Transition)); + assert(_trans!=NULL); + memcpy(_trans, tl._trans, sizeof(_trans[0]) * _size); /* copy only when there is something to copy: memcpy must not be called with null pointers, even for a zero length */ + } + } + + + /** + * @brief Less-than operator. 
+ * + * t1<t2 (or t1.operator<(t2)) is true iff + * - t1 has fewer transitions than t2, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has a lower symbol for t1, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has the same symbol but a lower new state for t1 + * + * @param tl Reference to transition list object. + * @return True iff t1<t2. + */ + bool operator<(const TransitionList& tl) const; + + /** + * @brief Greater-than operator. + * + * t1>t2 (or t1.operator>(t2)) is true iff + * - t1 has more transitions than t2, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has a higher symbol for t1, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has the same symbol but a higher new state for t1 + * + * @param tl Reference to transition list object. + * @return True iff t1>t2. + */ + bool operator>(const TransitionList& tl) const; + + /** + * @brief Equals operator. + * + * t1==t2 (or t1.operator==(t2)) is true iff + * - t1 and t2 have the same number of transitions, which have + * the same set of symbols and for each symbol the new + * states are equal + * + * @param tl Reference to transition list object. + * @return True iff t1==t2. + */ + bool operator==(const TransitionList& tl) const; + + /** + * @brief Check for emptiness. + * + * @return True iff the transition list is empty. + */ + bool empty() const { return (_size==0); } + + /** + * @brief Get transition list size. + * + * @return Size of the transition list (number of transitions, or 0 if empty). + */ + unsigned int size() const { return _size; } + + /** + * @brief Index operator. 
+ * + * Returns a reference to the ith transition on the list. i must + * be between 0 and size-1 (0<=i<=size-1). + * + * @param i Index of transition. + * @return Reference to the ith transition. + */ + const Transition& operator[](unsigned int i) const { return _trans[i]; } + + /** + * @brief Get the last transition. + * + * Returns a pointer to the last transition, or NULL pointer if + * the list is empty. + * + * @return Pointer to last transition, or NULL. + */ + Transition* last() + { if(_size>0) return &_trans[_size-1]; + return NULL; + } + + /** + * @brief Get the transition corresponding to a symbol. + * + * Returns a pointer to the transition corresponding to a given + * symbol, or NULL pointer if the symbol is not found on the list + * (a transition with that symbol does not exist). + * + * @param sy Input symbol. + * @return Pointer to the matching transition, or NULL. + */ + Transition* find(symbol_t sy) + { for(unsigned int i=0; i<_size; i++){ + if(_trans[i]._symbol == sy) return &_trans[i]; + } + return NULL; + } + + /** + * @brief Append a new transition to the list. + * + * Appends a new transition to the end of the list. The allocated + * size is increased if necessary. If a transition with the same + * symbol already exists, the behaviour is undefined. + * + * @param sy Input symbol. + * @param st Pointer to new state. + */ + void append(symbol_t sy, State* st) + { + if(_size==0){ + _trans = (Transition*)malloc(sizeof(Transition)); + } + else{ + _trans = (Transition*)realloc(_trans,(_size+1)*sizeof(Transition)); + } + assert(_trans!=NULL); + _trans[_size]._symbol=sy; + _trans[_size]._state=st; + _size++; + } + + }; + + // }}} + // {{{ Automaton::State + /** + * @class State + * @brief Class representing a state of the automaton. + * + * The representation of the automaton states consists of a + * transition list for the state, and meta info blob (the latter + * only used for special states reached by a final transition). 
A + * final transition is a transition from a final (accepting) state + * with the reserved FINAL_SYMBOL (0xff) to a special state, which + * stores the meta info corresponding to the final state. For each + * unique meta info blob, there is one special state. + */ + class State { + + private: + + TransitionList _tlist; /**< Transition list. */ + const Blob *_blob; /**< Meta info blob. */ + + public: + + /** + * @brief Constructor. + * + * Default constructor, creates a state with an empty transition + * list and no (NULL) blob. + */ + State() : _tlist(), _blob(NULL) {} + + /** + * @brief Constructor. + * + * Creates a (special) state with an empty transition list and a + * given blob. + * + * @param b Pointer to blob. + */ + State(const Blob* b) : _tlist(), _blob(b) {} + + /** + * @brief Destructor. + */ + ~State() { if(_blob!=NULL) delete _blob; } + + /** + * @brief Check if the state is final (accepting) state. + * + * @return True if the state is final. + */ + bool isFinal() { return child(FSA::FINAL_SYMBOL)!=NULL; } + + /** + * @brief Get the blob assigned to the state. + * + * @return Pointer to blob. + */ + const Blob* getBlob() const { return _blob; } + + /** + * @brief Check if the state has children. + * + * Returns true if the state has children (the transition list is + * not empty), or false if the state is a leaf. + * + * @return True if the state has children. + */ + bool hasChildren() { return !_tlist.empty(); } + + /** + * @brief Get child corresponding to a symbol. + * + * Get the child of the state which is reached by a transition + * with a given symbol. If there is no out-transition with that + * symbol, NULL is returned. + * + * @return Pointer to the child, or NULL. + */ + State* child(symbol_t sy) + { Transition* t = _tlist.find(sy); + if(t!=NULL){ return t->_state; } + return NULL; + } + + /** + * @brief Get the last child. + * + * Get the last child of the state which is reached by a valid + * transition (not FINAL_SYMBOL). 
If no such children exists, NULL + * is returned. + * + * @return Pointer to last child, or NULL. + */ + State* lastChild() + { Transition* t = _tlist.last(); + if(t!=NULL && t->_symbol!=FSA::FINAL_SYMBOL){ return t->_state; } + return NULL; + } + + /** + * @brief Update the last child. + * + * Updates the last child to point to a new state. This method is + * used when merging equivalent subtrees together. + * + * @param st New state to be used in last child. + */ + void updateLastChild(State* st) + { Transition* t = _tlist.last(); + if(t!=NULL){ + t->_state = st; + } + } + + /** + * @brief Append a new empty child. + * + * Append an empty child to the list of transitions using the + * given symbol (and optional blob). + * + * @param sy New transition symbol. + * @param b Optional blob to be assigned to the new state, defaults to NULL. + * @return Pointer to the new state. + */ + State* addEmptyChild(symbol_t sy, const Blob *b=NULL) + { + State* child = new State(b); + assert(child!=NULL); + _tlist.append(sy,child); + return child; + } + + /** + * @brief Add a transition to an existing state. + * + * Append a new transition to the list pointing to an existing + * state, using the given symbol. + * + * @param sy New transition symbol. + * @param child Pointer to destination state (already existing). + * @return Pointer to the child state. + */ + State* addChild(symbol_t sy, State* child) + { + _tlist.append(sy,child); + return child; + } + + /** + * @brief Get the transition list. + * + * Get the transition list of the state. + * + * @return Reference to the transition list. + */ + const TransitionList& getTransitionList(void) const { return _tlist; } + + + }; + + // }}} + // {{{ Automaton::TListPtrLess + /** + * @class TListPtrLess + * @brief Less-than functor for use with ordered STL containers. + * + * The function compares two TransitionList pointers by comparing + * the objects they point to. 
+ */ + struct TListPtrLess { + inline bool operator()(const TransitionList * const & x, const TransitionList * const & y) const { return *x < *y; } + }; + // }}} + // {{{ Special allocator for Register that will make it possible to completely reclaim its memory when we are done with it + template <typename _Tp> + class MMapArenaAllocator { + std::vector<_Tp*> _chunks; + size_t _size; // used # of objects in current chunk + static const size_t _CAPACITY = 16 * 1024 * 1024; // capacity of chunk in bytes + public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template<typename _Tp1> + struct rebind + { typedef MMapArenaAllocator<_Tp1> other; }; + + MMapArenaAllocator() throw(): _chunks(), _size(0) { } + + MMapArenaAllocator(const MMapArenaAllocator&) throw(): _chunks(), _size(0) { } + + template<typename _Tp1> + MMapArenaAllocator(const MMapArenaAllocator<_Tp1>&) throw(): _chunks(), _size(0) { } + + ~MMapArenaAllocator() throw() { release(); } + + pointer + address(reference __x) const { return &__x; } + + const_pointer + address(const_reference __x) const { return &__x; } + + // NB: __n is permitted to be 0. The C++ standard says nothing + // about what the return value is when __n == 0. + pointer + allocate(size_type __n, const void* = 0) + { + pointer __ret; + if(__n) { + size_type __b = __n * sizeof(_Tp); + if(_chunks.size()==0 || _CAPACITY - (_size*sizeof(_Tp)) < __b) { // need new chunk + __ret = static_cast<_Tp*>(::mmap(0, _CAPACITY, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, /*fd=*/0, /*offset=*/0)); + if(__ret == MAP_FAILED) + throw std::bad_alloc(); + _chunks.push_back(__ret); + _size = __n; + } + else { // fits in current chunk + __ret = (*(_chunks.end()-1)) + _size; + _size += __n; + } + } + return __ret; + } + + // __p is not permitted to be a null pointer. 
+ void + deallocate(pointer, size_type) + { } + + void release(void) + { + for(size_t i = 0; i < _chunks.size(); i++){ + ::munmap(_chunks[i], _CAPACITY); + } + _chunks.clear(); + _size = 0; + } + + size_type + max_size() const throw() + { return _CAPACITY / sizeof(_Tp); } + + void + construct(pointer __p, const _Tp& __val) + { ::new(__p) value_type(__val); } + + void + destroy(pointer __p) { __p->~_Tp(); } + }; + // }}} + // {{{ Automaton::Register, BlobRegister, StateArray, StateCellArray, PackMap, SymList and iterators + + struct StateArrayLess { + bool operator()(State* const &x, State* const &y) + { return x < y; } + }; + struct StateCellArrayItem { + State *state; + uint32_t cell; + StateCellArrayItem(): state(NULL), cell(0) { } + StateCellArrayItem(State *s): state(s), cell(0) { } + }; + struct StateCellArrayLess { + bool operator()(const StateCellArrayItem &x, const StateCellArrayItem &y) + { return x.state < y.state; } + }; + + /** + * @brief Register of states, maps a transition list to a state object + */ + typedef std::map< const TransitionList*,State*,TListPtrLess,MMapArenaAllocator< std::pair< const TransitionList*, State* > > > Register; + /** + * @brief State register iterator. + */ + typedef std::map< const TransitionList*,State*,TListPtrLess,MMapArenaAllocator< std::pair< const TransitionList*, State* > > >::iterator RegisterIterator; + + /** + * @brief Register of states, maps a blob to a special state. + */ + typedef std::map< Blob,State* > BlobRegister; + /** + * @brief Blob register iterator. + */ + typedef std::map< Blob,State* >::iterator BlobRegisterIterator; + + /** + * @brief Array of state pointers. + */ + typedef std::vector< State* > StateArray; + /** + * @brief State* array iterator. + */ + typedef std::vector< State* >::iterator StateArrayIterator; + + /** + * @brief Array of state/cell pairs. + */ + typedef std::vector< StateCellArrayItem > StateCellArray; + /** + * @brief StateCell array iterator. 
*/
  typedef std::vector< StateCellArrayItem >::iterator StateCellArrayIterator;

  /**
   * @brief Packing map, maps a pointer (key type is const void*) to
   * an unsigned index.
   *
   * PackedAutomaton uses it as its blob map (_blob_map).
   */
  typedef std::map< const void*, unsigned int > PackMap;
  /**
   * @brief Packing map iterator.
   */
  typedef std::map< const void*, unsigned int >::iterator PackMapIterator;

  /**
   * @brief symbol_t list.
   */
  typedef std::list<symbol_t> SymList;
  /**
   * @brief symbol_t list iterator.
   */
  typedef std::list<symbol_t>::iterator SymListIterator;
  /**
   * @brief symbol_t list const_iterator.
   */
  typedef std::list<symbol_t>::const_iterator SymListConstIterator;
  // }}}

  // {{{ Automaton::PackedAutomaton

  /**
   * @class PackedAutomaton
   * @brief Helper class for packing an automaton.
   *
   * This class is used for packing an Automaton to a compressed
   * format which can be saved to file to be used by the FSA class.
   *
   * The object holds several raw arrays (see the members below);
   * according to the documentation of reset(), which the destructor
   * calls, they are freed there.
   */
  class PackedAutomaton {

  private:
    bool              _packable;          /**< Packable flag.                      */
    PackMap           _blob_map;          /**< Map blob pointers to indices.       */
    State           **_packed_ptr;        /**< Array for state pointers.           */
    state_t          *_packed_idx;        /**< Array for state indices.            */
    symbol_t         *_symbol;            /**< Array for transition symbols.       */
    bool             *_used;              /**< Array for cell used flags.          */
    hash_t           *_perf_hash;         /**< Array for perfect hash deltas.      */
    hash_t           *_totals;            /**< Array for perfect hash totals.      */
    uint32_t          _packed_size;       /**< Size of packed arrays (in cells).   */
    uint32_t          _last_packed;       /**< Index of last packed state.         */

    data_t           *_blob;              /**< Data storage.                       */
    uint32_t          _blob_size;         /**< Data storage size.                  */
    uint32_t          _blob_used;         /**< Used data storage size.             */
    uint32_t          _blob_type;         /**< Type of data items (fixed/var.)     */
    uint32_t          _fixed_blob_size;   /**< Data item size if fixed.            */

    state_t           _start_state;       /**< Index of start state.               */

    /**
     * @brief Number of cells to allocate in one expansion.
*/
    static const uint32_t _ALLOC_CELLS = 131072;  // 128k

    /**
     * @brief Number of bytes to allocate in one data storage expansion.
     */
    static const uint32_t _ALLOC_BLOB  = 65536;   // 64k

    /**
     * @brief How long back the search for an empty cell should start.
     */
    static const uint32_t _BACKCHECK   = 255;


    /**
     * @brief Expand cell arrays.
     */
    void expandCells();

    /**
     * @brief Expand data storage.
     *
     * @param minExpand Minimum size to expand, it will be rounded up
     *                  to the nearest multiple of _ALLOC_BLOB.
     */
    void expandBlob(uint32_t minExpand);

    /**
     * @brief Get an empty cell.
     *
     * Start looking for an empty cell _BACKCHECK cells before the
     * last packed cell, and return the index of the first empty cell
     * found. The cell arrays are expanded on demand, that is if no
     * empty cell is found.
     *
     * @return Index of empty cell.
     */
    uint32_t getEmptyCell();

    /**
     * @brief Get an empty cell where a list of transitions can be stored.
     *
     * Start looking for an empty cell _BACKCHECK cells before the
     * last packed cell. In addition to the cell being empty, it
     * should be possible to store a list of transitions from that
     * cell. The cell arrays are expanded on demand, that is if no
     * suitable cell is found.
     *
     * @param t List of transition symbols.
     * @return Index of empty cell.
     */
    uint32_t getCell(const SymList &t);

    /**
     * @brief Pack a data item.
     *
     * Pack a data item to the data storage. If the same (or
     * equivalent) data item has been packed before, return the offset
     * where it was packed. Otherwise, pack the data item at the end
     * of the storage (expand storage if needed), add the item and
     * offset to the blob map and return the offset.
     *
     * @param b Pointer to data item.
     * @return Offset to data item in data storage.
     */
    uint32_t packBlob(const Blob* b);

    /**
     * @brief Compute perfect hash deltas for a subtree.
*
     * Recursive function for computing the perfect hash deltas for
     * all transitions within a subtree. The delta for transition T
     * from state S is the number of final states reachable from state
     * S via transitions lower than T (that is, with a lower input
     * symbol). Also, state S being a final state counts. The hash
     * deltas are filled into the _perf_hash array.
     *
     * @param state The state whose subtree is processed (presumably
     *              its index in the packed arrays - TODO confirm
     *              against the implementation).
     * @return Number of final states within the subtree.
     */
    hash_t computePerfectHash(state_t state);


  public:

    /**
     * @brief Default constructor.
     *
     * All pointers start out as NULL and all sizes as 0; the object
     * is not packable until init() is called.
     */
    PackedAutomaton() :
      _packable(false),
      _blob_map(),
      _packed_ptr(NULL),
      _packed_idx(NULL),
      _symbol(NULL),
      _used(NULL),
      _perf_hash(NULL),
      _totals(NULL),
      _packed_size(0),
      _last_packed(0),
      _blob(NULL),
      _blob_size(0),
      _blob_used(0),
      _blob_type(0),
      _fixed_blob_size(0),
      _start_state(0)
    { }

    /**
     * @brief Destructor.
     *
     * Releases everything through reset().
     */
    ~PackedAutomaton() { reset(); }

    /**
     * @brief Reset the object.
     *
     * Reset the object and free all allocated memory.
     */
    void reset();

    /**
     * @brief Initialize.
     *
     * Reset the object, and initialize data structures, also
     * preallocate memory for cell and data storage.
     */
    void init();

    /**
     * @brief Pack a state.
     *
     * Pack a state and its transitions into the compact structure. For
     * final states, the data item is packed as well.
     *
     * @param it Iterator referring to the state/cell item to pack
     *           (passed by non-const reference; whether the item's
     *           cell is written back here is not visible from this
     *           declaration - TODO confirm).
     * @return False if the object is not packable (it has been
     *         finalized, or it has not been initialized)
     */
    bool packState(Automaton::StateCellArrayIterator &it);

    /**
     * @brief Set the cell of the start state.
     *
     * @param cell Cell of start state.
     */
    void setStartState(uint32_t cell) { _start_state = (state_t)cell; }

    /**
     * @brief Finalize the packed structure.
     *
     * Obtain all state indices from the state pointers using the
     * pack map. Also compact the data storage if all data items have
     * the same size (only store the size once, and store data items
     * consecutively, without size attribute).
     *
     * @param queue State queue.
     */
    void finalize(const StateCellArray &queue);

    /**
     * @brief Add perfect hash to the automaton.
     *
     * Computes the perfect hash for the whole automaton.
     */
    void addPerfectHash();

    /**
     * @brief Write the automaton to a file.
     *
     * @param filename Name of file.
     * @param serial Serial number (defaults to 0).
     * @return True on success.
     */
    bool write(const char *filename, uint32_t serial = 0);

    /**
     * @brief Read an automaton from file.
     *
     * @param filename Name of file.
     * @return True on success.
     */
    bool read(const char *filename);

    /**
     * @brief Perform a lookup in the packed automaton.
     *
     * @param input Input string
     * @return Pointer to data associated with input, or NULL if input is not accepted.
     */
    const unsigned char* lookup(const char *input) const;

    /**
     * @brief Create an FSA object from the automaton.
     *
     * Create an FSA object from the automaton. The PackedAutomaton is
     * implicitly reset if the operation succeeds. PackedAutomaton
     * cannot access the private constructor of FSA, so we have to pass
     * the object via a struct, which is ugly :-(.
     *
     * @param d Reference to the FSA::Descriptor (struct) to store necessary info for
     *          creating the FSA object.
     * @return True if the operation was successful.
     */
    bool getFSA(FSA::Descriptor &d);

  };

  // }}}


  Register        *_register;       /**< Register of states.              */
  BlobRegister     _blob_register;  /**< Register of data items.          */
  State*           _q0;             /**< Start state.                     */
  StateArray      *_queue;          /**< State queue.                     */
  bool             _finalized;      /**< Finalized flag.                  */
  PackedAutomaton  _packed;         /**< Packed automaton.                */

  /**
   * @brief Get last state in common path.
   *
   * Get the last state of the common path shared by the current input
   * string and strings already in the automaton. Also sets a pointer
   * to the suffix part of \a input which occurs after the last state.
   *
   * @param input Input string.
   * @param suffix Output parameter, set to point to the part of
   *               \a input which is not covered by the common path.
   * @return Pointer to last state in common path.
   */
  State* getCPLastState(const char *input, const char *&suffix);

  /**
   * @brief Replace or register a state.
   *
   * Replace the state with an already registered equivalent state in
   * the automaton, or register it if no such state exists yet.
   *
   * @param state Pointer to state to be replaced or registered.
   */
  void replaceOrRegister(State* state);

  /**
   * @brief Add new states for a suffix.
   *
   * Add the necessary new states for a suffix of an input string. The
   * suffix is that part of an input string which is not covered by
   * the common path.
   *
   * @param state Pointer to last state in the common path.
   * @param suffix Suffix.
   * @param b Data item associated with the input (defaults to NULL).
   */
  void addSuffix(State* state, const char *suffix, const Blob *b=NULL);

  /**
   * @brief Clean up data structures and release memory.
   */
  void cleanUp();

public:

  /**
   * @brief Default constructor.
   *
   * Leaves the register, start state and queue unset (NULL); see
   * init().
   */
  Automaton() :
    _register(NULL),
    _blob_register(),
    _q0(NULL),
    _queue(NULL),
    _finalized(false),
    _packed()
  { }

  /**
   * @brief Destructor.
   */
  ~Automaton();

  /**
   * @brief Initialize the object.
   */
  void init();

  /**
   * @brief Insert a string to the automaton.
   *
   * Insert a string to the automaton. Input strings must be inserted
   * in sorted order, otherwise the behaviour is undefined.
   *
   * @param input Input string.
   */
  void insertSortedString(const std::string &input);

  /**
   * @brief Insert a string to the automaton.
   *
   * Insert a string to the automaton. Input strings must be inserted
   * in sorted order, otherwise the behaviour is undefined.
   *
   * @param input Input string.
   * @param meta Meta info string (to be stored in data item).
*/
  void insertSortedString(const std::string &input, const std::string &meta);

  /**
   * @brief Insert a string to the automaton.
   *
   * Insert a string to the automaton. Input strings must be inserted
   * in sorted order, otherwise the behaviour is undefined.
   *
   * @param input Input string.
   * @param b Reference to data item.
   */
  void insertSortedString(const char *input, const Blob &b);

  /**
   * @brief Insert a string to the automaton.
   *
   * Insert a string to the automaton. Input strings must be inserted
   * in sorted order, otherwise the behaviour is undefined.
   *
   * @param input Input string.
   * @param b Pointer to data item (defaults to NULL).
   */
  void insertSortedString(const char *input, const Blob *b=NULL);

  /**
   * @brief Finalize the automaton.
   *
   * Finalize the automaton. This involves calling replaceOrRegister
   * for the start state _q0, and building the packed automaton, so no
   * strings can be added to the automaton after this method is
   * called.
   */
  void finalize();

  /**
   * @brief Add perfect hash to automaton.
   *
   * Compute and add perfect hash structure to the automaton. Only
   * works on finalized automata.
   */
  void addPerfectHash();

  /**
   * @brief Write the finalized automaton to file.
   *
   * @param file Name of the file.
   * @param serial Serial number (defaults to 0).
   * @return True on success.
   */
  bool write(const char *file, uint32_t serial = 0);

  /**
   * @brief Write the finalized automaton to file.
   *
   * Convenience overload which forwards to write(const char*, uint32_t).
   *
   * @param file Name of the file.
   * @param serial Serial number (defaults to 0).
   * @return True on success.
   */
  bool write(const std::string &file, uint32_t serial = 0)
  {
    return write(file.c_str(),serial);
  }

  /**
   * @brief Create an FSA object from the automaton.
   *
   * Create an FSA object from the automaton. The Automaton and
   * PackedAutomaton is implicitly reset.
   *
   * @return Pointer to a newly created FSA object. The caller is
   *         responsible for freeing it.
+ */ + FSA* getFSA(void); + +}; +// }}} + +template<typename _Tp> + inline bool + operator==(const Automaton::MMapArenaAllocator<_Tp>&, const Automaton::MMapArenaAllocator<_Tp>&) + { return true; } + +template<typename _Tp> + inline bool + operator!=(const Automaton::MMapArenaAllocator<_Tp>&, const Automaton::MMapArenaAllocator<_Tp>&) + { return false; } + +} // namespace fsa + diff --git a/fsa/src/libfsamanagers/.gitignore b/fsa/src/libfsamanagers/.gitignore new file mode 100644 index 00000000000..9fb98574200 --- /dev/null +++ b/fsa/src/libfsamanagers/.gitignore @@ -0,0 +1,6 @@ +*.la +*.lo +.deps +.libs +Makefile +Makefile.in diff --git a/fsa/src/main/java/com/yahoo/fsa/FSA.java b/fsa/src/main/java/com/yahoo/fsa/FSA.java new file mode 100644 index 00000000000..6e352f3ddca --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/FSA.java @@ -0,0 +1,636 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.CharBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; +import java.util.NoSuchElementException; + + +/** + * Finite-State Automaton. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class FSA { + + /** + * Thread local state object used to traverse a Finite-State Automaton. 
+ */ + public static class State { + + FSA fsa; + int state = 0; + int hash = 0; + + private State(FSA fsa) { + this.fsa = fsa; + start(); + } + + public void start(){ + state = fsa.start(); + hash = 0; + } + + public void delta(byte symbol) { + hash += fsa.hashDelta(state,symbol); + state = fsa.delta(state,symbol); + } + + /** Returns whether the given symbol would take us to a valid state, without changing the state */ + public boolean peekDelta(byte symbol) { + return fsa.delta(state,symbol)!=0; + } + + public boolean tryDelta(byte symbol) { + int lastHash=hash; + int lastState=state; + delta(symbol); + if (isValid()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + public void delta(char chr){ + CharBuffer chrbuf = CharBuffer.allocate(1); + chrbuf.put(0,chr); + ByteBuffer buf = fsa.encode(chrbuf); + while(state >0 && buf.position()<buf.limit()){ + delta(buf.get()); + } + } + + /** Jumps ahead by string */ + public void delta(String string){ + ByteBuffer buf = fsa.encode(string); + while(state >0 && buf.position()<buf.limit()){ + delta(buf.get()); + } + } + + /** + * Jumps ahead by string if that puts us into a valid state, does nothing otherwise + * + * @return whether we jumped to a valid state (true) or di nothing (false) + */ + public boolean tryDelta(String string){ + int lastHash=hash; + int lastState=state; + delta(string); + if (isValid()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + /** Jumps ahead by a word - if this is not the first word, it must be preceeded by space. */ + public void deltaWord(String string){ + if (state!=fsa.start()) { + delta((byte)' '); + } + delta(string); + } + + /** + * Tries to jump ahead by one word. If the given string is not the next complete valid word, nothing is done. 
+ */ + public boolean tryDeltaWord(String string){ + int lastHash=hash; + int lastState=state; + tryDelta((byte)' '); + delta(string); + if (isValid() && peekDelta((byte)' ')) return true; + if (isFinal()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + public boolean isFinal(){ + return fsa.isFinal(state); + } + + public boolean isStartState() { + return fsa.start() == state; + } + + public boolean isValid(){ + return state !=0; + } + + public ByteBuffer data(){ + return fsa.data(state); + } + + public String dataString(){ + return fsa.dataString(state); + } + + public int hash(){ + return hash; + } + + public ByteBuffer lookup(String str){ + start(); + delta(str); + return fsa.data(state); + } + + public boolean hasPerfectHash(){ + return fsa.hasPerfectHash(); + } + + } + + /** + * Class used to iterate over all accepted strings in the fsa. + */ + public static class Iterator implements java.util.Iterator<Iterator.Item> { + /** + * Internally, this class stores the state information for the iterator. + * Externally, it is used for accessing the data associated with the iterator position. + */ + public static class Item { + private FSA fsa; + private java.util.Stack<Byte> string; + private int symbol; + private int state; + private java.util.Stack<Integer> stack; + + /** + * Constructor + * @param fsa the FSA object the iterator is associated with. + * @param state the state used as start state. + */ + public Item(FSA fsa, int state) { + this.fsa = fsa; + this.string = new java.util.Stack(); + this.symbol = 0; + this.state = state; + this.stack = new java.util.Stack(); + } + + /** + * Copy constructor. 
(Does not copy the state stack) + */ + public Item(Item item) { + this.fsa = item.fsa; + this.string = new java.util.Stack(); + for (java.util.Iterator<Byte> itr = item.string.iterator(); itr.hasNext(); ) { + byte b = itr.next(); + this.string.push(b); + } + this.symbol = item.symbol; + this.state = item.state; + // no need to fill the stack as this constructor is used by Iterator::next() + this.stack = null; + } + + public String getString() { + ByteBuffer buffer = ByteBuffer.allocate(string.size()); + for (java.util.Iterator<Byte> itr = string.iterator(); itr.hasNext(); ) { + byte b = itr.next(); + buffer.put(b); + } + buffer.flip(); + return fsa.decode(buffer); + } + + public ByteBuffer getData() { + return fsa.data(state); + } + + public String getDataString() { + return fsa.dataString(state); + } + + public String toString() { + return "string: " + string + "(" + getString() + "), symbol: " + symbol + ", state: " + state; + } + } + + private Item item; + boolean useInitState = false; + + /** + * Constructor. + * @param state the state to create the iterator from. 
+ */ + public Iterator(State state) { + item = new Item(state.fsa, state.state); + if (state.isFinal()) { + useInitState = true; + } else { + findNext(); + } + } + + private void findNext() { + int nextState; + int depth; + + if (item.symbol == 256 || item.fsa == null) { + throw new NoSuchElementException(); + } + + // flip the flag now that the first state has been returned + if (useInitState) { + useInitState = false; + } + + // try to find the next final state + for(;;) { + item.symbol++; + if (item.symbol < 256) { + byte symbol = (byte)item.symbol; + nextState = item.fsa.delta(item.state, (byte)item.symbol); + if (nextState != 0) { + item.string.push((byte)item.symbol); + item.stack.push(item.state); + item.state = nextState; + item.symbol = 0; + if (item.fsa.isFinal(nextState)) { + break; + } + } + } else { // backtrack + if ((depth = item.string.size()) > 0) { + byte b = item.string.pop(); // remove the last byte + item.symbol = b < 0 ? b + 256 : b; + item.state = item.stack.pop(); + } else { + item.state = 0; + break; + } + } + } + } + + public boolean hasNext() { + return item.state != 0 || useInitState; + } + + public Item next() { + Item retval = new Item(item); + findNext(); + return retval; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + public State getState(){ + return new State(this); + } + + /** + * Returns a new iterator to the start state. + */ + public Iterator iterator() { + return new Iterator(getState()); + } + + /** + * Returns a new iterator to the given state. + * @param state the state to create the iterator from. 
+ */ + public Iterator iterator(State state) { + return new Iterator(state); + } + + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _symbol_tab; + private MappedByteBuffer _state_tab; + private MappedByteBuffer _data; + private MappedByteBuffer _phash; + private Charset _charset; + + /** + * Loads an FSA from a resource file name, which is resolved from the class path of the + * class loader of the given class. + * <p> + * This is useful for loading fsa's deployed within OSGi bundles. + * + * @param resourceFileName the name of the file, relative to any element on the classpath. + * For example, if the classpath contains resources/ and the file is resources/myfsa.fsa + * this argument should be myfsa.fsa + * @param loadingClass a class which provides the class loader to use for loading. Any class which is loaded + * from the same class path as the resource will do (e.g with OSGi - any class in the same bundle jar) + * @return the loaded FSA + * @throws RuntimeException if the class could not be loaded + */ + public static FSA loadFromResource(String resourceFileName,Class loadingClass) { + URL fsaUrl=loadingClass.getResource(resourceFileName); + if ( ! "file".equals(fsaUrl.getProtocol())) + throw new RuntimeException("Could not open non-file url '" + fsaUrl + "' as a file input stream: " + + "The classloader of " + loadingClass + "' does not return file urls"); + return new FSA(fsaUrl.getFile()); + } + + /** + * Loads an FSA from a file using utf-8 encoding + * + * @throws IllegalArgumentException if the file is not found + */ + public FSA(String filename) { + init(filename,"utf-8"); + } + + /** + * Loads an FSA from a file using the specified character encoding. 
+ * + * @throws IllegalArgumentException if the file is not found + */ + public FSA(String filename, String charsetname) { + init(filename,charsetname); + } + + /** Loads an FSA from a file input stream using utf-8 encoding */ + public FSA(FileInputStream filename) { + init(filename,"utf-8"); + } + + /** Loads an FSA from a file input stream using the specified character encoding */ + public FSA(FileInputStream filename, String charsetname) { + init(filename,charsetname); + } + + private void init(String filename, String charsetname){ + try { + init(new FileInputStream(filename),charsetname); + } + catch (FileNotFoundException e) { + throw new IllegalArgumentException("Could not find FSA file '" + filename + "'",e); + } + catch (IOException e) { + throw new IllegalArgumentException("Could not read FSA file '" + filename + "'",e); + } + } + + private void init(FileInputStream file, String charsetname) { + try { + _charset = Charset.forName(charsetname); + + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + if (h_magic()!=2038637673) { + throw new IOException("Stream does not contain an FSA: Wrong file magic number " + h_magic()); + } + _symbol_tab = file.getChannel().map(MapMode.READ_ONLY, + 256,h_size()); + _symbol_tab.order(ByteOrder.LITTLE_ENDIAN); + _state_tab = file.getChannel().map(MapMode.READ_ONLY, + 256+h_size(),4*h_size()); + _state_tab.order(ByteOrder.LITTLE_ENDIAN); + _data = file.getChannel().map(MapMode.READ_ONLY, + 256+5*h_size(),h_data_size()); + _data.order(ByteOrder.LITTLE_ENDIAN); + if(h_has_phash()>0){ + _phash = file.getChannel().map(MapMode.READ_ONLY, + 256+5*h_size()+h_data_size(), + 4*h_size()); + _phash.order(ByteOrder.LITTLE_ENDIAN); + } + _ok=true; + } + catch (IOException e) { + throw new RuntimeException("IO error while reading FSA file",e); + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int 
h_checksum(){ + return _header.getInt(8); + } + private int h_size(){ + return _header.getInt(12); + } + private int h_start(){ + return _header.getInt(16); + } + private int h_data_size(){ + return _header.getInt(20); + } + private int h_data_type(){ + return _header.getInt(24); + } + private int h_fixed_data_size(){ + return _header.getInt(28); + } + private int h_has_phash(){ + return _header.getInt(32); + } + private int h_serial(){ + return _header.getInt(36); + } + private int getSymbol(int index){ + int symbol = _symbol_tab.get(index); + if(symbol<0){ + symbol += 256; + } + return symbol; + } + + private ByteBuffer encode(String str){ + return _charset.encode(str); + } + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + public boolean isOk(){ + return _ok; + } + + public boolean hasPerfectHash(){ + return _ok && h_has_phash()==1; + } + + public int version(){ + if(_ok){ + return h_version(); + } + return 0; + } + + public int serial(){ + if(_ok){ + return h_serial(); + } + return 0; + } + + protected int start(){ + if(_ok){ + return h_start(); + } + + return 0; + } + + protected int delta(int state, byte symbol){ + int s=symbol; + if(s<0){ + s+=256; + } + if(_ok && s>0 && s<255){ + if(getSymbol(state+s)==s){ + return _state_tab.getInt(4*(state+s)); + } + } + return 0; + } + + protected int hashDelta(int state, byte symbol){ + int s=symbol; + if(s<0){ + s+=256; + } + if(_ok && h_has_phash()==1 && s>0 && s<255){ + if(getSymbol(state+s)==s){ + return _phash.getInt(4*(state+s)); + } + } + return 0; + } + + protected boolean isFinal(int state){ + if(_ok){ + if(getSymbol(state+255)==255){ + return true; + } + } + return false; + } + + /** + * Retrieves data for the given state using the underlying fsa data buffer. + * @param state The fsa state to retrieve data from. + * @return A new buffer containing the data for the given state. 
+ **/ + protected ByteBuffer data(int state) { + if(_ok && isFinal(state)){ + int offset = _state_tab.getInt(4*(state+255)); + int length; + if(h_data_type()==1){ + length = h_fixed_data_size(); + } + else{ + length = _data.getInt(offset); + offset += 4; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + byte[] dst = meta.array(); + for (int i = 0; i < length; ++i) { + dst[i] = _data.get(i + offset); + } + return meta; + } + return null; + } + + /** + * Retrieves data for the given state using the underlying fsa data buffer. + * @param state The fsa state to retrieve data from. + * @return A string representation of the data for the given state. + **/ + protected String dataString(int state) { + ByteBuffer meta = data(state); + if(meta!=null){ + // Remove trailing '\0' if it exists. This is usually the + // case for automata built with text format (makefsa -t) + String data = decode(meta); + if (data.endsWith("\0")) { + data = data.substring(0, data.length()-1); + } + return data; + } + return null; + } + + /** + * Convenience method that returns the metadata string in the fsa + * for the input lookup String, or null if the input string does + * not exist in the fsa. + * @param str The string to look up. + * @return Metadata string from the fsa. */ + public String lookup(String str){ + State s = getState(); + s.lookup(str); + return s.dataString(); + } + + + //// test //// + public static void main(String[] args) { + String test = "sour cherry"; + if (args.length >= 1) { + test = args[0]; + } + + String fsafile = "/home/gv/fsa/test/__testfsa__.__fsa__"; + //String fsafile = "/home/p13n/prelude/automata/query2dmozsegments.fsa"; + + FSA fsa = new FSA(fsafile); + + System.out.println("Loading FSA file "+fsafile+": "+fsa.isOk()); + System.out.println(" version: " + fsa.version()/1000000 + "." + + (fsa.version()/1000) % 1000 + "." 
+ + fsa.version() % 1000); + System.out.println(" serial: " + fsa.serial()); + System.out.println(" phash: " + fsa.hasPerfectHash()); + + FSA.State s = fsa.getState(); + + s.start(); + for (int i=0; i < test.length(); i++) { + s.delta(test.charAt(i)); + } + System.out.println("\ndelta() char test " + test + ": " + + s.isFinal() + ", info: " + s.dataString() + + ", hash value: " + s.hash()); + + s.start(); + s.delta(test); + System.out.println("\ndelta() test " + test + ": " + + s.isFinal() + ", info: " + s.dataString() + + ", hash value: " + s.hash()); + + s.lookup(test); + String data = s.dataString(); + System.out.println("\nlookup() test \"" + test + "\": " + + (s.lookup(test) != null) + + ", info: " + data + ", hash value: " + s.hash()); + + String data2 = fsa.lookup(test); + System.out.println("\nFSA.lookup() test \"" + test + "\": " + data2); + } +} + + diff --git a/fsa/src/main/java/com/yahoo/fsa/MetaData.java b/fsa/src/main/java/com/yahoo/fsa/MetaData.java new file mode 100644 index 00000000000..fde868464c8 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/MetaData.java @@ -0,0 +1,217 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; + + +/** + * Class for accessing meta-data (dat-files) used by FSA applications. 
+ * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class MetaData { + + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _data; + private Charset _charset; + + + public MetaData(String filename){ + init(filename, "utf-8"); + } + + public MetaData(String filename, String charsetname){ + init(filename, charsetname); + } + + public boolean isOk(){ + return _ok; + } + + private void init(String filename, String charsetname){ + + _charset = Charset.forName(charsetname); + + FileInputStream file; + try { + file = new FileInputStream(filename); + } + catch (FileNotFoundException e) { + System.out.print("MetaData file " + filename + " not found.\n"); + return; + } + + try { + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + if(h_magic()!=-2025936501){ + System.out.print("MetaData bad magic " + h_magic() +"\n"); + return; + } + _data = file.getChannel().map(MapMode.READ_ONLY, + 256, + h_size()); + _data.order(ByteOrder.LITTLE_ENDIAN); + _ok=true; + } + catch (IOException e) { + System.out.print("MetaData IO exception.\n"); + return; + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int h_checksum(){ + return _header.getInt(8); + } + private int h_size(){ + return _header.getInt(12); + } + private int h_reserved(int i){ + if(i<0||i>9){ + return 0; + } + return _header.getInt(16+4*i); + } + private int h_user(int i){ + if(i<0||i>49){ + return 0; + } + return _header.getInt(56+4*i); + } + + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + + public int user(int i){ + if(!_ok){ + return 0; + } + return h_user(i); + } + + public int getIntEntry(int idx) + { + if(_ok){ + return _data.getInt(idx*4); + } + else + return 0; + } + + public 
ByteBuffer getDirectRecordEntry(int idx, int size) + { + if(_ok){ + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(idx*size); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public ByteBuffer getIndirectRecordEntry(int idx, int size) + { + if(_ok){ + int offset = _data.getInt(idx*4); + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(offset); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public ByteBuffer getIndirectRecordEntry(int idx) + { + if(_ok){ + int offset = _data.getInt(idx*4); + int size = _data.getInt(offset); + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(offset+4); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public String getStringEntry(int stringOffset){ + if(_ok){ + int length = 0; + _data.position(stringOffset); + while(_data.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(stringOffset); + _data.get(meta.array(),0,length); + return decode(meta); + } + return null; + } + + public String[] getStringArrayEntry(int stringOffset, int numStrings){ + if(_ok && numStrings>0){ + String[] stringArray = new String[numStrings]; + int pos=stringOffset; + for(int i=0;i<numStrings;i++){ + int length = 0; + _data.position(pos); + while(_data.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(pos); + _data.get(meta.array(),0,length); + stringArray[i] = decode(meta); + pos += length+1; + } + return stringArray; + } + return null; + } + + //// test //// + public static void main(String[] args) { + String file = "dmozPred_2.dat"; + + MetaData metaData = new MetaData(file); + + System.out.println("Loading MetaData "+file+": 
"+metaData.isOk()); + } + + + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java new file mode 100644 index 00000000000..13cb93073d2 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java @@ -0,0 +1,384 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.conceptnet; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; + + +/** + * Class for accessing the concept network automata. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class ConceptNet { + + private FSA _fsa; + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _index; + private MappedByteBuffer _info; + private MappedByteBuffer _catindex; + private MappedByteBuffer _strings; + private Charset _charset; + + + public ConceptNet(String domain){ + init(domain, "utf-8"); + } + + public ConceptNet(String domain, String charsetname){ + init(domain, charsetname); + } + + public boolean isOk(){ + return _ok; + } + + private void init(String domain, String charsetname){ + + _charset = Charset.forName(charsetname); + + _fsa = new FSA(domain + ".fsa",charsetname); + + if(!_fsa.isOk()){ + return; + } + + FileInputStream file; + try { + file = new FileInputStream(domain + ".dat"); + } + catch (FileNotFoundException e) { + System.out.print("ConceptNet data file " + domain + ".dat" + " not found.\n"); + return; + } + + try { + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + 
if(h_magic()!=238579428){ + System.out.print("ConceptNet bad magic " + h_magic() +"\n"); + return; + } + _index = file.getChannel().map(MapMode.READ_ONLY, + 256, + 8*4*h_index_size()); + _index.order(ByteOrder.LITTLE_ENDIAN); + _info = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size(), + 4*h_info_size()); + _info.order(ByteOrder.LITTLE_ENDIAN); + _catindex = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size()+4*h_info_size(), + 4*h_catindex_size()); + _catindex.order(ByteOrder.LITTLE_ENDIAN); + _strings = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size()+4*h_info_size()+4*h_catindex_size(), + h_strings_size()); + _strings.order(ByteOrder.LITTLE_ENDIAN); + _ok=true; + } + catch (IOException e) { + System.out.print("ConceptNet IO exception.\n"); + return; + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int h_checksum(){ + return _header.getInt(8); + } + private int h_index_size(){ + return _header.getInt(12); + } + private int h_info_size(){ + return _header.getInt(16); + } + private int h_catindex_size(){ + return _header.getInt(20); + } + private int h_strings_size(){ + return _header.getInt(24); + } + private int h_max_freq(){ + return _header.getInt(28); + } + private int h_max_cfreq(){ + return _header.getInt(32); + } + private int h_max_qfreq(){ + return _header.getInt(36); + } + private int h_max_sfreq(){ + return _header.getInt(40); + } + private int h_max_efreq(){ + return _header.getInt(44); + } + private int h_max_afreq(){ + return _header.getInt(48); + } + + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + public int lookup(String unit) + { + FSA.State state = _fsa.getState(); + // state.start(); // getState does this for us + state.delta(unit); + if(state.isFinal()){ + return state.hash(); + } + 
return -1; + } + + public String lookup(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return null; + } + int termoffset = _index.getInt(4*8*idx); + return getString(termoffset); + } + + private String getString(int stringOffset){ + if(_ok){ + int length = 0; + _strings.position(stringOffset); + while(_strings.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + _strings.position(stringOffset); + _strings.get(meta.array(),0,length); + return decode(meta); + } + return null; + } + + public int frq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+4); + } + + public int cFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+8); + } + + public int qFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+12); + } + + public int sFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+16); + } + + public double score(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1.0; + } + return 100.0*cFrq(idx)/qFrq(idx); + } + + public double strength(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1.0; + } + return 100.0*qFrq(idx)/sFrq(idx); + } + + public int numExt(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int ext(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public int extFrq(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return -1; + } + 
if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+8+8*i); + } + + public int numAssoc(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int assoc(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public int assocFrq(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+8+8*i); + } + + public int numCat(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+28); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int cat(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+28); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public String catName(int catidx) + { + if(!_ok || catidx<0 || catidx>=h_catindex_size()){ + return null; + } + int catoffset = _catindex.getInt(4*catidx); + return getString(catoffset); + } + + //// test //// + public static void main(String[] args) { + String domain = "/home/gv/fsa/automata/us_main_20041002_20041008"; + + ConceptNet cn = new ConceptNet(domain); + + System.out.println("Loading ConceptNet domain "+domain+": "+cn.isOk()); + int idx = cn.lookup("new york"); + System.out.println(" lookup(\"new york\") -> "+idx); + System.out.println(" lookup("+idx+") -> "+cn.lookup(idx)+"("+cn.score(idx)+","+cn.strength(idx)+")"); + System.out.println(" extensions("+cn.numExt(idx)+"):"); + for(int i=0;i<5 
&& i<cn.numExt(idx);i++){ + System.out.println(" "+cn.lookup(cn.ext(idx,i))+","+cn.extFrq(idx,i)); + } + if(5<cn.numExt(idx)){ + System.out.println(" ..."); + } + System.out.println(" associations("+cn.numAssoc(idx)+"):"); + for(int i=0;i<5 && i<cn.numAssoc(idx);i++){ + System.out.println(" "+cn.lookup(cn.assoc(idx,i))+","+cn.assocFrq(idx,i)); + } + if(5<cn.numAssoc(idx)){ + System.out.println(" ..."); + } + System.out.println(" categories("+cn.numCat(idx)+"):"); + for(int i=0;i<5 && i<cn.numCat(idx);i++){ + System.out.println(" "+cn.catName(cn.cat(idx,i))); + } + if(5<cn.numCat(idx)){ + System.out.println(" ..."); + } + } + + + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/package-info.java b/fsa/src/main/java/com/yahoo/fsa/package-info.java new file mode 100644 index 00000000000..94c7fd30603 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +@PublicApi +package com.yahoo.fsa; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java new file mode 100644 index 00000000000..1e424372a66 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java @@ -0,0 +1,42 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +/** + * Class encapsulation of a segment. 
+ * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class Segment { + + int _beg; + int _end; + int _conn; + + public Segment(int b, int e, int c) + { + _beg = b; + _end = e; + _conn = c; + } + + public int beg() + { + return _beg; + } + + public int end() + { + return _end; + } + + public int len() + { + return _end-_beg; + } + + public int conn() + { + return _conn; + } + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java new file mode 100644 index 00000000000..80ccd791644 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java @@ -0,0 +1,137 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +import java.util.LinkedList; +import java.util.ListIterator; + +import com.yahoo.fsa.FSA; + +/** + * API for accessing the Segmenter automata. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class Segmenter { + + private FSA _fsa; + + public Segmenter(FSA fsa) { + _fsa = fsa; + } + + public Segmenter(String filename) { + _fsa = new FSA(filename,"utf-8"); + } + + public Segmenter(String filename, String charsetname) { + _fsa = new FSA(filename,charsetname); + } + + public boolean isOk() + { + return _fsa.isOk(); + } + + public Segments segment(String input) + { + String[] tokens = input.split("\\s"); + return segment(tokens); + } + + private class Detector { + FSA.State _state; + int _index; + + public Detector(FSA.State s, int i) + { + _state = s; + _index = i; + } + + public FSA.State state() + { + return _state; + } + + public int index() + { + return _index; + } + } + + public Segments segment(String[] tokens) + { + Segments segments = new Segments(tokens); + LinkedList detectors = new LinkedList(); + + int i=0; + + + while(i<tokens.length){ + detectors.add(new Detector(_fsa.getState(),i)); + + ListIterator 
det_it = detectors.listIterator(); + while(det_it.hasNext()){ + Detector d = (Detector)det_it.next(); + d.state().deltaWord(tokens[i]); + if(d.state().isFinal()){ + segments.add(new Segment(d.index(),i+1,d.state().data().getInt(0))); + } + + if(!d.state().isValid()){ + det_it.remove(); + } + } + i++; + } + + return segments; + } + + //// test //// + public static void main(String[] args) { + String fsafile = "/home/gv/fsa/automata/segments.fsa"; + + Segmenter segmenter = new Segmenter(fsafile); + + System.out.println("Loading segmenter FSA file "+fsafile+": "+segmenter.isOk()); + + for(int a=0;a<1||a<args.length;a++){ + + String query; + if(a==args.length){ + query = "times square head"; + } + else { + query = args[a]; + } + System.out.println("processing query \""+query+"\""); + + Segments segments = segmenter.segment(query); + System.out.println("all segments:"); + for(int i=0; i<segments.size();i++){ + System.out.println(" "+i+": \""+segments.sgm(i)+"\","+segments.conn(i)); + } + + Segments best; + + best = segments.segmentation(Segments.SEGMENTATION_WEIGHTED); + System.out.print("best segments (weighted): "); + for(int i=0; i<best.size();i++){ + System.out.print("("+best.sgm(i)+")"); + } + System.out.println(); + + best = segments.segmentation(Segments.SEGMENTATION_RIGHTMOST_LONGEST); + System.out.print("best segments (rightmost_longest):"); + for(int i=0; i<best.size();i++){ + System.out.print("("+best.sgm(i)+")"); + } + System.out.println(); + + } + + } + +} + diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java new file mode 100644 index 00000000000..26752046f80 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java @@ -0,0 +1,313 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +import java.util.LinkedList; + +/** + * Contains the segmentation() method. 
/**
 * A list of {@link Segment} objects found over an array of tokens, together with a
 * table mapping (begin, end) token positions to positions in the list.
 * Contains the segmentation() method.
 *
 * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
 */
public class Segments extends LinkedList {

    // Selector values accepted by segmentation(int)
    public final static int SEGMENTATION_WEIGHTED = 0;
    public final static int SEGMENTATION_WEIGHTED_BIAS10 = 1;
    public final static int SEGMENTATION_WEIGHTED_BIAS20 = 2;
    public final static int SEGMENTATION_WEIGHTED_BIAS50 = 3;
    public final static int SEGMENTATION_WEIGHTED_BIAS100 = 4;
    public final static int SEGMENTATION_WEIGHTED_LEFTMOST = 5;
    public final static int SEGMENTATION_WEIGHTED_RIGHTMOST = 6;
    public final static int SEGMENTATION_WEIGHTED_LONGEST = 7;
    public final static int SEGMENTATION_LEFTMOST_LONGEST = 8;
    public final static int SEGMENTATION_LEFTMOST_WEIGHTED = 9;
    public final static int SEGMENTATION_RIGHTMOST_LONGEST = 10;
    public final static int SEGMENTATION_RIGHTMOST_WEIGHTED = 11;
    public final static int SEGMENTATION_LONGEST_WEIGHTED = 12;
    public final static int SEGMENTATION_LONGEST_LEFTMOST = 13;
    public final static int SEGMENTATION_LONGEST_RIGHTMOST = 14;
    // Not referenced in this class; presumably the number of selector values above
    public final static int SEGMENTATION_METHODS = 15;

    /** The tokens the segments refer to */
    private String[] _tokens;
    /** The number of tokens */
    private int _size;
    /** _map[b][e] is the list position of the segment with begin b and end e, or -1 if there is none */
    private int[][] _map;

    /**
     * Creates an empty segment list over the given tokens.
     *
     * @param tokens the tokens the segments added later refer to
     */
    public Segments(String[] tokens)
    {
        _tokens = tokens;
        _size = tokens.length;
        _map = new int[_size+1][_size+1];
        for(int i=0; i<=_size; i++){
            for(int j=0; j<=_size; j++){
                _map[i][j]=-1;
            }
        }
    }

    /**
     * Appends a segment and records its list position under its (begin, end) pair.
     * A segment added later with the same pair takes over the table entry.
     */
    public void add(Segment s)
    {
        super.add(s);
        _map[s.beg()][s.end()]=super.size()-1;
    }

    /** Appends a one-token segment with connexity 0 for every token which has no one-token segment yet */
    private void addMissingSingles()
    {
        for(int i=0; i<_size; i++){
            if(_map[i][i+1]==-1){
                super.add(new Segment(i,i+1,0));
                _map[i][i+1]=super.size()-1;
            }
        }
    }

    /** Rebuilds the position table from the current list content */
    private void reMap()
    {
        for(int i=0; i<=_size; i++){
            for(int j=0; j<=_size; j++){
                _map[i][j]=-1;
            }
        }
        for(int i=0; i<super.size(); i++){
            _map[beg(i)][end(i)] = i;
        }
    }

    /**
     * Returns the text of the segment at the given list position: its tokens, from
     * begin (inclusive) to end (exclusive), joined by single spaces.
     *
     * @return the segment text, or null if idx is out of range
     */
    public String sgm(int idx)
    {
        if(idx<0 || idx>=super.size()){
            return null;
        }
        String s = new String(_tokens[((Segment)(super.get(idx))).beg()]);
        for(int i=((Segment)(super.get(idx))).beg()+1;i<((Segment)(super.get(idx))).end();i++){
            s += " " + _tokens[i];
        }
        return s;
    }

    /** Returns the begin of the segment at the given list position, or -1 if idx is out of range */
    public int beg(int idx)
    {
        if(idx<0 || idx>=super.size()){
            return -1;
        }
        return ((Segment)(super.get(idx))).beg();
    }

    /** Returns the end of the segment at the given list position, or -1 if idx is out of range */
    public int end(int idx)
    {
        if(idx<0 || idx>=super.size()){
            return -1;
        }
        return ((Segment)(super.get(idx))).end();
    }

    /** Returns the length (end minus begin) of the segment at the given list position, or -1 if idx is out of range */
    public int len(int idx)
    {
        if(idx<0 || idx>=super.size()){
            return -1;
        }
        return ((Segment)(super.get(idx))).len();
    }

    /** Returns the conn value of the segment at the given list position, or -1 if idx is out of range */
    public int conn(int idx)
    {
        if(idx<0 || idx>=super.size()){
            return -1;
        }
        return ((Segment)(super.get(idx))).conn();
    }

    /**
     * Selects segments from this list which together cover all the tokens, using the
     * given method, and returns them as a new list over the same tokens.
     * <p>
     * As a side effect, one-token segments are added to this list for all tokens
     * which do not have one. A method value which is not handled by the switch below
     * produces an empty result.
     *
     * @param method one of the SEGMENTATION_ constants
     * @return a new list containing the selected segments
     */
    public Segments segmentation(int method)
    {
        Segments smnt = new Segments(_tokens);

        addMissingSingles();

        int maxsc, id, bestid=-1, bias=0, c, pos, bestval, temp=0, next=-1;
        // maxScore[id]: score of the best chain of segments starting with segment id
        int[] maxScore = new int[super.size()];
        // nextid[id]: the segment following segment id in that chain, or -1 at the end
        int[] nextid = new int[super.size()];
        for(int i=0;i<nextid.length;i++){
            nextid[i]=-1;
        }

        switch(method){
        // The bias cases fall through on purpose and add up:
        // BIAS100 -> 100, BIAS50 -> 50, BIAS20 -> 20, BIAS10 -> 10, WEIGHTED -> 0
        case SEGMENTATION_WEIGHTED_BIAS100:
            bias+=50;
        case SEGMENTATION_WEIGHTED_BIAS50:
            bias+=30;
        case SEGMENTATION_WEIGHTED_BIAS20:
            bias+=10;
        case SEGMENTATION_WEIGHTED_BIAS10:
            bias+=10;
        case SEGMENTATION_WEIGHTED:
            bestid=-1;
            // Work backwards from the position after the last token
            for(int i=_tokens.length;i>=0;i--){
                // Find the best scoring segment starting at position i. The +1 lets
                // a segment with score 0 beat the initial maxsc of 0; it is
                // subtracted again below.
                bestid=-1;maxsc=0;
                for(int j=i+1;j<=_tokens.length;j++){
                    id=_map[i][j];
                    if(id>=0 && maxScore[id]+1>maxsc) {
                        bestid=id;
                        maxsc=maxScore[id]+1;
                    }
                }
                if(maxsc>0){
                    maxsc--;
                }
                // Every segment ending at position i continues with that best segment
                for(int j=0;j<i;j++){
                    id=_map[j][i];
                    if(id>=0){
                        nextid[id] = bestid;
                        c = conn(id);
                        if(i-j<=1){
                            // one-token segments add nothing to the score
                            maxScore[id] = maxsc;
                        }
                        else if(bias>0){
                            // each token beyond the second scales conn up by bias percent
                            maxScore[id] = maxsc + ((100+(i-j-2)*bias)*c)/100;
                        }
                        else{
                            maxScore[id] = maxsc + c;
                        }
                    }
                }
            }
            // bestid is now the best segment starting at position 0: follow the chain
            id = bestid;
            while(id!=-1){
                smnt.add(((Segment)(super.get(id))));
                id=nextid[id];
            }
            break;
        case SEGMENTATION_LEFTMOST_LONGEST:
        case SEGMENTATION_LEFTMOST_WEIGHTED:
            // Left to right: pick one segment starting at pos, then continue at its end
            pos = 0;
            while(pos<_tokens.length){
                bestid = -1; bestval = -1;
                for(int i=pos+1;i<=_tokens.length;i++){
                    id = _map[pos][i];
                    // LONGEST accepts every candidate, so the last (longest) one wins.
                    // WEIGHTED only accepts a strictly higher weight, where one-token
                    // segments have weight 0.
                    if(id>=0 &&
                       (method==SEGMENTATION_LEFTMOST_LONGEST ||
                        (temp=(len(id)>1)? conn(id) :0)>bestval) ){
                        bestid = id;
                        bestval = temp;
                        next = i;
                    }
                }
                smnt.add((Segment)(super.get(bestid)));
                pos=next;
            }
            break;
        case SEGMENTATION_RIGHTMOST_LONGEST:
        case SEGMENTATION_RIGHTMOST_WEIGHTED:
            // Right to left: pick one segment ending at pos, then continue at its begin
            pos = _tokens.length;
            while(pos>0){
                bestid = -1; bestval = -1;
                for(int i=pos-1;i>=0;i--){
                    id = _map[i][pos];
                    if(id>=0 &&
                       (method==SEGMENTATION_RIGHTMOST_LONGEST ||
                        (temp=(len(id)>1)? conn(id) :0)>bestval) ){
                        bestid = id;
                        bestval = temp;
                        next = i;
                    }
                }
                // addFirst keeps the result in token order but bypasses add(Segment),
                // so the position table of the result is rebuilt afterwards
                smnt.addFirst(super.get(bestid));
                pos=next;
            }
            smnt.reMap();
            break;
        case SEGMENTATION_LONGEST_WEIGHTED:
        case SEGMENTATION_LONGEST_LEFTMOST:
        case SEGMENTATION_LONGEST_RIGHTMOST:
        case SEGMENTATION_WEIGHTED_LONGEST:
        case SEGMENTATION_WEIGHTED_LEFTMOST:
        case SEGMENTATION_WEIGHTED_RIGHTMOST:
            buildSegmentationRecursive(method,smnt,0,_tokens.length);
            break;
        }

        return smnt;
    }

    /**
     * Picks the best segment lying within the token range b to e according to the
     * given method, then does the same for the tokens to the left of it, adds the
     * segment, and does the same for the tokens to the right of it, so that segments
     * are added in token order.
     * <p>
     * The first word of the method name is the primary criterion and the second word
     * breaks ties; one-token segments count as weight 0 for the WEIGHTED criterion.
     *
     * @param method one of the SEGMENTATION_ constants handled here
     * @param smnt the list to add the selected segments to
     * @param b the first token position of the range (inclusive)
     * @param e the end token position of the range (exclusive)
     */
    private void buildSegmentationRecursive(int method, Segments smnt, int b, int e)
    {
        int bestid, bestval1, bestval2, temp;

        // bestval1 holds the primary and bestval2 the tie-breaking value of the best segment so far
        bestid=-1;bestval1=-1;bestval2=-1;
        for(int i=0;i<super.size();i++){
            // only consider segments lying entirely within the range
            if(b<=beg(i) && e>=end(i)){
                switch(method){
                case SEGMENTATION_LONGEST_WEIGHTED:
                    if(len(i)>bestval1 ||
                       (len(i)==bestval1 && conn(i)>bestval2) ){
                        bestid=i;
                        bestval1=len(i);
                        bestval2=conn(i);
                    }
                    break;
                case SEGMENTATION_LONGEST_LEFTMOST:
                    if(len(i)>bestval1 ||
                       (len(i)==bestval1 && beg(i)<bestval2) ){
                        bestid=i;
                        bestval1=len(i);
                        bestval2=beg(i);
                    }
                    break;
                case SEGMENTATION_LONGEST_RIGHTMOST:
                    if(len(i)>bestval1 ||
                       (len(i)==bestval1 && end(i)>bestval2) ){
                        bestid=i;
                        bestval1=len(i);
                        bestval2=end(i);
                    }
                    break;
                case SEGMENTATION_WEIGHTED_LONGEST:
                    temp = (len(i)>1)?conn(i):0;
                    if(temp>bestval1 ||
                       (temp==bestval1 && len(i)>bestval2) ){
                        bestid=i;
                        bestval1=temp;
                        bestval2=len(i);
                    }
                    break;
                case SEGMENTATION_WEIGHTED_LEFTMOST:
                    temp = (len(i)>1)? conn(i) :0;
                    if(temp>bestval1 ||
                       (temp==bestval1 && beg(i)<bestval2) ){
                        bestid=i;
                        bestval1=temp;
                        bestval2=beg(i);
                    }
                    break;
                case SEGMENTATION_WEIGHTED_RIGHTMOST:
                    temp = len(i)>1?conn(i):0;
                    if(temp>bestval1 ||
                       (temp==bestval1 && end(i)>bestval2) ){
                        bestid=i;
                        bestval1=temp;
                        bestval2=end(i);
                    }
                    break;
                default: // dummy default: pick the first possible segment
                    if(bestid<0){
                        bestid=i;
                    }
                    break;
                }
            }
        }
        if(bestid<0) {
            return; // this should never happen, as all one-word segments are created
        }

        // check left side
        if(b<beg(bestid)){
            buildSegmentationRecursive(method,smnt,b,beg(bestid));
        }

        // add segment
        smnt.add((Segment)(super.get(bestid)));

        // check right side
        if(e>end(bestid)){
            buildSegmentationRecursive(method,smnt,end(bestid),e);
        }
    }

}
+ * + * @author gjoranv + **/ +public class PredictedTopic { + + private String topic = ""; + private double weight = 0.0; + private String vector = ""; + + + public PredictedTopic(String topic, double weight, String vector){ + this.topic = topic; + this.weight = weight; + this.vector = vector; + } + + public PredictedTopic(String topic, double weight){ + this(topic, weight, ""); + } + + + /** Returns the topic */ + public String getTopic() { return topic; } + + /** Returns the weight */ + public double getWeight() { return weight; } + + /** Returns the vector*/ + public String getVector() { return vector; } + + + /** Sets the weight */ + public void setWeight(double weight) { + this.weight = weight; + } + + /** Adds to the weight */ + public void addWeight(double weight) { + this.weight += weight; + } + + /** Sets the vector*/ + public void setVector(String vector) { + this.vector = vector; + } + + /** Compares this topic to another topic, according to weight descending */ + public int compareDescendWeight(Object o) { + PredictedTopic pt = (PredictedTopic)o; + + double wgt1 = getWeight(); + double wgt2 = pt.getWeight(); + if (wgt1 < wgt2) { return 1; } + if (wgt1 > wgt2) { return -1;} + return 0; + } + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java new file mode 100644 index 00000000000..177e879c6c8 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java @@ -0,0 +1,180 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.fsa.topicpredictor; + +import java.util.logging.Logger; +import java.util.List; +import java.util.LinkedList; +import java.util.Iterator; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; +import com.yahoo.fsa.MetaData; + + +/** + * Class for accessing the topic prediction automata. Look up the + * predicted topics for a term. Each topic has an attached weight and + * a term vector (topicSegments). + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class TopicPredictor extends MetaData { + + private static final String packageName = "com.yahoo.fsa.topicpredictor"; + + private FSA fsa = null; + + public TopicPredictor(String fsafile, String datfile){ + this(fsafile, datfile, "utf-8"); + } + + public TopicPredictor(String fsafile, String datfile, + String charsetname) { + super(datfile, charsetname); + if (!isOk()) { + Logger.getLogger(packageName). + warning("Error initializing predictor with file " + datfile); + } + + // Init the segment->'topic index' FSA + fsa = new FSA(fsafile); + if (!fsa.isOk()) { + Logger.getLogger(packageName). + warning("Error initializing FSA with file " + fsafile); + } + } + + /** + * Returns a list of PredictedTopic objects, one for each topic + * the segment maps to. The returned list contains all topics, + * as opposed to the two-argument version. + * @param segment The segment string to find (all) topics for. + * @return (Linked)List of PredictedTopic objects. */ + public List getPredictedTopics(String segment) { + return getPredictedTopics(segment, 0); + } + + /** + * Returns a list of PredictedTopic objects, one for each topic + * the segment maps to. 
The returned list length is cut off at + * 'maxTopics' entries, maxTopics=0 returns all topics. + * @param segment The segment string to find topics for. + * @param maxTopics The max number of topics to return, 0 for all topics + * @return (Linked)List of PredictedTopic objects. */ + public List getPredictedTopics(String segment, int maxTopics) { + List predictedTopics = new LinkedList(); + + int segIdx = getSegmentIndex(segment); + int[][] topicArr = getTopicArray(segIdx, maxTopics); + int numTopics = topicArr.length; + int allTopics = getNumTopics(segIdx); + /*Logger.getLogger(packageName). + fine("Segment: '" + segment + "' has " + allTopics + + " topics in automaton, fetched " + numTopics); + */ + for(int i=0; i < numTopics; i++) { + int weight = topicArr[i][1]; + String[] topicInfo= getTopicInfo(topicArr[i][0]); + String topic = topicInfo[0]; + String vector= topicInfo[1]; + PredictedTopic pt = + new PredictedTopic(topic, (double)weight, vector); + predictedTopics.add(pt); + } + + return predictedTopics; + } + + /** + * Returns the index (hash value) of the input segment in the FSA. + * @param segment The segment string to find index for. + * @return Index for this segment in the FSA. */ + private int getSegmentIndex(String segment) { + FSA.State s = fsa.getState(); + s.delta(segment); + if (s.isFinal()) { + return s.hash(); + } + return -1; + } + + /** + * Returns the number of topics the FSA contains for the input + * segment. + * @return Number of topics for the segment. */ + private int getNumTopics(int segIdx) { + if (segIdx < 0) { + return 0; + } + ByteBuffer buf = getIndirectRecordEntry(segIdx, 4); + return buf.getInt(0); + } + + /** + * Reads the topics and other metadata for a segment from the + * (memory-mapped) metadata file. Returns the info in a + * two-dimensional array (one row per topic). + * @param segIdx The FSA index (hash value) for the segment. + * @param maxTopics Max number of topics to return, 0 for all topics. 
+ * @return Number of topics for the segment. */ + private int[][] getTopicArray(int segIdx, int maxTopics) { + if (segIdx < 0) { + return new int[0][0]; + } + + int numTopics = getNumTopics(segIdx); + if ((maxTopics > 0) && (numTopics > maxTopics)) { + numTopics = maxTopics; + } + + int[][] topics = new int[numTopics][2]; + ByteBuffer buf = getIndirectRecordEntry(segIdx,4+8*numTopics); + for(int i=0; i<numTopics; i++){ + topics[i][0] = buf.getInt(4+8*i); + topics[i][1] = buf.getInt(8+8*i); + } + return topics; + } + + /** + * Returns the topic and vector strings from the internal meta + * data structure. + * @param topicId Topic start index in a two-dimensional array + * @return topic string at [0] and vector string at [1] */ + private String[] getTopicInfo(int topicId) { + return getStringArrayEntry(user(0) + topicId, 2); + } + + + //// test //// + public static void main(String[] args) { + String segment = "new york"; + if (args.length >= 1) { + segment = args[0]; + } + + String fsafile = "/home/gv/fsa/automata/dmozPred_2.fsa"; + String datfile = "/home/gv/fsa/automata/dmozPred_2.dat"; + + TopicPredictor predictor = new TopicPredictor(fsafile, datfile); + + List predictedTopics = predictor.getPredictedTopics(segment, 25); + Iterator i = predictedTopics.iterator(); + while (i.hasNext()) { + PredictedTopic topic = (PredictedTopic) i.next(); + System.out.println("\n topic=" + topic.getTopic()); + System.out.println(" weight=" + topic.getWeight()); + System.out.println(" vector=" + topic.getVector()); + } + } + +} diff --git a/fsa/src/test/fsa/test-data.fsa b/fsa/src/test/fsa/test-data.fsa Binary files differnew file mode 100644 index 00000000000..92a8a8153ff --- /dev/null +++ b/fsa/src/test/fsa/test-data.fsa diff --git a/fsa/src/test/fsa/test-fsa.fsa b/fsa/src/test/fsa/test-fsa.fsa Binary files differnew file mode 100644 index 00000000000..015be3aeea4 --- /dev/null +++ b/fsa/src/test/fsa/test-fsa.fsa diff --git a/fsa/src/test/fsa/test-iterator.fsa 
b/fsa/src/test/fsa/test-iterator.fsa Binary files differnew file mode 100644 index 00000000000..a83c6529f06 --- /dev/null +++ b/fsa/src/test/fsa/test-iterator.fsa diff --git a/fsa/src/test/fsa/utf8.fsa b/fsa/src/test/fsa/utf8.fsa Binary files differnew file mode 100644 index 00000000000..4398ac99d11 --- /dev/null +++ b/fsa/src/test/fsa/utf8.fsa diff --git a/fsa/src/test/input/test-data-input.txt b/fsa/src/test/input/test-data-input.txt new file mode 100644 index 00000000000..4acbd811537 --- /dev/null +++ b/fsa/src/test/input/test-data-input.txt @@ -0,0 +1,4 @@ +aa aa data +bbbb bbbb data +c c data +dddddd dddddd data diff --git a/fsa/src/test/input/test-fsa-input.txt b/fsa/src/test/input/test-fsa-input.txt new file mode 100644 index 00000000000..ff56fd30af4 --- /dev/null +++ b/fsa/src/test/input/test-fsa-input.txt @@ -0,0 +1,3 @@ +aword +this is a test +tudor vidor diff --git a/fsa/src/test/input/test-iterator-input.txt b/fsa/src/test/input/test-iterator-input.txt new file mode 100644 index 00000000000..2724764c724 --- /dev/null +++ b/fsa/src/test/input/test-iterator-input.txt @@ -0,0 +1,12 @@ +abacus abacus +abadan abadan +abaisse abaisse +abdicate abdicate +abdomen abdomen +abdominous abdominous +dachs dachs +dacia dacia +daciaa daciaa +daciab daciab +dacite dacite +dacota dacota diff --git a/fsa/src/test/input/utf8.txt b/fsa/src/test/input/utf8.txt new file mode 100644 index 00000000000..15c96002c0b --- /dev/null +++ b/fsa/src/test/input/utf8.txt @@ -0,0 +1 @@ +हिन्दी diff --git a/fsa/src/test/java/com/yahoo/fsa/test/FSADataTestCase.java b/fsa/src/test/java/com/yahoo/fsa/test/FSADataTestCase.java new file mode 100644 index 00000000000..ce9854e7c44 --- /dev/null +++ b/fsa/src/test/java/com/yahoo/fsa/test/FSADataTestCase.java @@ -0,0 +1,104 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.fsa.test; + +import com.yahoo.fsa.FSA; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.nio.BufferUnderflowException; + +/** + * @author <a href="geirst@yahoo-inc.com">Geir Storli</a> + */ +public class FSADataTestCase extends junit.framework.TestCase { + + private static class Worker extends Thread { + FSA.State state; + String word; + String data; + long numRuns; + long numExceptions; + long numAsserts; + public Worker(FSA fsa, String word, String data, long numRuns) { + state = fsa.getState(); + this.word = word; + this.data = data; + this.numRuns = numRuns; + this.numExceptions = 0; + this.numAsserts = 0; + } + public void run() { + for (long i = 0; i < numRuns; ++i) { + state.start(); + state.delta(word); + try { + String data = state.dataString(); + if (!this.data.equals(data)) { + ++numAsserts; + } + } catch (BufferUnderflowException e) { + ++numExceptions; + } + } + System.out.println("Worker(" + word + "): numExceptions(" + numExceptions + "), numAsserts(" + numAsserts + ")"); + } + }; + + private FSA fsa; + + public FSADataTestCase(String name) { + super(name); + } + + protected void setUp() throws IOException { + fsa = new FSA(new FileInputStream("src/test/fsa/test-data.fsa")); + } + + public void testBasic() { + FSA.State state = fsa.getState(); + state.delta("aa"); + assertTrue(state.isFinal()); + assertEquals("aa data", state.dataString()); + + state.start(); + state.delta("bbbb"); + assertTrue(state.isFinal()); + assertEquals("bbbb data", state.dataString()); + + state.start(); + state.delta("c"); + assertTrue(state.isFinal()); + assertEquals("c data", state.dataString()); + + state.start(); + state.delta("dddddd"); + assertTrue(state.isFinal()); + assertEquals("dddddd data", state.dataString()); + } + + public void testMultipleThreads() { + long numRuns = 10000; + List<Worker> workers = new ArrayList<Worker>(); + workers.add(new Worker(fsa, "aa", "aa 
data", numRuns)); + workers.add(new Worker(fsa, "bbbb", "bbbb data", numRuns)); + workers.add(new Worker(fsa, "c", "c data", numRuns)); + workers.add(new Worker(fsa, "dddddd", "dddddd data", numRuns)); + for (int i = 0; i < workers.size(); ++i) { + workers.get(i).start(); + } + try { + for (int i = 0; i < workers.size(); ++i) { + workers.get(i).join(); + } + } catch (InterruptedException e) { + assertTrue(false); + } + for (int i = 0; i < workers.size(); ++i) { + assertEquals(0, workers.get(i).numExceptions); + assertEquals(0, workers.get(i).numAsserts); + } + } + +} diff --git a/fsa/src/test/java/com/yahoo/fsa/test/FSAIteratorTestCase.java b/fsa/src/test/java/com/yahoo/fsa/test/FSAIteratorTestCase.java new file mode 100644 index 00000000000..21dc86f4925 --- /dev/null +++ b/fsa/src/test/java/com/yahoo/fsa/test/FSAIteratorTestCase.java @@ -0,0 +1,119 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.test; + +import com.yahoo.fsa.FSA; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * @author <a href="geirst@yahoo-inc.com">Geir Storli</a> + */ +public class FSAIteratorTestCase extends junit.framework.TestCase { + + private FSA fsa; + + private FSA.State state; + + private List<String> expected; + + public FSAIteratorTestCase(String name) { + super(name); + } + + protected void setUp() { + fsa = new FSA("src/test/fsa/test-iterator.fsa"); + state = fsa.getState(); + + expected = new ArrayList<String>(); + + expected.add("abacus"); + expected.add("abadan"); + expected.add("abaisse"); + expected.add("abdicate"); + expected.add("abdomen"); + expected.add("abdominous"); + expected.add("dachs"); + expected.add("dacia"); + expected.add("daciaa"); + expected.add("daciab"); + expected.add("dacite"); + expected.add("dacota"); + } + + private void checkIterator(int beginIdx, int endIdx, String prefix) { + 
System.out.println("checkIterator(" + beginIdx + ", " + endIdx + ", " + prefix + ")"); + java.util.Iterator<FSA.Iterator.Item> i = fsa.iterator(state); + for (; i.hasNext() && beginIdx < endIdx; ++beginIdx) { + FSA.Iterator.Item item = i.next(); + System.out.println("item: " + item); + String str = prefix + item.getString(); + String data = item.getDataString(); + System.out.println("str: '" + expected.get(beginIdx) + "'.equals('" + str + "')?"); + assertTrue(expected.get(beginIdx).equals(str)); + System.out.println("data: '" + expected.get(beginIdx) + "'.equals('" + data + "')?"); + assertTrue(expected.get(beginIdx).equals(data)); + } + assertFalse(i.hasNext()); + assertTrue(beginIdx == endIdx); + } + + public void testIterator() { + checkIterator(0, expected.size(), ""); + } + + public void testIteratorSingle() { + state.delta("dach"); + checkIterator(6, 7, "dach"); + } + + public void testIteratorSubset() { + state.delta("abd"); + checkIterator(3, 6, "abd"); + } + + public void testIteratorFinalState() { + state.delta("dacia"); + checkIterator(7, 10, "dacia"); + } + + public void testIteratorFinalStateOnly() { + state.delta("dachs"); + checkIterator(6, 7, "dachs"); + } + + public void testIteratorEmpty1() { + state.delta("b"); + java.util.Iterator i = fsa.iterator(state); + assertFalse(i.hasNext()); + try { + i.next(); + assertFalse(true); + } catch (NoSuchElementException e) { + assertTrue(true); + } + } + + public void testIteratorEmpty2() { + state.delta("daciac"); + java.util.Iterator i = fsa.iterator(state); + assertFalse(i.hasNext()); + try { + i.next(); + assertFalse(true); + } catch (NoSuchElementException e) { + assertTrue(true); + } + } + + public void testIteratorRemove() { + java.util.Iterator i = fsa.iterator(state); + try { + i.remove(); + assertFalse(true); + } catch (UnsupportedOperationException e) { + assertTrue(true); + } + } +} diff --git a/fsa/src/test/java/com/yahoo/fsa/test/FSATestCase.java 
b/fsa/src/test/java/com/yahoo/fsa/test/FSATestCase.java new file mode 100644 index 00000000000..4300c5938e1 --- /dev/null +++ b/fsa/src/test/java/com/yahoo/fsa/test/FSATestCase.java @@ -0,0 +1,100 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.test; + +import com.yahoo.fsa.FSA; + +import java.io.FileInputStream; +import java.io.IOException; + +/** + * @author <a href="bratseth@yahoo-inc.com">Jon Bratseth</a> + */ +public class FSATestCase extends junit.framework.TestCase { + + private FSA fsa; + + private FSA.State state; + + public FSATestCase(String name) { + super(name); + } + + protected void setUp() throws IOException { + fsa=new FSA(new FileInputStream("src/test/fsa/test-fsa.fsa")); + state=fsa.getState(); + } + + public void testSingleWordDelta() { + state.delta("aword"); + assertTrue(state.isValid()); + assertTrue(state.isFinal()); + } + + public void testSingleWordDeltaWord() { + state.deltaWord("aword"); + assertTrue(state.isValid()); + assertTrue(state.isFinal()); + } + + public void testSingleWordDeltaPartialMatch() { + state.delta("awo"); + assertTrue(state.isValid()); + assertFalse(state.isFinal()); + } + + public void testSingleWordDeltaPartialMatchWord() { + state.deltaWord("awo"); + assertTrue(state.isValid()); + assertFalse(state.isFinal()); + } + + public void testMultiWordDelta() { + state.delta("th"); + assertFalse(state.isFinal()); + state.delta("is "); + assertFalse(state.isFinal()); + state.delta("is "); + assertFalse(state.isFinal()); + state.delta("a"); + assertFalse(state.isFinal()); + state.delta(" test"); + assertTrue(state.isValid()); + assertTrue(state.isFinal()); + } + + public void testMultiWordDeltaWord() { + state.deltaWord("this"); + assertFalse(state.isFinal()); + state.deltaWord("is"); + assertFalse(state.isFinal()); + state.deltaWord("a"); + assertFalse(state.isFinal()); + state.deltaWord("test"); + assertTrue(state.isValid()); + 
assertTrue(state.isFinal()); + } + + public void testMultiWordDeltaWordInvalid() { + state.deltaWord("th"); + assertFalse(state.isFinal()); + state.deltaWord("is "); + assertFalse(state.isFinal()); + assertFalse(state.isValid()); + } + + public void testMultiWordDeltaTry() { + assertFalse(state.tryDeltaWord("thiss")); + assertTrue(state.isValid()); + assertTrue(state.tryDeltaWord("this")); + state.deltaWord("is"); + state.tryDeltaWord("a"); + assertFalse(state.tryDeltaWord("tes")); + assertFalse(state.tryDeltaWord("tesz")); + assertFalse(state.tryDeltaWord("teszzzz")); + assertTrue(state.tryDeltaWord("test")); + assertTrue(state.isValid()); + assertTrue(state.isFinal()); + + } + +} diff --git a/fsa/src/test/java/com/yahoo/fsa/test/UTF8TestCase.java b/fsa/src/test/java/com/yahoo/fsa/test/UTF8TestCase.java new file mode 100644 index 00000000000..3f07816a914 --- /dev/null +++ b/fsa/src/test/java/com/yahoo/fsa/test/UTF8TestCase.java @@ -0,0 +1,97 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.fsa.test; + +import com.yahoo.fsa.FSA; +import java.util.Iterator; +import java.nio.charset.Charset; + +/** + * @author <a href="geirst@yahoo-inc.com">Geir Storli</a> + */ +public class UTF8TestCase extends junit.framework.TestCase { + + private Charset charset = Charset.forName("utf-8"); + private FSA fsa; + private FSA.State state; + private byte prefixBuf[]; + private byte suffixBuf[]; + private String prefix; + private String suffix; + private String word; + + private static byte [] convert(int [] buf) { + byte retval[] = new byte[buf.length]; + for (int i = 0; i < buf.length; ++i) { + retval[i] = (byte)buf[i]; + } + return retval; + } + + public UTF8TestCase(String name) { + super(name); + } + + protected void setUp() { + fsa = new FSA("src/test/fsa/utf8.fsa"); // fsa with one word (6 code points, 18 bytes) + state = fsa.getState(); + int pbuf[] = {0xe0,0xa4,0xb9}; + prefixBuf = convert(pbuf); + prefix = new String(prefixBuf, charset); + int sbuf[] = {0xe0,0xa4,0xbf,0xe0,0xa4,0xa8,0xe0,0xa5,0x8d,0xe0,0xa4,0xa6,0xe0,0xa5,0x80}; + suffixBuf = convert(sbuf); + suffix = new String(suffixBuf, charset); + word = prefix + suffix; + } + + public void testStringDelta() { + state.delta(word); + assertTrue(state.isFinal()); + } + + public void testCharDelta() { + assertEquals(6, word.length()); + for (int i = 0; i < word.length(); ++i) { + state.delta(word.charAt(i)); + assertTrue(state.isValid()); + } + assertTrue(state.isFinal()); + } + + public void testByteDelta() { + FSA.State state = fsa.getState(); + assertEquals(3, prefixBuf.length); + for (int i = 0; i < prefixBuf.length; ++i) { + state.delta(prefixBuf[i]); + assertTrue(state.isValid()); + } + assertEquals(15, suffixBuf.length); + for (int i = 0; i < suffixBuf.length; ++i) { + state.delta(suffixBuf[i]); + assertTrue(state.isValid()); + } + assertTrue(state.isFinal()); + } + + public void testIteratorAtStart() { + Iterator<FSA.Iterator.Item> itr = fsa.iterator(state); + FSA.Iterator.Item item = 
itr.next(); + assertEquals(word, item.getString()); + assertFalse(itr.hasNext()); + } + + public void testIteratorWithPrefix() { + state.delta(prefix); + Iterator<FSA.Iterator.Item> itr = fsa.iterator(state); + FSA.Iterator.Item item = itr.next(); + assertEquals(suffix, item.getString()); + assertFalse(itr.hasNext()); + } + + public void testIteratorWithCompleteWord() { + state.delta(word); + Iterator<FSA.Iterator.Item> itr = fsa.iterator(state); + FSA.Iterator.Item item = itr.next(); + assertEquals("", item.getString()); + assertFalse(itr.hasNext()); + } +} diff --git a/fsa/src/util/.gitignore b/fsa/src/util/.gitignore new file mode 100644 index 00000000000..282522db034 --- /dev/null +++ b/fsa/src/util/.gitignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/fsa/src/util/cn_txt2xml b/fsa/src/util/cn_txt2xml new file mode 100755 index 00000000000..9c439879af9 --- /dev/null +++ b/fsa/src/util/cn_txt2xml @@ -0,0 +1,625 @@ +#!/usr/bin/perl + +use strict; + +use FSA; +use BerkeleyDB; +use Getopt::Long; +use Pod::Usage; + +# +# Process command line options. +# + +my $do_qfreq = 1; +my $do_sqfreq = 1; +my $do_ext = 1; +my $do_assoc = 1; +my $do_cat = 1; +my $do_fsa = 1; +my $help = 0; +my $man = 0; +my $verbose = 0; +my $stopwords_file = ''; +my $output_file = ''; + +my $result = GetOptions('qfreq|q!' => \$do_qfreq, + 'sqfreq|s!' => \$do_sqfreq, + 'ext|e!' => \$do_ext, + 'assoc|a!' => \$do_assoc, + 'cat|c!' => \$do_cat, + 'fsa|f!' => \$do_fsa, + 'help|h' => \$help, + 'man|m' => \$man, + 'verbose|v' => \$verbose, + 'stopwords|w:s' => \$stopwords_file, + 'output-file|o=s' => \$output_file, + ); + +pod2usage(1) if $help; +pod2usage(-verbose => 2) if $man; + +# +# Domain is a required parameter. +# + +my $domain = shift || die "need domain"; + +# +# Some constants for setting limits etc. +# + +my $MAX_UNIT_LENGTH = 8; +my $MAX_QUERY_LENGTH = 10; + +# +# Declare arrays to store concept net data. 
+# + +my @unit = (); +my @unit_f = (); +my @unit_qf = (); +my @unit_qfc = (); +my @unit_qfs = (); +my @ext = (); +my @assoc = (); +my @cats = (); +my @ucats = (); +my @st_map = (); + +# +# Some other global variables +# + +my %stopwords = (); +my %stopMap = (); + +my ($total,$count); + +my ($fsa,$sfsa); + +#*********************************************************** +# +# Functions +# +#*********************************************************** + + +sub msg($@){ + my $format = shift; + if($verbose){ + printf STDERR $format,@_; + } +} + +sub progress($$$$){ + my ($msg,$cnt,$total,$done) = @_; + + if($done){ + if($total>0){ + msg("\r%s ... %d of %d (%.2f%%) ... done.\n",$msg,$cnt,$total,100.0*$cnt/$total); + } + else { + msg("\r%s ... %d ... done.\n",$msg,$cnt); + } + } + elsif($cnt%1000==0){ + if($total>0){ + msg("\r%s ... %d of %d (%.2f%%)",$msg,$cnt,$total,100.0*$cnt/$total); + } + else { + msg("\r%s ... %d",$msg,$cnt); + } + } +} + +sub lookup($$){ + my $fsa = shift; + my $u = shift; + my $st = FSA::State->new($fsa); + + $st->start(); + $st->delta($u); + if($st->isFinal()){ + return ($st->hash(),$st->nData()); + } + else { + return (-1,0); + } +} + +sub aggregate(\@){ + my $aref = shift; + my %hash = (); + my $i; + for($i=0;$i<$#{$aref}+1;$i+=2){ + $hash{$$aref[$i]} += $$aref[$i+1]; + } + my @res; + foreach $i (sort {$hash{$b} <=> $hash{$a}} keys %hash){ + push(@res,$i,$hash{$i}); + } + return @res; +} + +sub firstComb($$){ + my $n = shift; + my $m = shift; + + if($n==0 || $n>31 || $m==0 || $m>31 || $n>$m){ + return 0; + } + + return (1<<$n)-1; +} + +sub nextComb($$){ + my $c = shift; + my $m = shift; + + if($c==0 || $m==0 || $m>31){ + return 0; + } + + my $x = $c; + my $limit = 1<<$m; + my ($mask,$mask1,$mask2); + + if($x&1){ + $mask=2; + while($x&$mask){ + $mask<<=1; + } + $x^=($mask+($mask>>1)); + } + else{ + $mask=2; + while(!($x&$mask)){ + $mask<<=1; + } + $mask1=$mask2=0; + while($x&$mask){ + $mask1<<=1; + $mask1++; + $mask2+=$mask; + $mask<<=1; + } + 
$mask1>>=1; + $x^=($mask+($mask1^$mask2)); + } + + return ($x<$limit)?$x:0; +} + +sub selectComb($\@){ + my $c = shift; + my $aref = shift; + + my @res; + my $i = 0; + while($c>0 && $i<=$#$aref){ + if($c&1){ + push(@res,$$aref[$i]); + } + $c>>=1; + $i++; + } + return @res; +} + +sub sortGrams($){ + my $in = shift; + my @grams = split(/\s+/,$in); + + if($#grams<1){ + return $in; + } + + my @sorted_grams = sort(@grams); + my $i=1; + while($i<=$#sorted_grams){ + if($sorted_grams[$i] eq $sorted_grams[$i-1]){ + splice(@sorted_grams,$i,1); + } + else{ + $i++; + } + } + return join(" ",@sorted_grams); +} + +sub cleanStop($){ + my $unit = shift; + if($stopwords_file ne ''){ + if(!defined($stopMap{$unit})){ + my @words = split(/\s+/,$unit); + while ((@words) && ($stopwords{$words[0]})) { + shift(@words); + } + while ((@words) && ($stopwords{$words[$#words]})) { + pop(@words); + } + $stopMap{$unit} = join(' ', @words); + } + return $stopMap{$unit}; + } + return($unit); +} + + +#*********************************************************** +# +# Main program. +# +#*********************************************************** + + +# +# Configure stopwords list +# + +if($stopwords_file ne ''){ + msg("configuring stopwords ... "); + open(STOPFILE, $stopwords_file) or die "error opening stopwords file '$stopwords_file': $!\n\t"; + while(<STOPFILE>){ + chomp; + $stopwords{$_}=1; + } + close(STOPFILE); + msg("done.\n"); +} + +# +# Build plain FSA with perfect hash and frequencies, +# and compact FSA with perfect hash only. +# +if($do_fsa){ + msg("building plain fsa ... 
"); + my %units_t = (); + open(U,"${domain}_unit.txt"); + while(<U>){ + chomp; + my ($f,$u) = split(/\t/); + my $uns = cleanStop($u); + if($uns ne ""){ + $units_t{$uns}+=$f; + } + } + close(U); + open(F1,"| makefsa -vnp ${domain}.plain.fsa"); + open(F2,"| makefsa -ep ${domain}.fsa"); + foreach my $u (sort keys %units_t){ + print F1 "$u\t$units_t{$u}\n"; + print F2 "$u\n"; + } + close(F1); + close(F2); + %units_t = (); + msg("done.\n"); +} + +# +# Open plain FSA. +# + +$fsa = FSA->new("${domain}.plain.fsa"); + +# +# Read units. +# + +$total = 0 + `wc -l ${domain}_unit.txt`; +$count = 0; +open(U,"${domain}_unit.txt"); +while(<U>){ + $count++; progress("reading units",$count,$total,0); + chomp; + my ($f,$u) = split(/\t/); + my $uns = cleanStop($u); + if($uns ne ""){ + my ($idx,$frq) = lookup($fsa,$uns); + if($idx>=0){ + $unit[$idx] = $uns; + $unit_f[$idx] = $frq; + } + } +} +close(U); +progress("reading units",$count,$total,1); + + +# +# Build term-sorted FSA for counting query frequencies. +# + +if($do_qfreq || $do_sqfreq){ + msg("building fsa for query frequencies ... "); + my %units_st = (); + for(my $i=0;$i<=$#unit;$i++){ + my $uns = sortGrams($unit[$i]); + if(defined($units_st{$uns})){ + $units_st{$uns}.=",$i"; + } + else{ + $units_st{$uns}="$i"; + } + } + open(F,"| makefsa -vep ${domain}.sorted.fsa"); + my $i=0; + foreach my $u (sort keys %units_st){ + $st_map[$i]=$units_st{$u}; + print F "$u\n"; + $i++; + } + close(F); + %units_st = (); + msg("done.\n"); + + # + # Open term-sorted FSA. + # + + $sfsa = FSA->new("${domain}.sorted.fsa"); + + # + # Read complete query file for query frequencies. + # + + $total = 0 + `zcat complete.txt.gz | wc -l`; + $count = 0; + open(C,"zcat complete.txt.gz|") or die "ERROR opening pipe: \"zcat complete.txt.gz|\"\n"; + while(<C>){ + $count++; progress("processing raw query file for query frequencies",$count,$total,0); + chomp; + my ($frq,$query) = split(/\t/); + + # + # Complete query match. 
+ # + my ($idx,$f) = lookup($fsa,$query); + if($idx>=0){ + $unit_qfc[$idx] += $frq; + } + + # + # Partial query match. + # + my @qgrams = split(/\s+/,$query); + my $st = FSA::State->new($fsa); + my %frq_add = (); + for(my $i=0;$i<=$#qgrams;$i++){ + $st->start(); + $st->delta($qgrams[$i]); + if($st->isFinal()){ + $frq_add{$st->hash()} = 1; + } + for(my $j=$i+1;$st->isValid()&&$j<=$#qgrams;$j++){ + $st->delta(" "); + $st->delta($qgrams[$j]); + if($st->isFinal()){ + $frq_add{$st->hash()} = 1; + } + } + } + foreach my $a (keys %frq_add){ + $unit_qf[$a] += $frq; + } + + if($do_sqfreq){ + # + # Partial query match in any order. + # + my $squery = sortGrams($query); + my @sqgrams = split(/\s+/,$squery); + my $sst = FSA::State->new($sfsa); + %frq_add = (); + my $qlen=$#sqgrams+1; + if($qlen>$MAX_QUERY_LENGTH){ + $qlen=$MAX_QUERY_LENGTH; + } + for(my $i=1;$i<=$qlen && $i<=$MAX_UNIT_LENGTH; $i++){ + for(my $c=firstComb($i,$qlen);$c>0;$c=nextComb($c,$qlen)){ + $sst->start(); + my $tmp=join(" ",selectComb($c,@sqgrams)); + $sst->delta($tmp); + if($sst->isFinal()){ + my @to_add = split(/,/,$st_map[$sst->hash()]); + foreach my $a (@to_add){ + $frq_add{$a} = 1; + } + } + } + } + foreach my $a (keys %frq_add){ + $unit_qfs[$a] += $frq; + } + } + } + close(C); + progress("processing raw query file for query frequencies",$count,$total,1); +} + +# +# Read extensions. +# +if($do_ext){ + $total = 0 + `wc -l ${domain}_ext.txt`; + $count = 0; + open(E,"${domain}_ext.txt"); + while(<E>){ + $count++; progress("reading extensions",$count,$total,0); + chomp; + my ($f,$u1,$u2) = split(/\t/); + my $uns1 = cleanStop($u1); + my $uns2 = cleanStop($u2); + if($uns1 ne "" && $uns1 ne $uns2){ + my ($idx1,$frq1) = lookup($fsa,$u1); + my ($idx2,$frq2) = lookup($fsa,$u2); + if($idx1>=0 && $idx2>=0){ + $ext[$idx1] .= "$idx2,$f "; + } + } + } + close(E); + progress("reading extensions",$count,$total,1); +} + +# +# Read associations. 
+# +if($do_assoc){ + $total = 0 + `wc -l ${domain}_assoc.txt`; + $count = 0; + open(A,"${domain}_assoc.txt"); + while(<A>){ + $count++; progress("reading associations",$count,$total,0); + chomp; + my ($f,$u1,$u2) = split(/\t/); + my $uns1 = cleanStop($u1); + my $uns2 = cleanStop($u2); + if($uns1 ne "" && $uns2 ne "" && $uns1 ne $uns2){ + my ($idx1,$frq1) = lookup($fsa,$u1); + my ($idx2,$frq2) = lookup($fsa,$u2); + if($idx1>=0 && $idx2>=0){ + $assoc[$idx1] .= "$idx2,$f "; + $assoc[$idx2] .= "$idx1,$f "; + } + } + } + close(A); + progress("reading associations",$count,$total,1); +} + +# +# Read categories. +# + +if($do_cat){ + tie my %hash, 'BerkeleyDB::Btree', -Filename => "uCat.db"; + + $total = scalar(keys %hash); + $count = 0; + my $cid = 0; + foreach my $c (sort keys %hash){ + $count++; progress("reading categories",$count,$total,0); + if($c ne "Misc" && $c ne "zzz_uncategorized_catchall"){ + $cats[$cid] = $c; + my (@ucs) = split(/\t/,$hash{$c}); + foreach my $u (@ucs){ + my ($t,$f) = split(/,/,$u); + my ($idx,$frq) = lookup($fsa,cleanStop($t)); + if($idx>=0){ + if(defined($ucats[$idx])){ + if(!($ucats[$idx]=~/\b$cid\b/)){ + $ucats[$idx] .= ",$cid"; + } + } + else{ + $ucats[$idx] = "$cid"; + } + } + } + $cid++; + } + } + progress("reading categories",$count,$total,1); + untie %hash; +} + + +# +# Write XML output. +# +$count=0; +$total=$#unit+1; + +if($output_file eq ""){ + $output_file = "${domain}.xml"; +} +open(X,">$output_file"); +print X "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n"; +print X "<conceptnetwork id=\"$domain\" unitcount=\"" . ($#unit+1). "\">\n"; +for(my $i=0;$i<=$#unit;$i++){ + $count++; progress("writing xml",$count,$total,0); + print X " <unit id=\"$i\">\n"; + print X " <term id=\"$i\" freq=\"" . (0+$unit_f[$i]) . "\" cfreq=\"" . + (0+$unit_qfc[$i]) . "\" qfreq=\"" . (0+$unit_qf[$i]) . "\" gfreq=\"" . + ($do_sqfreq? (0+$unit_qfs[$i]) : 0) . "\">" . $unit[$i] . 
"</term>\n"; + print X " <extensions>\n"; + if(defined($ext[$i]) && $ext[$i] ne ""){ + chop($ext[$i]); + my @us = split(/[ ,]/,$ext[$i]); + for(my $j=0;$j<$#us+1;$j+=2){ + print X " <term id=\"".$us[$j]."\" freq=\"".$us[$j+1]."\">".$unit[$us[$j]]."</term>\n"; + } + } + print X " </extensions>\n"; + print X " <associations>\n"; + if(defined($assoc[$i]) && $assoc[$i] ne ""){ + chop($assoc[$i]); + my @usr = split(/[ ,]/,$assoc[$i]); + my (@us) = aggregate(@usr); + for(my $j=0;$j<$#us+1;$j+=2){ + print X " <term id=\"".$us[$j]."\" freq=\"".$us[$j+1]."\">".$unit[$us[$j]]."</term>\n"; + } + } + print X " </associations>\n"; + print X " <categories>\n"; + if(defined($ucats[$i]) && $ucats[$i] ne ""){ + my @ucs = split(/,/,$ucats[$i]); + foreach my $c (@ucs){ + print X " <category id=\"$c\">$cats[$c]</category>\n"; + } + } + print X " </categories>\n"; +} +progress("writing xml",$count,$total,1); +print X " </unit>\n"; +print X "</conceptnetwork>\n"; +close(X); + +__END__ + +=head1 NAME + +cn_txt2xml - Convert a concept network to single XML file. + +=head1 SYNOPSIS + +cn_txt2xml [options] domain + +Options: + + --[no]qfreq, -[no]q [do not] retrieve query frequencies + --[no]sqfreq, -[no]s [do not] retrieve term-sorted query frequencies + --[no]ext, -[no]e [do not] process extensions + --[no]assoc, -[no]a [do not] process associations + --[no]cat, -[no]c [do not] process categories + --[no]fsa, -[no]f [do not] build fsa + --stopwords=FILE, -w FILE use the given stopwords file + --output-file, -o output file + --verbose, -v be verbose + --help, -h brief help message + --man, -m full documentation + +=head1 OPTIONS + +=over 8 + +=item B<-help> + +Print a brief help message and exits. + +=item B<-man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<This program> will convert a concept network to a single XML file. +useful with the contents thereof. 
+ +=cut + diff --git a/fsa/src/util/cn_xml2dat b/fsa/src/util/cn_xml2dat new file mode 100755 index 00000000000..4394c201da8 --- /dev/null +++ b/fsa/src/util/cn_xml2dat @@ -0,0 +1,218 @@ +#!/usr/bin/perl + +use strict; + +use FSA; +use BerkeleyDB; +use Getopt::Long; +use Pod::Usage; + +# +# Process command line options. +# + +my $help = 0; +my $man = 0; +my $verbose = 0; +my $input_file = ''; +my $output_file = ''; + +my $result = GetOptions('help|h' => \$help, + 'man|m' => \$man, + 'verbose|v' => \$verbose, + 'input-file|i=s' => \$input_file, + 'output-file|o=s' => \$output_file, + ); + +pod2usage(1) if $help; +pod2usage(-verbose => 2) if $man; + +# +# Domain is a required parameter. +# + +my $domain = shift || die "need domain"; + + +my $MAGIC = 238579428; + +#*********************************************************** +# +# Functions +# +#*********************************************************** + + +sub msg($@){ + my $format = shift; + if($verbose){ + printf STDERR $format,@_; + } +} + +sub progress($$$$){ + my ($msg,$cnt,$total,$done) = @_; + + if($done){ + if($total>0){ + msg("\r%s ... %d of %d (%.2f%%) ... done.\n",$msg,$cnt,$total,100.0*$cnt/$total); + } + else { + msg("\r%s ... %d ... done.\n",$msg,$cnt); + } + } + elsif($cnt%1000==0){ + if($total>0){ + msg("\r%s ... %d of %d (%.2f%%)",$msg,$cnt,$total,100.0*$cnt/$total); + } + else { + msg("\r%s ... 
%d",$msg,$cnt); + } + } +} + +my @cats = (); + +my $index = ""; +my $extinfo = pack('L',0); # pack dummy word to make it easy to find empties +my $unitstr = ""; +my $catindex = ""; + +my $extptr = 1; +my $strptr = 0; + +my $maxfrq; +my $maxcfrq; +my $maxqfrq; +my $maxsfrq; +my $maxefrq; +my $maxafrq; +$maxfrq = $maxcfrq = $maxqfrq = $maxsfrq = $maxefrq = $maxafrq = 0; + + +my $count=0; +my @ext; + +if($input_file eq ""){ + $input_file = "${domain}.xml"; +} +open(X,"$input_file"); +my $line = <X>; +$line = <X>; +my ($cnid,$total) = $line=~/<conceptnetwork id=\"([^\"]*)\" unitcount=\"(\d*)\">/; +die "missing unit count ($total)" if($total<=0); +if($cnid ne $domain){ + msg("Warning! Domain \"%s\" does not match concept network id \"%s\".\n",$domain,$cnid); +} +while(<X>){ + if(/^\s*<unit/){ + $count++; progress("reading xml",$count,$total,0); + $line = <X>; + my ($id,$frq,$cfrq,$qfrq,$sfrq,$term) = $line=~/^\s*<term id=\"(\d*)\" freq=\"(\d*)\" cfreq=\"(\d*)\" qfreq=\"(\d*)\" gfreq=\"(\d*)\">([^<]*)<\/term>/; + + if($frq>$maxfrq) { $maxfrq = $frq; } + if($cfrq>$maxcfrq) { $maxcfrq = $cfrq; } + if($qfrq>$maxqfrq) { $maxqfrq = $qfrq; } + if($sfrq>$maxsfrq) { $maxsfrq = $sfrq; } + + $index .= pack('L',$strptr); # pack term + $unitstr .= pack('Z*',$term); + $strptr = length($unitstr); + $index .= pack('L',$frq); # pack frq + $index .= pack('L',$cfrq); # pack frq + $index .= pack('L',$qfrq); # pack frq + $index .= pack('L',$sfrq); # pack frq + + $line = <X>; + @ext = (); + EXT: + while($line = <X>){ + last EXT if($line=~/<\/extensions>/); + my ($id,$efrq) = ($line=~/^\s*<term id=\"(\d*)\" freq=\"(\d*)\">/); + push(@ext,$id); + push(@ext,$efrq); + if($efrq>$maxefrq) { $maxefrq = $efrq; } + } + + if($#ext==-1){ + $index .= pack('L',0); # pack empty ext + } + else { + $index .= pack('L',$extptr); # pack ext + $extinfo .= pack('L',($#ext+1)/2); + $extinfo .= pack('L*',@ext); + $extptr += $#ext+2; + } + + $line = <X>; + @ext = (); + ASSOC: + while($line = <X>){ + last ASSOC 
if($line=~/<\/associations>/); + my ($id,$afrq) = $line=~/^\s*<term id=\"(\d*)\" freq=\"(\d*)\">/; + push(@ext,$id); + push(@ext,$afrq); + if($afrq>$maxafrq) { $maxafrq = $afrq; } + } + + if($#ext==-1){ + $index .= pack('L',0); # pack empty assoc + } + else { + $index .= pack('L',$extptr); # pack assoc + $extinfo .= pack('L',($#ext+1)/2); + $extinfo .= pack('L*',@ext); + $extptr += $#ext+2; + } + + $line = <X>; + @ext = (); + CAT: + while($line = <X>){ + last CAT if($line=~/<\/categories>/); + my ($id,$cat) = $line=~/^\s*<category id=\"(\d*)\">([^<]*)<\/category>/; + if(!defined($cats[$id])){ + $cats[$id] = $cat; + } + push(@ext,$id); + } + + if($#ext==-1){ + $index .= pack('L',0); # pack empty cat + } + else { + $index .= pack('L',$extptr); # pack cat + $extinfo .= pack('L',$#ext+1); + $extinfo .= pack('L*',@ext); + $extptr += $#ext+2; + } + + } +} +close(X); +progress("reading xml",$count,$total,1); + +for(my $i=0;$i<=$#cats;$i++){ + $catindex .= pack('L',$strptr); # pack category names + $unitstr .= pack('Z*',$cats[$i]); + $strptr = length($unitstr); +} + + +msg("writing data file ... 
"); +if($output_file eq ""){ + $output_file = "$domain.dat"; +} +open(DAT,">$output_file"); +my $header = pack('L64',$MAGIC,0,0, + $count,$extptr,$#cats+1,$strptr, + $maxfrq,$maxcfrq,$maxqfrq,$maxsfrq,$maxefrq,$maxafrq, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); +print DAT $header; +print DAT $index; +print DAT $extinfo; +print DAT $catindex; +print DAT $unitstr; +close(DAT); +msg("done.\n"); diff --git a/fsa/src/util/fsadump/.gitignore b/fsa/src/util/fsadump/.gitignore new file mode 100644 index 00000000000..ba07f8761d4 --- /dev/null +++ b/fsa/src/util/fsadump/.gitignore @@ -0,0 +1,5 @@ +.deps +.libs +Makefile +Makefile.in +fsadump diff --git a/fsa/src/util/fsainfo/.gitignore b/fsa/src/util/fsainfo/.gitignore new file mode 100644 index 00000000000..bf788157708 --- /dev/null +++ b/fsa/src/util/fsainfo/.gitignore @@ -0,0 +1,5 @@ +.deps +.libs +Makefile +Makefile.in +fsainfo diff --git a/fsa/src/util/makefsa/.gitignore b/fsa/src/util/makefsa/.gitignore new file mode 100644 index 00000000000..83748e6a3e6 --- /dev/null +++ b/fsa/src/util/makefsa/.gitignore @@ -0,0 +1,5 @@ +.deps +.libs +Makefile +Makefile.in +makefsa diff --git a/fsa/src/vespa/.gitignore b/fsa/src/vespa/.gitignore new file mode 100644 index 00000000000..a728d158730 --- /dev/null +++ b/fsa/src/vespa/.gitignore @@ -0,0 +1,3 @@ +Makefile +.depend +libfsa*.so.* diff --git a/fsa/src/vespa/fsa/CMakeLists.txt b/fsa/src/vespa/fsa/CMakeLists.txt new file mode 100644 index 00000000000..dff59686894 --- /dev/null +++ b/fsa/src/vespa/fsa/CMakeLists.txt @@ -0,0 +1,44 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(fsa + SOURCES + automaton.cpp + base64.cpp + blob.cpp + conceptnet.cpp + detector.cpp + fsa.cpp + metadata.cpp + ngram.cpp + permuter.cpp + segmenter.cpp + selector.cpp + unicode.cpp + unicode_charprops.cpp + unicode_lowercase.cpp + unicode_tables.cpp + vectorizer.cpp + wordchartokenizer.cpp + INSTALL lib64 + DEPENDS +) + +install(FILES + automaton.h + base64.h + blob.h + checksum.h + conceptnet.h + detector.h + file.h + fsa.h + metadata.h + ngram.h + permuter.h + segmenter.h + selector.h + timestamp.h + tokenizer.h + unicode.h + vectorizer.h + wordchartokenizer.h + DESTINATION include/vespa/fsa) diff --git a/fsa/src/vespa/fsa/automaton-alternate.cpp b/fsa/src/vespa/fsa/automaton-alternate.cpp new file mode 100644 index 00000000000..c753ba9f844 --- /dev/null +++ b/fsa/src/vespa/fsa/automaton-alternate.cpp @@ -0,0 +1,846 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> // for ::read(), ::write(), etc. 
+#include <sys/stat.h> +#include <algorithm> // for std::sort<>(), std::equal_range<>() + +#include "fsa.h" +#include "automaton.h" +#include "checksum.h" + + +namespace fsa { + +// {{{ constants + +const uint32_t Automaton::PackedAutomaton::_ALLOC_CELLS; +const uint32_t Automaton::PackedAutomaton::_ALLOC_BLOB; +const uint32_t Automaton::PackedAutomaton::_BACKCHECK; + +const Blob Automaton::EMPTY_BLOB(""); + +// }}} + +// {{{ Automaton::TransitionList::operator<() + +bool Automaton::TransitionList::operator<(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return false; + if(_size<tl._size) return true; + if(_size>tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol<tl._trans[i]._symbol) return true; + if(_trans[i]._symbol>tl._trans[i]._symbol) return false; + if(_trans[i]._state<tl._trans[i]._state) return true; + if(_trans[i]._state>tl._trans[i]._state) return false; + } + return false; +} + +// }}} +// {{{ Automaton::TransitionList::operator>() + +bool Automaton::TransitionList::operator>(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return false; + if(_size>tl._size) return true; + if(_size<tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol>tl._trans[i]._symbol) return true; + if(_trans[i]._symbol<tl._trans[i]._symbol) return false; + if(_trans[i]._state>tl._trans[i]._state) return true; + if(_trans[i]._state<tl._trans[i]._state) return false; + } + return false; +} + +// }}} +// {{{ Automaton::TransitionList::operator==() + +bool Automaton::TransitionList::operator==(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return true; + if(_size!=tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol!=tl._trans[i]._symbol) return false; + if(_trans[i]._state!=tl._trans[i]._state) return false; + } + return true; +} + +// }}} + +// {{{ Automaton::PackedAutomaton::reset() + +void Automaton::PackedAutomaton::reset() +{ + _packable = 
false; + _blob_map.clear(); + if(_packed_ptr!=NULL){ + free(_packed_ptr); + _packed_ptr=NULL; + } + if(_packed_idx!=NULL){ + if(sizeof(State*)!=sizeof(state_t)){ + free(_packed_idx); + } + _packed_idx=NULL; + } + if(_symbol!=NULL){ + free(_symbol); + _symbol=NULL; + } + if(_used!=NULL){ + free(_used); + _used=NULL; + } + if(_perf_hash!=NULL){ + free(_perf_hash); + _perf_hash=NULL; + } + if(_totals!=NULL){ + free(_totals); + _totals=NULL; + } + _packed_size=0; + _last_packed=0; + if(_blob!=NULL){ + free(_blob); + _blob=NULL; + } + _blob_size=0; + _blob_used=0; + _blob_type=FSA::DATA_VARIABLE; + _fixed_blob_size=0; + _start_state=0; +} + +// }}} +// {{{ Automaton::PackedAutomaton::init() + +void Automaton::PackedAutomaton::init() +{ + reset(); + + _packed_ptr = (State**)malloc(_ALLOC_CELLS*sizeof(State*)); + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx = (state_t*)malloc(_ALLOC_CELLS*sizeof(state_t)); + } + else { + _packed_idx = (state_t*)_packed_ptr; + } + _symbol = (symbol_t*)malloc(_ALLOC_CELLS*sizeof(symbol_t)); + _used = (bool*)malloc(_ALLOC_CELLS*sizeof(bool)); + _packed_size = _ALLOC_CELLS; + + assert(_packed_ptr!=NULL && _packed_idx!=NULL && _symbol!=NULL && _used!=NULL); + + for(uint32_t i=0;i<_packed_size;i++){ + _used[i] = false; + _symbol[i] = FSA::EMPTY_SYMBOL; + _packed_ptr[i] = NULL; + } + if(sizeof(State*)!=sizeof(state_t)){ + for(uint32_t i=0;i<_packed_size;i++){ + _packed_idx[i] = 0; + } + } + + _blob = (data_t*)malloc(_ALLOC_BLOB); + _blob_size = _ALLOC_BLOB; + + assert(_blob!=NULL); + + _packable = true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::expandCells() + +void Automaton::PackedAutomaton::expandCells() +{ + uint32_t i; + + _packed_ptr = (State**)realloc(_packed_ptr,(_packed_size+_ALLOC_CELLS)*sizeof(State*)); + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx = (state_t*)realloc(_packed_idx,(_packed_size+_ALLOC_CELLS)*sizeof(state_t)); + } + else { + _packed_idx = (state_t*)_packed_ptr; + } + _symbol = 
(symbol_t*)realloc(_symbol,(_packed_size+_ALLOC_CELLS)*sizeof(symbol_t)); + _used = (bool*)realloc(_used,(_packed_size+_ALLOC_CELLS)*sizeof(bool)); + + assert(_packed_ptr!=NULL && _packed_idx!=NULL && _symbol!=NULL && _used!=NULL); + + for(i=_packed_size;i<_packed_size+_ALLOC_CELLS;i++){ + _used[i] = false; + _symbol[i] = FSA::EMPTY_SYMBOL; + _packed_ptr[i] = NULL; + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx[i] = 0; + } + } + _packed_size += _ALLOC_CELLS; +} + +// }}} +// {{{ Automaton::PackedAutomaton::expandBlob() + +void Automaton::PackedAutomaton::expandBlob(uint32_t minExpand) +{ + uint32_t expand=(minExpand/_ALLOC_BLOB+1)*_ALLOC_BLOB; + + _blob = (data_t*)realloc(_blob,_blob_size+expand); + + assert(_blob!=NULL); + + _blob_size += expand; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getEmptyCell() + +uint32_t Automaton::PackedAutomaton::getEmptyCell() +{ + unsigned int cell = _last_packed>_BACKCHECK?_last_packed-_BACKCHECK:1; + while(_used[cell]){ + cell++; + if(cell+256>=_packed_size) + expandCells(); + } + + _used[cell] = true; + + return cell; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getCell() + +uint32_t Automaton::PackedAutomaton::getCell(const Automaton::SymList &t) +{ + SymListConstIterator tit; + uint32_t cell = _last_packed>_BACKCHECK?_last_packed-_BACKCHECK:1; + bool found = false; + while(!found){ + if(!_used[cell]){ + if(cell+256>=_packed_size) + expandCells(); + for(tit=t.begin();tit!=t.end();++tit){ + if(_symbol[cell+*tit]!=FSA::EMPTY_SYMBOL) + break; + } + if(tit==t.end()) + found=true; + } + if(!found){ + cell++; + if(cell>=_packed_size) + expandCells(); + } + } + _used[cell] = true; + for(tit=t.begin();tit!=t.end();++tit){ + _symbol[cell+*tit] = *tit; + } + + return cell; +} + +// }}} +// {{{ Automaton::PackedAutomaton::packState() + +bool Automaton::PackedAutomaton::packState(Automaton::StateCellArrayIterator &it) +{ + SymList transitions; + uint32_t cell; + size_t i; + const TransitionList &tlist = 
it->state->getTransitionList(); + + if(_packable){ + if(tlist.size()==0){ + cell = getEmptyCell(); + } + else{ + for(i=0; i<tlist.size(); i++){ + transitions.push_back(tlist[i]._symbol); + } + transitions.sort(); + cell = getCell(transitions); + for(i=0; i<tlist.size(); i++){ + if(tlist[i]._symbol==FSA::FINAL_SYMBOL){ + _packed_idx[cell+FSA::FINAL_SYMBOL] = packBlob(tlist[i]._state->getBlob()); + } + else{ + _packed_ptr[cell+tlist[i]._symbol] = tlist[i]._state; + } + } + } + + it->cell = cell; + if(cell>_last_packed) + _last_packed = cell; + + return true; + } + + return false; +} + +// }}} +// {{{ Automaton::PackedAutomaton::packBlob() + +static const Blob nullBlob; + +uint32_t Automaton::PackedAutomaton::packBlob(const Blob *b) +{ + PackMapIterator pi = _blob_map.find(b); + if(pi!=_blob_map.end()){ + return pi->second; + } + else { + uint32_t cell=_blob_used; + _blob_map[b]=cell; + if(b==NULL){ + b=&nullBlob; + } + uint32_t size=b->size(); + if(_blob_used+size+sizeof(uint32_t)>_blob_size) + expandBlob(size+sizeof(uint32_t)); + memcpy(_blob+_blob_used,&size,sizeof(uint32_t)); + memcpy(_blob+_blob_used+sizeof(uint32_t),b->data(),size); + _blob_used += size+sizeof(uint32_t); + + return cell; + } +} + +// }}} +// {{{ Automaton::PackedAutomaton::finalize() + +void Automaton::PackedAutomaton::finalize(const StateCellArray &queue) +{ + uint32_t i; + + if(_packable){ + for(i=0;i<_last_packed+256;i++){ + if(i>=_packed_size) // this shouldn't happen anymore, but check anyway + expandCells(); + if(_symbol[i]!=FSA::EMPTY_SYMBOL && _symbol[i]!=FSA::FINAL_SYMBOL){ + //@@@@@@ probably faster to write a custom binary search + _packed_idx[i] = std::equal_range(queue.begin(), queue.end(), StateCellArrayItem(_packed_ptr[i]), StateCellArrayLess()).first->cell; + } + } + + // compact blobs if the size is constant + std::map<uint32_t,uint32_t> bcomp; + std::map<uint32_t,uint32_t>::iterator bcomp_it; + bcomp[0]=0; + uint32_t lastsize = *((uint32_t*)_blob), currsize; + uint32_t 
i=lastsize+sizeof(uint32_t); + uint32_t j=lastsize; + bool fixedsize = true; + while(i<_blob_used){ + currsize = *((uint32_t*)(_blob+i)); + if(currsize!=lastsize){ + fixedsize = false; + break; + } + bcomp[i]=j; + i+=currsize+sizeof(uint32_t); + j+=currsize; + } + if(fixedsize){ + _blob_type = FSA::DATA_FIXED; + _fixed_blob_size = lastsize; + _blob_used = j; + for(i=0;i<_last_packed+256;i++){ + if(_symbol[i]==FSA::FINAL_SYMBOL){ + _packed_idx[i] = bcomp[_packed_idx[i]]; + } + } + + for(bcomp_it = bcomp.begin(); bcomp_it!=bcomp.end(); ++bcomp_it){ + memmove(_blob+(bcomp_it->second),_blob+(bcomp_it->first+sizeof(uint32_t)),lastsize); + } + } + + _packable = false; + } +} + +// }}} +// {{{ Automaton::PackedAutomaton::computePerfectHash() + +hash_t Automaton::PackedAutomaton::computePerfectHash(state_t state) +{ + symbol_t s; + hash_t count; + + if(_totals[state]!=0){ + return _totals[state]; + } + + count = (_symbol[state+FSA::FINAL_SYMBOL]==FSA::FINAL_SYMBOL) ? 1 : 0; + + for(s=1;s<=254;s++){ + if(_symbol[state+s]==s){ + _perf_hash[state+s] = count; + count += computePerfectHash(_packed_idx[state+s]); + } + } + + _totals[state] = count; + + return count; +} + +// }}} +// {{{ Automaton::PackedAutomaton::addPerfectHash() + +void Automaton::PackedAutomaton::addPerfectHash() +{ + if(_last_packed==0 || _packable){ + // do nothing with an empty automaton or one which has not been finalized + return; + } + + uint32_t size = _last_packed+256; + + _perf_hash = (hash_t*)malloc(size*sizeof(hash_t)); + _totals = (hash_t*)malloc(size*sizeof(hash_t)); + + assert(_perf_hash!=NULL && _totals!=NULL); + + for(unsigned int i=0;i<size;i++){ + _perf_hash[i] = 0; + _totals[i] = 0; + } + + computePerfectHash(_start_state); + + free(_totals); _totals=NULL; +} + +// }}} +// {{{ Automaton::PackedAutomaton::lookup() + +const data_t* Automaton::PackedAutomaton::lookup(const char *input) const +{ + if(_packable || _start_state==0){ + return NULL; + } + state_t state = _start_state; + const char 
*p=input; + while(*p){ + if(_symbol[state+*p]==*p){ + state=_packed_idx[state+*p]; + p++; + } + else{ + return NULL; + } + } + if(_symbol[state+FSA::FINAL_SYMBOL]==FSA::FINAL_SYMBOL){ + return _blob+_packed_idx[state+FSA::FINAL_SYMBOL]; + } + return NULL; +} + +// }}} +// {{{ Automaton::PackedAutomaton::write() + +bool Automaton::PackedAutomaton::write(const char *filename, uint32_t serial) +{ + if(_packable || _packed_size==0) // must be non-empty and finalized + return false; + + FSA::Header header; + + header._magic = FSA::MAGIC; + header._version = FSA::VER; + header._checksum = 0; + header._size = _last_packed+256; + header._start = _start_state; + header._data_size = _blob_used; + header._data_type = _blob_type; + header._fixed_data_size = _fixed_blob_size; + header._has_perfect_hash = (_perf_hash==NULL) ? 0 : 1; + header._serial = serial; + memset(&(header._reserved), 0, sizeof(header._reserved)); + + int fd = open(filename,O_CREAT|O_TRUNC|O_RDWR,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); + if(fd<0) return false; + + header._checksum += Checksum::compute(_symbol,header._size*sizeof(symbol_t)); + header._checksum += Checksum::compute(_packed_idx,header._size*sizeof(state_t)); + header._checksum += Checksum::compute(_blob,_blob_used); + if(header._has_perfect_hash){ + header._checksum += Checksum::compute(_perf_hash,header._size*sizeof(hash_t)); + } + + ::write(fd,&header,sizeof(header)); + ::write(fd,_symbol,header._size*(sizeof(symbol_t))); + ::write(fd,_packed_idx,header._size*(sizeof(state_t))); + ::write(fd,_blob,_blob_used); + if(header._has_perfect_hash){ + ::write(fd,_perf_hash,header._size*(sizeof(hash_t))); + } + close(fd); + + return true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::read() + +bool Automaton::PackedAutomaton::read(const char *filename) +{ + FSA::Header header; + size_t r; + + reset(); + int fd = ::open(filename,O_RDONLY); + if(fd<0){ + return false; + } + r=::read(fd,&header,sizeof(header)); + if(r<sizeof(header) || 
header._magic!=FSA::MAGIC){ + ::close(fd); + return false; + } + + _packable = false; + _packed_size = header._size; + _last_packed = _packed_size-256; + _blob_size = header._data_size; + _blob_used = header._data_size; + _blob_type = header._data_type; + _fixed_blob_size = header._fixed_data_size; + _start_state = header._start; + + _symbol = (symbol_t*)malloc(_packed_size*sizeof(symbol_t)); + assert(_symbol!=NULL); + ::read(fd,_symbol,_packed_size*(sizeof(symbol_t))); + _packed_idx = (state_t*)malloc(_packed_size*sizeof(state_t)); + assert(_packed_idx!=NULL); + ::read(fd,_packed_idx,_packed_size*(sizeof(state_t))); + _blob = (data_t*)malloc(_blob_used); + assert(_blob!=NULL); + ::read(fd,_blob,_blob_used); + if(header._has_perfect_hash){ + _perf_hash = (hash_t*)malloc(_packed_size*sizeof(hash_t)); + assert(_perf_hash!=NULL); + ::read(fd,_perf_hash,_packed_size*(sizeof(hash_t))); + } + + ::close(fd); + + return true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getFSA() + +bool Automaton::PackedAutomaton::getFSA(FSA::Descriptor &d) +{ + if(_packable || _packed_size==0) // must be non-empty and finalized + return false; + + uint32_t size = _last_packed+256; + + _symbol = (symbol_t*)realloc(_symbol,size*sizeof(symbol_t)); + _packed_idx = (state_t*)realloc(_packed_idx,size*sizeof(state_t)); + _blob = (data_t*)realloc(_blob,_blob_used); + if(_perf_hash!=NULL){ + _perf_hash = (hash_t*)realloc(_perf_hash,size*sizeof(hash_t)); + } + + d._version = FSA::VER; + d._serial = 0; + d._state = _packed_idx; + d._symbol = _symbol; + d._size = size; + d._data = _blob; + d._data_size = _blob_used; + d._data_type = _blob_type; + d._fixed_data_size = _fixed_blob_size; + d._perf_hash = _perf_hash; + d._start = _start_state; + + _symbol = NULL; + _packed_idx = NULL; + if(sizeof(State*)==sizeof(state_t)){ // _packed_idx and _packed_ptr are overlayed + _packed_ptr=NULL; + } + _blob = NULL; + _perf_hash = NULL; + reset(); + + return true; +} + +// }}} + +// {{{ Automaton::cleanUp() + 
+void Automaton::cleanUp() +{ + if(_q0!=NULL){ + finalize(); // make sure all states are in _register + for(BlobRegisterIterator bi = _blob_register.begin(); bi!=_blob_register.end(); ++bi){ + delete bi->second; + } + _blob_register.clear(); // clear _blob_register + // clear _register and remove all states +#if 0 + // In the previous 1-pass method (without _queue), the _register owned + // the memory for all states so we cleaned up this way: + for(RegisterIterator ri = _register.begin(); ri!=_register.end(); ++ri){ + delete ri->second; + } +#else + if(_queue) { + for(StateArrayIterator qi=_queue->begin(); qi!=_queue->end(); ++qi){ + if(*qi!=_q0) // _q0 may or may not be in the queue so we don't want to double-free it + delete *qi; + } + delete _queue; + _queue = NULL; + } +#endif + delete _register; + _register = NULL; + delete _q0; + _q0 = NULL; + } +} + +// }}} +// {{{ Automaton::~Automaton() + +Automaton::~Automaton() +{ + cleanUp(); +} + +// }}} +// {{{ Automaton::getCPLastState() + +Automaton::State* Automaton::getCPLastState(const char *input, const char *&suffix) +{ + if(_q0==NULL) return NULL; + + unsigned int l=0; + State* state = _q0; + State* next; + while(input[l]!=0){ + next = state->child(input[l]); + if(next==NULL){ + suffix=input+l; + return state; + } + state=next; + l++; + } + suffix=input+l; + return state; +} + +// }}} +// {{{ Automaton::addSuffix() + +void Automaton::addSuffix(State* state, const char *suffix, const Blob *b) +{ + State* current = state; + State* child; + + while(*suffix != 0){ + child = current->addEmptyChild(*suffix); + current = child; + suffix++; + } + BlobRegisterIterator bi; + if(b!=NULL) + bi = _blob_register.find(*b); + else + bi = _blob_register.find(EMPTY_BLOB); + if(bi!=_blob_register.end()){ + child = bi->second; + current->addChild(FSA::FINAL_SYMBOL,child); + } + else { + const Blob *bcopy = (b==NULL) ? 
new Blob(EMPTY_BLOB) : new Blob(*b); + assert(bcopy!=NULL); + child = current->addEmptyChild(FSA::FINAL_SYMBOL,bcopy); + _blob_register[*bcopy] = child; + } +} + +// }}} +// {{{ Automaton::init() + +void Automaton::init() +{ + cleanUp(); + _register = new Register(); + _q0 = new State(); + _queue = new StateArray(); + assert(_q0!=NULL); + _finalized = false; + + _packed.init(); +} + +// }}} +// {{{ Automaton::finalize() + +void Automaton::finalize() +{ + if(!_finalized && _q0!=NULL){ + replaceOrRegister(_q0); + // + // 2nd-pass begin; clear the _register to free up memory, then pack queued states: + // + delete _register; + _register = NULL; + _queue->push_back(_q0); + std::sort(_queue->begin(), _queue->end(), StateArrayLess()); + // now that _register memory is freed up, transfer StateArray into StateCellArray for packing: + StateCellArray queue(_queue->size()); + for(size_t i=0; i < queue.size(); i++){ + queue[i].state = _queue->operator[](i); + queue[i].cell = 0; + } + delete _queue; + _queue = NULL; + for(StateCellArrayIterator it=queue.begin(); it!=queue.end(); ++it){ + _packed.packState(it); + if(it->state == _q0) + _packed.setStartState(it->cell); + } + // clean up queue + for(StateCellArrayIterator it=queue.begin(); it!=queue.end(); ++it){ + if(it->state!=_q0) + delete it->state; + } + // + // 2nd-pass end + // + _packed.finalize(queue); + _finalized = true; + } +} + +// }}} +// {{{ Automaton::addPerfectHash() + +void Automaton::addPerfectHash() +{ + if(_finalized){ + _packed.addPerfectHash(); + } +} + +// }}} +// {{{ Automaton::write() + +bool Automaton::write(const char *file, uint32_t serial) +{ + if(!_finalized){ + finalize(); + } + return _packed.write(file,serial); +} + +// }}} +// {{{ Automaton::getFSA() + +FSA* Automaton::getFSA() +{ + if(!_finalized){ + finalize(); + } + + FSA::Descriptor d; + + if(!_packed.getFSA(d)) + return NULL; + + FSA *fsa = new FSA(d); + + cleanUp(); + + return fsa; +} + +// }}} +// {{{ Automaton::insertSortedString() + 
+void Automaton::insertSortedString(const std::string &input) +{ + insertSortedString(input.c_str()); +} + +void Automaton::insertSortedString(const std::string &input, const std::string &meta) +{ + Blob b(meta); + insertSortedString(input.c_str(),&b); +} + +void Automaton::insertSortedString(const char *input, const Blob& b) +{ + insertSortedString(input,&b); +} + +void Automaton::insertSortedString(const char *input, const Blob* b) +{ + if(_q0==NULL || _finalized) return; + + const char* currentSuffix; + State* lastState = getCPLastState(input, currentSuffix); + + if(lastState->hasChildren()){ + replaceOrRegister(lastState); + } + addSuffix(lastState,currentSuffix,b); +} + +// }}} +// {{{ Automaton::replaceOrRegister() + +void Automaton::replaceOrRegister(Automaton::State* state) +{ + State* child = state->lastChild(); + if(child!=NULL){ + if(child->hasChildren()){ + replaceOrRegister(child); + } + RegisterIterator ri = _register->find(&(child->getTransitionList())); + if(ri!=_register->end() && ri->second!=child){ + state->updateLastChild(ri->second); + delete child; + } + else { + (*_register)[&(child->getTransitionList())] = child; +#if 0 + // In the previous 1-pass method (without _queue), we packed states as + // we went: + _packed.packState(child); +#else + // Now we queue them up to be packed after _register memory is reclaimed: + _queue->push_back(child); +#endif + } + } +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/automaton.cpp b/fsa/src/vespa/fsa/automaton.cpp new file mode 100644 index 00000000000..dffca7739ff --- /dev/null +++ b/fsa/src/vespa/fsa/automaton.cpp @@ -0,0 +1,824 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> // for ;:read(), ::write(), etc. 
+#include <sys/stat.h> + +#include "fsa.h" +#include "automaton.h" +#include "checksum.h" + + +namespace fsa { + +// {{{ constants + +const uint32_t Automaton::PackedAutomaton::_ALLOC_CELLS; +const uint32_t Automaton::PackedAutomaton::_ALLOC_BLOB; +const uint32_t Automaton::PackedAutomaton::_BACKCHECK; + +const Blob Automaton::EMPTY_BLOB(""); + +// }}} + +// {{{ Automaton::TransitionList::operator<() + +bool Automaton::TransitionList::operator<(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return false; + if(_size<tl._size) return true; + if(_size>tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol<tl._trans[i]._symbol) return true; + if(_trans[i]._symbol>tl._trans[i]._symbol) return false; + if(_trans[i]._state<tl._trans[i]._state) return true; + if(_trans[i]._state>tl._trans[i]._state) return false; + } + return false; +} + +// }}} +// {{{ Automaton::TransitionList::operator>() + +bool Automaton::TransitionList::operator>(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return false; + if(_size>tl._size) return true; + if(_size<tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol>tl._trans[i]._symbol) return true; + if(_trans[i]._symbol<tl._trans[i]._symbol) return false; + if(_trans[i]._state>tl._trans[i]._state) return true; + if(_trans[i]._state<tl._trans[i]._state) return false; + } + return false; +} + +// }}} +// {{{ Automaton::TransitionList::operator==() + +bool Automaton::TransitionList::operator==(const Automaton::TransitionList& tl) const +{ + if(this==&tl) return true; + if(_size!=tl._size) return false; + for(unsigned int i=0; i<_size;i++){ + if(_trans[i]._symbol!=tl._trans[i]._symbol) return false; + if(_trans[i]._state!=tl._trans[i]._state) return false; + } + return true; +} + +// }}} + +// {{{ Automaton::PackedAutomaton::reset() + +void Automaton::PackedAutomaton::reset() +{ + _packable = false; + _pack_map.clear(); + _blob_map.clear(); + 
if(_packed_ptr!=NULL){ + free(_packed_ptr); + _packed_ptr=NULL; + } + if(_packed_idx!=NULL){ + if(sizeof(State*)!=sizeof(state_t)){ + free(_packed_idx); + } + _packed_idx=NULL; + } + if(_symbol!=NULL){ + free(_symbol); + _symbol=NULL; + } + if(_used!=NULL){ + free(_used); + _used=NULL; + } + if(_perf_hash!=NULL){ + free(_perf_hash); + _perf_hash=NULL; + } + if(_totals!=NULL){ + free(_totals); + _totals=NULL; + } + _packed_size=0; + _last_packed=0; + if(_blob!=NULL){ + free(_blob); + _blob=NULL; + } + _blob_size=0; + _blob_used=0; + _blob_type=FSA::DATA_VARIABLE; + _fixed_blob_size=0; + _start_state=0; +} + +// }}} +// {{{ Automaton::PackedAutomaton::init() + +void Automaton::PackedAutomaton::init() +{ + reset(); + + _packed_ptr = (State**)malloc(_ALLOC_CELLS*sizeof(State*)); + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx = (state_t*)malloc(_ALLOC_CELLS*sizeof(state_t)); + } + else { + _packed_idx = (state_t*)_packed_ptr; + } + _symbol = (symbol_t*)malloc(_ALLOC_CELLS*sizeof(symbol_t)); + _used = (bool*)malloc(_ALLOC_CELLS*sizeof(bool)); + _packed_size = _ALLOC_CELLS; + + assert(_packed_ptr!=NULL && _packed_idx!=NULL && _symbol!=NULL && _used!=NULL); + + for(uint32_t i=0;i<_packed_size;i++){ + _used[i] = false; + _symbol[i] = FSA::EMPTY_SYMBOL; + _packed_ptr[i] = NULL; + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx[i] = 0; + } + } + + _blob = (data_t*)malloc(_ALLOC_BLOB); + _blob_size = _ALLOC_BLOB; + + assert(_blob!=NULL); + + _packable = true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::expandCells() + +void Automaton::PackedAutomaton::expandCells() +{ + uint32_t i; + + _packed_ptr = (State**)realloc(_packed_ptr,(_packed_size+_ALLOC_CELLS)*sizeof(State*)); + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx = (state_t*)realloc(_packed_idx,(_packed_size+_ALLOC_CELLS)*sizeof(state_t)); + } + else { + _packed_idx = (state_t*)_packed_ptr; + } + _symbol = (symbol_t*)realloc(_symbol,(_packed_size+_ALLOC_CELLS)*sizeof(symbol_t)); + _used = 
(bool*)realloc(_used,(_packed_size+_ALLOC_CELLS)*sizeof(bool)); + + assert(_packed_ptr!=NULL && _packed_idx!=NULL && _symbol!=NULL && _used!=NULL); + + for(i=_packed_size;i<_packed_size+_ALLOC_CELLS;i++){ + _used[i] = false; + _symbol[i] = FSA::EMPTY_SYMBOL; + _packed_ptr[i] = NULL; + if(sizeof(State*)!=sizeof(state_t)){ + _packed_idx[i] = 0; + } + } + _packed_size += _ALLOC_CELLS; +} + +// }}} +// {{{ Automaton::PackedAutomaton::expandBlob() + +void Automaton::PackedAutomaton::expandBlob(uint32_t minExpand) +{ + uint32_t expand=(minExpand/_ALLOC_BLOB+1)*_ALLOC_BLOB; + + _blob = (data_t*)realloc(_blob,_blob_size+expand); + + assert(_blob!=NULL); + + _blob_size += expand; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getEmptyCell() + +uint32_t Automaton::PackedAutomaton::getEmptyCell() +{ + unsigned int cell = _last_packed>_BACKCHECK?_last_packed-_BACKCHECK:1; + while(_used[cell]){ + cell++; + if(cell+256>=_packed_size) + expandCells(); + } + + _used[cell] = true; + + return cell; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getCell() + +uint32_t Automaton::PackedAutomaton::getCell(Automaton::SymList t) +{ + SymListIterator tit; + uint32_t cell = _last_packed>_BACKCHECK?_last_packed-_BACKCHECK:1; + bool found = false; + while(!found){ + if(!_used[cell]){ + if(cell+256>=_packed_size) + expandCells(); + for(tit=t.begin();tit!=t.end();++tit){ + if(_symbol[cell+*tit]!=FSA::EMPTY_SYMBOL) + break; + } + if(tit==t.end()) + found=true; + } + if(!found){ + cell++; + if(cell>=_packed_size) + expandCells(); + } + } + _used[cell] = true; + for(tit=t.begin();tit!=t.end();++tit){ + _symbol[cell+*tit] = *tit; + } + + return cell; +} + +// }}} +// {{{ Automaton::PackedAutomaton::packStartState() + +bool Automaton::PackedAutomaton::packStartState(const Automaton::State *s) +{ + return packState(s,true); +} + +// }}} +// {{{ Automaton::PackedAutomaton::packState() + +bool Automaton::PackedAutomaton::packState(const Automaton::State *s, bool start) +{ + SymList transitions; + 
uint32_t cell; + size_t i; + + if(_packable){ + if(s->getTransitionList().size()==0){ + cell = getEmptyCell(); + } + else{ + for(i=0; i<s->getTransitionList().size(); i++){ + transitions.push_back(s->getTransitionList()[i]._symbol); + } + transitions.sort(); + cell = getCell(transitions); + for(i=0; i<s->getTransitionList().size(); i++){ + if(s->getTransitionList()[i]._symbol==FSA::FINAL_SYMBOL){ + _packed_idx[cell+FSA::FINAL_SYMBOL] = + packBlob(s->getTransitionList()[i]._state->getBlob()); + } + else{ + _packed_ptr[cell+s->getTransitionList()[i]._symbol] = + s->getTransitionList()[i]._state; + } + } + } + + _pack_map[s] = cell; + if(cell>_last_packed) + _last_packed = cell; + if(start) + _start_state=(state_t)cell; + + return true; + } + + return false; + +} + +// }}} +// {{{ Automaton::PackedAutomaton::packBlob() + +static const Blob nullBlob; + +uint32_t Automaton::PackedAutomaton::packBlob(const Blob *b) +{ + PackMapIterator pi = _blob_map.find(b); + if(pi!=_blob_map.end()){ + return pi->second; + } + else { + uint32_t cell=_blob_used; + _blob_map[b]=cell; + if(b==NULL){ + b=&nullBlob; + } + uint32_t size=b->size(); + if(_blob_used+size+sizeof(uint32_t)>_blob_size) + expandBlob(size+sizeof(uint32_t)); + memcpy(_blob+_blob_used,&size,sizeof(uint32_t)); + memcpy(_blob+_blob_used+sizeof(uint32_t),b->data(),size); + _blob_used += size+sizeof(uint32_t); + + return cell; + } +} + +// }}} +// {{{ Automaton::PackedAutomaton::finalize() + +void Automaton::PackedAutomaton::finalize() +{ + if(_packable){ + for(uint32_t i=0;i<_last_packed+256;i++){ + if(i>=_packed_size) // this shouldn't happen anymore, but check anyway + expandCells(); + if(_symbol[i]!=FSA::EMPTY_SYMBOL && _symbol[i]!=FSA::FINAL_SYMBOL){ + _packed_idx[i] = _pack_map[_packed_ptr[i]]; + } + } + + // compact blobs if the size is constant + std::map<uint32_t,uint32_t> bcomp; + std::map<uint32_t,uint32_t>::iterator bcomp_it; + bcomp[0]=0; + uint32_t lastsize = *((uint32_t*)_blob), currsize; + uint32_t 
i=lastsize+sizeof(uint32_t); + uint32_t j=lastsize; + bool fixedsize = true; + while(i<_blob_used){ + currsize = *((uint32_t*)(_blob+i)); + if(currsize!=lastsize){ + fixedsize = false; + break; + } + bcomp[i]=j; + i+=currsize+sizeof(uint32_t); + j+=currsize; + } + if(fixedsize){ + _blob_type = FSA::DATA_FIXED; + _fixed_blob_size = lastsize; + _blob_used = j; + for(i=0;i<_last_packed+256;i++){ + if(_symbol[i]==FSA::FINAL_SYMBOL){ + _packed_idx[i] = bcomp[_packed_idx[i]]; + } + } + + for(bcomp_it = bcomp.begin(); bcomp_it!=bcomp.end(); ++bcomp_it){ + memmove(_blob+(bcomp_it->second),_blob+(bcomp_it->first+sizeof(uint32_t)),lastsize); + } + } + + _packable = false; + } +} + +// }}} +// {{{ Automaton::PackedAutomaton::computePerfectHash() + +hash_t Automaton::PackedAutomaton::computePerfectHash(state_t state) +{ + symbol_t s; + hash_t count; + + if(_totals[state]!=0){ + return _totals[state]; + } + + count = (_symbol[state+FSA::FINAL_SYMBOL]==FSA::FINAL_SYMBOL) ? 1 : 0; + + for(s=1;s<=254;s++){ + if(_symbol[state+s]==s){ + _perf_hash[state+s] = count; + count += computePerfectHash(_packed_idx[state+s]); + } + } + + _totals[state] = count; + + return count; +} + +// }}} +// {{{ Automaton::PackedAutomaton::addPerfectHash() + +void Automaton::PackedAutomaton::addPerfectHash() +{ + if(_last_packed==0 || _packable){ + // do nothing with an empty automaton or one which has not been finalized + return; + } + + uint32_t size = _last_packed+256; + + _perf_hash = (hash_t*)malloc(size*sizeof(hash_t)); + _totals = (hash_t*)malloc(size*sizeof(hash_t)); + + assert(_perf_hash!=NULL && _totals!=NULL); + + for(unsigned int i=0;i<size;i++){ + _perf_hash[i] = 0; + _totals[i] = 0; + } + + computePerfectHash(_start_state); + + free(_totals); _totals=NULL; +} + +// }}} +// {{{ Automaton::PackedAutomaton::lookup() + +const data_t* Automaton::PackedAutomaton::lookup(const char *input) const +{ + if(_packable || _start_state==0){ + return NULL; + } + state_t state = _start_state; + const char 
*p=input; + while(*p){ + if(_symbol[state+*p]==*p){ + state=_packed_idx[state+*p]; + p++; + } + else{ + return NULL; + } + } + if(_symbol[state+FSA::FINAL_SYMBOL]==FSA::FINAL_SYMBOL){ + return _blob+_packed_idx[state+FSA::FINAL_SYMBOL]; + } + return NULL; +} + +// }}} +// {{{ Automaton::PackedAutomaton::write() + +bool Automaton::PackedAutomaton::write(const char *filename, uint32_t serial) +{ + if(_packable || _packed_size==0) // must be non-empty and finalized + return false; + + FSA::Header header; + + header._magic = FSA::MAGIC; + header._version = FSA::VER; + header._checksum = 0; + header._size = _last_packed+256; + header._start = _start_state; + header._data_size = _blob_used; + header._data_type = _blob_type; + header._fixed_data_size = _fixed_blob_size; + header._has_perfect_hash = (_perf_hash==NULL) ? 0 : 1; + header._serial = serial; + memset(&(header._reserved), 0, sizeof(header._reserved)); + + int fd = open(filename,O_CREAT|O_TRUNC|O_RDWR,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); + if(fd<0) return false; + + header._checksum += Checksum::compute(_symbol,header._size*sizeof(symbol_t)); + header._checksum += Checksum::compute(_packed_idx,header._size*sizeof(state_t)); + header._checksum += Checksum::compute(_blob,_blob_used); + if(header._has_perfect_hash){ + header._checksum += Checksum::compute(_perf_hash,header._size*sizeof(hash_t)); + } + + ::write(fd,&header,sizeof(header)); + ::write(fd,_symbol,header._size*(sizeof(symbol_t))); + ::write(fd,_packed_idx,header._size*(sizeof(state_t))); + ::write(fd,_blob,_blob_used); + if(header._has_perfect_hash){ + ::write(fd,_perf_hash,header._size*(sizeof(hash_t))); + } + close(fd); + + return true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::read() + +bool Automaton::PackedAutomaton::read(const char *filename) +{ + FSA::Header header; + size_t r; + + reset(); + int fd = ::open(filename,O_RDONLY); + if(fd<0){ + return false; + } + r=::read(fd,&header,sizeof(header)); + if(r<sizeof(header) || 
header._magic!=FSA::MAGIC){ + ::close(fd); + return false; + } + + _packable = false; + _packed_size = header._size; + _last_packed = _packed_size-256; + _blob_size = header._data_size; + _blob_used = header._data_size; + _blob_type = header._data_type; + _fixed_blob_size = header._fixed_data_size; + _start_state = header._start; + + _symbol = (symbol_t*)malloc(_packed_size*sizeof(symbol_t)); + assert(_symbol!=NULL); + ::read(fd,_symbol,_packed_size*(sizeof(symbol_t))); + _packed_idx = (state_t*)malloc(_packed_size*sizeof(state_t)); + assert(_packed_idx!=NULL); + ::read(fd,_packed_idx,_packed_size*(sizeof(state_t))); + _blob = (data_t*)malloc(_blob_used); + assert(_blob!=NULL); + ::read(fd,_blob,_blob_used); + if(header._has_perfect_hash){ + _perf_hash = (hash_t*)malloc(_packed_size*sizeof(hash_t)); + assert(_perf_hash!=NULL); + ::read(fd,_perf_hash,_packed_size*(sizeof(hash_t))); + } + + ::close(fd); + + return true; +} + +// }}} +// {{{ Automaton::PackedAutomaton::getFSA() + +bool Automaton::PackedAutomaton::getFSA(FSA::Descriptor &d) +{ + if(_packable || _packed_size==0) // must be non-empty and finalized + return false; + + uint32_t size = _last_packed+256; + + _symbol = (symbol_t*)realloc(_symbol,size*sizeof(symbol_t)); + _packed_idx = (state_t*)realloc(_packed_idx,size*sizeof(state_t)); + _blob = (data_t*)realloc(_blob,_blob_used); + if(_perf_hash!=NULL){ + _perf_hash = (hash_t*)realloc(_perf_hash,size*sizeof(hash_t)); + } + + d._version = FSA::VER; + d._serial = 0; + d._state = _packed_idx; + d._symbol = _symbol; + d._size = size; + d._data = _blob; + d._data_size = _blob_used; + d._data_type = _blob_type; + d._fixed_data_size = _fixed_blob_size; + d._perf_hash = _perf_hash; + d._start = _start_state; + + _symbol = NULL; + _packed_idx = NULL; + if(sizeof(State*)==sizeof(state_t)){ // _packed_idx and _packed_ptr are overlayed + _packed_ptr=NULL; + } + _blob = NULL; + _perf_hash = NULL; + reset(); + + return true; +} + +// }}} + +// {{{ Automaton::cleanUp() + 
+void Automaton::cleanUp() +{ + if(_q0!=NULL){ + finalize(); // make sure all states are in _register + for(BlobRegisterIterator bi = _blob_register.begin(); bi!=_blob_register.end(); ++bi){ + delete bi->second; + } + _blob_register.clear(); // clear _blob_register + // clear _register and remove all states + for(RegisterIterator ri = _register.begin(); ri!=_register.end(); ++ri){ + delete ri->second; + } + _register.clear(); + delete _q0; + _q0 = NULL; +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + _previous_input.clear(); +#else + _previous_input = ""; +#endif + } +} + +// }}} +// {{{ Automaton::~Automaton() + +Automaton::~Automaton() +{ + cleanUp(); +} + +// }}} +// {{{ Automaton::getCPLength() + +unsigned int Automaton::getCPLength(const char *input) +{ + if(_q0==NULL) return 0; + + unsigned int l=0; + State* state = _q0; + State* next; + while(input[l]!=0){ + next = state->child(input[l]); + if(next==NULL) return l; + state=next; + l++; + } + return l; +} + +// }}} +// {{{ Automaton::getCPLastState() + +Automaton::State* Automaton::getCPLastState(const char *input) +{ + if(_q0==NULL) return NULL; + + unsigned int l=0; + State* state = _q0; + State* next; + while(input[l]!=0){ + next = state->child(input[l]); + if(next==NULL) return state; + state=next; + l++; + } + return state; +} + +// }}} +// {{{ Automaton::addSuffix() + +void Automaton::addSuffix(State* state, const char *suffix, const Blob *b) +{ + State* current = state; + State* child; + + while(*suffix != 0){ + child = current->addEmptyChild(*suffix); + current = child; + suffix++; + } + BlobRegisterIterator bi; + if(b!=NULL) + bi = _blob_register.find(*b); + else + bi = _blob_register.find(EMPTY_BLOB); + if(bi!=_blob_register.end()){ + child = bi->second; + current->addChild(FSA::FINAL_SYMBOL,child); + } + else { + const Blob *bcopy = (b==NULL) ? 
new Blob(EMPTY_BLOB) : new Blob(*b); + assert(bcopy!=NULL); + child = current->addEmptyChild(FSA::FINAL_SYMBOL,bcopy); + _blob_register[*bcopy] = child; + } +} + +// }}} +// {{{ Automaton::init() + +void Automaton::init() +{ + cleanUp(); + _q0 = new State(); + assert(_q0!=NULL); + _finalized = false; + + _packed.init(); +} + +// }}} +// {{{ Automaton::finalize() + +void Automaton::finalize() +{ + if(!_finalized && _q0!=NULL){ + replaceOrRegister(_q0); + _packed.packStartState(_q0); + _packed.finalize(); + _finalized = true; + } + +} + +// }}} +// {{{ Automaton::addPerfectHash() + +void Automaton::addPerfectHash() +{ + if(_finalized){ + _packed.addPerfectHash(); + } +} + +// }}} +// {{{ Automaton::write() + +bool Automaton::write(const char *file, uint32_t serial) +{ + if(!_finalized){ + finalize(); + } + return _packed.write(file,serial); +} + +// }}} +// {{{ Automaton::getFSA() + +FSA* Automaton::getFSA() +{ + if(!_finalized){ + finalize(); + } + + FSA::Descriptor d; + + if(!_packed.getFSA(d)) + return NULL; + + FSA *fsa = new FSA(d); + + cleanUp(); + + return fsa; +} + +// }}} +// {{{ Automaton::insertSortedString() + +void Automaton::insertSortedString(const std::string &input) +{ + insertSortedString(input.c_str()); +} + +void Automaton::insertSortedString(const std::string &input, const std::string &meta) +{ + Blob b(meta); + insertSortedString(input.c_str(),&b); +} + +void Automaton::insertSortedString(const char *input, const Blob& b) +{ + insertSortedString(input,&b); +} + +void Automaton::insertSortedString(const char *input, const Blob* b) +{ + if(_q0==NULL || _finalized) return; + + State* lastState = getCPLastState(input); + const char* currentSuffix = input + getCPLength(input); + + if(lastState->hasChildren()){ + replaceOrRegister(lastState); + } + addSuffix(lastState,currentSuffix,b); +} + +// }}} +// {{{ Automaton::replaceOrRegister() + +void Automaton::replaceOrRegister(Automaton::State* state) +{ + State* child = state->lastChild(); + 
if(child!=NULL){ + if(child->hasChildren()){ + replaceOrRegister(child); + } + RegisterIterator ri = _register.find(TListPtr(&(child->getTransitionList()))); + if(ri!=_register.end() && ri->second!=child){ + state->updateLastChild(ri->second); + delete child; + } + else { + _register[TListPtr(&(child->getTransitionList()))] = child; + _packed.packState(child); + } + } +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/automaton.h b/fsa/src/vespa/fsa/automaton.h new file mode 100644 index 00000000000..089c7784a0d --- /dev/null +++ b/fsa/src/vespa/fsa/automaton.h @@ -0,0 +1,970 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file automaton.h + * @brief Definition of the classes used for %FSA (%Finite %State %Automaton) construction + * + */ + +#pragma once + +#include <map> +#include <list> +#include <string> +#include <assert.h> + +#include "blob.h" +#include "fsa.h" + +namespace fsa { + +// {{{ Automaton +/** + * @class Automaton + * @brief %FSA (%Finite %State %Automaton) construction class. + * + * The Automaton class provides the methods and data structures needed + * for construcing a %Finite %State %Automaton from input strings. (The + * current implementation requires the input to be sorted, this + * requirement may be relaxed in future relases.) + * + * The constructed %FSA, when stored in a compact representation, can + * be used for lookups, etc. vie the FSA class. The compact %FSA can + * not be modified anymore. + */ +class Automaton { + +public: + /** + * Empty data item for final states without assigned data. Contains + * a zero terminated empty string. + */ + static const Blob EMPTY_BLOB; + +private: + + class State; + + // {{{ Automaton::Transition + /** + * @struct Transition + * @brief Struct for storing a single transition. + * + * A transition consists of an input symbol and a new state. 
+ */ + struct Transition { + symbol_t _symbol; /**< Input symbol. */ + State *_state; /**< New state. */ + }; + // }}} + + // {{{ Automaton::TransitionList + /** + * @class TransitionList + * @brief Class representing all transitions from a state. + * + * This class is used for the interal representation of the + * automaton. A state can be represented by the list of all + * possible transitions from that state. Two states are + * equivalent, if both are final (with the same meta info) or both + * are not final, and their transition list matches, that is they + * have the same number of out-transitions, these correspond to the + * same set of input symbols, and for each of these symbols the new + * states are equal. In the internal representation, final states + * are implemented by means of a special transition, so transition + * list equivalence is implies state equivalence. + */ + class TransitionList { + + friend class State; + + private: + unsigned int _alloc; /**< Allocated size (number of transitions). */ + unsigned int _size; /**< Used size. */ + Transition* _trans; /**< Transition array. */ + + public: + /** + * @brief Constructor. + * + * Default constructor, creates an empty transition list. + */ + TransitionList() : _alloc(0), _size(0), _trans(NULL) {}; + + /** + * @brief Constructor. + * + * Constructor, creates an empty transition list, but preallocates + * space for a given number of transitions. + * + * @param prealloc Number of states to preallocate space for. + */ + TransitionList(unsigned int prealloc) : _alloc(prealloc), _size(0), _trans(NULL) + { if(prealloc>0){ + _trans = (Transition*)malloc(prealloc*sizeof(Transition)); + assert(_trans!=NULL); + } + } + + /** + * @brief Destructor. + */ + ~TransitionList() + { if(_trans!=NULL) free(_trans); } + + /** + * @brief Copy constructor. + * + * @param tl Reference to transition list object. 
+ */ + TransitionList(const TransitionList& tl) : _alloc(tl._size), _size(tl._size), _trans(NULL) + { + if(_alloc>0){ + _trans = (Transition*)malloc(_alloc*sizeof(Transition)); + assert(_trans!=NULL); + } + for(unsigned int i=0; i<_size; i++) + _trans[i] = tl._trans[i]; + } + + + /** + * @brief Less-than operator. + * + * t1<t2 (or t1.operator<(t2) is true iff + * - t1 has less transitions than t2, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has a lower symbol for t1, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has the same symbol but a lower new state for t1 + * + * @param tl Reference to transition list object. + * @return True iff the t1<t2. + */ + bool operator<(const TransitionList& tl) const; + + /** + * @brief Greater-than operator. + * + * t1>t2 (or t1.operator>(t2) is true iff + * - t1 has more transitions than t2, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has a higher symbol for t1, or + * - t1 and t2 have the same number of transitions, and the + * first transition which is different for t1 and t2 (sorted + * by symbol) has the same symbol but a higher new state for t1 + * + * @param tl Reference to transition list object. + * @return True iff the t1>t2. + */ + bool operator>(const TransitionList& tl) const; + + /** + * @brief Equals operator. + * + * t1==t2 (or t1.operator==(t2) is true iff + * - t1 and t2 have the same number of transitions, which have + * the same set of of symbols and for each symbol the new + * states are equal + * + * @param tl Reference to transition list object. + * @return True iff the t1==t2. + */ + bool operator==(const TransitionList& tl) const; + + /** + * @brief Check for emptyness. + * + * @return True iff the transition list is empty. 
+ */ + bool empty() { return (_size==0); } + + /** + * @brief Get transition list size. + * + * @return Size of the transition list (number of transitions, or 0 if empty). + */ + unsigned int size() const { return _size; } + + /** + * @brief Index operator. + * + * Returns a reference to the ith transition on the list. i must + * be between 0 and size-1 (0<=i<=size-1). + * + * @param i Index of transition. + * @return Reference to the ith transition. + */ + const Transition& operator[](unsigned int i) const { return _trans[i]; } + + /** + * @brief Get the last transition. + * + * Returns a pointer to the last transition, or NULL pointer if + * the list is empty. + * + * @return Pointer to last transition, or NULL. + */ + Transition* last() + { if(_size>0) return &_trans[_size-1]; + return NULL; + } + + /** + * @brief Get the transition corresponding to a symbol. + * + * Returns a pointer to the transition corresponding to a given + * symbol, or NULL pointer if the symbol is not found on the list + * (a transition with that symbol does not exist). + * + * @param sy Input symbol. + * @return Pointer to last transition, or NULL. + */ + Transition* find(symbol_t sy) + { for(unsigned int i=0; i<_size; i++){ + if(_trans[i]._symbol == sy) return &_trans[i]; + } + return NULL; + } + + /** + * @brief Append a new transition to the list. + * + * Appends a new transition to the end of the list. The allocated + * size is increased if necessary. If a transition with the same + * symbol already exists, the behaviour is undefined. + * + * @param sy Input symbol. + * @param st Pointer to new state. 
+ */ + void append(symbol_t sy, State* st) + { if(_size==_alloc){ + if(_alloc==0){ + _alloc=1; + _trans = (Transition*)malloc(_alloc*sizeof(Transition)); + } + else{ + _alloc+=2; + _trans = (Transition*)realloc(_trans,_alloc*sizeof(Transition)); + } + assert(_trans!=NULL); + } + _trans[_size]._symbol=sy; + _trans[_size]._state=st; + _size++; + } + + }; + + // }}} + + // {{{ Automaton::TListPtr + /** + * @class TListPtr + * @brief Helper class, pointer to a transition list (TransitionList). + * + * The purpose of this class is to override the comparison operators + * for a pointer, instead of comparing the value of the pointer + * itself, compares the objects the pointer is pointing to. + */ + class TListPtr { + + private: + const TransitionList *_ptr; /**< Pointer to a TransitionList */ + + public: + + /** + * @brief Constructor. + * + * Initialize object to point to the specified transition list. + * + * @param tl pointer to a transition list. + */ + TListPtr(const TransitionList *tl) : _ptr(tl) {} + + /** + * @brief Copy constructor. + * + * Initialize object from another TListPtr. + * + * @param tp Reference to TListPtr. + */ + TListPtr(const TListPtr& tp) : _ptr(tp._ptr) {} + + /** + * @brief Get the pointer to the transition list. + * + * @return Pointer to the TransitionList. + */ + const TransitionList* getPtr() const { return _ptr; } + + /** + * @brief Less-than operator. + * + * Compares the pointed objects instead of the value of the + * pointer itself. + * + * @param tp Reference to TListPtr object. + * @return Comparison result. + */ + bool operator<(const TListPtr& tp) const + { return(*_ptr<*tp._ptr); } + + /** + * @brief Greater-than operator. + * + * Compares the pointed objects instead of the value of the + * pointer itself. + * + * @param tp Reference to TListPtr object. + * @return Comparison result. + */ + bool operator>(const TListPtr& tp) const + { return(*_ptr>*tp._ptr); } + + /** + * @brief Equals operator. 
+ * + * Compares the pointed objects instead of the value of the + * pointer itself. + * + * @param tp Reference to TListPtr object. + * @return Comparison result. + */ + bool operator==(const TListPtr& tp) const + { return(*_ptr==*tp._ptr); } + }; + // }}} + + // {{{ Automaton::State + /** + * @class State + * @brief Class representing a state of the automaton. + * + * The representation of the automaton states consists of a + * transition list for the state, and meta info blob (the latter + * only used for special states reached by a final transition. A + * final transition is a transition from a final (accepting) state + * with the reserved FINAL_SYMBOL (0xff) to a special state, which + * stores the meta info corresponding to the final state. For each + * unique meta info blob, there is one special state. + */ + class State { + + private: + + TransitionList _tlist; /**< Transition list. */ + const Blob *_blob; /**< Meta info blob. */ + + public: + + /** + * @brief Constructor. + * + * Default constructor, creates a state with an empty transition + * list and no (NULL) blob. + */ + State() : _tlist(), _blob(NULL) {} + + /** + * @brief Constructor. + * + * Creates a (special) state with an empty transition list and a + * given blob. + * + * @param b Pointer to blob. + */ + State(const Blob* b) : _tlist(), _blob(b) {} + + /** + * @brief Destructor. + */ + ~State() { if(_blob!=NULL) delete _blob; } + + /** + * @brief Check if the state is final (accepting) state. + * + * @return True if the state is final. + */ + bool isFinal() { return child(FSA::FINAL_SYMBOL)!=NULL; } + + /** + * @brief Get the blob assigned to the state. + * + * @return Pointer to blob. + */ + const Blob* getBlob() const { return _blob; } + + /** + * @brief Check if the state has children. + * + * Returns true if the state has children (the transition list is + * not empty), or false if the state is a leaf. + * + * @return True if the state has children. 
+ */ + bool hasChildren() { return !_tlist.empty(); } + + /** + * @brief Get child corresponding to a symbol. + * + * Get the child of the state which is reached by a transition + * with a given symbol. If there is no out-transition with that + * symbol, NULL is returned. + * + * @return Pointer to the child, or NULL. + */ + State* child(symbol_t sy) + { Transition* t = _tlist.find(sy); + if(t!=NULL){ return t->_state; } + return NULL; + } + + /** + * @brief Get the last child. + * + * Get the last child of the state which is reached by a valid + * transition (not FINAL_SYMBOL). If no such children exists, NULL + * is returned. + * + * @return Pointer to last child, or NULL. + */ + State* lastChild() + { Transition* t = _tlist.last(); + if(t!=NULL && t->_symbol!=FSA::FINAL_SYMBOL){ return t->_state; } + return NULL; + } + + /** + * @brief Update the last child. + * + * Updates the last child to point to a new state. This method is + * used when merging equivalent subtrees together. + * + * @param st New state to be used in last child. + */ + void updateLastChild(State* st) + { Transition* t = _tlist.last(); + if(t!=NULL){ + t->_state = st; + } + } + + /** + * @brief Append a new empty child. + * + * Append an empty child to the list of transitions using the + * given symbol (and optional blob). + * + * @param sy New transition symbol. + * @param b Optional blob to be assigned to the new state, defaults to NULL. + * @return Pointer to the new state. + */ + State* addEmptyChild(symbol_t sy, const Blob *b=NULL) + { + State* child = new State(b); + assert(child!=NULL); + _tlist.append(sy,child); + return child; + } + + /** + * @brief Add a transition to an existing state. + * + * Append a new transition to the list pointing to an existing + * state, using the given symbol. + * + * @param sy New transition symbol. + * @param child Pointer to destination state (already existing). + * @return Pointer to the child state. 
+ */ + State* addChild(symbol_t sy, State* child) + { + _tlist.append(sy,child); + return child; + } + + /** + * @brief Get the transition list. + * + * Get the transition list of the state. + * + * @return Reference to the transition list. + */ + const TransitionList& getTransitionList(void) const { return _tlist; } + + + }; + + // }}} + + // {{{ Automaton::Register, BlobRegister, PackMap, SymList and iterators + /** + * @brief Register of states, maps a transition list to a state object + */ + typedef std::map< TListPtr,State* > Register; + /** + * @brief State register iterator. + */ + typedef std::map< TListPtr,State* >::iterator RegisterIterator; + + /** + * @brief Register of states, maps a blob to a special state. + */ + typedef std::map< Blob,State* > BlobRegister; + /** + * @brief Blob register iterator. + */ + typedef std::map< Blob,State* >::iterator BlobRegisterIterator; + + /** + * @brief Packing map, maps a state pointer to a state ID. + */ + typedef std::map< const void*, unsigned int > PackMap; + /** + * @brief Packing map iterator. + */ + typedef std::map< const void*, unsigned int >::iterator PackMapIterator; + + /** + * @brief symbol_t list. + */ + typedef std::list<symbol_t> SymList; + /** + * @brief symbol_t list iterator. + */ + typedef std::list<symbol_t>::iterator SymListIterator; + // }}} + + // {{{ Automaton::PackedAutomaton + + /** + * @class PackedAutomaton + * @brief Helper class for packing an automaton. + * + * This class is used for packing an Automaton to a compressed + * format which can be saved to file to be used by the FSA class. + */ + class PackedAutomaton { + + private: + bool _packable; /**< Packable flag. */ + PackMap _pack_map; /**< Map state pointers to indices. */ + PackMap _blob_map; /**< Map blob pointers to indices. */ + State **_packed_ptr; /**< Array for state pointers. */ + state_t *_packed_idx; /**< Array for state indices. */ + symbol_t *_symbol; /**< Array for transition symbols. 
*/ + bool *_used; /**< Array for cell used flags. */ + hash_t *_perf_hash; /**< Array for perfect hash deltas. */ + hash_t *_totals; /**< Array for perfect hash totals. */ + uint32_t _packed_size; /**< Size of packed arrays (in cells). */ + uint32_t _last_packed; /**< Index of last packed state. */ + + data_t *_blob; /**< Data storage. */ + uint32_t _blob_size; /**< Data storage size. */ + uint32_t _blob_used; /**< Used data storage size. */ + uint32_t _blob_type; /**< Type of data items (fixed/var.) */ + uint32_t _fixed_blob_size; /**< Data item size if fixed. */ + + state_t _start_state; /**< Index of start state. */ + + /** + * @brief Number of cells to allocate in one expansion. + */ + static const uint32_t _ALLOC_CELLS = 131072; // 128k + + /** + * @brief Number of bytes to allocate in one data storage expansion. + */ + static const uint32_t _ALLOC_BLOB = 65536; // 64k + + /** + * @brief How long back the search for an empty cell should start. + */ + static const uint32_t _BACKCHECK = 255; + + + /** + * @brief Expand cell arrays. + */ + void expandCells(); + + /** + * @brief Expand data storage. + * + * @param minExpand Mimimum size to expand, it will be rounded up + * to the nearest multiply of _ALLOC_BLOB. + */ + void expandBlob(uint32_t minExpand); + + /** + * @brief Get an empty cell. + * + * Start looking for an empty cell _BACKCHECK cells before the + * last packed cell, and return the index of the first empty cell + * found. The cell arrays are expanded on demand, that is if no + * empty cell is found. + * + * @return Index of empty cell. + */ + uint32_t getEmptyCell(); + + /** + * @brief Get an empty cell where a list of transitions can be stored. + * + * Start looking for an empty cell _BACKCHECK cells before the + * last packed cell. In addition to the cell being empty, it + * should be possible to store a list of transitions from that + * cell. The cell arrays are expanded on demand, that is if no + * empty cell is found. 
+ * + * @param t List of transition symbols. + * @return Index of empty cell. + */ + uint32_t getCell(SymList t); + + /** + * @brief Pack a data item. + * + * Pack a data item to the data storage. If the same (or + * equivalent) data item has been packed before, return the offset + * where it was packed. Otherwise, pack the data item at the end + * of the storage (expand storage if needed), add the item and + * offset to the blob map and return the offset. + * + * @param b Pointer to data item. + * @return Offset to data item in data storage. + */ + uint32_t packBlob(const Blob* b); + + /** + * @brief Compute perfect hash deltas for a subtree. + * + * Recursive function for computing the perfect hash deltas for + * all transitions within a subtree. The delta for transition T + * from state S is the number of final states reachable from state + * S via transitions lower than T (that is, with a lower input + * symbol). Also, state S being a final state counts. The hash + * deltas are filled into the _perf_hash array. + * + * @return Number of final states within the subtree. + */ + hash_t computePerfectHash(state_t state); + + + public: + + /** + * @brief Default constructor. + */ + PackedAutomaton() : + _packable(false), + _pack_map(), + _blob_map(), + _packed_ptr(NULL), + _packed_idx(NULL), + _symbol(NULL), + _used(NULL), + _perf_hash(NULL), + _totals(NULL), + _packed_size(0), + _last_packed(0), + _blob(NULL), + _blob_size(0), + _blob_used(0), + _blob_type(0), + _fixed_blob_size(0), + _start_state(0) + { } + + /** + * @brief Destructor. + */ + ~PackedAutomaton() { reset(); } + + /** + * @brief Reset the object. + * + * Reset the object and free all allocated memory. + */ + void reset(); + + /** + * @brief Initialize. + * + * Reset the object, and initialize data structures, also + * preallocate memory for cell and data storage. + */ + void init(); + + /** + * @brief Pack a state. + * + * Pack a state and its transitions into the compact structure. 
For + * final states, the data item is packed as well. + * + * @param s Pointer to state to pack. + * @param start True if the state is the start state. + * @return False if the object is not packable (it has been + * finalized, or it has not been initialized) + */ + bool packState(const State* s, bool start=false); + + /** + * @brief Pack the start state. + * + * Pack the state and mark it as the start state. (Equivalent to + * packState(s,true)). + * + * @param s Pointer to state to pack. + * @return False if the object is not packable (it has been + * finalized, or it has not been initialized) + */ + bool packStartState(const State* s); + + /** + * @brief Finalize the packed structure. + * + * Obtain all state inidices from the state pointers using the + * pack map. Also compact the data storage if all data items have + * the same size (only store the size once, and store data items + * consecutively, without size attribute). + */ + void finalize(); + + /** + * @brief Add perfect hash to the automaton. + * + * Computes the perfect hash for the whole automaton. + */ + void addPerfectHash(); + + /** + * @brief Write the automaton to a file. + * + * @param filename Name of file. + * @param serial Serial number. + * @return True on success. + */ + bool write(const char *filename, uint32_t serial = 0); + + /** + * @brief Read an automaton from file. + * + * @param filename Name of file. + * @return True on success. + */ + bool read(const char *filename); + + /** + * @brief Perform a lookup in the packed automaton. + * + * @param input Input string + * @return Pointer to data associated with input, or NULL if input is not accepted. + */ + const unsigned char* lookup(const char *input) const; + + /** + * @brief Create an FSA object from the automaton. + * + * Create an FSA object from the automaton. The PackedAutomaton is + * implicitly reset if the operation succeeds. 
PackedAutomanton + * cannot access the private constructor of FSA, so we have to pass + * the object via a struct, which is ugly :-(. + * + * @param d Pointer to the FSA::Descriptor (struct) to store necessary info for + * creating the FSA object. + * @return True if the operation was successful. + */ + bool getFSA(FSA::Descriptor &d); + + }; + + // }}} + + + Register _register; /**< Register of states. */ + BlobRegister _blob_register; /**< Register of data items. */ + State* _q0; /**< Start state. */ + std::string _previous_input; /**< Previous input string. */ + bool _finalized; /**< Finalized flag. */ + PackedAutomaton _packed; /**< Packed automaton. */ + + /** + * @brief Get common path length. + * + * Get the length of the common path shared by the current input + * string and strings already in the automaton. + * + * @param input Input string. + * @return Length of common path. + */ + unsigned int getCPLength(const char *input); + + /** + * @brief Get last state in common path. + * + * Get the last state of the common path shared by the current input + * string and strings already in the automaton. + * + * @param input Input string. + * @return Pointer to last state in common path. + */ + State* getCPLastState(const char *input); + + /** + * @brief Replace or register a state. + * + * Replace the state with an already registered equivalent state in + * the automaton, or register it if no such state exists yet. + * + * @param state Pointer to state to be replaced or registered. + */ + void replaceOrRegister(State* state); + + /** + * @brief Add new states for a suffix. + * + * Add the necessary new states for a suffix of an input string. The + * suffix is that part of an input string which is not covered by + * the common path. + * + * @param state Pointer to last state in the common path. + * @param suffix Suffix. + * @param b Data item associated with the input. 
+ */ + void addSuffix(State* state, const char *suffix, const Blob *b=NULL); + + /** + * @brief Clean up data structures and release memory. + */ + void cleanUp(); + +public: + + /** + * @brief Default constructor. + */ + Automaton() : + _register(), + _blob_register(), + _q0(NULL), + _previous_input(), + _finalized(false), + _packed() + {} + + /** + * @brief Destructor. + */ + ~Automaton(); + + /** + * @brief Initialize the object. + */ + void init(); + + /** + * @brief Insert a string to the automaton. + * + * Insert a string to the automaton. Input strings must be inserted + * in sorted order, otherwise the behaviour is undefined. + * + * @param input Input string. + */ + void insertSortedString(const std::string &input); + + /** + * @brief Insert a string to the automaton. + * + * Insert a string to the automaton. Input strings must be inserted + * in sorted order, otherwise the behaviour is undefined. + * + * @param input Input string. + * @param meta Meta info string to be stored in data item). + */ + void insertSortedString(const std::string &input, const std::string &meta); + + /** + * @brief Insert a string to the automaton. + * + * Insert a string to the automaton. Input strings must be inserted + * in sorted order, otherwise the behaviour is undefined. + * + * @param input Input string. + * @param b Reference to data item. + */ + void insertSortedString(const char *input, const Blob &b); + + /** + * @brief Insert a string to the automaton. + * + * Insert a string to the automaton. Input strings must be inserted + * in sorted order, otherwise the behaviour is undefined. + * + * @param input Input string. + * @param b Pointer to data item. + */ + void insertSortedString(const char *input, const Blob *b=NULL); + + /** + * @brief Finalize the automaton. + * + * Finalize the automaton. 
This involves calling replaceOrRegister + * for the start state _q0, and building the packed automaton, so no + * strings can be added to the automaton after this method is + * called. + */ + void finalize(); + + /** + * @brief Add perfect hash to automaton. + * + * Compute and add perfect hash structure to the automaton. Only + * works on finalized automata. + */ + void addPerfectHash(); + + /** + * @brief Write the finalized automaton to file. + * + * @param file Name of the file. + * @param serial Serial number. + * @return True on success. + */ + bool write(const char *file, uint32_t serial = 0); + + /** + * @brief Write the finalized automaton to file. + * + * @param file Name of the file. + * @param serial Serial number. + * @return True on success. + */ + bool write(const std::string &file, uint32_t serial = 0) + { + return write(file.c_str(),serial); + } + + /** + * @brief Create an FSA object from the automaton. + * + * Create an FSA object from the automaton. The Automaton and + * PackedAutomaton is implicitly reset. + * + * @return Pointer to a newly created FSA object. The caller is + * responsible for freeing it. + */ + FSA* getFSA(void); + +}; +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/base64.cpp b/fsa/src/vespa/fsa/base64.cpp new file mode 100644 index 00000000000..f06fc445cc7 --- /dev/null +++ b/fsa/src/vespa/fsa/base64.cpp @@ -0,0 +1,142 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/**
+ * @author Peter Boros
+ * @date 2004/08/20
+ * @version $Id$
+ * @file base64.cpp
+ * @brief Implementation of Base64 class methods
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <iostream>
+#include <string>
+
+#include "base64.h"
+
+
+namespace fsa {
+
+// {{{ Base64::_table, Base64::_padding
+
+const unsigned char Base64::_table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+const unsigned char Base64::_padding = '=';
+
+// }}}
+
+// {{{ Base64::b2n()
+
+inline int Base64::b2n(int b)
+{
+  if (b>='A' && b<='Z')
+    return b-'A';
+  else if (b>='a' && b<='z')
+    return b-'a'+26;
+  else if (b>='0' && b<='9')
+    return b-'0'+52;
+  else if (b=='+')
+    return 62;
+  else if (b=='/')
+    return 63;
+  else
+    return -1; // not a Base64 alphabet symbol (this includes the '=' padding)
+}
+
+// }}}
+// {{{ Base64::n2b()
+
+inline int Base64::n2b(int n)
+{
+  if(n<0||n>63)
+    return -1;
+  return _table[n];
+}
+
+// }}}
+
+// {{{ Base64::decode()
+
+int Base64::decode(const std::string &src, std::string &dest)
+{
+  if(src.length()&0x03){ // source length should be 4*n
+    dest.resize(0);
+    return -1;
+  }
+
+  dest.resize(3*(src.length()>>2),'\0');
+
+  std::string::size_type i, index = 0;
+  int s1,s2,s3,s4;
+
+  for (i =0; i<src.length(); i+=4) {
+    s1 = b2n(src[i]);
+    s2 = b2n(src[i+1]);
+    s3 = b2n(src[i+2]);
+    s4 = b2n(src[i+3]);
+
+
+    if(s1<0||s2<0||(s3<0&&src[i+2]!=_padding)||(s4<0&&src[i+3]!=_padding)){ // first two symbols must be data; the last two may only be data or '='
+      dest.resize(index);
+      return -1;
+    }
+
+    if(s3<0){ // only one output symbol
+      dest[index++] = s1<<2 | s2>>4;
+      if(s4>=0){ // if s3 is '=', s4 should be '=' too
+        dest.resize(index);
+        return -1;
+      }
+    }
+    else if(s4<0){ // two symbols
+      dest[index++] = s1<<2 | s2>>4;
+      dest[index++] = (s2&0x0f)<<4 | s3>>2;
+    }
+    else { // all three present
+      dest[index++] = s1<<2 | s2>>4;
+      dest[index++] = (s2&0x0f)<<4 | s3>>2;
+      dest[index++] = (s3&0x03)<<6 | s4;
+    }
+  }
+
+  dest.resize(index);
+  return index;
+}
+
+// }}}
+// {{{ Base64::encode()
+
+int Base64::encode(const std::string &src, std::string &dest)
+{
+  dest.resize(4*((src.length()+2)/3),'\0');
+  const unsigned char *in = reinterpret_cast<const unsigned char*>(src.data()); // read bytes unsigned: plain char may be signed, which breaks the shifts below
+  std::string::size_type i, index = 0;
+
+  for(i=0;i+2<src.length();i+=3) { // complete 3-byte groups, 4 symbols each
+    dest[index++] = n2b(in[i]>>2);
+    dest[index++] = n2b((in[i]&0x03)<<4 | in[i+1]>>4);
+    dest[index++] = n2b((in[i+1]&0x0f)<<2 | in[i+2]>>6);
+    dest[index++] = n2b(in[i+2]&0x3f);
+  }
+
+  if (i<src.length()) { // handle padding
+    dest[index++] = n2b(in[i]>>2);
+    if (i<src.length()-1) { // 2 bytes left
+      dest[index++] = n2b((in[i]&0x03)<<4 | in[i+1]>>4);
+      dest[index++] = n2b((in[i+1]&0x0f)<<2);
+      dest[index++] = _padding;
+    } else { // 1 byte left
+      dest[index++] = n2b((in[i]&0x03)<<4); // low two bits of the final byte (src[i+1] is the terminator here, not data)
+      dest[index++] = _padding;
+      dest[index++] = _padding;
+    }
+  }
+
+  return index;
+}
+
+// }}}
+
+} // namespace fsa
diff --git a/fsa/src/vespa/fsa/base64.h b/fsa/src/vespa/fsa/base64.h
new file mode 100644
index 00000000000..b0ada3b1bff
--- /dev/null
+++ b/fsa/src/vespa/fsa/base64.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @author Peter Boros
+ * @date 2004/08/20
+ * @version $Id$
+ * @file base64.h
+ * @brief Definition of Base64 class
+ *
+ */
+
+#pragma once
+
+#include <string>
+
+namespace fsa {
+
+/**
+ * @class Base64
+ * @brief Base64 encoding and decoding.
+ *
+ * Encode and decode arbitrary binary strings to %Base64.
+ */
+class Base64 {
+private:
+  /** Encoding table */
+  static const unsigned char _table[];
+  /** Padding character */
+  static const unsigned char _padding;
+
+  /** Decode one symbol (-1 if it is not in the encoding table) */
+  static inline int b2n(int b);
+  /** Encode one symbol (-1 if the value is outside 0..63) */
+  static inline int n2b(int n);
+
+public:
+
+  /**
+   * @brief Decode a %Base64 encoded string.
+   *
+   * @param src Source %Base64 encoded string.
+   * @param dest Destination to hold the decoded string.
+   * @return Size of destination string, or -1 if src is not valid %Base64.
+   */
+  static int decode(const std::string &src, std::string &dest);
+
+  /**
+   * @brief Encode a string to %Base64.
+   *
+   * @param src Source string.
+   * @param dest Destination to hold %Base64 encoded string.
+   * @return Size of destination string.
+   */
+  static int encode(const std::string &src, std::string &dest);
+
+};
+
+} // namespace fsa
+
diff --git a/fsa/src/vespa/fsa/blob.cpp b/fsa/src/vespa/fsa/blob.cpp
new file mode 100644
index 00000000000..3fd381b33fd
--- /dev/null
+++ b/fsa/src/vespa/fsa/blob.cpp
@@ -0,0 +1,54 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @author Peter Boros
+ * @date 2004/08/20
+ * @version $Id$
+ * @file blob.cpp
+ * @brief Implementation of Blob class methods
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "blob.h"
+
+
+namespace fsa {
+
+// {{{ Blob::operator<()
+
+bool Blob::operator<(const Blob& b) const
+{
+  if(_size<b._size) return true;
+  if(_size>b._size) return false;
+  if(_size==0) return false;
+  if(memcmp(_data,b._data,_size)<0) return true;
+  return false;
+}
+
+// }}}
+// {{{ Blob::operator>()
+
+bool Blob::operator>(const Blob& b) const
+{
+  if(_size>b._size) return true;
+  if(_size<b._size) return false;
+  if(_size==0) return false;
+  if(memcmp(_data,b._data,_size)>0) return true;
+  return false;
+}
+
+// }}}
+// {{{ Blob::operator==()
+
+bool Blob::operator==(const Blob& b) const
+{
+  if(_size==b._size && (_size==0 || memcmp(_data,b._data,_size)==0)) return true;
+  return false;
+}
+
+// }}}
+
+} // namespace fsa
diff --git a/fsa/src/vespa/fsa/blob.h b/fsa/src/vespa/fsa/blob.h
new file mode 100644
index 00000000000..362b37eb48e
--- /dev/null
+++ b/fsa/src/vespa/fsa/blob.h
@@ -0,0 +1,140 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @author Peter Boros
+ * @date 2004/08/20
+ * @version $Id$
+ * @file blob.h
+ * @brief Definition of Blob class
+ *
+ */
+
+#pragma once
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <string>
+
+namespace fsa {
+
+// {{{ class Blob
+
+/**
+ * @class Blob
+ * @brief %Blob (binary large object) class.
+ *
+ * Representation of a blob (binary large object). Supports assign
+ * method, access to size and data, and comparison operators.
+ */
+class Blob {
+private:
+  /** Size of data. */
+  unsigned int _size;
+  /** Pointer to the data. */
+  void* _data;
+public:
+
+  /**
+   * @brief Default constructor
+   *
+   * Creates an empty blob.
+   */
+  Blob() : _size(0), _data(NULL) {}
+
+  /**
+   * @brief Constructor
+   *
+   * Creates a blob from a character string. The string must be zero
+   * terminated.
+   *
+   * @param str Pointer to input string.
+   */
+  Blob(const char *str) : _size(strlen(str)+1), _data((void*)strdup(str)) {}
+
+  /**
+   * @brief Constructor
+   *
+   * Creates a blob from arbitrary data.
+   *
+   * @param data Pointer to data.
+   * @param size Size of the data.
+   */
+  Blob(const void *data, unsigned int size) : _size(size), _data(malloc(size))
+  { if(_size>0) memcpy(_data,data,_size); } // skip the copy for empty blobs: either pointer may be NULL then
+
+  /**
+   * @brief Copy constructor and copy assignment (both make a deep copy)
+   *
+   * @param b Blob to copy.
+   */
+  Blob(const Blob& b) : _size(b._size), _data(malloc(_size)) { if(_size>0) memcpy(_data,b._data,_size); }
+  Blob& operator=(const Blob& b) { if(this!=&b){ void *d=malloc(b._size); if(b._size>0) memcpy(d,b._data,b._size); free(_data); _data=d; _size=b._size; } return *this; }
+
+  /**
+   * @brief Constructor
+   *
+   * Creates a blob from std::string.
+   *
+   * @param s Reference to input string.
+   */
+  Blob(const std::string &s) : _size(s.size()), _data(malloc(_size))
+  { s.copy((char*)_data,_size); }
+
+  /** Destructor */
+  ~Blob() { if(_data!=NULL) free(_data); }
+
+  /**
+   * @brief Get data size.
+   *
+   * @return Data size.
+   */
+  unsigned int size() const { return _size; }
+
+  /**
+   * @brief Get data.
+   *
+   * @return Pointer to data. Valid as long as the blob object exists
+   *         and is not modified.
+   */
+  const void* data() const { return _data; }
+
+  /**
+   * @brief Reassign the blob.
+   *
+   * @param s Input string
+   */
+  void assign(const std::string &s)
+  {
+    if(_data!=NULL) free(_data);
+    _size=s.size();
+    _data=malloc(s.size());
+    s.copy((char*)_data,_size);
+  }
+
+  /**
+   * @brief Less-than operator.
+   *
+   * @param b Blob to compare.
+   */
+  bool operator<(const Blob& b) const;
+
+  /**
+   * @brief Greater-than operator.
+   *
+   * @param b Blob to compare.
+   */
+  bool operator>(const Blob& b) const;
+
+  /**
+   * @brief Equals operator.
+   *
+   * @param b Blob to compare.
+   */
+  bool operator==(const Blob& b) const;
+
+};
+
+// }}}
+
+} // namespace fsa
+
diff --git a/fsa/src/vespa/fsa/checksum.h b/fsa/src/vespa/fsa/checksum.h
new file mode 100644
index 00000000000..0c685b27e0a
--- /dev/null
+++ b/fsa/src/vespa/fsa/checksum.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @author Peter Boros
+ * @date 2004/09/20
+ * @version $Id$
+ * @file checksum.h
+ * @brief Definition of Checksum class
+ *
+ */
+
+#pragma once
+
+#include <inttypes.h>
+#include <string.h>
+
+
+namespace fsa {
+
+// {{{ class Checksum
+
+/**
+ * @class Checksum
+ * @brief Simple checksum class
+ */
+class Checksum {
+public:
+
+  /**
+   * @brief Compute 32-bit checksum value of an arbitrary buffer.
+   *
+   * @param buffer Pointer to the buffer.
+   * @param size Size of the buffer.
+   * @return 32-bit checksum value (sum of the buffer's 32-bit words; see the note on trailing bytes below).
+   */
+  static uint32_t compute(void *buffer, uint32_t size)
+  {
+    uint32_t checksum=0,rest=0,i=0;
+    char *buf = (char *)buffer;
+
+    for(i=0;i<(size>>2);i++){ // sum the complete 32-bit words; memcpy avoids unaligned reads
+      uint32_t tmp;
+      memcpy(&tmp, buf, sizeof(uint32_t));
+      buf += sizeof(uint32_t);
+      checksum += tmp;
+    }
+    //@@@@@@BUG! should be if((size&3)>0) but that will break checksumming; postpone to next major .fsa format change
+    if(size&(3>0)){ // was if(size&3>0) but that generates a warning in GCC4; evaluates to size&1, so trailing bytes only count when size is odd
+      memcpy(&rest,(uint8_t*)buffer+4*i,size&3);
+      checksum+=rest;
+    }
+    return checksum;
+  }
+};
+
+// }}}
+
+} // namespace fsa
+
diff --git a/fsa/src/vespa/fsa/conceptnet.cpp b/fsa/src/vespa/fsa/conceptnet.cpp
new file mode 100644
index 00000000000..da73003dee6
--- /dev/null
+++ b/fsa/src/vespa/fsa/conceptnet.cpp
@@ -0,0 +1,512 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @author Peter Boros
+ * @date 2004/10/01
+ * @version $Id$
+ * @file conceptnet.cpp
+ * @brief Concept network class implementation.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "conceptnet.h"
+#include <fstream>
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h> // for ::mmap()
+#include <sys/time.h>
+#include <sys/resource.h> // for getrlimit(), setrlimit(), etc.
+
+// define this at your own risk...
+#undef NO_RANGE_CHECK + +namespace fsa { + +// {{{ constants + +const uint32_t ConceptNet::MAGIC; + +// }}} + +// {{{ ConceptNet::ConceptNet() + +ConceptNet::ConceptNet(const char *fsafile, const char *datafile, FileAccessMethod fam) : + _mmap_addr(NULL), _mmap_length(0), + _unit_fsa(fsafile,fam), + _index_size(0), _index(NULL), + _info_size(0), _info(NULL), + _catindex_size(0), _catindex(NULL), + _strings_size(0), _strings(NULL), + _ok(false) +{ + _ok = _unit_fsa.isOk(); + if(_ok && datafile!=NULL) + _ok = read(datafile,fam); +} + +ConceptNet::ConceptNet(const std::string &fsafile, const std::string &datafile, FileAccessMethod fam) : + _mmap_addr(NULL), _mmap_length(0), + _unit_fsa(fsafile,fam), + _index_size(0), _index(NULL), + _info_size(0), _info(NULL), + _catindex_size(0), _catindex(NULL), + _strings_size(0), _strings(NULL), + _ok(false) +{ + _ok = _unit_fsa.isOk(); + if(_ok) + _ok = read(datafile.c_str(),fam); +} + +// }}} +// {{{ ConceptNet::~ConceptNet() + +ConceptNet::~ConceptNet() +{ + reset(); +} + +// }}} + +// {{{ ConceptNet::reset() + +void ConceptNet::reset() +{ + if(_mmap_addr!=NULL && _mmap_addr!=MAP_FAILED){ + munmap(_mmap_addr,_mmap_length); + } + else{ + delete[] _index; + delete[] _info; + delete[] _catindex; + delete[] _strings; + } + _mmap_addr=NULL; _mmap_length=0; + // leave _unit_fsa alone + _index_size=0; _index=NULL; + _info_size=0; _info=NULL; + _catindex_size=0; _catindex=NULL; + _strings_size=0; _strings=NULL; + _ok=false; +} + +// }}} +// {{{ ConceptNet::read() + +bool ConceptNet::read(const char *datafile, FileAccessMethod fam) +{ + Header header; + + size_t r; + + reset(); //WATCHOUT: if reset() ever changes to unref _unit_fsa, we can't use it since the FSA is read in the constructor before we get here + + if(fam==FILE_ACCESS_UNDEF) + fam=_default_file_access_method; + + if(datafile==NULL) + return false; + + int fd = ::open(datafile,O_RDONLY); + if(fd<0) + return false; + + r=::read(fd,&header,sizeof(header)); + 
if(r!=sizeof(header) || header._magic!=ConceptNet::MAGIC){ + ::close(fd); + return false; + } + + _index_size = header._index_size; + _info_size = header._info_size; + _catindex_size = header._catindex_size; + _strings_size = header._strings_size; + + if(fam==FILE_ACCESS_MMAP || fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + _mmap_length = + sizeof(header) + + _index_size*sizeof(UnitData) + + _info_size*sizeof(uint32_t) + + _catindex_size*sizeof(uint32_t) + + _strings_size*sizeof(char); + _mmap_addr = ::mmap((void*)0, _mmap_length, PROT_READ, MAP_SHARED, fd, 0); + if(_mmap_addr==MAP_FAILED){ + ::close(fd); + reset(); + return false; + } + if(fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + if(mlock(_mmap_addr, _mmap_length)<0) { + /* try to increase RLIMIT_MEMLOCK then mlock() again */ + struct rlimit rl; + if(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0) { + rl.rlim_cur += _mmap_length + getpagesize(); + rl.rlim_max += _mmap_length + getpagesize(); + if(setrlimit(RLIMIT_MEMLOCK, &rl) >= 0) + mlock(_mmap_addr, _mmap_length); + } + } + } + } + + // read _index + if(_mmap_addr==NULL){ + _index = new UnitData[_index_size]; + r=::read(fd,_index,_index_size*sizeof(UnitData)); + if(r!=_index_size*sizeof(UnitData)){ + ::close(fd); + reset(); + return false; + } + } + else { + _index = (UnitData*)((uint8_t*)_mmap_addr + sizeof(header)); + } + + // read _info + if(_mmap_addr==NULL){ + _info = new uint32_t[_info_size]; + r=::read(fd,_info,_info_size*sizeof(uint32_t)); + if(r!=_info_size*sizeof(uint32_t)){ + ::close(fd); + reset(); + return false; + } + } + else { + _info = (uint32_t*)((uint8_t*)_index + _index_size*sizeof(UnitData)); + } + + // read _catindex + if(_mmap_addr==NULL){ + _catindex = new uint32_t[_catindex_size]; + r=::read(fd,_catindex,_catindex_size*sizeof(uint32_t)); + if(r!=_catindex_size*sizeof(uint32_t)){ + ::close(fd); + reset(); + return false; + } + } + else { + _catindex = (uint32_t*)((uint8_t*)_info + _info_size*sizeof(uint32_t)); + } + + // read _strings + if(_mmap_addr==NULL){ + 
_strings = new char[_strings_size]; + r=::read(fd,_strings,_strings_size*sizeof(char)); + if(r!=_strings_size*sizeof(char)){ + ::close(fd); + reset(); + return false; + } + } + else { + _strings = (char*)((uint8_t*)_catindex + _catindex_size*sizeof(uint32_t)); + } + + ::close(fd); + + return true; +} + +// }}} + +// {{{ ConceptNet::lookup() + +int ConceptNet::lookup(const char *unit) const +{ + FSA::HashedState hs(_unit_fsa); + hs.start(unit); + if(hs.isFinal()){ + return (int)hs.hash(); + } + return -1; +} + +const char * ConceptNet::lookup(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return NULL; + } +#endif + return _strings+_index[idx]._term; +} + +// }}} +// {{{ ConceptNet::frq() + +int ConceptNet::frq(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + return _index[idx]._frq; +} + +int ConceptNet::frq(const char *unit) const +{ + return frq(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::cFrq() + +int ConceptNet::cFrq(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + return _index[idx]._cfrq; +} + +int ConceptNet::cFrq(const char *unit) const +{ + return cFrq(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::qFrq() + +int ConceptNet::qFrq(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + return _index[idx]._qfrq; +} + +int ConceptNet::qFrq(const char *unit) const +{ + return qFrq(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::sFrq() + +int ConceptNet::sFrq(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + return _index[idx]._sfrq; +} + +int ConceptNet::sFrq(const char *unit) const +{ + return sFrq(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::score() + +double ConceptNet::score(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return 
-1.0; + } +#endif + return 100.0*(double)_index[idx]._cfrq/(double)_index[idx]._qfrq; +} + +double ConceptNet::score(const char *unit) const +{ + return score(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::strength() + +double ConceptNet::strength(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1.0; + } +#endif + return 100.0*(double)_index[idx]._qfrq/(double)_index[idx]._sfrq; +} + +double ConceptNet::strength(const char *unit) const +{ + return strength(lookup(unit)); +} + +// }}} +// {{{ ConceptNet::numExt() + +int ConceptNet::numExt(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + if(_index[idx]._exts==0){ + return 0; + } + return (int)_info[_index[idx]._exts]; +} + +// }}} +// {{{ ConceptNet::numAssoc() + +int ConceptNet::numAssoc(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + if(_index[idx]._assocs==0){ + return 0; + } + return (int)_info[_index[idx]._assocs]; +} + +// }}} +// {{{ ConceptNet::numCat() + +int ConceptNet::numCat(int idx) const +{ +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } +#endif + if(_index[idx]._cats==0){ + return 0; + } + return (int)_info[_index[idx]._cats]; +} + +// }}} +// {{{ ConceptNet::ext() + +int ConceptNet::ext(int idx, int j) const +{ + assert(j>=0); +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } + if(_index[idx]._exts==0){ + return -1; + } + if((uint32_t)j>=_info[_index[idx]._exts]){ + return -1; + } +#endif + return (int)_info[_index[idx]._exts+1+2*j]; +} + +// }}} +// {{{ ConceptNet::extFrq() + +int ConceptNet::extFrq(int idx, int j) const +{ + assert(j>=0); +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } + if(_index[idx]._exts==0){ + return -1; + } + if((uint32_t)j>=_info[_index[idx]._exts]){ + return -1; + } +#endif + return 
(int)_info[_index[idx]._exts+1+2*j+1]; +} + +// }}} +// {{{ ConceptNet::assoc() + +int ConceptNet::assoc(int idx, int j) const +{ + assert(j>=0); +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } + if(_index[idx]._assocs==0){ + return -1; + } + if((uint32_t)j>=_info[_index[idx]._assocs]){ + return -1; + } +#endif + return (int)_info[_index[idx]._assocs+1+2*j]; +} + +// }}} +// {{{ ConceptNet::assocFrq() + +int ConceptNet::assocFrq(int idx, int j) const +{ + assert(j>=0); +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } + if(_index[idx]._assocs==0){ + return -1; + } + if((uint32_t)j>=_info[_index[idx]._assocs]){ + return -1; + } +#endif + return (int)_info[_index[idx]._assocs+1+2*j+1]; +} + +// }}} +// {{{ ConceptNet::cat() + +int ConceptNet::cat(int idx, int j) const +{ + assert(j>=0); +#ifndef NO_RANGE_CHECK + if(idx<0 || (uint32_t)idx>=_index_size){ + return -1; + } + if(_index[idx]._cats==0){ + return -1; + } + if((uint32_t)j>=_info[_index[idx]._cats]){ + return -1; + } +#endif + return (int)_info[_index[idx]._cats+1+j]; +} + +// }}} +// {{{ ConceptNet::catName() + +const char *ConceptNet::catName(int catIdx) const +{ + if(catIdx<0 || (uint32_t)catIdx>=_catindex_size){ + return NULL; + } + return _strings+_catindex[catIdx]; + +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/conceptnet.h b/fsa/src/vespa/fsa/conceptnet.h new file mode 100644 index 00000000000..77c7a8b9e03 --- /dev/null +++ b/fsa/src/vespa/fsa/conceptnet.h @@ -0,0 +1,371 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file conceptnet.h + * @brief Concept network class definition. 
+ * + */ + +#pragma once + +#include <assert.h> +#include <stdlib.h> +#include "file.h" // for FileAccessMethod +#include "fsa.h" + + +namespace fsa { + +// {{{ class ConceptNet + +/** + * @class ConceptNet + * @brief Class for compact representation of a concept network. + */ +class ConceptNet { + +public: + + class Handle; // defined in conceptnethandle.h + +private: + static const uint32_t MAGIC = 238579428; /**< Magic number identifying concept net files. */ + + static const FileAccessMethod _default_file_access_method = FILE_ACCESS_MMAP; /**< Default file access method (read/mmap). */ + + /** + * @struct Header + * @brief Concept net data file header. + */ + struct Header { + uint32_t _magic; /**< Magic number. */ + uint32_t _version; /**< Version number. (currently not used) */ + uint32_t _checksum; /**< Checksum. (currently not used) */ + uint32_t _index_size; /**< Size of index structure. */ + uint32_t _info_size; /**< Size of info structure. */ + uint32_t _catindex_size; /**< Size of category index. */ + uint32_t _strings_size; /**< Size of string storage. */ + uint32_t _max_freq; /**< Reseved for normalization purposes. */ + uint32_t _max_cfreq; /**< Reseved for normalization purposes. */ + uint32_t _max_qfreq; /**< Reseved for normalization purposes. */ + uint32_t _max_sfreq; /**< Reseved for normalization purposes. */ + uint32_t _max_efreq; /**< Reseved for normalization purposes. */ + uint32_t _max_afreq; /**< Reseved for normalization purposes. */ + uint32_t _dummy[51]; /**< Reserved. */ + }; + + /** + * @struct UnitData + * @brief Unit data structure. + */ + struct UnitData { + uint32_t _term; /**< Offset of unit string in string storage. */ + uint32_t _frq; /**< Unit frequency. */ + uint32_t _cfrq; /**< Frequency of the unit as complete query. */ + uint32_t _qfrq; /**< Frequency of the unit as part of a query. */ + uint32_t _sfrq; /**< Number of queries containing all unit terms. 
*/ + uint32_t _exts; /**< If non-zero: offset of extension info in info structure. */ + uint32_t _assocs; /**< If non-zero: offset of association info in info structure. */ + uint32_t _cats; /**< If non-zero: offset of category info in info structure. */ + }; + + void *_mmap_addr; /**< mmap address, NULL is file has not been mmapped. */ + size_t _mmap_length; /**< mmap length. */ + + FSA _unit_fsa; /**< %FSA containing the units (with hash). */ + uint32_t _index_size; /**< Size of the index structure. */ + UnitData *_index; /**< Pointer to the index structure in memory. */ + uint32_t _info_size; /**< Size of the info structure. */ + uint32_t *_info; /**< Pointer to the info structure in memory. */ + uint32_t _catindex_size; /**< Size of the catergory index. */ + uint32_t *_catindex; /**< Pointer to the category index in memory. */ + uint32_t _strings_size; /**< Size of the string storage. */ + char *_strings; /**< Pointer to the string storage in memory. */ + + bool _ok; /**< Flag indicating successful initialization. */ + + /** + * @brief Reset the object. + * + * Resets the object to an empty %ConceptNet, and releases allocated memory. + */ + void reset(); + + /** + * @brief Read the concept net data file from disk. + * + * @param datafile Name of the concept net data file. + * @param fam File access mode (read or mmap). If not set, the + * global default access mode will be used. + * @return True on success. + */ + bool read(const char *datafile, fsa::FileAccessMethod fam = FILE_ACCESS_UNDEF); + + /** + * @brief Unimplemented private default constructor. + */ + ConceptNet(); + /** + * @brief Unimplemented private copy constructor. + */ + ConceptNet(const ConceptNet&); + /** + * @brief Unimplemented private assignement operator. + */ + const ConceptNet& operator=(const ConceptNet&); + +public: + + /** + * @brief Constructor. + * + * @param fsafile %FSA file containing the units, with a perfect has + * (used for indexing the data file). 
+ * @param datafile Concept net data file. + * @param fam File access mode (read or mmap). If not set, the + * global default access mode will be used. + */ + ConceptNet(const char *fsafile, const char *datafile=NULL, FileAccessMethod fam = FILE_ACCESS_UNDEF); + ConceptNet(const std::string &fsafile, const std::string &datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF); + + /** + * @brief Destructor. + */ + virtual ~ConceptNet(); + + /** + * @brief Check if initialization was successful. + * + * @return True if the initialization of the object succeeded. + */ + bool isOk() const + { + return _ok; + } + + /** + * @brief Get the concept net %FSA. + * + * Get the concept net %FSA. The object continues to be owned by the + * concept net. + * + * @return The concept net %FSA. + */ + const FSA& getFSA() const + { + assert(_ok); + return _unit_fsa; + } + + /** + * @brief Look up a unit. + * + * Look up a unit in the concept net, and get its index. + * + * @param unit Unit string. + * @return Index of the unit, or -1 if not found. + */ + int lookup(const char *unit) const; + + /** + * @brief Look up a unit index. + * + * Look up a unit index in the concept net, and get the unit string. + * + * @param idx Unit index. + * @return Pointer to the unit string, or NULL if index is out of range. + */ + const char * lookup(int idx) const; + + /** + * @brief Get the unit frequency of the unit. + * + * @param idx Unit index. + * @return Unit frequency, or -1 if the index is out of range. + */ + int frq(int idx) const; + + /** + * @brief Get the unit frequency of the unit. + * + * @param unit Unit string. + * @return Unit frequency, or -1 if the unit is not found. + */ + int frq(const char *unit) const; + + /** + * @brief Get the frequency of the unit as a complete query. + * + * @param idx Unit index. + * @return Unit-C frequency, or -1 if the index is out of range. + */ + int cFrq(int idx) const; + + /** + * @brief Get the frequency of the unit as a complete query. 
+ * + * @param unit Unit string. + * @return Unit-C frequency, or -1 if the unit is not found. + */ + int cFrq(const char *unit) const; + + /** + * @brief Get the frequency of the unit as part of a query. + * + * @param idx Unit index. + * @return Unit-Q frequency, or -1 if the index is out of range. + */ + int qFrq(int idx) const; + + /** + * @brief Get the frequency of the unit as part of a query. + * + * @param unit Unit string. + * @return Unit-Q frequency, or -1 if the unit is not found. + */ + int qFrq(const char *unit) const; + + /** + * @brief Get the frequency of queries containing all terms of the unit. + * + * @param idx Unit index. + * @return Unit-S frequency, or -1 if the index is out of range. + */ + int sFrq(int idx) const; + + /** + * @brief Get the frequency of queries containing all terms of the unit. + * + * @param unit Unit string. + * @return Unit-Q frequency, or -1 if the unit is not found. + */ + int sFrq(const char *unit) const; + + /** + * @brief Get the unit score (100.0*cFrq/qFrq). + * + * @param idx Unit index. + * @return Unit score, or -1.0 if the index is out of range. + */ + double score(int idx) const; + + /** + * @brief Get the unit score (100.0*cFrq/qFrq). + * + * @param unit Unit string. + * @return Unit score, or -1. if the unit is not found. + */ + double score(const char *unit) const; + + /** + * @brief Get the unit strength (100.0*qFrq/sFrq). + * + * @param idx Unit index. + * @return Unit strength, or -1.0 if the index is out of range. + */ + double strength(int idx) const; + + /** + * @brief Get the unit strength (100.0*qFrq/sFrq). + * + * @param unit Unit string. + * @return Unit strength, or -1. if the unit is not found. + */ + double strength(const char *unit) const; + + /** + * @brief Get the number of extensions for the unit. + * + * @param idx Unit index. + * @return Number of extensions for the unit, -1 if the index is out + * of range. 
  /**
   * @brief Get the number of extensions for the unit.
   *
   * @param idx Unit index.
   * @return Number of extensions for the unit, -1 if the index is out
   *         of range.
   */
  int numExt(int idx) const;

  /**
   * @brief Get the number of associations for the unit.
   *
   * @param idx Unit index.
   * @return Number of associations for the unit, -1 if the index is out
   *         of range.
   */
  int numAssoc(int idx) const;

  /**
   * @brief Get the number of categories for the unit.
   *
   * @param idx Unit index.
   * @return Number of categories for the unit, -1 if the index is out
   *         of range.
   */
  int numCat(int idx) const;

  /**
   * @brief Get the index of an extension.
   *
   * @param idx Unit index.
   * @param j   Number of the extension (extensions of each unit are
   *            sorted by decreasing weight).
   * @return Extension (unit) index, -1 if idx or j is out of range.
   */
  int ext(int idx, int j) const;

  /**
   * @brief Get the frequency of an extension.
   *
   * @param idx Unit index.
   * @param j   Number of the extension (extensions of each unit are
   *            sorted by decreasing weight).
   * @return Extension frequency, -1 if idx or j is out of range.
   */
  int extFrq(int idx, int j) const;

  /**
   * @brief Get the index of an association.
   *
   * @param idx Unit index.
   * @param j   Number of the association (associations of each unit are
   *            sorted by decreasing weight).
   * @return Association (unit) index, -1 if idx or j is out of range.
   */
  int assoc(int idx, int j) const;

  /**
   * @brief Get the frequency of an association.
   *
   * @param idx Unit index.
   * @param j   Number of the association (associations of each unit are
   *            sorted by decreasing weight).
   * @return Association frequency, -1 if idx or j is out of range.
   */
  int assocFrq(int idx, int j) const;

  /**
   * @brief Get the index of a category.
   *
   * @param idx Unit index.
   * @param j   Number of the category.
   * @return Category index, -1 if idx or j is out of range.
   */
  int cat(int idx, int j) const;
+ * @return Catergory name, or NULL if catIdx is out of range. + */ + const char *catName(int catIdx) const; + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/detector.cpp b/fsa/src/vespa/fsa/detector.cpp new file mode 100644 index 00000000000..f9e92c994d5 --- /dev/null +++ b/fsa/src/vespa/fsa/detector.cpp @@ -0,0 +1,102 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file detector.cpp + * @brief %FSA (%Finite %State %Automaton) based detector (implementation) + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <list> +#include <algorithm> + +#include <math.h> + +#include "detector.h" +#include "fsa.h" +#include "ngram.h" + + +namespace fsa { + +// {{{ Detector::detect + +void Detector::detect(const NGram &text, Detector::Hits &hits, + unsigned int from, int length) const +{ + std::list<FSA::WordCounterState> detectors; + std::list<FSA::WordCounterState>::iterator det_it; + unsigned int i,to; + + to = text.length(); + if(length!=-1 && from+length<to) + to=from+length; + + i=from; + while(i<to){ + detectors.push_back(FSA::WordCounterState(_dictionary)); + + det_it=detectors.begin(); + while(det_it!=detectors.end()){ + det_it->deltaWord(text[i]); + if(det_it->isFinal()){ + hits.add(text, i-det_it->getCounter()+1, det_it->getCounter(), *det_it); + } + + if(det_it->isValid()) + ++det_it; + else{ + det_it=detectors.erase(det_it); + } + } + ++i; + } + + detectors.clear(); +} + +// }}} +// {{{ Detector::detectWithHash + +void Detector::detectWithHash(const NGram &text, Detector::Hits &hits, + unsigned int from, int length) const +{ + std::list<FSA::HashedWordCounterState> detectors; + std::list<FSA::HashedWordCounterState>::iterator det_it; + unsigned int i,to; + + to = text.length(); + if(length!=-1 && from+length<to) + to=from+length; + + i=from; + while(i<to){ + 
detectors.push_back(FSA::HashedWordCounterState(_dictionary)); + + det_it=detectors.begin(); + while(det_it!=detectors.end()){ + det_it->deltaWord(text[i]); + if(det_it->isFinal()){ + hits.add(text, i-det_it->getCounter()+1, det_it->getCounter(), *det_it); + } + + if(det_it->isValid()) + ++det_it; + else{ + det_it=detectors.erase(det_it); + } + } + ++i; + } + + detectors.clear(); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/detector.h b/fsa/src/vespa/fsa/detector.h new file mode 100644 index 00000000000..62e54077519 --- /dev/null +++ b/fsa/src/vespa/fsa/detector.h @@ -0,0 +1,131 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file detector.h + * @brief %FSA (%Finite %State %Automaton) based detector. + * + */ + +#pragma once + +#include <string> +#include <map> +#include <vector> + +#include "fsa.h" +#include "ngram.h" + +namespace fsa { + +// {{{ Detector + +/** + * @class Detector + * @brief Simple %FSA based detector. + * + * Class for processing a tokenized text and detecting occurrences of + * terms and phrases in a given dictionary. + */ +class Detector { + +public: + + // {{{ class Detector::Hits + + /** + * @class Hits + * @brief Class for collecting the detection results. + * + * This is a base class which must be subclassed for each particular + * application of the detector. The method add() will be called for + * each term/phrase detected by the detector. + */ + class Hits { + public: + /** Default constructor. */ + Hits() {} + /** Destructor. */ + virtual ~Hits() {}; + + /** + * @brief Method to receive results from the detector. + * + * @param text Tokenized detector input text. + * @param from Index of the first term of the detected phrase. + * @param length Length of the detected phrase. + * @param state Final state after the detection of the phrase. 
+ */ + virtual void add(const NGram &text, + unsigned int from, int length, + const FSA::State &state) = 0; + }; + + // }}} + +private: + + /** Dictionary. */ + const FSA& _dictionary; + + /** Unimplemented private default constructor. */ + Detector(); + /** Unimplemented private copy constructor. */ + Detector(const Detector&); + +public: + + /** + * @brief Constructor. + * + * Creates a detector, and initializes the dictionary from a handle. + * + * @param dict Dictionary handle. + */ + Detector(const FSA& dict) : _dictionary(dict) {} + + /** + * @brief Constructor. + * + * Creates a detector, and initializes the dictionary from a handle. + * + * @param dict Dictionary handle. + */ + Detector(const FSA* dict) : _dictionary(*dict) {} + + /** + * @brief Destructor. + */ + ~Detector() {} + + /** + * @brief Detect terms and phrases in a text. + * + * @param text Tokenized text. + * @param hits Reference to the object for collecting the results. + * @param from Index of first term in text where detection should start. + * @param length Number of term to consider (-1 means to end of text). + */ + void detect(const NGram &text, Hits &hits, + unsigned int from=0, int length=-1) const; + + /** + * @brief Detect terms and phrases in a text. + * + * Same as detect(), but uses hashed states. + * + * @param text Tokenized text. + * @param hits Reference to the object for collecting the results. + * @param from Index of first term in text where detection should start. + * @param length Number of term to consider (-1 means to end of text). + */ + void detectWithHash(const NGram &text, Hits &hits, + unsigned int from=0, int length=-1) const; + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/file.h b/fsa/src/vespa/fsa/file.h new file mode 100644 index 00000000000..414751e4849 --- /dev/null +++ b/fsa/src/vespa/fsa/file.h @@ -0,0 +1,29 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
// {{{ FileAccessMethod

/**
 * @brief File access method enum.
 *
 * Selects how the contents of an automaton file are brought into
 * memory. The numeric values are the ones the compiler would assign
 * implicitly; they are spelled out only for readability.
 */
enum FileAccessMethod {
  FILE_ACCESS_UNDEF           = 0,  /**< Not specified. */
  FILE_ACCESS_READ            = 1,  /**< Read the file. */
  FILE_ACCESS_MMAP            = 2,  /**< Memory-map the file. */
  FILE_ACCESS_MMAP_WITH_MLOCK = 3   /**< Memory-map the file, with mlock. */
};

// }}}
+ + + +namespace fsa { + +// {{{ constants +const uint32_t FSA::MAGIC; +const uint32_t FSA::VER; +const symbol_t FSA::EMPTY_SYMBOL; +const symbol_t FSA::FINAL_SYMBOL; +// }}} + + +// {{{ FSA::iterator::operator++() + +FSA::iterator& FSA::iterator::operator++() +{ + state_t next; + unsigned int depth; + + if(_item._symbol==0xff || _item._fsa==NULL) + return *this; + + if(_item._symbol==0 && _item._state==0) + _item._state=_item._fsa->start(); + + while(1){ + _item._symbol++; + if(_item._symbol<0xff){ + next=_item._fsa->delta(_item._state,_item._symbol); + if(next){ + _item._string += _item._symbol; + _item._stack.push_back(_item._state); + _item._state = next; + _item._symbol = 0; + if(_item._fsa->isFinal(next)) + break; + } + } + else { // bactrack + if((depth=_item._string.size())>0){ + _item._symbol = _item._string[depth-1]; + _item._string.resize(depth-1); + _item._state = _item._stack.back(); + _item._stack.pop_back(); + } + else{ + _item._state=0; + break; + } + } + } + return *this; +} + +// }}} +// {{{ FSA::libVER() + +uint32_t FSA::libVER() +{ + return VER; +} + +// }}} +// {{{ MetaData::MetaData() + +FSA::FSA(const char *file, FileAccessMethod fam) : + _mmap_addr(NULL), _mmap_length(0), + _version(0), _serial(0), + _state(NULL), _symbol(NULL), _size(0), + _data(NULL), _data_size(0), _data_type(DATA_VARIABLE), _fixed_data_size(0), + _has_perfect_hash(false),_perf_hash(NULL), + _start(0), _ok(false) +{ + _ok = read(file, fam); +} + +FSA::FSA(const std::string &file, FileAccessMethod fam) : + _mmap_addr(NULL), _mmap_length(0), + _version(0), _serial(0), + _state(NULL), _symbol(NULL), _size(0), + _data(NULL), _data_size(0), _data_type(DATA_VARIABLE), _fixed_data_size(0), + _has_perfect_hash(false),_perf_hash(NULL), + _start(0), _ok(false) +{ + _ok = read(file.c_str(), fam); +} + +// }}} +// {{{ FSA::~FSA() + +FSA::~FSA() +{ + if(_mmap_addr!=NULL && _mmap_addr!=MAP_FAILED){ + munmap(_mmap_addr,_mmap_length); + } + else{ + if(_state!=NULL) free(_state); + 
if(_symbol!=NULL) free(_symbol); + if(_data!=NULL) free(_data); + if(_perf_hash!=NULL) free(_perf_hash); + } +} + +// }}} +// {{{ FSA::reset() + +void FSA::reset() +{ + _version = 0; + _serial = 0; + if(_mmap_addr!=NULL && _mmap_addr!=MAP_FAILED){ + munmap(_mmap_addr,_mmap_length); + } + else{ + if(_state!=NULL) free(_state); + if(_symbol!=NULL) free(_symbol); + if(_data!=NULL) free(_data); + if(_perf_hash!=NULL) free(_perf_hash); + } + _mmap_addr=NULL; _mmap_length=0; + _state=NULL; _symbol=NULL; _size=0; + _data=NULL; _data_size=0; _data_type=DATA_VARIABLE; _fixed_data_size=0; + _has_perfect_hash=false; _perf_hash=NULL; + _start=0; +} + +// }}} +// {{{ FSA::read() + +bool FSA::read(const char *file, FileAccessMethod fam) +{ + Header header; + size_t r; + uint32_t checksum=0; + + reset(); + + if(fam==FILE_ACCESS_UNDEF) + fam=_default_file_access_method; + + if(file==NULL) + return false; + + int fd = ::open(file,O_RDONLY); + if(fd<0) + return false; + + r=::read(fd,&header,sizeof(header)); + if(r<sizeof(header) || header._magic!=MAGIC || header._version<1000){ + ::close(fd); // no fsa had version number below 0.1.0 + return false; + } + + _version = header._version; + _serial = header._serial; + _size = header._size; + _data_size = header._data_size; + _data_type = header._data_type; + _fixed_data_size = header._fixed_data_size; + _start = header._start; + + if(fam==FILE_ACCESS_MMAP || fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + _mmap_length = + sizeof(header) + + _size*sizeof(symbol_t) + + _size*sizeof(state_t) + + _data_size + + (header._has_perfect_hash?_size*sizeof(hash_t):0); + _mmap_addr = ::mmap((void*)0, _mmap_length, PROT_READ, MAP_SHARED, fd, 0); + if(_mmap_addr==MAP_FAILED){ + ::close(fd); + reset(); + return false; + } + if(fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + if(mlock(_mmap_addr, _mmap_length)<0) { + /* try to increase RLIMIT_MEMLOCK then mlock() again */ + struct rlimit rl; + if(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0) { + rl.rlim_cur += _mmap_length + 
getpagesize(); + rl.rlim_max += _mmap_length + getpagesize(); + if(setrlimit(RLIMIT_MEMLOCK, &rl) >= 0) + mlock(_mmap_addr, _mmap_length); + } + } + } + } + + if(_mmap_addr==NULL){ + _symbol = (symbol_t*)malloc(_size*sizeof(symbol_t)); + r=::read(fd,_symbol,_size*sizeof(symbol_t)); + if(r!=_size*sizeof(symbol_t)){ + ::close(fd); + reset(); + return false; + } + } + else { + _symbol = (symbol_t*)((uint8_t*)_mmap_addr + sizeof(header)); + } + checksum += Checksum::compute(_symbol,_size*sizeof(symbol_t)); + + if(_mmap_addr==NULL){ + _state = (state_t*)malloc(_size*sizeof(state_t)); + r=::read(fd,_state,_size*sizeof(state_t)); + if(r!=_size*sizeof(state_t)){ + ::close(fd); + reset(); + return false; + } + } + else { + _state = (state_t*)((uint8_t*)_mmap_addr + sizeof(header) + + _size*sizeof(symbol_t)); + } + checksum += Checksum::compute(_state,_size*sizeof(state_t)); + + if(_mmap_addr==NULL){ + _data = (data_t*)malloc(_data_size); + r=::read(fd,_data,_data_size); + if(r!=_data_size){ + ::close(fd); + reset(); + return false; + } + } + else { + _data = (data_t*)((uint8_t*)_mmap_addr + sizeof(header) + + _size*sizeof(symbol_t) + + _size*sizeof(state_t)); + } + checksum += Checksum::compute(_data,_data_size); + + if(header._has_perfect_hash){ + if(_mmap_addr==NULL){ + _perf_hash = (hash_t*)malloc(_size*sizeof(hash_t)); + r=::read(fd,_perf_hash,_size*sizeof(hash_t)); + if(r!=_size*sizeof(hash_t)){ + ::close(fd); + reset(); + return false; + } + } + else { + _perf_hash = (hash_t*)((uint8_t*)_mmap_addr + sizeof(header) + + _size*sizeof(symbol_t) + + _size*sizeof(state_t) + + _data_size); + } + checksum += Checksum::compute(_perf_hash,_size*sizeof(hash_t)); + _has_perfect_hash = true; + } + + ::close(fd); + + if(_version>=2000 && checksum!=header._checksum){ + reset(); // use checksum since version 0.2.0 + return false; + } + + return true; +} +// }}} +// {{{ FSA::revLookup() + +std::string FSA::revLookup(hash_t hash) const +{ + state_t state = start(); + state_t 
next,last_next, current_next; + hash_t current = 0,d,last_d; + std::string current_string; + symbol_t symbol,last_symbol,current_symbol; + + if(!hasPerfectHash()) + return std::string(); + last_symbol=current_symbol=0; + + while(current<hash){ + last_symbol=current_symbol=0; + last_next=current_next=0; + d=last_d=0; + for(symbol=1;symbol<=254;symbol++){ + next=delta(state,symbol); + if(next){ + last_symbol=current_symbol; + current_symbol=symbol; + last_next=current_next; + current_next=next; + last_d=d; + d=hashDelta(state,symbol); + if(current+d>=hash) + break; + } + } + if(current_symbol==0) + return std::string(); + if(current+d<=hash){ + current_string+=(char)current_symbol; + state=current_next; + current+=d; + } + else{ + current_string+=(char)last_symbol; + state=last_next; + current+=last_d; + } + } + + while(!isFinal(state)){ + for(symbol=1;symbol<=254;symbol++){ + next=delta(state,symbol); + if(next){ + current_string+=(char)symbol; + state=next; + break; + } + } + if(symbol==255) + return std::string(); + } + + return current_string; +} + +// }}} + +// {{{ FSA::printDot() + +void FSA::printDot(std::ostream &out) const +{ + state_t start,state,next; + symbol_t symbol; + std::list<state_t> state_stack; + std::list<symbol_t> symbol_stack; + std::map<state_t,bool> visited; + bool v; + + + symbol=0; + start=state=this->start(); + + out << "digraph fsa {\n"; + out << " node [label=\"\",shape=circle]\n"; + out << " start [label=start]\n"; + + while(1){ + symbol++; + if(symbol<0xff){ + next=delta(state,symbol); + if(next){ + v=visited[next]; + if(!v && isFinal(next)) + out << " n" << next << " [shape=doublecircle]\n"; + out << " "; + if(state==start) + out << "start"; + else + out << "n" << state; + out << " -> n" << next << " [label=\"" << char(symbol) << "\"]\n"; + if(!v){ + visited[next]=true; + symbol_stack.push_back(symbol); + state_stack.push_back(state); + state = next; + symbol = 0; + } + } + } + else { // bactrack + if(state_stack.size()>0){ + symbol = 
symbol_stack.back(); + symbol_stack.pop_back(); + state = state_stack.back(); + state_stack.pop_back(); + } + else{ + break; + } + } + } + + out << "}\n"; + +} +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/fsa.h b/fsa/src/vespa/fsa/fsa.h new file mode 100644 index 00000000000..a508b1eb0f4 --- /dev/null +++ b/fsa/src/vespa/fsa/fsa.h @@ -0,0 +1,2312 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file fsa.h + * @brief Class definition of the %FSA (%Finite %State %Automaton) matcher + */ + +#pragma once + +#include <string> +#include <list> +#include <iostream> +#include <inttypes.h> + +#include "file.h" // for FileAccessMethod + +namespace fsa { + +// {{{ symbol_t, state_t, hash_t, data_t +/** + * @brief Symbol type used by the automaton. sizeof() should be 1. + */ +typedef uint8_t symbol_t; + +/** + * @brief State type used by the automaton. + */ +typedef uint32_t state_t; + +/** + * @brief Hash type used by the automaton. + */ +typedef uint32_t hash_t; + +/** + * @brief Data type used by the automaton. sizeof() should be 1. + */ +typedef uint8_t data_t; + +// }}} + + +// {{{ FSA + +/** + * Forward declaration of friend. + */ +class Automaton; + +/** + * @class FSA + * @brief %FSA (%Finite %State %Automaton) matcher. + * + * The FSA class provides very fast string lookup and perfect hashing + * using the Finite State Automaton technology. The automata are built + * off-line using the Automaton class. + */ +class FSA { + +public: + + class Handle; // defined in fsahandle.h + class State; + + // {{{ FSA::iterator + /** + * @class iterator + * @brief Iterate through all accepted strings in the fsa. + */ + class iterator { + + friend class State; + + public: + + /** + * @class iteratorItem + * @brief Helper class for storing iterator state and accessing data. 
+ * + * Internally, this class stores the state information for the + * iterator. Externally, it is used for accessing the data + * associated with the iterator position. + */ + class iteratorItem { + + friend class iterator; + + private: + std::string _string; /**< The current string. */ + std::list<state_t> _stack; /**< The stack of visited states. */ + symbol_t _symbol; /**< Currently examined symbol. */ + state_t _state; /**< Currently examined state. */ + const FSA* _fsa; /**< Pointer to the FSA. */ + + /** + * @brief Default constructor; unimplemented. + */ + iteratorItem(); + + /** + * @brief Constructor. + * + * @param fsa Pointer to the %FSA object the iterator is associated with. + */ + iteratorItem(const FSA *fsa) : _string(), _stack(), _symbol(0), _state(0), _fsa(fsa) {} + + /** + * @brief Constructor. + * + * @param fsa Pointer to the %FSA object the iterator is associated with. + * @param s State to use as start state. + */ + iteratorItem(const FSA *fsa, state_t s) : + _string(), _stack(), _symbol(0), _state(s), _fsa(fsa) {} + + /** + * @brief Copy constructor. + * + * @param it Pointer to iterator item to copy. + */ + iteratorItem(const iteratorItem& it) : _string(it._string), _stack(it._stack), + _symbol(it._symbol), _state(it._state), + _fsa(it._fsa) {} + + /** + * @brief Destructor. + */ + ~iteratorItem() {} + + public: + + /** + * @brief Access the string associated with the iterator poristion. + * + * @return Current string. + */ + const std::string& str() const { return _string; } + + /** + * @brief Get the size of meta data which belongs to the current string. + * + * @return The size of meta data. + */ + int dataSize() const { return _fsa->dataSize(_state); } + + /** + * @brief Get the meta data which belongs to the current string. + * + * @return Pointer to the meta data. + */ + const data_t* data() const { return _fsa->data(_state); } + }; + + private: + + iteratorItem _item; /**< Internal state. */ + + /** + * @brief Constructor. 
+ * + * Private constructor, reserved for FSA::State::begin() and end(). + * + * @param fsa Pointer to the FSA object to assiociate with. + * @param s State to use as initial state. + */ + iterator(const FSA *fsa, state_t s) : _item(fsa,s) + { + if(!fsa->isFinal(s)) + operator++(); + } + + public: + + /** + * @brief Default constructor. + * + * Creates an unitialized iterator. The effect of using any of + * the access methods on unitialized iterators is undefined. + */ + iterator() : _item(NULL) {} + + /** + * @brief Copy constructor. + * + * @param it iterator object to copy. + */ + iterator(const iterator &it) : _item(it._item) {} + + /** + * @brief Constructor. + * + * Create an iterator for a given state s. The iterator will + * only iterate through possible endings from this state. + * + * @param s State to create the iterator from. + */ + iterator(const State &s) : _item(s._fsa,s._state) + { + if(!s.isFinal()) + operator++(); + } + + /** + * @brief Constructor. + * + * Private constructor, reserved for FSA::begin() and end(). + * + * @param fsa Pointer to the FSA object to assiociate with. + * @param atEnd True for end(), false for begin(). (Default is false.) + */ + iterator(const FSA *fsa, bool atEnd=false) : _item(fsa) + { + if(atEnd) + _item._symbol = 0xff; + else + operator++(); + } + + /** + * @brief Assignment operator. + * + * @param it iterator object to set values from. + * @return Reference to this iterator object. + */ + iterator& operator=(const iterator &it) { _item=it._item; return *this; } + + /** + * @brief Not equal operator. + * + * @return True if the two iterators do not point to the same poistion. + */ + bool operator!=(const iterator &it) const + { + return _item._fsa!=it._item._fsa || _item._symbol!=it._item._symbol || + _item._state!=it._item._state || _item._string!=it._item._string || + _item._stack!=it._item._stack; + } + + /** + * @brief Prefix increment operator. + * + * Prefix increment operator. 
    /**
     * @brief Prefix increment operator.
     *
     * Moves the iterator to the next accepted string. Calling it on an
     * uninitialized iterator (or one which has reached end()) has no
     * effect.
     *
     * @return Reference to this.
     */
    iterator& operator++();

    /**
     * @brief Dereference operator.
     *
     * @return Const reference to the item object, for access to the
     *         current string and its data.
     */
    const iteratorItem& operator*() const { return _item; }

    /**
     * @brief Dereference operator.
     *
     * @return Const pointer to the item object, for access to the
     *         current string and its data.
     */
    const iteratorItem* operator->() const { return &_item; }
+ */ + virtual ~State() {} + + /** + * @brief Check if the automaton has perfect hash built in. + * + * Returns true if the automaton was built with a perfect hash included. + * + * @return True if the automaton has perfect hash. + */ + virtual bool hasPerfectHash() const + { + return _fsa->hasPerfectHash(); + } + + /** + * @brief Check is the state is valid. + * + * Returns true if the state is valid, that is the sequence of + * transitions leading to this state exists in the automaton. + * + * @return True if the state is valid. + */ + virtual bool isValid() const + { + return _state>0; + } + + /** + * @brief Set the state to the start state of the automaton. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _state = _fsa->start(); + return _state!=0; + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using a single input symbol. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + _state = _fsa->delta(_state,in); + return _state!=0; + } + + /** + * @brief Try a delta transition. + * + * Try if a delta transition would succeed, without performing the + * transition. + * + * @param in Input symbol. + * @return True if the delta transition would succeed. + */ + virtual bool tryDelta(symbol_t in) + { + return _fsa->delta(_state,in)!=0; + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using a single input symbol. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool start(symbol_t in) + { + start(); + return delta(in); + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using a sequence of input symbols. + * + * @param in Input symbols, zero terminated. + * @return True if the resulting state is valid. 
+ */ + virtual bool start(const symbol_t *in) + { + start(); + return delta(in); + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using a sequence of input symbols. + * + * @param in Input symbols, zero terminated. + * @return True if the resulting state is valid. + */ + virtual bool start(const char *in) + { + start(); + return delta(in); + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using a sequence of input symbols. + * + * @param in Input symbols. + * @return True if the resulting state is valid. + */ + virtual bool start(const std::string &in) + { + start(); + return delta(in); + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using an input word. + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool startWord(const std::string &in) + { + start(); + return delta(in); + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using a sequence of input symbols. + * + * @param in Input symbols, zero terminated. + * @return True if the resulting state is valid. + */ + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using a sequence of input symbols. + * + * @param in Input symbols, zero terminated. + * @return True if the resulting state is valid. + */ + virtual bool delta(const char *in) + { + return delta((const symbol_t *)in); + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using a sequence of input symbols. + * + * @param in Input symbols. + * @return True if the resulting state is valid. 
+ */ + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using an input word. A word + * separator symbol ` ` is inserted before the word if it is not + * the first word (the current state is not the start state). + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool deltaWord(const std::string &in) + { + if(_state!=_fsa->start()) + delta(' '); + return delta(in); + } + + /** + * @brief Check if the current state is final (accepting) state. + * + * @return True if the state is final. + */ + virtual bool isFinal(void) const + { + return _fsa->isFinal(_state); + } + + /** + * @brief Get the size of a data item. + * + * Get the size of the data item assiciated with a final + * state. The return value -1 indicates that the current state is + * not a final state. + * + * @return Size of data item, or -1 if the state is not final. + */ + virtual int dataSize(void) const + { + return _fsa->dataSize(_state); + } + + /** + * @brief Get the data item. + * + * Get the data item assiciated with a final state. The return + * value NULL indicates that the current state is not a final + * state. + * + * @return Pointer to data item, or NULL if the state is not final. + */ + virtual const data_t *data() const + { + return _fsa->data(_state); + } + + /** + * @brief Get the data item as a character string. + * + * Get the data item assiciated with a final state. The return + * value NULL indicates that the current state is not a final + * state. + * + * @return Pointer to data item, or NULL if the state is not final. + */ + virtual const char *cData() const + { + return (const char*)(_fsa->data(_state)); + } + + /** + * @brief Get the data item as an unsigned 32-bit integer. 
+ * + * Get the data item assiciated with a final state as an unsigned + * 32-bit integer. If the data field size is 0 or the state is not + * final, zero returned, otherwise 1, 2 or 4 byte integer is + * retrieved according to the size and converted to uint32_t. + * + * @return Numerical data. + */ + virtual uint32_t nData() const + { + const data_t *da = _fsa->data(_state); + int si = _fsa->dataSize(_state); + if(si<=0) + return 0; + switch(si){ + case 1: + return (uint32_t)((const uint8_t*)da)[0]; + case 2: + case 3: + return (uint32_t)((const uint16_t*)da)[0]; + case 4: + default: + return ((const uint32_t*)da)[0]; + } + } + + /** + * @brief Dummy hash() method; for simple states returns only + * zero. Will be overridden by HashedState etc. + * + * @return 0 + */ + virtual hash_t hash() const + { + return 0; + } + + + /** + * @brief Perform a lookup. + * + * Perform a string lookup in the automaton (sequence of + * transitions, starting from the start state. Returns a pointer + * to the data item associated with the final state if the string + * is accepted, NULL otherwise. + * + * @param in Input string. + * @return Pointer to data item, or NULL if the state is not final. + */ + virtual const data_t *lookup(const symbol_t *in) + { + start(in); + return data(); + } + + /** + * @brief Perform a lookup. + * + * Perform a string lookup in the automaton (sequence of + * transitions, starting from the start state. Returns a pointer + * to the data item associated with the final state if the string + * is accepted, NULL otherwise. + * + * @param in Input string. + * @return Pointer to data item, or NULL if the state is not final. + */ + virtual const data_t *lookup(const char *in) + { + return lookup((const symbol_t*)in); + } + + /** + * @brief Perform a lookup. + * + * Perform a string lookup in the automaton (sequence of + * transitions, starting from the start state. 
Returns a pointer + * to the data item associated with the final state if the string + * is accepted, NULL otherwise. + * + * @param in Input string. + * @return Pointer to data item, or NULL if the state is not final. + */ + virtual const data_t *lookup(const std::string &in) + { + start(in); + return data(); + } + + /** + * @brief Reverse lookup. + * + * For a given hash value, return the corresponding string. + * + * @param hash Hash value. + * @return String corresponding to hash value, or empty string if + * the fsa has no perfect hash or the hash value is out of + * range. + */ + virtual std::string revLookup(hash_t hash) const + { + return _fsa->revLookup(hash); + } + + /** + * @brief Get iterator pointing to the beginning of the fsa. + * + * NOTE(review): the iterator is constructed from the current + * state of this object; confirm whether iteration is meant to + * begin at that state rather than at the start state. + * + * @return iterator pointing to the first string in the fsa. + */ + virtual FSA::iterator begin() const { return FSA::iterator(_fsa,_state); } + + /** + * @brief Get iterator pointing past the end of the fsa. + * + * @return iterator pointing past the last string in the fsa. + */ + virtual FSA::iterator end() const { return FSA::iterator(_fsa,true); } + + }; + + // }}} + + // {{{ FSA::HashedState + /** + * @class HashedState + * @brief Class for FSA lookups with perfect hash functionality. + * + * This class represents the state of a finite state automaton. It + * is connected to one FSA for its whole lifetime. Provides all + * methods of the FSA::State plus perfect hashing functionality. + */ + class HashedState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + HashedState(); + /** + * @brief Unimplemented private assignment operator. + */ + HashedState& operator=(const HashedState&); + + protected: + hash_t _hash; /**< Hash value. */ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. + * + * @param f Reference to FSA. 
+ */ + HashedState(const FSA& f) : State(f), _hash(0) {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. The hash value starts at zero. + * + * @param f Pointer to FSA. + */ + HashedState(const FSA* f) : State(f), _hash(0) {} + + /** + * @brief Copy constructor. + * + * Duplicate an existing hashed state. + * + * @param s Reference to hashed state to copy. + */ + HashedState(const HashedState& s) : State(s), _hash(s._hash) {} + + /** + * @brief Destructor. + */ + virtual ~HashedState() {} + + /* The overrides of start() and delta(symbol_t) below hide the other State overloads; GCC 3.1 and later re-expose them with using-declarations, other compilers get explicit forwarding overloads. */ +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::start; + using State::delta; +#else + virtual bool start(symbol_t in) { start(); return delta(in); } + virtual bool start(const symbol_t *in) { start(); return delta(in); } + virtual bool start(const char *in) { start(); return delta(in); } + virtual bool start(const std::string &in) { start(); return delta(in); } + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * hash value in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _hash = 0; + return State::start(); + } + + /** + * @brief Delta transition for hashed states. + * + * Extends the State::delta() method with hash value update. The + * hash delta is obtained for the current state, before the + * transition itself is performed. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + _hash += _fsa->hashDelta(_state,in); + return State::delta(in); + } + + /** + * @brief Get current hash value. 
+ * + * For final states, returns the perfect hash value for the input + * string which led to the state. For any state (including + * final states) the value equals the number of strings accepted + * by the automaton which (in an alphabetical ordering) precede + * the string leading to the state. + * + * @return Hash value. + */ + virtual hash_t hash() const + { + return _hash; + } + + /** + * @brief Obsolete alias for hash(), for backwards compatibility. + * + * @return Hash value. + */ + virtual hash_t getHash() const + { + return _hash; + } + + }; + + // }}} + + // {{{ FSA::CounterState + /** + * @class CounterState + * @brief Class for FSA lookups with counter. + * + * This class represents the state of a finite state automaton. It + * is connected to one FSA for its whole lifetime. Provides all + * methods of the FSA::State and counts the number of transitions. + */ + class CounterState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + CounterState(); + /** + * @brief Unimplemented private assignment operator. + */ + CounterState& operator=(const CounterState&); + + protected: + uint32_t _counter; /**< Counter value. */ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the counter. + * + * @param f Reference to FSA. + */ + CounterState(const FSA& f) : State(f), _counter(0) {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the counter. + * + * @param f Pointer to FSA. + */ + CounterState(const FSA* f) : State(f), _counter(0) {} + + /** + * @brief Copy constructor. + * + * Duplicate an existing counter state. + * + * @param s Reference to counter state to copy. + */ + CounterState(const CounterState& s) : State(s), _counter(s._counter) {} + + /** + * @brief Destructor. 
+ */ + virtual ~CounterState() {} + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::start; + using State::delta; +#else + virtual bool start(symbol_t in) { start(); return delta(in); } + virtual bool start(const symbol_t *in) { start(); return delta(in); } + virtual bool start(const char *in) { start(); return delta(in); } + virtual bool start(const std::string &in) { start(); return delta(in); } + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * counter in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _counter = 0; + return State::start(); + } + + /** + * @brief Delta transition for counter states. + * + * Extends the State::delta() method with counter increment. The + * counter is incremented only if the transition succeeds. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + bool ok = State::delta(in); + if(ok) + ++_counter; // only count valid transitions + return ok; + } + + /** + * @brief Get current counter value. + * + * Return the current counter. The counter is the number of + * transitions from the start state to the current state. + * If the state is not valid anymore, the counter is the number of + * transitions to the last valid state. + * + * @return Counter value. + */ + virtual uint32_t counter() const + { + return _counter; + } + + /** + * @brief An alias for counter(). + * + * @return Counter value. 
+ */ + virtual uint32_t getCounter() const + { + return _counter; + } + + }; + // }}} + + // {{{ FSA::WordCounterState + /** + * @class WordCounterState + * @brief Class for FSA lookups with word counter. + * + * This class is similar to CounterState, but it counts whole word + * transitions. Operations other than start(void), startWord(const std::string&) + * or deltaWord(const std::string&) will not modify the counter. + */ + class WordCounterState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + WordCounterState(); + /** + * @brief Unimplemented private assignment operator. + */ + WordCounterState& operator=(const WordCounterState&); + + protected: + uint32_t _counter; /**< Counter value. */ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the counter. + * + * @param f Reference to FSA. + */ + WordCounterState(const FSA& f) : State(f), _counter(0) {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the counter. + * + * @param f Pointer to FSA. + */ + WordCounterState(const FSA* f) : State(f), _counter(0) {} + + /** + * @brief Copy constructor. + * + * Duplicate an existing word counter state. + * + * @param s Reference to word counter state to copy. + */ + WordCounterState(const WordCounterState& s) : State(s), _counter(s._counter) {} + + /** + * @brief Destructor. + */ + virtual ~WordCounterState() {} + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * counter in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _counter = 0; + return State::start(); + } + + /** + * @brief Start and transition. 
+ * + * Sets the state to the starting state of the automaton, and + * performs a transition using an input word. + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool startWord(const std::string &in) + { + start(); + return deltaWord(in); + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using an input word. A word + * separator symbol ` ` is inserted before the word if it is not + * the first word (the current state is not the start state). + * An empty word leaves both the state and the counter unchanged. + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool deltaWord(const std::string &in) + { + if(in.length()==0){ + return _state!=0; + } + if(_state!=_fsa->start()) + delta(' '); + bool ok = delta(in); + if(ok) + ++_counter; // only count valid word transitions + return ok; + } + + /** + * @brief Get current counter value. + * + * Return the current counter. The counter is the number of + * word transitions from the start state to the current state. + * If the state is not valid anymore, the counter is the number of + * word transitions to the last valid state. + * + * @return Counter value. + */ + virtual uint32_t counter() const + { + return _counter; + } + + /** + * @brief An alias for counter(). + * + * @return Counter value. + */ + virtual uint32_t getCounter() const + { + return _counter; + } + + }; + // }}} + + // {{{ FSA::MemoryState + /** + * @class MemoryState + * @brief Class for FSA lookups with memory functionality. + * + * This class represents the state of a finite state automaton. It + * is connected to one FSA for its whole lifetime. Provides all + * methods of the FSA::State and in addition it remembers the + * sequence of symbols which led to this state. + */ + class MemoryState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + MemoryState(); + /** + * @brief Unimplemented private assignment operator. 
+ */ + MemoryState& operator=(const MemoryState&); + + protected: + std::string _memory; /**< Memory value. */ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the memory value. + * + * @param f Reference to FSA. + */ + MemoryState(const FSA& f) : State(f), _memory() {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the memory value. + * + * @param f Pointer to FSA. + */ + MemoryState(const FSA* f) : State(f), _memory() {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the memory value. + * Reserves space for the memory string. + * + * @param f Reference to FSA. + * @param res Number of characters to pre-reserve for the memory string. + */ + MemoryState(const FSA& f, unsigned int res) : State(f), _memory() + { + _memory.reserve(res); + } + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the memory value. + * Reserves space for the memory string. + * + * @param f Pointer to FSA. + * @param res Number of characters to pre-reserve for the memory string. + */ + MemoryState(const FSA* f, unsigned int res) : State(f), _memory() + { + _memory.reserve(res); + } + + /** + * @brief Copy constructor. + * + * Duplicate an existing memory state. + * + * @param s Reference to memory state to copy. + */ + MemoryState(const MemoryState& s) : State(s), _memory(s._memory) {} + + /** + * @brief Destructor. 
+ */ + virtual ~MemoryState() {} + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::start; + using State::delta; +#else + virtual bool start(symbol_t in) { start(); return delta(in); } + virtual bool start(const symbol_t *in) { start(); return delta(in); } + virtual bool start(const char *in) { start(); return delta(in); } + virtual bool start(const std::string &in) { start(); return delta(in); } + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * memory in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + _memory.clear(); +#else + _memory = ""; +#endif + return State::start(); + } + + /** + * @brief Delta transition for memory states. + * + * Extends the State::delta() method with memory update. The + * symbol is appended to the memory only if the transition + * succeeds. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + bool ok = State::delta(in); + if(ok) + _memory += (char)in; + return ok; + } + + /** + * @brief Get current memory value. + * + * The memory for a state stores the sequence of the + * transitions which led to the current state (or the last valid + * state). + * + * @return Memory value. + */ + virtual std::string memory() const + { + return _memory; + } + + /** + * @brief Alias for memory(). + * + * @return Memory value. 
+ */ + virtual std::string getMemory() const + { + return _memory; + } + + }; + + // }}} + + // {{{ FSA::HashedMemoryState + /** + * @class HashedMemoryState + * @brief Class for FSA lookups with perfect hash and memory functionality. + * + * This class represents the state of a finite state automaton. It + * is connected to one FSA for its whole lifetime. Provides all + * methods of the FSA::State plus perfect hashing functionality and + * in addition it remembers the sequence of symbols which led to this + * state. + */ + class HashedMemoryState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + HashedMemoryState(); + /** + * @brief Unimplemented private assignment operator. + */ + HashedMemoryState& operator=(const HashedMemoryState&); + + protected: + hash_t _hash; /**< Hash value. */ + std::string _memory; /**< Memory value. */ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and memory values. + * + * @param f Reference to FSA. + */ + HashedMemoryState(const FSA& f) : State(f), _hash(0), _memory() {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and memory values. + * + * @param f Pointer to FSA. + */ + HashedMemoryState(const FSA* f) : State(f), _hash(0), _memory() {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and memory values. + * Reserves space for the memory string. + * + * @param f Reference to FSA. + * @param res Number of characters to pre-reserve for the memory string. + */ + HashedMemoryState(const FSA& f, unsigned int res) : State(f), _hash(0), _memory() + { + _memory.reserve(res); + } + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and memory values. 
+ * Reserves space for the memory string. + * + * @param f Pointer to FSA. + * @param res Number of characters to pre-reserve for the memory string. + */ + HashedMemoryState(const FSA* f, unsigned int res) : State(f), _hash(0), _memory() + { + _memory.reserve(res); + } + + /** + * @brief Copy constructor. + * + * Duplicate an existing hashed memory state. + * + * @param s Reference to hashed memory state to copy. + */ + HashedMemoryState(const HashedMemoryState& s) : State(s), + _hash(s._hash), + _memory(s._memory) {} + /** + * @brief Destructor. + */ + virtual ~HashedMemoryState() {} + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::start; + using State::delta; +#else + virtual bool start(symbol_t in) { start(); return delta(in); } + virtual bool start(const symbol_t *in) { start(); return delta(in); } + virtual bool start(const char *in) { start(); return delta(in); } + virtual bool start(const std::string &in) { start(); return delta(in); } + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * hash and memory in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _hash = 0; +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + _memory.clear(); +#else + _memory = ""; +#endif + return State::start(); + } + + /** + * @brief Delta transition for hashed memory states. + * + * Extends the State::delta() method with hash and memory update. + * + * @param in Input symbol. + * @return True if the resulting state is valid. 
+ */ + virtual bool delta(symbol_t in) + { + _hash += _fsa->hashDelta(_state,in); + bool ok = State::delta(in); + if(ok) + _memory += (char)in; // only remember valid transitions + return ok; + } + + /** + * @brief Get current hash value. + * + * For final states, returns the perfect hash value for the input + * string which led to the state. For any state (including + * final states) the value equals the number of strings accepted + * by the automaton which (in an alphabetical ordering) precede + * the string leading to the state. + * + * @return Hash value. + */ + virtual hash_t hash() const + { + return _hash; + } + + /** + * @brief Obsolete alias for hash(), for backwards compatibility. + * + * @return Hash value. + */ + virtual hash_t getHash() const + { + return _hash; + } + + /** + * @brief Get current memory value. + * + * The memory for a state stores the sequence of the + * transitions which led to the current state (or the last valid + * state). + * + * @return Memory value. + */ + virtual std::string memory() const + { + return _memory; + } + + /** + * @brief Alias for memory(). + * + * @return Memory value. + */ + virtual std::string getMemory() const + { + return _memory; + } + + }; + + // }}} + + // {{{ FSA::HashedCounterState + /** + * @class HashedCounterState + * @brief Class for FSA lookups with counter and hash. + * + * This class represents the state of a finite state automaton. It + * is connected to one FSA for its whole lifetime. Provides all + * methods of the FSA::State and counts the number of transitions, + * and computes hash value. + */ + class HashedCounterState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + HashedCounterState(); + /** + * @brief Unimplemented private assignment operator. + * + * Takes a HashedCounterState so that it is the copy assignment + * operator of this class, as in the other state classes. + */ + HashedCounterState& operator=(const HashedCounterState&); + + protected: + hash_t _hash; /**< Hash value. */ + uint32_t _counter; /**< Counter value. 
*/ + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and the counter. + * + * @param f Reference to FSA. + */ + HashedCounterState(const FSA& f) : State(f), _hash(0), _counter(0) {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and the counter. + * + * @param f Pointer to FSA. + */ + HashedCounterState(const FSA* f) : State(f), _hash(0), _counter(0) {} + + /** + * @brief Copy constructor. + * + * Duplicate an existing hashed counter state. + * + * @param s Reference to hashed counter state to copy. + */ + HashedCounterState(const HashedCounterState& s) : State(s), _hash(s._hash), _counter(s._counter) {} + + /** + * @brief Destructor. + */ + virtual ~HashedCounterState() {} + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::start; + using State::delta; +#else + virtual bool start(symbol_t in) { start(); return delta(in); } + virtual bool start(const symbol_t *in) { start(); return delta(in); } + virtual bool start(const char *in) { start(); return delta(in); } + virtual bool start(const std::string &in) { start(); return delta(in); } + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * hash and the counter in addition. + * + * @return True if the resulting state is valid. 
+ */ + virtual bool start() + { + _hash = 0; + _counter = 0; + return State::start(); + } + + /** + * @brief Delta transition for hashed counter states. + * + * Extends the State::delta() method with counter increment and + * hash update. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + _hash += _fsa->hashDelta(_state,in); + bool ok = State::delta(in); + if(ok) + ++_counter; // only count valid transitions + return ok; + } + + /** + * @brief Get current hash value. + * + * For final states, returns the perfect hash value for the input + * string which led to the state. For any state (including + * final states) the value equals the number of strings accepted + * by the automaton which (in an alphabetical ordering) precede + * the string leading to the state. + * + * @return Hash value. + */ + virtual hash_t hash() const + { + return _hash; + } + + /** + * @brief Obsolete alias for hash(), for backwards compatibility. + * + * @return Hash value. + */ + virtual hash_t getHash() const + { + return _hash; + } + + /** + * @brief Get current counter value. + * + * Return the current counter. The counter is the number of + * transitions from the start state to the current state. + * If the state is not valid anymore, the counter is the number of + * transitions to the last valid state. + * + * @return Counter value. + */ + virtual uint32_t counter() const + { + return _counter; + } + + /** + * @brief An alias for counter(). + * + * @return Counter value. + */ + virtual uint32_t getCounter() const + { + return _counter; + } + + }; + // }}} + + // {{{ FSA::HashedWordCounterState + /** + * @class HashedWordCounterState + * @brief Class for FSA lookups with word counter and hash. + * + * This class is similar to HashedCounterState, but it counts whole word + * transitions. Operations other than start(void), startWord(const std::string&) + * or deltaWord(const std::string&) will not modify the counter. 
+ */ + class HashedWordCounterState : public State { + + private: + /** + * @brief Unimplemented private default constructor. + */ + HashedWordCounterState(); + /** + * @brief Unimplemented private assignment operator. + */ + HashedWordCounterState& operator=(const HashedWordCounterState&); + + protected: + hash_t _hash; /**< Hash value. */ + uint32_t _counter; /**< Counter value. */ + + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + using State::delta; +#else + virtual bool delta(const symbol_t *in) + { + const symbol_t *p=in; + while(*p && _state>0){ + delta(*p); + p++; + } + return _state!=0; + } + virtual bool delta(const char *in) { return delta((const symbol_t *)in); } + virtual bool delta(const std::string &in) + { + unsigned int idx=0; + + while(idx<in.length() && _state>0){ + delta(in[idx]); + idx++; + } + return _state!=0; + } +#endif + + /** + * @brief Delta transition for hashed word counter states. + * + * Extends the State::delta() method with hash update. It is + * protected so it is not accessible outside (only deltaWord is). + * The counter is not touched here. + * + * @param in Input symbol. + * @return True if the resulting state is valid. + */ + virtual bool delta(symbol_t in) + { + _hash += _fsa->hashDelta(_state,in); + bool ok = State::delta(in); + return ok; + } + + public: + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and the counter. + * + * @param f Reference to FSA. + */ + HashedWordCounterState(const FSA& f) : State(f), _hash(0), _counter(0) {} + + /** + * @brief Constructor. + * + * Create a new state from an FSA, and set it to the starting + * state of the automaton. Also reset the hash and the counter. + * + * @param f Pointer to FSA. + */ + HashedWordCounterState(const FSA* f) : State(f), _hash(0), _counter(0) {} + + /** + * @brief Copy constructor. + * + * Duplicate an existing hashed word counter state. + * + * @param s Reference to hashed word counter state to copy. 
+ */ + HashedWordCounterState(const HashedWordCounterState& s) : State(s), _hash(s._hash), _counter(s._counter) {} + + /** + * @brief Destructor. + */ + virtual ~HashedWordCounterState() {} + + /** + * @brief Set the state to the starting state of the automaton. + * + * This method overrides the State::start() method, and resets the + * hash and the counter in addition. + * + * @return True if the resulting state is valid. + */ + virtual bool start() + { + _hash = 0; + _counter = 0; + return State::start(); + } + + /** + * @brief Start and transition. + * + * Sets the state to the starting state of the automaton, and + * performs a transition using an input word. + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool startWord(const std::string &in) + { + start(); + return deltaWord(in); + } + + /** + * @brief Delta transition. + * + * Perform a delta transition using an input word. A word + * separator symbol ` ` is inserted before the word if it is not + * the first word (the current state is not the start state). + * An empty word leaves the state, the hash and the counter + * unchanged. + * + * @param in Input word. + * @return True if the resulting state is valid. + */ + virtual bool deltaWord(const std::string &in) + { + if(in.length()==0){ + return _state!=0; + } + if(_state!=_fsa->start()) + delta(' '); + bool ok = delta(in); + if(ok) + ++_counter; // only count valid word transitions + return ok; + } + + /** + * @brief Get current hash value. + * + * For final states, returns the perfect hash value for the input + * string which led to the state. For any state (including + * final states) the value equals the number of strings accepted + * by the automaton which (in an alphabetical ordering) precede + * the string leading to the state. + * + * @return Hash value. + */ + virtual hash_t hash() const + { + return _hash; + } + + /** + * @brief Obsolete alias for hash(), for backwards compatibility. + * + * @return Hash value. 
+ */ + virtual hash_t getHash() const + { + return _hash; + } + + /** + * @brief Get current counter value. + * + * Return the current counter. The counter is the number of + * word transitions from the start state to the current state. + * If the state is not valid anymore, the counter is the number of + * word transitions to the last valid state. + * + * @return Counter value. + */ + virtual uint32_t counter() const + { + return _counter; + } + + /** + * @brief An alias for counter(). + * + * @return Counter value. + */ + virtual uint32_t getCounter() const + { + return _counter; + } + + }; + + // }}} + +#if (__GNUG__ < 3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1)) + friend class State; + friend class HashedState; + friend class MemoryState; + friend class HashedMemoryState; + friend class CounterState; + friend class HashedCounterState; + friend class WordCounterState; + friend class HashedWordCounterState; +#endif + +public: + /** + * @brief Magic number for identifying fsa files. + */ + static const uint32_t MAGIC = 0x79832469; + + /** + * @brief Version number. + * + * Version number for identifying the fsa library and files. The + * format is MMMmmmrrr, M=major, m=minor, r=revision. 1000 equals + * 0.1.0, so the value 2000001 below stands for 2.0.1. + */ + static const uint32_t VER = 2000001; + + /** + * @brief Library version number. + * + * Static method which returns the library version. + */ + static uint32_t libVER(); + + /** + * @brief Reserved symbol used for empty cells in internal tables. + */ + static const symbol_t EMPTY_SYMBOL = 0x00; + + /** + * @brief Reserved symbol used for final states in internal tables. + */ + static const symbol_t FINAL_SYMBOL = 0xff; + + /** + * @brief Type of data items for final states. + * + * Type of data items for final states. The possible values are: + * - DATA_VARIABLE (0) - variable size data items, the size is + * stored with each item + * - DATA_FIXED (1) - fixed size data items. The size is only + * stored once in the header. 
+ */ + enum Data_Type { + DATA_VARIABLE = 0, + DATA_FIXED + }; + + /** + * @struct Header + * @brief %FSA header. + * + * Header structure of the %FSA files. + */ + struct Header { + uint32_t _magic; /**< Magic number. */ + uint32_t _version; /**< Version number. */ + uint32_t _checksum; /**< Checksum. */ + uint32_t _size; /**< Size of fsa (cells). */ + uint32_t _start; /**< Start state. */ + uint32_t _data_size; /**< Size of data. */ + uint32_t _data_type; /**< Type of data items. */ + uint32_t _fixed_data_size; /**< Data item size if fixed. */ + uint32_t _has_perfect_hash; /**< Indicator for perfect hash. */ + uint32_t _serial; /**< Serial number. */ + uint32_t _reserved[54]; /**< Reserved (pads size to 256 bytes). */ + }; + + /** + * @struct Descriptor + * @brief %FSA descriptor. + * + * %FSA descriptor for creating FSA objects directly from Automaton + * objects (used by Automaton::getFSA()). + */ + struct Descriptor { + uint32_t _version; + uint32_t _serial; + state_t *_state; + symbol_t *_symbol; + uint32_t _size; + data_t *_data; + uint32_t _data_size; + uint32_t _data_type; + uint32_t _fixed_data_size; + hash_t *_perf_hash; + uint32_t _start; + }; + +private: + + static const FileAccessMethod _default_file_access_method = FILE_ACCESS_MMAP; /**< Default file access method (read/mmap). */ + + void *_mmap_addr; /**< mmap address, NULL if the file has not been mmapped. */ + size_t _mmap_length; /**< mmap length. */ + + uint32_t _version; /**< Version of fsalib used to build this fsa. */ + uint32_t _serial; /**< Serial number of this fsa. */ + + state_t *_state; /**< State table for transitions. */ + symbol_t *_symbol; /**< Symbol table for transitions. */ + uint32_t _size; /**< Size (number of cells). */ + + data_t *_data; /**< Data storage. */ + uint32_t _data_size; /**< Size of data storage. */ + uint32_t _data_type; /**< Type of data items (fixed or var.) */ + uint32_t _fixed_data_size; /**< Size of data items if fixed. 
*/ + + bool _has_perfect_hash; /**< Indicator of perfect hash present. */ + hash_t *_perf_hash; /**< Perfect hash table, if present. */ + + state_t _start; /**< Index of start state. */ + + bool _ok; /**< Flag set if object initialization succeeded. */ + +public: + + /** + * @brief Constructor. + * + * Initializes the object from an fsa file. + * + * @param file Name of fsa file. + * @param fam File access mode (read or mmap). If not set, the + * global default access mode will be used. + */ + FSA(const char *file, FileAccessMethod fam = FILE_ACCESS_UNDEF); + FSA(const std::string &file, FileAccessMethod fam = FILE_ACCESS_UNDEF); + + /** + * @brief Destructor. + */ + virtual ~FSA(); + + /** + * @brief Check if initialization was successful. + * + * @return True if the initialization of the object succeeded. + */ + bool isOk() const + { + return _ok; + } + + /** + * @brief Get the fsa library version used for building this %FSA. + * + * @return fsa library version. + */ + uint32_t version(void) const + { + return _version; + } + + /** + * @brief Get the serial number of the %FSA. + * + * @return Serial number. + */ + uint32_t serial(void) const + { + return _serial; + } + + /** + * @brief Check if the %FSA has perferct hash. + * + * @return True if the %FSA was built with perfect hash. + */ + bool hasPerfectHash(void) const + { + return _has_perfect_hash; + } + + /** + * @brief Get the start state of the %FSA. + * + * @return Index of the start state (0 if the %FSA is empty). + */ + state_t start() const + { + return _start; + } + + /** + * @brief Perform a delta transition. + * + * Performs a delta transtion in the automaton. The input is the + * index of the current state and an input symbol, and the return + * value is the index of the new state. + * + * @param fs Index of current state. + * @param in Input symbol. + * @return Index of new state. 
+ */ + state_t delta(state_t fs, symbol_t in) const + { + // fs!=0 check is unnecessary, as state 0 is never packed so _symbol[in]!=in always. + // if(!fs) + // return 0; + state_t nfs=fs+in; + if(_symbol[nfs]==in) + return _state[nfs]; + else + return 0; + } + + /** + * @brief Get hash delta for a transition. + * + * The perfect hash value for a final state is obtained from the sum + * of hash deltas for the transitions leading to that state. + * + * @param fs Index of current state. + * @param in Input symbol. + * @return Hash delta for the transition. + */ + hash_t hashDelta(state_t fs, symbol_t in) const + { + if(_has_perfect_hash && fs!=0 && _symbol[fs+in]==in) + return _perf_hash[fs+in]; + else + return 0; + } + + /** + * @brief Check if the state is a final (accepting) state. + * + * @param fs State. + * @return True if the state is final. + */ + bool isFinal(state_t fs) const + { + if(fs==0) + return false; + return _symbol[fs+FINAL_SYMBOL]==FINAL_SYMBOL; + } + + /** + * @brief Reverse lookup. + * + * For a given hash value, return the corresponding string. + * + * @param hash Hash value. + * @return String corresponding to hash value, or empty string if + * the fsa has no perfect hash or the hash value is out of + * range. + */ + std::string revLookup(hash_t hash) const; + + /** + * @brief Get the size of data item associated with a final state. + * + * @param fs State. + * @return Size of data item, or -1 if the state is not final. + */ + int dataSize(state_t fs) const + { + if(fs==0) + return -1; + if(_symbol[fs+FINAL_SYMBOL]==FINAL_SYMBOL){ + if(_data_type==DATA_FIXED) + return _fixed_data_size; + else + return (int)(*((uint32_t*)(_data+_state[fs+FINAL_SYMBOL]))); + } + return -1; + } + + /** + * @brief Get a pointer to the data item associated with a final state. + * + * @param fs State. + * @return Pointer to data item, or NULL if the state is not final. 
+ */ + const data_t *data(unsigned int fs) const + { + if(fs==0) + return NULL; + if(_symbol[fs+FINAL_SYMBOL]==FINAL_SYMBOL){ + if(_data_type==DATA_FIXED) + return _data+_state[fs+FINAL_SYMBOL]; + else + /* variable size items are stored behind their uint32_t size field (see dataSize()), skip it */ + return _data+_state[fs+FINAL_SYMBOL]+sizeof(uint32_t); + } + return NULL; + } + + /** + * @brief Print the fsa in dot (graphviz) format. + * + * @param out Output stream (std::cout if omitted). + */ + void printDot(std::ostream &out=std::cout) const; + + /** + * @brief Get iterator pointing to the beginning of the fsa. + * + * @return iterator pointing to the first string in the fsa. + */ + FSA::iterator begin() const { return FSA::iterator(this); } + + /** + * @brief Get iterator pointing past the end of the fsa. + * + * @return iterator pointing past the last string in the fsa. + */ + FSA::iterator end() const { return FSA::iterator(this,true); } + +private: + + /** + * @brief Unimplemented private default constructor. + */ + FSA(); + /** + * @brief Unimplemented private copy constructor. + */ + FSA(const FSA&); + /** + * @brief Unimplemented private assignment operator. + */ + const FSA& operator=(const FSA&); + + /** + * Automaton needs access to a private constructor. + */ + friend class Automaton; + + /** + * @brief Constructor. + * + * Initializes the object from ready memory buffers. + * (Used by Automaton::PackedAutomaton::getFSA.) No file is mapped, so + * the mmap address and length are cleared, and the object is flagged + * as successfully initialized so that isOk() does not read an + * uninitialized member. + * + * @param d Descriptor containing all FSA parameters. + */ + FSA(Descriptor &d) : + _mmap_addr(NULL), _mmap_length(0), + _version(d._version), _serial(d._serial), + _state(d._state), _symbol(d._symbol), _size(d._size), + _data(d._data), _data_size(d._data_size), _data_type(d._data_type), + _fixed_data_size(d._fixed_data_size), + _has_perfect_hash(d._perf_hash!=NULL),_perf_hash(d._perf_hash), + _start(d._start), _ok(true) + { + } + + /** + * @brief Reset the object. + * + * Resets the object to an empty %FSA, and releases allocated memory. + */ + void reset(); + + /** + * @brief Read the %FSA from file. 
+ * + * Reads the %FSA from a file. Returns true on success. + * + * @param filename Name of fsa file. + * @return True on success. + */ + bool read(const char *filename, FileAccessMethod fam = FILE_ACCESS_UNDEF); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/metadata.cpp b/fsa/src/vespa/fsa/metadata.cpp new file mode 100644 index 00000000000..2a9d511cc91 --- /dev/null +++ b/fsa/src/vespa/fsa/metadata.cpp @@ -0,0 +1,137 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file metadata.cpp + * @brief Generic meta data class implementation. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "metadata.h" +#include "fstream" + +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/mman.h> // for ::mmap() +#include <sys/time.h> +#include <sys/resource.h> // for getrlimit(), setrlimit(), etc. 
+ +namespace fsa { + +// {{{ constants + +const uint32_t MetaData::MAGIC; + +// }}} + +// {{{ MetaData::MetaData() + +MetaData::MetaData(const char *datafile, FileAccessMethod fam) : _mmap_addr(NULL), _mmap_length(0), _ok(false), _header(), _data(NULL) +{ + _ok = read(datafile,fam); +} + +MetaData::MetaData(const std::string &datafile, FileAccessMethod fam) : _mmap_addr(NULL), _mmap_length(0), _ok(false), _header(), _data(NULL) +{ + _ok = read(datafile.c_str(),fam); +} + +// }}} +// {{{ MetaData::~MetaData() + +MetaData::~MetaData() +{ + reset(); +} + +// }}} + +// {{{ MetaData::reset() + +void MetaData::reset() +{ + if(_mmap_addr!=NULL && _mmap_addr!=MAP_FAILED){ + munmap(_mmap_addr,_mmap_length); + } + else{ + if(_data!=NULL) free(_data); + } + _mmap_addr=NULL; _mmap_length=0; + _ok=false; + _data=NULL; +} + +// }}} +// {{{ MetaData::read() + +/** + * Load a metadata file. The header is read and its magic number is + * checked first; then either the whole file is mapped (mmap access + * modes) or the data part is read into a malloc'ed buffer. + * + * @param datafile Name of the metadata file (NULL is rejected). + * @param fam File access mode, FILE_ACCESS_UNDEF selects the default. + * @return True on success. On failure no mapping or buffer is kept. + */ +bool MetaData::read(const char *datafile, FileAccessMethod fam) +{ + ssize_t r; + + reset(); + + if(fam==FILE_ACCESS_UNDEF) + fam=_default_file_access_method; + + if(datafile==NULL) + return false; + + int fd = ::open(datafile,O_RDONLY); + if(fd<0) + return false; + + r=::read(fd,&_header,sizeof(_header)); + if(r<0 || (size_t)r!=sizeof(_header) || _header._magic!=MetaData::MAGIC){ + ::close(fd); + return false; + } + + if(fam==FILE_ACCESS_MMAP || fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + _mmap_length = sizeof(_header) + _header._size; + /* reject truncated files up front, like the read path below does: touching mapped pages beyond the end of the file raises SIGBUS */ + off_t fsize = ::lseek(fd,0,SEEK_END); + if(fsize<0 || (uint64_t)fsize<(uint64_t)_mmap_length){ + ::close(fd); + reset(); + return false; + } + _mmap_addr = ::mmap((void*)0, _mmap_length, PROT_READ, MAP_SHARED, fd, 0); + if(_mmap_addr==MAP_FAILED){ + ::close(fd); + reset(); + return false; + } + if(fam==FILE_ACCESS_MMAP_WITH_MLOCK){ + if(mlock(_mmap_addr, _mmap_length)<0) { + /* try to increase RLIMIT_MEMLOCK then mlock() again */ + struct rlimit rl; + if(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0) { + /* leave RLIM_INFINITY alone: adding to it would wrap around and setrlimit() would then lower the limit */ + if(rl.rlim_cur!=RLIM_INFINITY) + rl.rlim_cur += _mmap_length + getpagesize(); + if(rl.rlim_max!=RLIM_INFINITY) + rl.rlim_max += _mmap_length + getpagesize(); + if(setrlimit(RLIMIT_MEMLOCK, &rl) >= 0) + mlock(_mmap_addr, _mmap_length); + } + } + } + } + + if(_mmap_addr==NULL){ + _data = malloc(_header._size); + 
 r=::read(fd,_data,_header._size); + if(r<0 || (size_t)r!=_header._size){ + ::close(fd); + reset(); + return false; + } + } + else { + _data = (void*)((uint8_t*)_mmap_addr + sizeof(_header)); + } + + ::close(fd); + + return true; +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/metadata.h b/fsa/src/vespa/fsa/metadata.h new file mode 100644 index 00000000000..132ecb1d157 --- /dev/null +++ b/fsa/src/vespa/fsa/metadata.h @@ -0,0 +1,177 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/12/17 + * @version $Id$ + * @file metadata.h + * @brief Generic metadata associated with perfect hash values. The + * data structure is completely up to the user, but it is + * usually an array of fixed size records indexed by the + * perfect hash value, or it contains an index which maps the + * perfect hash values to variable size records. + * + */ + +#pragma once + +#include <stdlib.h> +#include "fsa.h" + + +namespace fsa { + +// {{{ class MetaData + +/** + * @class MetaData + * @brief Class for representing generic metadata. + * + * Generic metadata associated with perfect hash values. The data + * structure is completely up to the user, but it is usually an array + * of fixed size records indexed by the perfect hash value, or it + * contains an index which maps the perfect hash values to variable + * size records. + */ +class MetaData { + +public: + + class Handle; // defined in metadatahandle.h + +private: + static const uint32_t MAGIC = 0x873EA98B; /**< Magic number identifying metadata files. */ + + static const FileAccessMethod _default_file_access_method = FILE_ACCESS_MMAP; /**< Default file access method (read/mmap). */ + + /** + * @struct Header + * @brief Metadata file header. + */ + struct Header { + uint32_t _magic; /**< Magic number. */ + uint32_t _version; /**< Version number. (currently not used) */ + uint32_t _checksum; /**< Checksum. 
(currently not used) */ + uint32_t _size; /**< Size of the data (bytes following the header). */ + uint32_t _reserved[10]; /**< Reserved for later use. */ + uint32_t _user[50]; /**< User defined fields. */ + }; + + void *_mmap_addr; /**< mmap address, NULL if file has not been mmapped. */ + size_t _mmap_length; /**< mmap length. */ + + bool _ok; /**< Flag indicating successful initialization. */ + Header _header; /**< Header as read from the metadata file. */ + void *_data; /**< Start of the data following the header (inside the mapping, or a malloc'ed buffer). */ + + /** + * @brief Reset the object. + * + * Resets the object to an empty %MetaData, and releases allocated memory. + */ + void reset(); + + /** + * @brief Read the metadata file from disk. + * + * @param datafile Name of the metadata file. + * @param fam File access mode (read or mmap). If not set, the + * global default access mode will be used. + * @return True on success. + */ + bool read(const char *datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF); + + /** + * @brief Unimplemented private default constructor. + */ + MetaData(); + /** + * @brief Unimplemented private copy constructor. + */ + MetaData(const MetaData&); + /** + * @brief Unimplemented private assignment operator. + */ + const MetaData& operator=(const MetaData&); + +public: + + /** + * @brief Constructor. + * + * @param datafile Metadata file. + * @param fam File access mode (read or mmap). If not set, the + * global default access mode will be used. + */ + MetaData(const char *datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF); + MetaData(const std::string &datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF); + + /** + * @brief Destructor. + */ + virtual ~MetaData(); + + /** + * @brief Check if initialization was successful. + * + * @return True if the initialization of the object succeeded. + */ + bool isOk() const + { + return _ok; + } + + /** + * @brief Get user defined header field + * + * @param idx Field index + * @return Header field value, or 0 if the object is not initialized + * or idx is not below 50. 
+ */ + uint32_t user(unsigned int idx) const + { + if(_ok && idx<50) + return _header._user[idx]; + else + return 0; + } + /** + * @brief Get an entry of the data viewed as an array of uint32_t. + * + * The index is not range checked. + * + * @param idx Index of the entry. + * @return Value of the entry, or 0 if the object is not initialized. + */ + uint32_t getUIntEntry(uint32_t idx) const + { + if(_ok){ + return ((const uint32_t*)_data)[idx]; + } + else + return 0; + } + /** + * @brief Get a pointer to a fixed size record. + * + * The record is located idx*size bytes from the start of the data. + * The result is not range checked. + * + * @param idx Index of the record. + * @param size Size of one record in bytes. + * @return Pointer to the record, or NULL if the object is not initialized. + */ + const void *getDirectRecordEntry(uint32_t idx, uint32_t size) const + { + if(_ok) + return (const void*)((const uint8_t*)_data+idx*size); + else + return NULL; + } + /** + * @brief Get a pointer to a record located through an offset table. + * + * Entry idx of the data, viewed as an array of uint32_t, is used as + * the byte offset of the record from the start of the data. Neither + * the index nor the offset is range checked. + * + * @param idx Index into the offset table. + * @return Pointer to the record, or NULL if the object is not initialized. + */ + const void *getIndirectRecordEntry(uint32_t idx) const + { + if(_ok){ + uint32_t offset=((const uint32_t*)_data)[idx]; + return (const void*)((const uint8_t*)_data+offset); + } + else + return NULL; + } + /** + * @brief Get a character pointer into the data. + * + * The offset is not range checked. + * + * @param offset Byte offset from the start of the data. + * @return Pointer to the data at offset, or NULL if the object is not + * initialized. + */ + const char *getCharPtrEntry(uint32_t offset) const + { + if(_ok) + return ((const char*)_data)+offset; + else + return NULL; + } + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/ngram.cpp b/fsa/src/vespa/fsa/ngram.cpp new file mode 100644 index 00000000000..050b9eff035 --- /dev/null +++ b/fsa/src/vespa/fsa/ngram.cpp @@ -0,0 +1,285 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file ngram.cpp + * @brief n-gram class for tokenized text. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "ngram.h" +#include "wordchartokenizer.h" + +#include <ctype.h> +#include <iostream> + +namespace fsa { + +// {{{ NGram::NGram() + +NGram::NGram(const char *text, unsigned int from, int length) : _tokens() +{ + append(text,from,length); +} + +NGram::NGram(const char *text, Tokenizer &tokenizer, unsigned int from, int length) : _tokens() +{ + append(text,tokenizer,from,length); +} + +NGram::NGram(const NGram &g, unsigned int from, int length) : _tokens() +{ + append(g,from,length); +} + +NGram::NGram(const NGram &g, const Selector &select) : _tokens() +{ + append(g,select); +} + +NGram::NGram(const NGram &g, const Permuter &p, unsigned int id) : _tokens() +{ + append(g,p,id); +} + +NGram::NGram(const std::string &s, unsigned int from, int length) : _tokens() +{ + append(s,from,length); +} + +NGram::NGram(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) : _tokens() +{ + append(s,tokenizer,from,length); +} + +// }}} +// {{{ NGram::set() + +void NGram::set(const char *text, unsigned int from, int length) +{ + clear(); + append(text,from,length); +} + +void NGram::set(const char *text, Tokenizer &tokenizer, unsigned int from, int length) +{ + clear(); + append(text,tokenizer,from,length); + +} + +void NGram::set(const NGram &g, unsigned int from, int length) +{ + if(this==&g){ + set(NGram(g),from,length); + } + else{ + clear(); + append(g,from,length); + } +} + +void NGram::set(const NGram &g, const Selector &select) +{ + if(this==&g){ + set(NGram(g),select); + } + else{ + clear(); + append(g,select); + } +} + +void NGram::set(const NGram &g, const Permuter &p, unsigned int id) +{ + if(this==&g){ + set(NGram(g),p,id); + } + else{ + clear(); + append(g,p,id); + } +} + +void NGram::set(const std::string &s, unsigned int from, int length) +{ + clear(); + append(s,from,length); +} + +void NGram::set(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) +{ + 
clear(); + append(s,tokenizer,from,length); +} + +// }}} +// {{{ NGram::setOne() + +void NGram::setOne(const std::string &s) +{ + clear(); + appendOne(s); +} + +// }}} +// {{{ NGram::append() + +void NGram::append(const char *text, unsigned int from, int length) +{ + WordCharTokenizer tokenizer; + append(text,tokenizer,from,length); +} + +void NGram::append(const char *text, Tokenizer &tokenizer, unsigned int from, int length) +{ + append(std::string(text),tokenizer,from,length); +} + + +void NGram::append(const NGram &g, unsigned int from, int length) +{ + if(this==&g){ + append(NGram(g),from,length); + return; + } + + if(length<0 || from+length>g._tokens.size()) length=g._tokens.size()-from; + + if(length>0){ + for(unsigned int i=from; i<from+length; i++){ + _tokens.push_back(g._tokens[i]); + } + } +} + +void NGram::append(const NGram &g, const Selector &select) +{ + if(this==&g){ + append(NGram(g),select); + return; + } + + for(unsigned int i=0; i<g._tokens.size()&&i<select.size(); i++){ + if(select[i]) + _tokens.push_back(g._tokens[i]); + } +} + +void NGram::append(const NGram &g, const Permuter &p, unsigned int id) +{ + if(this==&g){ + append(NGram(g),p,id); + return; + } + + std::string perm=p.getPerm(id); + + for(unsigned int i=0;i<perm.length();i++){ + if(perm[i]>0 && perm[i]<=(int)g._tokens.size()){ + _tokens.push_back(g._tokens[perm[i]-1]); + } + } +} + +void NGram::append(const std::string &s, unsigned int from, int length) +{ + WordCharTokenizer tokenizer; + append(s,tokenizer,from,length); +} + +void NGram::append(const std::string &s, Tokenizer &tokenizer, unsigned int from, int length) +{ + tokenizer.init(s); + unsigned int i=0; + while(i<from && tokenizer.hasMore()){ + tokenizer.getNext(); + i++; + } + + i=0; + while(tokenizer.hasMore() && (length<0 || (int)i<length)){ + appendOne(tokenizer.getNext()); + i++; + } +} + +// }}} +// {{{ NGram::appendOne() + +void NGram::appendOne(const std::string &s) +{ + _tokens.push_back(s); +} + +// }}} +// {{{ 
NGram::uniq() + +unsigned int NGram::uniq() +{ + std::vector<std::string>::iterator pos; + + pos = std::unique(_tokens.begin(),_tokens.end()); + _tokens.erase(pos,_tokens.end()); + return _tokens.size(); +} + +// }}} +// {{{ NGram::join() + +std::string NGram::join(const std::string &separator, unsigned int from, int length) const +{ + unsigned int to = _tokens.size(); + if(length!=-1 && from+length<to) + to=from+length; + + std::string dest; + if(to>from) + dest=_tokens[from]; + for(unsigned i=from+1;i<to;i++){ + dest+=separator; + dest+=_tokens[i]; + } + + return dest; +} + +// }}} +// {{{ NGram::getPermIdTo() + +int NGram::getPermIdTo(const NGram &g, const Permuter &p) const +{ + if(_tokens.size()!=g._tokens.size()) + return -1; + + std::string perm(_tokens.size(),'\0'); + for(unsigned int i=0;i<_tokens.size();i++){ + for(unsigned int j=0;j<g._tokens.size();j++){ + if(_tokens[i]==g._tokens[j]){ + perm[j]=i+1; + } + } + } + return p.getPermId(perm); +} + +// }}} + +// {{{ operator<< + +std::ostream& operator<<(std::ostream &out, const NGram &g) +{ + for(unsigned int i=0;i<g._tokens.size();i++){ + if(i>0) out<<" "; + out<<g._tokens[i]; + } + return out; +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/ngram.h b/fsa/src/vespa/fsa/ngram.h new file mode 100644 index 00000000000..32f739e3533 --- /dev/null +++ b/fsa/src/vespa/fsa/ngram.h @@ -0,0 +1,433 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file ngram.h + * @brief n-gram class for tokenized text. + */ + +#pragma once + +#include <iostream> +#include <vector> +#include <string> +#include <algorithm> + +#include "unicode.h" +#include "selector.h" +#include "permuter.h" +#include "tokenizer.h" + +namespace fsa { + +// {{{ class NGram + +/** + * @class NGram + * @brief Class for representing n-grams. 
+ * + * Supports tokenization and various manipulation methods, such as + * join, sort, uniq, etc. + */ +class NGram { + +public: + +private: + std::vector<std::string> _tokens; /**< Vector holding the tokens. */ + +public: + /** + * @brief Default constructor, creates empty NGram. + */ + NGram() : _tokens() {} + + /** + * @brief Constructor. + * + * Creates an NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using unicode wordchar property. For certain puctuation + * strategies, a special puctuation token is inserted if a puctuation + * character is found. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + NGram(const char *text, + unsigned int from=0, int length=-1); + + /** + * @brief Constructor. + * + * Creates an NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using the supplied tokienizer. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. + */ + NGram(const char *text, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief (Sort of) Copy constructor. + * + * @param g NGram object to copy. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + NGram(const NGram &g, unsigned int from=0, int length=-1); + + /** + * @brief (Sort of) Copy constructor. + * + * Copy selected tokens from an NGram objects. + * + * @param g NGram object to copy. + * @param select Selector indicating which tokens to copy. + */ + NGram(const NGram &g, const Selector &select); + + /** + * @brief (Sort of) Copy constructor. + * + * Create a new NGram and permute the tokens. + * + * @param g NGram object to copy. 
+ * @param p Permuter object. + * @param id Permutation ID. + */ + NGram(const NGram &g, const Permuter &p, unsigned int id); + + /** + * @brief Constructor. + * + * Creates an NGram object from a utf-8 encoded std::string. The + * string is tokenized using unicode wordchar property. For certain + * puctuation strategies, a special puctuation token is inserted if + * a puctuation character is found. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + NGram(const std::string &s, + unsigned int from=0, int length=-1); + + /** + * @brief Constructor. + * + * Creates an NGram object from a utf-8 encoded std::string. The + * string is tokenized using the supplied tokenizer. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. + */ + NGram(const std::string &s, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * Reinitalizes the NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using unicode wordchar property. For certain puctuation + * strategies, a special puctuation token is inserted if a puctuation + * character is found. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void set(const char *text, + unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * Reinitalizes the NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using the supplied tokenizer. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. 
+ */ + void set(const char *text, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * @param g NGram object to copy. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void set(const NGram &g, unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * Copy selected tokens from an NGram objects. + * + * @param g NGram object to copy. + * @param select Selector indicating which tokens to copy. + */ + void set(const NGram &g, const Selector &select); + + /** + * @brief Set the object. + * + * Set the object from another NGram with permuting the tokens. + * + * @param g NGram object to copy. + * @param p Permuter object. + * @param id Permutation ID. + */ + void set(const NGram &g, const Permuter &p, unsigned int id); + + /** + * @brief Set the object. + * + * Reinitalizes the NGram object from a utf-8 encoded + * std::string. The string is tokenized using unicode wordchar + * property. For certain puctuation strategies, a special puctuation + * token is inserted if a puctuation character is found. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void set(const std::string &s, + unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * Reinitalizes the NGram object from a utf-8 encoded + * std::string. The string is tokenized using the supplied tokenizer. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. + */ + void set(const std::string &s, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief Set the object. + * + * Reinitalizes the object from an std::string, as a single token. + * + * @param s Input string. 
+ */ + void setOne(const std::string &s); + + /** + * @brief Append tokens to the object. + * + * Appends tokens to the NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using unicode wordchar property. For certain puctuation + * strategies, a special puctuation token is inserted if a + * puctuation character is found. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void append(const char *text, + unsigned int from=0, int length=-1); + + /** + * @brief Append tokens to the object. + * + * Appends tokens to the NGram object from a utf-8 encoded character + * string. The string must be zero terminated. The string is + * tokenized using the supplied tokenizer. + * + * @param text Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. + */ + void append(const char *text, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief Append tokens to the object. + * + * @param g NGram object to append. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void append(const NGram &g, unsigned int from=0, int length=-1); + + /** + * @brief Append tokens to the object. + * + * Append selected tokens from an NGram objects. + * + * @param g NGram object to append. + * @param select Selector indicating which tokens to copy. + */ + void append(const NGram &g, const Selector &select); + + /** + * @brief Append tokens to the object. + * + * Append a permuted NGram. + * + * @param g NGram object to append. + * @param p Permuter object. + * @param id Permutation ID. + */ + void append(const NGram &g, const Permuter &p, unsigned int id); + + /** + * @brief Append tokens to the object. 
+ * + * Appends tokens to the NGram object from a utf-8 encoded + * std::string. The string is tokenized using unicode wordchar + * property. For certain puctuation strategies, a special puctuation + * token is inserted if a puctuation character is found. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + */ + void append(const std::string &s, + unsigned int from=0, int length=-1); + + /** + * @brief Append tokens to the object. + * + * Appends tokens to the NGram object from a utf-8 encoded + * std::string. The string is tokenized using the supplied tokenizer. + * + * @param s Input text. + * @param from Starting token to keep (preceeding tokens are ignored). + * @param length Number of tokens to keep. + * @param tokenizer Tokenizer. + */ + void append(const std::string &s, + Tokenizer &tokenizer, + unsigned int from=0, int length=-1); + + /** + * @brief Append a single token to the object. + * + * Appends a single token from an std::string. + * + * @param s Input string. + */ + void appendOne(const std::string &s); + + + /** + * @brief Reset the object. + */ + void clear() { _tokens.clear(); } + + /** + * @brief Get the size of the n-gram (number of tokens). + * + * @return Number of tokens in n-gram. + */ + unsigned int size() const { return _tokens.size(); } + + /** + * @brief Get the length (size) of the n-gram (number of tokens). + * + * @return Number of tokens in n-gram. + */ + unsigned int length() const { return _tokens.size(); } + + /** + * @brief Sort the tokens lexicograpically. + */ + void sort() { std::sort(_tokens.begin(),_tokens.end()); } + + /** + * @brief Remove duplicate tokens from a sorted n-gram. + */ + unsigned int uniq(); + + /** + * @brief Reverse the order of the tokens. + */ + void reverse() { std::reverse(_tokens.begin(),_tokens.end()); } + + /** + * @brief Join the whole or parts of the n-gram to single string. 
+ * + * @param separator Separator string. + * @param from Starting token (default 0). + * @param length Number of tokens (default -1 which means all). + * @return Joined tokens. + */ + std::string join(const std::string &separator = " ", + unsigned int from=0, int length=-1) const; + + /** + * @brief Index operator. + * + * Provides access a token directly. The index must be in the range + * of 0..length()-1, this is not checked. + * + * @param i Index. + * @return Reference to token. + */ + std::string& operator[](unsigned int i) { return _tokens[i]; } + + /** + * @brief Index operator. + * + * Provides const access a token directly. The index must be in the + * range of 0..length()-1, this is not checked. + * + * @param i Index. + * @return Const reference to token. + */ + const std::string& operator[](unsigned int i) const { return _tokens[i]; } + + /** + * @brief Get permutation ID to another n-gram. + * + * Get permutation ID to another n-gram. The other n-gram should + * consist of the same tokens in different order. + * + * @param g The other n-gram. + * @param p Permuter object. + * @return Permutation ID. + */ + int getPermIdTo(const NGram &g, const Permuter &p) const; + + /** + * @brief Output operator. + * + * @param out Reference to output stream. + * @param g n-gram. + * @return Reference to output stream. + */ + friend std::ostream& operator<<(std::ostream &out, const NGram &g); +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/permuter.cpp b/fsa/src/vespa/fsa/permuter.cpp new file mode 100644 index 00000000000..a0d472e59fd --- /dev/null +++ b/fsa/src/vespa/fsa/permuter.cpp @@ -0,0 +1,135 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file permuter.cpp + * @brief Permuter class. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "permuter.h" + +namespace fsa { + +// {{{ Permuter::MAX_UNIT_LENGTH + +const unsigned int Permuter::MAX_UNIT_LENGTH; + +// }}} + +// {{{ Permuter::initRec() + +/** + * Recursively generate the permutations of input, each prepended to + * tail. A finished permutation (input exhausted) is appended to + * _permtab and its index is recorded in _permmap. + */ +void Permuter::initRec(const std::string &input, std::string tail) +{ + std::string temp; + int i; + + if(input.length()==0){ + _permtab.push_back(tail); + _permmap[tail] = _permtab.size()-1; + } + else{ + for(i=input.length()-1;i>=0;i--){ + temp = input; + temp.erase(i,1); + initRec(temp,input.substr(i,1)+tail); + } + } +} + +// }}} +// {{{ Permuter::Permuter() + +/** + * Build the seed string holding the values 1..MAX_UNIT_LENGTH and + * generate the table of all its permutations (MAX_UNIT_LENGTH! entries). + */ +Permuter::Permuter() : _permtab(), _permmap(), _size(0), _seed(MAX_UNIT_LENGTH,0) +{ + unsigned int i; + + _size = 1; + for(i=1;i<=MAX_UNIT_LENGTH;i++){ + _seed[i-1]=i; + _size*=i; + } + _permtab.reserve(_size); + + initRec(_seed,std::string()); +} + +// }}} +// {{{ Permuter::~Permuter() + +Permuter::~Permuter() +{ +} + +// }}} +// {{{ Permuter::getPermId() + +/** + * Look up the ID of a permutation. A permutation shorter than + * MAX_UNIT_LENGTH is completed with the corresponding tail of the seed + * before the lookup. + * + * @param perm Permutation string. + * @return Permutation ID, or -1 if perm is too long or not in the table. + */ +int Permuter::getPermId(const std::string &perm) const +{ + std::string t(perm); + + if(t.length()>MAX_UNIT_LENGTH) + return -1; + + if(t.length()<MAX_UNIT_LENGTH) + t+=_seed.substr(t.length(),MAX_UNIT_LENGTH-t.length()); + + const PermMapConstIterator pi = _permmap.find(t); + if(pi==_permmap.end()) + return -1; + else + return pi->second; +} + +// }}} +// {{{ Permuter::firstComb() + +/** + * Get the first combination of n items out of m as a bit mask. + * + * @param n Number of items to select (1..31, not more than m). + * @param m Total number of items (1..31). + * @return Mask with the lowest n bits set, or 0 for invalid arguments. + */ +unsigned int Permuter::firstComb(unsigned int n, unsigned int m) +{ + if(n==0 || n>31 || m==0 || m>31 || n>m) + return 0; + + /* shift an unsigned 1: (1<<31)-1 overflows int when n==31 */ + return (1u<<n)-1; +} + +// }}} +// {{{ Permuter::nextComb() + +/** + * Get the combination following c. For a mask c below 2^m this is the + * next larger value with the same number of bits set. + * + * @param c Current combination (bit mask). + * @param m Total number of items (1..31). + * @return Next combination, or 0 if c or m is invalid or the next value + * does not fit into m bits. + */ +unsigned int Permuter::nextComb(unsigned int c, unsigned int m) + +{ + if(c==0 || m==0 || m>31) + return 0; + + unsigned int x=c; + /* unsigned shift, m==31 must not shift into the sign bit of an int */ + unsigned int limit=1u<<m; + unsigned int mask, mask1,mask2; + + if(x&1){ + mask=2; + while(x&mask) mask<<=1; + x^=(mask+(mask>>1)); + } + else{ + mask=2; + while(!(x&mask)) mask<<=1; + mask1=mask2=0; + while(x&mask){ + mask1<<=1;mask1++; + mask2+=mask; + mask<<=1; + } + mask1>>=1; + x^=(mask+(mask1^mask2)); + } + + return 
(x<limit)?x:0; +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/permuter.h b/fsa/src/vespa/fsa/permuter.h new file mode 100644 index 00000000000..15b016ab733 --- /dev/null +++ b/fsa/src/vespa/fsa/permuter.h @@ -0,0 +1,65 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file permuter.h + * @brief Permuter class. + */ + +#pragma once + +#include <vector> +#include <map> +#include <string> + + +namespace fsa { + +// {{{ class Permuter + +/** + * @class Permuter + * @brief Permuter class. + */ +class Permuter { +private: + + static const unsigned int MAX_UNIT_LENGTH = 6; + + typedef std::vector<std::string> PermTab; + typedef std::vector<std::string>::iterator PermTabIterator; + typedef std::map<std::string,unsigned int> PermMap; + typedef std::map<std::string,unsigned int>::iterator PermMapIterator; + typedef std::map<std::string,unsigned int>::const_iterator PermMapConstIterator; + + PermTab _permtab; + PermMap _permmap; + unsigned int _size; + std::string _seed; + + void initRec(const std::string &input, std::string tail); + +public: + /** + * @brief Default constructor. + */ + Permuter(); + + /** + * @brief Destructor. + */ + ~Permuter(); + + std::string getPerm(unsigned int id) const { return _permtab[id]; } + int getPermId(const std::string &perm) const; + + static unsigned int firstComb(unsigned int n, unsigned int m); + static unsigned int nextComb(unsigned int c, unsigned int m); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/segmenter.cpp b/fsa/src/vespa/fsa/segmenter.cpp new file mode 100644 index 00000000000..91f5a611f13 --- /dev/null +++ b/fsa/src/vespa/fsa/segmenter.cpp @@ -0,0 +1,279 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file segmenter.cpp + * @brief Query segmenter based on %FSA (%Finite %State %Automaton) (implementation) + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> + +#include "segmenter.h" + + +namespace fsa { + +// {{{ Segmenter::Segments::initSingles + +void Segmenter::Segments::initSingles() +{ + for(unsigned int i=0;i<_text.size();i++){ + if(!_map.isValid(i,i+1)){ + _map.set(i,i+1,_segments.size()); + _segments.push_back(Segment(i,i+1,0)); + } + } +} + + +// }}} +// {{{ Segmenter::Segments::buildSegmentation + +void Segmenter::Segments::buildSegmentation(Segmenter::SegmentationMethod method) +{ + int i,j; + int n_txt=(int)_text.size(), n_sgm=_segments.size(); + int id,bestid; + int pos, next=n_txt; + unsigned int maxsc,conn; + int bestval,temp=0,bias; + std::vector<int> nextid(n_sgm,-1); + std::vector<unsigned int> maxScore(n_sgm,0); + + if(_segmentation[method]==NULL){ + _segmentation[method] = new Segmenter::Segmentation; + } + else { + _segmentation[method]->clear(); + } + + bias=0; + switch(method){ + case SEGMENTATION_WEIGHTED_BIAS100: + bias+=50; + case SEGMENTATION_WEIGHTED_BIAS50: + bias+=30; + case SEGMENTATION_WEIGHTED_BIAS20: + bias+=10; + case SEGMENTATION_WEIGHTED_BIAS10: + bias+=10; + case SEGMENTATION_WEIGHTED: + bestid=-1; + for(i=n_txt;i>=0;i--){ + bestid=-1;maxsc=0; + for(j=i+1;j<=n_txt;j++){ + id=_map.get(i,j); + if(id>=0 && maxScore[id]+1>maxsc) { + bestid=id; + maxsc=maxScore[id]+1; + } + } + if(maxsc>0) maxsc--; + for(j=0;j<i;j++){ + id=_map.get(j,i); + if(id>=0){ + nextid[id] = bestid; + conn = _segments[id].conn(); + if(i-j<=1){ + maxScore[id] = maxsc; + } + else if(bias>0){ + maxScore[id] = maxsc + ((100+(i-j-2)*bias)*conn)/100; + } + else{ + maxScore[id] = maxsc + conn; + } + } + } + } + id = bestid; + while(id!=-1){ + _segmentation[method]->push_back(id); + id=nextid[id]; + } + break; + case SEGMENTATION_LEFTMOST_LONGEST: + case 
SEGMENTATION_LEFTMOST_WEIGHTED: + pos = 0; + while(pos<n_txt){ + bestid = -1; bestval = -1; + for(i=pos+1;i<=n_txt;i++){ + id = _map.get(pos,i); + if(id>=0 && + (method==SEGMENTATION_LEFTMOST_LONGEST || + (temp=(_segments[id].len()>1)?(int)_segments[id].conn():0)>bestval) ){ + bestid = id; + bestval = temp; + next = i; + } + } + _segmentation[method]->push_back(bestid); + pos=next; + } + break; + case SEGMENTATION_RIGHTMOST_LONGEST: + case SEGMENTATION_RIGHTMOST_WEIGHTED: + pos = n_txt; + while(pos>0){ + bestid = -1; bestval = -1; + for(i=pos-1;i>=0;i--){ + id = _map.get(i,pos); + if(id>=0 && + (method==SEGMENTATION_RIGHTMOST_LONGEST || + (temp=(_segments[id].len()>1)?(int)_segments[id].conn():0)>bestval) ){ + bestid = id; + bestval = temp; + next = i; + } + } + _segmentation[method]->push_front(bestid); + pos=next; + } + break; + case SEGMENTATION_LONGEST_WEIGHTED: + case SEGMENTATION_LONGEST_LEFTMOST: + case SEGMENTATION_LONGEST_RIGHTMOST: + case SEGMENTATION_WEIGHTED_LONGEST: + case SEGMENTATION_WEIGHTED_LEFTMOST: + case SEGMENTATION_WEIGHTED_RIGHTMOST: + buildSegmentationRecursive(method,*_segmentation[method],0,n_txt); + break; + default: + break; + } +} + +// }}} +// {{{ Segmenter::Segments::buildSegmentationRecursive + +void Segmenter::Segments::buildSegmentationRecursive(Segmenter::SegmentationMethod method, + Segmenter::Segmentation& segmentation, + unsigned int beg, + unsigned int end) +{ + int bestid, bestval1, bestval2, temp; + int i; + + // locate the best segment according to method + bestid=-1;bestval1=-1;bestval2=-1; + for(i=0;i<(int)_segments.size();i++){ + if(beg<=_segments[i].beg() && end>=_segments[i].end()){ + switch(method){ + case SEGMENTATION_LONGEST_WEIGHTED: + if((int)_segments[i].len()>bestval1 || + ((int)_segments[i].len()==bestval1 && (int)_segments[i].conn()>bestval2) ){ + bestid=i; + bestval1=_segments[i].len(); + bestval2=_segments[i].conn(); + } + break; + case SEGMENTATION_LONGEST_LEFTMOST: + if((int)_segments[i].len()>bestval1 || 
+ ((int)_segments[i].len()==bestval1 && (int)_segments[i].beg()<bestval2) ){ + bestid=i; + bestval1=_segments[i].len(); + bestval2=_segments[i].beg(); + } + break; + case SEGMENTATION_LONGEST_RIGHTMOST: + if((int)_segments[i].len()>bestval1 || + ((int)_segments[i].len()==bestval1 && (int)_segments[i].end()>bestval2) ){ + bestid=i; + bestval1=_segments[i].len(); + bestval2=_segments[i].end(); + } + break; + case SEGMENTATION_WEIGHTED_LONGEST: + temp = (_segments[i].len()>1)?(int)_segments[i].conn():0; + if(temp>bestval1 || + (temp==bestval1 && + (int)_segments[i].len()>bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=_segments[i].len(); + } + break; + case SEGMENTATION_WEIGHTED_LEFTMOST: + temp = (_segments[i].len()>1)?(int)_segments[i].conn():0; + if(temp>bestval1 || + (temp==bestval1 && + (int)_segments[i].beg()<bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=_segments[i].beg(); + } + break; + case SEGMENTATION_WEIGHTED_RIGHTMOST: + temp = (int)_segments[i].len()>1?(int)_segments[i].conn():0; + if(temp>bestval1 || + (temp==bestval1 && + (int)_segments[i].end()>bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=_segments[i].end(); + } + break; + default: // dummy defult pick first possible + if(bestid<0){ + bestid=i; + } + break; + } + } + } + if(bestid<0) { + return; // this should never happen, as all one-word segments are created + } + + // check left side + if(beg<_segments[bestid].beg()){ + buildSegmentationRecursive(method,segmentation,beg,_segments[bestid].beg()); + } + + // add segment + segmentation.push_back(bestid); + + // check right side + if(end>_segments[bestid].end()){ + buildSegmentationRecursive(method,segmentation,_segments[bestid].end(),end); + } +} + +// }}} + +// {{{ Segmenter::segment + +void Segmenter::segment(Segmenter::Segments &segments) const +{ + segments.clear(); + _detector.detect(segments.getText(),segments); +} + +void Segmenter::segment(const NGram &text, Segmenter::Segments &segments) const +{ + segments.setText(text); 
+ _detector.detect(segments.getText(),segments); +} + +void Segmenter::segment(const std::string &text, Segmenter::Segments &segments) const +{ + + segments.setText(text); + _detector.detect(segments.getText(),segments); +} + +void Segmenter::segment(const char *text, Segmenter::Segments &segments) const +{ + segments.setText(text); + _detector.detect(segments.getText(),segments); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/segmenter.h b/fsa/src/vespa/fsa/segmenter.h new file mode 100644 index 00000000000..243629bbaa8 --- /dev/null +++ b/fsa/src/vespa/fsa/segmenter.h @@ -0,0 +1,636 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/13 + * @version $Id$ + * @file segmenter.h + * @brief Query segmenter based on %FSA (%Finite %State %Automaton) + * + */ + +#pragma once + +#include <string> +#include <map> +#include <vector> +#include <list> + +#include <stdio.h> + +#include "fsa.h" +#include "ngram.h" +#include "detector.h" + + +namespace fsa { + +// {{{ class Segmenter + +/** + * @class Segmenter + * @brief Query segmenter based on %FSA. + */ +class Segmenter { + +public: + + // {{{ enum Segmenter::SegmentationMethod + + /** + * @brief Enumerated type of supported segmentation method IDs + * + * The segmentation methods currently supported are the following: + * - SEGMENTATION_WEIGHTED - gives the segmentation where the sum + * of the scores of nontrivial (more than one word) segments is + * the highest + * - SEGMENTATION_WEIGHTED_BIASxx - (xx can be 10,20,50 or 100) + * gives the segmentation where the sum of the scores of + * nontrivial (more than one word) segments is the highest. 
The + * scores are biased based on segment length, xx% extra for each + * term over 2 + * - SEGMENTATION_WEIGHTED_LEFTMOST - picks the segment with + * highest score first, if there are several possibilities, picks + * the leftmost, then repeats for the rest of the query + * - SEGMENTATION_WEIGHTED_RIGHTMOST - picks the segment with + * highest score first, if there are several possibilities, picks + * the rightmost, then repeats for the rest of the query + * - SEGMENTATION_WEIGHTED_LONGEST - picks the segment with + * highest score first, if there are several possibilities, picks + * the longest, then repeats for the rest of the query + * - SEGMENTATION_LEFTMOST_LONGEST - picks the leftmost segment + * first, if there are several possibilities, picks the longest, + * then repeats for the rest of the query + * - SEGMENTATION_LEFTMOST_WEIGHTED - picks the leftmost segment + * first, if there are several possibilities, picks the one with + * highest score, then repeats for the rest of the query + * - SEGMENTATION_RIGHTMOST_LONGEST - picks the rightmost segment + * first, if there are several possibilities, picks the longest, + * then repeats for the rest of the query + * - SEGMENTATION_RIGHTMOST_WEIGHTED - picks the rightmost segment + * first, if there are several possibilities, picks the one with + * highest score, then repeats for the rest of the query + * - SEGMENTATION_LONGEST_WEIGHTED - picks the longest segment + * first, if there are several possibilities, picks the one with + * highest score, then repeats for the rest of the query + * - SEGMENTATION_LONGEST_LEFTMOST - picks the longest segment + * first, if there are several possibilities, picks leftmost, + * then repeats for the rest of the query + * - SEGMENTATION_LONGEST_RIGHTMOST - picks the longest segment + * first, if there are several possibilities, picks the rightmost, + * then repeats for the rest of the query + */ + enum SegmentationMethod { + SEGMENTATION_WEIGHTED, + SEGMENTATION_WEIGHTED_BIAS10, 
+ SEGMENTATION_WEIGHTED_BIAS20, + SEGMENTATION_WEIGHTED_BIAS50, + SEGMENTATION_WEIGHTED_BIAS100, + SEGMENTATION_WEIGHTED_LEFTMOST, + SEGMENTATION_WEIGHTED_RIGHTMOST, + SEGMENTATION_WEIGHTED_LONGEST, + SEGMENTATION_LEFTMOST_LONGEST, + SEGMENTATION_LEFTMOST_WEIGHTED, + SEGMENTATION_RIGHTMOST_LONGEST, + SEGMENTATION_RIGHTMOST_WEIGHTED, + SEGMENTATION_LONGEST_WEIGHTED, + SEGMENTATION_LONGEST_LEFTMOST, + SEGMENTATION_LONGEST_RIGHTMOST, + SEGMENTATION_METHODS }; + + // }}} + + // {{{ typedef Segmenter::Segmentation + + /** %Segmentation type */ + typedef std::list<int> Segmentation; + /** Iterator for %segmentation type */ + typedef std::list<int>::iterator SegmentationIterator; + /** Const iterator for %segmentation type */ + typedef std::list<int>::const_iterator SegmentationConstIterator; + + // }}} + + // {{{ class Segmenter::Segments + + /** + * @class Segments + * @brief Class for storing segmentation results. + * + * Class for storing segmentation results. It is a subclass of + * Detector::Hits, so it can be used directly by a Detector. + */ + class Segments : public Detector::Hits { + + private: + + // {{{ class Segmenter::Segments::Segment + + /** + * @class Segment + * @brief Simple segment class. + * + * Simple segment class. A segment is defined by its beginning and + * end, and it has a connexity. Beginning and end refer to indices + * in the original text. + */ + class Segment { + + private: + unsigned int _beg; /**< Beginning of the segment. */ + unsigned int _end; /**< End of the segment. */ + unsigned int _conn; /**< Connexity of the segment. */ + + public: + + /** + * @brief Default constructor. + * + * Null segment at postion zero. + */ + Segment() : _beg(0), _end(0), _conn(0) {} + + /** + * @brief Constructor. + * + * @param b Beginning of the segment. + * @param e End of the segment (the position after the last term). + * @param c Connexity of the segment. 
+ */ + Segment(unsigned int b, unsigned int e, unsigned int c) : + _beg(b), _end(e), _conn(c) {} + + /** + * @brief Copy constructor. + * + * @param s Segment object to copy. + */ + Segment(const Segment &s) : _beg(s._beg), _end(s._end), _conn(s._conn) {} + + /** + * @brief Destructor. + */ + ~Segment() {} + + /** + * @brief Set the segment parameters. + * + * @param b Beginning of the segment. + * @param e End of the segment (the position after the last term). + * @param c Connexity of the segment. + */ + void set(unsigned int b, unsigned int e, unsigned int c) + { + _beg=b; + _end=e; + _conn=c; + } + + public: + /** + * @brief Get the beginning of the segment. + * + * @return Beginning of the segment. + */ + unsigned int beg() const { return _beg; } + + /** + * @brief Get the end of the segment. + * + * @return End of the segment. (Position after last term.) + */ + unsigned int end() const { return _end; } + + /** + * @brief Get the length of the segment. + * + * @return Length of the segment (number of terms). + */ + unsigned int len() const { return _end-_beg; } + + /** + * @brief Get the connexity of the segment. + * + * @return Connexity of the segment. + */ + unsigned int conn() const { return _conn; } + }; + + // }}} + + // {{{ class Segmenter::Segments::SegmentMap + + /** + * @class SegmentMap + * @brief Class for mapping (beg,end) pairs to segment idx. + */ + class SegmentMap { + + private: + /** Size of current map. */ + unsigned int _size; + /** %Segment map */ + std::vector<int> _map; + + public: + /** Default constructor, creates empty map of zero size. */ + SegmentMap() : _size(0), _map() {} + + /** + * @brief Constructor. + * + * Creates an empty map of given size. + * + * @param n Map size. + */ + SegmentMap(unsigned int n) : _size(n+1), _map(_size*_size,-1) {} + + /** Destructor */ + ~SegmentMap() {} + + /** + * @brief Initialize the map. + * + * Initialize the map to an empty map of given size. + * + * @param n Map size. 
+ */ + void init(unsigned int n) + { + _size = n+1; + _map.assign(_size*_size,-1); + } + + /** + * @brief Clear the map. + * + * Reset the map to an empty map of zero size. + */ + void clear() + { + _size = 0; + _map.clear(); + } + + /** + * @brief Get current map size. + * + * @return Map size. + */ + unsigned int size() const { return _size; } + + /** + * @brief Set an element in the map. + * + * @param i Beginning of the segment. + * @param j End of the segment. + * @param idx %Segment index. + */ + void set(unsigned int i, unsigned int j, int idx) + { + if(i<_size && j<_size) + _map[i*_size+j] = idx; + } + + /** + * @brief Get an element from the map. + * + * @param i Beginning of the segment. + * @param j End of the segment. + * @return %Segment index (-1 if segment does not exist). + */ + int get(unsigned int i, unsigned int j) const + { + if(i<_size && j<_size) + return _map[i*_size+j]; + return -1; + } + + /** + * @brief Check if a segment exists. + * + * @param i Beginning of the segment. + * @param j End of the segment. + * @return True if segment exists. + */ + bool isValid(unsigned int i, unsigned int j) const + { + return i<_size && j<_size && _map[i*_size+j]!=-1; + } + }; + + // }}} + + private: + NGram _text; /**< Tokenized text (e.g. query). */ + std::vector<Segment> _segments; /**< Detected segments. */ + SegmentMap _map; /**< Map of segments. */ + std::vector<Segmentation*> _segmentation; /**< Pre-built segmentations. */ + + + /** + * @brief Insert all single term segments. + * + * Insert all single term segments as detected with zero + * connexity. This is important for some of the segentation + * algorithms. + */ + void initSingles(); + + /** + * @brief Build a segmentation. + * + * @param method %Segmentation method. + */ + void buildSegmentation(Segmenter::SegmentationMethod method); + + /** + * @brief Build a segmentation recursively. + * + * Some of the segmentation methods are implemented + * recursively. 
+ * + * @param method %Segmentation method. + * @param segmentation Segmentation object which holds results. + * @param beg Beginning of the subquery to process. + * @param end End the subquery to process. + */ + void buildSegmentationRecursive(Segmenter::SegmentationMethod method, + Segmentation& segmentation, + unsigned int beg, + unsigned int end); + + public: + /** Default constructor */ + Segments() : _text(), _segments(), _map(), + _segmentation(Segmenter::SEGMENTATION_METHODS,NULL) {} + + /** Destructor */ + ~Segments() {} + + /** + * @brief Set input text, and clear all results. + * + * @param text Input text. + */ + void setText(const NGram &text) + { + _text.set(text); + clear(); + } + + /** + * @brief Set input text, and clear all results. + * + * @param text Input text. + */ + void setText(const std::string &text) + { + _text.set(text); + clear(); + } + + /** + * @brief Set input text, and clear all results. + * + * @param text Input text. + */ + void setText(const char *text) + { + _text.set(text); + clear(); + } + + /** + * @brief Get a reference to the input text. + * + * Get a reference to the input text. Valid as long as the + * Segments object is valid and not modified. + * + * return Reference to input text. + */ + const NGram& getText() const { return _text; } + + /** + * @brief Clear all detected segments and built segmentations. + */ + void clear() + { + _segments.clear(); + _map.init(_text.size()); + initSingles(); + for(unsigned int i=0;i<SEGMENTATION_METHODS;i++){ + delete _segmentation[i]; + _segmentation[i]=NULL; + } + } + + /** + * @brief Insert a detected segment. + * + * This method will be called by the detector for each detected + * segment. + * + * @param text Input text. + * @param from Index of first token. + * @param length Number of tokens. + * @param state Final state after detected phrase. 
+ */ + void add(const NGram &text, + unsigned int from, int length, + const FSA::State &state) + { + (void)text; + unsigned int to=from+length; + int id=_map.get(from,to); + if(id==-1){ + _map.set(from,to,_segments.size()); + _segments.push_back(Segment(from,to,state.nData())); + } + else{ + _segments[id].set(from,to,state.nData()); + } + } + + /** + * @brief Get the size (number of segments). + * + * @return Number of segments. + */ + unsigned int size() const { return _segments.size(); } + + /** + * @brief Get a segment as a string. + * + * @param i %Segment index. + * @return %Segment string. + */ + const std::string operator[](unsigned int i) const { return sgm(i); } + + /** + * @brief Get a segment as a string. + * + * @param i %Segment index. + * @return %Segment string. + */ + const std::string sgm(unsigned int i) const + { + if(i<_segments.size()) + return _text.join(" ",_segments[i].beg(),_segments[i].len()); + return std::string(); + } + + /** + * @brief Get the beginning of a segment. + * + * @param i %Segment index. + * @return Beginning of the segment. + */ + unsigned beg(unsigned int i) const + { + if(i<_segments.size()) + return _segments[i].beg(); + return 0; + } + + /** + * @brief Get the end of a segment. + * + * @param i %Segment index. + * @return End of the segment. + */ + unsigned end(unsigned int i) const + { + if(i<_segments.size()) + return _segments[i].end(); + return 0; + } + + /** + * @brief Get the length of a segment. + * + * @param i %Segment index. + * @return Length of the segment. + */ + unsigned len(unsigned int i) const + { + if(i<_segments.size()) + return _segments[i].len(); + return 0; + } + + /** + * @brief Get the connexity of a segment. + * + * @param i %Segment index. + * @return Connexity of the segment. + */ + unsigned conn(unsigned int i) const + { + if(i<_segments.size()) + return _segments[i].conn(); + return 0; + } + + /** + * @brief Get the a segmentation of the query using the given method. 
+ * + * @param method %Segmentation method + * @return Pointer to the Segmentation object, valid as long as the + * Segments object is valid and not modified. + */ + const Segmenter::Segmentation* segmentation(Segmenter::SegmentationMethod method) + { + if(method<SEGMENTATION_WEIGHTED || method>=SEGMENTATION_METHODS) + method=SEGMENTATION_WEIGHTED; + if(_segmentation[method]==NULL){ + buildSegmentation(method); + } + return _segmentation[method]; + } + + }; + + // }}} + + +private: + + const FSA& _dictionary; /**< Dictionary. */ + Detector _detector; /**< Detector. */ + + /** Unimplemented private default constructor */ + Segmenter(); + /** Unimplemented private copy constructor */ + Segmenter(const Segmenter&); + +public: + + /** + * @brief Constructor. + * + * Create Segmeneter object and initialize dictionary and detector. + * + * @param dict Dictionary to use. + */ + Segmenter(const FSA& dict) : _dictionary(dict), _detector(_dictionary) {} + + /** + * @brief Constructor. + * + * Create Segmeneter object and initialize dictionary and detector. + * + * @param dict Dictionary to use. + */ + Segmenter(const FSA* dict) : _dictionary(*dict), _detector(_dictionary) {} + + /** Destructor */ + ~Segmenter() {} + + /** + * @brief %Segment a query. + * + * @param segments %Segments object, input text already initialized. + */ + void segment(Segmenter::Segments &segments) const; + + /** + * @brief %Segment a query. + * + * @param text Input text. + * @param segments %Segments object to hold the results. + */ + void segment(const NGram &text, Segmenter::Segments &segments) const; + + /** + * @brief %Segment a query. + * + * @param text Input text. + * @param segments %Segments object to hold the results. + */ + void segment(const std::string &text, Segmenter::Segments &segments) const; + + /** + * @brief %Segment a query. + * + * @param text Input text. + * @param segments %Segments object to hold the results. 
+ */ + void segment(const char *text, Segmenter::Segments &segments) const; + + /** + * @brief %Segment a query. + * + * @param text Input text. + * @param segments %Segments object to hold the results. + */ + void segment(const char *text, Segmenter::Segments *segments) const + { + segment(text,*segments); + } + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/selector.cpp b/fsa/src/vespa/fsa/selector.cpp new file mode 100644 index 00000000000..b139a8ebaed --- /dev/null +++ b/fsa/src/vespa/fsa/selector.cpp @@ -0,0 +1,77 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file selector.cpp + * @brief Selector class. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "selector.h" + +namespace fsa { + +// {{{ Selector::clear() + +void Selector::clear() +{ + _selector.clear(); +} + +// }}} +// {{{ Selector::set() + +void Selector::set(unsigned int c) +{ + unsigned int idx=0; + while(c>0){ + if(idx>=_selector.size()){ + _selector.resize(idx+1,false); + } + if(c&1) + _selector[idx]=true; + c>>=1; + idx++; + } +} + +// }}} +// {{{ Selector::select() + +void Selector::select(unsigned int i) +{ + if(i>=_selector.size()){ + _selector.resize(i+1,false); + } + _selector[i] = true; +} + +// }}} +// {{{ Selector::unselect() + +void Selector::unselect(unsigned int i) +{ + if(i>=_selector.size()){ + _selector.resize(i+1,false); + } + _selector[i] = false; +} + +// }}} +// {{{ Selector::operator[]() + +bool Selector::operator[](unsigned int i) const +{ + if(i>=_selector.size()){ + return false; + } + return _selector[i]; +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/selector.h b/fsa/src/vespa/fsa/selector.h new file mode 100644 index 00000000000..00e87bcb3f5 --- /dev/null +++ b/fsa/src/vespa/fsa/selector.h @@ -0,0 +1,105 @@ +// Copyright 2016 Yahoo Inc. 
// Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/08/20
 * @version $Id$
 * @file    selector.h
 * @brief   Selector class.
 */

#pragma once

#include <vector>

namespace fsa {

// {{{ class Selector


/**
 * @class Selector
 * @brief Simple (bitmap-like) selector class.
 *
 * Copying, assignment and destruction are left to the compiler
 * generated members; the only data member is a std::vector<bool>.
 */
class Selector {

 private:

  /** Selector */
  std::vector<bool> _selector;

 public:

  /**
   * @brief Default constructor, creates an empty selector.
   */
  Selector() : _selector() {}

  /**
   * @brief Constructor.
   *
   * Set the selector from a bitmask.  Not explicit, so an unsigned
   * int converts implicitly to a Selector.
   *
   * @param c Bitmask.
   */
  Selector(unsigned int c) : _selector() { set(c); }


  /**
   * @brief Clear the selector.
   */
  void clear();

  /**
   * @brief Set selector from bitmask.
   *
   * @param c Bitmask.
   */
  void set(unsigned int c);

  /**
   * @brief Get size of selector.
   *
   * @return Size.
   */
  unsigned int size() const { return static_cast<unsigned int>(_selector.size()); }

  /**
   * @brief Set an item in the selector.
   *
   * @param i Index.
   */
  void select(unsigned int i);

  /**
   * @brief Unset an item in the selector.
   *
   * @param i Index.
   */
  void unselect(unsigned int i);

  /**
   * @brief Get an item.
   *
   * @param i Index.
   * @return Item.
   */
  bool operator[](unsigned int i) const;

};

// }}}

} // namespace fsa

// diff --git a/fsa/src/vespa/fsa/timestamp.h b/fsa/src/vespa/fsa/timestamp.h new file mode 100644 index 00000000000..0455fc4c144 --- /dev/null +++ b/fsa/src/vespa/fsa/timestamp.h @@ -0,0 +1,84 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/08/20
 * @version $Id$
 * @file    timestamp.h
 * @brief   Simple timestamp class.
 */
// (end of the timestamp.h file-header comment opened above) */

#pragma once

#include <sys/time.h>
#include <time.h>

namespace fsa {

// {{{ class TimeStamp

/**
 * @class TimeStamp
 * @brief Simple timestamp class.
 *
 * Stores the wall clock time (as reported by gettimeofday()) taken at
 * construction or at the last reset().
 */
class TimeStamp {
private:
  struct timeval _ts;   /**< Registered time. */

  /**
   * @brief Difference of two timeval values.
   *
   * @param later   Minuend.
   * @param earlier Subtrahend.
   * @return later-earlier in seconds.
   */
  static double secondsBetween(const struct timeval &later,
                               const struct timeval &earlier)
  {
    return double(later.tv_sec)-double(earlier.tv_sec)+
      (double(later.tv_usec)-double(earlier.tv_usec))/1000000.0;
  }

public:
  /**
   * @brief Constructor, registers current time.
   */
  TimeStamp()
  {
    gettimeofday(&_ts,NULL);
  }

  /**
   * @brief Destructor.
   */
  ~TimeStamp() {}

  /**
   * @brief Reset timestamp.
   *
   * Set timestamp value to current time.
   */
  void reset()
  {
    gettimeofday(&_ts,NULL);
  }

  /**
   * @brief Get timestamp value (= object creation or last reset time).
   *
   * @return Timestamp value in seconds.
   */
  double getVal() const
  {
    const double whole = double(_ts.tv_sec);
    const double frac  = double(_ts.tv_usec)/1000000.0;
    return whole+frac;
  }

  /**
   * @brief Get elapsed time (since object creation or last reset).
   *
   * @return Elapsed time in seconds.
   */
  double elapsed() const
  {
    struct timeval now;
    gettimeofday(&now,NULL);
    return secondsBetween(now,_ts);
  }

  /**
   * @brief Calculate difference between timestamps.
   *
   * @param other Timestamp to subtract.
   * @return Difference between timestamps in seconds.
   */
  double operator-(const TimeStamp &other) const
  {
    return secondsBetween(_ts,other._ts);
  }
};

// }}}

} // namespace fsa

// diff --git a/fsa/src/vespa/fsa/tokenizer.h b/fsa/src/vespa/fsa/tokenizer.h new file mode 100644 index 00000000000..2dceacca60b --- /dev/null +++ b/fsa/src/vespa/fsa/tokenizer.h @@ -0,0 +1,69 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/08/20
 * @version $Id$
 * @file    tokenizer.h
 * @brief   Generic tokenizer class.
 */
// (end of the tokenizer.h file-header comment opened above) */

#pragma once

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>


namespace fsa {

// {{{ class Tokenizer

/**
 * @class Tokenizer
 * @brief Generic tokenizer class.
 *
 * Abstract interface shared by the tokenizer implementations: feed a
 * text with init(), then pull tokens with getNext() while hasMore()
 * reports that tokens are left.
 */
class Tokenizer {

public:

  /** @brief Constructor. */
  Tokenizer() {}

  /** @brief Destructor (virtual, implementations are used through this interface). */
  virtual ~Tokenizer() {}

  /**
   * @brief Initialize the tokenizer.
   *
   * @param text Input text.
   * @return True on success.
   */
  virtual bool init(const std::string &text) = 0;

  /**
   * @brief Check if there are more tokens available.
   *
   * @return True if there are more tokens.
   */
  virtual bool hasMore() = 0;

  /**
   * @brief Get next token.
   *
   * @return Next token, or empty string if there are no more tokens left.
   */
  virtual std::string getNext() = 0;

};

// }}}

} // namespace fsa

// diff --git a/fsa/src/vespa/fsa/unicode.cpp b/fsa/src/vespa/fsa/unicode.cpp new file mode 100644 index 00000000000..4a35e79ff12 --- /dev/null +++ b/fsa/src/vespa/fsa/unicode.cpp @@ -0,0 +1,532 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "unicode.h" + +#include <assert.h> +#include <stdlib.h> + +namespace fsa { + +const unsigned int Unicode::_BadUTF8Char; +const unsigned int Unicode::_EOF; + + +char* Unicode::utf8copy(char *dst, const ucs4_t *src) +{ + ucs4_t i; + char *p; + + p = dst; + while ((i = *src++) != 0) { + if (i < 128) + *p++ = i; + else if (i < 0x800) { + *p++ = (i >> 6) | 0xc0; + *p++ = (i & 63) | 0x80; + } else if (i < 0x10000) { + *p++ = (i >> 12) | 0xe0; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else if (i < 0x200000) { + *p++ = (i >> 18) | 0xf0; + *p++ = ((i >> 12) & 63) | 0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else if (i < 0x4000000) { + *p++ = (i >> 24) | 0xf8; + *p++ = ((i >> 18) & 63) | 0x80; + *p++ = ((i >> 12) & 63) | 0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else { + *p++ = (i >> 30) | 0xfc; + *p++ = ((i >> 24) & 63) | 0x80; + *p++ = ((i >> 18) & 63) | 0x80; + *p++ = ((i >> 12) & 63) | 0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } + } + *p = 0; + return p; +} + +char* Unicode::utf8ncopy(char *dst, const ucs4_t *src, + int maxdst, int maxsrc) +{ + ucs4_t i; + char *p; + char *edst; + const ucs4_t *esrc; + + p = dst; + edst = dst + maxdst; + esrc = src + maxsrc; + + while (src < esrc && (i = *src++) != 0 && p < edst) { + if (i < 128) + *p++ = i; + else if (i < 0x800) { + if (p + 1 >= edst) + break; + *p++ = (i >> 6) | 0xc0; + *p++ = (i & 63) | 0x80; + } else if (i < 0x10000) { + if (p + 2 >= edst) + break; + *p++ = (i >> 12) | 0xe0; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else if (i < 0x200000) { + if (p + 3 >= edst) + break; + *p++ = (i >> 18) | 0xf0; + *p++ = ((i >> 12) & 63) | 0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else if (i < 0x4000000) { + if (p + 4 >= edst) + break; + *p++ = (i >> 24) | 0xf8; + *p++ = ((i >> 18) & 63) | 0x80; + *p++ = ((i >> 12) & 63) | 
0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } else { + if (p + 5 >= edst) + break; + *p++ = (i >> 30) | 0xfc; + *p++ = ((i >> 24) & 63) | 0x80; + *p++ = ((i >> 18) & 63) | 0x80; + *p++ = ((i >> 12) & 63) | 0x80; + *p++ = ((i >> 6) & 63) | 0x80; + *p++ = (i & 63) | 0x80; + } + } + if (p < edst) + *p = 0; + return p; +} + + +int Unicode::utf8cmp(const char *s1, const ucs4_t *s2) +{ + ucs4_t i1; + ucs4_t i2; + + const unsigned char *ps1 = reinterpret_cast<const unsigned char *>(s1); + do { + i1 = getUTF8Char(ps1); + i2 = *s2++; + } while (i1 != 0 && i1 == i2); + if (i1 > i2) + return 1; + if (i1 < i2) + return -1; + return 0; +} + + +int Unicode::utf8casecmp(const char *s1, const ucs4_t *s2) +{ + ucs4_t i1, i2; + + const unsigned char *ps1 = reinterpret_cast<const unsigned char *>(s1); + do { + i1 = toLower(getUTF8Char(ps1)); + i2 = toLower(*s2++); + } while (i1 != 0 && i1 == i2); + if (i1 > i2) + return 1; + if (i1 < i2) + return -1; + return 0; +} + +size_t Unicode::utf8len(const ucs4_t *str) +{ + ucs4_t i; + size_t res; + + res = 0; + while ((i = *str++) != 0) { + if (i < 128) + res += 1; + else if (i < 0x800) + res += 2; + else if (i < 0x10000) + res += 3; + else if (i < 0x200000) + res += 4; + else if (i < 0x4000000) + res += 5; + else + res += 6; + } + return res; +} + +size_t Unicode::utf8nlen(const ucs4_t *str, int maxsrc) +{ + ucs4_t i; + size_t res; + int n; + + n = 0; + res = 0; + while ((i = *str++) != 0 && n < maxsrc) { + if (i < 128) + res += 1; + else if (i < 0x800) + res += 2; + else if (i < 0x10000) + res += 3; + else if (i < 0x200000) + res += 4; + else if (i < 0x4000000) + res += 5; + else + res += 6; + + n++; + } + return res; +} + +size_t Unicode::ucs4strlen(const ucs4_t *str) +{ + const ucs4_t *p = str; + while (*p++ != 0) { + /* Do nothing */ + } + return p - 1 - str; +} + +size_t Unicode::ucs4len(const char *str) +{ + ucs4_t i; + size_t res; + const unsigned char *pstr = reinterpret_cast<const unsigned char *>(str); + + 
res = 0; + while ((i = getUTF8Char(pstr)) != 0) { + if (i != _BadUTF8Char) + res++; + } + return res; +} + +size_t Unicode::ucs4nlen(const char *str, size_t n) +{ + ucs4_t i; + size_t res; + const unsigned char *pstr = reinterpret_cast<const unsigned char *>(str); + const unsigned char *end_str = pstr + n; + + res = 0; + while ((pstr < end_str) && (i = getUTF8Char(pstr, end_str-pstr)) != 0) { + if (i != _BadUTF8Char) + if (pstr <= end_str) + res++; + } + return res; +} + +ucs4_t* Unicode::ucs4copy(ucs4_t *dst, const char *src) +{ + ucs4_t i; + ucs4_t *p; + const unsigned char *psrc = reinterpret_cast<const unsigned char *>(src); + + p = dst; + while ((i = getUTF8Char(psrc)) != 0) { + if (i != _BadUTF8Char) + *p++ = i; + } + *p = 0; + return p; +} + +ucs4_t* Unicode::ucs4ncopy(ucs4_t *dst, const char *src, int byteLength) +{ + ucs4_t i; + ucs4_t *p; + const unsigned char *psrc = reinterpret_cast<const unsigned char *>(src); + const unsigned char *end_src = psrc + byteLength; + + p = dst; + while ((psrc < end_src) && (i = getUTF8Char(psrc, end_src-psrc)) != 0) { + if (i != _BadUTF8Char) + *p++ = i; + } + *p = 0; + return p; +} + + +char* Unicode::strdupUTF8(const char *src) +{ + char *res; + size_t reslen; + ucs4_t i; + const unsigned char *p; + char *q; + + reslen = 0; + p = reinterpret_cast<const unsigned char *>(src); + while ((i = getUTF8Char(p)) != 0) { + if (i != _BadUTF8Char) + reslen += utf8clen(i); + } + res = static_cast<char *>(malloc(reslen + 1)); + p = reinterpret_cast<const unsigned char *>(src); + q = res; + while ((i = getUTF8Char(p)) != 0) { + if (i != _BadUTF8Char) + q = utf8cput(q, i); + } + assert(q == res + reslen); + *q = 0; + return res; +} + + +char* Unicode::strlowdupUTF8(const char *src) +{ + char *res; + size_t reslen; + ucs4_t i; + const unsigned char *p; + char *q; + + reslen = 0; + p = reinterpret_cast<const unsigned char *>(src); + while ((i = getUTF8Char(p)) != 0) { + if (i != _BadUTF8Char) { + i = toLower(i); + if (i != _BadUTF8Char) 
+ reslen += utf8clen(i); + } + } + res = static_cast<char *>(malloc(reslen + 1)); + p = reinterpret_cast<const unsigned char *>(src); + q = res; + while ((i = getUTF8Char(p)) != 0) { + if (i != _BadUTF8Char) { + i = toLower(i); + if (i != _BadUTF8Char) + q = utf8cput(q, i); + } + } + assert(q == res + reslen); + *q = 0; + return res; +} + +char* Unicode::strdupLAT1(const char *src) +{ + char *res; + size_t reslen; + ucs4_t i; + const unsigned char *p; + char *q; + + reslen = 0; + p = reinterpret_cast<const unsigned char *>(src); + while ((i = *p++) != 0) { + reslen += utf8clen(i); + } + res = static_cast<char *>(malloc(reslen + 1)); + p = reinterpret_cast<const unsigned char *>(src); + q = res; + while ((i = *p++) != 0) { + q = utf8cput(q, i); + } + assert(q == res + reslen); + *q = 0; + return res; +} + +ucs4_t Unicode::getUTF8Char(unsigned const char *&src, + int length /* = -1 */ ) +{ + ucs4_t retval; + + if (length != -1) { + // Check for unfinished UTF8 character sequence + int bytes = getUTF8ByteLength(*src); + if (bytes > length) { + src += bytes; + return _BadUTF8Char; + } + } + + if (*src >= 0x80) { /* 0x80..0xff */ + if (*src >= 0xc0) { + if (src[1] < 0x80 || src[1] >= 0xc0) { + src++; + return _BadUTF8Char; + } + if (*src >= 0xe0) { /* 0xe0..0xff */ + if (src[2] < 0x80 || src[2] >= 0xc0) { + src += 2; + return _BadUTF8Char; + } + if (*src >= 0xf0) { /* 0xf0..0xff */ + if (src[3] < 0x80 || src[3] >= 0xc0) { + src += 3; + return _BadUTF8Char; + } + if (*src >= 0xf8) { /* 0xf8..0xff */ + if (src[4] < 0x80 || src[4] >= 0xc0) { + src += 4; + return _BadUTF8Char; + } + if (*src >= 0xfc) { /* 0xfc..0xff */ + if (src[5] < 0x80 || src[5] >= 0xc0) { + src += 5; + return _BadUTF8Char; + } + if (*src >= 0xfe) { /* 0xfe..0xff: INVALID */ + src += 5; + return _BadUTF8Char; + } else { /* 0xfc..0xfd: 6 bytes */ + retval = ((src[0] & 1) << 30) | + ((src[1] & 63) << 24) | + ((src[2] & 63) << 18) | + ((src[3] & 63) << 12) | + ((src[4] & 63) << 6) | + (src[5] & 63); + if 
(retval < 0x4000000u) /* 6 bytes: >= 0x4000000 */ + retval = _BadUTF8Char; + src += 6; + return retval; + } + } else { /* 0xf8..0xfb: 5 bytes */ + retval = ((src[0] & 3) << 24) | + ((src[1] & 63) << 18) | + ((src[2] & 63) << 12) | + ((src[3] & 63) << 6) | + (src[4] & 63); + if (retval < 0x200000u) /* 5 bytes: >= 0x200000 */ + retval = _BadUTF8Char; + src += 5; + return retval; + } + } else { /* 0xf0..0xf7: 4 bytes */ + retval = ((src[0] & 7) << 18) | + ((src[1] & 63) << 12) | + ((src[2] & 63) << 6) | + (src[3] & 63); + if (retval < 0x10000) /* 4 bytes: >= 0x10000 */ + retval = _BadUTF8Char; + src += 4; + return retval; + } + } else { /* 0xe0..0xef: 3 bytes */ + retval = ((src[0] & 15) << 12) | + ((src[1] & 63) << 6) | + (src[2] & 63); + if (retval < 0x800) /* 3 bytes: >= 0x800 */ + retval = _BadUTF8Char; + src += 3; + return retval; + } + } else { /* 0xc0..0xdf: 2 bytes */ + + retval = ((src[0] & 31) << 6) | + (src[1] & 63); + if (retval < 0x80) /* 2 bytes: >= 0x80 */ + retval = _BadUTF8Char; + src += 2; + return retval; + } + } else { /* 0x80..0xbf: INVALID */ + src += 1; + return _BadUTF8Char; + } + } else /* 0x00..0x7f: 1 byte */ + return *src++; +} + + + + +#define UTF8_STARTCHAR(c) (!((c) & 0x80) || ((c) & 0x40)) + + /** Move forwards or backwards a number of characters within an UTF8 buffer + * Modify pos to yield new position if possible + * @param start A pointer to the start of the UTF8 buffer + * @param length The length of the UTF8 buffer + * @param pos A pointer to the current position within the UTF8 buffer, + * updated to reflect new position upon return. @param pos will + * point to the start of the offset'th character before or after the character + * currently pointed to. + * @param offset An offset (+/-) in number of UTF8 characters. + * Offset 0 consequently yields a move to the start of the current character. + * @return Number of bytes moved, or -1 if out of range. + * If -1 is returned, pos is unchanged. 
+ */ +int Unicode::utf8move(unsigned const char* start, size_t length, + unsigned const char*& pos, off_t offset) +{ + int increment = offset > 0 ? 1 : -1; + unsigned const char* p = pos; + + /* If running backward we first need to get to the start of + * the current character, that's an extra step. + * Similarly, if we are running forward an are at the start of a character, + * we count that character as a step. + */ + + if (increment < 0) + { + // Already at start? + if (p < start) return -1; + if (!offset) + { + if (p > start + length) return -1; + } + else if (p == start) return -1; + + // Initially pointing to the first invalid char? + if (p == start + length) + p += increment; + else + offset += increment; + } + else if (p >= start + length) + return -1; + else if (UTF8_STARTCHAR(*p)) + offset += increment; + + + for (; p >= start && p < start+length; p += increment) + { + /** Are we at start of a character? (both highest bits or none of them set) */ + if (UTF8_STARTCHAR(*p)) + offset -= increment; // We have "eaten" another character (independent of dir) + if (offset == 0) break; + } + + if (offset != 0) + { + offset -= increment; + if (increment < 0) + p -= increment; + } + + if (offset == 0) // Enough room to make it.. + { + int moved = abs(p - pos); + pos = p; + return moved; + } + else + return -1; +} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/unicode.h b/fsa/src/vespa/fsa/unicode.h new file mode 100644 index 00000000000..3b14299193d --- /dev/null +++ b/fsa/src/vespa/fsa/unicode.h @@ -0,0 +1,483 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 

#pragma once

#include <inttypes.h>
#include <sys/types.h>

namespace fsa {

/** utf8_t is the type of the multi-byte UTF-8 character components */
typedef uint8_t utf8_t;
/** ucs4_t is the type of the 4-byte UCS4 characters */
typedef uint32_t ucs4_t;


/**
 * @class Unicode
 * @brief Unicode character manipulation class.
 *
 * Utility class for unicode character handling.
 * Used to examine properties of unicode characters, and
 * provide fast conversion methods between often used encodings.
 * All members are static; the class is used as a namespace.
 */
class Unicode {
private:
  /** ISO 8859-1 digits. _isdigit[i] == 1 if i is a digit.
   */
  static const unsigned char _isdigit[256];
  /** ISO 8859-1 operators in integer index expressions.
   * _isintegerindexop[i] == 1 if i is a valid char in integer
   * range expressions, which is ';<>[]'.
   * This is maybe a bit specialized for the fastsearch application?
   */
  static const unsigned char _isintegerindexop[256];
  /** ISO 8859-1 wordchar identification.
   * _iswordchar[i] == 1 if i is a word character.
   * Wordchars are A-Z, a-z, 0-9, 0xC0-0xFF except 0xD7 and 0xF7.
   */
  static const unsigned char _iswordchar[256];
  /** ISO 8859-1 identifier start char.
   * _isidstartchar[i] == 1 if i is an id start character.
   * These are A-Z, a-z.
   */
  static const unsigned char _isidstartchar[256];
  /** ISO 8859-1 identifier char.
   * _isidchar[i] == 1 if i is an id character.
   * These are A-Z, a-z, 0-9, and '-', '_', ':', '.'.
   */
  static const unsigned char _isidchar[256];
  /** ISO 8859-1 space chars. _isspacechar[i] == 1 if i is a space char.
   * Space chars are ' ', '\\r', '\\t', '\\n'.
   */
  static const unsigned char _isspacechar[256];
  /**
   * ISO 8859-1 uppercase to lowercase mapping table.
   * _tolower[i] == j if j is the lowercase of i, else it is i (identity).
   * It is useful in the range A-Z, 0xC0-0xE0 except 0xD7.
   */
  static const unsigned char _tolower[256];
  /**
   * Table for easy lookup of UTF8 character length in bytes,
   * indexed by the first byte of the character.
   */
  static const unsigned char _utf8header[256];

  /** Two-level lowercase table. 256 pages, 256 elements each.
   * This table is defined in unicode-lowercase.cpp, which is
   * autogenerated by the extcase application. */
  static const unsigned short *_compLowerCase[256];

  /** Two-level character property table. 256 pages with 256 elements each.
   * This table is defined in unicode-charprops.cpp, which is
   * autogenerated by the extprop application. */
  static const unsigned char *_compCharProps[256];

public:

  /** The property bit identifiers (bit masks into the property bitmap). */
  enum {
    _spaceProp = 1,
    _wordcharProp = 2,
    _ideographicProp = 4,
    _decimalDigitCharProp = 8,
    _ignorableControlCharProp = 16
  };

  /** Indicates an invalid UTF-8 character sequence. */
  static const ucs4_t _BadUTF8Char = 0xfffffffeu;
  /** EndOfFile */
  static const ucs4_t _EOF = 0xffffffffu;

  /**
   * Return the 'raw' property bitmap.
   * Only characters below 65536 are covered by the property table.
   * @param testchar the UCS4 character to test.
   * @return unsigned char with the property bitmap, 0 for characters
   *         outside the table.
   */
  static unsigned char getProperty(ucs4_t testchar) {
    if (testchar >= 65536)
      return 0;
    return _compCharProps[testchar >> 8][testchar & 255];
  }

  /**
   * Test for a specified property.
   * @param testchar the UCS4 character to test.
   * @param testprops the set of properties to test for.
   * @return true if testchar has at least one of the properties in
   *         testprops; always false for characters >= 65536.
   */
  static bool hasProperty(ucs4_t testchar, unsigned char testprops) {
    return (testchar < 65536 &&
            (_compCharProps[testchar >> 8][testchar & 255] & testprops) != 0);
  }

  /**
   * Test for word character. Characters with certain unicode properties
   * are recognized as word characters. In addition to this, all
   * characters with the custom _FASTWordProp is regarded as a word
   * character. The previous range in _privateUseProp is included
   * in the _FASTWordProp set of ranges.
   * (Those two names are historical; the test performed here is on the
   * _wordcharProp bit.)
   * @param testchar the UCS4 character to test.
   * @return true if testchar is a word character, i.e. if it has
   *   one or more of the properties alphabetic, ideographic,
   *   combining char, decimal digit char, private use, extender.
   */
  static bool isWordChar(ucs4_t testchar) {
    return hasProperty(testchar, _wordcharProp);
  }

  /**
   * Test for ideographic character.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an ideographic character,
   *   i.e. if it has the ideographic property.
   */
  static bool isIdeographicChar(ucs4_t testchar) {
    return hasProperty(testchar, _ideographicProp);
  }

  /**
   * Test for private use character. Implemented to
   * return true if character is in the range E000-F8FF,
   * since this is the only range of characters with
   * this property.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is a private use character,
   *   i.e. if it has the private use property.
   */
  static bool isPrivateUseChar(ucs4_t testchar) {
    return (testchar >= 0xE000 && testchar <= 0xF8FF);
  }

  /**
   * Test for ignorable character.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an ignorable character,
   *   i.e. if it has the ignorable control char property.
   */
  static bool isIgnorableChar(ucs4_t testchar) {
    return hasProperty(testchar, _ignorableControlCharProp);
  }

  /**
   * Test for identifier start character (ISO 8859-1 range only).
   * The lookup table is a static constant; this class declares no
   * initialisation function that has to be called first.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an identifier start character.
   */
  static bool isIDStartChar(ucs4_t testchar)
  {
    return (testchar < 256 && _isidstartchar[testchar] != 0);
  }

  /**
   * Test for identifier character (ISO 8859-1 range only).
   * The lookup table is a static constant; this class declares no
   * initialisation function that has to be called first.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an identifier character.
   */
  static bool isIDChar(ucs4_t testchar)
  {
    return (testchar < 256 && _isidchar[testchar] != 0);
  }

  /**
   * Test for digit character.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is a digit character,
   *   i.e. if it has the decimal digit char property.
   */
  static bool isDigit(ucs4_t testchar)
  {
    return hasProperty(testchar, _decimalDigitCharProp);
  }

  /**
   * Test for integer range expression character (ISO 8859-1 range only).
   * The lookup table is a static constant; this class declares no
   * initialisation function that has to be called first.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an integer range expression character.
   */
  static bool isIntegerIndexOp(ucs4_t testchar)
  {
    return (testchar < 256 && _isintegerindexop[testchar] != 0);
  }

  /**
   * Test for space character.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is a space character,
   *   i.e. if it has the space char property.
   */
  static bool isSpaceChar(ucs4_t testchar)
  {
    return hasProperty(testchar, _spaceProp);
  }

  /**
   * Test for uppercase character.
   * A character counts as uppercase when the lowercase table holds a
   * non-zero entry for it that differs from the character itself.
   * @param testchar the UCS4 character to test.
   * @return true if testchar is an uppercase character; always false for
   *         characters >= 65536.
   */
  static bool isUpper(ucs4_t testchar)
  {
    if (testchar >= 65536)
      return false;
    const ucs4_t lower = _compLowerCase[testchar >> 8][testchar & 255];
    return (lower != 0 && lower != testchar);
  }

  /**
   * Lowercase an UCS4 character.
   * @param testchar The character to lowercase.
   * @return The lowercase of the input, if defined. Else the input character
   *         (also for all characters >= 65536).
   */
  static ucs4_t toLower(ucs4_t testchar)
  {
    if (testchar >= 65536)
      return testchar;
    const ucs4_t lower = _compLowerCase[testchar >> 8][testchar & 255];
    return (lower == 0) ? testchar : lower;
  }

  /**
   * Get the length of the UTF-8 representation of an UCS4 character.
   * Uses the historical scheme with up to 6 bytes per character.
   * @param i The UCS4 character.
   * @return The number of bytes required for the UTF-8 representation.
   */
  static size_t utf8clen(ucs4_t i) {
    if (i < 128)
      return 1;
    else if (i < 0x800)
      return 2;
    else if (i < 0x10000)
      return 3;
    else if (i < 0x200000)
      return 4;
    else if (i < 0x4000000)
      return 5;
    else
      return 6;
  }

  /**
   * Get the length of the UTF8 character in number of bytes
   * @param utf8char the first byte in a UTF8 character
   * @return the number of bytes in the UTF8 character, as listed in the
   *         _utf8header table
   */
  static unsigned char getUTF8ByteLength(unsigned char utf8char) {
    return _utf8header[utf8char];
  }

  /**
   * Put an UCS4 character into a buffer as an UTF-8 representation.
   * Writes utf8clen(i) bytes and no terminator; dst must have room for them.
   * @param dst The destination buffer.
   * @param i The UCS4 character.
   * @return Pointer to the next position in dst after the written byte(s).
   */
  static char *utf8cput(char *dst, ucs4_t i) {
    if (i < 128)
      *dst++ = i;
    else if (i < 0x800) {
      *dst++ = (i >> 6) | 0xc0;
      *dst++ = (i & 63) | 0x80;
    } else if (i < 0x10000) {
      *dst++ = (i >> 12) | 0xe0;
      *dst++ = ((i >> 6) & 63) | 0x80;
      *dst++ = (i & 63) | 0x80;
    } else if (i < 0x200000) {
      *dst++ = (i >> 18) | 0xf0;
      *dst++ = ((i >> 12) & 63) | 0x80;
      *dst++ = ((i >> 6) & 63) | 0x80;
      *dst++ = (i & 63) | 0x80;
    } else if (i < 0x4000000) {
      *dst++ = (i >> 24) | 0xf8;
      *dst++ = ((i >> 18) & 63) | 0x80;
      *dst++ = ((i >> 12) & 63) | 0x80;
      *dst++ = ((i >> 6) & 63) | 0x80;
      *dst++ = (i & 63) | 0x80;
    } else {
      *dst++ = (i >> 30) | 0xfc;
      *dst++ = ((i >> 24) & 63) | 0x80;
      *dst++ = ((i >> 18) & 63) | 0x80;
      *dst++ = ((i >> 12) & 63) | 0x80;
      *dst++ = ((i >> 6) & 63) | 0x80;
      *dst++ = (i & 63) | 0x80;
    }
    return dst;
  }


  /**
   * Convert UCS4 to UTF-8.
   * @param dst The destination buffer for the UTF-8 string; must hold
   *            utf8len(src) + 1 bytes.
   * @param src The source UCS4 string (zero-terminated).
   * @return A pointer to the terminating NUL written at the end of the
   *         converted string in dst.
   */
  static char *utf8copy(char *dst, const ucs4_t *src);

  /**
   * Convert UCS4 to UTF-8, bounded by max lengths.
   * A terminating NUL is only written if there is room left in dst.
   * @param dst The destination buffer for the UTF-8 string.
   * @param src The source UCS4 string.
   * @param maxdst The maximum number of bytes to put into dst.
   * @param maxsrc The maximum number of characters to convert from src.
   * @return A pointer just past the last byte written to dst.
   */
  static char *utf8ncopy(char *dst, const ucs4_t *src, int maxdst, int maxsrc);

  /**
   * Compare an UTF-8 string to a UCS4 string, analogous to strcmp(3).
   * @param s1 The UTF-8 string.
   * @param s2 The UCS4 string.
   * @return An integer less than, equal to, or greater than zero,
   * if s1 is, respectively, less than, matching, or greater than s2.
   */
  static int utf8cmp(const char *s1, const ucs4_t *s2);

  /**
   * Compare an UTF-8 string to a UCS4 string, ignoring case.
   * This is comparable to strcasecmp(3).
   * @param s1 The UTF-8 string.
   * @param s2 The UCS4 string.
   * @return An integer less than, equal to, or greater than zero,
   * if s1 is, respectively, less than, matching, or greater than s2.
   */
  static int utf8casecmp(const char *s1, const ucs4_t *s2);

  /**
   * Find the length, in bytes, of the UTF-8 representation of an UCS4 string.
   * @param str The UCS4 string.
   * @return The length, in bytes, of the equivalent UTF-8 representation.
   */
  static size_t utf8len(const ucs4_t *str);

  /**
   * Find the length, in bytes, of the UTF-8 representation of the first
   * maxsrc characters of an UCS4 string.
   * @param str The UCS4 string.
   * @param maxsrc The maximum number of UCS4 characters to consider.
   * @return The length, in bytes, of the equivalent UTF-8 representation.
   */
  static size_t utf8nlen(const ucs4_t *str, int maxsrc);

  /**
   * Find the number of characters in an UCS4 string.
   * @param str The UCS4 string.
   * @return The number of characters.
   */
  static size_t ucs4strlen(const ucs4_t *str);

  /**
   * Find the number of UCS4 characters in an UTF-8 string. I.e.
   * how many UCS4 characters would be needed for the string.
   * Invalid UTF-8 sequences are not counted.
   * @param str The UTF-8 string.
   * @return The number of characters needed.
   */
  static size_t ucs4len(const char *str);

  /**
   * Find the number of characters in an UTF-8 string, up to
   * a maximum of bytes.
   * @param str The UTF-8 string.
   * @param n The max number of bytes to consider.
   * @return The number of characters needed.
   */
  static size_t ucs4nlen(const char *str, size_t n);

  /**
   * Copy an UTF-8 string into an UCS4 string.
   * Invalid UTF-8 sequences are skipped.
   * @param dst The UCS4 destination buffer.
   * @param src The UTF-8 source buffer.
   * @return A pointer to the terminating zero written at the end of the
   *         converted string in dst.
   */
  static ucs4_t *ucs4copy(ucs4_t *dst, const char *src);

  /**
   * Copy an UTF-8 string into an UCS4 string, up to a maximum
   * number of bytes from the UTF-8 string.
   * @param dst Destination UCS4 string buffer.
   * @param src Source UTF-8 string.
   * @param maxsrc Max number of bytes to read from src.
   * @return A pointer to the terminating zero written at the end of the
   *         converted string in dst.
   */
  static ucs4_t *ucs4ncopy(ucs4_t *dst, const char *src, int maxsrc);

  /**
   * Copy an UTF-8 string to an UTF-8 string.
   * This only copies the valid UTF-8 characters.
   * @param src The source UTF-8 string.
   * @return Pointer to a newly allocated (malloc) buffer with the result.
   */
  static char *strdupUTF8(const char *src);

  /**
   * Copy an UTF-8 string to an UTF-8 string, converting
   * to lowercase as we go.
   * @param src The source UTF-8 string.
   * @return Pointer to a newly allocated (malloc) buffer with the result.
   */
  static char *strlowdupUTF8(const char *src);

  /**
   * Copy an ISO-8859-1 string to an UTF-8 string.
   * @param src The source ISO-8859-1 string.
   * @return Pointer to a newly allocated (malloc) buffer with the UTF-8
   *         result.
   */
  static char *strdupLAT1(const char *src);

  /**
   * Get the next UCS4 character from an UTF-8 string buffer.
   * Modify the src pointer to allow future calls.
   * @param src The address of a pointer to the current position
   *            in the UTF-8 string.
   * @param length The maximum allowed length of the byte sequence.
   *               -1 means no check.
   * @return The next UCS4 character, or _BadUTF8Char if the
   * next character is invalid.
   */
  static ucs4_t getUTF8Char(unsigned const char *&src,
                            int length = -1);

  /**
   * Convenience overload of getUTF8Char() for plain (non-const) char
   * buffers; forwards to the unsigned version and writes the advanced
   * position back to src.
   */
  static ucs4_t getUTF8Char(char *&src,
                            int length = -1)
  {
    unsigned const char *cursor = reinterpret_cast<unsigned char*>(src);
    const ucs4_t res = getUTF8Char(cursor, length);
    src = reinterpret_cast<char*>(const_cast<unsigned char*>(cursor));
    return res;
  }


  /** Move forwards or backwards a number of characters within an UTF8 buffer
   * Modify pos to yield new position if possible
   * @param start A pointer to the start of the UTF8 buffer
   * @param length The length of the UTF8 buffer
   * @param pos A pointer to the current position within the UTF8 buffer,
   *        updated to reflect new position upon return
   * @param offset An offset (+/-) in number of UTF8 characters.
   *        Offset 0 means move to the start of the current character.
   * @return Number of bytes moved, or -1 if out of range
   */
  static int utf8move(unsigned const char* start, size_t length,
                      unsigned const char*& pos, off_t offset);
};

} // namespace fsa

// diff --git a/fsa/src/vespa/fsa/unicode_charprops.cpp b/fsa/src/vespa/fsa/unicode_charprops.cpp
// new file mode 100644
// index 00000000000..3bcc45d4a2c
// --- /dev/null
// +++ b/fsa/src/vespa/fsa/unicode_charprops.cpp
// @@ -0,0 +1,1688 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "unicode.h" + +namespace fsa { + +/* + * Bit 0 indicates white space character + * Bit 1 indicates Word character + * Bit 2 indicates ideographic character + * Bit 3 indicates decimal digit character + * Bit 4 indicates ignorable control character + */ + +static unsigned char _intCompCharProps[11264]={ + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x02, 0x00, 0x00, 0x10, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x10, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 
0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x00, 0x02, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x10, 0x10, 0x10, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x10, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x00, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x00, 0x02, 0x00, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x00, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x00, 0x02, 0x00, 0x00, 0x02, + 0x02, 0x00, 0x02, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x00, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x02, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, + 0x12, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x10, 0x10, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x10, 0x10, 0x10, 0x01, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0A, 0x0A, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x02, 0x00, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x02, 0x06, 0x06, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x06, 0x06, 0x06, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x12, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 
0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 
0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x00, + 0x02, 0x02, 0x00, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, + 0x0A, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x22, 0x22, + 0x12, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x00, 0x00, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, +}; + +const unsigned char *Unicode::_compCharProps[256]={ + _intCompCharProps+0x0000, /* Page 0x00 */ + _intCompCharProps+0x0100, /* Page 0x01 */ + _intCompCharProps+0x0200, /* Page 0x02 */ + _intCompCharProps+0x0300, /* Page 0x03 */ + _intCompCharProps+0x0400, /* Page 0x04 */ + _intCompCharProps+0x0500, /* Page 0x05 */ + _intCompCharProps+0x0600, /* Page 0x06 */ + _intCompCharProps+0x0700, /* Page 0x07 */ + _intCompCharProps+0x0800, /* Page 0x08 */ + _intCompCharProps+0x0900, /* Page 0x09 */ + _intCompCharProps+0x0A00, /* Page 0x0A */ + _intCompCharProps+0x0B00, /* Page 0x0B */ + _intCompCharProps+0x0C00, /* Page 0x0C */ + _intCompCharProps+0x0D00, /* Page 0x0D */ + _intCompCharProps+0x0E00, /* Page 0x0E */ + _intCompCharProps+0x0F00, /* Page 0x0F */ + _intCompCharProps+0x1000, /* Page 0x10 */ + _intCompCharProps+0x1100, /* Page 0x11 */ + _intCompCharProps+0x1200, /* Page 0x12 */ + _intCompCharProps+0x1300, /* Page 0x13 */ + _intCompCharProps+0x1400, /* Page 0x14 */ + _intCompCharProps+0x0100, /* Page 0x15 */ + _intCompCharProps+0x1500, /* Page 0x16 */ + _intCompCharProps+0x1600, /* Page 0x17 */ + _intCompCharProps+0x1700, /* Page 0x18 */ + _intCompCharProps+0x1800, /* Page 0x19 */ + _intCompCharProps+0x0800, 
/* Page 0x1A */ + _intCompCharProps+0x0800, /* Page 0x1B */ + _intCompCharProps+0x0800, /* Page 0x1C */ + _intCompCharProps+0x1900, /* Page 0x1D */ + _intCompCharProps+0x1A00, /* Page 0x1E */ + _intCompCharProps+0x1B00, /* Page 0x1F */ + _intCompCharProps+0x1C00, /* Page 0x20 */ + _intCompCharProps+0x1D00, /* Page 0x21 */ + _intCompCharProps+0x0800, /* Page 0x22 */ + _intCompCharProps+0x0800, /* Page 0x23 */ + _intCompCharProps+0x0800, /* Page 0x24 */ + _intCompCharProps+0x0800, /* Page 0x25 */ + _intCompCharProps+0x0800, /* Page 0x26 */ + _intCompCharProps+0x0800, /* Page 0x27 */ + _intCompCharProps+0x0800, /* Page 0x28 */ + _intCompCharProps+0x0800, /* Page 0x29 */ + _intCompCharProps+0x0800, /* Page 0x2A */ + _intCompCharProps+0x0800, /* Page 0x2B */ + _intCompCharProps+0x0800, /* Page 0x2C */ + _intCompCharProps+0x0800, /* Page 0x2D */ + _intCompCharProps+0x0800, /* Page 0x2E */ + _intCompCharProps+0x0800, /* Page 0x2F */ + _intCompCharProps+0x1E00, /* Page 0x30 */ + _intCompCharProps+0x1F00, /* Page 0x31 */ + _intCompCharProps+0x2000, /* Page 0x32 */ + _intCompCharProps+0x0800, /* Page 0x33 */ + _intCompCharProps+0x2100, /* Page 0x34 */ + _intCompCharProps+0x2100, /* Page 0x35 */ + _intCompCharProps+0x2100, /* Page 0x36 */ + _intCompCharProps+0x2100, /* Page 0x37 */ + _intCompCharProps+0x2100, /* Page 0x38 */ + _intCompCharProps+0x2100, /* Page 0x39 */ + _intCompCharProps+0x2100, /* Page 0x3A */ + _intCompCharProps+0x2100, /* Page 0x3B */ + _intCompCharProps+0x2100, /* Page 0x3C */ + _intCompCharProps+0x2100, /* Page 0x3D */ + _intCompCharProps+0x2100, /* Page 0x3E */ + _intCompCharProps+0x2100, /* Page 0x3F */ + _intCompCharProps+0x2100, /* Page 0x40 */ + _intCompCharProps+0x2100, /* Page 0x41 */ + _intCompCharProps+0x2100, /* Page 0x42 */ + _intCompCharProps+0x2100, /* Page 0x43 */ + _intCompCharProps+0x2100, /* Page 0x44 */ + _intCompCharProps+0x2100, /* Page 0x45 */ + _intCompCharProps+0x2100, /* Page 0x46 */ + _intCompCharProps+0x2100, /* Page 0x47 */ + 
_intCompCharProps+0x2100, /* Page 0x48 */ + _intCompCharProps+0x2100, /* Page 0x49 */ + _intCompCharProps+0x2100, /* Page 0x4A */ + _intCompCharProps+0x2100, /* Page 0x4B */ + _intCompCharProps+0x2100, /* Page 0x4C */ + _intCompCharProps+0x2200, /* Page 0x4D */ + _intCompCharProps+0x2100, /* Page 0x4E */ + _intCompCharProps+0x2100, /* Page 0x4F */ + _intCompCharProps+0x2100, /* Page 0x50 */ + _intCompCharProps+0x2100, /* Page 0x51 */ + _intCompCharProps+0x2100, /* Page 0x52 */ + _intCompCharProps+0x2100, /* Page 0x53 */ + _intCompCharProps+0x2100, /* Page 0x54 */ + _intCompCharProps+0x2100, /* Page 0x55 */ + _intCompCharProps+0x2100, /* Page 0x56 */ + _intCompCharProps+0x2100, /* Page 0x57 */ + _intCompCharProps+0x2100, /* Page 0x58 */ + _intCompCharProps+0x2100, /* Page 0x59 */ + _intCompCharProps+0x2100, /* Page 0x5A */ + _intCompCharProps+0x2100, /* Page 0x5B */ + _intCompCharProps+0x2100, /* Page 0x5C */ + _intCompCharProps+0x2100, /* Page 0x5D */ + _intCompCharProps+0x2100, /* Page 0x5E */ + _intCompCharProps+0x2100, /* Page 0x5F */ + _intCompCharProps+0x2100, /* Page 0x60 */ + _intCompCharProps+0x2100, /* Page 0x61 */ + _intCompCharProps+0x2100, /* Page 0x62 */ + _intCompCharProps+0x2100, /* Page 0x63 */ + _intCompCharProps+0x2100, /* Page 0x64 */ + _intCompCharProps+0x2100, /* Page 0x65 */ + _intCompCharProps+0x2100, /* Page 0x66 */ + _intCompCharProps+0x2100, /* Page 0x67 */ + _intCompCharProps+0x2100, /* Page 0x68 */ + _intCompCharProps+0x2100, /* Page 0x69 */ + _intCompCharProps+0x2100, /* Page 0x6A */ + _intCompCharProps+0x2100, /* Page 0x6B */ + _intCompCharProps+0x2100, /* Page 0x6C */ + _intCompCharProps+0x2100, /* Page 0x6D */ + _intCompCharProps+0x2100, /* Page 0x6E */ + _intCompCharProps+0x2100, /* Page 0x6F */ + _intCompCharProps+0x2100, /* Page 0x70 */ + _intCompCharProps+0x2100, /* Page 0x71 */ + _intCompCharProps+0x2100, /* Page 0x72 */ + _intCompCharProps+0x2100, /* Page 0x73 */ + _intCompCharProps+0x2100, /* Page 0x74 */ + 
_intCompCharProps+0x2100, /* Page 0x75 */ + _intCompCharProps+0x2100, /* Page 0x76 */ + _intCompCharProps+0x2100, /* Page 0x77 */ + _intCompCharProps+0x2100, /* Page 0x78 */ + _intCompCharProps+0x2100, /* Page 0x79 */ + _intCompCharProps+0x2100, /* Page 0x7A */ + _intCompCharProps+0x2100, /* Page 0x7B */ + _intCompCharProps+0x2100, /* Page 0x7C */ + _intCompCharProps+0x2100, /* Page 0x7D */ + _intCompCharProps+0x2100, /* Page 0x7E */ + _intCompCharProps+0x2100, /* Page 0x7F */ + _intCompCharProps+0x2100, /* Page 0x80 */ + _intCompCharProps+0x2100, /* Page 0x81 */ + _intCompCharProps+0x2100, /* Page 0x82 */ + _intCompCharProps+0x2100, /* Page 0x83 */ + _intCompCharProps+0x2100, /* Page 0x84 */ + _intCompCharProps+0x2100, /* Page 0x85 */ + _intCompCharProps+0x2100, /* Page 0x86 */ + _intCompCharProps+0x2100, /* Page 0x87 */ + _intCompCharProps+0x2100, /* Page 0x88 */ + _intCompCharProps+0x2100, /* Page 0x89 */ + _intCompCharProps+0x2100, /* Page 0x8A */ + _intCompCharProps+0x2100, /* Page 0x8B */ + _intCompCharProps+0x2100, /* Page 0x8C */ + _intCompCharProps+0x2100, /* Page 0x8D */ + _intCompCharProps+0x2100, /* Page 0x8E */ + _intCompCharProps+0x2100, /* Page 0x8F */ + _intCompCharProps+0x2100, /* Page 0x90 */ + _intCompCharProps+0x2100, /* Page 0x91 */ + _intCompCharProps+0x2100, /* Page 0x92 */ + _intCompCharProps+0x2100, /* Page 0x93 */ + _intCompCharProps+0x2100, /* Page 0x94 */ + _intCompCharProps+0x2100, /* Page 0x95 */ + _intCompCharProps+0x2100, /* Page 0x96 */ + _intCompCharProps+0x2100, /* Page 0x97 */ + _intCompCharProps+0x2100, /* Page 0x98 */ + _intCompCharProps+0x2100, /* Page 0x99 */ + _intCompCharProps+0x2100, /* Page 0x9A */ + _intCompCharProps+0x2100, /* Page 0x9B */ + _intCompCharProps+0x2100, /* Page 0x9C */ + _intCompCharProps+0x2100, /* Page 0x9D */ + _intCompCharProps+0x2100, /* Page 0x9E */ + _intCompCharProps+0x2300, /* Page 0x9F */ + _intCompCharProps+0x0100, /* Page 0xA0 */ + _intCompCharProps+0x0100, /* Page 0xA1 */ + 
_intCompCharProps+0x0100, /* Page 0xA2 */ + _intCompCharProps+0x0100, /* Page 0xA3 */ + _intCompCharProps+0x2400, /* Page 0xA4 */ + _intCompCharProps+0x0800, /* Page 0xA5 */ + _intCompCharProps+0x0800, /* Page 0xA6 */ + _intCompCharProps+0x0800, /* Page 0xA7 */ + _intCompCharProps+0x0800, /* Page 0xA8 */ + _intCompCharProps+0x0800, /* Page 0xA9 */ + _intCompCharProps+0x0800, /* Page 0xAA */ + _intCompCharProps+0x0800, /* Page 0xAB */ + _intCompCharProps+0x0100, /* Page 0xAC */ + _intCompCharProps+0x0100, /* Page 0xAD */ + _intCompCharProps+0x0100, /* Page 0xAE */ + _intCompCharProps+0x0100, /* Page 0xAF */ + _intCompCharProps+0x0100, /* Page 0xB0 */ + _intCompCharProps+0x0100, /* Page 0xB1 */ + _intCompCharProps+0x0100, /* Page 0xB2 */ + _intCompCharProps+0x0100, /* Page 0xB3 */ + _intCompCharProps+0x0100, /* Page 0xB4 */ + _intCompCharProps+0x0100, /* Page 0xB5 */ + _intCompCharProps+0x0100, /* Page 0xB6 */ + _intCompCharProps+0x0100, /* Page 0xB7 */ + _intCompCharProps+0x0100, /* Page 0xB8 */ + _intCompCharProps+0x0100, /* Page 0xB9 */ + _intCompCharProps+0x0100, /* Page 0xBA */ + _intCompCharProps+0x0100, /* Page 0xBB */ + _intCompCharProps+0x0100, /* Page 0xBC */ + _intCompCharProps+0x0100, /* Page 0xBD */ + _intCompCharProps+0x0100, /* Page 0xBE */ + _intCompCharProps+0x0100, /* Page 0xBF */ + _intCompCharProps+0x0100, /* Page 0xC0 */ + _intCompCharProps+0x0100, /* Page 0xC1 */ + _intCompCharProps+0x0100, /* Page 0xC2 */ + _intCompCharProps+0x0100, /* Page 0xC3 */ + _intCompCharProps+0x0100, /* Page 0xC4 */ + _intCompCharProps+0x0100, /* Page 0xC5 */ + _intCompCharProps+0x0100, /* Page 0xC6 */ + _intCompCharProps+0x0100, /* Page 0xC7 */ + _intCompCharProps+0x0100, /* Page 0xC8 */ + _intCompCharProps+0x0100, /* Page 0xC9 */ + _intCompCharProps+0x0100, /* Page 0xCA */ + _intCompCharProps+0x0100, /* Page 0xCB */ + _intCompCharProps+0x0100, /* Page 0xCC */ + _intCompCharProps+0x0100, /* Page 0xCD */ + _intCompCharProps+0x0100, /* Page 0xCE */ + 
_intCompCharProps+0x0100, /* Page 0xCF */ + _intCompCharProps+0x0100, /* Page 0xD0 */ + _intCompCharProps+0x0100, /* Page 0xD1 */ + _intCompCharProps+0x0100, /* Page 0xD2 */ + _intCompCharProps+0x0100, /* Page 0xD3 */ + _intCompCharProps+0x0100, /* Page 0xD4 */ + _intCompCharProps+0x0100, /* Page 0xD5 */ + _intCompCharProps+0x0100, /* Page 0xD6 */ + _intCompCharProps+0x2500, /* Page 0xD7 */ + _intCompCharProps+0x2600, /* Page 0xD8 */ + _intCompCharProps+0x2600, /* Page 0xD9 */ + _intCompCharProps+0x2600, /* Page 0xDA */ + _intCompCharProps+0x2600, /* Page 0xDB */ + _intCompCharProps+0x2600, /* Page 0xDC */ + _intCompCharProps+0x2600, /* Page 0xDD */ + _intCompCharProps+0x2600, /* Page 0xDE */ + _intCompCharProps+0x2600, /* Page 0xDF */ + _intCompCharProps+0x0100, /* Page 0xE0 */ + _intCompCharProps+0x0100, /* Page 0xE1 */ + _intCompCharProps+0x0100, /* Page 0xE2 */ + _intCompCharProps+0x0100, /* Page 0xE3 */ + _intCompCharProps+0x0100, /* Page 0xE4 */ + _intCompCharProps+0x0100, /* Page 0xE5 */ + _intCompCharProps+0x0100, /* Page 0xE6 */ + _intCompCharProps+0x0100, /* Page 0xE7 */ + _intCompCharProps+0x0100, /* Page 0xE8 */ + _intCompCharProps+0x0100, /* Page 0xE9 */ + _intCompCharProps+0x0100, /* Page 0xEA */ + _intCompCharProps+0x0100, /* Page 0xEB */ + _intCompCharProps+0x0100, /* Page 0xEC */ + _intCompCharProps+0x0100, /* Page 0xED */ + _intCompCharProps+0x0100, /* Page 0xEE */ + _intCompCharProps+0x0100, /* Page 0xEF */ + _intCompCharProps+0x0100, /* Page 0xF0 */ + _intCompCharProps+0x0100, /* Page 0xF1 */ + _intCompCharProps+0x0100, /* Page 0xF2 */ + _intCompCharProps+0x0100, /* Page 0xF3 */ + _intCompCharProps+0x0100, /* Page 0xF4 */ + _intCompCharProps+0x0100, /* Page 0xF5 */ + _intCompCharProps+0x0100, /* Page 0xF6 */ + _intCompCharProps+0x0100, /* Page 0xF7 */ + _intCompCharProps+0x0100, /* Page 0xF8 */ + _intCompCharProps+0x2100, /* Page 0xF9 */ + _intCompCharProps+0x2700, /* Page 0xFA */ + _intCompCharProps+0x2800, /* Page 0xFB */ + 
_intCompCharProps+0x0100, /* Page 0xFC */ + _intCompCharProps+0x2900, /* Page 0xFD */ + _intCompCharProps+0x2A00, /* Page 0xFE */ + _intCompCharProps+0x2B00 /* Page 0xFF */ +}; + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/unicode_lowercase.cpp b/fsa/src/vespa/fsa/unicode_lowercase.cpp new file mode 100644 index 00000000000..e69368c6ef3 --- /dev/null +++ b/fsa/src/vespa/fsa/unicode_lowercase.cpp @@ -0,0 +1,656 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "unicode.h" + +namespace fsa { + +static unsigned short _intCompLowerCase[3072]={ + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0061u, 0x0062u, 0x0063u, 0x0064u, 0x0065u, 0x0066u, 0x0067u, + 0x0068u, 0x0069u, 0x006Au, 0x006Bu, 0x006Cu, 0x006Du, 0x006Eu, 0x006Fu, + 0x0070u, 0x0071u, 0x0072u, 0x0073u, 0x0074u, 0x0075u, 0x0076u, 0x0077u, + 0x0078u, 0x0079u, 0x007Au, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x00E0u, 0x00E1u, 0x00E2u, 0x00E3u, 0x00E4u, 0x00E5u, 0x00E6u, 0x00E7u, + 0x00E8u, 0x00E9u, 0x00EAu, 0x00EBu, 0x00ECu, 0x00EDu, 0x00EEu, 0x00EFu, + 0x00F0u, 0x00F1u, 0x00F2u, 0x00F3u, 0x00F4u, 0x00F5u, 0x00F6u, 0x0000u, + 0x00F8u, 0x00F9u, 0x00FAu, 0x00FBu, 0x00FCu, 0x00FDu, 0x00FEu, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0101u, 0x0000u, 0x0103u, 0x0000u, 0x0105u, 0x0000u, 0x0107u, 0x0000u, + 0x0109u, 0x0000u, 0x010Bu, 0x0000u, 0x010Du, 0x0000u, 0x010Fu, 0x0000u, + 0x0111u, 0x0000u, 0x0113u, 0x0000u, 0x0115u, 0x0000u, 0x0117u, 0x0000u, + 0x0119u, 0x0000u, 0x011Bu, 0x0000u, 0x011Du, 0x0000u, 0x011Fu, 0x0000u, + 0x0121u, 0x0000u, 0x0123u, 0x0000u, 0x0125u, 0x0000u, 0x0127u, 0x0000u, + 0x0129u, 0x0000u, 0x012Bu, 0x0000u, 0x012Du, 0x0000u, 0x012Fu, 0x0000u, + 0x0069u, 0x0000u, 0x0133u, 0x0000u, 0x0135u, 0x0000u, 0x0137u, 0x0000u, + 0x0000u, 0x013Au, 0x0000u, 0x013Cu, 0x0000u, 0x013Eu, 0x0000u, 0x0140u, + 0x0000u, 0x0142u, 0x0000u, 0x0144u, 0x0000u, 0x0146u, 0x0000u, 0x0148u, + 0x0000u, 0x0000u, 0x014Bu, 0x0000u, 0x014Du, 0x0000u, 0x014Fu, 0x0000u, + 0x0151u, 0x0000u, 0x0153u, 0x0000u, 0x0155u, 0x0000u, 0x0157u, 0x0000u, + 0x0159u, 0x0000u, 0x015Bu, 0x0000u, 0x015Du, 0x0000u, 0x015Fu, 0x0000u, + 0x0161u, 0x0000u, 0x0163u, 0x0000u, 0x0165u, 0x0000u, 
0x0167u, 0x0000u, + 0x0169u, 0x0000u, 0x016Bu, 0x0000u, 0x016Du, 0x0000u, 0x016Fu, 0x0000u, + 0x0171u, 0x0000u, 0x0173u, 0x0000u, 0x0175u, 0x0000u, 0x0177u, 0x0000u, + 0x00FFu, 0x017Au, 0x0000u, 0x017Cu, 0x0000u, 0x017Eu, 0x0000u, 0x0000u, + 0x0000u, 0x0253u, 0x0183u, 0x0000u, 0x0185u, 0x0000u, 0x0254u, 0x0188u, + 0x0000u, 0x0256u, 0x0257u, 0x018Cu, 0x0000u, 0x0000u, 0x01DDu, 0x0259u, + 0x025Bu, 0x0192u, 0x0000u, 0x0260u, 0x0263u, 0x0000u, 0x0269u, 0x0268u, + 0x0199u, 0x0000u, 0x0000u, 0x0000u, 0x026Fu, 0x0272u, 0x0000u, 0x0275u, + 0x01A1u, 0x0000u, 0x01A3u, 0x0000u, 0x01A5u, 0x0000u, 0x0280u, 0x01A8u, + 0x0000u, 0x0283u, 0x0000u, 0x0000u, 0x01ADu, 0x0000u, 0x0288u, 0x01B0u, + 0x0000u, 0x028Au, 0x028Bu, 0x01B4u, 0x0000u, 0x01B6u, 0x0000u, 0x0292u, + 0x01B9u, 0x0000u, 0x0000u, 0x0000u, 0x01BDu, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x01C6u, 0x01C6u, 0x0000u, 0x01C9u, + 0x01C9u, 0x0000u, 0x01CCu, 0x01CCu, 0x0000u, 0x01CEu, 0x0000u, 0x01D0u, + 0x0000u, 0x01D2u, 0x0000u, 0x01D4u, 0x0000u, 0x01D6u, 0x0000u, 0x01D8u, + 0x0000u, 0x01DAu, 0x0000u, 0x01DCu, 0x0000u, 0x0000u, 0x01DFu, 0x0000u, + 0x01E1u, 0x0000u, 0x01E3u, 0x0000u, 0x01E5u, 0x0000u, 0x01E7u, 0x0000u, + 0x01E9u, 0x0000u, 0x01EBu, 0x0000u, 0x01EDu, 0x0000u, 0x01EFu, 0x0000u, + 0x0000u, 0x01F3u, 0x01F3u, 0x0000u, 0x01F5u, 0x0000u, 0x0195u, 0x01BFu, + 0x01F9u, 0x0000u, 0x01FBu, 0x0000u, 0x01FDu, 0x0000u, 0x01FFu, 0x0000u, + 0x0201u, 0x0000u, 0x0203u, 0x0000u, 0x0205u, 0x0000u, 0x0207u, 0x0000u, + 0x0209u, 0x0000u, 0x020Bu, 0x0000u, 0x020Du, 0x0000u, 0x020Fu, 0x0000u, + 0x0211u, 0x0000u, 0x0213u, 0x0000u, 0x0215u, 0x0000u, 0x0217u, 0x0000u, + 0x0219u, 0x0000u, 0x021Bu, 0x0000u, 0x021Du, 0x0000u, 0x021Fu, 0x0000u, + 0x019Eu, 0x0000u, 0x0223u, 0x0000u, 0x0225u, 0x0000u, 0x0227u, 0x0000u, + 0x0229u, 0x0000u, 0x022Bu, 0x0000u, 0x022Du, 0x0000u, 0x022Fu, 0x0000u, + 0x0231u, 0x0000u, 0x0233u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x03ACu, 0x0000u, + 0x03ADu, 0x03AEu, 0x03AFu, 0x0000u, 0x03CCu, 0x0000u, 0x03CDu, 0x03CEu, + 0x0000u, 0x03B1u, 0x03B2u, 0x03B3u, 0x03B4u, 0x03B5u, 0x03B6u, 0x03B7u, + 0x03B8u, 0x03B9u, 0x03BAu, 0x03BBu, 0x03BCu, 0x03BDu, 0x03BEu, 0x03BFu, + 0x03C0u, 0x03C1u, 0x0000u, 0x03C3u, 0x03C4u, 0x03C5u, 0x03C6u, 0x03C7u, + 0x03C8u, 0x03C9u, 0x03CAu, 0x03CBu, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x03D9u, 0x0000u, 0x03DBu, 0x0000u, 0x03DDu, 0x0000u, 0x03DFu, 0x0000u, + 0x03E1u, 0x0000u, 0x03E3u, 0x0000u, 0x03E5u, 0x0000u, 0x03E7u, 0x0000u, + 0x03E9u, 0x0000u, 0x03EBu, 0x0000u, 0x03EDu, 0x0000u, 
0x03EFu, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x03B8u, 0x0000u, 0x0000u, 0x03F8u, + 0x0000u, 0x03F2u, 0x03FBu, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0450u, 0x0451u, 0x0452u, 0x0453u, 0x0454u, 0x0455u, 0x0456u, 0x0457u, + 0x0458u, 0x0459u, 0x045Au, 0x045Bu, 0x045Cu, 0x045Du, 0x045Eu, 0x045Fu, + 0x0430u, 0x0431u, 0x0432u, 0x0433u, 0x0434u, 0x0435u, 0x0436u, 0x0437u, + 0x0438u, 0x0439u, 0x043Au, 0x043Bu, 0x043Cu, 0x043Du, 0x043Eu, 0x043Fu, + 0x0440u, 0x0441u, 0x0442u, 0x0443u, 0x0444u, 0x0445u, 0x0446u, 0x0447u, + 0x0448u, 0x0449u, 0x044Au, 0x044Bu, 0x044Cu, 0x044Du, 0x044Eu, 0x044Fu, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0461u, 0x0000u, 0x0463u, 0x0000u, 0x0465u, 0x0000u, 0x0467u, 0x0000u, + 0x0469u, 0x0000u, 0x046Bu, 0x0000u, 0x046Du, 0x0000u, 0x046Fu, 0x0000u, + 0x0471u, 0x0000u, 0x0473u, 0x0000u, 0x0475u, 0x0000u, 0x0477u, 0x0000u, + 0x0479u, 0x0000u, 0x047Bu, 0x0000u, 0x047Du, 0x0000u, 0x047Fu, 0x0000u, + 0x0481u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x048Bu, 0x0000u, 0x048Du, 0x0000u, 0x048Fu, 0x0000u, + 0x0491u, 0x0000u, 0x0493u, 0x0000u, 0x0495u, 0x0000u, 0x0497u, 0x0000u, + 0x0499u, 0x0000u, 0x049Bu, 0x0000u, 0x049Du, 0x0000u, 0x049Fu, 0x0000u, + 0x04A1u, 0x0000u, 0x04A3u, 0x0000u, 0x04A5u, 0x0000u, 0x04A7u, 0x0000u, + 0x04A9u, 0x0000u, 0x04ABu, 0x0000u, 0x04ADu, 0x0000u, 0x04AFu, 0x0000u, + 0x04B1u, 0x0000u, 0x04B3u, 0x0000u, 0x04B5u, 0x0000u, 0x04B7u, 0x0000u, + 0x04B9u, 0x0000u, 0x04BBu, 0x0000u, 0x04BDu, 0x0000u, 0x04BFu, 0x0000u, + 0x0000u, 0x04C2u, 0x0000u, 0x04C4u, 0x0000u, 0x04C6u, 
0x0000u, 0x04C8u, + 0x0000u, 0x04CAu, 0x0000u, 0x04CCu, 0x0000u, 0x04CEu, 0x0000u, 0x0000u, + 0x04D1u, 0x0000u, 0x04D3u, 0x0000u, 0x04D5u, 0x0000u, 0x04D7u, 0x0000u, + 0x04D9u, 0x0000u, 0x04DBu, 0x0000u, 0x04DDu, 0x0000u, 0x04DFu, 0x0000u, + 0x04E1u, 0x0000u, 0x04E3u, 0x0000u, 0x04E5u, 0x0000u, 0x04E7u, 0x0000u, + 0x04E9u, 0x0000u, 0x04EBu, 0x0000u, 0x04EDu, 0x0000u, 0x04EFu, 0x0000u, + 0x04F1u, 0x0000u, 0x04F3u, 0x0000u, 0x04F5u, 0x0000u, 0x0000u, 0x0000u, + 0x04F9u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0501u, 0x0000u, 0x0503u, 0x0000u, 0x0505u, 0x0000u, 0x0507u, 0x0000u, + 0x0509u, 0x0000u, 0x050Bu, 0x0000u, 0x050Du, 0x0000u, 0x050Fu, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0561u, 0x0562u, 0x0563u, 0x0564u, 0x0565u, 0x0566u, 0x0567u, + 0x0568u, 0x0569u, 0x056Au, 0x056Bu, 0x056Cu, 0x056Du, 0x056Eu, 0x056Fu, + 0x0570u, 0x0571u, 0x0572u, 0x0573u, 0x0574u, 0x0575u, 0x0576u, 0x0577u, + 0x0578u, 0x0579u, 0x057Au, 0x057Bu, 0x057Cu, 0x057Du, 0x057Eu, 0x057Fu, + 0x0580u, 0x0581u, 0x0582u, 0x0583u, 0x0584u, 0x0585u, 0x0586u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1E01u, 0x0000u, 0x1E03u, 0x0000u, 0x1E05u, 0x0000u, 0x1E07u, 0x0000u, + 0x1E09u, 0x0000u, 0x1E0Bu, 0x0000u, 0x1E0Du, 0x0000u, 0x1E0Fu, 0x0000u, + 0x1E11u, 0x0000u, 0x1E13u, 0x0000u, 0x1E15u, 0x0000u, 0x1E17u, 0x0000u, + 0x1E19u, 0x0000u, 0x1E1Bu, 0x0000u, 0x1E1Du, 0x0000u, 0x1E1Fu, 0x0000u, + 0x1E21u, 0x0000u, 0x1E23u, 0x0000u, 0x1E25u, 0x0000u, 0x1E27u, 0x0000u, + 0x1E29u, 0x0000u, 0x1E2Bu, 0x0000u, 0x1E2Du, 0x0000u, 0x1E2Fu, 0x0000u, + 0x1E31u, 0x0000u, 0x1E33u, 0x0000u, 0x1E35u, 0x0000u, 0x1E37u, 0x0000u, + 0x1E39u, 0x0000u, 0x1E3Bu, 0x0000u, 0x1E3Du, 0x0000u, 0x1E3Fu, 0x0000u, + 0x1E41u, 0x0000u, 0x1E43u, 0x0000u, 0x1E45u, 0x0000u, 0x1E47u, 0x0000u, + 0x1E49u, 0x0000u, 0x1E4Bu, 0x0000u, 0x1E4Du, 0x0000u, 
0x1E4Fu, 0x0000u, + 0x1E51u, 0x0000u, 0x1E53u, 0x0000u, 0x1E55u, 0x0000u, 0x1E57u, 0x0000u, + 0x1E59u, 0x0000u, 0x1E5Bu, 0x0000u, 0x1E5Du, 0x0000u, 0x1E5Fu, 0x0000u, + 0x1E61u, 0x0000u, 0x1E63u, 0x0000u, 0x1E65u, 0x0000u, 0x1E67u, 0x0000u, + 0x1E69u, 0x0000u, 0x1E6Bu, 0x0000u, 0x1E6Du, 0x0000u, 0x1E6Fu, 0x0000u, + 0x1E71u, 0x0000u, 0x1E73u, 0x0000u, 0x1E75u, 0x0000u, 0x1E77u, 0x0000u, + 0x1E79u, 0x0000u, 0x1E7Bu, 0x0000u, 0x1E7Du, 0x0000u, 0x1E7Fu, 0x0000u, + 0x1E81u, 0x0000u, 0x1E83u, 0x0000u, 0x1E85u, 0x0000u, 0x1E87u, 0x0000u, + 0x1E89u, 0x0000u, 0x1E8Bu, 0x0000u, 0x1E8Du, 0x0000u, 0x1E8Fu, 0x0000u, + 0x1E91u, 0x0000u, 0x1E93u, 0x0000u, 0x1E95u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1EA1u, 0x0000u, 0x1EA3u, 0x0000u, 0x1EA5u, 0x0000u, 0x1EA7u, 0x0000u, + 0x1EA9u, 0x0000u, 0x1EABu, 0x0000u, 0x1EADu, 0x0000u, 0x1EAFu, 0x0000u, + 0x1EB1u, 0x0000u, 0x1EB3u, 0x0000u, 0x1EB5u, 0x0000u, 0x1EB7u, 0x0000u, + 0x1EB9u, 0x0000u, 0x1EBBu, 0x0000u, 0x1EBDu, 0x0000u, 0x1EBFu, 0x0000u, + 0x1EC1u, 0x0000u, 0x1EC3u, 0x0000u, 0x1EC5u, 0x0000u, 0x1EC7u, 0x0000u, + 0x1EC9u, 0x0000u, 0x1ECBu, 0x0000u, 0x1ECDu, 0x0000u, 0x1ECFu, 0x0000u, + 0x1ED1u, 0x0000u, 0x1ED3u, 0x0000u, 0x1ED5u, 0x0000u, 0x1ED7u, 0x0000u, + 0x1ED9u, 0x0000u, 0x1EDBu, 0x0000u, 0x1EDDu, 0x0000u, 0x1EDFu, 0x0000u, + 0x1EE1u, 0x0000u, 0x1EE3u, 0x0000u, 0x1EE5u, 0x0000u, 0x1EE7u, 0x0000u, + 0x1EE9u, 0x0000u, 0x1EEBu, 0x0000u, 0x1EEDu, 0x0000u, 0x1EEFu, 0x0000u, + 0x1EF1u, 0x0000u, 0x1EF3u, 0x0000u, 0x1EF5u, 0x0000u, 0x1EF7u, 0x0000u, + 0x1EF9u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F00u, 0x1F01u, 0x1F02u, 0x1F03u, 0x1F04u, 0x1F05u, 0x1F06u, 0x1F07u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F10u, 0x1F11u, 0x1F12u, 0x1F13u, 0x1F14u, 0x1F15u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x1F20u, 0x1F21u, 0x1F22u, 0x1F23u, 0x1F24u, 0x1F25u, 0x1F26u, 0x1F27u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F30u, 0x1F31u, 0x1F32u, 0x1F33u, 0x1F34u, 0x1F35u, 0x1F36u, 0x1F37u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F40u, 0x1F41u, 0x1F42u, 0x1F43u, 0x1F44u, 0x1F45u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x1F51u, 0x0000u, 0x1F53u, 0x0000u, 0x1F55u, 0x0000u, 0x1F57u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F60u, 0x1F61u, 0x1F62u, 0x1F63u, 0x1F64u, 0x1F65u, 0x1F66u, 0x1F67u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F80u, 0x1F81u, 0x1F82u, 0x1F83u, 0x1F84u, 0x1F85u, 0x1F86u, 0x1F87u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F90u, 0x1F91u, 0x1F92u, 0x1F93u, 0x1F94u, 0x1F95u, 0x1F96u, 0x1F97u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1FA0u, 0x1FA1u, 0x1FA2u, 0x1FA3u, 0x1FA4u, 0x1FA5u, 0x1FA6u, 0x1FA7u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1FB0u, 0x1FB1u, 0x1F70u, 0x1F71u, 0x1FB3u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F72u, 0x1F73u, 0x1F74u, 0x1F75u, 0x1FC3u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1FD0u, 0x1FD1u, 0x1F76u, 0x1F77u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1FE0u, 0x1FE1u, 0x1F7Au, 0x1F7Bu, 0x1FE5u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x1F78u, 0x1F79u, 0x1F7Cu, 0x1F7Du, 0x1FF3u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x03C9u, 0x0000u, + 0x0000u, 0x0000u, 0x006Bu, 0x00E5u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x2170u, 0x2171u, 0x2172u, 0x2173u, 0x2174u, 0x2175u, 0x2176u, 0x2177u, + 0x2178u, 0x2179u, 0x217Au, 0x217Bu, 0x217Cu, 0x217Du, 0x217Eu, 0x217Fu, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x24D0u, 0x24D1u, + 0x24D2u, 0x24D3u, 0x24D4u, 0x24D5u, 0x24D6u, 0x24D7u, 0x24D8u, 0x24D9u, + 0x24DAu, 0x24DBu, 0x24DCu, 0x24DDu, 0x24DEu, 0x24DFu, 0x24E0u, 0x24E1u, + 0x24E2u, 0x24E3u, 0x24E4u, 0x24E5u, 0x24E6u, 0x24E7u, 0x24E8u, 0x24E9u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0xFF41u, 0xFF42u, 0xFF43u, 0xFF44u, 0xFF45u, 0xFF46u, 0xFF47u, + 0xFF48u, 0xFF49u, 0xFF4Au, 0xFF4Bu, 0xFF4Cu, 0xFF4Du, 0xFF4Eu, 0xFF4Fu, + 0xFF50u, 0xFF51u, 0xFF52u, 0xFF53u, 0xFF54u, 0xFF55u, 0xFF56u, 0xFF57u, + 0xFF58u, 0xFF59u, 0xFF5Au, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 
0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, +}; + +const unsigned short *Unicode::_compLowerCase[256]={ + _intCompLowerCase+0x0000, /* Page 0x00 */ + _intCompLowerCase+0x0100, /* Page 0x01 */ + _intCompLowerCase+0x0200, /* Page 0x02 */ + _intCompLowerCase+0x0300, /* Page 0x03 */ + _intCompLowerCase+0x0400, /* Page 0x04 */ + _intCompLowerCase+0x0500, /* Page 0x05 */ + _intCompLowerCase+0x0600, /* Page 0x06 */ + _intCompLowerCase+0x0600, /* Page 0x07 */ + _intCompLowerCase+0x0600, /* Page 0x08 */ + _intCompLowerCase+0x0600, /* Page 0x09 */ + _intCompLowerCase+0x0600, /* Page 0x0A */ + _intCompLowerCase+0x0600, /* Page 0x0B */ + _intCompLowerCase+0x0600, /* Page 0x0C */ + _intCompLowerCase+0x0600, /* Page 0x0D */ + _intCompLowerCase+0x0600, /* Page 0x0E */ + _intCompLowerCase+0x0600, /* Page 0x0F */ + _intCompLowerCase+0x0600, /* Page 0x10 */ + _intCompLowerCase+0x0600, /* Page 0x11 */ + 
_intCompLowerCase+0x0600, /* Page 0x12 */ + _intCompLowerCase+0x0600, /* Page 0x13 */ + _intCompLowerCase+0x0600, /* Page 0x14 */ + _intCompLowerCase+0x0600, /* Page 0x15 */ + _intCompLowerCase+0x0600, /* Page 0x16 */ + _intCompLowerCase+0x0600, /* Page 0x17 */ + _intCompLowerCase+0x0600, /* Page 0x18 */ + _intCompLowerCase+0x0600, /* Page 0x19 */ + _intCompLowerCase+0x0600, /* Page 0x1A */ + _intCompLowerCase+0x0600, /* Page 0x1B */ + _intCompLowerCase+0x0600, /* Page 0x1C */ + _intCompLowerCase+0x0600, /* Page 0x1D */ + _intCompLowerCase+0x0700, /* Page 0x1E */ + _intCompLowerCase+0x0800, /* Page 0x1F */ + _intCompLowerCase+0x0600, /* Page 0x20 */ + _intCompLowerCase+0x0900, /* Page 0x21 */ + _intCompLowerCase+0x0600, /* Page 0x22 */ + _intCompLowerCase+0x0600, /* Page 0x23 */ + _intCompLowerCase+0x0A00, /* Page 0x24 */ + _intCompLowerCase+0x0600, /* Page 0x25 */ + _intCompLowerCase+0x0600, /* Page 0x26 */ + _intCompLowerCase+0x0600, /* Page 0x27 */ + _intCompLowerCase+0x0600, /* Page 0x28 */ + _intCompLowerCase+0x0600, /* Page 0x29 */ + _intCompLowerCase+0x0600, /* Page 0x2A */ + _intCompLowerCase+0x0600, /* Page 0x2B */ + _intCompLowerCase+0x0600, /* Page 0x2C */ + _intCompLowerCase+0x0600, /* Page 0x2D */ + _intCompLowerCase+0x0600, /* Page 0x2E */ + _intCompLowerCase+0x0600, /* Page 0x2F */ + _intCompLowerCase+0x0600, /* Page 0x30 */ + _intCompLowerCase+0x0600, /* Page 0x31 */ + _intCompLowerCase+0x0600, /* Page 0x32 */ + _intCompLowerCase+0x0600, /* Page 0x33 */ + _intCompLowerCase+0x0600, /* Page 0x34 */ + _intCompLowerCase+0x0600, /* Page 0x35 */ + _intCompLowerCase+0x0600, /* Page 0x36 */ + _intCompLowerCase+0x0600, /* Page 0x37 */ + _intCompLowerCase+0x0600, /* Page 0x38 */ + _intCompLowerCase+0x0600, /* Page 0x39 */ + _intCompLowerCase+0x0600, /* Page 0x3A */ + _intCompLowerCase+0x0600, /* Page 0x3B */ + _intCompLowerCase+0x0600, /* Page 0x3C */ + _intCompLowerCase+0x0600, /* Page 0x3D */ + _intCompLowerCase+0x0600, /* Page 0x3E */ + 
_intCompLowerCase+0x0600, /* Page 0x3F */ + _intCompLowerCase+0x0600, /* Page 0x40 */ + _intCompLowerCase+0x0600, /* Page 0x41 */ + _intCompLowerCase+0x0600, /* Page 0x42 */ + _intCompLowerCase+0x0600, /* Page 0x43 */ + _intCompLowerCase+0x0600, /* Page 0x44 */ + _intCompLowerCase+0x0600, /* Page 0x45 */ + _intCompLowerCase+0x0600, /* Page 0x46 */ + _intCompLowerCase+0x0600, /* Page 0x47 */ + _intCompLowerCase+0x0600, /* Page 0x48 */ + _intCompLowerCase+0x0600, /* Page 0x49 */ + _intCompLowerCase+0x0600, /* Page 0x4A */ + _intCompLowerCase+0x0600, /* Page 0x4B */ + _intCompLowerCase+0x0600, /* Page 0x4C */ + _intCompLowerCase+0x0600, /* Page 0x4D */ + _intCompLowerCase+0x0600, /* Page 0x4E */ + _intCompLowerCase+0x0600, /* Page 0x4F */ + _intCompLowerCase+0x0600, /* Page 0x50 */ + _intCompLowerCase+0x0600, /* Page 0x51 */ + _intCompLowerCase+0x0600, /* Page 0x52 */ + _intCompLowerCase+0x0600, /* Page 0x53 */ + _intCompLowerCase+0x0600, /* Page 0x54 */ + _intCompLowerCase+0x0600, /* Page 0x55 */ + _intCompLowerCase+0x0600, /* Page 0x56 */ + _intCompLowerCase+0x0600, /* Page 0x57 */ + _intCompLowerCase+0x0600, /* Page 0x58 */ + _intCompLowerCase+0x0600, /* Page 0x59 */ + _intCompLowerCase+0x0600, /* Page 0x5A */ + _intCompLowerCase+0x0600, /* Page 0x5B */ + _intCompLowerCase+0x0600, /* Page 0x5C */ + _intCompLowerCase+0x0600, /* Page 0x5D */ + _intCompLowerCase+0x0600, /* Page 0x5E */ + _intCompLowerCase+0x0600, /* Page 0x5F */ + _intCompLowerCase+0x0600, /* Page 0x60 */ + _intCompLowerCase+0x0600, /* Page 0x61 */ + _intCompLowerCase+0x0600, /* Page 0x62 */ + _intCompLowerCase+0x0600, /* Page 0x63 */ + _intCompLowerCase+0x0600, /* Page 0x64 */ + _intCompLowerCase+0x0600, /* Page 0x65 */ + _intCompLowerCase+0x0600, /* Page 0x66 */ + _intCompLowerCase+0x0600, /* Page 0x67 */ + _intCompLowerCase+0x0600, /* Page 0x68 */ + _intCompLowerCase+0x0600, /* Page 0x69 */ + _intCompLowerCase+0x0600, /* Page 0x6A */ + _intCompLowerCase+0x0600, /* Page 0x6B */ + 
_intCompLowerCase+0x0600, /* Page 0x6C */ + _intCompLowerCase+0x0600, /* Page 0x6D */ + _intCompLowerCase+0x0600, /* Page 0x6E */ + _intCompLowerCase+0x0600, /* Page 0x6F */ + _intCompLowerCase+0x0600, /* Page 0x70 */ + _intCompLowerCase+0x0600, /* Page 0x71 */ + _intCompLowerCase+0x0600, /* Page 0x72 */ + _intCompLowerCase+0x0600, /* Page 0x73 */ + _intCompLowerCase+0x0600, /* Page 0x74 */ + _intCompLowerCase+0x0600, /* Page 0x75 */ + _intCompLowerCase+0x0600, /* Page 0x76 */ + _intCompLowerCase+0x0600, /* Page 0x77 */ + _intCompLowerCase+0x0600, /* Page 0x78 */ + _intCompLowerCase+0x0600, /* Page 0x79 */ + _intCompLowerCase+0x0600, /* Page 0x7A */ + _intCompLowerCase+0x0600, /* Page 0x7B */ + _intCompLowerCase+0x0600, /* Page 0x7C */ + _intCompLowerCase+0x0600, /* Page 0x7D */ + _intCompLowerCase+0x0600, /* Page 0x7E */ + _intCompLowerCase+0x0600, /* Page 0x7F */ + _intCompLowerCase+0x0600, /* Page 0x80 */ + _intCompLowerCase+0x0600, /* Page 0x81 */ + _intCompLowerCase+0x0600, /* Page 0x82 */ + _intCompLowerCase+0x0600, /* Page 0x83 */ + _intCompLowerCase+0x0600, /* Page 0x84 */ + _intCompLowerCase+0x0600, /* Page 0x85 */ + _intCompLowerCase+0x0600, /* Page 0x86 */ + _intCompLowerCase+0x0600, /* Page 0x87 */ + _intCompLowerCase+0x0600, /* Page 0x88 */ + _intCompLowerCase+0x0600, /* Page 0x89 */ + _intCompLowerCase+0x0600, /* Page 0x8A */ + _intCompLowerCase+0x0600, /* Page 0x8B */ + _intCompLowerCase+0x0600, /* Page 0x8C */ + _intCompLowerCase+0x0600, /* Page 0x8D */ + _intCompLowerCase+0x0600, /* Page 0x8E */ + _intCompLowerCase+0x0600, /* Page 0x8F */ + _intCompLowerCase+0x0600, /* Page 0x90 */ + _intCompLowerCase+0x0600, /* Page 0x91 */ + _intCompLowerCase+0x0600, /* Page 0x92 */ + _intCompLowerCase+0x0600, /* Page 0x93 */ + _intCompLowerCase+0x0600, /* Page 0x94 */ + _intCompLowerCase+0x0600, /* Page 0x95 */ + _intCompLowerCase+0x0600, /* Page 0x96 */ + _intCompLowerCase+0x0600, /* Page 0x97 */ + _intCompLowerCase+0x0600, /* Page 0x98 */ + 
_intCompLowerCase+0x0600, /* Page 0x99 */ + _intCompLowerCase+0x0600, /* Page 0x9A */ + _intCompLowerCase+0x0600, /* Page 0x9B */ + _intCompLowerCase+0x0600, /* Page 0x9C */ + _intCompLowerCase+0x0600, /* Page 0x9D */ + _intCompLowerCase+0x0600, /* Page 0x9E */ + _intCompLowerCase+0x0600, /* Page 0x9F */ + _intCompLowerCase+0x0600, /* Page 0xA0 */ + _intCompLowerCase+0x0600, /* Page 0xA1 */ + _intCompLowerCase+0x0600, /* Page 0xA2 */ + _intCompLowerCase+0x0600, /* Page 0xA3 */ + _intCompLowerCase+0x0600, /* Page 0xA4 */ + _intCompLowerCase+0x0600, /* Page 0xA5 */ + _intCompLowerCase+0x0600, /* Page 0xA6 */ + _intCompLowerCase+0x0600, /* Page 0xA7 */ + _intCompLowerCase+0x0600, /* Page 0xA8 */ + _intCompLowerCase+0x0600, /* Page 0xA9 */ + _intCompLowerCase+0x0600, /* Page 0xAA */ + _intCompLowerCase+0x0600, /* Page 0xAB */ + _intCompLowerCase+0x0600, /* Page 0xAC */ + _intCompLowerCase+0x0600, /* Page 0xAD */ + _intCompLowerCase+0x0600, /* Page 0xAE */ + _intCompLowerCase+0x0600, /* Page 0xAF */ + _intCompLowerCase+0x0600, /* Page 0xB0 */ + _intCompLowerCase+0x0600, /* Page 0xB1 */ + _intCompLowerCase+0x0600, /* Page 0xB2 */ + _intCompLowerCase+0x0600, /* Page 0xB3 */ + _intCompLowerCase+0x0600, /* Page 0xB4 */ + _intCompLowerCase+0x0600, /* Page 0xB5 */ + _intCompLowerCase+0x0600, /* Page 0xB6 */ + _intCompLowerCase+0x0600, /* Page 0xB7 */ + _intCompLowerCase+0x0600, /* Page 0xB8 */ + _intCompLowerCase+0x0600, /* Page 0xB9 */ + _intCompLowerCase+0x0600, /* Page 0xBA */ + _intCompLowerCase+0x0600, /* Page 0xBB */ + _intCompLowerCase+0x0600, /* Page 0xBC */ + _intCompLowerCase+0x0600, /* Page 0xBD */ + _intCompLowerCase+0x0600, /* Page 0xBE */ + _intCompLowerCase+0x0600, /* Page 0xBF */ + _intCompLowerCase+0x0600, /* Page 0xC0 */ + _intCompLowerCase+0x0600, /* Page 0xC1 */ + _intCompLowerCase+0x0600, /* Page 0xC2 */ + _intCompLowerCase+0x0600, /* Page 0xC3 */ + _intCompLowerCase+0x0600, /* Page 0xC4 */ + _intCompLowerCase+0x0600, /* Page 0xC5 */ + 
_intCompLowerCase+0x0600, /* Page 0xC6 */ + _intCompLowerCase+0x0600, /* Page 0xC7 */ + _intCompLowerCase+0x0600, /* Page 0xC8 */ + _intCompLowerCase+0x0600, /* Page 0xC9 */ + _intCompLowerCase+0x0600, /* Page 0xCA */ + _intCompLowerCase+0x0600, /* Page 0xCB */ + _intCompLowerCase+0x0600, /* Page 0xCC */ + _intCompLowerCase+0x0600, /* Page 0xCD */ + _intCompLowerCase+0x0600, /* Page 0xCE */ + _intCompLowerCase+0x0600, /* Page 0xCF */ + _intCompLowerCase+0x0600, /* Page 0xD0 */ + _intCompLowerCase+0x0600, /* Page 0xD1 */ + _intCompLowerCase+0x0600, /* Page 0xD2 */ + _intCompLowerCase+0x0600, /* Page 0xD3 */ + _intCompLowerCase+0x0600, /* Page 0xD4 */ + _intCompLowerCase+0x0600, /* Page 0xD5 */ + _intCompLowerCase+0x0600, /* Page 0xD6 */ + _intCompLowerCase+0x0600, /* Page 0xD7 */ + _intCompLowerCase+0x0600, /* Page 0xD8 */ + _intCompLowerCase+0x0600, /* Page 0xD9 */ + _intCompLowerCase+0x0600, /* Page 0xDA */ + _intCompLowerCase+0x0600, /* Page 0xDB */ + _intCompLowerCase+0x0600, /* Page 0xDC */ + _intCompLowerCase+0x0600, /* Page 0xDD */ + _intCompLowerCase+0x0600, /* Page 0xDE */ + _intCompLowerCase+0x0600, /* Page 0xDF */ + _intCompLowerCase+0x0600, /* Page 0xE0 */ + _intCompLowerCase+0x0600, /* Page 0xE1 */ + _intCompLowerCase+0x0600, /* Page 0xE2 */ + _intCompLowerCase+0x0600, /* Page 0xE3 */ + _intCompLowerCase+0x0600, /* Page 0xE4 */ + _intCompLowerCase+0x0600, /* Page 0xE5 */ + _intCompLowerCase+0x0600, /* Page 0xE6 */ + _intCompLowerCase+0x0600, /* Page 0xE7 */ + _intCompLowerCase+0x0600, /* Page 0xE8 */ + _intCompLowerCase+0x0600, /* Page 0xE9 */ + _intCompLowerCase+0x0600, /* Page 0xEA */ + _intCompLowerCase+0x0600, /* Page 0xEB */ + _intCompLowerCase+0x0600, /* Page 0xEC */ + _intCompLowerCase+0x0600, /* Page 0xED */ + _intCompLowerCase+0x0600, /* Page 0xEE */ + _intCompLowerCase+0x0600, /* Page 0xEF */ + _intCompLowerCase+0x0600, /* Page 0xF0 */ + _intCompLowerCase+0x0600, /* Page 0xF1 */ + _intCompLowerCase+0x0600, /* Page 0xF2 */ + 
_intCompLowerCase+0x0600, /* Page 0xF3 */ + _intCompLowerCase+0x0600, /* Page 0xF4 */ + _intCompLowerCase+0x0600, /* Page 0xF5 */ + _intCompLowerCase+0x0600, /* Page 0xF6 */ + _intCompLowerCase+0x0600, /* Page 0xF7 */ + _intCompLowerCase+0x0600, /* Page 0xF8 */ + _intCompLowerCase+0x0600, /* Page 0xF9 */ + _intCompLowerCase+0x0600, /* Page 0xFA */ + _intCompLowerCase+0x0600, /* Page 0xFB */ + _intCompLowerCase+0x0600, /* Page 0xFC */ + _intCompLowerCase+0x0600, /* Page 0xFD */ + _intCompLowerCase+0x0600, /* Page 0xFE */ + _intCompLowerCase+0x0B00 /* Page 0xFF */ +}; + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/unicode_tables.cpp b/fsa/src/vespa/fsa/unicode_tables.cpp new file mode 100644 index 00000000000..d20255f29c5 --- /dev/null +++ b/fsa/src/vespa/fsa/unicode_tables.cpp @@ -0,0 +1,162 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "unicode.h" + +namespace fsa { + +const unsigned char Unicode::_isdigit[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + +const unsigned char Unicode::_isintegerindexop[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + +const unsigned char Unicode::_iswordchar[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + }; + +const unsigned char Unicode::_isidstartchar[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + +const unsigned char Unicode::_isidchar[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + +const unsigned char Unicode::_isspacechar[256] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + +const unsigned char Unicode::_tolower[256] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 
0xbd, 0xbe, 0xbf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, + }; + +const unsigned char Unicode::_utf8header[256] = { + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 
0x05, 0x05, 0x06, 0x06, 0x00, 0x00, + }; + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/vectorizer.cpp b/fsa/src/vespa/fsa/vectorizer.cpp new file mode 100644 index 00000000000..54c67fdc800 --- /dev/null +++ b/fsa/src/vespa/fsa/vectorizer.cpp @@ -0,0 +1,92 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizer.cpp + * @brief Simple document vectorizer based on %FSA (%Finite %State %Automaton) (implementation) + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <list> +#include <algorithm> + +#include <math.h> + +#include "vectorizer.h" +#include "fsa.h" + + +namespace fsa { + +// {{{ Vectorizer::TfIdf::weight + +double Vectorizer::TfIdf::weight(unsigned int tfnorm, unsigned int idfnorm, + double tfexp, double idfexp) const +{ + double tf_n, idf_n; + + if(tfnorm==0 || tfexp==0.0){ + tf_n = 1.0; + } + else{ + tf_n = (double)_tf/tfnorm; + if(tfexp!=1.0 && tf_n!=0.0){ + tf_n = exp(tfexp*log(tf_n)); + } + } + + if(idfnorm==0 || idfexp==0.0){ + idf_n = 1.0; + } + else{ + idf_n = 1.0-(double)_idf/idfnorm; + if(idf_n<0.0) + idf_n = 0.0; + if(idfexp!=1.0 && idf_n!=0.0){ + idf_n = exp(idfexp*log(idf_n)); + } + } + + return tf_n * idf_n; +} + +// }}} + +// {{{ Vectorizer::vectorize + +void Vectorizer::vectorize(const NGram &text, TermVector &vector, unsigned int limit, + bool keephits, double tfexp, double idfexp) const +{ + RawVector raw_vect(keephits); + RawVector::iterator rvi; + + _detector.detect(text,raw_vect); + vector.clear(); + unsigned int tfmax=1; + for(rvi=raw_vect.begin(); rvi!=raw_vect.end(); ++rvi){ + if(rvi->second.first.tf()>tfmax) + tfmax=rvi->second.first.tf(); + } + vector.reserve(raw_vect.size()); + for(rvi=raw_vect.begin(); rvi!=raw_vect.end(); ++rvi){ + vector.push_back(VectorItem(rvi->first,rvi->second.first.weight(tfmax,_idf_docs,tfexp,idfexp),rvi->second.second)); + } + 
std::sort(vector.begin(),vector.end()); + if(vector.size()>limit){ + vector.resize(limit); + } +} + +void Vectorizer::vectorize(const NGram &text, TermVector &vector, unsigned int limit, + double tfexp, double idfexp) const +{ + vectorize(text, vector, limit, false, tfexp, idfexp); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/vectorizer.h b/fsa/src/vespa/fsa/vectorizer.h new file mode 100644 index 00000000000..9e8856191da --- /dev/null +++ b/fsa/src/vespa/fsa/vectorizer.h @@ -0,0 +1,642 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizer.h + * @brief Simple document vectorizer based on %FSA (%Finite %State %Automaton) + */ + +#pragma once + +#include <string> +#include <map> +#include <vector> + +#include "fsa.h" +#include "detector.h" + +namespace fsa { + +// {{{ Vectorizer + +/** + * @class Vectorizer + * @brief Simple document vectorizer based on %FSA. + */ +class Vectorizer { + +public: + + // {{{ Vectorizer::VectorItem + + /** + * @class VectorItem + * @brief Document vector item. + * + * Document vector item. Contains a term/phrase and an assigned + * weight, and provides comparison operators for sorting. + */ + class VectorItem { + public: + typedef std::pair<unsigned int /*position*/, int /*length*/> Hit; + typedef std::vector<Hit> Hits; + private: + std::string _term; /**< Term/phrase. */ + double _weight; /**< Term weight. */ + Hits _hits; /**< The token positions at which the term was found */ + public: + /** + * @brief Default constructor, creates empty item with zero weight. + */ + VectorItem() : _term(), _weight(0.0), _hits() {} + + /** + * @brief Copy constructor. + * + * @param v VectorItem to copy. + */ + VectorItem(const VectorItem &v) : _term(v._term), _weight(v._weight), _hits(v._hits) {} + + /** + * @brief Constructor. 
+ * + * Creates a vector item from a string and a weight. + * + * @param t Term/phrase. + * @param w Weight. + */ + VectorItem(const std::string t, double w) : _term(t), _weight(w), _hits() {} + + /** + * @brief Constructor. + * + * Creates a vector item from a string and a weight. + * + * @param t Term/phrase. + * @param w Weight. + */ + VectorItem(const std::string t, double w, const Hits &h) : _term(t), _weight(w), _hits(h) {} + + /** + * @brief Destructor. + */ + ~VectorItem() {} + + /** + * @brief Assignment operator. + * + * @param v VectorItem. + * @return Reference to (this) VectorItem. + */ + const VectorItem& operator=(const VectorItem& v) + { + _term = v._term; + _weight = v._weight; + _hits = v._hits; + return *this; + } + + /** + * @brief Less-than operator. + * + * The order is highest weight first, than sorted alphabetically. + * + * @param v Other vector item. + * @return True is this item<other item. + */ + bool operator<(const VectorItem & v) const + { + if(_weight>v._weight) return true; + if(_weight<v._weight) return false; + if(_term<v._term) return true; + return false; + } + + /** + * @brief Greater-than operator. + * + * The order is highest weight first, than sorted alphabetically. + * + * @param v Other vector item. + * @return True is this item>other item. + */ + bool operator>(const VectorItem & v) const + { + if(_weight<v._weight) return true; + if(_weight>v._weight) return false; + if(_term>v._term) return true; + return false; + } + + /** + * @brief Equals operator. + * + * Two VectorItems equal if both the terms and weight are equal. + * + * @param v Other vector item. + * @return True is this item==other item. + */ + bool operator==(const VectorItem & v) const + { + if(_weight==v._weight && _term==v._term) return true; + return false; + } + + /** + * @brief Get the term/phrase. + * + * @return (Copy of) term/phrase. + */ + std::string term() const { return _term; } + + /** + * @brief An obsolete alias for term(). 
+ * + * @return (Copy of) term/phrase. + */ + std::string getTerm() const { return _term; } + + /** + * @brief Get the weight. + * + * @return Weight. + */ + double weight() const { return _weight; } + + /** + * @brief An obsolete alias for weight(). + * + * @return Weight. + */ + double getWeight() const { return _weight; } + + /** + * @brief Get the hits. + * + * @return A reference to the hits vector. + */ + const Hits &hits() const { return _hits; } + + }; + + // }}} + + // {{{ Vectorizer::TfIdf + + /** + * @class TfIdf + * @brief Class for computing TfIdf weights. + * + * Class for computing TfIdf (term frequency/inverse document + * frequency) weights. + */ + class TfIdf { + private: + unsigned int _tf; /**< Term frequency. */ + unsigned int _idf; /**< (Inverse) document frequency. */ + public: + /** + * @brief Default constructor. + */ + TfIdf() : _tf(0), _idf(0) {} + + /** + * @brief Copy constructor. + * + * @param ti TfIdf object to copy. + */ + TfIdf(const TfIdf &ti) : _tf(ti._tf), _idf(ti._idf) {} + + /** + * @brief Constructor. + * + * @param t Term frequency. + * @param i (Inverse) document frequency. + */ + TfIdf(unsigned int t, unsigned int i) : _tf(t), _idf(i) {} + + /** + * @brief Destructor. + */ + ~TfIdf() {} + + /** + * @brief Assignment operator. + * + * @param ti Reference to TfIdf object. + * @return Reference to (this) TfIdf object. + */ + const TfIdf& operator=(const TfIdf& ti) + { + _tf = ti._tf; + _idf = ti._idf; + return *this; + } + + /** + * @brief Assignment operator, set only Tf. + * + * @param t Term frequency. + * @return Reference to (this) TfIdf object. + */ + const TfIdf& operator=(unsigned int t) + { + _tf = t; + return *this; + } + + /** + * @brief Prefix increment operator. + * + * Prefix increment operator, increments Tf. + * + * @return Reference to (this) TfIdf object. + */ + TfIdf& operator++() + { + ++_tf; + return *this; + } + + /** + * @brief += operator. + * + * += operator, adds the parameter to Tf. 
+ * + * @return Reference to (this) TfIdf object. + */ + const TfIdf& operator+=(unsigned int t) + { + _tf+=t; + return *this; + } + + /** + * @brief Get Tf value. + * + * @return Tf (term frequency) value. + */ + unsigned int tf() const { return _tf; } + + /** + * @brief An obsolete alias for tf(). + * + * @return Tf (term frequency) value. + */ + unsigned int getTf() const { return _tf; } + + /** + * @brief Get Idf value. + * + * @return Idf ((inverse) document frequency) value. + */ + unsigned int idf() const { return _idf; } + + /** + * @brief An obsolete alias for idf(). + * + * @return Idf ((inverse) document frequency) value. + */ + unsigned int getIdf() const { return _idf; } + + /** + * @brief Compute the weight from the Tf and Idf values. + * + * @param tfnorm Normalize Tf (divide by tfnorm). + * @param idfnorm Normalize Idf (divide by idfnorm). + * @param tfexp Tf exponent. + * @param idfexp Idf exponent. + * @return Weight based on Tf and Idf values. + */ + double weight(unsigned int tfnorm=1, unsigned int idfnorm=1, + double tfexp=1.0, double idfexp=1.0) const; + + /** + * @brief An obsolete alias for weight(). + * + * @param tfnorm Normalize Tf (divide by tfnorm). + * @param idfnorm Normalize Idf (divide by idfnorm). + * @param tfexp Tf exponent. + * @param idfexp Idf exponent. + * @return Weight based on Tf and Idf values. + */ + double getWeight(unsigned int tfnorm=1, unsigned int idfnorm=1, + double tfexp=1.0, double idfexp=1.0) const + { + return weight(tfnorm,idfnorm,tfexp,idfexp); + } + + }; + + // }}} + + /** + * @brief Term vector type. + */ + typedef std::vector<VectorItem> TermVector; + + +private: + + // {{{ Vectorizer::RawVector + + /** + * @class RawVector + * @brief Class for building a raw document vector. + * + * The RawVector class is a subclass of Detector::Hits, so it can be + * used directly with a Detector. The recognized terms and phrases + * will be collected and counted (->term frequency). 
Idf counts are + * obtained from the automaton the first time the term is + * encountered. + */ + class RawVector : public Detector::Hits { + + public: + + typedef std::map<std::string, std::pair<TfIdf, VectorItem::Hits> > ItemMap; + + // {{{ Vectorizer::RawVector::iterator + + /** + * @class iterator + * @brief Iterator for the RawVector class. + * + * This class is actually a wrapper around an + * std::map<std::string,TfIdf>::iterator. + */ + class iterator { + friend class RawVector; + private: + + /** + * @brief The real (std::map<>) iterator. + */ + ItemMap::iterator _mi; + + /** + * @brief Constructor. + * + * @param mi A real (std::map<>) iterator. + */ + iterator(ItemMap::iterator mi) : _mi(mi) {} + + public: + + /** + * @brief Default constructor. + */ + iterator() : _mi() {} + + /** + * @brief Copy constructor. + * + * @param it Reference to a Vectorizer::RawVector::iterator + * object. + */ + iterator(const iterator &it) : _mi(it._mi) {} + + /** + * @brief Constructor. + * + * Initialize the iterator to the beginning of a RawVector + * object. + * + * @param rv Reference to a Vectorizer::RawVector object, the + * iterator will be initalized to rv.begin(). + */ + iterator(RawVector &rv) : _mi(rv._item_map.begin()) { } + + /** + * @brief Assignment operator. + * + * @param it Reference to another iterator. + * @return Reference to this iterator. + */ + iterator& operator=(const iterator &it) { _mi=it._mi; return *this; } + + /** + * @brief Not equals operator. + * + * @param it Reference to another iterator. + * @return True if the two iterators point to different elements. + */ + bool operator!=(const iterator &it) const { return _mi!=it._mi; } + + /** + * @brief Prefix increment operator. + * + * @return Reference to the (incremented) iterator. + */ + iterator& operator++() { ++_mi; return *this; } + + /** + * @brief Dereference operator + * + * @return Reference to the actual pair the iterator refers to. 
+ */ + ItemMap::value_type& operator*() { return _mi.operator*(); } + + /** + * @brief Dereference operator + * + * @return Pointer to the actual pair the iterator refers to. + */ + ItemMap::value_type* operator->() { return _mi.operator->(); } + }; + + // }}} + +#if (__GNUG__<3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1)) + friend RawVector::iterator; +#endif + + private: + + /** + * @brief Flag for controlling whether or not the detector will + * save hit position information. + */ + bool _save_positions; + + /** + * @brief The map holding the detected terms/phrases. + */ + ItemMap _item_map; + + public: + + /** + * @brief Default constructor. + */ + RawVector(bool save_positions = false) : _save_positions(save_positions), _item_map() {} + + /** + * @brief Destructor. + */ + ~RawVector() {} + + /** + * @brief Clear all data structures. + */ + void clear() { _item_map.clear(); } + + /** + * @brief Register a term or phrase. + * + * This method will be called by the detector for each term or + * recognized. + * + * @param text Input document (tokenized). + * @param from Index of first token of the phrase. + * @param length Length of the phrase. + * @param state Reference to the final state of the automaton + * after recognition of the phrase. + */ + void add(const NGram &text, + unsigned int from, int length, + const FSA::State &state) + { + ItemMap::iterator pos; + std::string str = text.join(" ",from,length); + pos=_item_map.find(str); + if(pos==_item_map.end()){ + pos=_item_map.insert( + ItemMap::value_type( + str, + std::pair<TfIdf,VectorItem::Hits>( + TfIdf(1,state.nData()), + VectorItem::Hits() + ) + ) + ).first; + } + else { + ++(pos->second.first); + } + if(_save_positions){ + pos->second.second.push_back(VectorItem::Hit(from,length)); + } + } + + /** + * @brief Get the size of the vector. + * + * @return Size of the vector (number of items). 
+ */ + unsigned int size() const { return _item_map.size(); } + + /** + * @brief Get an iterator to the beginning of the vector. + * + * @return Iterator pointing to the first item of the vector. + */ + iterator begin() { return iterator(_item_map.begin()); } + + /** + * @brief Get an iterator to the end of the vector. + * + * @return Iterator pointing beyond the last item of the vector. + */ + iterator end() { return iterator(_item_map.end()); } + + }; + + // }}} + + const FSA& _dictionary; /**< The dictionary. */ + Detector _detector; /**< The detector. */ + unsigned int _idf_docs; /**< Total number of documents (for Idf calculations) */ + + /** + * @brief Retrieve total number of documents from the automaton. + * + * Retrieve total number of documents from the automaton. For the + * Idf calculations to work properly, the total number of documents + * needs to be stored in the automaton. This is done via a special + * term, '#IDFDOCS', with a numerical meta info which equals the + * total number of documents. + */ + void initIdfCount() + { + _idf_docs=0; + FSA::State s(_dictionary); + if(s.start("#IDFDOCS")) + _idf_docs = s.nData(); + + if(!_idf_docs) + ++_idf_docs; + } + +public: + + /** + * @brief Constructor. + * + * Initialize the dictionary and the detector from an FSA. + * + * @param dict FSA + */ + Vectorizer(const FSA& dict) : + _dictionary(dict), + _detector(_dictionary), + _idf_docs(0) + { + initIdfCount(); + } + + /** + * @brief Constructor. + * + * Initialize the dictionary and the detector from an FSA. + * + * @param dict FSA + */ + Vectorizer(const FSA* dict) : + _dictionary(*dict), + _detector(_dictionary), + _idf_docs(0) + { + initIdfCount(); + } + + /** + * @brief Destructor. + */ + ~Vectorizer() {} + + + /** + * @brief Vectorize a document. + * + * @param text Input document. + * @param vector TermVector object to hold the document vector. + * @param limit Limit the number of vector items. 
+ * @param keephits Include in the vector items the hit positions of terms. + * @param tfexp Exponent for tf (term frequency). + * @param idfexp Exponent for idf (inverse document frequency). + */ + void vectorize(const NGram &text, TermVector &vector, unsigned int limit, + bool keephits, double tfexp = 1.0, double idfexp = 1.0) const; + + /** + * @brief Vectorize a document. + * + * In this version of the call, hit positions are not kept. + * + * @param text Input document. + * @param vector TermVector object to hold the document vector. + * @param limit Limit the number of vector items (default=15). + * @param tfexp Exponent for tf (term frequency). + * @param idfexp Exponent for idf (inverse document frequency). + */ + void vectorize(const NGram &text, TermVector &vector, unsigned int limit=15, + double tfexp = 1.0, double idfexp = 1.0) const; + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsa/wordchartokenizer.cpp b/fsa/src/vespa/fsa/wordchartokenizer.cpp new file mode 100644 index 00000000000..e6ea7ec918a --- /dev/null +++ b/fsa/src/vespa/fsa/wordchartokenizer.cpp @@ -0,0 +1,101 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "wordchartokenizer.h" +#include "unicode.h" + +#include <string.h> + + +namespace fsa { + +const bool WordCharTokenizer::_punctuation_table[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, +}; + + +bool WordCharTokenizer::init(const std::string &text) +{ + _tokens.clear(); + _current = 0; + + char *dup; + if(_lowercase) + dup = Unicode::strlowdupUTF8(text.c_str()); + else + dup = Unicode::strdupUTF8(text.c_str()); + + char *tmp = dup; + char *tok,*end; + ucs4_t ch=0; + bool need_punct=false, added_punct=false; + + while(*tmp) { + tok=NULL; + while((tok=tmp,*tmp) && + (ch=Unicode::getUTF8Char(tmp), + _punctuation==PUNCTUATION_WHITESPACEONLY?Unicode::isSpaceChar(ch):!Unicode::isWordChar(ch))){ + if(_punctuation!=PUNCTUATION_DISCARD && _punctuation!=PUNCTUATION_WHITESPACEONLY){ + if(ch<128 && _punctuation_table[ch] && need_punct && !added_punct){ + _tokens.push_back(_punctuation_token); + added_punct=true; + } + } + } + + while((end=tmp,*tmp) && + (ch=Unicode::getUTF8Char(tmp), + _punctuation==PUNCTUATION_WHITESPACEONLY?!Unicode::isSpaceChar(ch):Unicode::isWordChar(ch))); + + if(*end) { + *end=0; + } + if(*tok){ + _tokens.push_back(std::string((char *)tok)); + added_punct = false; + need_punct = true; + if(_punctuation!=PUNCTUATION_DISCARD && _punctuation!=PUNCTUATION_WHITESPACEONLY){ + if(ch<128 && _punctuation_table[ch]){ + if(_punctuation==PUNCTUATION_FULL || ch!='.' 
|| strlen(tok)>1){ + _tokens.push_back(_punctuation_token); + added_punct=true; + } + } + } + } + } + + if(added_punct) { // The last token is a puctuation, drop it + _tokens.pop_back(); + } + + free(dup); + return true; +} + + +bool WordCharTokenizer::hasMore() +{ + return _tokens.size()>_current; +} + +std::string WordCharTokenizer::getNext() +{ + if(_tokens.size()>_current){ + return _tokens[_current++]; + } + else{ + return std::string(); + } +} + +} // namespace fsa diff --git a/fsa/src/vespa/fsa/wordchartokenizer.h b/fsa/src/vespa/fsa/wordchartokenizer.h new file mode 100644 index 00000000000..c66c727207f --- /dev/null +++ b/fsa/src/vespa/fsa/wordchartokenizer.h @@ -0,0 +1,109 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file wordchartokenizer.h + * @brief Tokenizer based on the unicode WORDCHAR property. + */ + +#pragma once + +#include "tokenizer.h" + +#include <iostream> +#include <vector> +#include <string> +#include <algorithm> + + +namespace fsa { + +// {{{ class WordCharTokenizer + +/** + * @class WordCharTokenizer + * @brief Tokenizer based on the Unicode WORDCHAR property. + */ +class WordCharTokenizer : public Tokenizer { + +public: + /** + * @brief Enumareted type for specifying puctuation removal strategy. + * + * Enumareted type for specifying puctuation removal strategy. The + * following strategies are currently supported: + * - PUNCTUATION_DISCARD: discard all punctuation. + * - PUNCTUATION_FULL: honour all punctuation and insert + * punctuation token. + * - PUNCTUATION_SMART: same as PUNCTUATION_FULL, with some + * heuristics to not break acronyms and names. + * - PUNCTUATION_WHITESPACEONLY: treat everything (including + * punctuation) as word characters, except white space. 
+ */ + enum Punctuation { + PUNCTUATION_DISCARD = 0, + PUNCTUATION_FULL, + PUNCTUATION_SMART, + PUNCTUATION_WHITESPACEONLY + }; + +private: + + static const bool _punctuation_table[]; /**< Table used for punctuation tests. */ + + std::vector<std::string> _tokens; /**< Vector holding the tokens. */ + unsigned int _current; /**< Index of current token. */ + Punctuation _punctuation; /**< Punctuation strategy. */ + std::string _punctuation_token; /**< Special token for marking punctuation. */ + bool _lowercase; /**< Indicator whether tokens should be lowercased. */ + +public: + + WordCharTokenizer(Punctuation punct = PUNCTUATION_DISCARD, const std::string &punct_token = ".") : + _tokens(), + _current(0), + _punctuation(punct), + _punctuation_token(punct_token), + _lowercase(true) + {} + + virtual ~WordCharTokenizer() {} + + Punctuation getPunctuation() const { return _punctuation; } + void setPunctuation(Punctuation punct) { _punctuation=punct; } + std::string getPunctuationToken() const { return _punctuation_token; } + void setPunctuationToken(const std::string &punct_token) { _punctuation_token=punct_token; } + void rewind() { _current=0; } + void setLowerCase(bool lc) { _lowercase = lc; } + bool getLowerCase() const { return _lowercase; } + + /** + * @brief Initialize the tokenizer. + * + * @param text Input text. + * @return True on success. + */ + virtual bool init(const std::string &text); + + + /** + * @brief Check if there are more tokens available. + * + * @return True if there are more tokens. + */ + virtual bool hasMore(); + + /** + * @brief Get next token. + * + * @return Next token, or empty string if there are no more tokens left. 
+ */ + virtual std::string getNext(); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/CMakeLists.txt b/fsa/src/vespa/fsamanagers/CMakeLists.txt new file mode 100644 index 00000000000..3e02946c59b --- /dev/null +++ b/fsa/src/vespa/fsamanagers/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(fsamanagers + SOURCES + conceptnetmanager.cpp + fsamanager.cpp + metadatamanager.cpp + mutex.cpp + rwlock.cpp + singleton.cpp + INSTALL lib64 + DEPENDS +) + +install(FILES + conceptnethandle.h + conceptnetmanager.h + fsahandle.h + fsamanager.h + metadatahandle.h + metadatamanager.h + mutex.h + refcountable.h + rwlock.h + singleton.h + DESTINATION include/vespa/fsamanagers) diff --git a/fsa/src/vespa/fsamanagers/conceptnethandle.h b/fsa/src/vespa/fsamanagers/conceptnethandle.h new file mode 100644 index 00000000000..a574343714f --- /dev/null +++ b/fsa/src/vespa/fsamanagers/conceptnethandle.h @@ -0,0 +1,123 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file conceptnetmanager.h + * @brief Concept network handle class definition. + * + */ + +#pragma once + +#include <string> + +#include "refcountable.h" +#include <vespa/fsa/conceptnet.h> + +namespace fsa { + +// {{{ class ConceptNet::Handle + +/** + * @class Handle + * @brief Concept net handle. + * + * A Handle looks like a ConceptNet, but copies are cheap; the actual + * ConceptNet objects are refcounted and Handle copies merely copy the + * ConceptNet pointer and increment the refcount. + */ +class ConceptNet::Handle { + +private: + + /** + * @brief Unimplemented private default constructor. + */ + Handle(); + /** + * @brief Unimplemented private assignment operator. 
+ */ + Handle& operator=(const Handle&); + + class RefCountableConceptNet: public ConceptNet, public RefCountable<ConceptNet> { + public: + RefCountableConceptNet(const char *fsafile, const char *datafile=NULL, FileAccessMethod fam = FILE_ACCESS_UNDEF) : ConceptNet(fsafile,datafile,fam) {} + }; + + RefCountableConceptNet *_conceptNet; /**< The ConceptNet object itself. */ + +public: + + /** + * @brief Copy constructor. + * + * Duplicate a handle (and add new reference to the ConceptNet object. + * + * @param h Reference to existing ConceptNet::Handle. + */ + Handle(const Handle& h) : _conceptNet(h._conceptNet) + { + _conceptNet->addReference(); + } + + /** + * @brief Constructor. + * + * @param fsafile %FSA file containing the units, with a perfect has + * (used for indexing the data file). + * @param datafile Concept net data file. + * @param fam File access mode (read or mmap). If not set, the + * global preferred access mode will be used. + */ + Handle(const char *fsafile, const char *datafile=NULL, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _conceptNet(new RefCountableConceptNet(fsafile,datafile,fam)) + { + _conceptNet->addReference(); + } + + /** + * @brief Constructor. + * + * @param fsafile %FSA file containing the units, with a perfect has + * (used for indexing the data file). + * @param datafile Concept net data file. + * @param fam File access mode (read or mmap). If not set, the + * global preferred access mode will be used. + */ + Handle(const std::string &fsafile, const std::string &datafile=NULL, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _conceptNet(new RefCountableConceptNet(fsafile.c_str(),datafile.c_str(),fam)) + { + _conceptNet->addReference(); + } + + /** + * @brief Destructor. + */ + ~Handle(void) + { + _conceptNet->removeReference(); + } + + /** + * @brief Dereference operator, provides access to ConceptNet + * methods. + * + * @return Reference to the ConceptNet object. 
+ */ + const ConceptNet& operator*() const { return *_conceptNet; } + + /** + * @brief Dereference operator, provides access to ConceptNet + * methods. + * + * @return Pointer the ConceptNet object. + */ + const ConceptNet* operator->() const { return _conceptNet; } + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/conceptnetmanager.cpp b/fsa/src/vespa/fsamanagers/conceptnetmanager.cpp new file mode 100644 index 00000000000..459d7c81239 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/conceptnetmanager.cpp @@ -0,0 +1,105 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file conceptnetmanager.cpp + * @brief Concept network manager class implementation. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "conceptnetmanager.h" + +namespace fsa { + +// {{{ ConceptNetManager::~ConceptNetManager() + +ConceptNetManager::~ConceptNetManager() +{ + for(LibraryIterator it=_library.begin(); it!=_library.end();++it){ + delete it->second; + } +} + +// }}} + +// {{{ ConceptNetManager::load() + +bool ConceptNetManager::load(const std::string &id, const std::string &fsafile, const std::string &datafile) +{ + ConceptNet::Handle *newcn = new ConceptNet::Handle(fsafile.c_str(), datafile.length()>0?datafile.c_str():NULL); + + if(newcn==NULL || !(*newcn)->isOk()){ + delete newcn; + return false; + } + + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + it->second = newcn; + } + else + _library.insert(Library::value_type(id,newcn)); + } + _lock.unlock(); + + return true; +} + +// }}} +// {{{ ConceptNetManager::get() + +ConceptNet::Handle* ConceptNetManager::get(const std::string &id) const +{ + ConceptNet::Handle *newhandle=NULL; + _lock.rdLock(); + { + LibraryConstIterator it = _library.find(id); + if(it!=_library.end()){ + newhandle = new 
ConceptNet::Handle(*(it->second)); + } + } + _lock.unlock(); + return newhandle; +} + +// }}} +// {{{ ConceptNetManager::drop() + +void ConceptNetManager::drop(const std::string &id) +{ + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + _library.erase(it); + } + } + _lock.unlock(); +} + +// }}} +// {{{ ConceptNetManager::clear() + +void ConceptNetManager::clear() +{ + _lock.wrLock(); + { + for(LibraryIterator it = _library.begin(); it!=_library.end(); ++it) + delete it->second; + _library.clear(); + } + _lock.unlock(); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/conceptnetmanager.h b/fsa/src/vespa/fsamanagers/conceptnetmanager.h new file mode 100644 index 00000000000..d4e55bc68a6 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/conceptnetmanager.h @@ -0,0 +1,104 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file conceptnetmanager.h + * @brief Concept network manager class definition. + * + */ + +#pragma once + +#include <string> +#include <map> + +#include "singleton.h" +#include "rwlock.h" +#include "conceptnethandle.h" + +namespace fsa { + +// {{{ class ConceptNetManager + +/** + * @class ConceptNetManager + * @brief Class for managing concept networks. + * + * This class provides a single point of access to all concept networks + * used by the applications. + */ +class ConceptNetManager : public Singleton<ConceptNetManager> { + +protected: + friend class Singleton<ConceptNetManager>; + + /** Default constructor. 
Protected to avoid accidental creation */ + ConceptNetManager() : _library(), _lock() {} + +private: + + /** Private unimplemented copy constructor */ + ConceptNetManager(const ConceptNetManager&); + /** Private unimplemented assignment operator */ + ConceptNetManager& operator=(const ConceptNetManager&); + + /** %ConceptNet library type */ + typedef std::map<std::string,ConceptNet::Handle*> Library; + /** %ConceptNet library iterator type */ + typedef std::map<std::string,ConceptNet::Handle*>::iterator LibraryIterator; + /** %ConceptNet library const iterator type */ + typedef std::map<std::string,ConceptNet::Handle*>::const_iterator LibraryConstIterator; + + Library _library; /**< Library of concept networks. */ + mutable RWLock _lock; /**< Read-write lock for library synchronization. */ + +public: + + /** Destructor */ + ~ConceptNetManager(); + + /** + * @brief Load a concept network into memory. + * + * @param id Concept network id (to be used in later get() or drop() calls). + * @param fsafile Concept net %FSA file name + * @param datafile Concept net data file name (defaults to empty + * string which means use the fsa file name but + * replace .fsa extension with .dat). + */ + bool load(const std::string &id, + const std::string &fsafile, + const std::string &datafile=std::string("")); + + /** + * @brief Get a handle to a concept net. + * + * @param id Concept net id. + * @return Newly allocated handle, must be deleted by the + * caller. (NULL if no concept net with the given id was found.) + */ + ConceptNet::Handle* get(const std::string &id) const; + + /** + * @brief Drop a concept net from the library. + * + * Drop a concept net from the library. The concept net object will + * be deleted automagically when there are no more handles referring + * to it. + * + * @param id Concept net id. + */ + void drop(const std::string &id); + + /** + * @brief Drop all concept nets from the library. 
+ */ + void clear(); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/fsahandle.h b/fsa/src/vespa/fsamanagers/fsahandle.h new file mode 100644 index 00000000000..9504c416c79 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/fsahandle.h @@ -0,0 +1,191 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file fsamanager.h + * @brief FSA handle class definition. + * + */ + +#pragma once + +#include <string> + +#include "refcountable.h" +#include <vespa/fsa/fsa.h> + +namespace fsa { + +// {{{ FSA::Handle + +/** + * @class Handle + * @brief FSA accessor. + * + * A Handle looks like an FSA, but copies are cheap; the actual FSA + * objects are refcounted and Handle copies merely copy the FSA pointer + * and increment the refcount. + */ +class FSA::Handle { + +private: + + /** + * @brief Unimplemented private default constructor. + */ + Handle(); + /** + * @brief Unimplemented private assignment operator. + */ + Handle& operator=(const Handle&); + + class RefCountableFSA: public FSA, public RefCountable<FSA> { + public: + RefCountableFSA(const char *file, FileAccessMethod fam = FILE_ACCESS_UNDEF) : FSA(file,fam) {} + }; + + RefCountableFSA *_fsa; /**< The FSA object itself. */ + + /** + * @brief Get a pointer to the referred FSA object. + * + * @return pointer to the referred FSA object. + */ + const FSA* getFSA() const + { + return _fsa; + } + +public: + + /** + * @brief Copy constructor. + * + * Duplicate a handle (and add new reference to the FSA object. + * + * @param h Reference to handle to duplicate. + */ + Handle(const Handle& h) : _fsa(h._fsa) + { + _fsa->addReference(); + } + + /** + * @brief Constructor. + * + * Create a new FSA object (loaded from file) and add reference. + * + * @param file Name of the file containing the automaton. + * @param fam File access mode (read or mmap). 
If not set, the + * global preferred access mode will be used. + */ + Handle(const char *file, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _fsa(new RefCountableFSA(file,fam)) + { + _fsa->addReference(); + } + + /** + * @brief Constructor. + * + * Create a new FSA object (loaded from file) and add reference. + * + * @param file Name of the file containing the automaton. + * @param fam File access mode (read or mmap). If not set, the + * global preferred access mode will be used. + */ + Handle(const std::string &file, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _fsa(new RefCountableFSA(file.c_str(),fam)) + { + _fsa->addReference(); + } + + /** + * @brief Destructor. + * + * Remove reference to the FSA object. + */ + ~Handle(void) + { + _fsa->removeReference(); + } + + /** + * @brief Dereference operator, provides access to Metadata + * methods. + * + * @return Reference to the Metadata object. + */ + const FSA& operator*() const { return *_fsa; } + + /** + * @brief Dereference operator, provides access to Metadata + * methods. + * + * @return Pointer the Metadata object. + */ + const FSA* operator->() const { return _fsa; } + + /** + * @brief Check if %FSA was properly constructed. + * + * @return true iff underlying %FSA was properly constructed. + */ + bool isOk(void) const + { + return _fsa->isOk(); + } + + /** + * @brief Get the fsa library version used for building this %FSA. + * + * @return fsa library version. + */ + uint32_t version(void) const + { + return _fsa->version(); + } + + /** + * @brief Get the serial number of the %FSA. + * + * @return Serial number. + */ + uint32_t serial(void) const + { + return _fsa->serial(); + } + + /** + * @brief Check is the automaton has perfect hash built in. + * + * Returns true if the automaton was built with a perfect hash included. + * + * @return True if the automaton has perfect hash. 
+ */ + bool hasPerfectHash() const + { + return _fsa->hasPerfectHash(); + } + + /** + * @brief Get iterator pointing to the beginning of the fsa. + * + * @return iterator pointing to the first string in the fsa. + */ + FSA::iterator begin() const { return FSA::iterator(_fsa); } + + /** + * @brief Get iterator pointing past the end of the fsa. + * + * @return iterator pointing past the last string in the fsa. + */ + FSA::iterator end() const { return FSA::iterator(_fsa,true); } + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/fsamanager.cpp b/fsa/src/vespa/fsamanagers/fsamanager.cpp new file mode 100644 index 00000000000..8816ea2a4b8 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/fsamanager.cpp @@ -0,0 +1,187 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file fsamanager.cpp + * @brief + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "fsamanager.h" + +#ifdef HAVE_CURL +#include <stdio.h> +#include <unistd.h> +#include <curl/curl.h> +#include <curl/types.h> +#include <curl/easy.h> +#endif + + + +namespace fsa { + +// {{{ FSAManager::~FSAManager() + +FSAManager::~FSAManager() +{ + for(LibraryIterator it=_library.begin(); it!=_library.end();++it){ + delete it->second; + } +} + +// }}} +// {{{ FSAManager::load() + +bool FSAManager::load(const std::string &id, const std::string &url) +{ + std::string file=url; + +#if ((__GNUG__ == 3 && __GNUC_MINOR__ >= 1) || __GNUG__ > 3) + if(!url.compare(0,7,"http://")) +#else + if(!url.compare("http://",0,7)) +#endif + { + unsigned int pos=url.find_last_of('/'); + if(pos==url.size()-1) return false; + _cacheLock.lock(); + file=_cacheDir; + _cacheLock.unlock(); + if(file.size()>0 && file[file.size()-1]!='/') file+='/'; + file+=url.substr(pos+1); + if(!getUrl(url,file)) return false; + } + + FSA::Handle *newdict = new FSA::Handle(file); + 
if(!newdict->isOk()){ + delete newdict; + return false; + } + + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + it->second = newdict; + } + else + _library.insert(Library::value_type(id,newdict)); + } + _lock.unlock(); + + return true; +} + +// }}} +// {{{ FSAManager::getUrl() + +bool FSAManager::getUrl(const std::string &url, const std::string &file) +{ +#ifdef HAVE_CURL + CURL *curl_handle; + FILE *filehandle; + long response_code; + + filehandle = fopen(file.c_str(),"r"); + if(filehandle!=NULL){ + fclose(filehandle); + return true; + } + + filehandle = fopen(file.c_str(),"w"); + if(filehandle==NULL) + return false; + + curl_handle = curl_easy_init(); + + curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)filehandle); + curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libfsa-url-agent/0.1"); + + curl_easy_perform(curl_handle); + + curl_easy_getinfo(curl_handle, CURLINFO_RESPONSE_CODE, &response_code); + + curl_easy_cleanup(curl_handle); + + fclose(filehandle); + + if(response_code!=200){ + unlink(file.c_str()); + return false; + } + + return true; +#else // HAVE_CURL + (void)url;(void)file; + return false; +#endif // HAVE_CURL +} + +// }}} +// {{{ FSAManager::get() + +FSA::Handle* FSAManager::get(const std::string &id) const +{ + FSA::Handle *newhandle=NULL; + _lock.rdLock(); + { + LibraryConstIterator it = _library.find(id); + if(it!=_library.end()){ + newhandle = new FSA::Handle(*(it->second)); + } + } + _lock.unlock(); + return newhandle; +} + +// }}} +// {{{ FSAManager::drop() + +void FSAManager::drop(const std::string &id) +{ + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + _library.erase(it); + } + } + _lock.unlock(); +} + +// }}} +// {{{ FSAManager::clear() + +void FSAManager::clear() +{ + _lock.wrLock(); + { + for(LibraryIterator it = _library.begin(); 
it!=_library.end(); ++it) + delete it->second; + _library.clear(); + } + _lock.unlock(); +} + +// }}} +// {{{ FSAManager::setCacheDir() + +void FSAManager::setCacheDir(const std::string &dir) +{ + _cacheLock.lock(); + _cacheDir = dir; + _cacheLock.unlock(); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/fsamanager.h b/fsa/src/vespa/fsamanagers/fsamanager.h new file mode 100644 index 00000000000..6de1b95a085 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/fsamanager.h @@ -0,0 +1,140 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file fsamanager.h + * @brief Class definition of the %FSA manager + * + */ + +#pragma once + +#include <string> +#include <map> + +#include "singleton.h" +#include "rwlock.h" +#include "fsahandle.h" + +namespace fsa { + +// {{{ class FSAManager + +/** + * @class FSAManager + * @brief Class for managing finite state automata. + * + * This class provides a single point of access to all finite state + * automata used by the applications. Supports loading fsa files and + * downloading from the net if libcurl support is built in, in which + * case the files are cached in a local cache directory. FSAManager is + * implemented as a singleton. + */ +class FSAManager : public Singleton<FSAManager> { + +protected: + friend class Singleton<FSAManager>; + + /** Default constructor. 
Protected to avoid accidental creation */ + FSAManager() : _library(), _lock(), _cacheDir(), _cacheLock() {} + +private: + + /** Private unimplemented copy constructor */ + FSAManager(const FSAManager&); + /** Private unimplemented assignment operator */ + FSAManager& operator=(const FSAManager&); + + /** %FSA library type */ + typedef std::map<std::string,FSA::Handle*> Library; + /** %FSA library iterator type */ + typedef std::map<std::string,FSA::Handle*>::iterator LibraryIterator; + /** %FSA library const iterator type */ + typedef std::map<std::string,FSA::Handle*>::const_iterator LibraryConstIterator; + + Library _library; /**< Library of automata. */ + mutable RWLock _lock; /**< Read-write lock for library synchronization. */ + std::string _cacheDir; /**< Cache directory. */ + mutable Mutex _cacheLock; /**< Mutex for cache synchronization. */ + + /** + * @brief Fetch an automaton from the net. + * + * @param url URL to automaton. + * @param file Name of local file to store automaton. + * @return True on success. + */ + bool getUrl(const std::string &url, const std::string &file); + +public: + + /** Destructor */ + ~FSAManager(); + + /** + * @brief Load automaton from file or fetch from the net. + * + * Load automaton from file or fetch from the net. If the url begins + * with "http://", and libcurl support is compiled in, the automaton + * is downloaded from the net an stored in the local cache, unless + * an automaton with that filename already exist in the cache, in which + * case the local copy is used. This behaviour is expected to change + * in the future, and it will use the serial number from the fsa + * header to decide whether an update is needed. + * + * If an automaton is already registered with the given ID, the old + * one is dropped as soon as the new is loaded. This does not + * effects handles to the old automaton which were acquired + * previously, as the old automaton will stay in memory until all + * handles are deleted. 
+ * + * @param id Automaton ID (name) used by the application. + * @param url File name or URL (the latter if it begins with "http://"). + * @return True on success. + */ + bool load(const std::string &id, const std::string &url); + + /** + * @brief Get a handle to an automaton. + * + * @param id Automaton ID (name). + * @return Pointer to a new handle to the automaton, or NULL if not found. + * The handle must be deleted when it is not needed + * anymore. (In fact it should be deleted and re-requested + * on a regular basis if automaton updates may be performed.) + */ + FSA::Handle* get(const std::string &id) const; + + /** + * @brief Drop an automaton from the library. + * + * Drop the automaton from the library. All new requests for the + * given ID will receive a NULL handle after this operation (unless + * an automaton with the same ID is later loaded again). + * + * @param id Automaton ID + */ + void drop(const std::string &id); + + /** + * @brief Drop all automatons from the library. + */ + void clear(); + + /** + * @brief Set the local cache directory. + * + * Set the local cache directory (default is empty, which + * corresponds to the CWD (current working directory). + * + * @param dir Cache directory. + */ + void setCacheDir(const std::string &dir); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/metadatahandle.h b/fsa/src/vespa/fsamanagers/metadatahandle.h new file mode 100644 index 00000000000..8603caedfb7 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/metadatahandle.h @@ -0,0 +1,130 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file metadatamanager.h + * @brief Metadata handle class definition. 
+ * + */ + +#pragma once + +#include <string> + +#include "refcountable.h" +#include <vespa/fsa/metadata.h> + +namespace fsa { + +// {{{ class MetaData::Handle + +/** + * @class Handle + * @brief MetaData handle. + * + * A Handle looks like a MetaData, but copies are cheap; the actual + * MetaData objects are refcounted and Handle copies merely copy the + * MetaData pointer and increment the refcount. + */ +class MetaData::Handle { + +private: + + /** + * @brief Unimplemented private default constructor. + */ + Handle(); + /** + * @brief Unimplemented private assignment operator. + */ + Handle& operator=(const Handle&); + + class RefCountableMetaData: public MetaData, public RefCountable<MetaData> { + public: + RefCountableMetaData(const char *datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF) : MetaData(datafile,fam) {} + }; + + RefCountableMetaData *_metaData; /**< The MetaData object itself. */ + +public: + + /** + * @brief Copy constructor. + * + * Duplicate a handle (and add new reference to the MetaData object. + * + * @param h Reference to existing Metadata::Handle. + */ + Handle(const Handle& h) : _metaData(h._metaData) + { + _metaData->addReference(); + } + + /** + * @brief Constructor. + * + * Create a new MetaData object (loaded from file) and add reference. + * + * @param datafile Name of the file containing the metadata. + * @param fam File access mode (read or mmap). If not set, the + * global preferred access mode will be used. + */ + Handle(const char *datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _metaData(new RefCountableMetaData(datafile,fam)) + { + _metaData->addReference(); + } + + /** + * @brief Constructor. + * + * Create a new MetaData object (loaded from file) and add reference. + * + * @param datafile Name of the file containing the metadata. + * @param fam File access mode (read or mmap). If not set, the + * global preferred access mode will be used. 
+ */ + Handle(const std::string &datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF) : + _metaData(new RefCountableMetaData(datafile.c_str(),fam)) + { + _metaData->addReference(); + } + + /** + * @brief Destructor. + */ + ~Handle(void) + { + _metaData->removeReference(); + } + + /** + * @brief Dereference operator, provides access to Metadata + * methods. + * + * @return Reference to the Metadata object. + */ + const MetaData& operator*() const { return *_metaData; } + + /** + * @brief Dereference operator, provides access to Metadata + * methods. + * + * @return Pointer the Metadata object. + */ + const MetaData* operator->() const { return _metaData; } + + /** + * @brief Proxy methods + */ + uint32_t user(unsigned int idx) const + { + return _metaData->user(idx); + } +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/metadatamanager.cpp b/fsa/src/vespa/fsamanagers/metadatamanager.cpp new file mode 100644 index 00000000000..9721d632c52 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/metadatamanager.cpp @@ -0,0 +1,105 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file metadatamanager.cpp + * @brief Metadata manager class implementation. 
+ * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "metadatamanager.h" + +namespace fsa { + +// {{{ MetaDataManager::~MetaDataManager() + +MetaDataManager::~MetaDataManager() +{ + for(LibraryIterator it=_library.begin(); it!=_library.end();++it){ + delete it->second; + } +} + +// }}} + +// {{{ MetaDataManager::load() + +bool MetaDataManager::load(const std::string &id, const std::string &datafile) +{ + MetaData::Handle *newmd = new MetaData::Handle(datafile.c_str()); + + if(newmd==NULL || !(*newmd)->isOk()){ + delete newmd; + return false; + } + + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + it->second = newmd; + } + else + _library.insert(Library::value_type(id,newmd)); + } + _lock.unlock(); + + return true; +} + +// }}} +// {{{ MetaDataManager::get() + +MetaData::Handle* MetaDataManager::get(const std::string &id) const +{ + MetaData::Handle *newhandle=NULL; + _lock.rdLock(); + { + LibraryConstIterator it = _library.find(id); + if(it!=_library.end()){ + newhandle = new MetaData::Handle(*(it->second)); + } + } + _lock.unlock(); + return newhandle; +} + +// }}} +// {{{ MetaDataManager::drop() + +void MetaDataManager::drop(const std::string &id) +{ + _lock.wrLock(); + { + LibraryIterator it = _library.find(id); + if(it!=_library.end()){ + delete it->second; + _library.erase(it); + } + } + _lock.unlock(); +} + +// }}} +// {{{ MetaDataManager::clear() + +void MetaDataManager::clear() +{ + _lock.wrLock(); + { + for(LibraryIterator it = _library.begin(); it!=_library.end(); ++it) + delete it->second; + _library.clear(); + } + _lock.unlock(); +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/metadatamanager.h b/fsa/src/vespa/fsamanagers/metadatamanager.h new file mode 100644 index 00000000000..d87ca59626c --- /dev/null +++ b/fsa/src/vespa/fsamanagers/metadatamanager.h @@ -0,0 +1,99 @@ +// Copyright 2016 Yahoo Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/10/01 + * @version $Id$ + * @file metadatamanager.h + * @brief Metadata manager class definition. + * + */ + +#pragma once + +#include <string> +#include <map> + +#include "singleton.h" +#include "rwlock.h" +#include "metadatahandle.h" + +namespace fsa { + +// {{{ class MetaDataManager + +/** + * @class MetaDataManager + * @brief Class for managing generic metadata. + * + * This class provides a single point of access to all metadata + * used by the applications. + */ +class MetaDataManager : public Singleton<MetaDataManager> { + +protected: + friend class Singleton<MetaDataManager>; + + /** Default constructor. Protected to avoid accidental creation */ + MetaDataManager() : _library(), _lock() {} + +private: + + /** Private unimplemented copy constructor */ + MetaDataManager(const MetaDataManager&); + /** Private unimplemented assignment operator */ + MetaDataManager& operator=(const MetaDataManager&); + + /** %MetaData library type */ + typedef std::map<std::string,MetaData::Handle*> Library; + /** %MetaData library iterator type */ + typedef std::map<std::string,MetaData::Handle*>::iterator LibraryIterator; + /** %MetaData library const iterator type */ + typedef std::map<std::string,MetaData::Handle*>::const_iterator LibraryConstIterator; + + Library _library; /**< Library of MetaData objects. */ + mutable RWLock _lock; /**< Read-write lock for library synchronization. */ + +public: + + /** Destructor */ + ~MetaDataManager(); + + /** + * @brief Load a metadata file into memory. + * + * @param id MetaData id (to be used in later get() or drop() calls). + * @param datafile Metadata file name + */ + bool load(const std::string &id, const std::string &datafile); + + /** + * @brief Get a handle to metadata. + * + * @param id Metadata id. + * @return Newly allocated handle, must be deleted by the + * caller. 
(NULL if no metadata with the given id was found.) + */ + MetaData::Handle* get(const std::string &id) const; + + /** + * @brief Drop a metadata from the library. + * + * Drop a metadata from the library. The metadata object will + * be deleted automagically when there are no more handles referring + * to it. + * + * @param id MetaData id. + */ + void drop(const std::string &id); + + /** + * @brief Drop all metadatas from the library. + */ + void clear(); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/mutex.cpp b/fsa/src/vespa/fsamanagers/mutex.cpp new file mode 100644 index 00000000000..1c62744291d --- /dev/null +++ b/fsa/src/vespa/fsamanagers/mutex.cpp @@ -0,0 +1,82 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file mutex.cpp + * @brief Mutex. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef DISABLE_THREADS +#include <pthread.h> +#include <sched.h> +#include <assert.h> +#endif + +#include "mutex.h" + +namespace fsa { + +// {{{ class Mutex::Impl + +struct Mutex::Impl +{ +#ifndef DISABLE_THREADS + pthread_mutex_t _mutex; /**< lock */ +#else + int _mutex; +#endif +}; + +// }}} + +Mutex::Mutex(void) : _impl(new Impl) +{ +#ifndef DISABLE_THREADS + int rc; + rc = pthread_mutex_init(&(_impl->_mutex),NULL); + assert(rc == 0); +#endif +} + +Mutex::~Mutex(void) +{ +#ifndef DISABLE_THREADS + pthread_mutex_destroy(&(_impl->_mutex)); +#endif + delete _impl; +} + +bool Mutex::tryLock (void) +{ +#ifndef DISABLE_THREADS + return pthread_mutex_trylock(&(_impl->_mutex)) == 0; +#else + return true; +#endif +} + +bool Mutex::lock (void) +{ +#ifndef DISABLE_THREADS + return pthread_mutex_lock(&(_impl->_mutex)) == 0; +#else + return true; +#endif +} + +bool Mutex::unlock (void) +{ +#ifndef DISABLE_THREADS + return pthread_mutex_unlock(&(_impl->_mutex)) == 0; +#else + return true; +#endif 
+} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/mutex.h b/fsa/src/vespa/fsamanagers/mutex.h new file mode 100644 index 00000000000..87deb081b08 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/mutex.h @@ -0,0 +1,73 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file mutex.h + * @brief Mutex. + * + */ + +#pragma once + +// {{{ class Mutex + +namespace fsa { + +/** + * @class Mutex + * @brief Mutex. + * + * Simple mutex class based on POSIX pthread_mutex_t. + */ +class Mutex +{ + protected: + class Impl; + Impl *_impl; + + public: + /** + * @brief Constructor + */ + Mutex(void); + + /** + * @brief Destructor + */ + ~Mutex(void); + + /** + * @brief Try to get a lock. + * + * Try to get a lock. This method is non-blocking, and + * returns true if locking was succesful. + * + * @return True if locking was successful. + */ + bool tryLock (void); + + /** + * @brief Get a lock. + * + * Get a read (shared) lock. This method blocks until a + * lock is available (that is no other thread holds a + * lock on the object.) + * + * @return True if locking was successful. + */ + bool lock (void); + + /** + * @brief Release a lock. + * + * @return True if unlocking was successful. + */ + bool unlock (void); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/refcountable.h b/fsa/src/vespa/fsamanagers/refcountable.h new file mode 100644 index 00000000000..77f00bc3450 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/refcountable.h @@ -0,0 +1,111 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file refcountable.h + * @brief Reference countable template + */ + +#pragma once + +#include "mutex.h" + +namespace fsa { + +// {{{ class RefCountable + +/** + * @class RefCountable + * @brief Reference countable template + * + * Subclass this template, and use the addReference and removeReference + * methods to keep track of how many references the object has. When + * the last reference is removed, the object blows up (well, destroys + * itself). + */ +template <typename T> +class RefCountable +{ +protected: + + /** Reference count */ + int _refCount; + + /** Lock */ + Mutex _sequencerLock; + + + /** + * @brief Destroy the object + * + * @return True. + */ + virtual bool destroy(void) + { + delete this; + return true; + }; + +private: + + /** Unimplemented private copy constructor. */ + RefCountable(const RefCountable &original); + /** Unimplemented private assignment operator. */ + const RefCountable& operator=(const RefCountable &original); + +public: + + /** + * @brief Constructor + */ + RefCountable(void) + : _refCount(0), + _sequencerLock() + { + } + + /** + * @brief Destructor + */ + virtual ~RefCountable(void) {} + + /** + * @brief Increase reference count. + */ + virtual void addReference(void) + { + _sequencerLock.lock(); + _refCount++; + _sequencerLock.unlock(); + } + + /** + * @brief Decrease reference count, and destroy object if no + * references are left. + * + * @return True if the object was destroyed. 
+ */ + virtual bool removeReference(void) + { + bool destroyed = false; + + _sequencerLock.lock(); + _refCount--; + + if(_refCount<1){ + _sequencerLock.unlock(); + destroyed = destroy(); + } + else{ + _sequencerLock.unlock(); + } + return destroyed; + } + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/rwlock.cpp b/fsa/src/vespa/fsamanagers/rwlock.cpp new file mode 100644 index 00000000000..9c296dfe980 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/rwlock.cpp @@ -0,0 +1,99 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file rwlock.cpp + * @brief Read-write lock. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef DISABLE_THREADS +#include <pthread.h> +#include <sched.h> +#include <assert.h> +#endif + +#include "rwlock.h" + +namespace fsa { + +// {{{ class RWLock::Impl + +struct RWLock::Impl +{ +#ifndef DISABLE_THREADS + pthread_rwlock_t _rwlock; /**< Lock. 
*/ +#else + int _rwlock; +#endif +}; + +// }}} + +RWLock::RWLock(void) : _impl(new Impl) +{ +#ifndef DISABLE_THREADS + int rc; + rc = pthread_rwlock_init(&(_impl->_rwlock),NULL); + assert(rc == 0); +#endif +} + +RWLock::~RWLock(void) +{ +#ifndef DISABLE_THREADS + pthread_rwlock_destroy(&(_impl->_rwlock)); +#endif +} + +bool RWLock::tryRdLock (void) +{ +#ifndef DISABLE_THREADS + return pthread_rwlock_tryrdlock(&(_impl->_rwlock)) == 0; +#else + return true; +#endif +} + +bool RWLock::tryWrLock (void) +{ +#ifndef DISABLE_THREADS + return pthread_rwlock_trywrlock(&(_impl->_rwlock)) == 0; +#else + return true; +#endif +} + +bool RWLock::rdLock (void) +{ +#ifndef DISABLE_THREADS + return pthread_rwlock_rdlock(&(_impl->_rwlock)) == 0; +#else + return true; +#endif +} + +bool RWLock::wrLock (void) +{ +#ifndef DISABLE_THREADS + return pthread_rwlock_wrlock(&(_impl->_rwlock)) == 0; +#else + return true; +#endif +} + +bool RWLock::unlock (void) +{ +#ifndef DISABLE_THREADS + return pthread_rwlock_unlock(&(_impl->_rwlock)) == 0; +#else + return true; +#endif +} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/rwlock.h b/fsa/src/vespa/fsamanagers/rwlock.h new file mode 100644 index 00000000000..4c85d1cac8a --- /dev/null +++ b/fsa/src/vespa/fsamanagers/rwlock.h @@ -0,0 +1,95 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/07 + * @version $Id$ + * @file rwlock.h + * @brief Read-write lock. + * + */ + +#pragma once + +namespace fsa { + +// {{{ class RWLock + +/** + * @class RWLock + * @brief Read-write lock. + * + * Simple read-write lock class based on POSIX pthread_rwlock_t. + */ +class RWLock +{ + protected: + class Impl; + Impl *_impl; + + public: + + /** + * @brief Constructor. + */ + RWLock(void); + + /** + * @brief Destructor. + */ + ~RWLock(void); + + /** + * @brief Try to get a read (shared) lock. + * + * Try to get a read (shared) lock. 
This method is non-blocking, and + * returns true if locking was succesful. + * + * @return True if locking was successful. + */ + bool tryRdLock (void); + + /** + * @brief Try to get a write (exclusive) lock. + * + * Try to get a write (exclusive) lock. This method is non-blocking, and + * returns true if locking was succesful. + * + * @return True if locking was successful. + */ + bool tryWrLock (void); + + /** + * @brief Get a read (shared) lock. + * + * Get a read (shared) lock. This method blocks until a shared + * lock is available (that is no other thread holds an exclusive + * lock on the object.) + * + * @return True if locking was successful. + */ + bool rdLock (void); + + /** + * @brief Get a write (exclusive) lock. + * + * Get a write (exclusive) lock. This method blocks until an + * exclusive lock is available (that is no other thread holds a + * shared or an exclusive lock on the object.) + * + * @return True if locking was successful. + */ + bool wrLock (void); + + /** + * @brief Release a (shared or exclusive) lock. + * + * @return True if unlocking was successful. + */ + bool unlock (void); + +}; + +// }}} + +} // namespace fsa + diff --git a/fsa/src/vespa/fsamanagers/singleton.cpp b/fsa/src/vespa/fsamanagers/singleton.cpp new file mode 100644 index 00000000000..76e9535b450 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/singleton.cpp @@ -0,0 +1,89 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/05 + * @version $Id$ + * @file singleton.cpp + * @brief Singleton pattern. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> + +#include "singleton.h" + + +namespace fsa { + +// {{{ SingletonExitHandler::_instance + +SingletonExitHandler* SingletonExitHandler::_instance = NULL; + +// }}} + +// {{{ SingletonExitHandler::SingletonExitHandler() + +SingletonExitHandler::SingletonExitHandler() + : _functionList() +{ + /* + * This won't work as part of plugins. When library is unloaded, the + * registration remains, and the program will crash when trying to + * exit. + */ + atexit(&atExit); +} + +// }}} +// {{{ SingletonExitHandler::~SingletonExitHandler() + +SingletonExitHandler::~SingletonExitHandler() +{ +} + +// }}} +// {{{ SingletonExitHandler::instance() + +SingletonExitHandler* SingletonExitHandler::instance() +{ + if (_instance == NULL) { + _instance = new SingletonExitHandler(); + } + return _instance; +} + +// }}} +// {{{ SingletonExitHandler::registerSingletonDestroyer() + +void SingletonExitHandler::registerSingletonDestroyer(void (*p)()) +{ + _functionList.push_front(p); +} + +// }}} +// {{{ SingletonExitHandler::atExit() + +void SingletonExitHandler::atExit() +{ + SingletonExitHandler::instance()->destroy(); + delete SingletonExitHandler::instance(); +} + +// }}} +// {{{ SingletonExitHandler::destroy() + +void SingletonExitHandler::destroy() +{ + for(FunctionListIterator iterator=_functionList.begin(); + iterator!=_functionList.end(); ++iterator) { + (*iterator)(); + } + +} + +// }}} + +} // namespace fsa diff --git a/fsa/src/vespa/fsamanagers/singleton.h b/fsa/src/vespa/fsamanagers/singleton.h new file mode 100644 index 00000000000..db11a9bf444 --- /dev/null +++ b/fsa/src/vespa/fsamanagers/singleton.h @@ -0,0 +1,172 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/09/05 + * @version $Id$ + * @file singleton.h + * @brief Singleton pattern. 
+ */ + + +#pragma once + +#include <list> + +#include "mutex.h" + + +namespace fsa { + +// {{{ class SingletonExitHandler + +/** + * @class SingletonExitHandler + * @brief %Singleton exit handler. + * + * %Singleton exit handler. Uses the atexit() librarary call to + * destory all Singleton objects in reverse order as they were + * created. It is also a singleton self. + */ +class SingletonExitHandler +{ +private: + + /** Default constructor */ + SingletonExitHandler(); + + /** Method to call at exit, destroys all Singletons. */ + static void atExit(); + + /** Instance pointer */ + static SingletonExitHandler* _instance; + + /** Destroy method - does the dirty work */ + void destroy(); + + + typedef std::list<void(*)()> FunctionList; + typedef std::list<void(*)()>::iterator FunctionListIterator; + + /** List of Singleton destroy functions */ + FunctionList _functionList; + +public: + + /** Destructor */ + virtual ~SingletonExitHandler(); + + /** + * @brief Get instance pointer. + * + * @return pointer to instance. + */ + static SingletonExitHandler* instance(); + + /** + * @brief Register a singleton. + * + * @param p Pointer to destroy function of the singleton. + */ + void registerSingletonDestroyer(void (*p)()); + +}; + +// }}} + +// {{{ class Singleton + +/** + * @class Singleton + * @brief %Singleton template. + * + * %Singleton template (from Design Patterns by Gamma et al.). To use + * it, subclass as follows, and make constructors private: + * + * class MyClass : public Singleton<MyClass> { + * friend class Singleton<MyClass>; + * private: + * MyClass(); + * public: + * void MyMethod(); + * ... + * } + * + * and then call MyMethod as: + * + * MyClass::instance().MyMethod(); + * + */ +template<typename T> +class Singleton +{ + /** SingletonExitHandler handles destruction. */ + friend class SingletonExitHandler; + +public: + /** Destructor */ + virtual ~Singleton(); + + /** + * @brief Get reference to the instance. + * + * Get reference to the instance. 
The first call of this method will + * create the instance, and register the destroy function with the + * exit handler. + * + * @return Reference to the instance. + */ + static T& instance(); + +protected: + + /** Explicit constructor (to avoid implicit conversion). */ + explicit Singleton(); + +private: + + /** Copy constructor (unimplemented) */ + Singleton(const Singleton&); + /** Assignment operator (unimplemented) */ + Singleton& operator=(const Singleton&); + + /** Destroy function - this will be registered with the exit handler. */ + static void destroy(); + + static Mutex _lock; /**< Mutex for synchronization. */ + + static T* _instance; /**< Instance pointer. */ +}; + + +template<typename T> Singleton<T>::Singleton() {} + +template<typename T> Singleton<T>::~Singleton() {} + +template<typename T> void Singleton<T>::destroy() +{ + delete _instance; + _instance = NULL; +} + +template<typename T> T& Singleton<T>::instance() +{ + if (_instance == NULL) { + _lock.lock(); + if (_instance == NULL) { + SingletonExitHandler::instance()->registerSingletonDestroyer(&destroy); + _instance = new T(); + } + _lock.unlock(); + } + + return *_instance; +} + +template<typename T> T* Singleton<T>::_instance = NULL; + +template<typename T> Mutex Singleton<T>::_lock; + +// }}} + +} // namespace fsa + |