diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/src/alltest |
Publish
Diffstat (limited to 'fsa/src/alltest')
28 files changed, 1471 insertions, 0 deletions
diff --git a/fsa/src/alltest/.gitignore b/fsa/src/alltest/.gitignore new file mode 100644 index 00000000000..c950caba857 --- /dev/null +++ b/fsa/src/alltest/.gitignore @@ -0,0 +1,15 @@ +Makefile +.depend +__testfsa__.__fsa__ +fsa_conceptnet_test_app +fsa_detector_test_app +fsa_fsa_create_test_app +fsa_fsa_perf_test_app +fsa_fsa_test_app +fsa_fsamanager_test_app +fsa_lookup_test_app +fsa_ngram_test_app +fsa_segmenter_test_app +fsa_vectorizer_perf_test_app +fsa_vectorizer_test_app +*.output diff --git a/fsa/src/alltest/CMakeLists.txt b/fsa/src/alltest/CMakeLists.txt new file mode 100644 index 00000000000..d82ca400405 --- /dev/null +++ b/fsa/src/alltest/CMakeLists.txt @@ -0,0 +1,70 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fsa_conceptnet_test_app + SOURCES + conceptnet_test.cpp + DEPENDS + fsamanagers + fsa +) +vespa_add_executable(fsa_detector_test_app + SOURCES + detector_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_test_app + SOURCES + fsa_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_create_test_app + SOURCES + fsa_create_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsa_perf_test_app + SOURCES + fsa_perftest.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_fsamanager_test_app + SOURCES + fsamanager_test.cpp + DEPENDS + fsamanagers + fsa +) +vespa_add_executable(fsa_lookup_test_app + SOURCES + lookup_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_ngram_test_app + SOURCES + ngram_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_segmenter_test_app + SOURCES + segmenter_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_vectorizer_test_app + SOURCES + vectorizer_test.cpp + DEPENDS + fsa +) +vespa_add_executable(fsa_vectorizer_perf_test_app + SOURCES + vectorizer_perftest.cpp + DEPENDS + fsa +) +vespa_add_test(NAME fsa_vectorizer_perf_test_app NO_VALGRIND COMMAND sh alltest.sh) diff --git a/fsa/src/alltest/alltest.sh b/fsa/src/alltest/alltest.sh new file mode 100755 index 00000000000..37274721e25 --- /dev/null +++ b/fsa/src/alltest/alltest.sh @@ -0,0 +1,11 @@ +#!/bin/bash +./detector_test.sh +./fsa_test.sh +./fsa_fsa_create_test_app +./fsa_fsa_perf_test_app +./fsa_fsamanager_test_app . __testfsa__.__fsa__ +./lookup_test.sh +./ngram_test.sh +./segmenter_test.sh +./vectorizer_test.sh +./fsa_vectorizer_perf_test_app diff --git a/fsa/src/alltest/conceptnet_test.cpp b/fsa/src/alltest/conceptnet_test.cpp new file mode 100644 index 00000000000..38c020aa511 --- /dev/null +++ b/fsa/src/alltest/conceptnet_test.cpp @@ -0,0 +1,80 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> + +#include <vespa/fsa/conceptnet.h> +#include <vespa/fsamanagers/conceptnetmanager.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + char opt; + //extern char *optarg; + extern int optind; + + bool do_ext = false, do_assoc = false, do_cat = false; + + while((opt=getopt(argc,argv,"aec")) != -1){ + switch(opt){ + case 'a': + do_assoc = true; + break; + case 'e': + do_ext = true; + break; + case 'c': + do_cat = true; + break; + case '?': + fprintf(stderr,"conceptnet_test: unrecognized option"); + exit(1); + } + } + + if(optind>=argc){ + fprintf(stderr,"usage: conceptnet_test [-aec] DOMAIN [UNIT ...]\n"); + exit(1); + } + + std::string domain = argv[optind]; + + if(!ConceptNetManager::instance().load(domain, + domain + ".fsa", + domain + ".dat")){ + fprintf(stderr,"failed to load concept net %s\n",domain.c_str()); + exit(1); + } + + ConceptNet::Handle* cn = ConceptNetManager::instance().get(domain); + + if(cn!=NULL){ + for(int i=optind+1;i<argc;i++){ + int idx = (*cn)->lookup(argv[i]); + printf("%s(%d) : (%d,%d,%d,%d) (%f,%f)\n",argv[i],idx, + (*cn)->frq(idx),(*cn)->cFrq(idx),(*cn)->qFrq(idx),(*cn)->sFrq(idx), + (*cn)->score(idx),(*cn)->strength(idx)); + if(do_ext){ + for(int e = 0; e<(*cn)->numExt(idx); e++){ + printf(" %s, %d\n",(*cn)->lookup((*cn)->ext(idx,e)),(*cn)->extFrq(idx,e)); + } + } + if(do_assoc){ + for(int a = 0; a<(*cn)->numAssoc(idx); a++){ + printf(" %s, %d\n",(*cn)->lookup((*cn)->assoc(idx,a)),(*cn)->assocFrq(idx,a)); + } + } + if(do_cat){ + for(int c = 0; c<(*cn)->numCat(idx); c++){ + printf(" %s\n",(*cn)->catName((*cn)->cat(idx,c))); + } + } + } + } + else { + fprintf(stderr,"failed to load concept net %s\n",domain.c_str()); + exit(1); + } + +} diff --git a/fsa/src/alltest/conceptnet_test.out b/fsa/src/alltest/conceptnet_test.out new file mode 100644 index 00000000000..9f3570cebf1 --- /dev/null +++ b/fsa/src/alltest/conceptnet_test.out @@ -0,0 +1,4 @@ +new york(841954) : (-1,-1,-1,-1) (-1.000000,-1.000000) +sunnyvale(1139231) : (-1,-1,-1,-1) (-1.000000,-1.000000) +gibson(479780) : (-1,-1,-1,-1) (-1.000000,-1.000000) +metallica(770993) : (-1,-1,-1,-1) (-1.000000,-1.000000) diff --git a/fsa/src/alltest/detector_test.cpp b/fsa/src/alltest/detector_test.cpp new file mode 100644 index 00000000000..1942c4ba7a6 --- /dev/null +++ b/fsa/src/alltest/detector_test.cpp @@ -0,0 +1,50 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizertest.cpp + * @brief Test for the vectorizer class + * + */ + +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/detector.h> +#include <vespa/fsa/ngram.h> + +using namespace fsa; + +class MyHits : public Detector::Hits{ +public: + MyHits() {}; + ~MyHits() {}; + + void add(const NGram &text, + unsigned int from, int length, + const FSA::State &) + { + std::cout << "detected: [" << from << "," << from+length-1 << "], '" + << text.join(" ",from,length) << "'\n"; + } +}; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Detector d(dict); + MyHits h; + + std::string text; + while(!std::cin.eof()){ + getline(std::cin,text); + + d.detect(text,h); + } + + return 0; +} diff --git a/fsa/src/alltest/detector_test.out b/fsa/src/alltest/detector_test.out new file mode 100644 index 00000000000..c5dbbdd08f1 --- /dev/null +++ b/fsa/src/alltest/detector_test.out @@ -0,0 +1,26 @@ +detected: [0,0], 'apple' +detected: [0,0], 'apricot' +detected: [0,0], 'artichoke' +detected: [0,0], 'banana' +detected: [0,0], 'cabbage' +detected: [0,0], 'carrot' +detected: [0,0], 'cherry' +detected: [0,0], 'chili' +detected: [0,0], 'cucumber' +detected: [0,0], 'eggplant' +detected: [0,0], 'grapes' +detected: [0,0], 'lettuce' +detected: [0,0], 'onion' +detected: [0,0], 'paprika' +detected: [0,1], 'passion fruit' +detected: [0,0], 'pea' +detected: [0,0], 'peach' +detected: [0,0], 'pear' +detected: [0,0], 'pineapple' +detected: [0,0], 'plum' +detected: [0,0], 'potato' +detected: [0,0], 'pumpkin' +detected: [0,1], 'sour cherry' +detected: [1,1], 'cherry' +detected: [0,0], 'squash' +detected: [0,0], 'tomato' diff --git a/fsa/src/alltest/detector_test.sh b/fsa/src/alltest/detector_test.sh new file mode 100755 index 00000000000..dd6f650a35c --- /dev/null +++ b/fsa/src/alltest/detector_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_detector_test_app < testinput.txt > detector_test.output +diff detector_test.output detector_test.out diff --git a/fsa/src/alltest/fsa_create_test.cpp b/fsa/src/alltest/fsa_create_test.cpp new file mode 100644 index 00000000000..c72ea900aad --- /dev/null +++ b/fsa/src/alltest/fsa_create_test.cpp @@ -0,0 +1,94 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <iostream> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/automaton.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int, char**) +{ + + Automaton *aut = new Automaton; + + Blob fruit("Fruit"), veggie("Vegetable"), city("City"); + + TimeStamp t; + + aut->init(); + + aut->insertSortedString("Cupertino",city); + aut->insertSortedString("Foster City",city); + aut->insertSortedString("Los Altos",city); + aut->insertSortedString("Menlo Park",city); + aut->insertSortedString("Mountain View",city); + aut->insertSortedString("Palo Alto",city); + aut->insertSortedString("San Francisco",city); + aut->insertSortedString("San Jose",city); + aut->insertSortedString("Santa Clara",city); + aut->insertSortedString("Saratoga",city); + aut->insertSortedString("Sunnyvale",city); + aut->insertSortedString("apple",fruit); + aut->insertSortedString("apricot",fruit); + aut->insertSortedString("artichoke",veggie); + aut->insertSortedString("banana",fruit); + aut->insertSortedString("cabbage",veggie); + aut->insertSortedString("carrot",veggie); + aut->insertSortedString("cherry",fruit); + aut->insertSortedString("chili",veggie); + aut->insertSortedString("cucumber",veggie); + aut->insertSortedString("eggplant",veggie); + aut->insertSortedString("grapes",fruit); + aut->insertSortedString("lettuce",veggie); + aut->insertSortedString("onion",veggie); + aut->insertSortedString("paprika",veggie); + aut->insertSortedString("passion fruit",fruit); + aut->insertSortedString("pea",veggie); + aut->insertSortedString("peach",fruit); + aut->insertSortedString("pear",fruit); + aut->insertSortedString("pineapple",fruit); + aut->insertSortedString("plum",fruit); + aut->insertSortedString("potato",veggie); + aut->insertSortedString("pumpkin",veggie); + aut->insertSortedString("sour cherry",fruit); + aut->insertSortedString("squash",veggie); + aut->insertSortedString("tomato",veggie); + + aut->finalize(); + + double d1 = t.elapsed(); + + aut->addPerfectHash(); + + double d2 = t.elapsed(); + + aut->write("__testfsa__.__fsa__"); + + double d3 = t.elapsed(); + + FSA *fsa = aut->getFSA(); + + double d4 = t.elapsed(); + + std::cout << "Automoaton build finished (" << 1000*d1 << "ms," << 1000*(d2-d1) << "ms)" + << ", fsa retrieval (" << 1000*(d4-d3) << "ms) " << ((fsa==NULL)?"failed":"succeded") << ".\n"; + + if(fsa!=NULL){ + FSA::State fs(*fsa); + const unsigned char *pb = fs.lookup("cucumber"); + std::cout << "Lookup(\"cucumber\") -> "; + if(pb!=NULL){ + std::cout << "\"" << pb << "\""; + } + else{ + std::cout << "not found."; + } + std::cout << "\n"; + } + + delete aut; + delete fsa; + + return 0; +} diff --git a/fsa/src/alltest/fsa_perftest.cpp b/fsa/src/alltest/fsa_perftest.cpp new file mode 100644 index 00000000000..90d2c042b07 --- /dev/null +++ b/fsa/src/alltest/fsa_perftest.cpp @@ -0,0 +1,77 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <stdlib.h> +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int, char**) +{ + FSA f("__testfsa__.__fsa__"); + FSA::State s(f); + FSA::HashedState hs(f); + FSA::MemoryState ms(f); + FSA::HashedMemoryState hms(f); + FSA::CounterState cs(f); + std::string input("cucumber"); + unsigned int count=10000000,i; + + std::cout << "Number of lookups: " << count << std::endl; + std::cout << "Input string length: " << input.length() << std::endl; + std::cout << std::endl; + + TimeStamp t; + double t0,t1; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + s.start(); + s.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "State: " << t1*1000 << " ms" << "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + hs.start(); + hs.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "HashedState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + ms.start(); + ms.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "MemoryState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + hms.start(); + hms.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "HashedMemoryState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + t0=t.elapsed(); + for(i=0;i<count;i++){ + cs.start(); + cs.lookup(input); + } + t1=t.elapsed()-t0; + std::cout << "CounterState: " << t1*1000 << " ms"<< "\t" + << (unsigned int)(count*input.length()/t1) << " delta/sec" << std::endl; + + return 0; +} diff --git a/fsa/src/alltest/fsa_test.cpp b/fsa/src/alltest/fsa_test.cpp new file mode 100644 index 00000000000..5bc95f20430 --- /dev/null +++ b/fsa/src/alltest/fsa_test.cpp @@ -0,0 +1,114 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <stdio.h> +#include <string> + +#include <vespa/fsa/fsa.h> + +using namespace fsa; + +int main(int, char**) +{ + FSA *f = new FSA("__testfsa__.__fsa__", FILE_ACCESS_MMAP); + FSA::State *fs = new FSA::State(*f); + + std::string s("cucu"); + fs->start(s); + fs->delta('m'); + fs->delta("ber"); + if(fs->isFinal()){ + printf("start/delta test: string(\"cucu\")+'m'+\"ber\" is accepted\n"); + printf(" data size: %d\n",fs->dataSize()); + printf(" data string: \"%-*.*s\"\n",fs->dataSize(),fs->dataSize(),fs->data()); + } + else { + printf("start/delta test failed.\n"); + } + + const unsigned char *pb = fs->lookup("cucumber"); + if(pb!=NULL){ + printf("lookup test: \"cucumber\" -> \"%s\"\n",pb); + } + else{ + printf("lookup test: \"cucumber\" not found.\n"); + } + + + FSA::HashedState *fs1 = new FSA::HashedState(*f); + + + fs1->delta("pe"); + + FSA::HashedState *fs2 = new FSA::HashedState(*fs1); + FSA::HashedState *fs3 = new FSA::HashedState(*fs1); + + + + fs1->delta("a"); + fs2->delta("ach"); + fs3->delta("ar"); + + if(fs1->isFinal() && fs2->isFinal()){ + printf("copy hashed state test:\n"); + printf(" \"pe\"+\"a\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs1->hash(),fs1->dataSize(),fs1->dataSize(),fs1->dataSize(),fs1->data()); + printf(" \"pe\"+\"ach\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs2->hash(),fs2->dataSize(),fs2->dataSize(),fs2->dataSize(),fs2->data()); + printf(" \"pe\"+\"ar\": hash=%d, data_size=%d, data string=\"%-*.*s\"\n", + fs3->hash(),fs3->dataSize(),fs3->dataSize(),fs3->dataSize(),fs3->data()); + + } + else { + printf("copy hashed state test failed.\n"); + } + + printf("revLookup test:\n"); + unsigned int i=0; + std::string res; + while(i<100){ + res=fs2->revLookup(i); + if(res.size()==0) + break; + fs2->lookup(res); + printf(" %d -> %s -> %d\n",i,res.c_str(),fs2->hash()); + i++; + } + + printf("iterator test:\n"); + fs1->start('p'); + printf(" possible continuations from \"p\":\n"); + for(FSA::iterator it(*fs1); it!=fs1->end(); ++it){ + printf(" \"p\" + \"%s\"\n",it->str().c_str()); + } + + delete fs; + delete fs1; + delete fs2; + delete fs3; + + + printf("counter/memory state test\n"); + FSA::CounterState *cs = new FSA::CounterState(*f); + FSA::MemoryState *ms = new FSA::MemoryState(*f); + + cs->start("cucu"); + ms->start("cucu"); + printf(" \"cucu\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + cs->start("cucumber"); + ms->start("cucumber"); + printf(" \"cucumber\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + cs->start("cucumber slumber"); + ms->start("cucumber slumber"); + printf(" \"cucumber slumber\" -> %s:%d\n",ms->memory().c_str(),cs->counter()); + + delete cs; + delete ms; + delete f; + + return 0; +} diff --git a/fsa/src/alltest/fsa_test.out b/fsa/src/alltest/fsa_test.out new file mode 100644 index 00000000000..b9c96e5b795 --- /dev/null +++ b/fsa/src/alltest/fsa_test.out @@ -0,0 +1,60 @@ +start/delta test: string("cucu")+'m'+"ber" is accepted + data size: 10 + data string: "Vegetable " +lookup test: "cucumber" -> "Vegetable" +copy hashed state test: + "pe"+"a": hash=26, data_size=10, data string="Vegetable " + "pe"+"ach": hash=27, data_size=6, data string="Fruit " + "pe"+"ar": hash=28, data_size=6, data string="Fruit " +revLookup test: + 0 -> Cupertino -> 0 + 1 -> Foster City -> 1 + 2 -> Los Altos -> 2 + 3 -> Menlo Park -> 3 + 4 -> Mountain View -> 4 + 5 -> Palo Alto -> 5 + 6 -> San Francisco -> 6 + 7 -> San Jose -> 7 + 8 -> Santa Clara -> 8 + 9 -> Saratoga -> 9 + 10 -> Sunnyvale -> 10 + 11 -> apple -> 11 + 12 -> apricot -> 12 + 13 -> artichoke -> 13 + 14 -> banana -> 14 + 15 -> cabbage -> 15 + 16 -> carrot -> 16 + 17 -> cherry -> 17 + 18 -> chili -> 18 + 19 -> cucumber -> 19 + 20 -> eggplant -> 20 + 21 -> grapes -> 21 + 22 -> lettuce -> 22 + 23 -> onion -> 23 + 24 -> paprika -> 24 + 25 -> passion fruit -> 25 + 26 -> pea -> 26 + 27 -> peach -> 27 + 28 -> pear -> 28 + 29 -> pineapple -> 29 + 30 -> plum -> 30 + 31 -> potato -> 31 + 32 -> pumpkin -> 32 + 33 -> sour cherry -> 33 + 34 -> squash -> 34 + 35 -> tomato -> 35 +iterator test: + possible continuations from "p": + "p" + "aprika" + "p" + "assion fruit" + "p" + "ea" + "p" + "each" + "p" + "ear" + "p" + "ineapple" + "p" + "lum" + "p" + "otato" + "p" + "umpkin" +counter/memory state test + "cucu" -> cucu:4 + "cucumber" -> cucumber:8 + "cucumber slumber" -> cucumber:8 diff --git a/fsa/src/alltest/fsa_test.sh b/fsa/src/alltest/fsa_test.sh new file mode 100755 index 00000000000..497fd291c4d --- /dev/null +++ b/fsa/src/alltest/fsa_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_fsa_test_app > fsa_test.output +diff fsa_test.output fsa_test.out diff --git a/fsa/src/alltest/fsamanager_test.cpp b/fsa/src/alltest/fsamanager_test.cpp new file mode 100644 index 00000000000..7ca4a2d8e8a --- /dev/null +++ b/fsa/src/alltest/fsamanager_test.cpp @@ -0,0 +1,25 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fsa/fsa.h> +#include <vespa/fsamanagers/fsamanager.h> + +#include <iostream> +#include <string> +#include <stdlib.h> + +using namespace fsa; + +int main(int argc, char** argv) +{ + if(argc<3){ + std::cerr << "usage: fsamanager_test cache_dir fsa_file_or_url [fsa_file_or_url ...]\n"; + exit(1); + } + + FSAManager::instance().setCacheDir(argv[1]); + + for(int i=2;i<argc;i++){ + std::cerr << "Loading " << argv[i] << " ... "; + std::cerr << (FSAManager::instance().load(argv[i],argv[i]) ? "ok":"failed") << "\n"; + } + +} diff --git a/fsa/src/alltest/lookup_test.cpp b/fsa/src/alltest/lookup_test.cpp new file mode 100644 index 00000000000..6ff4e3063d4 --- /dev/null +++ b/fsa/src/alltest/lookup_test.cpp @@ -0,0 +1,49 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <stdlib.h> +#include <iostream> +#include <iomanip> +#include <string> + +#include <vespa/fsa/fsa.h> + +#if (__GNUG__ <3 || (__GNUG__ == 3 && __GNUC_MINOR__ < 1)) +namespace std { +const char *left = ""; +} +#endif + +using namespace fsa; + +int main(int argc, char** argv) +{ + + if(argc!=2){ + std::cerr << "usage: lookup_test fsafile <input >output" << std::endl; + exit(1); + } + + FSA f(argv[1]); + FSA::HashedState fs(f); + std::string input; + + while(!std::cin.eof()){ + getline(std::cin,input); + + if(input.size()>0){ + fs.start(input); + if(fs.isFinal()){ + std::cout << "'" << input << "'" << " is accepted, hash value: " << fs.hash() + << ", data size: " << fs.dataSize() + << ", data string: \"" + << std::setw(fs.dataSize()) << std::left << fs.data() + << "\"" << std::endl; + } + else{ + std::cout << "'" << input << "'" << " is not accepted." << std::endl; + } + } + } + + return 0; +} diff --git a/fsa/src/alltest/lookup_test.out b/fsa/src/alltest/lookup_test.out new file mode 100644 index 00000000000..b7dd9b4da4b --- /dev/null +++ b/fsa/src/alltest/lookup_test.out @@ -0,0 +1,41 @@ +'Cupertino' is accepted, hash value: 0, data size: 5, data string: "City " +'Foster City' is accepted, hash value: 1, data size: 5, data string: "City " +'Los Altos' is accepted, hash value: 2, data size: 5, data string: "City " +'Menlo Park' is accepted, hash value: 3, data size: 5, data string: "City " +'Mountain View' is accepted, hash value: 4, data size: 5, data string: "City " +'Palo Alto' is accepted, hash value: 5, data size: 5, data string: "City " +'San Francisco' is accepted, hash value: 6, data size: 5, data string: "City " +'San Jose' is accepted, hash value: 7, data size: 5, data string: "City " +'Santa Clara' is accepted, hash value: 8, data size: 5, data string: "City " +'Saratoga' is accepted, hash value: 9, data size: 5, data string: "City " +'Sunnyvale' is accepted, hash value: 10, data size: 5, data string: "City " +'apple' is accepted, hash value: 11, data size: 6, data string: "Fruit " +'apricot' is accepted, hash value: 12, data size: 6, data string: "Fruit " +'artichoke' is accepted, hash value: 13, data size: 10, data string: "Vegetable " +'banana' is accepted, hash value: 14, data size: 6, data string: "Fruit " +'cabbage' is accepted, hash value: 15, data size: 10, data string: "Vegetable " +'carrot' is accepted, hash value: 16, data size: 10, data string: "Vegetable " +'cherry' is accepted, hash value: 17, data size: 6, data string: "Fruit " +'chili' is accepted, hash value: 18, data size: 10, data string: "Vegetable " +'cucumber' is accepted, hash value: 19, data size: 10, data string: "Vegetable " +'eggplant' is accepted, hash value: 20, data size: 10, data string: "Vegetable " +'grapes' is accepted, hash value: 21, data size: 6, data string: "Fruit " +'lettuce' is accepted, hash value: 22, data size: 10, data string: "Vegetable " +'onion' is accepted, hash value: 23, data size: 10, data string: "Vegetable " +'paprika' is accepted, hash value: 24, data size: 10, data string: "Vegetable " +'passion fruit' is accepted, hash value: 25, data size: 6, data string: "Fruit " +'pea' is accepted, hash value: 26, data size: 10, data string: "Vegetable " +'peach' is accepted, hash value: 27, data size: 6, data string: "Fruit " +'pear' is accepted, hash value: 28, data size: 6, data string: "Fruit " +'pineapple' is accepted, hash value: 29, data size: 6, data string: "Fruit " +'plum' is accepted, hash value: 30, data size: 6, data string: "Fruit " +'potato' is accepted, hash value: 31, data size: 10, data string: "Vegetable " +'pumpkin' is accepted, hash value: 32, data size: 10, data string: "Vegetable " +'sour cherry' is accepted, hash value: 33, data size: 6, data string: "Fruit " +'squash' is accepted, hash value: 34, data size: 10, data string: "Vegetable " +'tomato' is accepted, hash value: 35, data size: 10, data string: "Vegetable " +'alpha' is not accepted. +'beta' is not accepted. +'gamma' is not accepted. +'delta' is not accepted. +'epsilon' is not accepted. diff --git a/fsa/src/alltest/lookup_test.sh b/fsa/src/alltest/lookup_test.sh new file mode 100755 index 00000000000..394baecc78a --- /dev/null +++ b/fsa/src/alltest/lookup_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_lookup_test_app __testfsa__.__fsa__ < testinput.txt > lookup_test.output +diff lookup_test.output lookup_test.out diff --git a/fsa/src/alltest/ngram_test.cpp b/fsa/src/alltest/ngram_test.cpp new file mode 100644 index 00000000000..7f0be7769e1 --- /dev/null +++ b/fsa/src/alltest/ngram_test.cpp @@ -0,0 +1,57 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <iostream> + +#include <vespa/fsa/permuter.h> +#include <vespa/fsa/selector.h> +#include <vespa/fsa/ngram.h> +#include <vespa/fsa/base64.h> +#include <vespa/fsa/wordchartokenizer.h> + +using namespace fsa; + +int main(int, char **) +{ + Permuter p; + + NGram q1("a b c d e f"), q2(q1,p,10), q3(q2,p,13); + + Selector s; + + std::string s1("this is a test"), s2; + + Base64::encode(s1,s2); + std::cout << "'" << s1 << "'" << std::endl; + std::cout << "'" << s2 << "'" << std::endl; + Base64::decode(s2,s1); + std::cout << "'" << s1 << "'" << std::endl; + + + std::cout << q1 << std::endl; + std::cout << q2 << std::endl; + std::cout << q3 << std::endl; + + q2.sort(); + std::cout << q2 << std::endl; + q2.reverse(); + std::cout << q2 << std::endl; + + std::cout << std::hex; + for(unsigned int n=1;n<=6;n++){ + unsigned int c=Permuter::firstComb(n,6); + while(c>0){ + s.clear(); + s.set(c); + q2.set(q1,s); + std::cout << c << ": " << q2 << std::endl; + c=Permuter::nextComb(c,6); + } + } + std::cout << std::dec; + + WordCharTokenizer tokenizer(WordCharTokenizer::PUNCTUATION_SMART,"PUNCT"); + + NGram q4("test, wordchar tokenizer. does it work?",tokenizer); + + std::cout << q4.join(" -|- ") << std::endl; + +} diff --git a/fsa/src/alltest/ngram_test.out b/fsa/src/alltest/ngram_test.out new file mode 100644 index 00000000000..d826e3173dd --- /dev/null +++ b/fsa/src/alltest/ngram_test.out @@ -0,0 +1,72 @@ +'this is a test' +'dGhpcyBpcyBhIHRlc3Q=' +'this is a test' +a b c d e f +b d a c e f +a b c d e f +a b c d e f +f e d c b a +1: a +2: b +4: c +8: d +10: e +20: f +3: a b +5: a c +6: b c +9: a d +a: b d +c: c d +11: a e +12: b e +14: c e +18: d e +21: a f +22: b f +24: c f +28: d f +30: e f +7: a b c +b: a b d +d: a c d +e: b c d +13: a b e +15: a c e +16: b c e +19: a d e +1a: b d e +1c: c d e +23: a b f +25: a c f +26: b c f +29: a d f +2a: b d f +2c: c d f +31: a e f +32: b e f +34: c e f +38: d e f +f: a b c d +17: a b c e +1b: a b d e +1d: a c d e +1e: b c d e +27: a b c f +2b: a b d f +2d: a c d f +2e: b c d f +33: a b e f +35: a c e f +36: b c e f +39: a d e f +3a: b d e f +3c: c d e f +1f: a b c d e +2f: a b c d f +37: a b c e f +3b: a b d e f +3d: a c d e f +3e: b c d e f +3f: a b c d e f +test -|- PUNCT -|- wordchar -|- tokenizer -|- PUNCT -|- does -|- it -|- work diff --git a/fsa/src/alltest/ngram_test.sh b/fsa/src/alltest/ngram_test.sh new file mode 100755 index 00000000000..85559d6e391 --- /dev/null +++ b/fsa/src/alltest/ngram_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_ngram_test_app > ngram_test.output +diff ngram_test.output ngram_test.out diff --git a/fsa/src/alltest/segmenter_test.cpp b/fsa/src/alltest/segmenter_test.cpp new file mode 100644 index 00000000000..3b80fe3390e --- /dev/null +++ b/fsa/src/alltest/segmenter_test.cpp @@ -0,0 +1,74 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file segmenter_test.cpp + * @brief Test for the Segmenter class + * + */ + +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/segmenter.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Segmenter segmenter(dict); + Segmenter::Segments segments; + const Segmenter::Segmentation *segmentation; + + std::string text; + while(!std::cin.eof()){ + getline(std::cin,text); + + if(text.size()>3){ + + segmenter.segment(text,segments); + + std::cout << "List of all segments:" << std::endl; + for(unsigned int i=0; i<segments.size(); i++){ + std::cout << " " + << segments.sgm(i) << ":" << segments.conn(i) << " [" + << segments.beg(i) << "," << segments.end(i)-1 << "]" + << std::endl; + } + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_WEIGHTED); + + std::cout << "Weighted segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_RIGHTMOST_LONGEST); + + std::cout << "Rightmost-longest segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + segmentation=segments.segmentation(Segmenter::SEGMENTATION_LEFTMOST_LONGEST); + + std::cout << "Lefttmost-longest segmentation:" << std::endl << " "; + for(Segmenter::SegmentationConstIterator it=segmentation->begin(); + it!=segmentation->end();++it){ + std::cout << "(" << segments.sgm(*it) << ")"; + } + std::cout << std::endl; + + } + + } + + return 0; +} diff --git a/fsa/src/alltest/segmenter_test.out b/fsa/src/alltest/segmenter_test.out new file mode 100644 index 00000000000..d8c42cfacce --- /dev/null +++ b/fsa/src/alltest/segmenter_test.out @@ -0,0 +1,332 @@ +List of all segments: + cupertino:0 [0,0] +Weighted segmentation: + (cupertino) +Rightmost-longest segmentation: + (cupertino) +Lefttmost-longest segmentation: + (cupertino) +List of all segments: + foster:0 [0,0] + city:0 [1,1] +Weighted segmentation: + (foster)(city) +Rightmost-longest segmentation: + (foster)(city) +Lefttmost-longest segmentation: + (foster)(city) +List of all segments: + los:0 [0,0] + altos:0 [1,1] +Weighted segmentation: + (los)(altos) +Rightmost-longest segmentation: + (los)(altos) +Lefttmost-longest segmentation: + (los)(altos) +List of all segments: + menlo:0 [0,0] + park:0 [1,1] +Weighted segmentation: + (menlo)(park) +Rightmost-longest segmentation: + (menlo)(park) +Lefttmost-longest segmentation: + (menlo)(park) +List of all segments: + mountain:0 [0,0] + view:0 [1,1] +Weighted segmentation: + (mountain)(view) +Rightmost-longest segmentation: + (mountain)(view) +Lefttmost-longest segmentation: + (mountain)(view) +List of all segments: + palo:0 [0,0] + alto:0 [1,1] +Weighted segmentation: + (palo)(alto) +Rightmost-longest segmentation: + (palo)(alto) +Lefttmost-longest segmentation: + (palo)(alto) +List of all segments: + san:0 [0,0] + francisco:0 [1,1] +Weighted segmentation: + (san)(francisco) +Rightmost-longest segmentation: + (san)(francisco) +Lefttmost-longest segmentation: + (san)(francisco) +List of all segments: + san:0 [0,0] + jose:0 [1,1] +Weighted segmentation: + (san)(jose) +Rightmost-longest segmentation: + (san)(jose) +Lefttmost-longest segmentation: + (san)(jose) +List of all segments: + santa:0 [0,0] + clara:0 [1,1] +Weighted segmentation: + (santa)(clara) +Rightmost-longest segmentation: + (santa)(clara) +Lefttmost-longest segmentation: + (santa)(clara) +List of all segments: + saratoga:0 [0,0] +Weighted segmentation: + (saratoga) +Rightmost-longest segmentation: + (saratoga) +Lefttmost-longest segmentation: + (saratoga) +List of all segments: + sunnyvale:0 [0,0] +Weighted segmentation: + (sunnyvale) +Rightmost-longest segmentation: + (sunnyvale) +Lefttmost-longest segmentation: + (sunnyvale) +List of all segments: + apple:1769304646 [0,0] +Weighted segmentation: + (apple) +Rightmost-longest segmentation: + (apple) +Lefttmost-longest segmentation: + (apple) +List of all segments: + apricot:1769304646 [0,0] +Weighted segmentation: + (apricot) +Rightmost-longest segmentation: + (apricot) +Lefttmost-longest segmentation: + (apricot) +List of all segments: + artichoke:1701274966 [0,0] +Weighted segmentation: + (artichoke) +Rightmost-longest segmentation: + (artichoke) +Lefttmost-longest segmentation: + (artichoke) +List of all segments: + banana:1769304646 [0,0] +Weighted segmentation: + (banana) +Rightmost-longest segmentation: + (banana) +Lefttmost-longest segmentation: + (banana) +List of all segments: + cabbage:1701274966 [0,0] +Weighted segmentation: + (cabbage) +Rightmost-longest segmentation: + (cabbage) +Lefttmost-longest segmentation: + (cabbage) +List of all segments: + carrot:1701274966 [0,0] +Weighted segmentation: + (carrot) +Rightmost-longest segmentation: + (carrot) +Lefttmost-longest segmentation: + (carrot) +List of all segments: + cherry:1769304646 [0,0] +Weighted segmentation: + (cherry) +Rightmost-longest segmentation: + (cherry) +Lefttmost-longest segmentation: + (cherry) +List of all segments: + chili:1701274966 [0,0] +Weighted segmentation: + (chili) +Rightmost-longest segmentation: + (chili) +Lefttmost-longest segmentation: + (chili) +List of all segments: + cucumber:1701274966 [0,0] +Weighted segmentation: + (cucumber) +Rightmost-longest segmentation: + (cucumber) +Lefttmost-longest segmentation: + (cucumber) +List of all segments: + eggplant:1701274966 [0,0] +Weighted segmentation: + (eggplant) +Rightmost-longest segmentation: + (eggplant) +Lefttmost-longest segmentation: + (eggplant) +List of all segments: + grapes:1769304646 [0,0] +Weighted segmentation: + (grapes) +Rightmost-longest segmentation: + (grapes) +Lefttmost-longest segmentation: + (grapes) +List of all segments: + lettuce:1701274966 [0,0] +Weighted segmentation: + (lettuce) +Rightmost-longest segmentation: + (lettuce) +Lefttmost-longest segmentation: + (lettuce) +List of all segments: + onion:1701274966 [0,0] +Weighted segmentation: + (onion) +Rightmost-longest segmentation: + (onion) +Lefttmost-longest segmentation: + (onion) +List of all segments: + paprika:1701274966 [0,0] +Weighted segmentation: + (paprika) +Rightmost-longest segmentation: + (paprika) +Lefttmost-longest segmentation: + (paprika) +List of all segments: + passion:0 [0,0] + fruit:0 [1,1] + passion fruit:1769304646 [0,1] +Weighted segmentation: + (passion fruit) +Rightmost-longest segmentation: + (passion fruit) +Lefttmost-longest segmentation: + (passion fruit) +List of all segments: + peach:1769304646 [0,0] +Weighted segmentation: + (peach) +Rightmost-longest segmentation: + (peach) +Lefttmost-longest segmentation: + (peach) +List of all segments: + pear:1769304646 [0,0] +Weighted segmentation: + (pear) +Rightmost-longest segmentation: + (pear) +Lefttmost-longest segmentation: + (pear) +List of all segments: + pineapple:1769304646 [0,0] +Weighted segmentation: + (pineapple) +Rightmost-longest segmentation: + (pineapple) +Lefttmost-longest segmentation: + (pineapple) +List of all segments: + plum:1769304646 [0,0] +Weighted segmentation: + (plum) +Rightmost-longest segmentation: + (plum) +Lefttmost-longest segmentation: + (plum) +List of all segments: + potato:1701274966 [0,0] +Weighted segmentation: + (potato) +Rightmost-longest segmentation: + (potato) +Lefttmost-longest segmentation: + (potato) +List of all segments: + pumpkin:1701274966 [0,0] +Weighted segmentation: + (pumpkin) +Rightmost-longest segmentation: + (pumpkin) +Lefttmost-longest segmentation: + (pumpkin) +List of all segments: + sour:0 [0,0] + cherry:1769304646 [1,1] + sour cherry:1769304646 [0,1] +Weighted segmentation: + (sour cherry) +Rightmost-longest segmentation: + (sour cherry) +Lefttmost-longest segmentation: + (sour cherry) +List of all segments: + squash:1701274966 [0,0] +Weighted segmentation: + (squash) +Rightmost-longest segmentation: + (squash) +Lefttmost-longest segmentation: + (squash) +List of all segments: + tomato:1701274966 [0,0] +Weighted segmentation: + (tomato) +Rightmost-longest segmentation: + (tomato) +Lefttmost-longest segmentation: + (tomato) +List of all segments: + alpha:0 [0,0] +Weighted segmentation: + (alpha) +Rightmost-longest segmentation: + (alpha) +Lefttmost-longest segmentation: + (alpha) +List of all segments: + beta:0 [0,0] +Weighted segmentation: + (beta) +Rightmost-longest segmentation: + (beta) +Lefttmost-longest segmentation: + (beta) +List of all segments: + gamma:0 [0,0] +Weighted segmentation: + (gamma) +Rightmost-longest segmentation: + (gamma) +Lefttmost-longest segmentation: + (gamma) +List of all segments: + delta:0 [0,0] +Weighted segmentation: + (delta) +Rightmost-longest segmentation: + (delta) +Lefttmost-longest segmentation: + (delta) +List of all segments: + epsilon:0 [0,0] +Weighted segmentation: + (epsilon) +Rightmost-longest segmentation: + (epsilon) +Lefttmost-longest segmentation: + (epsilon) diff --git a/fsa/src/alltest/segmenter_test.sh b/fsa/src/alltest/segmenter_test.sh new file mode 100755 index 00000000000..d36a6d10057 --- /dev/null +++ b/fsa/src/alltest/segmenter_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_segmenter_test_app < testinput.txt > segmenter_test.output +diff segmenter_test.output segmenter_test.out diff --git a/fsa/src/alltest/testinput.txt b/fsa/src/alltest/testinput.txt new file mode 100644 index 00000000000..fa4afece710 --- /dev/null +++ b/fsa/src/alltest/testinput.txt @@ -0,0 +1,41 @@ +Cupertino +Foster City +Los Altos +Menlo Park +Mountain View +Palo Alto +San Francisco +San Jose +Santa Clara +Saratoga +Sunnyvale +apple +apricot +artichoke +banana +cabbage +carrot +cherry +chili +cucumber +eggplant +grapes +lettuce +onion +paprika +passion fruit +pea +peach +pear +pineapple +plum +potato +pumpkin +sour cherry +squash +tomato +alpha +beta +gamma +delta +epsilon diff --git a/fsa/src/alltest/vectorizer_perftest.cpp b/fsa/src/alltest/vectorizer_perftest.cpp new file mode 100644 index 00000000000..582652ec66d --- /dev/null +++ b/fsa/src/alltest/vectorizer_perftest.cpp @@ -0,0 +1,95 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizertest.cpp + * @brief Test for the vectorizer class + * + */ + +#include <string> +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/vectorizer.h> +#include <vespa/fsa/timestamp.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Vectorizer v(dict); + Vectorizer::TermVector tv; + + + std::string text = + "belfast northern ireland protestant extremists crashed a forklift " + "truck into a belfast pub packed with catholics early friday and tossed " + "gasoline bombs into the building on a road on the front line of " + "tensions between the two communities " + "no one was hurt in the attack police said, though the forklift came " + "crashing through a window just above a bench where a patron had been " + "sitting seconds earlier the bar s owner sean conlon said " + "the customer had just gotten up to go to the toilet so it s really " + "just by the grace of god still he s here today at all conlon said " + "a protestant gang used the stolen vehicle to smash down a heavy metal " + "security grill on a window at around 12 45 a m then to toss three " + "gasoline bombs inside the pub on the crumlin road an especially " + "polarized part of north belfast where catholic protestant tensions " + "have repeatedly flared " + "no group claimed responsibility for the attack on the thirty two " + "degrees north pub a catholic frequented bar across the street from a " + "hard line protestant district but catholic leaders blamed the largest " + "illegal protestant group the ulster defense association " + "firefighters quickly doused the flames caused by the gasoline " + "bombs the forklift remained wedged into the pub friday afternoon as " + "engineers and architects discussed whether the newly refurbished pub " + "would have to be partly demolished " + "the uda is supposed to be observing a cease fire in support of " + "northern ireland s 1998 peace accord but britain no longer recognizes " + "the validity of the uda truce because the anti catholic group has " + "violated it so often " + "the crumlin road area of north belfast has suffered some of northern " + "ireland s most graphic sectarian trouble in recent years while both " + "sides complain of suffering harassment and stone throwing protestants " + "in particular accuse the expanding catholic community of seeking to " + "force them from the area a charge the catholics deny. " + "protestant mobs in 2001 and 2002 blocked catholics from taking their " + "children to the local catholic elementary school which is in the " + "predominantly protestant part of the area " + "on july 12 hundreds of catholics from the area s ardoyne district " + "swarmed over police and british soldiers protecting a protestant " + "parade that had just passed down crumlin road dozens were wounded " + "demographic tensions lie at the heart of the northern ireland " + "conflict which was founded 84 years ago as a british territory with a " + "70 percent protestant majority the most recent census in 2001 put the " + "sectarian split at nearer 55 percent protestant and 45 percent " + "catholic and confirmed that belfast now has a catholic majority"; + + NGram tokenized_text(text); + + TimeStamp t; + double t0,t1; + unsigned int count=1000; + + std::cout << "Number of iterations: " << count << std::endl; + std::cout << "Input string length: " << text.length() << std::endl; + std::cout << "Number of input tokens: " << tokenized_text.length() << std::endl; + std::cout << std::endl; + + t0=t.elapsed(); + for(unsigned int i=0; i<count; ++i){ + v.vectorize(tokenized_text,tv); + } + t1=t.elapsed()-t0; + std::cout << "Vectorizer performance: \t" << t1 << " sec" << "\t\t" + << count/t1 << " document/sec" << std::endl; + for(unsigned int i=0; i<tv.size(); i++){ + std::cout << tv[i].term() << ", " << tv[i].weight() << std::endl; + } + + return 0; +} diff --git a/fsa/src/alltest/vectorizer_test.cpp b/fsa/src/alltest/vectorizer_test.cpp new file mode 100644 index 00000000000..e3bcf236455 --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.cpp @@ -0,0 +1,40 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +/** + * @author Peter Boros + * @date 2004/08/20 + * @version $Id$ + * @file vectorizertest.cpp + * @brief Test for the vectorizer class + * + */ + +#include <iostream> +#include <iomanip> + +#include <vespa/fsa/vectorizer.h> + +using namespace fsa; + +int main(int argc, char **argv) +{ + FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__"); + + Vectorizer v(dict); + Vectorizer::TermVector tv; + + std::string text; + NGram tokenized_text; + + while(!std::cin.eof()){ + getline(std::cin,text); + + tokenized_text.set(text); + v.vectorize(tokenized_text,tv); + + for(unsigned int i=0; i<tv.size(); i++){ + std::cout << tv[i].term() << ", " << tv[i].weight() << std::endl; + } + } + + return 0; +} diff --git a/fsa/src/alltest/vectorizer_test.out b/fsa/src/alltest/vectorizer_test.out new file mode 100644 index 00000000000..aa30421a2bf --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.out @@ -0,0 +1,26 @@ +apple, 0 +apricot, 0 +artichoke, 0 +banana, 0 +cabbage, 0 +carrot, 0 +cherry, 0 +chili, 0 +cucumber, 0 +eggplant, 0 +grapes, 0 +lettuce, 0 +onion, 0 +paprika, 0 +passion fruit, 0 +pea, 0 +peach, 0 +pear, 0 +pineapple, 0 +plum, 0 +potato, 0 +pumpkin, 0 +cherry, 0 +sour cherry, 0 +squash, 0 +tomato, 0 diff --git a/fsa/src/alltest/vectorizer_test.sh b/fsa/src/alltest/vectorizer_test.sh new file mode 100755 index 00000000000..03d794fc6e8 --- /dev/null +++ b/fsa/src/alltest/vectorizer_test.sh @@ -0,0 +1,3 @@ +#!/bin/bash +./fsa_vectorizer_test_app < testinput.txt > vectorizer_test.output +diff vectorizer_test.output vectorizer_test.out |