aboutsummaryrefslogtreecommitdiffstats
path: root/fsa/src/alltest/vectorizer_perftest.cpp
blob: 824c79a4838a5610404933dafd30422bed4b1771 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/08/20
 * @version $Id$
 * @file    vectorizer_perftest.cpp
 * @brief   Performance test for the Vectorizer class
 *
 */

#include <string>
#include <iostream>
#include <iomanip>

#include <vespa/fsa/vectorizer.h>
#include <vespa/fsa/timestamp.h>

using namespace fsa;

int main(int argc, char **argv)
{
  FSA dict(argc>=2? argv[1] : "__testfsa__.__fsa__");

  Vectorizer v(dict);
  Vectorizer::TermVector tv;


  std::string text =
    "belfast northern ireland protestant extremists crashed a forklift "
    "truck into a belfast pub packed with catholics early friday and tossed "
    "gasoline bombs into the building on a road on the front line of "
    "tensions between the two communities "
    "no one was hurt in the attack police said, though the forklift came "
    "crashing through a window just above a bench where a patron had been "
    "sitting seconds earlier the bar s owner sean conlon said "
    "the customer had just gotten up to go to the toilet so it s really "
    "just by the grace of god still he s here today at all conlon said "
    "a protestant gang used the stolen vehicle to smash down a heavy metal "
    "security grill on a window at around 12 45 a m then to toss three "
    "gasoline bombs inside the pub on the crumlin road  an especially "
    "polarized part of north belfast where catholic protestant tensions "
    "have repeatedly flared "
    "no group claimed responsibility for the attack on the thirty two "
    "degrees north pub a catholic frequented bar across the street from a "
    "hard line protestant district but catholic leaders blamed the largest "
    "illegal protestant group the ulster defense association "
    "firefighters quickly doused the flames caused by the gasoline "
    "bombs the forklift remained wedged into the pub friday afternoon as "
    "engineers and architects discussed whether the newly refurbished pub "
    "would have to be partly demolished "
    "the uda is supposed to be observing a cease fire in support of "
    "northern ireland s 1998 peace accord but britain no longer recognizes "
    "the validity of the uda truce because the anti catholic group has "
    "violated it so often "
    "the crumlin road area of north belfast has suffered some of northern "
    "ireland s most graphic sectarian trouble in recent years  while both "
    "sides complain of suffering harassment and stone throwing protestants "
    "in particular accuse the expanding catholic community of seeking to "
    "force them from the area a charge the catholics deny. "
    "protestant mobs in 2001 and 2002 blocked catholics from taking their "
    "children to the local catholic elementary school which is in the "
    "predominantly protestant part of the area "
    "on july 12 hundreds of catholics from the area s ardoyne district "
    "swarmed over police and british soldiers protecting a protestant "
    "parade that had just passed down crumlin road dozens were wounded "
    "demographic tensions lie at the heart of the northern ireland "
    "conflict which was founded 84 years ago as a british territory with a "
    "70 percent protestant majority the most recent census in 2001 put the "
    "sectarian split at nearer 55 percent protestant and 45 percent "
    "catholic and confirmed that belfast now has a catholic majority";

  NGram tokenized_text(text);

  TimeStamp t;
  double t0,t1;
  unsigned int count=1000;

  std::cout << "Number of iterations: " << count << std::endl;
  std::cout << "Input string length: " << text.length() << std::endl;
  std::cout << "Number of input tokens: " << tokenized_text.length() << std::endl;
  std::cout << std::endl;

  t0=t.elapsed();
  for(unsigned int i=0; i<count; ++i){
    v.vectorize(tokenized_text,tv);
  }
  t1=t.elapsed()-t0;
  std::cout << "Vectorizer performance: \t" << t1 << " sec" << "\t\t"
            << count/t1 << " document/sec" << std::endl;
  for(unsigned int i=0; i<tv.size(); i++){
    std::cout << tv[i].term() << ", " << tv[i].weight() << std::endl;
  }

  return 0;
}