1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <util/filereader.h>
#include <iostream>
#include <vector>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
* Extract query urls from web logs. The filterfile application reads
* concatenated web logs from stdin and writes all query urls found in
* the input to stdout. Every logged url that carries a 'query'
* parameter is treated as a query url and is written out with the
* prefix '/?'. Only the 'query' and 'type' parameters are kept in the
* output, unless the -a option is given, in which case all parameters
* are kept.
**/
/**
 * Entry point: read web log lines from stdin and write the query urls
 * found in them to stdout.
 *
 * Options:
 *   -a        keep all url parameters (default: only 'query' and 'type')
 *   -h        print usage information and exit
 *   -m <num>  max line size; values below 10240 are raised to 10240
 *
 * For each input line the url field is located (it starts after "GET "
 * when that token is present and ends at " HTTP/", at a double quote,
 * or at the end of the line). Everything after the first '?' in that
 * field is scanned parameter by parameter, and the parameters that are
 * kept are written to stdout prefixed with "/?". Lines that contain no
 * 'query=' parameter produce no output.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 0 on success; -1 on option error or when usage is shown, when
 *         stdin cannot be opened, or when writing to stdout fails
 **/
int
main(int argc, char** argv)
{
    bool showUsage      = false;
    bool allowAllParams = false;
    int  bufsize        = 10240;

    // parse options and override defaults.
    int         optIdx   = 1;
    char        opt;
    const char *arg      = NULL;
    bool        optError = false;

    // Plain char may be unsigned, in which case 'opt != -1' can never be
    // false. Comparing against (char)-1 terminates the loop regardless of
    // the signedness of char.
    while ((opt = GetOpt(argc, argv, "ahm:", arg, optIdx)) != static_cast<char>(-1)) {
        switch (opt) {
        case 'a':
            allowAllParams = true;
            break;
        case 'h':
            showUsage = true;
            break;
        case 'm':
            bufsize = (arg != NULL) ? atoi(arg) : 0;
            if (bufsize < 10240) {
                bufsize = 10240;
            }
            break;
        default:
            optError = true;
            break;
        }
    }

    if (optError || showUsage) {
        printf("usage: vespa-fbench-filter-file [-a] [-h] [-m maxLineSize]\n\n");
        printf("Read concatenated fastserver logs from stdin and write\n");
        printf("extracted query urls to stdout.\n\n");
        printf(" -a : all parameters to the original query urls are preserved.\n");
        printf(" If the -a switch is not given, only 'query' and 'type'\n");
        printf(" parameters are kept in the extracted query urls.\n");
        printf(" -h : print this usage information.\n");
        printf(" -m <num> : max line size for input/output lines.\n");
        printf(" Can not be less than the default [10240]\n");
        return -1;
    }

    const char *beginToken    = "GET ";
    const int   beginTokenlen = static_cast<int>(strlen(beginToken));
    const char *endToken      = " HTTP/";
    //const char *prefix = "/cgi-bin/search?";
    const char *prefix        = "/?";
    const int   prefixlen     = static_cast<int>(strlen(prefix));
    //const char *trigger = "/cgi-bin/";
    const char *trigger       = "";
    const int   triggerlen    = static_cast<int>(strlen(trigger));

    // The output buffer must hold the prefix, every character of an input
    // line, a trailing newline and a terminating NUL. Sizing it to just
    // 'bufsize' lets a maximum-length line write past the end.
    std::vector<char> lineStore(static_cast<size_t>(bufsize));
    std::vector<char> outStore(static_cast<size_t>(bufsize) + prefixlen + 2);
    char *line = &lineStore[0];
    char *buf  = &outStore[0];

    // open input and output (should never fail)
    FileReader *reader = new FileReader();
    if (!reader->OpenStdin()) {
        printf("could not open stdin! (strange)\n");
        delete reader;
        return -1;
    }
    std::ostream &file = std::cout;

    // Parameter scanner states.
    enum { EXPECT_NAME, COPY, SKIP };

    // The prefix is identical for every output line; write it once.
    memcpy(buf, prefix, prefixlen);

    // filter the input
    int exitCode = 0;
    while (reader->ReadLine(line, bufsize - 1) >= 0) {

        // find field beginning
        const char *tmp = strstr(line, beginToken);
        const int startIdx = (tmp != NULL)
                             ? static_cast<int>(tmp - line) + beginTokenlen
                             : 0;

        // find url beginning
        const char *url = strstr(line + startIdx, trigger);
        if (url == NULL) {
            continue;
        }

        // find field end
        tmp = strstr(line + startIdx, endToken);
        if (tmp == NULL) {
            tmp = strstr(line + startIdx, "\"");
        }
        const int endIdx = (tmp != NULL)
                           ? static_cast<int>(tmp - line)
                           : static_cast<int>(strlen(line));

        // find params: advance to the character following the first '?'
        int idx = static_cast<int>(url - line) + triggerlen;
        while (idx < endIdx && line[idx++] != '?') {
        }
        if (idx >= endIdx) {
            continue; // no parameters on this line
        }

        int  outIdx   = prefixlen;
        int  state    = EXPECT_NAME;
        bool gotQuery = false;
        while (idx < endIdx) {
            switch (state) {
            case EXPECT_NAME:
                // Decide whether the parameter starting here is kept.
                // No input is consumed in this state.
                if (strncmp(line + idx, "query=", 6) == 0) {
                    gotQuery = true;
                    state = COPY;
                } else if (allowAllParams ||
                           strncmp(line + idx, "type=", 5) == 0) {
                    state = COPY;
                } else {
                    state = SKIP;
                }
                break;
            case COPY:
                buf[outIdx++] = line[idx];
                // fall through
            case SKIP:
                // '&' ends the current parameter (it is copied in COPY state).
                if (line[idx++] == '&') {
                    state = EXPECT_NAME;
                }
                break;
            }
        }
        if (!gotQuery) {
            continue; // not a query url
        }

        // Drop a dangling parameter separator before terminating the line.
        if (buf[outIdx - 1] == '&') {
            outIdx--;
        }
        buf[outIdx++] = '\n';
        buf[outIdx]   = '\0';
        if (!file.write(buf, outIdx)) {
            exitCode = -1;
            break;
        }
    }

    reader->Close();
    delete reader;
    return exitCode;
}
|