aboutsummaryrefslogtreecommitdiffstats
path: root/fbench/src/filterfile/filterfile.cpp
blob: aa932f77ce6c50156ad60fbbc935204d23d56a95 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <util/filereader.h>
#include <unistd.h>
#include <string.h>
#include <cassert>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

/**
 * Extract query urls from web logs. The filterfile application reads
 * concatenated web logs from stdin and writes all query urls found in
 * the input to stdout. Any request url that carries a 'query'
 * parameter is assumed to be a query url; it is written out with the
 * path replaced by '/?'. Only the 'query' and 'type' parameters are
 * kept in the output, unless the -a option is given, in which case
 * all parameters are kept.
 **/

int
main(int argc, char** argv)
{
    bool showUsage = false;
    bool allowAllParams = false;
    int  bufsize = 10240;

    // parse options and override defaults.
    int         opt;
    bool        optError;

    optError = false;
    while((opt = getopt(argc, argv, "ahm:")) != -1) {
        switch(opt) {
        case 'a':
            allowAllParams = true;
            break;
        case 'h':
            showUsage = true;
            break;
        case 'm':
            bufsize = atoi(optarg);
            if (bufsize < 10240) {
                bufsize = 10240;
            }
            break;
        default:
            optError = true;
            break;
        }
    }

    if (optError || showUsage) {
        printf("usage: vespa-fbench-filter-file [-a] [-h] [-m maxLineSize]\n\n");
        printf("Read concatenated fastserver logs from stdin and write\n");
        printf("extracted query urls to stdout.\n\n");
        printf(" -a : all parameters to the original query urls are preserved.\n");
        printf("      If the -a switch is not given, only 'query' and 'type'\n");
        printf("      parameters are kept in the extracted query urls.\n");
        printf(" -h : print this usage information.\n");
        printf(" -m <num> : max line size for input/output lines.\n");
        printf("            Can not be less than the default [10240]\n");
        return -1;
    }

    const char *beginToken = "GET ";
    int beginTokenlen = strlen(beginToken);

    const char *endToken = " HTTP/";

    //const char *prefix = "/cgi-bin/search?";
    const char *prefix = "/?";
    int prefixlen = strlen(prefix);

    //const char *trigger = "/cgi-bin/";
    const char *trigger = "";
    int triggerlen = strlen(trigger);

    // open input and output (should never fail)
    FileReader *reader = new FileReader();
    if (!reader->OpenStdin()) {
        printf("could not open stdin! (strange)\n");
        delete reader;
        return -1;
    }
    std::ostream & file = std::cout;

    // filter the input
    char *line    = new char[bufsize];
    assert(line != NULL);
    int   res;
    char *tmp;
    char *url;
    int   startIdx;
    int   endIdx;
    int   idx;
    int   outIdx;
    char *buf     = new char[bufsize];
    assert(buf != NULL);
    int   state; // 0=expect param name, 1=copy, 2=skip
    bool  gotQuery;
    memcpy(buf, prefix, prefixlen);
    while ((res = reader->ReadLine(line, bufsize - 1)) >= 0) {

        // find field beginning
        tmp = strstr(line, beginToken);
        startIdx = (tmp != NULL) ? (tmp - line) + beginTokenlen : 0;

        // find url beginning
        url = strstr(line + startIdx, trigger);
        if (url == NULL)
            continue;                                // CONTINUE

        // find field end
        tmp = strstr(line + startIdx, endToken);
        if (tmp == NULL)
            tmp = strstr(line + startIdx, "\"");
        endIdx = (tmp != NULL) ? (tmp - line) : strlen(line);

        // find params
        idx = (url - line) + triggerlen;
        while (idx < endIdx && line[idx++] != '?');
        if (idx >= endIdx)
            continue;                                // CONTINUE

        outIdx   = prefixlen;
        state    = 0;              // expect param name
        gotQuery = false;
        while(idx < endIdx) {
            switch (state) {
            case 0:
                state = ((strncmp(line + idx, "query=", 6) == 0
                          && (gotQuery = true)) ||
                         allowAllParams ||
                         strncmp(line + idx, "type=", 5) == 0) ? 1 : 2;
                break;
            case 1:
                buf[outIdx++] = line[idx];
                [[fallthrough]];
            case 2:
                if (line[idx++] == '&')
                    state = 0;
                break;
            }
        }
        if (!gotQuery)
            continue;                                // CONTINUE

        if (buf[outIdx - 1] == '&')
            outIdx--;
        buf[outIdx++] = '\n';
        buf[outIdx] = '\0';
        if (!file.write(buf, outIdx)) {
            reader->Close();
            delete reader;
            delete [] line;
            delete [] buf;
            return -1;
        }
    }
    reader->Close();
    delete reader;
    delete [] line;
    delete [] buf;
    return 0;
}