1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <util/filereader.h>
#include <fstream>
#include <vector>
#include <memory>
/**
 * Split a text file randomly into a number of parts. Process an input
 * file (or stdin) line by line, writing each line out to a randomly
 * chosen output file. The output files are numbered using a counter
 * and a filename pattern.
 *
 * Command line: [-p pattern] [-m maxLineSize] <numparts> [<file>]
 *   -p pattern  printf-style pattern used to build each output file
 *               name from its index (default "query%03d.txt").
 *   -m num      line buffer size; values below 10240 are raised to 10240.
 *   numparts    number of output files; must be at least 1.
 *   file        optional input file; stdin is read when it is omitted.
 *
 * Lines that do not fit in the line buffer are reported and skipped.
 * A failed write is reported but does not stop processing.
 *
 * @return 0 when the input has been processed, -1 on a usage error,
 *         when the input or an output file cannot be opened, or when
 *         an output file name cannot be formatted.
 **/
int
main(int argc, char** argv)
{
    // parameters with default values.
    const char *pattern = "query%03d.txt";
    int linebufsize = 10240;

    // parse options and override defaults.
    int         idx = 1;
    // NOTE(review): comparing a plain char with -1 relies on char being
    // signed; confirm GetOpt's return type and consider using int here.
    char        opt;
    const char *arg;
    bool        optError = false;

    while ((opt = GetOpt(argc, argv, "p:m:", arg, idx)) != -1) {
        switch (opt) {
        case 'p':
            pattern = arg;
            break;
        case 'm':
            linebufsize = atoi(arg);
            // never go below the default buffer size
            if (linebufsize < 10240) {
                linebufsize = 10240;
            }
            break;
        default:
            optError = true;
            break;
        }
    }

    // exactly one or two positional arguments must remain: <numparts> [<file>]
    if (argc < (idx + 1) || argc > (idx + 2) || optError) {
        printf("usage: vespa-fbench-split-file [-p pattern] [-m maxLineSize] <numparts> [<file>]\n\n");
        printf(" -p pattern : output name pattern ['query%%03d.txt']\n");
        printf(" -m <num> : max line size for input/output lines.\n");
        printf(" Can not be less than the default [10240]\n");
        printf(" <numparts> : number of output files to generate.\n\n");
        printf("Reads from <file> (stdin if <file> is not given) and\n");
        printf("randomly distributes each line between <numpart> output\n");
        printf("files. The names of the output files are generated by\n");
        printf("combining the <pattern> with sequential numbers using\n");
        printf("the sprintf function.\n");
        return -1;
    }

    int outcnt = atoi(argv[idx]);
    if (outcnt < 1) {
        printf("too few output files!\n");
        return -1;
    }

    int i;
    int res;
    std::vector<char> linebuf(linebufsize);
    char filename[1024];
    std::unique_ptr<FileReader> input = std::make_unique<FileReader>();
    std::vector<std::unique_ptr<std::ostream>> output;

    // open the input: the named file if one was given, otherwise stdin
    if (argc > (idx + 1)) {
        if (!input->Open(argv[idx + 1])) {
            printf("could not open input file!\n");
            return -1;
        }
    } else {
        if (!input->OpenStdin()) {
            printf("could not open stdin! (strange)\n");
            return -1;
        }
    }

    // open output files
    output.reserve(outcnt);
    for (i = 0; i < outcnt; i++) {
        // The pattern comes straight from the command line and is used as a
        // printf format; reject results that fail to format or that would be
        // truncated, since truncated names could collide and overwrite
        // each other.
        int written = snprintf(filename, sizeof(filename), pattern, i);
        if (written < 0 || static_cast<size_t>(written) >= sizeof(filename)) {
            printf("could not generate name for output file %d\n", i);
            input->Close();
            return -1;
        }
        output.emplace_back(std::make_unique<std::ofstream>(filename, std::ofstream::out | std::ofstream::binary | std::ofstream::trunc));
        // make_unique never yields a null pointer, so the stream state itself
        // must be tested to detect a failed open.
        if (!output.back() || !*output.back()) {
            printf("could not open output file: %s\n", filename);
            input->Close();
            return -1;
        }
    }

    // split file
    while ((res = input->ReadLine(&linebuf[0], linebufsize - 1)) >= 0) {
        if (res < linebufsize - 1) {
            // res <= linebufsize - 2 here, so both stores stay inside linebuf
            linebuf[res] = '\n';
            linebuf[res + 1] = '\0'; // just in case
            i = random() % outcnt;
            if (!output[i]->write(&linebuf[0], res + 1)) {
                printf("error writing to file '%d'\n", i);
            }
        } else {
            printf("line too long, skipping...\n");
        }
    }

    // close files; the output streams are flushed and closed when 'output'
    // goes out of scope.
    input->Close();
    return 0;
}
|