diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fbench/src/splitfile |
Publish
Diffstat (limited to 'fbench/src/splitfile')
-rw-r--r-- | fbench/src/splitfile/.gitignore | 5 | ||||
-rw-r--r-- | fbench/src/splitfile/CMakeLists.txt | 9 | ||||
-rw-r--r-- | fbench/src/splitfile/description.html | 2 | ||||
-rw-r--r-- | fbench/src/splitfile/splitfile.cpp | 114 |
4 files changed, 130 insertions, 0 deletions
diff --git a/fbench/src/splitfile/.gitignore b/fbench/src/splitfile/.gitignore new file mode 100644 index 00000000000..681674c8928 --- /dev/null +++ b/fbench/src/splitfile/.gitignore @@ -0,0 +1,5 @@ +.depend +Makefile +splitfile +splitfile.ilk +splitfile.pdb diff --git a/fbench/src/splitfile/CMakeLists.txt b/fbench/src/splitfile/CMakeLists.txt new file mode 100644 index 00000000000..94c8c5681ff --- /dev/null +++ b/fbench/src/splitfile/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(fbench_splitfile_app + SOURCES + splitfile.cpp + OUTPUT_NAME splitfile + INSTALL bin + DEPENDS + fbench_util +) diff --git a/fbench/src/splitfile/description.html b/fbench/src/splitfile/description.html new file mode 100644 index 00000000000..b38cb5e8f65 --- /dev/null +++ b/fbench/src/splitfile/description.html @@ -0,0 +1,2 @@ +<!-- Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --> +Program used to split query url files. diff --git a/fbench/src/splitfile/splitfile.cpp b/fbench/src/splitfile/splitfile.cpp new file mode 100644 index 00000000000..001e6c4ed5c --- /dev/null +++ b/fbench/src/splitfile/splitfile.cpp @@ -0,0 +1,114 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <util/filereader.h> +#include <fstream> +#include <vector> +#include <memory> + +/** + * Split a text file randomly in a number of parts. Process an input + * file (or stdin) line by line, writing each line out to a randomly + * chosen output file. The output files are numbered using a counter + * and a filename pattern. + **/ + +int +main(int argc, char** argv) +{ + // parameters with default values. + const char *pattern = "query%03d.txt"; + int linebufsize = 10240; + + // parse options and override defaults. + int idx; + char opt; + const char *arg; + bool optError; + + idx = 1; + optError = false; + while((opt = GetOpt(argc, argv, "p:m:", arg, idx)) != -1) { + switch(opt) { + case 'p': + pattern = arg; + break; + case 'm': + linebufsize = atoi(arg); + if (linebufsize < 10240) { + linebufsize = 10240; + } + break; + default: + optError = true; + break; + } + } + + if (argc < (idx + 1) || argc > (idx + 2) || optError) { + printf("usage: splitfile [-p pattern] [-m maxLineSize] <numparts> [<file>]\n\n"); + printf(" -p pattern : output name pattern ['query%%03d.txt']\n"); + printf(" -m <num> : max line size for input/output lines.\n"); + printf(" Can not be less than the default [10240]\n"); + printf(" <numparts> : number of output files to generate.\n\n"); + printf("Reads from <file> (stdin if <file> is not given) and\n"); + printf("randomly distributes each line between <numpart> output\n"); + printf("files. The names of the output files are generated by\n"); + printf("combining the <pattern> with sequential numbers using\n"); + printf("the sprintf function.\n"); + return -1; + } + + int outcnt = atoi(argv[idx]); + if (outcnt < 1) { + printf("too few output files!\n"); + return -1; + } + + int i; + int res; + std::vector<char> linebuf(linebufsize); + char filename[1024]; + std::unique_ptr<FileReader> input = std::make_unique<FileReader>(); + std::vector<std::unique_ptr<std::ostream>> output; + + if (argc > (idx + 1)) { + if (!input->Open(argv[idx + 1])) { + printf("could not open input file!\n"); + return -1; + } + } else { + if (!input->OpenStdin()) { + printf("could not open stdin! (strange)\n"); + return -1; + } + } + + // open output files + output.reserve(outcnt); + for (i = 0; i < outcnt; i++) { + snprintf(filename, 1024, pattern, i); + output.emplace_back(std::make_unique<std::ofstream>(filename, std::ofstream::out | std::ofstream::binary | std::ofstream::trunc)); + if (! output.back()) { + printf("could not open output file: %s\n", filename); + input->Close(); + return -1; + } + } + + // split file + while ((res = input->ReadLine(&linebuf[0], linebufsize - 1)) >= 0) { + if (res < linebufsize - 1) { + linebuf[res] = '\n'; + linebuf[res + 1] = '\0'; // just in case + i = random() % outcnt; + if (!output[i]->write(&linebuf[0], res + 1)) { + printf("error writing to file '%d'\n", i); + } + } else { + printf("line too long, skipping...\n"); + } + } + + // close files + input->Close(); + return 0; +} |