summary refs log tree commit diff stats
path: root/fbench/src/splitfile
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fbench/src/splitfile
Publish
Diffstat (limited to 'fbench/src/splitfile')
-rw-r--r-- fbench/src/splitfile/.gitignore 5
-rw-r--r-- fbench/src/splitfile/CMakeLists.txt 9
-rw-r--r-- fbench/src/splitfile/description.html 2
-rw-r--r-- fbench/src/splitfile/splitfile.cpp 114
4 files changed, 130 insertions, 0 deletions
diff --git a/fbench/src/splitfile/.gitignore b/fbench/src/splitfile/.gitignore
new file mode 100644
index 00000000000..681674c8928
--- /dev/null
+++ b/fbench/src/splitfile/.gitignore
@@ -0,0 +1,5 @@
+.depend
+Makefile
+splitfile
+splitfile.ilk
+splitfile.pdb
diff --git a/fbench/src/splitfile/CMakeLists.txt b/fbench/src/splitfile/CMakeLists.txt
new file mode 100644
index 00000000000..94c8c5681ff
--- /dev/null
+++ b/fbench/src/splitfile/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(fbench_splitfile_app # executable built from splitfile.cpp
+ SOURCES
+ splitfile.cpp
+ OUTPUT_NAME splitfile # presumably the name of the produced binary - confirm against vespa_add_executable
+ INSTALL bin
+ DEPENDS
+ fbench_util # presumably provides util/filereader.h included by splitfile.cpp - TODO confirm
+)
diff --git a/fbench/src/splitfile/description.html b/fbench/src/splitfile/description.html
new file mode 100644
index 00000000000..b38cb5e8f65
--- /dev/null
+++ b/fbench/src/splitfile/description.html
@@ -0,0 +1,2 @@
+<!-- Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+Program used to split query url files.
diff --git a/fbench/src/splitfile/splitfile.cpp b/fbench/src/splitfile/splitfile.cpp
new file mode 100644
index 00000000000..001e6c4ed5c
--- /dev/null
+++ b/fbench/src/splitfile/splitfile.cpp
@@ -0,0 +1,114 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <util/filereader.h>
+#include <fstream>
+#include <vector>
+#include <memory>
+
+/**
+ * Split a text file randomly into a number of parts. Process an input
+ * file (or stdin) line by line, writing each line out to a randomly
+ * chosen output file. The output files are named by formatting a
+ * filename pattern with a sequential counter.
+ **/
+
+int
+main(int argc, char** argv)
+{
+    // Returns 0 on success and -1 on usage, argument or file-open errors.
+    const char *pattern = "query%03d.txt"; // default output name pattern (-p)
+    int linebufsize = 10240;               // default line buffer size (-m)
+
+    // Parse options and override defaults.
+    int idx = 1;
+    char opt;
+    const char *arg;
+    bool optError = false;
+
+    // Compare with static_cast<char>(-1), not the int -1: where plain char is
+    // unsigned, 'opt != -1' would always be true and the loop would never end.
+    while ((opt = GetOpt(argc, argv, "p:m:", arg, idx)) != static_cast<char>(-1)) {
+        switch (opt) {
+        case 'p':
+            pattern = arg;
+            break;
+        case 'm':
+            linebufsize = atoi(arg);
+            if (linebufsize < 10240) {
+                linebufsize = 10240; // never shrink below the default
+            }
+            break;
+        default:
+            optError = true;
+            break;
+        }
+    }
+
+    if (argc < (idx + 1) || argc > (idx + 2) || optError) {
+        printf("usage: splitfile [-p pattern] [-m maxLineSize] <numparts> [<file>]\n\n");
+        printf(" -p pattern : output name pattern ['query%%03d.txt']\n");
+        printf(" -m <num> : max line size for input/output lines.\n");
+        printf(" Can not be less than the default [10240]\n");
+        printf(" <numparts> : number of output files to generate.\n\n");
+        printf("Reads from <file> (stdin if <file> is not given) and\n");
+        printf("randomly distributes each line between <numpart> output\n");
+        printf("files. The names of the output files are generated by\n");
+        printf("combining the <pattern> with sequential numbers using\n");
+        printf("the sprintf function.\n");
+        return -1;
+    }
+
+    const int outcnt = atoi(argv[idx]);
+    if (outcnt < 1) {
+        printf("too few output files!\n");
+        return -1;
+    }
+
+    std::vector<char> linebuf(linebufsize);
+    char filename[1024];
+    std::unique_ptr<FileReader> input = std::make_unique<FileReader>();
+    std::vector<std::unique_ptr<std::ostream>> output;
+
+    // Read from the named file when one is given, otherwise from stdin.
+    if (argc > (idx + 1)) {
+        if (!input->Open(argv[idx + 1])) {
+            printf("could not open input file!\n");
+            return -1;
+        }
+    } else {
+        if (!input->OpenStdin()) {
+            printf("could not open stdin! (strange)\n");
+            return -1;
+        }
+    }
+
+    // Open output files, named by formatting the pattern with the file index.
+    output.reserve(outcnt);
+    for (int i = 0; i < outcnt; i++) {
+        snprintf(filename, sizeof(filename), pattern, i);
+        output.emplace_back(std::make_unique<std::ofstream>(filename, std::ofstream::out | std::ofstream::binary | std::ofstream::trunc));
+        if (!*output.back()) { // test the stream state; the pointer is never null
+            printf("could not open output file: %s\n", filename);
+            input->Close();
+            return -1;
+        }
+    }
+
+    // Split file: each line that fits the buffer goes to a random output.
+    int res;
+    while ((res = input->ReadLine(&linebuf[0], linebufsize - 1)) >= 0) {
+        if (res < linebufsize - 1) {
+            linebuf[res] = '\n';
+            linebuf[res + 1] = '\0'; // just in case
+            const int i = random() % outcnt;
+            if (!output[i]->write(&linebuf[0], res + 1)) {
+                printf("error writing to file '%d'\n", i);
+            }
+        } else {
+            printf("line too long, skipping...\n");
+        }
+    }
+
+    // Close the input; output streams are flushed and closed by their destructors.
+    input->Close();
+    return 0;
+}