diff options
author | Lester Solbakken <lesters@yahoo-inc.com> | 2017-07-04 16:37:30 +0200 |
---|---|---|
committer | Lester Solbakken <lesters@yahoo-inc.com> | 2017-07-04 16:37:30 +0200 |
commit | 57e2d94aee61a61470eed2c736aefb4edcce5f2d (patch) | |
tree | 2447f1b6ebe328bbe64cd9840de2c3fcc1e746e1 /sample-apps/blog-tutorial-shared | |
parent | 2672bc5c10f2fef93d2bf1d2e267a9a6d255c9db (diff) |
Remove local file paths in blog sample app shared code
Diffstat (limited to 'sample-apps/blog-tutorial-shared')
-rw-r--r-- | sample-apps/blog-tutorial-shared/README.md | 39 | ||||
-rw-r--r-- | sample-apps/blog-tutorial-shared/src/R/generateDataset.R | 8 | ||||
-rwxr-xr-x | sample-apps/blog-tutorial-shared/src/python/vespaModel.py | 81 |
3 files changed, 61 insertions, 67 deletions
diff --git a/sample-apps/blog-tutorial-shared/README.md b/sample-apps/blog-tutorial-shared/README.md index 846156908c3..bdbf3b2fede 100644 --- a/sample-apps/blog-tutorial-shared/README.md +++ b/sample-apps/blog-tutorial-shared/README.md @@ -9,7 +9,7 @@ Parses JSON from the file trainPosts.json downloaded from Kaggle during the [blog search tutorial](https://git.corp.yahoo.com/pages/vespa/documentation/documentation/tutorials/blog-search.html) and format it according to Vespa Document JSON format. $ python parse.py -p trainPosts.json > somefile.json - + Give it the flag "-p" or "--popularity", and the script also calculates and adds the field `popularity`, as introduced [in the tutorial](https://git.corp.yahoo.com/pages/vespa/documentation/documentation/tutorials/blog-search.html#blog-popularity-signal). ## Vespa Tutorial pt. 2 @@ -24,23 +24,18 @@ Give it the flag "-p" or "--popularity", and the script also calculates and adds ## Vespa Tutorial pt.3 -Pre-computed data used through out the tutorial can be found [here](http://trdstorage.trondheim.corp.yahoo.com/~tmartins/vespa_tutorial_data/). - -You can download ```vespa_tutorial_data.tar.gz``` (144MB) and decompress it with - - $ wget http://trdstorage.trondheim.corp.yahoo.com/~tmartins/vespa_tutorial_data.tar.gz - $ tar -xvzf vespa_tutorial_data.tar.gz +Pre-computed data used throughout the tutorial will be made available shortly. ### Create Training Dataset - $ ./generateDataset.R -d vespa_tutorial_data/user_item_cf_cv/product.json \ - -u vespa_tutorial_data/user_item_cf_cv/user.json \ - -t vespa_tutorial_data/training_and_test_indices/train.txt \ - -o vespa_tutorial_data/nn_model/training_set.txt + $ ./generateDataset.R -d blog_job/user_item_cf_cv/product.json \ + -u blog_job/user_item_cf_cv/user.json \ + -t blog_job/training_and_test_indices/train.txt \ + -o blog_job/nn_model/training_set.txt ### Train model with TensorFlow -Train the model with +Train the model with $ python vespaModel.py --product_features_file_path vespa_tutorial_data/user_item_cf_cv/product.json \ --user_features_file_path vespa_tutorial_data/user_item_cf_cv/user.json \ @@ -49,21 +44,21 @@ Train the model with Model parameters and summary statistics will be saved at folder ```runs/${start_time}``` with ```${start_time}``` representing the time you started to train the model. Visualize the accuracy and loss metrics with - + $ tensorboard --logdir runs/1473845959/summaries/ **Note**: The folder ```1473845959``` depends on the time you start to train the model and will be different in your case. -### Export model parameters to Tensor Vespa format +### Export model parameters to Tensor Vespa format -```checkpoint_dir``` holds the folder that TensorFlow writes the learned model parameters (stored using protobuf) and ```output_dir``` is the folder that we will output the model parameters in +```checkpoint_dir``` holds the folder that TensorFlow writes the learned model parameters (stored using protobuf) and ```output_dir``` is the folder that we will output the model parameters in Vespa Tensor format. import vespaModel checkpoint_dir = "./runs/1473845959/checkpoints" output_dir = "application_package/constants" - + serializer = serializeVespaModel(checkpoint_dir, output_dir) serializer.serialize_to_disk(variable_name = "W_hidden", dimension_names = ['input', 'hidden']) serializer.serialize_to_disk(variable_name = "b_hidden", dimension_names = ['hidden']) @@ -74,23 +69,23 @@ The python code containing the class ```serializeVespaModel``` can be found at: ### Offline evaluation -Query Vespa using the rank-profile ```tensor``` for users in the test set and return 100 blog post recommendations. Use those recommendations in the information contained in the test set to compute -metrics defined in the Tutorial pt. 2. +Query Vespa using the rank-profile ```tensor``` for users in the test set and return 100 blog post recommendations. Use those recommendations in the information contained in the test set to compute +metrics defined in the Tutorial pt. 2. pig -x local -f tutorial_compute_metric.pig \ -param VESPA_HADOOP_JAR=vespa-hadoop.jar \ - -param TEST_INDICES=blog-job/training_and_test_indices/testing_set_ids \ + -param TEST_INDICES=blog-job/training_and_test_indices/testing_set_ids \ -param ENDPOINT=$(hostname):8080 -param NUMBER_RECOMMENDATIONS=100 - -param RANKING_NAME=tensor + -param RANKING_NAME=tensor -param OUTPUT=blog-job/cf-metric Repeat the process, but now using the rank-profile ```nn_tensor```. pig -x local -f tutorial_compute_metric.pig \ -param VESPA_HADOOP_JAR=vespa-hadoop.jar \ - -param TEST_INDICES=blog-job/training_and_test_indices/testing_set_ids \ + -param TEST_INDICES=blog-job/training_and_test_indices/testing_set_ids \ -param ENDPOINT=$(hostname):8080 -param NUMBER_RECOMMENDATIONS=100 -param RANKING_NAME=nn_tensor - -param OUTPUT=blog-job/cf-metric
\ No newline at end of file + -param OUTPUT=blog-job/cf-metric diff --git a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R index 461a75c6506..d69cd5ba825 100644 --- a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R +++ b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R @@ -2,10 +2,10 @@ library(jsonlite) library(dplyr) -file_path_document <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/user_item_cf_cv/product.json' -file_path_user <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/user_item_cf_cv/user.json' -file_path_train <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/training_and_test_indices/train.txt' -output_file <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/nn_model/training_set.txt' +file_path_document <- 'blog-job/user_item_cf_cv/product.json' +file_path_user <- 'blog-job/user_item_cf_cv/user.json' +file_path_train <- 'blog-job/training_and_test_indices/train.txt' +output_file <- 'blog-job/nn_model/training_set.txt' # get ids from documents that have a latent vector lines <- readLines(file_path_document) diff --git a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py b/sample-apps/blog-tutorial-shared/src/python/vespaModel.py index 7f2a0c06014..5d3bf1eceb7 100755 --- a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py +++ b/sample-apps/blog-tutorial-shared/src/python/vespaModel.py @@ -1,8 +1,7 @@ -#! /Users/tmartins/anaconda/envs/tensorflow/bin/python # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. """ -Train a 2 layers neural network to compute the probability of a user +Train a 2 layers neural network to compute the probability of a user represented by the vector u liking a document represented by the vector d. Usage: ./vespaModel.py --product_features_file_path path \ @@ -13,30 +12,30 @@ Expected File formats: - product_features_file_path contains a file with rows following the JSON format below: -{"post_id" : 20, - "user_item_cf" : {"user_item_cf:5" : -0.66617566, - "user_item_cf:6" : 0.29197264, - "user_item_cf:1" : -0.15582734, - "user_item_cf:7" : 0.3350679, - "user_item_cf:2" : -0.16676047, - "user_item_cf:9" : -0.31653953, - "user_item_cf:3" : -0.21495385, - "user_item_cf:4" : -0.036676258, - "user_item_cf:8" : 0.122069225, +{"post_id" : 20, + "user_item_cf" : {"user_item_cf:5" : -0.66617566, + "user_item_cf:6" : 0.29197264, + "user_item_cf:1" : -0.15582734, + "user_item_cf:7" : 0.3350679, + "user_item_cf:2" : -0.16676047, + "user_item_cf:9" : -0.31653953, + "user_item_cf:3" : -0.21495385, + "user_item_cf:4" : -0.036676258, + "user_item_cf:8" : 0.122069225, "user_item_cf:0" : 0.20922394}} - user_features_file_path contains a file with rows following the JSON format below: -{"user_id" : 270, - "user_item_cf" : {"user_item_cf:5" : -0.54011273, - "user_item_cf:6" : 0.2723072, - "user_item_cf:1" : -0.23280832, - "user_item_cf:7" : -0.011183357, - "user_item_cf:2" : -0.3987285, - "user_item_cf:9" : -0.05703937, - "user_item_cf:3" : 0.04699418, - "user_item_cf:4" : 0.06679048, - "user_item_cf:8" : 0.31399783, +{"user_id" : 270, + "user_item_cf" : {"user_item_cf:5" : -0.54011273, + "user_item_cf:6" : 0.2723072, + "user_item_cf:1" : -0.23280832, + "user_item_cf:7" : -0.011183357, + "user_item_cf:2" : -0.3987285, + "user_item_cf:9" : -0.05703937, + "user_item_cf:3" : 0.04699418, + "user_item_cf:4" : 0.06679048, + "user_item_cf:8" : 0.31399783, "user_item_cf:0" : 0.5000366}} - dataset_file_path contains a file with rows containing tab-separated post_id, user_id, label such as the sample below: @@ -70,7 +69,7 @@ class getData: indexes = ['user_item_cf:' + str(x) for x in range(0,10,1)] values = [json['user_item_cf'][x] for x in indexes] return [id, values] - + def get_product_features_lookup(self): product_features = [self.parse_cf_features(json.loads(line), 'post_id') for line in open(self.product_features_file_path)] return dict(product_features) @@ -108,15 +107,15 @@ class getData: input_u_shuffled = input_u[shuffle_indices] input_d_shuffled = input_d[shuffle_indices] input_y_shuffled = input_y[shuffle_indices] - + # Split train/test set dev_samples = int(len(input_u_shuffled)*perc) u_train, u_dev = input_u_shuffled[:-dev_samples], input_u_shuffled[-dev_samples:] d_train, d_dev = input_d_shuffled[:-dev_samples], input_d_shuffled[-dev_samples:] y_train, y_dev = input_y_shuffled[:-dev_samples], input_y_shuffled[-dev_samples:] print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) - - return u_train, u_dev, d_train, d_dev, y_train, y_dev + + return u_train, u_dev, d_train, d_dev, y_train, y_dev def batch_iter(self, data, batch_size, num_epochs, shuffle=True): """ @@ -140,7 +139,7 @@ class getData: class vespaRunTimeModel: """ Model that combine user and document features and needs to be evaluated at query time. - """ + """ def __init__(self, user_feature_length, doc_feature_length, hidden_length): # placeholders @@ -153,11 +152,11 @@ class vespaRunTimeModel: # hidden layer self.W_hidden = tf.Variable( - tf.truncated_normal([user_feature_length + + tf.truncated_normal([user_feature_length + doc_feature_length, hidden_length], stddev=0.1), name = 'W_hidden') self.b_hidden = tf.Variable(tf.constant(0.1, shape=[hidden_length]), name = 'b_hidden') - self.hidden_layer = tf.nn.relu(tf.matmul(self.input_concat, self.W_hidden) + self.b_hidden, + self.hidden_layer = tf.nn.relu(tf.matmul(self.input_concat, self.W_hidden) + self.b_hidden, name = 'hidden_layer') # output layer @@ -237,10 +236,10 @@ class serializeVespaModel: """ Serialize TensorFlow variables to Vespa JSON format - Example: + Example: checkpoint_dir = "./runs/1473845959/checkpoints" output_dir = "./runs/1473845959/vespa_variables" - + serializer = serializeVespaModel(checkpoint_dir, output_dir) serializer.serialize_to_disk(variable_name = "W_hidden", dimension_names = ['input', 'hidden']) serializer.serialize_to_disk(variable_name = "b_hidden", dimension_names = ['hidden']) @@ -262,7 +261,7 @@ class serializeVespaModel: for element in variable: dimension_address.append((dimension_names[0], str(count))) count += 1 - cells.append({ 'address': dict(dimension_address), "value": float(element) }) + cells.append({ 'address': dict(dimension_address), "value": float(element) }) return cells else: count = 0 @@ -277,7 +276,7 @@ class serializeVespaModel: variable = self.reader.get_tensor(variable_name) cells = self.write_cell_value(variable, dimension_names) return json.dumps({'cells': cells}) - + def serialize_to_disk(self, variable_name, dimension_names): text_file = open(os.path.join(output_dir, variable_name + ".json"), "w") text_file.write(serializer.write_to_vespa_json_format(variable_name, dimension_names)) @@ -285,7 +284,7 @@ class serializeVespaModel: def task_train(): - # Data + # Data tf.flags.DEFINE_string("product_features_file_path", '', "File containing product features") tf.flags.DEFINE_string("user_features_file_path", '', "File containing user features") tf.flags.DEFINE_string("dataset_file_path", '', "File containing labels for each document user pair") @@ -316,13 +315,13 @@ def task_train(): FLAGS.product_features_file_path, FLAGS.user_features_file_path, FLAGS.dataset_file_path) - + input_u, input_d, input_y = data_pre_processing.prepare_dataset() u_train, u_dev, d_train, d_dev, y_train, y_dev = data_pre_processing.create_train_test_sets(input_u, input_d, input_y, seed = 10, perc = 0.2) user_feature_length = input_u.shape[1] doc_feature_length = input_d.shape[1] - + # Create a graph with tf.Graph().as_default(): @@ -332,11 +331,11 @@ def task_train(): allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) - with sess.as_default(): + with sess.as_default(): # instanciate a model - vespa_model = vespaRunTimeModel(user_feature_length = user_feature_length, - doc_feature_length = doc_feature_length, + vespa_model = vespaRunTimeModel(user_feature_length = user_feature_length, + doc_feature_length = doc_feature_length, hidden_length = FLAGS.hidden_length_factor * (user_feature_length + doc_feature_length)) # create a train operation @@ -344,7 +343,7 @@ def task_train(): # Summaries for loss and accuracy train_summary_op, dev_summary_op = vespa_model.summary_oprations() - + # Output directory for models and summaries out_dir = vespa_model.create_output_dir() @@ -380,7 +379,7 @@ def task_train(): print("") if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess, checkpoint_prefix, global_step=current_step) - print("Saved model checkpoint to {}\n".format(path)) + print("Saved model checkpoint to {}\n".format(path)) if __name__ == "__main__": |