diff options
author | tmartins <tmartins@yahoo-inc.com> | 2016-09-28 22:39:15 +0200 |
---|---|---|
committer | tmartins <tmartins@yahoo-inc.com> | 2016-09-28 22:39:15 +0200 |
commit | 5ec8b6dc1cd6b70605d10bf0e4ba46eec5c68563 (patch) | |
tree | c578563acee45b9a0e8af7f8136630a0d19f1b31 /sample-apps | |
parent | f036cfef92a934e6dc6edb06701e0ae0ed34598a (diff) |
R script to generate training dataset
Diffstat (limited to 'sample-apps')
-rw-r--r-- | sample-apps/blog-tutorial-shared/src/R/generateDataset.R | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R new file mode 100644 index 00000000000..b410ad4094c --- /dev/null +++ b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R @@ -0,0 +1,56 @@ +library(jsonlite) +library(dplyr) + +file_path_document <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/user_item_cf_cv/product.json' +file_path_user <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/user_item_cf_cv/user.json' +file_path_train <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/training_and_test_indices/train.txt' +output_file <- '/Users/tmartins/projects/yahoo/sw/vespa-examples/blog-recommendation-support/data/blog-job/nn_model/training_set.txt' + +# get ids from documents that have a latent vector +lines <- readLines(file_path_document) +product_ids <- NULL +for (line in lines){ + product_ids <- c(product_ids, fromJSON(txt=line)$post_id) +} + +# get ids from users that have a latent vector +lines <- readLines(file_path_user) +user_ids <- NULL +for (line in lines){ + user_ids <- c(user_ids, fromJSON(txt=line)$user_id) +} + +# read (product, user) ids used for training +train_ids <- read.delim(file = file_path_train, header = FALSE, stringsAsFactors = FALSE) +colnames(train_ids) <- c("product_id", "user_id") + +# filter out product id and user id that does not have latent vectors +temp <- merge(x = train_ids, y = data.frame(product_id = product_ids)) +final_positive_train_ids <- merge(x = temp, y = data.frame(user_id = user_ids)) + +# add positive labels +final_positive_train_ids <- data.frame(final_positive_train_ids, label = 1) + +# add noise to the data +clicks_per_user <- final_positive_train_ids %>% group_by(user_id) %>% summarise(number_clicks = sum(label)) + +unread_proportion <- 10 +unread_products <- matrix(NA, unread_proportion*sum(clicks_per_user$number_clicks), 3) +colnames(unread_products) <- c("user_id", "product_id", "label") +count <- 0 +for (i in 1:nrow(clicks_per_user)){ + print(paste(i, "/ ", nrow(clicks_per_user))) + number_itens <- unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"]) + row_index <- count + 1:number_itens + count <- count + number_itens + user_id <- clicks_per_user[i, "user_id"] + new_samples <- sample(x = product_ids, size = unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"]), replace = FALSE) + unread_products[row_index, ] <- matrix(c(rep(as.numeric(user_id), number_itens), new_samples, rep(0, number_itens)), ncol = 3) +} + +# create final dataset +final_train_ids <- rbind(final_positive_train_ids, data.frame(unread_products)) +duplicated_rows <- duplicated(x = final_train_ids[, c("user_id", "product_id")]) +final_train_ids <- final_train_ids[!duplicated_rows, ] + +write.table(x = final_train_ids, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE) |