diff options
author | Kristian Aune <kraune@yahoo-inc.com> | 2017-08-18 09:14:46 +0200 |
---|---|---|
committer | Kristian Aune <kraune@yahoo-inc.com> | 2017-08-18 09:14:46 +0200 |
commit | e36ce286e3cc2026fe0ccbbf0e116477b23c70e1 (patch) | |
tree | 3d3dadc8869629d7ebc7d019a400fc641a072c4c /sample-apps/blog-tutorial-shared/src | |
parent | a618787ba847eed7c30f6bd81256db8d39e5e2c4 (diff) |
Sample apps moved to vespa-engine/sample-apps
Diffstat (limited to 'sample-apps/blog-tutorial-shared/src')
22 files changed, 0 insertions, 1554 deletions
diff --git a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R deleted file mode 100644 index d69cd5ba825..00000000000 --- a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -library(jsonlite) -library(dplyr) - -file_path_document <- 'blog-job/user_item_cf_cv/product.json' -file_path_user <- 'blog-job/user_item_cf_cv/user.json' -file_path_train <- 'blog-job/training_and_test_indices/train.txt' -output_file <- 'blog-job/nn_model/training_set.txt' - -# get ids from documents that have a latent vector -lines <- readLines(file_path_document) -product_ids <- NULL -for (line in lines){ - product_ids <- c(product_ids, fromJSON(txt=line)$post_id) -} - -# get ids from users that have a latent vector -lines <- readLines(file_path_user) -user_ids <- NULL -for (line in lines){ - user_ids <- c(user_ids, fromJSON(txt=line)$user_id) -} - -# read (product, user) ids used for training -train_ids <- read.delim(file = file_path_train, header = FALSE, stringsAsFactors = FALSE) -colnames(train_ids) <- c("product_id", "user_id") - -# filter out product id and user id that does not have latent vectors -temp <- merge(x = train_ids, y = data.frame(product_id = product_ids)) -final_positive_train_ids <- merge(x = temp, y = data.frame(user_id = user_ids)) - -# add positive labels -final_positive_train_ids <- data.frame(final_positive_train_ids, label = 1) - -# add noise to the data -clicks_per_user <- final_positive_train_ids %>% group_by(user_id) %>% summarise(number_clicks = sum(label)) - -unread_proportion <- 10 -unread_products <- matrix(NA, unread_proportion*sum(clicks_per_user$number_clicks), 3) -colnames(unread_products) <- c("user_id", "product_id", "label") -count <- 0 -for (i in 1:nrow(clicks_per_user)){ - print(paste(i, "/ ", nrow(clicks_per_user))) - number_itens <- unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"]) - row_index <- count + 1:number_itens - count <- count + number_itens - user_id <- clicks_per_user[i, "user_id"] - new_samples <- sample(x = product_ids, size = unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"]), replace = FALSE) - unread_products[row_index, ] <- matrix(c(rep(as.numeric(user_id), number_itens), new_samples, rep(0, number_itens)), ncol = 3) -} - -# create final dataset -final_train_ids <- rbind(final_positive_train_ids, data.frame(unread_products)) -duplicated_rows <- duplicated(x = final_train_ids[, c("user_id", "product_id")]) -final_train_ids <- final_train_ids[!duplicated_rows, ] - -write.table(x = final_train_ids, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/sample-apps/blog-tutorial-shared/src/__init__.py b/sample-apps/blog-tutorial-shared/src/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/__init__.py +++ /dev/null diff --git a/sample-apps/blog-tutorial-shared/src/main/__init__.py b/sample-apps/blog-tutorial-shared/src/main/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/__init__.py +++ /dev/null diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig deleted file mode 100644 index 61ca9bc7cb2..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig +++ /dev/null @@ -1,43 +0,0 @@ --- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -REGISTER $VESPA_HADOOP_JAR - -DEFINE BlogPostRecommendations - com.yahoo.vespa.hadoop.pig.VespaQuery( - 'query=http://$ENDPOINT/search/?user_id=<user_id>&hits=$NUMBER_RECOMMENDATIONS&ranking=$RANKING_NAME', - 'schema=rank:int,id:chararray,relevance:double,fields/post_id:chararray' - ); - --- Load test_set data from a local file -test_indices = LOAD '$TEST_INDICES' AS (post_id:chararray, user_id:chararray); -users = FOREACH test_indices GENERATE user_id; -users = FILTER users BY user_id IS NOT null; -users = DISTINCT users; - --- Run a set of queries against Vespa -recommendations = FOREACH users GENERATE user_id, - FLATTEN(BlogPostRecommendations(*)) AS (rank, id, relevance, post_id); -recommendations = FOREACH recommendations GENERATE user_id, rank, post_id; -recommendations = FILTER recommendations BY rank IS NOT NULL AND post_id IS NOT NULL; - --- join data -joined_data = JOIN test_indices BY (post_id, user_id), recommendations BY (post_id, user_id); -joined_data = FOREACH joined_data GENERATE - test_indices::post_id AS post_id, - test_indices::user_id AS user_id, - rank; - --- transform and add a column -joined_data = FOREACH joined_data - GENERATE post_id, - user_id, - rank, - (double)rank/(double)$NUMBER_RECOMMENDATIONS AS percentile; - -grouped_data = GROUP joined_data BY user_id; -grouped_data = FOREACH grouped_data - GENERATE group AS user_id, - SUM(joined_data.percentile) AS sum_percentile, - COUNT(joined_data.post_id) AS number_read, - (double)SUM(joined_data.percentile)/(double)COUNT(joined_data.post_id) AS expected_percentile; - -STORE grouped_data INTO '$OUTPUT'; diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig deleted file mode 100644 index 50152318d04..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig +++ /dev/null @@ -1,135 +0,0 @@ --- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -REGISTER '$VESPA_HADOOP_JAR' - --- Create valid Vespa put operations -DEFINE VespaPutOperationDoc - com.yahoo.vespa.hadoop.pig.VespaDocumentOperation( - 'operation=put', - 'docid=id:blog-recommendation:blog_post::<post_id>', - 'create-tensor-fields=user_item_cf', - 'simple-array-fields=tags,categories' - ); - -DEFINE VespaPutOperationUser - com.yahoo.vespa.hadoop.pig.VespaDocumentOperation( - 'operation=put', - 'docid=id:blog-recommendation:user::<user_id>', - 'create-tensor-fields=user_item_cf', - 'simple-array-fields=has_read_items' - ); - --- Transform tabular data to a Vespa document operation JSON format -DEFINE VespaStorage - com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data -data = LOAD '$DATA_PATH' USING - JsonLoader('date_gmt:chararray, - language:chararray, - author:chararray, - url:chararray, - title:chararray, - blog:chararray, - post_id:chararray, - tags:{T:(tag_name:chararray)}, - blogname:chararray, - date:chararray, - content:chararray, - categories:{T:(category_name:chararray)}, - likes:{T:(dt:chararray, uid:chararray)}'); - -data_for_feed = FOREACH data GENERATE - date_gmt, - language, - author, - url, - title, - blog, - post_id, - tags, - blogname, - content, - categories; - --- Feed only blog posts that belong to test set -test_indices = LOAD '$TEST_INDICES' AS (post_id, user_id); -test_indices = FOREACH test_indices GENERATE post_id; -test_indices = DISTINCT test_indices; - -test_data_for_feed = FOREACH (JOIN data_for_feed BY post_id, test_indices BY post_id) - GENERATE date_gmt AS date_gmt, - language AS language, - author AS author, - url AS url, - title AS title, - blog AS blog, - data_for_feed::post_id AS post_id, - tags AS tags, - blogname AS blogname, - content AS content, - categories AS categories; - --- Load Blog post CF latent factors -data_doc = LOAD '$BLOG_POST_FACTORS' USING - JsonLoader('post_id:chararray, - user_item_cf:[double]'); - --- Join data and latent factors -data_content_and_doc_tensor = JOIN test_data_for_feed BY post_id LEFT, data_doc BY post_id; -data_content_and_doc_tensor = FOREACH data_content_and_doc_tensor GENERATE - date_gmt AS date_gmt, - language AS language, - author AS author, - url AS url, - title AS title, - blog AS blog, - test_data_for_feed::post_id as post_id, - tags AS tags, - blogname AS blogname, - content AS content, - categories AS categories, - user_item_cf AS user_item_cf, - (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf; - --- Generate valid Vespa JSON format -data_content_and_doc_tensor_feed = FOREACH data_content_and_doc_tensor GENERATE VespaPutOperationDoc(*); - --- Load User CF latent factors -data_user = LOAD '$USER_FACTORS' USING - JsonLoader('user_id:chararray, - user_item_cf:[double]'); -data_user = FOREACH data_user GENERATE - user_id AS user_id, - user_item_cf AS user_item_cf; - --- Articles already liked -data_likes = FOREACH data GENERATE post_id, FLATTEN(likes) AS (dt, uid); - -post_liked_per_user = GROUP data_likes BY uid; -post_liked_per_user = FOREACH post_liked_per_user GENERATE - group AS user_id, - data_likes.post_id AS has_read_items; - --- Join user data -data_user = JOIN post_liked_per_user BY user_id FULL, - data_user BY user_id; - -data_user = FOREACH data_user GENERATE - (post_liked_per_user::user_id IS NOT NULL ? post_liked_per_user::user_id : data_user::user_id) AS user_id, - user_item_cf AS user_item_cf, - (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf, - has_read_items AS has_read_items; - -data_user = FILTER data_user BY user_id IS NOT NULL; - --- Generate valid Vespa JSON format -data_user_for_feed = FOREACH data_user GENERATE VespaPutOperationUser(*); - -joint_content_tensors = UNION data_content_and_doc_tensor_feed, data_user_for_feed; - --- Store into Vespa -STORE joint_content_tensors INTO '$ENDPOINT' USING VespaStorage(); - - - - diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig deleted file mode 100644 index 62b8a676cfc..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig +++ /dev/null @@ -1,51 +0,0 @@ --- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -REGISTER '$VESPA_HADOOP_JAR' - --- UDF to create valid Vespa document operation in JSON format -DEFINE VespaPutOperationDoc - com.yahoo.vespa.hadoop.pig.VespaDocumentOperation( - 'operation=put', - 'docid=id:blog-search:blog_post::<post_id>', - 'simple-array-fields=tags,categories' - ); - --- UDF to send data to a Vespa endpoint -DEFINE VespaStorage - com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data from any source - here we load using JsonLoader -data = LOAD '$DATA_PATH' USING - JsonLoader('date_gmt:chararray, - language:chararray, - author:chararray, - url:chararray, - title:chararray, - blog:chararray, - post_id:chararray, - tags:{T:(tag_name:chararray)}, - blogname:chararray, - date:chararray, - content:chararray, - categories:{T:(category_name:chararray)}, - likes:{T:(dt:chararray, uid:chararray)}'); - --- Select fields that will be sent to Vespa. --- This should follow blog_post.sd -data_for_feed = FOREACH data GENERATE - date_gmt, - language, - author, - url, - title, - blog, - post_id, - tags, - blogname, - content, - categories; - --- Create valid Vespa put operations in JSON format -data_for_feed_json = FOREACH data_for_feed GENERATE VespaPutOperationDoc(*); - --- Store into Vespa -STORE data_for_feed_json INTO '$ENDPOINT' USING VespaStorage(); diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig deleted file mode 100644 index dcaffabba2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig +++ /dev/null @@ -1,22 +0,0 @@ --- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -REGISTER $VESPA_HADOOP_JAR - -DEFINE BlogPostRecommendations - com.yahoo.vespa.hadoop.pig.VespaQuery( - 'query=http://$ENDPOINT/search/?user_id=<user_id>&hits=$NUMBER_RECOMMENDATIONS&ranking=$RANKING_NAME', - 'schema=rank:int,id:chararray,relevance:double,fields/post_id:chararray' - ); - --- Load test_set data from a local file -test_indices = LOAD '$TEST_INDICES' AS (post_id:chararray, user_id:chararray); -users = FOREACH test_indices GENERATE user_id; -users = FILTER users BY user_id IS NOT null; -users = DISTINCT users; - --- Run a set of queries against Vespa -recommendations = FOREACH users GENERATE user_id, - FLATTEN(BlogPostRecommendations(*)) AS (rank, id, relevance, post_id); -recommendations = FOREACH recommendations GENERATE user_id, rank, post_id; -recommendations = FILTER recommendations BY rank IS NOT NULL AND post_id IS NOT NULL; - -STORE recommendations INTO '$OUTPUT'; diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala deleted file mode 100644 index 30be7da2727..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.apache.spark.sql.SparkSession - -object BlogRecommendationApp { - val usage = """ - Usage: spark-submit \ - | --class "BlogRecommendationApp" \ - | --master local[4] \ - | JAR_FILE - | --task task_command [TASK RELATED OPTIONS] - - spark-submit \ - | --class "BlogRecommendationApp" \ - | --master local[4] \ - | JAR_FILE - | --task collaborative_filtering - | --input_file path - | --rank value - | --numIterations value - | --lambda value - | --output_path path - - spark-submit \ - | --class "BlogRecommendationApp" \ - | --master local[4] \ - | JAR_FILE - | --task collaborative_filtering_cv - | --input_file path - | --numIterations value - | --output_path path - | - - spark-submit \ - | --class "BlogRecommendationApp" \ - | --master local[4] \ - | JAR_FILE - | --task split_set - | --input_file path - | --test_perc_stage1 value - | --test_perc_stage2 value - | --seed value - | --output_path path - """ - - private val COLLABORATIVE_FILTERING = "collaborative_filtering" - private val COLLABORATIVE_FILTERING_CV = "collaborative_filtering_cv" - private val SPLIT_SET_INTO_TRAIN_AND_TEST = "split_set" - - type OptionMap = Map[Symbol, Any] - - def main(args: Array[String]) { - - val options = parseCommandLineOptions(args) - val task_name = options('task).toString - - task_name match { - case COLLABORATIVE_FILTERING => CollaborativeFilteringExample(options) - case COLLABORATIVE_FILTERING_CV => CollaborativeFilteringCV(options) - case SPLIT_SET_INTO_TRAIN_AND_TEST => SplitSetIntoTrainingAndTestSets(options) - } - - } - - private def SplitSetIntoTrainingAndTestSets(options: OptionMap) = { - - val spark = SparkSession - .builder() - .appName("Split Full Data Into Train and Test Sets") - .getOrCreate() - - val splitter = new SplitFullSetIntoTrainAndTestSets(spark) - - val sets = splitter.run(input_file_path = options('input_file).toString, - test_perc_stage1 = options('test_perc_stage1).toString.toDouble, - test_perc_stage2 = options('test_perc_stage2).toString.toDouble, - seed = options('seed).toString.toInt) - - SplitFullSetIntoTrainAndTestSets.writeTrainAndTestSetsIndices(sets, options('output_path).toString) - - } - - private def CollaborativeFilteringExample(options: OptionMap) = { - - // TODO: Check if output_path already exist - - val spark = SparkSession - .builder() - .appName("Collaborative Filtering") - .getOrCreate() - - val cf = new CollaborativeFiltering(spark) - - val model = cf.run( - input_path = options('input_file).toString, - rank = options('rank).toString.toInt, - numIterations = options('num_iterations).toString.toInt, - lambda = options('lambda).toString.toDouble) - - CollaborativeFiltering.writeFeaturesAsVespaTensorText(model, options('output_path).toString) - - } - - private def CollaborativeFilteringCV(options: OptionMap) = { - - // TODO: Check if output_path already exist - - val spark = SparkSession - .builder() - .appName("Collaborative Filtering CV") - .getOrCreate() - - val cf = new CollaborativeFiltering(spark) - - val model = cf.run_pipeline( - input_path = options('input_file).toString, - numIterations = options('num_iterations).toString.toInt) - - CollaborativeFiltering.writeFeaturesAsVespaTensorText(model, options('output_path).toString) - - } - - private def parseCommandLineOptions(args: Array[String]): OptionMap = { - - def findTask(list: List[String]) : String = { - list match { - case Nil => println("Please, define a valid task" + "\n" + usage) - sys.exit(1) - case "--task" :: value :: tail => - value - case option :: tail => findTask(tail) - } - } - - def ParseCollaborativeFilteringOptions(map : OptionMap, list: List[String]) : OptionMap = { - list match { - case Nil => map - case "--input_file" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('input_file -> value.toString), tail) - case "--rank" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('rank -> value.toInt), tail) - case "--numIterations" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('num_iterations -> value.toInt), tail) - case "--lambda" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('lambda -> value.toDouble), tail) - case "--output_path" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('output_path -> value.toString), tail) - case option :: tail => - ParseCollaborativeFilteringOptions(map, tail) - } - } - - def ParseCollaborativeFilteringCVOptions(map : OptionMap, list: List[String]) : OptionMap = { - list match { - case Nil => map - case "--input_file" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('input_file -> value.toString), tail) - case "--numIterations" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('num_iterations -> value.toInt), tail) - case "--output_path" :: value :: tail => - ParseCollaborativeFilteringOptions(map ++ Map('output_path -> value.toString), tail) - case option :: tail => - ParseCollaborativeFilteringOptions(map, tail) - } - } - - def ParseSplitSetOptions(map : OptionMap, list: List[String]) : OptionMap = { - list match { - case Nil => map - case "--input_file" :: value :: tail => - ParseSplitSetOptions(map ++ Map('input_file -> value.toString), tail) - case "--test_perc_stage1" :: value :: tail => - ParseSplitSetOptions(map ++ Map('test_perc_stage1 -> value.toDouble), tail) - case "--test_perc_stage2" :: value :: tail => - ParseSplitSetOptions(map ++ Map('test_perc_stage2 -> value.toDouble), tail) - case "--seed" :: value :: tail => - ParseSplitSetOptions(map ++ Map('seed -> value.toInt), tail) - case "--output_path" :: value :: tail => - ParseSplitSetOptions(map ++ Map('output_path -> value.toString), tail) - case option :: tail => - ParseSplitSetOptions(map , tail) - } - } - - if (args.length == 0) println(usage) - val arglist = args.toList - - val task_name = findTask(arglist) - - val options = task_name match { - case COLLABORATIVE_FILTERING => ParseCollaborativeFilteringOptions(Map('task -> task_name), arglist) - case COLLABORATIVE_FILTERING_CV => ParseCollaborativeFilteringCVOptions(Map('task -> task_name), arglist) - case SPLIT_SET_INTO_TRAIN_AND_TEST => ParseSplitSetOptions(Map('task -> task_name), arglist) - } - - options - - } - -} - - diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala deleted file mode 100644 index 56828c828a5..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.apache.spark.ml.recommendation.{ALS, ALSModel} -import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} -import org.apache.spark.mllib.recommendation.Rating -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.IntegerType -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{Row, DataFrame} -import org.apache.spark.sql.functions.{col, explode} - -import scala.collection.mutable -import scala.util.parsing.json.JSONObject - -class CollaborativeFiltering(val ss: SparkSession) { - - import ss.implicits._ - - def loadTrainingIndicesIntoDataFrame(input_path: String) = { - - val ratings = ss.sparkContext.textFile(input_path) - .map(_.split("\t")) - .map(p => (p(0), p(1), 1)) - .toDF("post_id", "user_id", "label") - .filter(col("post_id").notEqual("null")) - .filter(col("user_id").notEqual("null")) - .select(col("post_id").cast(IntegerType).as("post_id"), - col("user_id").cast(IntegerType).as("user_id"), - col("label").cast(IntegerType).as("label")) - - ratings - - } - - def loadDataIntoDataFrame(input_path: String): DataFrame = { - - val dataset = ss.read.json(input_path) - - val setOne = udf(() => 1) - - val ratings = dataset.select(col("post_id").cast(IntegerType).as("post_id"), - explode(col("likes")).as("likes_flat")) - .select(col("post_id"), col("likes_flat.uid").cast(IntegerType).as("user_id")) - .withColumn("label", setOne()) - - ratings - - } - - def loadDataIntoRating(input_path: String): RDD[Rating] = { - - val dataset: DataFrame = ss.read.json(input_path) - - val ratings = dataset.select(col("post_id"), explode(col("likes")).as("likes_flat")) - .select(col("post_id"), col("likes_flat.uid").as("user_id")) - .rdd.map { - case Row(post_id: String, - user_id: String) => - Rating(user_id.toInt, post_id.toInt, 1) - } - - ratings - - } - - def run(input_path: String, rank: Int, numIterations: Int, lambda: Double): ALSModel = { - - // Loading and preparing the data - val ratings = loadTrainingIndicesIntoDataFrame(input_path) - - // Fitting the model - val model = new ALS() - .setItemCol("post_id") - .setRatingCol("label") - .setUserCol("user_id") - .setImplicitPrefs(true) - .setAlpha(lambda) - .setMaxIter(numIterations) - .setRank(rank) - .fit(ratings) - - model - - } - - def run_pipeline(input_path: String, numIterations: Int): ALSModel = { - - // Loading and preparing the data - val ratings = loadTrainingIndicesIntoDataFrame(input_path) - - // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. - val collaborative_filtering = new ALS() - .setItemCol("post_id") - .setRatingCol("label") - .setUserCol("user_id") - .setMaxIter(numIterations) - - val paramGrid = new ParamGridBuilder() - .addGrid(collaborative_filtering.rank, Array(10, 50, 100)) - .addGrid(collaborative_filtering.alpha, Array(0.001, 0.01, 0.1)) - .build() - - val cv = new CrossValidator() - .setEstimator(collaborative_filtering) - .setEvaluator(new RegressionEvaluator) - .setEstimatorParamMaps(paramGrid) - .setNumFolds(2) // Use 3+ in practice - - // Run cross-validation, and choose the best set of parameters. - val cvModel = cv.fit(ratings) - - cvModel.bestModel.asInstanceOf[ALSModel] - - } - -} - -object CollaborativeFiltering { - - def writeModelFeaturesAsTensor[T] (modelFeatures:(Int, mutable.WrappedArray[T]), id_string:String) = { - - val id = modelFeatures._1 - val latentVector = modelFeatures._2 - var latentVectorMap:Map[String,T] = Map() - var output:Map[String,Any] = Map() - - for ( i <- latentVector.indices ){ - - latentVectorMap += (("user_item_cf:" + i.toString, latentVector(i))) - - } - - output += ((id_string, id)) - output += (("user_item_cf", JSONObject(latentVectorMap))) - - JSONObject(output) - - } - - def writeFeaturesAsVespaTensorText(model: ALSModel, output_path: String): Unit ={ - - model - .itemFactors.rdd - .map { - case Row(id: Int, features: mutable.WrappedArray[Double]) => writeModelFeaturesAsTensor((id, features), "post_id") - } - .saveAsTextFile(output_path + "/product_features") - model - .userFactors.rdd - .map { - case Row(id: Int, features: mutable.WrappedArray[Double]) => writeModelFeaturesAsTensor((id, features), "user_id") - } - .saveAsTextFile(output_path + "/user_features") - - } - -} diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala deleted file mode 100644 index feff388618e..00000000000 --- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.apache.spark.sql.{SparkSession, DataFrame} -import org.apache.spark.sql.functions.udf -import org.apache.spark.sql.functions._ - -class SplitFullSetIntoTrainAndTestSets(val ss: SparkSession) { - - private def loadAndSimplifyFullDataset(input_file_path: String): DataFrame = { - - // Load full dataset - val full_dataset = ss.read.json(input_file_path) - val full_dataset_simple = full_dataset.select(col("post_id"), size(col("likes")).as("number_likes"), col("likes")) - - full_dataset_simple - - } - - private def splitSimplifiedDatasetIntoTrainAndTestSets(full_dataset_simple: DataFrame, - test_perc_stage1: Double, - test_perc_stage2: Double, - seed: Int): Array[DataFrame] = { - - // Set some blog posts aside to be present only on the test set - var sets = full_dataset_simple.randomSplit(Array(1 - test_perc_stage1, test_perc_stage1), seed) - - val training_set = sets(0) - val training_set_null = training_set.filter("number_likes = 0") - var training_set_exploded = training_set.select(col("post_id"), explode(col("likes")).as("likes_flat")) - training_set_exploded = training_set_exploded.select("post_id", "likes_flat.uid") - - val test_set = sets(1) - val test_set_null = test_set.filter("number_likes = 0") - var test_set_exploded = test_set.select(col("post_id"), explode(col("likes")).as("likes_flat")) - test_set_exploded = test_set_exploded.select("post_id", "likes_flat.uid") - - // randomly move some (post_id, uid) from training set to test set - sets = training_set_exploded.randomSplit(Array(1 - test_perc_stage2, test_perc_stage2), seed) - - training_set_exploded = sets(0) - - val additional_test_set_exploded = sets(1) - test_set_exploded = test_set_exploded.union(additional_test_set_exploded) - - // concatenate exploded set with null set - val getNull = udf(() => None: Option[String]) - training_set_exploded = training_set_exploded.union(training_set_null.select("post_id").withColumn("uid", getNull())) - test_set_exploded = test_set_exploded.union(test_set_null.select("post_id").withColumn("uid", getNull())) - - Array(training_set_exploded, test_set_exploded) - - } - - def run(input_file_path: String, test_perc_stage1: Double, test_perc_stage2:Double, seed: Int): Array[DataFrame] = { - - val full_dataset_simple = loadAndSimplifyFullDataset(input_file_path) - - splitSimplifiedDatasetIntoTrainAndTestSets(full_dataset_simple, - test_perc_stage1, - test_perc_stage2, - seed) - - } - -} - -object SplitFullSetIntoTrainAndTestSets { - - def writeTrainAndTestSetsIndices(train_and_test_sets: Array[DataFrame], output_path: String): Unit = { - - val training_set_exploded = train_and_test_sets(0) - val test_set_exploded = train_and_test_sets(1) - - // Write to disk - training_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(output_path + "/training_set_ids") - test_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(output_path + "/testing_set_ids") - - } - -} diff --git a/sample-apps/blog-tutorial-shared/src/python/__init__.py b/sample-apps/blog-tutorial-shared/src/python/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/python/__init__.py +++ /dev/null diff --git a/sample-apps/blog-tutorial-shared/src/python/parse.py b/sample-apps/blog-tutorial-shared/src/python/parse.py deleted file mode 100644 index 207f8a14740..00000000000 --- a/sample-apps/blog-tutorial-shared/src/python/parse.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -import json -import argparse - - -class KaggleRawDataParser: - - popularity = False - raw_data_file = None - total_number_of_likes = 0 - likes_per_blog = {} - - def __init__(self): - parser = argparse.ArgumentParser() - parser.add_argument("-p", "--popularity", action="store_true", help="add 'popularity' field") - parser.add_argument("file", help="location of file to be parsed") - args = parser.parse_args() - - self.popularity = args.popularity - self.raw_data_file = args.file - - def main(self): - if self.popularity: - self.calculate_popularity() - self.parse() - - def calculate_popularity(self): - unparsed_file = open(self.raw_data_file, "r") - - for line in unparsed_file: - data = json.loads(line) - - self.total_number_of_likes += len(data["likes"]) - if data["blog"] in self.likes_per_blog: - self.likes_per_blog[data["blog"]] += len(data["likes"]) - else: - self.likes_per_blog[data["blog"]] = len(data["likes"]) - - unparsed_file.close() - - def parse(self): - unparsed_file = open(self.raw_data_file, "r") - - for line in unparsed_file: - data = json.loads(line) - - parsed_data = { - "put": "id:blog-search:blog_post::" + data["post_id"], - "fields": { - "blogname": data["blogname"], - "post_id": data["post_id"], - "author": data["author"], - "language": data["language"], - "categories": data["categories"], - "title": data["title"], - "blog": data["blog"], - "date_gmt": data["date_gmt"], - "url": data["url"], - "content": data["content"], - "tags": data["tags"], - "date": int(data["date_gmt"][0:4] + data["date_gmt"][5:7] + data["date_gmt"][8:10]) - } - } - if self.popularity: - parsed_data["fields"]["popularity"] = \ - float(self.likes_per_blog[data["blog"]]) / float(self.total_number_of_likes) - - print(json.dumps(parsed_data)) - - unparsed_file.close() - -if __name__ == '__main__': - KaggleRawDataParser().main() diff --git a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py b/sample-apps/blog-tutorial-shared/src/python/vespaModel.py deleted file mode 100755 index 5d3bf1eceb7..00000000000 --- a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py +++ /dev/null @@ -1,397 +0,0 @@ -# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -""" -Train a 2 layers neural network to compute the probability of a user -represented by the vector u liking a document represented by the vector d. - -Usage: ./vespaModel.py --product_features_file_path path \ - --user_features_file_path path \ - --dataset_file_path path - -Expected File formats: - -- product_features_file_path contains a file with rows following the JSON format below: - -{"post_id" : 20, - "user_item_cf" : {"user_item_cf:5" : -0.66617566, - "user_item_cf:6" : 0.29197264, - "user_item_cf:1" : -0.15582734, - "user_item_cf:7" : 0.3350679, - "user_item_cf:2" : -0.16676047, - "user_item_cf:9" : -0.31653953, - "user_item_cf:3" : -0.21495385, - "user_item_cf:4" : -0.036676258, - "user_item_cf:8" : 0.122069225, - "user_item_cf:0" : 0.20922394}} - -- user_features_file_path contains a file with rows following the JSON format below: - -{"user_id" : 270, - "user_item_cf" : {"user_item_cf:5" : -0.54011273, - "user_item_cf:6" : 0.2723072, - "user_item_cf:1" : -0.23280832, - "user_item_cf:7" : -0.011183357, - "user_item_cf:2" : -0.3987285, - "user_item_cf:9" : -0.05703937, - "user_item_cf:3" : 0.04699418, - "user_item_cf:4" : 0.06679048, - "user_item_cf:8" : 0.31399783, - "user_item_cf:0" : 0.5000366}} - -- dataset_file_path contains a file with rows containing tab-separated post_id, user_id, label such as the sample below: - -1000054 118475 1 -10001560 666315 0 -10001560 1230226 0 -10001560 561306 1 -""" - - -import tensorflow as tf -import time -import os -import datetime -import json -import numpy as np - -class getData: - """ - Data pre-processing - """ - def __init__(self, product_features_file_path, user_features_file_path, data_set_file_path): - self.product_features_file_path = product_features_file_path - self.user_features_file_path = user_features_file_path - self.data_set_file_path = data_set_file_path - - # Create user and document lookup features - def parse_cf_features(self, json, id_name): - id = json[id_name] - indexes = ['user_item_cf:' + str(x) for x in range(0,10,1)] - values = [json['user_item_cf'][x] for x in indexes] - return [id, values] - - def get_product_features_lookup(self): - product_features = [self.parse_cf_features(json.loads(line), 'post_id') for line in open(self.product_features_file_path)] - return dict(product_features) - - def get_user_features_lookup(self): - user_features = [self.parse_cf_features(json.loads(line), 'user_id') for line in open(self.user_features_file_path)] - return dict(user_features) - - def parse_dataset(self, line, lookup_user_features, lookup_product_features): - info = line.strip("\n").split("\t") - user_id = float(info[0]) - product_id = float(info[1]) - label = int(info[2]) - return lookup_user_features[user_id], lookup_product_features[product_id], [label] - - def prepare_dataset(self): - lookup_product_features = self.get_product_features_lookup() - lookup_user_features = self.get_user_features_lookup() - with open(self.data_set_file_path) as f: - input_u = []; input_d = []; input_y = [] - for line in f: - u, d, y = self.parse_dataset(line, lookup_user_features, lookup_product_features) - input_u.append(u) - input_d.append(d) - input_y.append(y) - input_u = np.array(input_u) - input_d = np.array(input_d) - input_y = np.array(input_y) - return input_u, input_d, input_y - - def create_train_test_sets(self, input_u, input_d, input_y, seed = 10, perc = 0.2): - # Randomly shuffle data - np.random.seed(seed) - shuffle_indices = np.random.permutation(np.arange(len(input_u))) - input_u_shuffled = input_u[shuffle_indices] - input_d_shuffled = input_d[shuffle_indices] - input_y_shuffled = input_y[shuffle_indices] - - # Split train/test set - dev_samples = int(len(input_u_shuffled)*perc) - u_train, u_dev = input_u_shuffled[:-dev_samples], input_u_shuffled[-dev_samples:] - d_train, d_dev = input_d_shuffled[:-dev_samples], input_d_shuffled[-dev_samples:] - y_train, y_dev = input_y_shuffled[:-dev_samples], input_y_shuffled[-dev_samples:] - print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) - - return u_train, u_dev, d_train, d_dev, y_train, y_dev - - def batch_iter(self, data, batch_size, num_epochs, shuffle=True): - """ - Generates a batch iterator for a dataset. - """ - data = np.array(data) - data_size = len(data) - num_batches_per_epoch = int(len(data)/batch_size) + 1 - for epoch in range(num_epochs): - # Shuffle the data at each epoch - if shuffle: - shuffle_indices = np.random.permutation(np.arange(data_size)) - shuffled_data = data[shuffle_indices] - else: - shuffled_data = data - for batch_num in range(num_batches_per_epoch): - start_index = batch_num * batch_size - end_index = min((batch_num + 1) * batch_size, data_size) - yield shuffled_data[start_index:end_index] - -class vespaRunTimeModel: - """ - Model that combine user and document features and needs to be evaluated at query time. - """ - def __init__(self, user_feature_length, doc_feature_length, hidden_length): - - # placeholders - self.input_u = tf.placeholder(tf.float32, [None, user_feature_length], name = 'input_u') - self.input_d = tf.placeholder(tf.float32, [None, doc_feature_length], name = 'input_d') - self.input_y = tf.placeholder(tf.float32, [None, 1], name = 'input_y') - - # merge user and document vector - self.input_concat = tf.concat(1, [self.input_d, self.input_u], name = 'input_concat') - - # hidden layer - self.W_hidden = tf.Variable( - tf.truncated_normal([user_feature_length + - doc_feature_length, hidden_length], stddev=0.1), name = 'W_hidden') - self.b_hidden = tf.Variable(tf.constant(0.1, shape=[hidden_length]), name = 'b_hidden') - - self.hidden_layer = tf.nn.relu(tf.matmul(self.input_concat, self.W_hidden) + self.b_hidden, - name = 'hidden_layer') - - # output layer - self.W_final = tf.Variable( - tf.random_uniform([hidden_length, 1], -0.1, 0.1), - name="W_final") - self.b_final = tf.Variable(tf.zeros([1]), name="b_final") - - self.y = tf.sigmoid(tf.matmul(self.hidden_layer, self.W_final) + self.b_final, name = 'y') - - # prediction based on model output - self.prediction = tf.cast(tf.greater_equal(self.y, 0.5), "float", name = 'prediction') - - # loss function - prob = tf.clip_by_value(self.y,1e-5,1.0 - 1e-5) - self.loss = tf.reduce_mean(- self.input_y * tf.log(prob) - (1 - self.input_y) * tf.log(1 - prob), name = 'loss') - - # accuracy - correct_predictions = tf.equal(self.prediction, self.input_y) - self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") - - def train_operation(self, learning_rate): - global_step = tf.Variable(0, name="global_step", trainable=False) - #optimizer = tf.train.GradientDescentOptimizer(learning_rate) - optimizer = tf.train.AdagradOptimizer(learning_rate) - train_op = optimizer.minimize(self.loss, global_step=global_step) - return train_op, global_step - - def create_output_dir(self): - timestamp = str(int(time.time())) - out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) - print("Writing to {}\n".format(out_dir)) - return out_dir - - def summary_oprations(self): - loss_summary = tf.scalar_summary("loss", self.loss) - acc_summary = tf.scalar_summary("accuracy", self.accuracy) - train_summary_op = tf.merge_summary([loss_summary, acc_summary]) - dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) - return train_summary_op, dev_summary_op - - def train_step(self, u_batch, d_batch, y_batch, writer=None): - """ - A single training step - """ - feed_dict = { - self.input_u: u_batch, - self.input_d: d_batch, - self.input_y: y_batch - } - _, step, summaries, loss, accuracy = sess.run( - [train_op, global_step, train_summary_op, self.loss, self.accuracy], - feed_dict) - time_str = datetime.datetime.now().isoformat() - print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) - if writer: - writer.add_summary(summaries, step) - - def dev_step(self, u_batch, d_batch, y_batch, writer=None): - """ - Evaluates model on a dev set - """ - feed_dict = { - self.input_u: u_batch, - self.input_d: d_batch, - self.input_y: y_batch - } - step, summaries, loss, accuracy = sess.run( - [global_step, dev_summary_op, self.loss, self.accuracy], - feed_dict) - time_str = datetime.datetime.now().isoformat() - print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) - if writer: - writer.add_summary(summaries, step) - -class serializeVespaModel: - """ - Serialize TensorFlow variables to Vespa JSON format - - Example: - checkpoint_dir = "./runs/1473845959/checkpoints" - output_dir = "./runs/1473845959/vespa_variables" - - serializer = serializeVespaModel(checkpoint_dir, output_dir) - serializer.serialize_to_disk(variable_name = "W_hidden", dimension_names = ['input', 'hidden']) - serializer.serialize_to_disk(variable_name = "b_hidden", dimension_names = ['hidden']) - serializer.serialize_to_disk(variable_name = "W_final", dimension_names = ['hidden', 'final']) - serializer.serialize_to_disk(variable_name = "b_final", dimension_names = ['final']) - """ - def __init__(self, checkpoint_dir, output_dir): - self.checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) - self.reader = tf.train.NewCheckpointReader(self.checkpoint_file) - self.output_dir = output_dir - - def write_cell_value(self, variable, dimension_names, dimension_address = None): - if dimension_address is None: - dimension_address = [] - shape = variable.shape - if len(shape) == 1: - count = 0 - cells = [] - for element in variable: - dimension_address.append((dimension_names[0], str(count))) - count += 1 - cells.append({ 'address': dict(dimension_address), "value": float(element) }) - return cells - else: - count = 0 - output = [] - for slice in variable: - dimension_address.append((dimension_names[0], str(count))) - output.extend(self.write_cell_value(slice, dimension_names[1:], dimension_address)) - count += 1 - return output - - def write_to_vespa_json_format(self, variable_name, dimension_names): - variable = self.reader.get_tensor(variable_name) - cells = self.write_cell_value(variable, dimension_names) - return json.dumps({'cells': cells}) - - def serialize_to_disk(self, variable_name, dimension_names): - text_file = open(os.path.join(output_dir, variable_name + ".json"), "w") - text_file.write(serializer.write_to_vespa_json_format(variable_name, dimension_names)) - text_file.close() - - -def task_train(): - # Data - tf.flags.DEFINE_string("product_features_file_path", '', "File containing product features") - tf.flags.DEFINE_string("user_features_file_path", '', "File containing user features") - tf.flags.DEFINE_string("dataset_file_path", '', "File containing labels for each document user pair") - - tf.flags.DEFINE_integer("hidden_length_factor", 2, "The hidden layer has size 'hidden_length_factor * input_vector_length'") - - # Misc Parameters - tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") - tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") - - # Training parameters - tf.flags.DEFINE_float("learning_rate", 0.1, "Gradient Descent learning rate") - - tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") - tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") - tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") - tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") - - FLAGS = tf.flags.FLAGS - FLAGS._parse_flags() - print("\nParameters:") - for attr, value in sorted(FLAGS.__flags.items()): - print("{}={}".format(attr.upper(), value)) - print("") - - # Data preparation - data_pre_processing = getData( - FLAGS.product_features_file_path, - FLAGS.user_features_file_path, - FLAGS.dataset_file_path) - - input_u, input_d, input_y = data_pre_processing.prepare_dataset() - u_train, u_dev, d_train, d_dev, y_train, y_dev = data_pre_processing.create_train_test_sets(input_u, input_d, input_y, seed = 10, perc = 0.2) - - user_feature_length = input_u.shape[1] - doc_feature_length = input_d.shape[1] - - - # Create a graph - with tf.Graph().as_default(): - - # Create a session - session_conf = tf.ConfigProto( - allow_soft_placement=FLAGS.allow_soft_placement, - log_device_placement=FLAGS.log_device_placement) - sess = tf.Session(config=session_conf) - with sess.as_default(): - - # instanciate a model - vespa_model = vespaRunTimeModel(user_feature_length = user_feature_length, - doc_feature_length = doc_feature_length, - hidden_length = FLAGS.hidden_length_factor * (user_feature_length + doc_feature_length)) - - # create a train operation - train_op, global_step = vespa_model.train_operation(learning_rate = FLAGS.learning_rate) - - # Summaries for loss and accuracy - train_summary_op, dev_summary_op = vespa_model.summary_oprations() - - # Output directory for models and summaries - out_dir = vespa_model.create_output_dir() - - # Write train summaries to disk - train_summary_dir = os.path.join(out_dir, "summaries", "train") - train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph) - - # Dev summaries - dev_summary_dir = os.path.join(out_dir, "summaries", "dev") - dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph) - - # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it - checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) - checkpoint_prefix = os.path.join(checkpoint_dir, "model") - if not os.path.exists(checkpoint_dir): - os.makedirs(checkpoint_dir) - saver = tf.train.Saver(tf.all_variables()) - - # Initialize all variables - sess.run(tf.initialize_all_variables()) - - # Generate batches - batches = data_pre_processing.batch_iter( - list(zip(u_train, d_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) - # Training loop. For each batch... - for batch in batches: - u_batch, d_batch, y_batch = zip(*batch) - vespa_model.train_step(u_batch, d_batch, y_batch, writer=train_summary_writer) - current_step = tf.train.global_step(sess, global_step) - if current_step % FLAGS.evaluate_every == 0: - print("\nEvaluation:") - vespa_model.dev_step(u_dev, d_dev, y_dev, writer=dev_summary_writer) - print("") - if current_step % FLAGS.checkpoint_every == 0: - path = saver.save(sess, checkpoint_prefix, global_step=current_step) - print("Saved model checkpoint to {}\n".format(path)) - -if __name__ == "__main__": - - # Task - tf.flags.DEFINE_string("task", 'train', "Train a model from scratch") - - FLAGS = tf.flags.FLAGS - FLAGS._parse_flags() - print("\nParameters:") - for attr, value in sorted(FLAGS.__flags.items()): - print("{}={}".format(attr.upper(), value)) - print("") - - if FLAGS.task == "train": - task_train() diff --git a/sample-apps/blog-tutorial-shared/src/test/__init__.py b/sample-apps/blog-tutorial-shared/src/test/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/__init__.py +++ /dev/null diff --git a/sample-apps/blog-tutorial-shared/src/test/python/__init__.py b/sample-apps/blog-tutorial-shared/src/test/python/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/python/__init__.py +++ /dev/null diff --git a/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py b/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py deleted file mode 100644 index 1a75994c740..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -import os -import sys -import unittest -import json -from StringIO import StringIO - -import src.main.python.parse as parse - -class KaggleRawDataParserTest(unittest.TestCase): - - raw_test_file = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + "/resources/trainPostsSampleWith3Elements.json" - saved_stdout = sys.stdout - out = StringIO() - - - def setUp(self): - sys.argv.append(self.raw_test_file) - - self.out = StringIO() - sys.stdout = self.out - - def tearDown(self): - sys.argv = [sys.argv[0]] - - sys.stdout = self.saved_stdout - - - def test_no_flags(self): - parser = parse.KaggleRawDataParser() - - self.assertFalse(parser.popularity) - self.assertEqual(parser.raw_data_file, self.raw_test_file) - - def test_popularity_flag(self): - sys.argv.append("-p") - parser = parse.KaggleRawDataParser() - - self.assertTrue(parser.popularity) - - def test_parsing_without_popularity(self): - parser = parse.KaggleRawDataParser() - - parser.parse() - - output_array = self.out.getvalue().strip().split('\n') - compare_with = [{ - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 03:36:57", - "language": "en", - "post_id": "507823", - "tags": [], - "title": "#vipworkshop dinner", - "url": "http://matt.wordpress.com/?p=3837" - }, - "put": "id:blog-search:blog_post::507823" - }, - { - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 04:41:37", - "language": "en", - "post_id": "1406963", - "tags": [], - "title": "Oven roasted tomatoes", - "url": "http://matt.wordpress.com/?p=3839" - }, - "put": "id:blog-search:blog_post::1406963" - }, - { - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 19:59:45", - "language": "en", - "post_id": "1329369", - "tags": [], - "title": "Fish tacos and spicy slaw", - "url": "http://matt.wordpress.com/?p=3841" - }, - "put": "id:blog-search:blog_post::1329369" - }] - - for i in range(0, 3): - self.assertEqual(json.loads(output_array[i]), compare_with[i]) - - def test_parsing_with_popularity(self): - sys.argv.append("-p") - parser = parse.KaggleRawDataParser() - - parser.main() - - output_array = self.out.getvalue().strip().split('\n') - compare_with = [{ - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 03:36:57", - "language": "en", - "popularity": 1.0, - "post_id": "507823", - "tags": [], - "title": "#vipworkshop dinner", - "url": "http://matt.wordpress.com/?p=3837" - }, - "put": "id:blog-search:blog_post::507823" - }, - { - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 04:41:37", - "language": "en", - "popularity": 1.0, - "post_id": "1406963", - "tags": [], - "title": "Oven roasted tomatoes", - "url": "http://matt.wordpress.com/?p=3839" - }, - "put": "id:blog-search:blog_post::1406963" - }, - { - "fields": { - "author": "5", - "blog": "4", - "blogname": "Matt on Not-WordPress", - "categories": [ - "Moblog" - ], - "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", - "date": 20120328, - "date_gmt": "2012-03-28 19:59:45", - "language": "en", - "popularity": 1.0, - "post_id": "1329369", - "tags": [], - "title": "Fish tacos and spicy slaw", - "url": "http://matt.wordpress.com/?p=3841" - }, - "put": "id:blog-search:blog_post::1329369" - }] - - for i in range(0, 3): - self.assertEqual(json.loads(output_array[i]), compare_with[i]) - -if __name__ == '__main__': - unittest.main() diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json b/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json deleted file mode 100644 index c7b7e32f396..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json +++ /dev/null @@ -1,10 +0,0 @@ -{"date_gmt":"2012-03-28 03:36:57", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3837", "title": "#vipworkshop dinner", "blog": "4", "post_id": "507823", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 03:36:57", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:38", "uid": "6218184"}, {"dt": "2012-03-28 10:25:22", "uid": "11335199"}]} -{"date_gmt":"2012-03-28 04:41:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3839", "title": "Oven roasted tomatoes", "blog": "4", "post_id": "1406963", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 04:41:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:03", "uid": "6218184"}, {"dt": "2012-03-28 05:01:34", "uid": "26248885"}]} -{"date_gmt":"2012-03-28 19:59:45", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3841", "title": "Fish tacos and spicy slaw", "blog": "4", "post_id": "1329369", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 19:59:45", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-28 21:05:37", "uid": "31367867"}]} -{"date_gmt":"2012-03-31 00:58:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3845", "title": "White corn guacamole", "blog": "4", "post_id": "916703", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-31 00:58:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo22.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo22.jpg\" alt=\"\" title=\"photo22\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3846\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:42:15", "uid": "6218184"}, {"dt": "2012-03-31 03:32:06", "uid": "33301824"}]} -{"date_gmt":"2012-03-31 18:46:59", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3847", "title": "#WordPress cake and wine", "blog": "4", "post_id": "1829542", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-31 18:46:59", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo23.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo23.jpg\" alt=\"\" title=\"photo23\" width=\"640\" height=\"480\" class=\"alignnone size-full wp-image-3848\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-04 14:22:40", "uid": "21552"}, {"dt": "2012-04-01 07:51:59", "uid": "50392"}, {"dt": "2012-04-12 16:58:42", "uid": "53742"}, {"dt": "2012-04-05 13:37:51", "uid": "396702"}, {"dt": "2012-04-02 22:30:22", "uid": "414033"}, {"dt": "2012-04-01 08:17:25", "uid": "872435"}, {"dt": "2012-03-31 18:50:18", "uid": "1156143"}, {"dt": "2012-04-01 07:25:27", "uid": "1246555"}, {"dt": "2012-03-31 19:26:40", "uid": "2177886"}, {"dt": "2012-03-31 19:13:17", "uid": "3346825"}, {"dt": "2012-04-01 21:50:49", "uid": "5073742"}, {"dt": "2012-03-31 19:46:11", "uid": "6134205"}, {"dt": "2012-03-31 19:07:17", "uid": "6433901"}, {"dt": "2012-04-04 01:01:08", "uid": "6894686"}, {"dt": "2012-03-31 18:53:19", "uid": "7073116"}, {"dt": "2012-03-31 18:51:52", "uid": "8288845"}, {"dt": "2012-04-23 10:17:31", "uid": "12788480"}, {"dt": "2012-04-05 15:50:45", "uid": "28688316"}, {"dt": "2012-04-01 19:46:36", "uid": "29630467"}, {"dt": "2012-03-31 18:48:49", "uid": "33013507"}]} -{"date_gmt":"2012-04-01 03:48:07", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3849", "title": "Oysters!", "blog": "4", "post_id": "1197076", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 03:48:07", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo.jpg\" alt=\"\" title=\"photo\" width=\"640\" height=\"480\" class=\"alignnone size-full wp-image-3850\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-01 03:59:19", "uid": "5360368"}, {"dt": "2012-04-01 03:52:29", "uid": "8689260"}, {"dt": "2012-04-10 06:08:23", "uid": "26404032"}]} -{"date_gmt":"2012-04-01 19:05:20", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3851", "title": "Crab and artichoke pizza", "blog": "4", "post_id": "377833", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 19:05:20", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo1.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo1.jpg\" alt=\"\" title=\"photo1\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3852\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-13 19:29:25", "uid": "6218184"}]} -{"date_gmt":"2012-04-01 19:48:38", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3853", "title": "Framboise float and brownie", "blog": "4", "post_id": "871687", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 19:48:38", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo2.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo2.jpg\" alt=\"\" title=\"photo2\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3854\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-13 19:28:36", "uid": "6218184"}]} -{"date_gmt":"2012-04-01 21:09:09", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3855", "title": "Rothko #10", "blog": "4", "post_id": "1893680", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 21:09:09", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo3.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo3.jpg\" alt=\"\" title=\"photo3\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3856\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-01 21:41:34", "uid": "2096880"}, {"dt": "2012-04-02 03:50:33", "uid": "4634349"}, {"dt": "2012-04-01 21:10:13", "uid": "6766437"}, {"dt": "2012-04-01 21:35:48", "uid": "11335199"}, {"dt": "2012-04-01 21:18:40", "uid": "11691159"}]} -{"date_gmt":"2012-04-03 13:22:50", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3857", "title": "Port lights", "blog": "4", "post_id": "891295", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-03 13:22:50", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo4.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo4.jpg\" alt=\"\" title=\"photo4\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3858\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-04 05:50:43", "uid": "4204944"}, {"dt": "2012-04-03 15:23:45", "uid": "6134205"}, {"dt": "2012-04-03 13:27:01", "uid": "8399160"}, {"dt": "2012-04-03 13:36:09", "uid": "11335199"}, {"dt": "2012-04-03 13:24:17", "uid": "33301824"}]} diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json b/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json deleted file mode 100644 index c144300fe37..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json +++ /dev/null @@ -1,3 +0,0 @@ -{"date_gmt":"2012-03-28 03:36:57", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3837", "title": "#vipworkshop dinner", "blog": "4", "post_id": "507823", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 03:36:57", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:38", "uid": "6218184"}, {"dt": "2012-03-28 10:25:22", "uid": "11335199"}]} -{"date_gmt":"2012-03-28 04:41:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3839", "title": "Oven roasted tomatoes", "blog": "4", "post_id": "1406963", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 04:41:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:03", "uid": "6218184"}, {"dt": "2012-03-28 05:01:34", "uid": "26248885"}]} -{"date_gmt":"2012-03-28 19:59:45", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3841", "title": "Fish tacos and spicy slaw", "blog": "4", "post_id": "1329369", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 19:59:45", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-28 21:05:37", "uid": "31367867"}]} diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt b/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt deleted file mode 100644 index 7312ed2fdca..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt +++ /dev/null @@ -1,20 +0,0 @@ -1000136 30856199 -1000631 19883445 -1000631 24350500 -1000631 25936432 -1000631 29881381 -1000776 30532923 -1001135 20234756 -1001135 31697777 -100156 12076558 -100156 12159263 -100156 17277682 -100156 964144 -1002462 1878322 -1002515 16433161 -1002515 21922168 -1002515 23039563 -1002515 2493815 -1002599 23733565 -1002599 29430402 -1002599 30449719 diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala deleted file mode 100644 index bd73c088e3e..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.scalatest.FunSuite - -class BlogRecommendationAppTest extends FunSuite { - - test("CollaborativeFilteringApp writes user and item latent factors to output path") (pending) - -} diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala deleted file mode 100644 index 07df1ebf622..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.apache.spark.ml.recommendation.ALSModel -import org.apache.spark.sql.SparkSession -import org.scalatest.Matchers._ -import org.scalatest._ - -class CollaborativeFilteringTest extends FunSuite with BeforeAndAfter { - - var ss: SparkSession = _ - - before { - - ss = SparkSession - .builder() - .appName("Unit Test") - .master("local[*]") - .getOrCreate() - - } - - after { - ss.stop() - } - - test("run method returns a MatrixFactorizationModel with latent factors of size 10 to user and item") { - - val file_path = getClass.getResource("/trainingSetIndicesSample.txt") - - val cf = new CollaborativeFiltering(ss) - - val model = cf.run( - input_path = file_path.toString, - rank = 10, - numIterations = 10, - lambda = 0.01) - - model shouldBe a [ALSModel] - - val product_feature_array = model.itemFactors.first().getSeq(1) - assertResult(10){product_feature_array.length} - - val user_feature_array = model.userFactors.first().getSeq(1) - assertResult(10){user_feature_array.length} - - } - - test("run_pipeline method returns a MatrixFactorizationModel with latent factors of size 10 to user and item") { - - val file_path = getClass.getResource("/trainingSetIndicesSample.txt") - - val cf = new CollaborativeFiltering(ss) - - val model = cf.run_pipeline(input_path = file_path.toString, numIterations = 10) - - model shouldBe a [ALSModel] - - val product_feature_array = model.itemFactors.first().getSeq(1) - assertResult(10){product_feature_array.length} - - val user_feature_array = model.userFactors.first().getSeq(1) - assertResult(10){user_feature_array.length} - - } - -} diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala deleted file mode 100644 index 395cc99f8c5..00000000000 --- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.example.blog - -import org.apache.spark.sql.{SparkSession, DataFrame} -import org.scalatest.Matchers._ -import org.scalatest._ - -class SplitFullSetIntoTrainAndTestSetsTest extends FunSuite with BeforeAndAfter { - - var ss: SparkSession = _ - - before { - - ss = SparkSession - .builder() - .appName("Unit Test") - .master("local[*]") - .getOrCreate() - - } - - after { - ss.stop() - } - - test("SplitFullSetIntoTrainAndTestSets should return an Array of DataFrame") { - - val file_path = getClass.getResource("/trainPostsSample.json") - - val splitter = new SplitFullSetIntoTrainAndTestSets(ss) - - val sets = splitter.run(input_file_path = file_path.toString, - test_perc_stage1 = 0.05, - test_perc_stage2 = 0.15, - seed = 123) - - sets shouldBe a [Array[DataFrame]] - - } - -} - |