aboutsummaryrefslogtreecommitdiffstats
path: root/sample-apps
diff options
context:
space:
mode:
authorLester Solbakken <lesters@yahoo-inc.com>2017-07-05 12:55:36 +0200
committerLester Solbakken <lesters@yahoo-inc.com>2017-07-05 12:55:36 +0200
commit550090587be026f8e93bd6007e1b6826728f019b (patch)
tree6045f7bca717cbc5a44cd73a466e422f7ef01162 /sample-apps
parentfe7548cc73e3f6064a28499503cfc6eb25c8f19b (diff)
Remove unused blog sample files moved to shared
Diffstat (limited to 'sample-apps')
-rw-r--r--sample-apps/blog-recommendation/src/pig/feed_content_and_tensor_vespa.pig101
-rw-r--r--sample-apps/blog-recommendation/src/pig/feed_content_vespa.pig72
-rw-r--r--sample-apps/blog-recommendation/src/pig/feed_user_item_cf_vespa.pig38
-rw-r--r--sample-apps/blog-recommendation/src/pig/generate_user_item_cf_dataset.pig16
-rw-r--r--sample-apps/blog-recommendation/src/pig/get_recommendations.pig30
-rw-r--r--sample-apps/blog-recommendation/src/pig/tutorial_blog_popularity.pig56
-rw-r--r--sample-apps/blog-recommendation/src/pig/tutorial_feed_content_and_tensor_vespa.pig117
-rw-r--r--sample-apps/blog-recommendation/src/pig/tutorial_feed_content_vespa.pig52
-rw-r--r--sample-apps/blog-recommendation/src/spark/collaborative_filtering_example.scala60
-rw-r--r--sample-apps/blog-recommendation/src/spark/data_exploration.scala64
-rw-r--r--sample-apps/blog-recommendation/src/spark/expected_percentile.scala40
-rw-r--r--sample-apps/blog-recommendation/src/spark/full_dataset_cf.scala61
-rw-r--r--sample-apps/blog-recommendation/src/spark/train_test_set_division.scala46
13 files changed, 0 insertions, 753 deletions
diff --git a/sample-apps/blog-recommendation/src/pig/feed_content_and_tensor_vespa.pig b/sample-apps/blog-recommendation/src/pig/feed_content_and_tensor_vespa.pig
deleted file mode 100644
index 70d14dc1cc1..00000000000
--- a/sample-apps/blog-recommendation/src/pig/feed_content_and_tensor_vespa.pig
+++ /dev/null
@@ -1,101 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER vespa-hadoop.jar
-
--- Create valid Vespa put operations
-DEFINE VespaPutOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:blog_post::<post_id>',
- 'create-tensor-fields=user_item_cf',
- 'simple-array-fields=tags,categories'
- );
-
-DEFINE VespaPutOperationUser
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:user::<user_id>',
- 'create-tensor-fields=user_item_cf',
- 'simple-array-fields=has_read_items'
- );
-
--- Transform tabular data to a Vespa document operation JSON format
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- DEFINE VespaStorage
--- com.yahoo.vespa.hadoop.pig.VespaStorage(
--- 'create-document-operation=true',
--- 'operation=put',
--- 'docid=id:blog-recommendation:blog_post::<post_id>'
--- );
-
--- Load data from any source - here we load using PigStorage
-data = LOAD 'blog-recommendation/trainPostsFinal' USING JsonLoader('date_gmt:chararray, language:chararray, author:chararray, url:chararray, title:chararray, blog:chararray, post_id:chararray, tags:{T:(tag_name:chararray)}, blogname:chararray, date:chararray, content:chararray, categories:{T:(category_name:chararray)}, likes:{T:(dt:chararray, uid:chararray)}');
-
-data_for_feed = FOREACH data GENERATE
- date_gmt,
- language,
- author,
- url,
- title,
- blog,
- post_id,
- tags,
- blogname,
- content,
- categories;
-
-data_doc = LOAD 'blog-recommendation/user_item_cf/product_features' USING JsonLoader('post_id:chararray, user_item_cf:[double]');
-
-data_content_and_doc_tensor = JOIN data_for_feed BY post_id LEFT, data_doc BY post_id;
-data_content_and_doc_tensor = FOREACH data_content_and_doc_tensor GENERATE
- date_gmt AS date_gmt,
- language AS language,
- author AS author,
- url AS url,
- title AS title,
- blog AS blog,
- data_for_feed::post_id as post_id,
- tags AS tags,
- blogname AS blogname,
- content AS content,
- categories AS categories,
- user_item_cf AS user_item_cf,
- (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf;
-
-data_content_and_doc_tensor_feed = FOREACH data_content_and_doc_tensor GENERATE VespaPutOperationDoc(*);
-
--- use cf latent factor
-data_user = LOAD 'blog-recommendation/user_item_cf/user_features' USING JsonLoader('user_id:chararray, user_item_cf:[double]');
-data_user = FOREACH data_user GENERATE
- user_id AS user_id,
- user_item_cf AS user_item_cf;
-
-
--- Articles already liked
-data_likes = FOREACH data GENERATE post_id, FLATTEN(likes) AS (dt, uid);
-
-post_liked_per_user = GROUP data_likes BY uid;
-post_liked_per_user = FOREACH post_liked_per_user GENERATE group AS user_id, data_likes.post_id AS has_read_items;
-
--- Join user data
-data_user = JOIN post_liked_per_user BY user_id FULL, data_user BY user_id;
-
-data_user = FOREACH data_user GENERATE (post_liked_per_user::user_id IS NOT NULL ? post_liked_per_user::user_id : data_user::user_id) AS user_id,
- user_item_cf AS user_item_cf,
- (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf,
- has_read_items AS has_read_items;
-data_user = FILTER data_user BY user_id IS NOT NULL;
-
-data_user_for_feed = FOREACH data_user GENERATE VespaPutOperationUser(*);
-
-joint_content_tensors = UNION data_content_and_doc_tensor_feed, data_user_for_feed;
-
--- STORE data_for_feed into 'vespa_put_operation';
-
--- Store into Vespa
-STORE joint_content_tensors INTO '$ENDPOINT' USING VespaStorage();
-
-
-
-
diff --git a/sample-apps/blog-recommendation/src/pig/feed_content_vespa.pig b/sample-apps/blog-recommendation/src/pig/feed_content_vespa.pig
deleted file mode 100644
index d6379649789..00000000000
--- a/sample-apps/blog-recommendation/src/pig/feed_content_vespa.pig
+++ /dev/null
@@ -1,72 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER vespa-hadoop.jar
-
--- UDF to create valid Vespa document operation in JSON format
-DEFINE VespaPutOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-search:blog_post::<post_id>',
- 'simple-array-fields=tags,categories'
- );
-
--- UDF to send data to a Vespa endpoint
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data from any source - here we load using JsonLoader
-data_first_release = LOAD 'blog-recommendation/first_release/trainPosts.json' USING
- JsonLoader('date_gmt:chararray,
- language:chararray,
- author:chararray,
- url:chararray,
- title:chararray,
- blog:chararray,
- post_id:chararray,
- tags:{T:(tag_name:chararray)},
- blogname:chararray,
- date:chararray,
- content:chararray,
- categories:{T:(category_name:chararray)},
- likes:{T:(dt:chararray, uid:chararray)}');
-
-data_second_release = LOAD 'blog-recommendation/second_release/trainPosts.json' USING
- JsonLoader('date_gmt:chararray,
- language:chararray,
- author:chararray,
- url:chararray,
- title:chararray,
- blog:chararray,
- post_id:chararray,
- tags:{T:(tag_name:chararray)},
- blogname:chararray,
- date:chararray,
- content:chararray,
- categories:{T:(category_name:chararray)},
- likes:{T:(dt:chararray, uid:chararray)}');
-
-data = UNION data_first_release, data_second_release;
-
--- Select fields that will be sent to Vespa.
--- This should follow blog_post.sd
-data_for_feed = FOREACH data GENERATE
- date_gmt,
- language,
- author,
- url,
- title,
- blog,
- post_id,
- tags,
- blogname,
- content,
- categories;
-
--- Create valid Vespa put operations in JSON format
-data_for_feed_json = FOREACH data_for_feed GENERATE VespaPutOperationDoc(*);
-
--- Sample Vespa operations
--- data_for_feed_json_sample = SAMPLE data_for_feed_json 0.0005;
--- STORE data_for_feed_json_sample INTO 'blog-sample';
-
--- Store into Vespa
-STORE data_for_feed_json INTO '$ENDPOINT' USING VespaStorage();
diff --git a/sample-apps/blog-recommendation/src/pig/feed_user_item_cf_vespa.pig b/sample-apps/blog-recommendation/src/pig/feed_user_item_cf_vespa.pig
deleted file mode 100644
index 66cb6c5d5c6..00000000000
--- a/sample-apps/blog-recommendation/src/pig/feed_user_item_cf_vespa.pig
+++ /dev/null
@@ -1,38 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER vespa-hadoop.jar
-
--- Create valid Vespa put operations
-DEFINE VespaPutOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:blog_post::<post_id>',
- 'create-tensor-fields=user_item_cf'
- );
-
-DEFINE VespaPutOperationUser
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:user::<user_id>',
- 'create-tensor-fields=user_item_cf'
- );
-
--- Transform tabular data to a Vespa document operation JSON format
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
-
-data_doc = LOAD 'blog-recommendation/user_item_cf/product_features' USING JsonLoader('post_id:chararray, user_item_cf:[double]');
-data_doc_for_feed = FOREACH data_doc GENERATE VespaPutOperationDoc(*);
-
-
-data_user = LOAD 'blog-recommendation/user_item_cf/user_features' USING JsonLoader('user_id:chararray, user_item_cf:[double]');
-data_user_for_feed = FOREACH data_user GENERATE VespaPutOperationUser(*);
-
-
--- Store into Vespa
-STORE data_doc_for_feed INTO '$ENDPOINT' USING VespaStorage();
-STORE data_user_for_feed INTO '$ENDPOINT' USING VespaStorage();
-
-
-
-
diff --git a/sample-apps/blog-recommendation/src/pig/generate_user_item_cf_dataset.pig b/sample-apps/blog-recommendation/src/pig/generate_user_item_cf_dataset.pig
deleted file mode 100644
index feea22263d9..00000000000
--- a/sample-apps/blog-recommendation/src/pig/generate_user_item_cf_dataset.pig
+++ /dev/null
@@ -1,16 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- Load data from any source - here we load using PigStorage
-data = LOAD 'blog-recommendation/trainPostsFinal' USING JsonLoader('date_gmt:chararray, language:chararray, author:chararray, url:chararray, title:chararray, blog:chararray, post_id:chararray, tags:{T:(tag_name:chararray)}, blogname:chararray, date:chararray, content:chararray, categories:{T:(category_name:chararray)}, likes:{T:(dt:chararray, uid:chararray)}');
-
-data_likes = FOREACH data GENERATE post_id, FLATTEN(likes) AS (dt, uid);
-
-data_cf = FOREACH data_likes GENERATE uid, post_id, 1 as rate;
-
-data_cf = FILTER data_cf BY (uid IS NOT NULL) AND (uid != '') AND (post_id IS NOT NULL) AND (post_id != '');
-
--- data_cf_sample = SAMPLE data_cf 0.001;
-
--- data_cf = LIMIT data_cf 10;
-
-STORE data_cf INTO 'blog-recommendation/trainPostsFinal_user_item_cf';
-
diff --git a/sample-apps/blog-recommendation/src/pig/get_recommendations.pig b/sample-apps/blog-recommendation/src/pig/get_recommendations.pig
deleted file mode 100644
index ad97e8361a2..00000000000
--- a/sample-apps/blog-recommendation/src/pig/get_recommendations.pig
+++ /dev/null
@@ -1,30 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER $VESPA_HADOOP_JAR
-REGISTER vespa-hadoop.jar
--- REGISTER parquet-pig-bundle-1.8.1.jar
-
--- Define Vespa query for retrieving blog posts
-DEFINE BlogPostRecommendations
- com.yahoo.vespa.hadoop.pig.VespaQuery(
- 'query=http://ENDPOINT:8080/search/?user_id=<userid>&hits=100',
- 'schema=rank:int,id:chararray,relevance:double,fields/post_id:chararray'
- );
-
--- Load test_set data from a local file
-test_set = LOAD 'data/cv/test_set_exploded' AS (post_id:chararray, userid:chararray);
-users = FOREACH test_set GENERATE userid;
-users = FILTER users BY userid IS NOT null;
-users = DISTINCT users;
-
-users_limit = LIMIT users 10;
-
--- Run a set of queries against Vespa
-recommendations = FOREACH users_limit GENERATE userid,
- FLATTEN(BlogPostRecommendations(*)) AS (rank, id, relevance, post_id);
-recommendations = FOREACH recommendations GENERATE userid, rank, post_id;
-
-recommendations = FILTER recommendations BY rank IS NOT NULL AND post_id IS NOT NULL;
-
--- Output recommendations
-STORE recommendations INTO 'data/recommendations' USING PigStorage('\t', '-schema');
--- STORE recommendations INTO 'data/recommendations' USING org.apache.parquet.pig.ParquetStorer();
diff --git a/sample-apps/blog-recommendation/src/pig/tutorial_blog_popularity.pig b/sample-apps/blog-recommendation/src/pig/tutorial_blog_popularity.pig
deleted file mode 100644
index 25c3b4ab711..00000000000
--- a/sample-apps/blog-recommendation/src/pig/tutorial_blog_popularity.pig
+++ /dev/null
@@ -1,56 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER '$VESPA_HADOOP_JAR'
-
--- UDF to create valid Vespa document operation in JSON format
-DEFINE VespaUpdateOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=update',
- 'docid=id:blog-search:blog_post::<post_id>'
- );
-
--- UDF to send data to a Vespa endpoint
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data from any source - here we load using JsonLoader
-data = LOAD '$DATA_PATH' USING
- JsonLoader('date_gmt:chararray,
- language:chararray,
- author:chararray,
- url:chararray,
- title:chararray,
- blog:chararray,
- post_id:chararray,
- tags:{T:(tag_name:chararray)},
- blogname:chararray,
- date:chararray,
- content:chararray,
- categories:{T:(category_name:chararray)},
- likes:{T:(dt:chararray, uid:chararray)}');
-
-data = FILTER data BY likes IS NOT NULL;
-
-data_likes = FOREACH data GENERATE
- blog,
- post_id,
- blogname,
- FLATTEN(likes) AS (dt, uid);
-
--- data_likes_limit = LIMIT data_likes 10;
-
-likes = FOREACH (GROUP data_likes ALL)
- GENERATE COUNT(data_likes) as total_number;
-
-blog_popularity = FOREACH (GROUP data_likes BY blog) GENERATE
- group as blog,
- (double)COUNT(data_likes)/(double)likes.total_number AS popularity;
-
-data_update = JOIN data_likes BY blog, blog_popularity BY blog;
-data_update = FOREACH data_update GENERATE
- post_id, popularity;
-
--- Create valid Vespa put operations in JSON format
-data_for_feed_json = FOREACH data_update GENERATE VespaUpdateOperationDoc(*);
-
--- Store into Vespa
-STORE data_for_feed_json INTO '$ENDPOINT' USING VespaStorage();
diff --git a/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_and_tensor_vespa.pig b/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_and_tensor_vespa.pig
deleted file mode 100644
index b000037f25a..00000000000
--- a/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_and_tensor_vespa.pig
+++ /dev/null
@@ -1,117 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER '$VESPA_HADOOP_JAR'
-
--- Create valid Vespa put operations
-DEFINE VespaPutOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:blog_post::<post_id>',
- 'create-tensor-fields=user_item_cf',
- 'simple-array-fields=tags,categories'
- );
-
-DEFINE VespaPutOperationUser
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-recommendation:user::<user_id>',
- 'create-tensor-fields=user_item_cf',
- 'simple-array-fields=has_read_items'
- );
-
--- Transform tabular data to a Vespa document operation JSON format
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data
-data = LOAD '$DATA_PATH' USING
- JsonLoader('date_gmt:chararray,
- language:chararray,
- author:chararray,
- url:chararray,
- title:chararray,
- blog:chararray,
- post_id:chararray,
- tags:{T:(tag_name:chararray)},
- blogname:chararray,
- date:chararray,
- content:chararray,
- categories:{T:(category_name:chararray)},
- likes:{T:(dt:chararray, uid:chararray)}');
-
-data_for_feed = FOREACH data GENERATE
- date_gmt,
- language,
- author,
- url,
- title,
- blog,
- post_id,
- tags,
- blogname,
- content,
- categories;
-
--- Load Blog post CF latent factors
-data_doc = LOAD '$BLOG_POST_FACTORS' USING
- JsonLoader('post_id:chararray,
- user_item_cf:[double]');
-
--- Join data and latent factors
-data_content_and_doc_tensor = JOIN data_for_feed BY post_id LEFT, data_doc BY post_id;
-data_content_and_doc_tensor = FOREACH data_content_and_doc_tensor GENERATE
- date_gmt AS date_gmt,
- language AS language,
- author AS author,
- url AS url,
- title AS title,
- blog AS blog,
- data_for_feed::post_id as post_id,
- tags AS tags,
- blogname AS blogname,
- content AS content,
- categories AS categories,
- user_item_cf AS user_item_cf,
- (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf;
-
--- Generate valid Vespa JSON format
-data_content_and_doc_tensor_feed = FOREACH data_content_and_doc_tensor GENERATE VespaPutOperationDoc(*);
-
--- Load User CF latent factors
-data_user = LOAD '$USER_FACTORS' USING
- JsonLoader('user_id:chararray,
- user_item_cf:[double]');
-data_user = FOREACH data_user GENERATE
- user_id AS user_id,
- user_item_cf AS user_item_cf;
-
--- Articles already liked
-data_likes = FOREACH data GENERATE post_id, FLATTEN(likes) AS (dt, uid);
-
-post_liked_per_user = GROUP data_likes BY uid;
-post_liked_per_user = FOREACH post_liked_per_user GENERATE
- group AS user_id,
- data_likes.post_id AS has_read_items;
-
--- Join user data
-data_user = JOIN post_liked_per_user BY user_id FULL,
- data_user BY user_id;
-
-data_user = FOREACH data_user GENERATE
- (post_liked_per_user::user_id IS NOT NULL ? post_liked_per_user::user_id : data_user::user_id) AS user_id,
- user_item_cf AS user_item_cf,
- (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf,
- has_read_items AS has_read_items;
-
-data_user = FILTER data_user BY user_id IS NOT NULL;
-
--- Generate valid Vespa JSON format
-data_user_for_feed = FOREACH data_user GENERATE VespaPutOperationUser(*);
-
-joint_content_tensors = UNION data_content_and_doc_tensor_feed, data_user_for_feed;
-
--- Store into Vespa
-STORE joint_content_tensors INTO '$ENDPOINT' USING VespaStorage();
-
-
-
-
diff --git a/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_vespa.pig b/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_vespa.pig
deleted file mode 100644
index 6fff700881a..00000000000
--- a/sample-apps/blog-recommendation/src/pig/tutorial_feed_content_vespa.pig
+++ /dev/null
@@ -1,52 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER '$VESPA_HADOOP_JAR'
--- REGISTER vespa-hadoop.jar
-
--- UDF to create valid Vespa document operation in JSON format
-DEFINE VespaPutOperationDoc
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:blog-search:blog_post::<post_id>',
- 'simple-array-fields=tags,categories'
- );
-
--- UDF to send data to a Vespa endpoint
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data from any source - here we load using JsonLoader
-data = LOAD '$DATA_PATH' USING
- JsonLoader('date_gmt:chararray,
- language:chararray,
- author:chararray,
- url:chararray,
- title:chararray,
- blog:chararray,
- post_id:chararray,
- tags:{T:(tag_name:chararray)},
- blogname:chararray,
- date:chararray,
- content:chararray,
- categories:{T:(category_name:chararray)},
- likes:{T:(dt:chararray, uid:chararray)}');
-
--- Select fields that will be sent to Vespa.
--- This should follow blog_post.sd
-data_for_feed = FOREACH data GENERATE
- date_gmt,
- language,
- author,
- url,
- title,
- blog,
- post_id,
- tags,
- blogname,
- content,
- categories;
-
--- Create valid Vespa put operations in JSON format
-data_for_feed_json = FOREACH data_for_feed GENERATE VespaPutOperationDoc(*);
-
--- Store into Vespa
-STORE data_for_feed_json INTO '$ENDPOINT' USING VespaStorage();
diff --git a/sample-apps/blog-recommendation/src/spark/collaborative_filtering_example.scala b/sample-apps/blog-recommendation/src/spark/collaborative_filtering_example.scala
deleted file mode 100644
index 2118862f063..00000000000
--- a/sample-apps/blog-recommendation/src/spark/collaborative_filtering_example.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-import org.apache.spark.mllib.recommendation.ALS
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
-import org.apache.spark.mllib.recommendation.Rating
-import scala.util.parsing.json.JSONObject
-
-// Load and parse the data
-val data = sc.textFile("blog-recommendation/trainPostsFinal_user_item_cf")
-val ratings = data.map(_.split('\t') match { case Array(user, item, rate) =>
- Rating(user.toInt, item.toInt, rate.toDouble)
-})
-
-// Build the recommendation model using ALS
-val rank = 10
-val numIterations = 10
-val model = ALS.train(ratings, rank, numIterations, 0.01)
-
-// Evaluate the model on rating data
-val usersProducts = ratings.map { case Rating(user, product, rate) =>
- (user, product)
-}
-val predictions =
- model.predict(usersProducts).map { case Rating(user, product, rate) =>
- ((user, product), rate)
- }
-val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
- ((user, product), rate)
-}.join(predictions)
-val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
- val err = (r1 - r2)
- err * err
-}.mean()
-println("Mean Squared Error = " + MSE)
-
-def writeModelFeaturesAsTensor (modelFeatures:(Int, Array[Double]), id_string:String) = {
-
- val id = modelFeatures._1
- val latentVector = modelFeatures._2
- var latentVectorMap:Map[String,Double] = Map()
- var output:Map[String,Any] = Map()
-
- for ( i <- 0 until latentVector.length ){
-
- latentVectorMap += (("user_item_cf:" + i.toString, latentVector(i)))
-
- }
-
- output += ((id_string, id))
- output += (("user_item_cf", scala.util.parsing.json.JSONObject(latentVectorMap)))
-
- JSONObject(output)
-
-}
-
-val product_features = model.productFeatures.map(x => writeModelFeaturesAsTensor(x, "post_id"))
-product_features.saveAsTextFile("blog-recommendation/user_item_cf/product_features")
-val user_features = model.userFeatures.map(x => writeModelFeaturesAsTensor(x, "user_id"))
-user_features.saveAsTextFile("blog-recommendation/user_item_cf/user_features")
-
-
diff --git a/sample-apps/blog-recommendation/src/spark/data_exploration.scala b/sample-apps/blog-recommendation/src/spark/data_exploration.scala
deleted file mode 100644
index d9f3abc9cd8..00000000000
--- a/sample-apps/blog-recommendation/src/spark/data_exploration.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-// sc is an existing SparkContext.
-val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-
-val original_train_post_path = "blog-recommendation-support/data/original_data/trainPosts.json"
-val original_train_post_thin_path = "blog-recommendation-support/data/original_data/trainPostsThin.json"
-val original_test_post_thin_path = "blog-recommendation-support/data/original_data/testPostsThin.json"
-
-val original_train_post = sqlContext.read.json(original_train_post_path)
-val original_train_post_thin = sqlContext.read.json(original_train_post_thin_path)
-val original_test_post_thin = sqlContext.read.json(original_test_post_thin_path)
-
-val count_original_train = original_train_post.count()
-val count_original_train_thin = original_train_post_thin.count()
-val count_original_test_thin = original_test_post_thin.count()
-
-// The inferred schema can be visualized using the printSchema() method.
-original_train_post.printSchema()
-original_train_post_thin.printSchema()
-original_test_post_thin.printSchema()
-
-// No intersection between train and test data
-original_train_post_thin.join(original_test_post_thin, original_train_post_thin("post_id") == original_test_post_thin("post_id")).count(2)
-
-// original_train_minimal_df
-var original_train_minimal_df = original_train_post.select($"date_gmt", $"post_id", size($"likes").as("number_likes"), $"likes")
-// no duplicate post_id
-original_train_minimal_df.select("post_id").dropDuplicates().count() - original_train_minimal_df.select("post_id").count()
-
-// CHECK THIS DECISION - I SHOULD NOT EXLUDE POST_ID WITH ZERO LIKES
-// OTHERWISE THERE WILL BE NO DOCUMENT IN THE TEST SET THAT NO ONE HAS LIKED,
-// WHICH MAKES THE EXERCISE MUCH EASIER
-// only post_id with at least one like
-// original_train_minimal_df = original_train_minimal_df.filter("number_likes > 0")
-
-// Set some post_id aside to be present only on the test set
-var sets = original_train_minimal_df.randomSplit(Array(0.95, 0.05), 123)
-
-var training_set = sets(0)
-var test_set = sets(1)
-
-// flat dataframe so that each line is a combination of post_id and user
-training_set = training_set.select($"post_id", explode($"likes").as("likes_flat"))
-training_set = training_set.select("post_id", "likes_flat.uid")
-
-test_set = test_set.select($"post_id", explode($"likes").as("likes_flat"))
-test_set = test_set.select("post_id", "likes_flat.uid")
-
-// randomly move some (post_id, uid) from training set to test set
-sets = training_set.randomSplit(Array(0.85, 0.15), 123)
-
-training_set = sets(0)
-var additional_test_set = sets(1)
-
-// concatenate test_set and additional_test_set
-test_set = test_set.unionAll(additional_test_set)
-
-// see number of likes distribution
-val like_dist = original_train_minimal_df.groupBy("number_likes").count().orderBy(asc("number_likes")).collect()
-like_dist.map(println)
-
-
-
-
diff --git a/sample-apps/blog-recommendation/src/spark/expected_percentile.scala b/sample-apps/blog-recommendation/src/spark/expected_percentile.scala
deleted file mode 100644
index b1fd05f6f04..00000000000
--- a/sample-apps/blog-recommendation/src/spark/expected_percentile.scala
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-val test_file_path = "data/cv/test_set_exploded"
-val blog_recom_file_path = "data/recommendations"
-val size_recommendation_list = 100
-
-val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-
-val test_set = sc.textFile(test_file_path).
- map(_.split("\t")).map(p => (p(0).toString, p(1).toString)).
- toDF("post_id", "user_id")
-
-val recommendations = sc.textFile(blog_recom_file_path).
- map(_.split("\t")).map(p => (p(0).toString, p(1).toString, p(2).toString)).
- toDF("user_id", "rank", "post_id")
-
-// val recommendations = sqlContext.createDataFrame(Seq(
-// ("16966742", "5", "1009088"),
-// ("30463255", "10", "1044974")
-// )).toDF("user_id", "rank", "post_id")
-
-// join data
-var joined_data = test_set.
- join(recommendations,
- test_set("post_id") === recommendations("post_id") &&
- test_set("user_id") === recommendations("user_id")).
- select(test_set("post_id"),
- test_set("user_id"),
- recommendations("rank"))
-
-// transform and add a column
-joined_data = joined_data.withColumn("percentile", joined_data("rank")/size_recommendation_list)
-
-val expected_percentile = joined_data.
- // groupBy($"user_id").
- groupBy().
- agg(sum($"percentile").as("sum_percentile"),
- count($"post_id").as("number_read")).
- withColumn("expected_percentile", $"sum_percentile" / $"number_read")
-
-expected_percentile.show()
diff --git a/sample-apps/blog-recommendation/src/spark/full_dataset_cf.scala b/sample-apps/blog-recommendation/src/spark/full_dataset_cf.scala
deleted file mode 100644
index 405105eb663..00000000000
--- a/sample-apps/blog-recommendation/src/spark/full_dataset_cf.scala
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-import org.apache.spark.mllib.recommendation.ALS
-import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
-import org.apache.spark.mllib.recommendation.Rating
-import scala.util.parsing.json.JSONObject
-
-// Prepare data
-
-val data_path = "data/original_data/trainPosts.json"
-
-val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-
-val full_dataset = sqlContext.read.json(data_path)
-
-var data = full_dataset.select($"post_id", explode($"likes").as("likes_flat"))
-data = data.select($"likes_flat.uid".as("uid"), $"post_id")
-
-data = data.filter("uid is not null and uid != '' and post_id is not null and post_id != ''")
-
-val ratings = data.rdd.map(x => (x(0).toString, x(1).toString) match {
- case (user, item) => Rating(user.toInt, item.toInt, 1)
-})
-
-// Train the model
-
-val rank = 10
-val numIterations = 10
-val model = ALS.train(ratings, rank, numIterations, 0.01)
-
-// Convert latent vectors from model to Vespa Tensor model
-
-def writeModelFeaturesAsTensor (modelFeatures:(Int, Array[Double]), id_string:String) = {
-
- val id = modelFeatures._1
- val latentVector = modelFeatures._2
- var latentVectorMap:Map[String,Double] = Map()
- var output:Map[String,Any] = Map()
-
- for ( i <- 0 until latentVector.length ){
-
- latentVectorMap += (("user_item_cf:" + i.toString, latentVector(i)))
-
- }
-
- output += ((id_string, id))
- output += (("user_item_cf", scala.util.parsing.json.JSONObject(latentVectorMap)))
-
- JSONObject(output)
-
-}
-
-// Write user and item latent factors to disk
-
-val product_features = model.productFeatures.map(x => writeModelFeaturesAsTensor(x, "post_id"))
-product_features.saveAsTextFile("data/user_item_cf/product_features")
-val user_features = model.userFeatures.map(x => writeModelFeaturesAsTensor(x, "user_id"))
-user_features.saveAsTextFile("data/user_item_cf/user_features")
-
-
-
-
diff --git a/sample-apps/blog-recommendation/src/spark/train_test_set_division.scala b/sample-apps/blog-recommendation/src/spark/train_test_set_division.scala
deleted file mode 100644
index 224ea74cb4e..00000000000
--- a/sample-apps/blog-recommendation/src/spark/train_test_set_division.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-import org.apache.spark.sql.functions.udf
-
-// Inputs
-val input_file_path = "data/original_data/trainPosts.json"
-val test_perc_stage1 = 0.05
-val test_perc_stage2 = 0.15
-val training_file_path = "data/cv/training_set_exploded"
-val test_file_path = "data/cv/test_set_exploded"
-val seed = 123
-
-val sqlContext = new org.apache.spark.sql.SQLContext(sc)
-
-// Load full dataset
-val full_dataset = sqlContext.read.json(input_file_path)
-val full_dataset_simple = full_dataset.select($"post_id", size($"likes").as("number_likes"), $"likes")
-
-// Set some blog posts aside to be present only on the test set
-var sets = full_dataset_simple.randomSplit(Array(1 - test_perc_stage1, test_perc_stage1), seed)
-
-var training_set = sets(0)
-val training_set_null = training_set.filter("number_likes == 0")
-var training_set_exploded = training_set.select($"post_id", explode($"likes").as("likes_flat"))
-training_set_exploded = training_set_exploded.select("post_id", "likes_flat.uid")
-
-var test_set = sets(1)
-val test_set_null = test_set.filter("number_likes == 0")
-var test_set_exploded = test_set.select($"post_id", explode($"likes").as("likes_flat"))
-test_set_exploded = test_set_exploded.select("post_id", "likes_flat.uid")
-
-// randomly move some (post_id, uid) from training set to test set
-sets = training_set_exploded.randomSplit(Array(1 - test_perc_stage2, test_perc_stage2), seed)
-
-training_set_exploded = sets(0)
-
-var additional_test_set_exploded = sets(1)
-test_set_exploded = test_set_exploded.unionAll(additional_test_set_exploded)
-
-// concatenate exploded set with null set
-val getNull = udf(() => None: Option[String])
-training_set_exploded = training_set_exploded.unionAll(training_set_null.select("post_id").withColumn("uid", getNull()))
-test_set_exploded = test_set_exploded.unionAll(test_set_null.select("post_id").withColumn("uid", getNull()))
-
-// Write to disk
-training_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(training_file_path)
-test_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(test_file_path)