Sample apps moved to vespa-engine/sample-apps

author: Kristian Aune <kraune@yahoo-inc.com> 2017-08-18 09:14:46 +0200
committer: Kristian Aune <kraune@yahoo-inc.com> 2017-08-18 09:14:46 +0200
commit: e36ce286e3cc2026fe0ccbbf0e116477b23c70e1 (patch)
tree: 3d3dadc8869629d7ebc7d019a400fc641a072c4c /sample-apps/blog-tutorial-shared/src
parent: a618787ba847eed7c30f6bd81256db8d39e5e2c4 (diff)
22 files changed, 0 insertions, 1554 deletions
diff --git a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R b/sample-apps/blog-tutorial-shared/src/R/generateDataset.R
deleted file mode 100644
index d69cd5ba825..00000000000
--- a/sample-apps/blog-tutorial-shared/src/R/generateDataset.R
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-library(jsonlite)
-library(dplyr)
-
-file_path_document <- 'blog-job/user_item_cf_cv/product.json'
-file_path_user <- 'blog-job/user_item_cf_cv/user.json'
-file_path_train <- 'blog-job/training_and_test_indices/train.txt'
-output_file <- 'blog-job/nn_model/training_set.txt'
-
-# get ids from documents that have a latent vector
-lines <- readLines(file_path_document)
-product_ids <- NULL
-for (line in lines){
-  product_ids <- c(product_ids, fromJSON(txt=line)$post_id)
-}
-
-# get ids from users that have a latent vector
-lines <- readLines(file_path_user)
-user_ids <- NULL
-for (line in lines){
-  user_ids <- c(user_ids, fromJSON(txt=line)$user_id)
-}
-
-# read (product, user) ids used for training
-train_ids <- read.delim(file = file_path_train, header = FALSE, stringsAsFactors = FALSE)
-colnames(train_ids) <- c("product_id", "user_id")
-
-# filter out product id and user id that does not have latent vectors
-temp <- merge(x = train_ids, y = data.frame(product_id = product_ids))
-final_positive_train_ids <- merge(x = temp, y = data.frame(user_id = user_ids))
-
-# add positive labels
-final_positive_train_ids <- data.frame(final_positive_train_ids, label = 1)
-
-# add noise to the data
-clicks_per_user <- final_positive_train_ids %>% group_by(user_id) %>% summarise(number_clicks = sum(label))
-
-unread_proportion <- 10
-unread_products <- matrix(NA, unread_proportion*sum(clicks_per_user$number_clicks), 3)
-colnames(unread_products) <- c("user_id", "product_id", "label")
-count <- 0
-for (i in 1:nrow(clicks_per_user)){
-  print(paste(i, "/ ", nrow(clicks_per_user)))
-  number_itens <- unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"])
-  row_index <- count + 1:number_itens
-  count <- count + number_itens
-  user_id <- clicks_per_user[i, "user_id"]
-  new_samples <- sample(x = product_ids, size = unread_proportion * as.numeric(clicks_per_user[i, "number_clicks"]), replace = FALSE)
-  unread_products[row_index, ] <- matrix(c(rep(as.numeric(user_id), number_itens), new_samples, rep(0, number_itens)), ncol = 3)
-}
-
-# create final dataset
-final_train_ids <- rbind(final_positive_train_ids, data.frame(unread_products))
-duplicated_rows <- duplicated(x = final_train_ids[, c("user_id", "product_id")])
-final_train_ids <- final_train_ids[!duplicated_rows, ]
-
-write.table(x = final_train_ids, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE)
diff --git a/sample-apps/blog-tutorial-shared/src/__init__.py b/sample-apps/blog-tutorial-shared/src/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/__init__.py
+++ /dev/null
diff --git a/sample-apps/blog-tutorial-shared/src/main/__init__.py b/sample-apps/blog-tutorial-shared/src/main/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/__init__.py
+++ /dev/null
diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig
deleted file mode 100644
index 61ca9bc7cb2..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_compute_metric.pig
+++ /dev/null
@@ -1,43 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER $VESPA_HADOOP_JAR
-
-DEFINE  BlogPostRecommendations
-        com.yahoo.vespa.hadoop.pig.VespaQuery(
-            'query=http://$ENDPOINT/search/?user_id=<user_id>&hits=$NUMBER_RECOMMENDATIONS&ranking=$RANKING_NAME',
-            'schema=rank:int,id:chararray,relevance:double,fields/post_id:chararray'
-        );
-
--- Load test_set data from a local file
-test_indices = LOAD '$TEST_INDICES' AS (post_id:chararray, user_id:chararray);
-users = FOREACH test_indices GENERATE user_id;
-users = FILTER users BY user_id IS NOT null;
-users = DISTINCT users;
-
--- Run a set of queries against Vespa
-recommendations = FOREACH users GENERATE user_id, 
-                                         FLATTEN(BlogPostRecommendations(*)) AS (rank, id, relevance, post_id);
-recommendations = FOREACH recommendations GENERATE user_id, rank, post_id;
-recommendations = FILTER recommendations BY rank IS NOT NULL AND post_id IS NOT NULL;                                              
-
--- join data
-joined_data = JOIN test_indices BY (post_id, user_id), recommendations BY (post_id, user_id);
-joined_data = FOREACH joined_data GENERATE 
-  test_indices::post_id AS post_id, 
-  test_indices::user_id AS user_id, 
-  rank;
-
--- transform and add a column
-joined_data = FOREACH joined_data 
-  GENERATE post_id, 
-           user_id, 
-           rank, 
-           (double)rank/(double)$NUMBER_RECOMMENDATIONS AS percentile;
-
-grouped_data = GROUP joined_data BY user_id;
-grouped_data = FOREACH grouped_data 
-  GENERATE group AS user_id,
-           SUM(joined_data.percentile) AS sum_percentile,
-           COUNT(joined_data.post_id) AS number_read,
-           (double)SUM(joined_data.percentile)/(double)COUNT(joined_data.post_id) AS expected_percentile;
-
-STORE grouped_data INTO '$OUTPUT';
diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig
deleted file mode 100644
index 50152318d04..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_and_tensor_vespa.pig
+++ /dev/null
@@ -1,135 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER '$VESPA_HADOOP_JAR'
-
--- Create valid Vespa put operations
-DEFINE VespaPutOperationDoc
-       com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
-            'operation=put',
-            'docid=id:blog-recommendation:blog_post::<post_id>',
-            'create-tensor-fields=user_item_cf',
-            'simple-array-fields=tags,categories'
-       );
-
-DEFINE VespaPutOperationUser
-       com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
-            'operation=put',
-            'docid=id:blog-recommendation:user::<user_id>',
-            'create-tensor-fields=user_item_cf',
-            'simple-array-fields=has_read_items'
-       );
-
--- Transform tabular data to a Vespa document operation JSON format
-DEFINE VespaStorage
-       com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data 
-data = LOAD '$DATA_PATH' USING 
-  JsonLoader('date_gmt:chararray, 
-              language:chararray, 
-              author:chararray, 
-              url:chararray, 
-              title:chararray, 
-              blog:chararray, 
-              post_id:chararray, 
-              tags:{T:(tag_name:chararray)}, 
-              blogname:chararray, 
-              date:chararray, 
-              content:chararray, 
-              categories:{T:(category_name:chararray)}, 
-              likes:{T:(dt:chararray, uid:chararray)}');
-
-data_for_feed = FOREACH data GENERATE 
-  date_gmt, 
-  language, 
-  author, 
-  url, 
-  title, 
-  blog, 
-  post_id, 
-  tags, 
-  blogname,
-  content, 
-  categories;
-
--- Feed only blog posts that belong to test set
-test_indices = LOAD '$TEST_INDICES' AS (post_id, user_id);
-test_indices = FOREACH test_indices GENERATE post_id;
-test_indices = DISTINCT test_indices;
-
-test_data_for_feed = FOREACH (JOIN data_for_feed BY post_id, test_indices BY post_id) 
-  GENERATE date_gmt AS date_gmt, 
-           language AS language, 
-           author AS author, 
-           url AS url, 
-           title AS title, 
-           blog AS blog, 
-           data_for_feed::post_id AS post_id, 
-           tags AS tags, 
-           blogname AS blogname,
-           content AS content, 
-           categories AS categories;
-
--- Load Blog post CF latent factors
-data_doc = LOAD '$BLOG_POST_FACTORS' USING 
-  JsonLoader('post_id:chararray, 
-              user_item_cf:[double]');
-
--- Join data and latent factors
-data_content_and_doc_tensor = JOIN test_data_for_feed BY post_id LEFT, data_doc BY post_id;
-data_content_and_doc_tensor = FOREACH data_content_and_doc_tensor GENERATE
-  date_gmt AS date_gmt, 
-  language AS language, 
-  author AS author, 
-  url AS url, 
-  title AS title, 
-  blog AS blog, 
-  test_data_for_feed::post_id as post_id, 
-  tags AS tags, 
-  blogname AS blogname,
-  content AS content, 
-  categories AS categories,
-  user_item_cf AS user_item_cf,
-  (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf;
-
--- Generate valid Vespa JSON format
-data_content_and_doc_tensor_feed = FOREACH data_content_and_doc_tensor GENERATE VespaPutOperationDoc(*);
-
--- Load User CF latent factors
-data_user = LOAD '$USER_FACTORS' USING 
-  JsonLoader('user_id:chararray, 
-              user_item_cf:[double]');
-data_user = FOREACH data_user GENERATE  
-  user_id AS user_id,
-  user_item_cf AS user_item_cf;
-  
--- Articles already liked
-data_likes = FOREACH data GENERATE post_id, FLATTEN(likes) AS (dt, uid); 
-
-post_liked_per_user = GROUP data_likes BY uid;
-post_liked_per_user = FOREACH post_liked_per_user GENERATE 
-  group AS user_id, 
-  data_likes.post_id AS has_read_items;
-
--- Join user data
-data_user = JOIN post_liked_per_user BY user_id FULL, 
-                 data_user BY user_id;
-
-data_user = FOREACH data_user GENERATE 
-  (post_liked_per_user::user_id IS NOT NULL ? post_liked_per_user::user_id : data_user::user_id) AS user_id,
-  user_item_cf AS user_item_cf,
-  (user_item_cf IS NOT NULL ? 1 : 0) AS has_user_item_cf,
-  has_read_items AS has_read_items;
-
-data_user = FILTER data_user BY user_id IS NOT NULL;
-
--- Generate valid Vespa JSON format
-data_user_for_feed = FOREACH data_user GENERATE VespaPutOperationUser(*);
-
-joint_content_tensors = UNION data_content_and_doc_tensor_feed, data_user_for_feed;
-
--- Store into Vespa
-STORE joint_content_tensors INTO '$ENDPOINT' USING VespaStorage();
-
-
-
-
diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig
deleted file mode 100644
index 62b8a676cfc..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_feed_content_vespa.pig
+++ /dev/null
@@ -1,51 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER '$VESPA_HADOOP_JAR'
-
--- UDF to create valid Vespa document operation in JSON format
-DEFINE VespaPutOperationDoc
-       com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
-            'operation=put',
-            'docid=id:blog-search:blog_post::<post_id>',
-            'simple-array-fields=tags,categories'
-       );
-
--- UDF to send data to a Vespa endpoint
-DEFINE VespaStorage
-       com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data from any source - here we load using JsonLoader
-data = LOAD '$DATA_PATH' USING
-  JsonLoader('date_gmt:chararray,
-              language:chararray,
-              author:chararray,
-              url:chararray,
-              title:chararray,
-              blog:chararray,
-              post_id:chararray,
-              tags:{T:(tag_name:chararray)},
-              blogname:chararray,
-              date:chararray,
-              content:chararray,
-              categories:{T:(category_name:chararray)},
-              likes:{T:(dt:chararray, uid:chararray)}');
-
--- Select fields that will be sent to Vespa.
--- This should follow blog_post.sd
-data_for_feed = FOREACH data GENERATE
-  date_gmt,
-  language,
-  author,
-  url,
-  title,
-  blog,
-  post_id,
-  tags,
-  blogname,
-  content,
-  categories;
-
--- Create valid Vespa put operations in JSON format
-data_for_feed_json = FOREACH data_for_feed GENERATE VespaPutOperationDoc(*);
-
--- Store into Vespa
-STORE data_for_feed_json INTO '$ENDPOINT' USING VespaStorage();
diff --git a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig b/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig
deleted file mode 100644
index dcaffabba2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/pig/tutorial_get_recommendation_list.pig
+++ /dev/null
@@ -1,22 +0,0 @@
--- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-REGISTER $VESPA_HADOOP_JAR
-
-DEFINE  BlogPostRecommendations
-        com.yahoo.vespa.hadoop.pig.VespaQuery(
-            'query=http://$ENDPOINT/search/?user_id=<user_id>&hits=$NUMBER_RECOMMENDATIONS&ranking=$RANKING_NAME',
-            'schema=rank:int,id:chararray,relevance:double,fields/post_id:chararray'
-        );
-
--- Load test_set data from a local file
-test_indices = LOAD '$TEST_INDICES' AS (post_id:chararray, user_id:chararray);
-users = FOREACH test_indices GENERATE user_id;
-users = FILTER users BY user_id IS NOT null;
-users = DISTINCT users;
-
--- Run a set of queries against Vespa
-recommendations = FOREACH users GENERATE user_id, 
-                                         FLATTEN(BlogPostRecommendations(*)) AS (rank, id, relevance, post_id);
-recommendations = FOREACH recommendations GENERATE user_id, rank, post_id;
-recommendations = FILTER recommendations BY rank IS NOT NULL AND post_id IS NOT NULL;                                              
-
-STORE recommendations INTO '$OUTPUT';
diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala
deleted file mode 100644
index 30be7da2727..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/BlogRecommendationApp.scala
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.apache.spark.sql.SparkSession
-
-object BlogRecommendationApp {
-  val usage = """
-    Usage: spark-submit \
-                |  --class "BlogRecommendationApp" \
-                |  --master local[4] \
-                |  JAR_FILE
-                |  --task task_command [TASK RELATED OPTIONS]
-
-           spark-submit \
-                |  --class "BlogRecommendationApp" \
-                |  --master local[4] \
-                |  JAR_FILE
-                |  --task collaborative_filtering
-                |  --input_file path
-                |  --rank value
-                |  --numIterations value
-                |  --lambda value
-                |  --output_path path
-
-           spark-submit \
-                |  --class "BlogRecommendationApp" \
-                |  --master local[4] \
-                |  JAR_FILE
-                |  --task collaborative_filtering_cv
-                |  --input_file path
-                |  --numIterations value
-                |  --output_path path
-                |
-
-           spark-submit \
-                |  --class "BlogRecommendationApp" \
-                |  --master local[4] \
-                |  JAR_FILE
-                |  --task split_set
-                |  --input_file path
-                |  --test_perc_stage1 value
-                |  --test_perc_stage2 value
-                |  --seed value
-                |  --output_path path
-              """
-
-  private val COLLABORATIVE_FILTERING = "collaborative_filtering"
-  private val COLLABORATIVE_FILTERING_CV = "collaborative_filtering_cv"
-  private val SPLIT_SET_INTO_TRAIN_AND_TEST = "split_set"
-
-  type OptionMap = Map[Symbol, Any]
-
-  def main(args: Array[String]) {
-
-    val options = parseCommandLineOptions(args)
-    val task_name = options('task).toString
-
-    task_name match {
-      case COLLABORATIVE_FILTERING => CollaborativeFilteringExample(options)
-      case COLLABORATIVE_FILTERING_CV => CollaborativeFilteringCV(options)
-      case SPLIT_SET_INTO_TRAIN_AND_TEST => SplitSetIntoTrainingAndTestSets(options)
-    }
-
-  }
-
-  private def SplitSetIntoTrainingAndTestSets(options: OptionMap) = {
-
-    val spark = SparkSession
-      .builder()
-      .appName("Split Full Data Into Train and Test Sets")
-      .getOrCreate()
-
-    val splitter = new SplitFullSetIntoTrainAndTestSets(spark)
-
-    val sets = splitter.run(input_file_path = options('input_file).toString,
-      test_perc_stage1 = options('test_perc_stage1).toString.toDouble,
-      test_perc_stage2 = options('test_perc_stage2).toString.toDouble,
-      seed = options('seed).toString.toInt)
-
-    SplitFullSetIntoTrainAndTestSets.writeTrainAndTestSetsIndices(sets, options('output_path).toString)
-
-  }
-
-  private def CollaborativeFilteringExample(options: OptionMap) = {
-
-    // TODO: Check if output_path already exist
-
-    val spark = SparkSession
-      .builder()
-      .appName("Collaborative Filtering")
-      .getOrCreate()
-
-    val cf = new CollaborativeFiltering(spark)
-
-    val model = cf.run(
-      input_path = options('input_file).toString,
-      rank = options('rank).toString.toInt,
-      numIterations = options('num_iterations).toString.toInt,
-      lambda = options('lambda).toString.toDouble)
-
-    CollaborativeFiltering.writeFeaturesAsVespaTensorText(model, options('output_path).toString)
-
-  }
-
-  private def CollaborativeFilteringCV(options: OptionMap) = {
-
-    // TODO: Check if output_path already exist
-
-    val spark = SparkSession
-      .builder()
-      .appName("Collaborative Filtering CV")
-      .getOrCreate()
-
-    val cf = new CollaborativeFiltering(spark)
-
-    val model = cf.run_pipeline(
-      input_path = options('input_file).toString,
-      numIterations = options('num_iterations).toString.toInt)
-
-    CollaborativeFiltering.writeFeaturesAsVespaTensorText(model, options('output_path).toString)
-
-  }
-
-  private def parseCommandLineOptions(args: Array[String]): OptionMap = {
-
-    def findTask(list: List[String]) : String = {
-      list match {
-        case Nil => println("Please, define a valid task" + "\n" + usage)
-          sys.exit(1)
-        case "--task" :: value :: tail =>
-          value
-        case option :: tail => findTask(tail)
-      }
-    }
-
-    def ParseCollaborativeFilteringOptions(map : OptionMap, list: List[String]) : OptionMap = {
-      list match {
-        case Nil => map
-        case "--input_file" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('input_file -> value.toString), tail)
-        case "--rank" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('rank -> value.toInt), tail)
-        case "--numIterations" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('num_iterations -> value.toInt), tail)
-        case "--lambda" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('lambda -> value.toDouble), tail)
-        case "--output_path" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('output_path -> value.toString), tail)
-        case option :: tail =>
-          ParseCollaborativeFilteringOptions(map, tail)
-      }
-    }
-
-    def ParseCollaborativeFilteringCVOptions(map : OptionMap, list: List[String]) : OptionMap = {
-      list match {
-        case Nil => map
-        case "--input_file" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('input_file -> value.toString), tail)
-        case "--numIterations" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('num_iterations -> value.toInt), tail)
-        case "--output_path" :: value :: tail =>
-          ParseCollaborativeFilteringOptions(map ++ Map('output_path -> value.toString), tail)
-        case option :: tail =>
-          ParseCollaborativeFilteringOptions(map, tail)
-      }
-    }
-
-    def ParseSplitSetOptions(map : OptionMap, list: List[String]) : OptionMap = {
-      list match {
-        case Nil => map
-        case "--input_file" :: value :: tail =>
-          ParseSplitSetOptions(map ++ Map('input_file -> value.toString), tail)
-        case "--test_perc_stage1" :: value :: tail =>
-          ParseSplitSetOptions(map ++ Map('test_perc_stage1 -> value.toDouble), tail)
-        case "--test_perc_stage2" :: value :: tail =>
-          ParseSplitSetOptions(map ++ Map('test_perc_stage2 -> value.toDouble), tail)
-        case "--seed" :: value :: tail =>
-          ParseSplitSetOptions(map ++ Map('seed -> value.toInt), tail)
-        case "--output_path" :: value :: tail =>
-          ParseSplitSetOptions(map ++ Map('output_path -> value.toString), tail)
-        case option :: tail =>
-          ParseSplitSetOptions(map , tail)
-      }
-    }
-
-    if (args.length == 0) println(usage)
-    val arglist = args.toList
-
-    val task_name = findTask(arglist)
-
-    val options = task_name match {
-      case COLLABORATIVE_FILTERING => ParseCollaborativeFilteringOptions(Map('task -> task_name), arglist)
-      case COLLABORATIVE_FILTERING_CV => ParseCollaborativeFilteringCVOptions(Map('task -> task_name), arglist)
-      case SPLIT_SET_INTO_TRAIN_AND_TEST => ParseSplitSetOptions(Map('task -> task_name), arglist)
-    }
-
-    options
-
-  }
-
-}
-
-
diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala
deleted file mode 100644
index 56828c828a5..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala
+++ /dev/null
@@ -1,160 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.apache.spark.ml.recommendation.{ALS, ALSModel}
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
-import org.apache.spark.mllib.recommendation.Rating
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.IntegerType
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.{Row, DataFrame}
-import org.apache.spark.sql.functions.{col, explode}
-
-import scala.collection.mutable
-import scala.util.parsing.json.JSONObject
-
-class CollaborativeFiltering(val ss: SparkSession) {
-
-  import ss.implicits._
-
-  def loadTrainingIndicesIntoDataFrame(input_path: String) = {
-
-    val ratings = ss.sparkContext.textFile(input_path)
-      .map(_.split("\t"))
-      .map(p => (p(0), p(1), 1))
-      .toDF("post_id", "user_id", "label")
-      .filter(col("post_id").notEqual("null"))
-      .filter(col("user_id").notEqual("null"))
-      .select(col("post_id").cast(IntegerType).as("post_id"),
-              col("user_id").cast(IntegerType).as("user_id"),
-              col("label").cast(IntegerType).as("label"))
-
-    ratings
-
-  }
-
-  def loadDataIntoDataFrame(input_path: String): DataFrame = {
-
-    val dataset = ss.read.json(input_path)
-
-    val setOne = udf(() => 1)
-
-    val ratings = dataset.select(col("post_id").cast(IntegerType).as("post_id"),
-                                 explode(col("likes")).as("likes_flat"))
-      .select(col("post_id"), col("likes_flat.uid").cast(IntegerType).as("user_id"))
-      .withColumn("label", setOne())
-
-    ratings
-
-  }
-
-  def loadDataIntoRating(input_path: String): RDD[Rating] = {
-
-    val dataset: DataFrame = ss.read.json(input_path)
-
-    val ratings = dataset.select(col("post_id"), explode(col("likes")).as("likes_flat"))
-      .select(col("post_id"), col("likes_flat.uid").as("user_id"))
-      .rdd.map {
-      case Row(post_id: String,
-      user_id: String) =>
-        Rating(user_id.toInt, post_id.toInt, 1)
-    }
-
-    ratings
-
-  }
-
-  def run(input_path: String, rank: Int, numIterations: Int, lambda: Double): ALSModel = {
-
-    // Loading and preparing the data
-    val ratings = loadTrainingIndicesIntoDataFrame(input_path)
-
-    // Fitting the model
-    val model = new ALS()
-      .setItemCol("post_id")
-      .setRatingCol("label")
-      .setUserCol("user_id")
-      .setImplicitPrefs(true)
-      .setAlpha(lambda)
-      .setMaxIter(numIterations)
-      .setRank(rank)
-      .fit(ratings)
-
-    model
-
-  }
-
-  def run_pipeline(input_path: String, numIterations: Int): ALSModel = {
-
-    // Loading and preparing the data
-    val ratings = loadTrainingIndicesIntoDataFrame(input_path)
-
-    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
-    val collaborative_filtering = new ALS()
-      .setItemCol("post_id")
-      .setRatingCol("label")
-      .setUserCol("user_id")
-      .setMaxIter(numIterations)
-
-    val paramGrid = new ParamGridBuilder()
-      .addGrid(collaborative_filtering.rank, Array(10, 50, 100))
-      .addGrid(collaborative_filtering.alpha, Array(0.001, 0.01, 0.1))
-      .build()
-
-    val cv = new CrossValidator()
-      .setEstimator(collaborative_filtering)
-      .setEvaluator(new RegressionEvaluator)
-      .setEstimatorParamMaps(paramGrid)
-      .setNumFolds(2)  // Use 3+ in practice
-
-    // Run cross-validation, and choose the best set of parameters.
-    val cvModel = cv.fit(ratings)
-
-    cvModel.bestModel.asInstanceOf[ALSModel]
-
-  }
-
-}
-
-object CollaborativeFiltering {
-
-  def writeModelFeaturesAsTensor[T] (modelFeatures:(Int, mutable.WrappedArray[T]), id_string:String) = {
-
-    val id = modelFeatures._1
-    val latentVector = modelFeatures._2
-    var latentVectorMap:Map[String,T] = Map()
-    var output:Map[String,Any] = Map()
-
-    for ( i <- latentVector.indices ){
-
-      latentVectorMap += (("user_item_cf:" + i.toString, latentVector(i)))
-
-    }
-
-    output += ((id_string, id))
-    output += (("user_item_cf", JSONObject(latentVectorMap)))
-
-    JSONObject(output)
-
-  }
-
-  def writeFeaturesAsVespaTensorText(model: ALSModel, output_path: String): Unit ={
-
-    model
-      .itemFactors.rdd
-      .map {
-        case Row(id: Int, features: mutable.WrappedArray[Double]) => writeModelFeaturesAsTensor((id, features), "post_id")
-      }
-      .saveAsTextFile(output_path + "/product_features")
-    model
-      .userFactors.rdd
-      .map {
-        case Row(id: Int, features: mutable.WrappedArray[Double]) => writeModelFeaturesAsTensor((id, features), "user_id")
-      }
-      .saveAsTextFile(output_path + "/user_features")
-
-  }
-
-}
diff --git a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala b/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala
deleted file mode 100644
index feff388618e..00000000000
--- a/sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSets.scala
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.apache.spark.sql.{SparkSession, DataFrame}
-import org.apache.spark.sql.functions.udf
-import org.apache.spark.sql.functions._
-
-class SplitFullSetIntoTrainAndTestSets(val ss: SparkSession) {
-
-  private def loadAndSimplifyFullDataset(input_file_path: String): DataFrame = {
-
-    // Load full dataset
-    val full_dataset = ss.read.json(input_file_path)
-    val full_dataset_simple = full_dataset.select(col("post_id"), size(col("likes")).as("number_likes"), col("likes"))
-
-    full_dataset_simple
-
-  }
-
-  private def splitSimplifiedDatasetIntoTrainAndTestSets(full_dataset_simple: DataFrame,
-                                                         test_perc_stage1: Double,
-                                                         test_perc_stage2: Double,
-                                                         seed: Int): Array[DataFrame] = {
-
-    // Set some blog posts aside to be present only on the test set
-    var sets = full_dataset_simple.randomSplit(Array(1 - test_perc_stage1, test_perc_stage1), seed)
-
-    val training_set = sets(0)
-    val training_set_null = training_set.filter("number_likes = 0")
-    var training_set_exploded = training_set.select(col("post_id"), explode(col("likes")).as("likes_flat"))
-    training_set_exploded = training_set_exploded.select("post_id", "likes_flat.uid")
-
-    val test_set = sets(1)
-    val test_set_null = test_set.filter("number_likes = 0")
-    var test_set_exploded = test_set.select(col("post_id"), explode(col("likes")).as("likes_flat"))
-    test_set_exploded = test_set_exploded.select("post_id", "likes_flat.uid")
-
-    // randomly move some (post_id, uid) from training set to test set
-    sets = training_set_exploded.randomSplit(Array(1 - test_perc_stage2, test_perc_stage2), seed)
-
-    training_set_exploded = sets(0)
-
-    val additional_test_set_exploded = sets(1)
-    test_set_exploded = test_set_exploded.union(additional_test_set_exploded)
-
-    // concatenate exploded set with null set
-    val getNull = udf(() => None: Option[String])
-    training_set_exploded = training_set_exploded.union(training_set_null.select("post_id").withColumn("uid", getNull()))
-    test_set_exploded = test_set_exploded.union(test_set_null.select("post_id").withColumn("uid", getNull()))
-
-    Array(training_set_exploded, test_set_exploded)
-
-  }
-
-  def run(input_file_path: String, test_perc_stage1: Double, test_perc_stage2:Double, seed: Int): Array[DataFrame] = {
-
-    val full_dataset_simple = loadAndSimplifyFullDataset(input_file_path)
-
-    splitSimplifiedDatasetIntoTrainAndTestSets(full_dataset_simple,
-      test_perc_stage1,
-      test_perc_stage2,
-      seed)
-
-  }
-
-}
-
-object SplitFullSetIntoTrainAndTestSets {
-
-  def writeTrainAndTestSetsIndices(train_and_test_sets: Array[DataFrame], output_path: String): Unit = {
-
-    val training_set_exploded = train_and_test_sets(0)
-    val test_set_exploded = train_and_test_sets(1)
-
-    // Write to disk
-    training_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(output_path + "/training_set_ids")
-    test_set_exploded.rdd.map(x => x(0) + "\t" + x(1)).saveAsTextFile(output_path + "/testing_set_ids")
-
-  }
-
-}
diff --git a/sample-apps/blog-tutorial-shared/src/python/__init__.py b/sample-apps/blog-tutorial-shared/src/python/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/python/__init__.py
+++ /dev/null
diff --git a/sample-apps/blog-tutorial-shared/src/python/parse.py b/sample-apps/blog-tutorial-shared/src/python/parse.py
deleted file mode 100644
index 207f8a14740..00000000000
--- a/sample-apps/blog-tutorial-shared/src/python/parse.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-import json
-import argparse
-
-
-class KaggleRawDataParser:
-
-    popularity = False
-    raw_data_file = None
-    total_number_of_likes = 0
-    likes_per_blog = {}
-
-    def __init__(self):
-        parser = argparse.ArgumentParser()
-        parser.add_argument("-p", "--popularity", action="store_true", help="add 'popularity' field")
-        parser.add_argument("file", help="location of file to be parsed")
-        args = parser.parse_args()
-
-        self.popularity = args.popularity
-        self.raw_data_file = args.file
-
-    def main(self):
-        if self.popularity:
-            self.calculate_popularity()
-        self.parse()
-
-    def calculate_popularity(self):
-        unparsed_file = open(self.raw_data_file, "r")
-
-        for line in unparsed_file:
-            data = json.loads(line)
-
-            self.total_number_of_likes += len(data["likes"])
-            if data["blog"] in self.likes_per_blog:
-                self.likes_per_blog[data["blog"]] += len(data["likes"])
-            else:
-                self.likes_per_blog[data["blog"]] = len(data["likes"])
-
-        unparsed_file.close()
-
-    def parse(self):
-        unparsed_file = open(self.raw_data_file, "r")
-
-        for line in unparsed_file:
-            data = json.loads(line)
-
-            parsed_data = {
-                "put": "id:blog-search:blog_post::" + data["post_id"],
-                "fields": {
-                    "blogname": data["blogname"],
-                    "post_id": data["post_id"],
-                    "author": data["author"],
-                    "language": data["language"],
-                    "categories": data["categories"],
-                    "title": data["title"],
-                    "blog": data["blog"],
-                    "date_gmt": data["date_gmt"],
-                    "url": data["url"],
-                    "content": data["content"],
-                    "tags": data["tags"],
-                    "date": int(data["date_gmt"][0:4] + data["date_gmt"][5:7] + data["date_gmt"][8:10])
-                }
-            }
-            if self.popularity:
-                parsed_data["fields"]["popularity"] = \
-                    float(self.likes_per_blog[data["blog"]]) / float(self.total_number_of_likes)
-
-            print(json.dumps(parsed_data))
-
-        unparsed_file.close()
-
-if __name__ == '__main__':
-    KaggleRawDataParser().main()
diff --git a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py b/sample-apps/blog-tutorial-shared/src/python/vespaModel.py
deleted file mode 100755
index 5d3bf1eceb7..00000000000
--- a/sample-apps/blog-tutorial-shared/src/python/vespaModel.py
+++ /dev/null
@@ -1,397 +0,0 @@
-# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-"""
-Train a 2 layers neural network to compute the probability of a user
-represented by the vector u liking a document represented by the vector d.
-
-Usage: ./vespaModel.py --product_features_file_path path \
-                       --user_features_file_path path \
-                       --dataset_file_path path
-
-Expected File formats:
-
-- product_features_file_path contains a file with rows following the JSON format below:
-
-{"post_id" : 20,
- "user_item_cf" : {"user_item_cf:5" : -0.66617566,
-                   "user_item_cf:6" : 0.29197264,
-                   "user_item_cf:1" : -0.15582734,
-                   "user_item_cf:7" : 0.3350679,
-                   "user_item_cf:2" : -0.16676047,
-                   "user_item_cf:9" : -0.31653953,
-                   "user_item_cf:3" : -0.21495385,
-                   "user_item_cf:4" : -0.036676258,
-                   "user_item_cf:8" : 0.122069225,
-                   "user_item_cf:0" : 0.20922394}}
-
-- user_features_file_path contains a file with rows following the JSON format below:
-
-{"user_id" : 270,
- "user_item_cf" : {"user_item_cf:5" : -0.54011273,
-                   "user_item_cf:6" : 0.2723072,
-                   "user_item_cf:1" : -0.23280832,
-                   "user_item_cf:7" : -0.011183357,
-                   "user_item_cf:2" : -0.3987285,
-                   "user_item_cf:9" : -0.05703937,
-                   "user_item_cf:3" : 0.04699418,
-                   "user_item_cf:4" : 0.06679048,
-                   "user_item_cf:8" : 0.31399783,
-                   "user_item_cf:0" : 0.5000366}}
-
-- dataset_file_path contains a file with rows containing tab-separated post_id, user_id, label such as the sample below:
-
-1000054	    118475	1
-10001560	666315	0
-10001560	1230226	0
-10001560	561306	1
-"""
-
-
-import tensorflow as tf
-import time
-import os
-import datetime
-import json
-import numpy as np
-
-class getData:
-    """
-    Data pre-processing
-    """
-    def __init__(self, product_features_file_path, user_features_file_path, data_set_file_path):
-        self.product_features_file_path = product_features_file_path
-        self.user_features_file_path = user_features_file_path
-        self.data_set_file_path = data_set_file_path
-
-    # Create user and document lookup features
-    def parse_cf_features(self, json, id_name):
-        id = json[id_name]
-        indexes = ['user_item_cf:' + str(x) for x in range(0,10,1)]
-        values = [json['user_item_cf'][x] for x in indexes]
-        return [id, values]
-
-    def get_product_features_lookup(self):
-        product_features = [self.parse_cf_features(json.loads(line), 'post_id') for line in open(self.product_features_file_path)]
-        return dict(product_features)
-
-    def get_user_features_lookup(self):
-        user_features = [self.parse_cf_features(json.loads(line), 'user_id') for line in open(self.user_features_file_path)]
-        return dict(user_features)
-
-    def parse_dataset(self, line, lookup_user_features, lookup_product_features):
-        info = line.strip("\n").split("\t")
-        user_id = float(info[0])
-        product_id = float(info[1])
-        label = int(info[2])
-        return lookup_user_features[user_id], lookup_product_features[product_id], [label]
-
-    def prepare_dataset(self):
-        lookup_product_features = self.get_product_features_lookup()
-        lookup_user_features = self.get_user_features_lookup()
-        with open(self.data_set_file_path) as f:
-            input_u = []; input_d = []; input_y = []
-            for line in f:
-                u, d, y = self.parse_dataset(line, lookup_user_features, lookup_product_features)
-                input_u.append(u)
-                input_d.append(d)
-                input_y.append(y)
-        input_u = np.array(input_u)
-        input_d = np.array(input_d)
-        input_y = np.array(input_y)
-        return input_u, input_d, input_y
-
-    def create_train_test_sets(self, input_u, input_d, input_y, seed = 10, perc = 0.2):
-        # Randomly shuffle data
-        np.random.seed(seed)
-        shuffle_indices = np.random.permutation(np.arange(len(input_u)))
-        input_u_shuffled = input_u[shuffle_indices]
-        input_d_shuffled = input_d[shuffle_indices]
-        input_y_shuffled = input_y[shuffle_indices]
-
-        # Split train/test set
-        dev_samples = int(len(input_u_shuffled)*perc)
-        u_train, u_dev = input_u_shuffled[:-dev_samples], input_u_shuffled[-dev_samples:]
-        d_train, d_dev = input_d_shuffled[:-dev_samples], input_d_shuffled[-dev_samples:]
-        y_train, y_dev = input_y_shuffled[:-dev_samples], input_y_shuffled[-dev_samples:]
-        print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
-
-        return u_train, u_dev, d_train, d_dev, y_train, y_dev
-
-    def batch_iter(self, data, batch_size, num_epochs, shuffle=True):
-        """
-        Generates a batch iterator for a dataset.
-        """
-        data = np.array(data)
-        data_size = len(data)
-        num_batches_per_epoch = int(len(data)/batch_size) + 1
-        for epoch in range(num_epochs):
-            # Shuffle the data at each epoch
-            if shuffle:
-                shuffle_indices = np.random.permutation(np.arange(data_size))
-                shuffled_data = data[shuffle_indices]
-            else:
-                shuffled_data = data
-            for batch_num in range(num_batches_per_epoch):
-                start_index = batch_num * batch_size
-                end_index = min((batch_num + 1) * batch_size, data_size)
-                yield shuffled_data[start_index:end_index]
-
-class vespaRunTimeModel:
-    """
-    Model that combine user and document features and needs to be evaluated at query time.
-    """
-    def __init__(self, user_feature_length, doc_feature_length, hidden_length):
-
-        # placeholders
-        self.input_u = tf.placeholder(tf.float32, [None, user_feature_length], name = 'input_u')
-        self.input_d = tf.placeholder(tf.float32, [None, doc_feature_length], name = 'input_d')
-        self.input_y = tf.placeholder(tf.float32, [None, 1], name = 'input_y')
-
-        # merge user and document vector
-        self.input_concat = tf.concat(1, [self.input_d, self.input_u], name = 'input_concat')
-
-        # hidden layer
-        self.W_hidden = tf.Variable(
-            tf.truncated_normal([user_feature_length +
-                doc_feature_length, hidden_length], stddev=0.1), name = 'W_hidden')
-        self.b_hidden = tf.Variable(tf.constant(0.1, shape=[hidden_length]), name = 'b_hidden')
-
-        self.hidden_layer = tf.nn.relu(tf.matmul(self.input_concat, self.W_hidden) + self.b_hidden,
-            name = 'hidden_layer')
-
-        # output layer
-        self.W_final = tf.Variable(
-                tf.random_uniform([hidden_length, 1], -0.1, 0.1),
-                name="W_final")
-        self.b_final = tf.Variable(tf.zeros([1]), name="b_final")
-
-        self.y = tf.sigmoid(tf.matmul(self.hidden_layer, self.W_final) + self.b_final, name = 'y')
-
-        # prediction based on model output
-        self.prediction = tf.cast(tf.greater_equal(self.y, 0.5), "float", name = 'prediction')
-
-        # loss function
-        prob = tf.clip_by_value(self.y,1e-5,1.0 - 1e-5)
-        self.loss = tf.reduce_mean(- self.input_y * tf.log(prob) - (1 - self.input_y) * tf.log(1 - prob), name = 'loss')
-
-        # accuracy
-        correct_predictions = tf.equal(self.prediction, self.input_y)
-        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
-
-    def train_operation(self, learning_rate):
-        global_step = tf.Variable(0, name="global_step", trainable=False)
-        #optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-        optimizer = tf.train.AdagradOptimizer(learning_rate)
-        train_op = optimizer.minimize(self.loss, global_step=global_step)
-        return train_op, global_step
-
-    def create_output_dir(self):
-        timestamp = str(int(time.time()))
-        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
-        print("Writing to {}\n".format(out_dir))
-        return out_dir
-
-    def summary_oprations(self):
-        loss_summary = tf.scalar_summary("loss", self.loss)
-        acc_summary = tf.scalar_summary("accuracy", self.accuracy)
-        train_summary_op = tf.merge_summary([loss_summary, acc_summary])
-        dev_summary_op = tf.merge_summary([loss_summary, acc_summary])
-        return train_summary_op, dev_summary_op
-
-    def train_step(self, u_batch, d_batch, y_batch, writer=None):
-        """
-        A single training step
-        """
-        feed_dict = {
-          self.input_u: u_batch,
-          self.input_d: d_batch,
-          self.input_y: y_batch
-        }
-        _, step, summaries, loss, accuracy = sess.run(
-            [train_op, global_step, train_summary_op, self.loss, self.accuracy],
-            feed_dict)
-        time_str = datetime.datetime.now().isoformat()
-        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
-        if writer:
-            writer.add_summary(summaries, step)
-
-    def dev_step(self, u_batch, d_batch, y_batch, writer=None):
-        """
-        Evaluates model on a dev set
-        """
-        feed_dict = {
-          self.input_u: u_batch,
-          self.input_d: d_batch,
-          self.input_y: y_batch
-        }
-        step, summaries, loss, accuracy = sess.run(
-            [global_step, dev_summary_op, self.loss, self.accuracy],
-            feed_dict)
-        time_str = datetime.datetime.now().isoformat()
-        print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
-        if writer:
-            writer.add_summary(summaries, step)
-
-class serializeVespaModel:
-    """
-    Serialize TensorFlow variables to Vespa JSON format
-
-    Example:
-        checkpoint_dir = "./runs/1473845959/checkpoints"
-        output_dir = "./runs/1473845959/vespa_variables"
-
-        serializer = serializeVespaModel(checkpoint_dir, output_dir)
-        serializer.serialize_to_disk(variable_name = "W_hidden", dimension_names = ['input', 'hidden'])
-        serializer.serialize_to_disk(variable_name = "b_hidden", dimension_names = ['hidden'])
-        serializer.serialize_to_disk(variable_name = "W_final", dimension_names = ['hidden', 'final'])
-        serializer.serialize_to_disk(variable_name = "b_final", dimension_names = ['final'])
-    """
-    def __init__(self, checkpoint_dir, output_dir):
-        self.checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
-        self.reader = tf.train.NewCheckpointReader(self.checkpoint_file)
-        self.output_dir = output_dir
-
-    def write_cell_value(self, variable, dimension_names, dimension_address = None):
-        if dimension_address is None:
-            dimension_address = []
-        shape = variable.shape
-        if len(shape) == 1:
-            count = 0
-            cells = []
-            for element in variable:
-                dimension_address.append((dimension_names[0], str(count)))
-                count += 1
-                cells.append({ 'address': dict(dimension_address), "value": float(element) })
-            return cells
-        else:
-            count = 0
-            output = []
-            for slice in variable:
-                dimension_address.append((dimension_names[0], str(count)))
-                output.extend(self.write_cell_value(slice, dimension_names[1:], dimension_address))
-                count += 1
-            return output
-
-    def write_to_vespa_json_format(self, variable_name, dimension_names):
-        variable = self.reader.get_tensor(variable_name)
-        cells = self.write_cell_value(variable, dimension_names)
-        return json.dumps({'cells': cells})
-
-    def serialize_to_disk(self, variable_name, dimension_names):
-        text_file = open(os.path.join(output_dir, variable_name + ".json"), "w")
-        text_file.write(serializer.write_to_vespa_json_format(variable_name, dimension_names))
-        text_file.close()
-
-
-def task_train():
-    # Data
-    tf.flags.DEFINE_string("product_features_file_path", '', "File containing product features")
-    tf.flags.DEFINE_string("user_features_file_path", '', "File containing user features")
-    tf.flags.DEFINE_string("dataset_file_path", '', "File containing labels for each document user pair")
-
-    tf.flags.DEFINE_integer("hidden_length_factor", 2, "The hidden layer has size 'hidden_length_factor * input_vector_length'")
-
-    # Misc Parameters
-    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
-    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
-
-    # Training parameters
-    tf.flags.DEFINE_float("learning_rate", 0.1, "Gradient Descent learning rate")
-
-    tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
-    tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
-    tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
-    tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
-
-    FLAGS = tf.flags.FLAGS
-    FLAGS._parse_flags()
-    print("\nParameters:")
-    for attr, value in sorted(FLAGS.__flags.items()):
-        print("{}={}".format(attr.upper(), value))
-    print("")
-
-    # Data preparation
-    data_pre_processing = getData(
-        FLAGS.product_features_file_path,
-        FLAGS.user_features_file_path,
-        FLAGS.dataset_file_path)
-
-    input_u, input_d, input_y = data_pre_processing.prepare_dataset()
-    u_train, u_dev, d_train, d_dev, y_train, y_dev = data_pre_processing.create_train_test_sets(input_u, input_d, input_y, seed = 10, perc = 0.2)
-
-    user_feature_length = input_u.shape[1]
-    doc_feature_length = input_d.shape[1]
-
-
-    # Create a graph
-    with tf.Graph().as_default():
-
-        # Create a session
-        session_conf = tf.ConfigProto(
-          allow_soft_placement=FLAGS.allow_soft_placement,
-          log_device_placement=FLAGS.log_device_placement)
-        sess = tf.Session(config=session_conf)
-        with sess.as_default():
-
-            # instanciate a model
-            vespa_model = vespaRunTimeModel(user_feature_length = user_feature_length,
-                doc_feature_length = doc_feature_length,
-                hidden_length = FLAGS.hidden_length_factor * (user_feature_length + doc_feature_length))
-
-            # create a train operation
-            train_op, global_step = vespa_model.train_operation(learning_rate = FLAGS.learning_rate)
-
-            # Summaries for loss and accuracy
-            train_summary_op, dev_summary_op = vespa_model.summary_oprations()
-
-            # Output directory for models and summaries
-            out_dir = vespa_model.create_output_dir()
-
-            # Write train summaries to disk
-            train_summary_dir = os.path.join(out_dir, "summaries", "train")
-            train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)
-
-            # Dev summaries
-            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
-            dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)
-
-            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
-            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
-            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
-            if not os.path.exists(checkpoint_dir):
-                os.makedirs(checkpoint_dir)
-            saver = tf.train.Saver(tf.all_variables())
-
-            # Initialize all variables
-            sess.run(tf.initialize_all_variables())
-
-        # Generate batches
-        batches = data_pre_processing.batch_iter(
-            list(zip(u_train, d_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
-        # Training loop. For each batch...
-        for batch in batches:
-            u_batch, d_batch, y_batch = zip(*batch)
-            vespa_model.train_step(u_batch, d_batch, y_batch, writer=train_summary_writer)
-            current_step = tf.train.global_step(sess, global_step)
-            if current_step % FLAGS.evaluate_every == 0:
-                print("\nEvaluation:")
-                vespa_model.dev_step(u_dev, d_dev, y_dev, writer=dev_summary_writer)
-                print("")
-            if current_step % FLAGS.checkpoint_every == 0:
-                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
-                print("Saved model checkpoint to {}\n".format(path))
-
-if __name__ == "__main__":
-
-    # Task
-    tf.flags.DEFINE_string("task", 'train', "Train a model from scratch")
-
-    FLAGS = tf.flags.FLAGS
-    FLAGS._parse_flags()
-    print("\nParameters:")
-    for attr, value in sorted(FLAGS.__flags.items()):
-        print("{}={}".format(attr.upper(), value))
-    print("")
-
-    if FLAGS.task == "train":
-        task_train()
diff --git a/sample-apps/blog-tutorial-shared/src/test/__init__.py b/sample-apps/blog-tutorial-shared/src/test/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/__init__.py
+++ /dev/null
diff --git a/sample-apps/blog-tutorial-shared/src/test/python/__init__.py b/sample-apps/blog-tutorial-shared/src/test/python/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/python/__init__.py
+++ /dev/null
diff --git a/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py b/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py
deleted file mode 100644
index 1a75994c740..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/python/parse-unittest.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-import os
-import sys
-import unittest
-import json
-from StringIO import StringIO
-
-import src.main.python.parse as parse
-
-class KaggleRawDataParserTest(unittest.TestCase):
-
-    raw_test_file = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + "/resources/trainPostsSampleWith3Elements.json"
-    saved_stdout = sys.stdout
-    out = StringIO()
-
-
-    def setUp(self):
-        sys.argv.append(self.raw_test_file)
-
-        self.out = StringIO()
-        sys.stdout = self.out
-
-    def tearDown(self):
-        sys.argv = [sys.argv[0]]
-
-        sys.stdout = self.saved_stdout
-
-
-    def test_no_flags(self):
-        parser = parse.KaggleRawDataParser()
-
-        self.assertFalse(parser.popularity)
-        self.assertEqual(parser.raw_data_file, self.raw_test_file)
-
-    def test_popularity_flag(self):
-        sys.argv.append("-p")
-        parser = parse.KaggleRawDataParser()
-
-        self.assertTrue(parser.popularity)
-
-    def test_parsing_without_popularity(self):
-        parser = parse.KaggleRawDataParser()
-
-        parser.parse()
-
-        output_array = self.out.getvalue().strip().split('\n')
-        compare_with = [{
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 03:36:57",
-                "language": "en",
-                "post_id": "507823",
-                "tags": [],
-                "title": "#vipworkshop dinner",
-                "url": "http://matt.wordpress.com/?p=3837"
-            },
-            "put": "id:blog-search:blog_post::507823"
-        },
-        {
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 04:41:37",
-                "language": "en",
-                "post_id": "1406963",
-                "tags": [],
-                "title": "Oven roasted tomatoes",
-                "url": "http://matt.wordpress.com/?p=3839"
-            },
-            "put": "id:blog-search:blog_post::1406963"
-        },
-        {
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 19:59:45",
-                "language": "en",
-                "post_id": "1329369",
-                "tags": [],
-                "title": "Fish tacos and spicy slaw",
-                "url": "http://matt.wordpress.com/?p=3841"
-            },
-            "put": "id:blog-search:blog_post::1329369"
-        }]
-
-        for i in range(0, 3):
-            self.assertEqual(json.loads(output_array[i]), compare_with[i])
-
-    def test_parsing_with_popularity(self):
-        sys.argv.append("-p")
-        parser = parse.KaggleRawDataParser()
-
-        parser.main()
-
-        output_array = self.out.getvalue().strip().split('\n')
-        compare_with = [{
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 03:36:57",
-                "language": "en",
-                "popularity": 1.0,
-                "post_id": "507823",
-                "tags": [],
-                "title": "#vipworkshop dinner",
-                "url": "http://matt.wordpress.com/?p=3837"
-            },
-            "put": "id:blog-search:blog_post::507823"
-        },
-        {
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 04:41:37",
-                "language": "en",
-                "popularity": 1.0,
-                "post_id": "1406963",
-                "tags": [],
-                "title": "Oven roasted tomatoes",
-                "url": "http://matt.wordpress.com/?p=3839"
-            },
-            "put": "id:blog-search:blog_post::1406963"
-        },
-        {
-            "fields": {
-                "author": "5",
-                "blog": "4",
-                "blogname": "Matt on Not-WordPress",
-                "categories": [
-                    "Moblog"
-                ],
-                "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>",
-                "date": 20120328,
-                "date_gmt": "2012-03-28 19:59:45",
-                "language": "en",
-                "popularity": 1.0,
-                "post_id": "1329369",
-                "tags": [],
-                "title": "Fish tacos and spicy slaw",
-                "url": "http://matt.wordpress.com/?p=3841"
-            },
-            "put": "id:blog-search:blog_post::1329369"
-        }]
-
-        for i in range(0, 3):
-            self.assertEqual(json.loads(output_array[i]), compare_with[i])
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json b/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json
deleted file mode 100644
index c7b7e32f396..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSample.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{"date_gmt":"2012-03-28 03:36:57", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3837", "title": "#vipworkshop dinner", "blog": "4", "post_id": "507823", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 03:36:57", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:38", "uid": "6218184"}, {"dt": "2012-03-28 10:25:22", "uid": "11335199"}]}
-{"date_gmt":"2012-03-28 04:41:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3839", "title": "Oven roasted tomatoes", "blog": "4", "post_id": "1406963", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 04:41:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:03", "uid": "6218184"}, {"dt": "2012-03-28 05:01:34", "uid": "26248885"}]}
-{"date_gmt":"2012-03-28 19:59:45", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3841", "title": "Fish tacos and spicy slaw", "blog": "4", "post_id": "1329369", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 19:59:45", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-28 21:05:37", "uid": "31367867"}]}
-{"date_gmt":"2012-03-31 00:58:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3845", "title": "White corn guacamole", "blog": "4", "post_id": "916703", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-31 00:58:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo22.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo22.jpg\" alt=\"\" title=\"photo22\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3846\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:42:15", "uid": "6218184"}, {"dt": "2012-03-31 03:32:06", "uid": "33301824"}]}
-{"date_gmt":"2012-03-31 18:46:59", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3847", "title": "#WordPress cake and wine", "blog": "4", "post_id": "1829542", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-31 18:46:59", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo23.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo23.jpg\" alt=\"\" title=\"photo23\" width=\"640\" height=\"480\" class=\"alignnone size-full wp-image-3848\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-04 14:22:40", "uid": "21552"}, {"dt": "2012-04-01 07:51:59", "uid": "50392"}, {"dt": "2012-04-12 16:58:42", "uid": "53742"}, {"dt": "2012-04-05 13:37:51", "uid": "396702"}, {"dt": "2012-04-02 22:30:22", "uid": "414033"}, {"dt": "2012-04-01 08:17:25", "uid": "872435"}, {"dt": "2012-03-31 18:50:18", "uid": "1156143"}, {"dt": "2012-04-01 07:25:27", "uid": "1246555"}, {"dt": "2012-03-31 19:26:40", "uid": "2177886"}, {"dt": "2012-03-31 19:13:17", "uid": "3346825"}, {"dt": "2012-04-01 21:50:49", "uid": "5073742"}, {"dt": "2012-03-31 19:46:11", "uid": "6134205"}, {"dt": "2012-03-31 19:07:17", "uid": "6433901"}, {"dt": "2012-04-04 01:01:08", "uid": "6894686"}, {"dt": "2012-03-31 18:53:19", "uid": "7073116"}, {"dt": "2012-03-31 18:51:52", "uid": "8288845"}, {"dt": "2012-04-23 10:17:31", "uid": "12788480"}, {"dt": "2012-04-05 15:50:45", "uid": "28688316"}, {"dt": "2012-04-01 19:46:36", "uid": "29630467"}, {"dt": "2012-03-31 18:48:49", "uid": "33013507"}]}
-{"date_gmt":"2012-04-01 03:48:07", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3849", "title": "Oysters!", "blog": "4", "post_id": "1197076", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 03:48:07", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo.jpg\" alt=\"\" title=\"photo\" width=\"640\" height=\"480\" class=\"alignnone size-full wp-image-3850\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-01 03:59:19", "uid": "5360368"}, {"dt": "2012-04-01 03:52:29", "uid": "8689260"}, {"dt": "2012-04-10 06:08:23", "uid": "26404032"}]}
-{"date_gmt":"2012-04-01 19:05:20", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3851", "title": "Crab and artichoke pizza", "blog": "4", "post_id": "377833", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 19:05:20", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo1.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo1.jpg\" alt=\"\" title=\"photo1\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3852\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-13 19:29:25", "uid": "6218184"}]}
-{"date_gmt":"2012-04-01 19:48:38", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3853", "title": "Framboise float and brownie", "blog": "4", "post_id": "871687", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 19:48:38", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo2.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo2.jpg\" alt=\"\" title=\"photo2\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3854\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-13 19:28:36", "uid": "6218184"}]}
-{"date_gmt":"2012-04-01 21:09:09", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3855", "title": "Rothko #10", "blog": "4", "post_id": "1893680", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-01 21:09:09", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo3.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo3.jpg\" alt=\"\" title=\"photo3\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3856\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-01 21:41:34", "uid": "2096880"}, {"dt": "2012-04-02 03:50:33", "uid": "4634349"}, {"dt": "2012-04-01 21:10:13", "uid": "6766437"}, {"dt": "2012-04-01 21:35:48", "uid": "11335199"}, {"dt": "2012-04-01 21:18:40", "uid": "11691159"}]}
-{"date_gmt":"2012-04-03 13:22:50", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3857", "title": "Port lights", "blog": "4", "post_id": "891295", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-04-03 13:22:50", "content": "<a href=\"http://matt.files.wordpress.com/2012/04/photo4.jpg\"><img src=\"http://matt.files.wordpress.com/2012/04/photo4.jpg\" alt=\"\" title=\"photo4\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3858\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-04-04 05:50:43", "uid": "4204944"}, {"dt": "2012-04-03 15:23:45", "uid": "6134205"}, {"dt": "2012-04-03 13:27:01", "uid": "8399160"}, {"dt": "2012-04-03 13:36:09", "uid": "11335199"}, {"dt": "2012-04-03 13:24:17", "uid": "33301824"}]}
diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json b/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json
deleted file mode 100644
index c144300fe37..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/resources/trainPostsSampleWith3Elements.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{"date_gmt":"2012-03-28 03:36:57", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3837", "title": "#vipworkshop dinner", "blog": "4", "post_id": "507823", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 03:36:57", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo19.jpg\" alt=\"\" title=\"photo19\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3838\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:38", "uid": "6218184"}, {"dt": "2012-03-28 10:25:22", "uid": "11335199"}]}
-{"date_gmt":"2012-03-28 04:41:37", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3839", "title": "Oven roasted tomatoes", "blog": "4", "post_id": "1406963", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 04:41:37", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo20.jpg\" alt=\"\" title=\"photo20\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3840\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-31 11:43:03", "uid": "6218184"}, {"dt": "2012-03-28 05:01:34", "uid": "26248885"}]}
-{"date_gmt":"2012-03-28 19:59:45", "language": "en", "author": "5", "url": "http://matt.wordpress.com/?p=3841", "title": "Fish tacos and spicy slaw", "blog": "4", "post_id": "1329369", "tags": [], "blogname": "Matt on Not-WordPress", "date": "2012-03-28 19:59:45", "content": "<a href=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\"><img src=\"http://matt.files.wordpress.com/2012/03/photo21.jpg\" alt=\"\" title=\"photo21\" width=\"1000\" height=\"750\" class=\"alignnone size-full wp-image-3842\" /></a>", "categories": ["Moblog"], "likes": [{"dt": "2012-03-28 21:05:37", "uid": "31367867"}]}
diff --git a/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt b/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt
deleted file mode 100644
index 7312ed2fdca..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/resources/trainingSetIndicesSample.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-1000136	30856199
-1000631	19883445
-1000631	24350500
-1000631	25936432
-1000631	29881381
-1000776	30532923
-1001135	20234756
-1001135	31697777
-100156	12076558
-100156	12159263
-100156	17277682
-100156	964144
-1002462	1878322
-1002515	16433161
-1002515	21922168
-1002515	23039563
-1002515	2493815
-1002599	23733565
-1002599	29430402
-1002599	30449719
diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala
deleted file mode 100644
index bd73c088e3e..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/BlogRecommendationAppTest.scala
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.scalatest.FunSuite
-
-class BlogRecommendationAppTest extends FunSuite {
-
-  test("CollaborativeFilteringApp writes user and item latent factors to output path") (pending)
-
-}
diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala
deleted file mode 100644
index 07df1ebf622..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/CollaborativeFilteringTest.scala
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.apache.spark.ml.recommendation.ALSModel
-import org.apache.spark.sql.SparkSession
-import org.scalatest.Matchers._
-import org.scalatest._
-
-class CollaborativeFilteringTest extends FunSuite with BeforeAndAfter {
-
-  var ss: SparkSession = _
-
-  before {
-
-    ss = SparkSession
-      .builder()
-      .appName("Unit Test")
-      .master("local[*]")
-      .getOrCreate()
-
-  }
-
-  after {
-    ss.stop()
-  }
-
-  test("run method returns a MatrixFactorizationModel with latent factors of size 10 to user and item") {
-
-    val file_path = getClass.getResource("/trainingSetIndicesSample.txt")
-
-    val cf = new CollaborativeFiltering(ss)
-
-    val model = cf.run(
-      input_path = file_path.toString,
-      rank = 10,
-      numIterations = 10,
-      lambda = 0.01)
-
-    model shouldBe a [ALSModel]
-
-    val product_feature_array = model.itemFactors.first().getSeq(1)
-    assertResult(10){product_feature_array.length}
-
-    val user_feature_array = model.userFactors.first().getSeq(1)
-    assertResult(10){user_feature_array.length}
-
-  }
-
-  test("run_pipeline method returns a MatrixFactorizationModel with latent factors of size 10 to user and item") {
-
-    val file_path = getClass.getResource("/trainingSetIndicesSample.txt")
-
-    val cf = new CollaborativeFiltering(ss)
-
-    val model = cf.run_pipeline(input_path = file_path.toString, numIterations = 10)
-
-    model shouldBe a [ALSModel]
-
-    val product_feature_array = model.itemFactors.first().getSeq(1)
-    assertResult(10){product_feature_array.length}
-
-    val user_feature_array = model.userFactors.first().getSeq(1)
-    assertResult(10){user_feature_array.length}
-
-  }
-
-}
diff --git a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala b/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala
deleted file mode 100644
index 395cc99f8c5..00000000000
--- a/sample-apps/blog-tutorial-shared/src/test/scala/com/yahoo/example/blog/SplitFullSetIntoTrainAndTestSetsTest.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.example.blog
-
-import org.apache.spark.sql.{SparkSession, DataFrame}
-import org.scalatest.Matchers._
-import org.scalatest._
-
-class SplitFullSetIntoTrainAndTestSetsTest extends FunSuite with BeforeAndAfter {
-
-  var ss: SparkSession = _
-
-  before {
-
-    ss = SparkSession
-      .builder()
-      .appName("Unit Test")
-      .master("local[*]")
-      .getOrCreate()
-
-  }
-
-  after {
-    ss.stop()
-  }
-
-  test("SplitFullSetIntoTrainAndTestSets should return an Array of DataFrame") {
-
-    val file_path = getClass.getResource("/trainPostsSample.json")
-
-    val splitter = new SplitFullSetIntoTrainAndTestSets(ss)
-
-    val sets = splitter.run(input_file_path = file_path.toString,
-      test_perc_stage1 = 0.05,
-      test_perc_stage2 = 0.15,
-      seed = 123)
-
-    sets shouldBe a [Array[DataFrame]]
-
-  }
-
-}
-
author	Kristian Aune <kraune@yahoo-inc.com>	2017-08-18 09:14:46 +0200
committer	Kristian Aune <kraune@yahoo-inc.com>	2017-08-18 09:14:46 +0200
commit	e36ce286e3cc2026fe0ccbbf0e116477b23c70e1 (patch)
tree	3d3dadc8869629d7ebc7d019a400fc641a072c4c /sample-apps/blog-tutorial-shared/src
parent	a618787ba847eed7c30f6bd81256db8d39e5e2c4 (diff)