// sample-apps/blog-tutorial-shared/src/main/scala/com/yahoo/example/blog/CollaborativeFiltering.scala
package com.yahoo.example.blog

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions._

import scala.collection.mutable
import scala.util.parsing.json.JSONObject

/**
 * Collaborative filtering for the blog recommendation tutorial: trains an
 * implicit-feedback ALS model on (post_id, user_id) "like" pairs and exports
 * the learned latent factors as Vespa tensors.
 */
class CollaborativeFiltering(val ss: SparkSession) {

  import ss.implicits._

  /**
   * Loads tab-separated "post_id<TAB>user_id" pairs, labels each pair with an
   * implicit rating of 1, and drops rows with a "null" id.
   */
  def loadTrainingIndicesIntoDataFrame(input_path: String): DataFrame = {

    val ratings = ss.sparkContext.textFile(input_path)
      .map(_.split("\t"))
      .map(p => (p(0), p(1), 1))
      .toDF("post_id", "user_id", "label")
      .filter(col("post_id").notEqual("null"))
      .filter(col("user_id").notEqual("null"))
      .select(col("post_id").cast(IntegerType).as("post_id"),
              col("user_id").cast(IntegerType).as("user_id"),
              col("label").cast(IntegerType).as("label"))

    ratings

  }

  /**
   * Loads blog posts from JSON, flattens the "likes" array, and labels each
   * resulting (post_id, user_id) pair with an implicit rating of 1.
   */
  def loadDataIntoDataFrame(input_path: String): DataFrame = {

    val dataset = ss.read.json(input_path)

    val setOne = udf(() => 1)

    val ratings = dataset.select(col("post_id").cast(IntegerType).as("post_id"),
                                 explode(col("likes")).as("likes_flat"))
      .select(col("post_id"), col("likes_flat.uid").cast(IntegerType).as("user_id"))
      .withColumn("label", setOne())

    ratings

  }

  /**
   * Same as loadDataIntoDataFrame, but returns the older mllib RDD[Rating]
   * representation instead of a DataFrame.
   */
  def loadDataIntoRating(input_path: String): RDD[Rating] = {

    val dataset: DataFrame = ss.read.json(input_path)

    val ratings = dataset.select(col("post_id"), explode(col("likes")).as("likes_flat"))
      .select(col("post_id"), col("likes_flat.uid").as("user_id"))
      .rdd.map {
        case Row(post_id: String, user_id: String) =>
          Rating(user_id.toInt, post_id.toInt, 1)
      }

    ratings

  }

  /** Trains an implicit-feedback ALS model with a fixed set of hyperparameters. */
  def run(input_path: String, rank: Int, numIterations: Int, lambda: Double): ALSModel = {

    // Loading and preparing the data
    val ratings = loadTrainingIndicesIntoDataFrame(input_path)

    // Fitting the model
    val model = new ALS()
      .setItemCol("post_id")
      .setRatingCol("label")
      .setUserCol("user_id")
      .setImplicitPrefs(true)
      .setRegParam(lambda)  // lambda is the regularization parameter; setAlpha would instead set the implicit-feedback confidence
      .setMaxIter(numIterations)
      .setRank(rank)
      .fit(ratings)

    model

  }

  /** Selects ALS hyperparameters (rank, alpha) by cross-validated grid search. */
  def run_pipeline(input_path: String, numIterations: Int): ALSModel = {

    // Loading and preparing the data
    val ratings = loadTrainingIndicesIntoDataFrame(input_path)

    // Configure the ALS estimator. Alpha only has an effect for implicit
    // feedback, so implicit preferences must be enabled for the grid below.
    val collaborative_filtering = new ALS()
      .setItemCol("post_id")
      .setRatingCol("label")
      .setUserCol("user_id")
      .setImplicitPrefs(true)
      .setMaxIter(numIterations)

    val paramGrid = new ParamGridBuilder()
      .addGrid(collaborative_filtering.rank, Array(10, 50, 100))
      .addGrid(collaborative_filtering.alpha, Array(0.001, 0.01, 0.1))
      .build()

    val cv = new CrossValidator()
      .setEstimator(collaborative_filtering)
      .setEvaluator(new RegressionEvaluator)  // RMSE by default, on the "label"/"prediction" columns
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(2)  // Use 3+ in practice

    // Run cross-validation, and choose the best set of parameters.
    val cvModel = cv.fit(ratings)

    cvModel.bestModel.asInstanceOf[ALSModel]

  }

}

/** Helpers for serializing ALS model factors to Vespa tensor JSON. */
object CollaborativeFiltering {

  /**
   * Converts one factor row (id, latent vector) into a JSON object on the
   * format Vespa expects for tensor fields, one cell per latent dimension.
   */
  def writeModelFeaturesAsTensor[T](modelFeatures: (Int, mutable.WrappedArray[T]), id_string: String): JSONObject = {

    val (id, latentVector) = modelFeatures
    var latentVectorMap: Map[String, T] = Map()
    var output: Map[String, Any] = Map()

    // One tensor cell per latent dimension: "user_item_cf:<i>" -> factor value
    for (i <- latentVector.indices) {
      latentVectorMap += (("user_item_cf:" + i.toString, latentVector(i)))
    }

    output += ((id_string, id))
    output += (("user_item_cf", JSONObject(latentVectorMap)))

    JSONObject(output)

  }
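
  // Illustrative output for a rank-2 model (the values are made up):
  // {"post_id" : 1, "user_item_cf" : {"user_item_cf:0" : 0.013, "user_item_cf:1" : -0.284}}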

  /** Dumps item and user factors as Vespa tensor JSON, one object per line. */
  def writeFeaturesAsVespaTensorText(model: ALSModel, output_path: String): Unit = {

    // ALS stores factor vectors as floats. The element type is erased at
    // runtime, so the patterns below match any WrappedArray; Float reflects
    // the actual contents of itemFactors/userFactors.
    model
      .itemFactors.rdd
      .map {
        case Row(id: Int, features: mutable.WrappedArray[Float]) => writeModelFeaturesAsTensor((id, features), "post_id")
      }
      .saveAsTextFile(output_path + "/product_features")
    model
      .userFactors.rdd
      .map {
        case Row(id: Int, features: mutable.WrappedArray[Float]) => writeModelFeaturesAsTensor((id, features), "user_id")
      }
      .saveAsTextFile(output_path + "/user_features")

  }

}
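
// A minimal usage sketch (appName, master, and paths are assumptions, not part
// of the tutorial):
//
//   val ss = SparkSession.builder().appName("blog-cf").master("local[*]").getOrCreate()
//   val cf = new CollaborativeFiltering(ss)
//   val model = cf.run("training_set_ids", rank = 10, numIterations = 10, lambda = 0.01)
//   CollaborativeFiltering.writeFeaturesAsVespaTensorText(model, "cf_output")
//   ss.stop()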