Packages

The following R libraries are used in this project

library(dplyr)
library(plotly)
library(knitr)
library(vegan)
library(Rtsne)
library(recommenderlab)

Exploratory Data Analysis

movies <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.item', sep='|', header = F)
colnames(movies) <- c("movie_id" , "movie_title" , "release_date" , "video_release_date" , "IMDb_URL" ,
                      "unknown" , "Action" , "Adventure" , "Animation" , "Children's" ,
                      "Comedy" , "Crime" , "Documentary" , "Drama" , "Fantasy" ,
                      "Film-Noir" , "Horror" , "Musical" , "Mystery" , "Romance" ,
                      "Sci-Fi" , "Thriller" , "War" , "Western")
users <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.user', sep='|', header = F)
colnames(users) <- c("user_id" , "age" , "gender" , "occupation" , "zip_code")
d <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.data', sep='\t', header = F)
colnames(d) <- c("user_id" , "item_id" , "rating" , "timestamp")

Movie

type <- colnames(movies)[6:24]
n <- movies[,6:24] %>% colSums()
m <- cbind(type, n) %>% as.data.frame()
m$n <- as.numeric(m$n)
m <- m[order(m$n, decreasing = T),]
m$type <- factor(m$type, levels = unique(m$type)[order(m$n, decreasing = TRUE)])
fig1 <- plot_ly(m, x = ~type, y = ~n, type = 'bar',
                marker = list(color = c('rgba(222,45,38,0.8)',
                                       rep('rgba(204,204,204,1)', 18))))
fig1 <- fig1 %>% layout(title = "What are common genere of movies ?",
                        xaxis = list(title = ""),
                        yaxis = list(title = ""))
fig1

mg <- rowSums(movies[,6:24]) %>% table() %>% as.data.frame()
colnames(mg) <- c("Genre", "n")
mg$Genre <- factor(mg$Genre, levels = mg$Genre[order(mg$n, decreasing = TRUE)])
fig2 <- plot_ly(mg, x = ~Genre, y = ~n, type = 'bar',
                marker = list(color = c(rep('rgba(222,45,38,0.8)', 2),
                                        rep('rgba(204,204,204,1)', 4))))
fig2 <- fig2 %>% layout(title = "<b> About 80% of movies have two or fewer genre <b>",
                        xaxis = list(title = "Genre"),
                        yaxis = list(title = ""))
fig2

m <- d$item_id %>% table() %>% as.data.frame()
year <- movies$release_date %>% substr(., 8, 11)
month <- movies$release_date %>% substr(., 4, 6)
day <- movies$release_date %>% substr(., 1, 2)
m <- data.frame(title = movies$movie_title, 
                year = year,
                month = month,
                day = day,
                n = m[,-1])
m <- m[order(m$n, decreasing = TRUE),]
m$day[which(m$year==971)] <- 04
m$month[which(m$year==971)] <- "Feb"
m$year[which(m$year==971)] <- 1971
ym <- m$year %>% table() %>% as.data.frame()
ym <- ym[-1,]
colnames(ym) <- c("year", "n")
ym <- ym[order(ym$year, decreasing = F),]
ym$type <- factor(ym$year, levels = unique(ym$year)[order(ym$n, decreasing = TRUE)])
fig3 <- plot_ly(ym, x = ~year, y = ~n, type = 'bar',
                marker = list(color = c(rep('rgba(204,204,204,1)', 65),
                                        'rgba(222,45,38,0.8)',
                                        rep('rgba(204,204,204,1)', 5))))
fig3 <- fig3 %>% layout(title = "<b> People mostly watched movies released in 90s <b>",
                        xaxis = list(title = ""),
                        yaxis = list(title = ""))
fig3

mm <- m$month %>% table %>% as.data.frame()
mm <- mm[-1,]
colnames(mm) <- c("month", "n")
mm$n[which(mm$month=="Jan")] <- mm$n[which(mm$month=="Jan")] - which(m$month=="Jan" & m$day=="01") %>% length()
mm$month <- mm$month %>% as.vector()
mm[13,] <- c("Jan_01", which(m$month=="Jan" & m$day=="01") %>% length())
mm$month <- mm$month %>% factor(., levels = c('Jan_01', 'Jan', 'Feb', 'Mar', 'Apr', 
                                              'May', 'Jun', 'Jul', 'Aug', 'Sep', 
                                              'Oct', 'Nov', 'Dec'))
mm$n <- as.numeric(mm$n)

fig4 <- plot_ly(mm, x = ~month, y = ~n, type = 'bar',
                marker = list(color = c(rep('rgba(204,204,204,1)', 12),
                                        'rgba(222,45,38,0.8)')))
fig4 <- fig4 %>% layout(title = "<b> Jan 1st is the default release month and date ! <b>",
                        xaxis = list(title = ""),
                        yaxis = list(title = ""))
fig4

User

u <- table(users$occupation) %>% as.data.frame()
colnames(u) <- c("year", "n")
u <- u[order(u$n, decreasing = T),]
u$year <- factor(u$year, levels = unique(u$year)[order(u$n, decreasing = TRUE)])
fig5 <- plot_ly(u, x = ~year, y = ~n, year = 'bar',
                marker = list(color = c('rgba(222,45,38,0.8)',
                                        rep('rgba(204,204,204,1)', 20))))
fig5 <- fig5 %>% layout(title = "<b> Which kind of occupant watches more movies ? <b>",
                        xaxis = list(title = ""),
                        yaxis = list(title = ""))
fig5

fig6 <- plot_ly(y = users$age[which(users$occupation=="student")], 
               type = "box", name="Student")
fig6 <- fig6 %>% add_trace(y = ~users$age[which(users$occupation=="retired")], name="Retired")
fig6 <- fig6 %>% add_trace(y = ~users$age[-c(which(users$occupation=="student"),
                                           which(users$occupation=="retired"))], name="Others")
fig6 <- fig6 %>% layout(title = "<b> Age Distribution in Different Occupations <b>")
fig6

Rating

rd <- d %>% as(., "realRatingMatrix")
rt <- as.vector(rd@data)
rt <- rt[rt != 0]
fig7 <- plot_ly(x = ~rt, type = "histogram", alpha = 0.6) %>% 
  layout(title = "<b> Histogram of Ratings <b>",
         xaxis = list(title = "Rating"))

fig7

rdn <- normalize(rd)
rtn <- as.vector(rdn@data)
rtn <- rtn[rtn != 0]
fig8 <- plot_ly(x = ~rtn, type = "histogram", alpha = 0.6, nbinsx = 40) %>% 
  layout(title = "<b> Histogram of Normalized Ratings <b>",
         xaxis = list(title = "Rating"))

fig8

Preparation

Zero-One Matrix

kable(head(movies[,c(6:14)]), "simple")
unknown Action Adventure Animation Children’s Comedy Crime Documentary Drama
0 0 0 1 1 1 0 0 0
0 1 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 1 0 0 0 1 0 0 1
0 0 0 0 0 0 1 0 1
0 0 0 0 0 0 0 0 1

Jaccard

Similarity of two sets \(U\) and \(V\).

\[ Jaccard(U,V) = \frac{|U \cap V|}{|U \cup V|} \]


Distance Matrix

mjd <- movies[6:24] %>% t() %>% vegdist(., method = "jaccard") %>% as.matrix()
kable(head(mjd[,1:6]), "simple")
unknown Action Adventure Animation Children’s Comedy
unknown 0 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
Action 1 0.0000000 0.7588424 0.9896552 0.9752747 0.9558011
Adventure 1 0.7588424 0.0000000 0.9649123 0.8046512 0.9710611
Animation 1 0.9896552 0.9649123 0.0000000 0.7480916 0.9813780
Children’s 1 0.9752747 0.8046512 0.7480916 0.0000000 0.9300341
Comedy 1 0.9558011 0.9710611 0.9813780 0.9300341 0.0000000

Dimension Reduction

mjd_pca <- prcomp(mjd)
mjd_pc <- as.data.frame(mjd_pca$rotation)
mjd_pc$feature <- row.names(mjd_pc)
vars <- (mjd_pca$sdev)^2
props <- vars / sum(vars)
cumulative.props <- cumsum(props)
plot_ly(x=mjd_pc$PC1, y=mjd_pc$PC2, z=mjd_pc$PC3, 
        type="scatter3d", mode="markers", color=mjd_pc$feature) %>% 
  layout(showlegend = TRUE,  title="<b> Movie Genre PCA <b>",
         scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         legend = list(itemsizing='constant', font = list(size = 12)))

set.seed(1221)
mjd_tsne <- Rtsne(mjd, dims = 3, perplexity = 5)
mjd_y <- as.data.frame(mjd_tsne$Y)
mjd_y$feature <- row.names(mjd_pc)
plot_ly(x=mjd_y[,1], y=mjd_y[,2], z=mjd_y[,3], 
        type="scatter3d", mode="markers", color=mjd_pc$feature) %>% 
  layout(showlegend = TRUE, title="<b> Movie Genre t-SNE <b>",
         legend = list(itemsizing='constant', font = list(size = 12)))

Recommender System

Minimum Thresholds

we restrict the model training to those users who have rated at least 50 movies, and those movies that have been rated by at least 50 users.

Before

rd
## 943 x 1682 rating matrix of class 'realRatingMatrix' with 100000 ratings.

After

srd <- rd[rowCounts(rd) >= 50, colCounts(rd) >= 50]
srd
## 568 x 603 rating matrix of class 'realRatingMatrix' with 73544 ratings.

K-fold Cross-Validation

percent <- 0.8      
items <- -5               
rating <- 4     
k <- 5                    

sett <- evaluationScheme(data = srd, 
                              method = "cross-validation",
                              train = percent, 
                              given = items,
                              goodRating = rating, 
                              k = k)
sett
## Evaluation scheme using all-but-5 items
## Method: 'cross-validation' with 5 run(s).
## Good ratings: >=4.000000
## Data set: 568 x 603 rating matrix of class 'realRatingMatrix' with 73544 ratings.

User Base CF

recommend <- Recommender(data = getData(sett, "train"),
                         method = "UBCF", parameter = NULL)
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
                              data = getData(sett, "unknown"),
                              byUser = TRUE)
colMeans(acc, na.rm = T)
##      RMSE       MSE       MAE 
## 0.9881104 1.1440621 0.8343489
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                n = 10,
                type = "topNList")
head(rownames(acc))
## [1] "6"  "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
##  [1] 587  92 339 226 104 511 537 534 524 599
## 
## $`1`
##  [1] 136 172 173 191 349 486  18 106  35 164
## 
## $`2`
##  [1] 405 375 451  48 438 161 184  37 374 442
## 
## $`3`
##  [1] 226  47  48 412 162 468 478 526 111 377
## 
## $`4`
##  [1] 295  36 305 420 463 402 162  69 104 109
## 
## $`5`
##  [1] 199 338  49 466 256 262 173 501 271 245

Item Base CF

recommend <- Recommender(data = getData(sett, "train"),
                         method = "IBCF", parameter = list(method = "cosine"))
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                n = 10,
                type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
                              data = getData(sett, "unknown"),
                              byUser = TRUE)
colMeans(acc, na.rm = T)
##      RMSE       MSE       MAE 
## 0.9157892 0.9753539 0.7571266
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                n = 10,
                type = "topNList")
head(rownames(acc))
## [1] "6"  "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
##  [1] 131 514 560 578 598 524 566 239 511 287
## 
## $`1`
##  [1] 149 332 363 460 489   1 126 441 117 177
## 
## $`2`
##  [1]  17  24  55  60  78 135 138 149 198 206
## 
## $`3`
##  [1] 155 328 592  68 111  76 377 412 177 172
## 
## $`4`
##  [1]  36  71 108 293 375 388 400 569 573 118
## 
## $`5`
##  [1] 139 212 311 440  76  49  20 322 396 418

FunkSVD

recommend <- Recommender(data = getData(sett, "train"),
                         method = "SVD", parameter = list(k = 10))
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                n = 10,
                type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
                              data = getData(sett, "unknown"),
                              byUser = TRUE)
colMeans(acc, na.rm = T)
##      RMSE       MSE       MAE 
## 0.8387843 0.7974579 0.7004318
pred <- predict(object = recommend,
                newdata = getData(sett, "known"),
                n = 10,
                type = "topNList")
head(rownames(acc))
## [1] "6"  "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
##  [1] 449  92 103 272 476 480 468 474 455 573
## 
## $`1`
##  [1] 322 377 141  92 233  39  52 150 272 103
## 
## $`2`
##  [1] 141 377  92 274 150 272  52 374 171 157
## 
## $`3`
##  [1]  19 407  80 377 142 150 449  68 405 162
## 
## $`4`
##  [1] 322 274 141  52  92  11 233 103  82 150
## 
## $`5`
##  [1] 322 377 141  92 233 109 150 272 368 195

Compare Models

set.seed(1221)
models <- list(SVD = list(name = "SVD", param=list(k = 10)),
               IBCF = list(name = "IBCF", param = list(method = "cosine")),
               UBCF = list(name = "UBCF", param = NULL),
               random = list(name = "RANDOM", param=NULL))
n <- c(1, 3, 5, 10, 15, 20)
topn_results <- evaluate(x = sett, 
                         method = models, 
                         n = n, 
                         type = "topNList")
## SVD run fold/sample [model time/prediction time]
##   1  [0.036sec/0.018sec] 
##   2  [0.034sec/0.022sec] 
##   3  [0.035sec/0.021sec] 
##   4  [0.032sec/0.023sec] 
##   5  [0.038sec/0.021sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.251sec/0.021sec] 
##   2  [0.353sec/0.021sec] 
##   3  [0.248sec/0.025sec] 
##   4  [0.248sec/0.022sec] 
##   5  [0.261sec/0.023sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.002sec/0.196sec] 
##   2  [0.001sec/0.196sec] 
##   3  [0.002sec/0.196sec] 
##   4  [0.001sec/0.196sec] 
##   5  [0.001sec/0.192sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0.004sec/0.016sec] 
##   2  [0.001sec/0.015sec] 
##   3  [0.004sec/0.015sec] 
##   4  [0.001sec/0.015sec] 
##   5  [0.001sec/0.014sec]
plot(topn_results, y = "ROC", annotate = 1, legend="topleft")
title("ROC Curve")


rating_results <- evaluate(x = sett, 
                           method = models, 
                           type = "ratings")
## SVD run fold/sample [model time/prediction time]
##   1  [0.034sec/0.012sec] 
##   2  [0.035sec/0.016sec] 
##   3  [0.035sec/0.014sec] 
##   4  [0.032sec/0.013sec] 
##   5  [0.037sec/0.012sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.25sec/0.013sec] 
##   2  [0.357sec/0.013sec] 
##   3  [0.247sec/0.018sec] 
##   4  [0.245sec/0.015sec] 
##   5  [0.251sec/0.013sec] 
## UBCF run fold/sample [model time/prediction time]
##   1  [0.003sec/0.186sec] 
##   2  [0.002sec/0.185sec] 
##   3  [0.001sec/0.185sec] 
##   4  [0.001sec/0.193sec] 
##   5  [0.002sec/0.184sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0.001sec/0.008sec] 
##   2  [0.001sec/0.009sec] 
##   3  [0.001sec/0.012sec] 
##   4  [0.001sec/0.009sec] 
##   5  [0.004sec/0.008sec]
plot(rating_results, ylim = c(0,4))


Conclusion

  1. funkSVD’s accuracy is higher than that of UBCF and IBCF

  2. The results recommended by different recommendation methods are quite different

References

1993年的電影技術躍進

FunkSVD

recommenderlab

A Tutorial on Principal Component Analysis

Visualizing Data using t-SNE