The following R libraries are used in this project
library(dplyr)
library(plotly)
library(knitr)
library(vegan)
library(Rtsne)
library(recommenderlab)
movies <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.item', sep='|', header = F)
colnames(movies) <- c("movie_id" , "movie_title" , "release_date" , "video_release_date" , "IMDb_URL" ,
"unknown" , "Action" , "Adventure" , "Animation" , "Children's" ,
"Comedy" , "Crime" , "Documentary" , "Drama" , "Fantasy" ,
"Film-Noir" , "Horror" , "Musical" , "Mystery" , "Romance" ,
"Sci-Fi" , "Thriller" , "War" , "Western")
users <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.user', sep='|', header = F)
colnames(users) <- c("user_id" , "age" , "gender" , "occupation" , "zip_code")
d <- read.csv('/Users/shichenghan/Documents/Institute/111-1/RS/ml-100k/u.data', sep='\t', header = F)
colnames(d) <- c("user_id" , "item_id" , "rating" , "timestamp")
type <- colnames(movies)[6:24]
n <- movies[,6:24] %>% colSums()
m <- cbind(type, n) %>% as.data.frame()
m$n <- as.numeric(m$n)
m <- m[order(m$n, decreasing = T),]
m$type <- factor(m$type, levels = unique(m$type)[order(m$n, decreasing = TRUE)])
fig1 <- plot_ly(m, x = ~type, y = ~n, type = 'bar',
marker = list(color = c('rgba(222,45,38,0.8)',
rep('rgba(204,204,204,1)', 18))))
fig1 <- fig1 %>% layout(title = "What are common genere of movies ?",
xaxis = list(title = ""),
yaxis = list(title = ""))
fig1
mg <- rowSums(movies[,6:24]) %>% table() %>% as.data.frame()
colnames(mg) <- c("Genre", "n")
mg$Genre <- factor(mg$Genre, levels = mg$Genre[order(mg$n, decreasing = TRUE)])
fig2 <- plot_ly(mg, x = ~Genre, y = ~n, type = 'bar',
marker = list(color = c(rep('rgba(222,45,38,0.8)', 2),
rep('rgba(204,204,204,1)', 4))))
fig2 <- fig2 %>% layout(title = "<b> About 80% of movies have two or fewer genre <b>",
xaxis = list(title = "Genre"),
yaxis = list(title = ""))
fig2
m <- d$item_id %>% table() %>% as.data.frame()
year <- movies$release_date %>% substr(., 8, 11)
month <- movies$release_date %>% substr(., 4, 6)
day <- movies$release_date %>% substr(., 1, 2)
m <- data.frame(title = movies$movie_title,
year = year,
month = month,
day = day,
n = m[,-1])
m <- m[order(m$n, decreasing = TRUE),]
m$day[which(m$year==971)] <- 04
m$month[which(m$year==971)] <- "Feb"
m$year[which(m$year==971)] <- 1971
ym <- m$year %>% table() %>% as.data.frame()
ym <- ym[-1,]
colnames(ym) <- c("year", "n")
ym <- ym[order(ym$year, decreasing = F),]
ym$type <- factor(ym$year, levels = unique(ym$year)[order(ym$n, decreasing = TRUE)])
fig3 <- plot_ly(ym, x = ~year, y = ~n, type = 'bar',
marker = list(color = c(rep('rgba(204,204,204,1)', 65),
'rgba(222,45,38,0.8)',
rep('rgba(204,204,204,1)', 5))))
fig3 <- fig3 %>% layout(title = "<b> People mostly watched movies released in 90s <b>",
xaxis = list(title = ""),
yaxis = list(title = ""))
fig3
mm <- m$month %>% table %>% as.data.frame()
mm <- mm[-1,]
colnames(mm) <- c("month", "n")
mm$n[which(mm$month=="Jan")] <- mm$n[which(mm$month=="Jan")] - which(m$month=="Jan" & m$day=="01") %>% length()
mm$month <- mm$month %>% as.vector()
mm[13,] <- c("Jan_01", which(m$month=="Jan" & m$day=="01") %>% length())
mm$month <- mm$month %>% factor(., levels = c('Jan_01', 'Jan', 'Feb', 'Mar', 'Apr',
'May', 'Jun', 'Jul', 'Aug', 'Sep',
'Oct', 'Nov', 'Dec'))
mm$n <- as.numeric(mm$n)
fig4 <- plot_ly(mm, x = ~month, y = ~n, type = 'bar',
marker = list(color = c(rep('rgba(204,204,204,1)', 12),
'rgba(222,45,38,0.8)')))
fig4 <- fig4 %>% layout(title = "<b> Jan 1st is the default release month and date ! <b>",
xaxis = list(title = ""),
yaxis = list(title = ""))
fig4
u <- table(users$occupation) %>% as.data.frame()
colnames(u) <- c("year", "n")
u <- u[order(u$n, decreasing = T),]
u$year <- factor(u$year, levels = unique(u$year)[order(u$n, decreasing = TRUE)])
fig5 <- plot_ly(u, x = ~year, y = ~n, year = 'bar',
marker = list(color = c('rgba(222,45,38,0.8)',
rep('rgba(204,204,204,1)', 20))))
fig5 <- fig5 %>% layout(title = "<b> Which kind of occupant watches more movies ? <b>",
xaxis = list(title = ""),
yaxis = list(title = ""))
fig5
fig6 <- plot_ly(y = users$age[which(users$occupation=="student")],
type = "box", name="Student")
fig6 <- fig6 %>% add_trace(y = ~users$age[which(users$occupation=="retired")], name="Retired")
fig6 <- fig6 %>% add_trace(y = ~users$age[-c(which(users$occupation=="student"),
which(users$occupation=="retired"))], name="Others")
fig6 <- fig6 %>% layout(title = "<b> Age Distribution in Different Occupations <b>")
fig6
rd <- d %>% as(., "realRatingMatrix")
rt <- as.vector(rd@data)
rt <- rt[rt != 0]
fig7 <- plot_ly(x = ~rt, type = "histogram", alpha = 0.6) %>%
layout(title = "<b> Histogram of Ratings <b>",
xaxis = list(title = "Rating"))
fig7
rdn <- normalize(rd)
rtn <- as.vector(rdn@data)
rtn <- rtn[rtn != 0]
fig8 <- plot_ly(x = ~rtn, type = "histogram", alpha = 0.6, nbinsx = 40) %>%
layout(title = "<b> Histogram of Normalized Ratings <b>",
xaxis = list(title = "Rating"))
fig8
Zero-One Matrix
kable(head(movies[,c(6:14)]), "simple")
unknown | Action | Adventure | Animation | Children’s | Comedy | Crime | Documentary | Drama |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Jaccard
Similarity of two sets \(U\) and \(V\).
\[ Jaccard(U,V) = \frac{|U \cap V|}{|U \cup V|} \]
Distance Matrix
mjd <- movies[6:24] %>% t() %>% vegdist(., method = "jaccard") %>% as.matrix()
kable(head(mjd[,1:6]), "simple")
unknown | Action | Adventure | Animation | Children’s | Comedy | |
---|---|---|---|---|---|---|
unknown | 0 | 1.0000000 | 1.0000000 | 1.0000000 | 1.0000000 | 1.0000000 |
Action | 1 | 0.0000000 | 0.7588424 | 0.9896552 | 0.9752747 | 0.9558011 |
Adventure | 1 | 0.7588424 | 0.0000000 | 0.9649123 | 0.8046512 | 0.9710611 |
Animation | 1 | 0.9896552 | 0.9649123 | 0.0000000 | 0.7480916 | 0.9813780 |
Children’s | 1 | 0.9752747 | 0.8046512 | 0.7480916 | 0.0000000 | 0.9300341 |
Comedy | 1 | 0.9558011 | 0.9710611 | 0.9813780 | 0.9300341 | 0.0000000 |
mjd_pca <- prcomp(mjd)
mjd_pc <- as.data.frame(mjd_pca$rotation)
mjd_pc$feature <- row.names(mjd_pc)
vars <- (mjd_pca$sdev)^2
props <- vars / sum(vars)
cumulative.props <- cumsum(props)
plot_ly(x=mjd_pc$PC1, y=mjd_pc$PC2, z=mjd_pc$PC3,
type="scatter3d", mode="markers", color=mjd_pc$feature) %>%
layout(showlegend = TRUE, title="<b> Movie Genre PCA <b>",
scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')),
legend = list(itemsizing='constant', font = list(size = 12)))
set.seed(1221)
mjd_tsne <- Rtsne(mjd, dims = 3, perplexity = 5)
mjd_y <- as.data.frame(mjd_tsne$Y)
mjd_y$feature <- row.names(mjd_pc)
plot_ly(x=mjd_y[,1], y=mjd_y[,2], z=mjd_y[,3],
type="scatter3d", mode="markers", color=mjd_pc$feature) %>%
layout(showlegend = TRUE, title="<b> Movie Genre t-SNE <b>",
legend = list(itemsizing='constant', font = list(size = 12)))
we restrict the model training to those users who have rated at least 50 movies, and those movies that have been rated by at least 50 users.
Before
rd
## 943 x 1682 rating matrix of class 'realRatingMatrix' with 100000 ratings.
After
srd <- rd[rowCounts(rd) >= 50, colCounts(rd) >= 50]
srd
## 568 x 603 rating matrix of class 'realRatingMatrix' with 73544 ratings.
percent <- 0.8
items <- -5
rating <- 4
k <- 5
sett <- evaluationScheme(data = srd,
method = "cross-validation",
train = percent,
given = items,
goodRating = rating,
k = k)
sett
## Evaluation scheme using all-but-5 items
## Method: 'cross-validation' with 5 run(s).
## Good ratings: >=4.000000
## Data set: 568 x 603 rating matrix of class 'realRatingMatrix' with 73544 ratings.
recommend <- Recommender(data = getData(sett, "train"),
method = "UBCF", parameter = NULL)
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
data = getData(sett, "unknown"),
byUser = TRUE)
colMeans(acc, na.rm = T)
## RMSE MSE MAE
## 0.9881104 1.1440621 0.8343489
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
n = 10,
type = "topNList")
head(rownames(acc))
## [1] "6" "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
## [1] 587 92 339 226 104 511 537 534 524 599
##
## $`1`
## [1] 136 172 173 191 349 486 18 106 35 164
##
## $`2`
## [1] 405 375 451 48 438 161 184 37 374 442
##
## $`3`
## [1] 226 47 48 412 162 468 478 526 111 377
##
## $`4`
## [1] 295 36 305 420 463 402 162 69 104 109
##
## $`5`
## [1] 199 338 49 466 256 262 173 501 271 245
recommend <- Recommender(data = getData(sett, "train"),
method = "IBCF", parameter = list(method = "cosine"))
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
n = 10,
type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
data = getData(sett, "unknown"),
byUser = TRUE)
colMeans(acc, na.rm = T)
## RMSE MSE MAE
## 0.9157892 0.9753539 0.7571266
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
n = 10,
type = "topNList")
head(rownames(acc))
## [1] "6" "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
## [1] 131 514 560 578 598 524 566 239 511 287
##
## $`1`
## [1] 149 332 363 460 489 1 126 441 117 177
##
## $`2`
## [1] 17 24 55 60 78 135 138 149 198 206
##
## $`3`
## [1] 155 328 592 68 111 76 377 412 177 172
##
## $`4`
## [1] 36 71 108 293 375 388 400 569 573 118
##
## $`5`
## [1] 139 212 311 440 76 49 20 322 396 418
recommend <- Recommender(data = getData(sett, "train"),
method = "SVD", parameter = list(k = 10))
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
n = 10,
type = "ratings")
acc <- calcPredictionAccuracy(x = pred,
data = getData(sett, "unknown"),
byUser = TRUE)
colMeans(acc, na.rm = T)
## RMSE MSE MAE
## 0.8387843 0.7974579 0.7004318
pred <- predict(object = recommend,
newdata = getData(sett, "known"),
n = 10,
type = "topNList")
head(rownames(acc))
## [1] "6" "11" "14" "43" "48" "64"
head(pred@items)
## $`0`
## [1] 449 92 103 272 476 480 468 474 455 573
##
## $`1`
## [1] 322 377 141 92 233 39 52 150 272 103
##
## $`2`
## [1] 141 377 92 274 150 272 52 374 171 157
##
## $`3`
## [1] 19 407 80 377 142 150 449 68 405 162
##
## $`4`
## [1] 322 274 141 52 92 11 233 103 82 150
##
## $`5`
## [1] 322 377 141 92 233 109 150 272 368 195
set.seed(1221)
models <- list(SVD = list(name = "SVD", param=list(k = 10)),
IBCF = list(name = "IBCF", param = list(method = "cosine")),
UBCF = list(name = "UBCF", param = NULL),
random = list(name = "RANDOM", param=NULL))
n <- c(1, 3, 5, 10, 15, 20)
topn_results <- evaluate(x = sett,
method = models,
n = n,
type = "topNList")
## SVD run fold/sample [model time/prediction time]
## 1 [0.036sec/0.018sec]
## 2 [0.034sec/0.022sec]
## 3 [0.035sec/0.021sec]
## 4 [0.032sec/0.023sec]
## 5 [0.038sec/0.021sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.251sec/0.021sec]
## 2 [0.353sec/0.021sec]
## 3 [0.248sec/0.025sec]
## 4 [0.248sec/0.022sec]
## 5 [0.261sec/0.023sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.002sec/0.196sec]
## 2 [0.001sec/0.196sec]
## 3 [0.002sec/0.196sec]
## 4 [0.001sec/0.196sec]
## 5 [0.001sec/0.192sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0.004sec/0.016sec]
## 2 [0.001sec/0.015sec]
## 3 [0.004sec/0.015sec]
## 4 [0.001sec/0.015sec]
## 5 [0.001sec/0.014sec]
plot(topn_results, y = "ROC", annotate = 1, legend="topleft")
title("ROC Curve")
rating_results <- evaluate(x = sett,
method = models,
type = "ratings")
## SVD run fold/sample [model time/prediction time]
## 1 [0.034sec/0.012sec]
## 2 [0.035sec/0.016sec]
## 3 [0.035sec/0.014sec]
## 4 [0.032sec/0.013sec]
## 5 [0.037sec/0.012sec]
## IBCF run fold/sample [model time/prediction time]
## 1 [0.25sec/0.013sec]
## 2 [0.357sec/0.013sec]
## 3 [0.247sec/0.018sec]
## 4 [0.245sec/0.015sec]
## 5 [0.251sec/0.013sec]
## UBCF run fold/sample [model time/prediction time]
## 1 [0.003sec/0.186sec]
## 2 [0.002sec/0.185sec]
## 3 [0.001sec/0.185sec]
## 4 [0.001sec/0.193sec]
## 5 [0.002sec/0.184sec]
## RANDOM run fold/sample [model time/prediction time]
## 1 [0.001sec/0.008sec]
## 2 [0.001sec/0.009sec]
## 3 [0.001sec/0.012sec]
## 4 [0.001sec/0.009sec]
## 5 [0.004sec/0.008sec]
plot(rating_results, ylim = c(0,4))
funkSVD’s accuracy is higher than that of UBCF and IBCF
The results recommended by different recommendation methods are quite different
References