Building Recommendation Systems for BBC iPlayer users

Loading the Data

#Loadning the needed 2 datasets
movie_data <- fread(input="~/Documents/LBS/Machine Learning/Session 02/The Assignment/movies.csv",
                    stringsAsFactors = FALSE) #9742 number of rows

rating_data <- fread(input = "~/Documents/LBS/Machine Learning/Session 02/The Assignment/ratings.csv",nrows = 1e6) #100836 number of rows

#Creating the genre list for my data
list_genre <- c("Action", "Adventure", "Animation", "Children", 
                "Comedy", "Crime","Documentary", "Drama", "Fantasy",
                "Film-Noir", "Horror", "Musical", "Mystery","Romance",
                "Sci-Fi", "Thriller", "War", "Western")

Cleaning the Data

Cleaning Movies Data

#Cleaning up the movie_data from the movies that appear more than once

#Finding the movies that appeard more than once
repeatMovies <- movie_data %>% 
  group_by(title) %>% 
  summarise(n = n()) %>% 
  filter(n > 1) %>% 
  pull(title)

repeatMovies # 5 movies appear more than once

## [1] "Confessions of a Dangerous Mind (2002)"
## [2] "Emma (1996)"                           
## [3] "Eros (2004)"                           
## [4] "Saturn 3 (1980)"                       
## [5] "War of the Worlds (2005)"

# [1] "Confessions of a Dangerous Mind (2002)" "Emma (1996)"                           
# [3] "Eros (2004)"                            "Saturn 3 (1980)"                       
# [5] "War of the Worlds (2005)" 

removeRows <- 0  #Initialising  the variable that stores the rows we have to remove later (the duplicates)

##### REMOVE REPEATS IN RATING DATA ####

for(i in repeatMovies){
  repeatMovieLoc <- which(movie_data$title == i) #Getting the indices of the rows where the current movie (with title i) is stored 
  
  tempGenre <- paste(movie_data$genres[repeatMovieLoc], collapse="|")   #Collapsing the genres assigned to the duplicated movies, separated by |
  
tempGenre <- paste(unique(unlist(strsplit(tempGenre, split = "\\|")[[1]])), collapse = "|")   #Splitting the string up again, taking only the unique values

movie_data$genres[repeatMovieLoc[1]] <- tempGenre   #Bringing the new genre string to the first appearance of the movie in the dataframe
  

removeRows <- c(removeRows, repeatMovieLoc[-1])  #Adding the duplicate entry (row 6271) to the list of rows I want to remove later
  
### REMOVE DUPLICATES IN RATINGS DATA ###

  
repeatMovieIdLoc <- which(rating_data$movieId %in% movie_data$movieId[removeRows]) #Searching for all the entries in the rating_data dataframe that refer to duplicate movies in movie_data
  
rating_data$movieId[repeatMovieIdLoc] <- movie_data$movieId[repeatMovieLoc[1]]  #Changing movieId to the movieId that remains in our movie_data dataframe
}

#Removing the duplicate rows in the movie_data
movie_data <- movie_data[-removeRows,]
movie_data

##       movieId                                     title
##    1:       1                          Toy Story (1995)
##    2:       2                            Jumanji (1995)
##    3:       3                   Grumpier Old Men (1995)
##    4:       4                  Waiting to Exhale (1995)
##    5:       5        Father of the Bride Part II (1995)
##   ---                                                  
## 9733:  193581 Black Butler: Book of the Atlantic (2017)
## 9734:  193583              No Game No Life: Zero (2017)
## 9735:  193585                              Flint (2017)
## 9736:  193587       Bungo Stray Dogs: Dead Apple (2018)
## 9737:  193609       Andrew Dice Clay: Dice Rules (1991)
##                                            genres
##    1: Adventure|Animation|Children|Comedy|Fantasy
##    2:                  Adventure|Children|Fantasy
##    3:                              Comedy|Romance
##    4:                        Comedy|Drama|Romance
##    5:                                      Comedy
##   ---                                            
## 9733:             Action|Animation|Comedy|Fantasy
## 9734:                    Animation|Comedy|Fantasy
## 9735:                                       Drama
## 9736:                            Action|Animation
## 9737:                                      Comedy

Cleaning Ratings Data

There is a risk that the same user has rated a movie multiple times. I would like to keep only the best rating.

I will group by userId and movieId, and only take the highest (maximum) rating In the end, I will produce a dataframe, where there is (at most) 1 rating of a user for a movie. This is a constraint needed to build up a proper user-movie-interaction-matrix

rating_data <- rating_data %>% 
  group_by(userId, movieId) %>% 
  summarise(rating = max(rating)) %>% 
  ungroup()

length(unique(rating_data$movieId)) #Identifying movies that have not been yet rated by any user. They are safe to ignore for now, since they shouldn't affect our recommendations

## [1] 9719

length(unique(movie_data$movieId))

## [1] 9737

setdiff(unique(movie_data$movieId), unique(rating_data$movieId))

##  [1]  1076  2939  3338  3456  4194  5721  6668  6849  7020  7792  8765 25855
## [13] 26085 30892 32160 32371 34482 85565

rating_data

## # A tibble: 100,832 x 3
##    userId movieId rating
##     <int>   <int>  <dbl>
##  1      1       1      4
##  2      1       3      4
##  3      1       6      4
##  4      1      47      5
##  5      1      50      5
##  6      1      70      3
##  7      1     101      5
##  8      1     110      4
##  9      1     151      5
## 10      1     157      5
## # … with 100,822 more rows

Checking the Data is Clean

#movie_data
skim(movie_data)

(#tab:performing_data_checks)Data summary
Name	movie_data
Number of rows	9737
Number of columns	3
_______________________
Column type frequency:
character	2
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	6	158	0	9737	0
genres	0	1	3	77	0	952	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
movieId	0	1	42165	52136	1	3247	7294	76173	193609	▇▂▂▁▁

summary(movie_data)

##     movieId          title              genres         
##  Min.   :     1   Length:9737        Length:9737       
##  1st Qu.:  3247   Class :character   Class :character  
##  Median :  7294   Mode  :character   Mode  :character  
##  Mean   : 42165                                        
##  3rd Qu.: 76173                                        
##  Max.   :193609

head(movie_data)

##    movieId                              title
## 1:       1                   Toy Story (1995)
## 2:       2                     Jumanji (1995)
## 3:       3            Grumpier Old Men (1995)
## 4:       4           Waiting to Exhale (1995)
## 5:       5 Father of the Bride Part II (1995)
## 6:       6                        Heat (1995)
##                                         genres
## 1: Adventure|Animation|Children|Comedy|Fantasy
## 2:                  Adventure|Children|Fantasy
## 3:                              Comedy|Romance
## 4:                        Comedy|Drama|Romance
## 5:                                      Comedy
## 6:                       Action|Crime|Thriller

#rating_data
skim(movie_data)

(#tab:performing_data_checks)Data summary
Name	movie_data
Number of rows	9737
Number of columns	3
_______________________
Column type frequency:
character	2
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	6	158	0	9737	0
genres	0	1	3	77	0	952	0

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
movieId	0	1	42165	52136	1	3247	7294	76173	193609	▇▂▂▁▁

summary(rating_data)

##      userId       movieId           rating   
##  Min.   :  1   Min.   :     1   Min.   :0.5  
##  1st Qu.:177   1st Qu.:  1199   1st Qu.:3.0  
##  Median :325   Median :  2991   Median :3.5  
##  Mean   :326   Mean   : 19430   Mean   :3.5  
##  3rd Qu.:477   3rd Qu.:  8119   3rd Qu.:4.0  
##  Max.   :610   Max.   :193609   Max.   :5.0

head(rating_data)

## # A tibble: 6 x 3
##   userId movieId rating
##    <int>   <int>  <dbl>
## 1      1       1      4
## 2      1       3      4
## 3      1       6      4
## 4      1      47      5
## 5      1      50      5
## 6      1      70      3

dupes_movie<-movie_data%>%get_dupes(movieId,title) # checking for dupes once again
dupes_movie # no dupes

## Empty data.table (0 rows and 4 cols): movieId,title,dupe_count,genres

dupes_rating<-rating_data%>%get_dupes(userId,movieId)
dupes_rating # no dupes

## # A tibble: 0 x 4
## # … with 4 variables: userId <int>, movieId <int>, dupe_count <int>,
## #   rating <dbl>

movie_data

##       movieId                                     title
##    1:       1                          Toy Story (1995)
##    2:       2                            Jumanji (1995)
##    3:       3                   Grumpier Old Men (1995)
##    4:       4                  Waiting to Exhale (1995)
##    5:       5        Father of the Bride Part II (1995)
##   ---                                                  
## 9733:  193581 Black Butler: Book of the Atlantic (2017)
## 9734:  193583              No Game No Life: Zero (2017)
## 9735:  193585                              Flint (2017)
## 9736:  193587       Bungo Stray Dogs: Dead Apple (2018)
## 9737:  193609       Andrew Dice Clay: Dice Rules (1991)
##                                            genres
##    1: Adventure|Animation|Children|Comedy|Fantasy
##    2:                  Adventure|Children|Fantasy
##    3:                              Comedy|Romance
##    4:                        Comedy|Drama|Romance
##    5:                                      Comedy
##   ---                                            
## 9733:             Action|Animation|Comedy|Fantasy
## 9734:                    Animation|Comedy|Fantasy
## 9735:                                       Drama
## 9736:                            Action|Animation
## 9737:                                      Comedy

searchMatrix <- movie_data %>% 
  separate_rows(genres, sep = "\\|") %>% #Splitting the genres assigned to movies into multiple rows
  mutate(values = 1) %>% 
  pivot_wider(names_from = genres, values_from = values) %>%    #Pivoting the dataframe so that each genre is assigned its own column
  replace(is.na(.), 0)

searchMatrix %>%  #Some movies have no genres listed, this is not problematic for the recommendation model, since it is purely collaborative, rather than content-based
  filter(`(no genres listed)`==1) %>% 
  length()

## [1] 22

Building the Rating Matrix (User-Movie-Interactions Matrix)

ratingMatrix <- rating_data %>% #A ratings matrix consists of userId's as rows and movieId's as columns
  arrange(movieId) %>% 
  pivot_wider(names_from = movieId, values_from = rating) %>%   #allocating each movie to its own column
  arrange(userId) %>% 
  select(-userId)   #dropping userId


ratingMatrix <- as.matrix(ratingMatrix) #converting the dataframe created above into a matrix

#Ordering the rows and columns so that those with the fewest NAs are furthest to the top left, which is crucial for selecting the most popular movies and most frequent users
ratingMatrix <- ratingMatrix[order(rowSums(is.na(ratingMatrix))),order(colSums(is.na(ratingMatrix)))]


ratingMatrix <- as(ratingMatrix, "realRatingMatrix") #Converting the rating matrix into a recommenderlab sparse matrix

STEP 1 - EXPLORATORY ANALYSIS

Preparing the Data

Ratings Counts

rating_data %>% 
  group_by(rating) %>% 
  summarise(n = n()) #Creating a count of movie ratings

## # A tibble: 10 x 2
##    rating     n
##     <dbl> <int>
##  1    0.5  1370
##  2    1    2811
##  3    1.5  1791
##  4    2    7550
##  5    2.5  5550
##  6    3   20047
##  7    3.5 13134
##  8    4   26817
##  9    4.5  8551
## 10    5   13211

movie_rating_count <- rating_data %>% 
  group_by(movieId) %>% 
  summarise(ratings = n()) %>% 
  left_join(movie_data, by = "movieId") %>% 
  select(movieId, title, ratings) %>% 
  arrange(desc(ratings)) #Creating a table with number of ratings per movie

head(movie_rating_count) #previewing

## # A tibble: 6 x 3
##   movieId title                                     ratings
##     <int> <chr>                                       <int>
## 1     356 Forrest Gump (1994)                           329
## 2     318 Shawshank Redemption, The (1994)              317
## 3     296 Pulp Fiction (1994)                           307
## 4     593 Silence of the Lambs, The (1991)              279
## 5    2571 Matrix, The (1999)                            278
## 6     260 Star Wars: Episode IV - A New Hope (1977)     251

user_rating_count <- rating_data %>%  #Creating a table with number of ratings per user
  group_by(userId) %>% 
  summarise(ratings = n()) %>% 
  arrange(desc(ratings))


head(user_rating_count) #preaviewing

## # A tibble: 6 x 2
##   userId ratings
##    <int>   <int>
## 1    414    2698
## 2    599    2478
## 3    474    2108
## 4    448    1864
## 5    274    1346
## 6    610    1302

How similar the 100 most popular movies are to eachother

movie_similarity <- similarity(ratingMatrix[, 1:100], 
                               method = "cosine",
                               which = "items")


movie_similarity <- as.matrix(movie_similarity) + diag(100) #Adding diag(100) - the similarity of a movie with itself is always 1

image(movie_similarity, main = "Top 100 Popular Movies' Similarity")

How similar the 100 most frequent users are to eachother

#I produce a matrix which calculates how similar users are to one another
user_similarity <- similarity(ratingMatrix[1:100, ],
                              method = "cosine",
                              which = "users")

#I also add diag(100), since the similarity of a user with themselves is 1
user_similarity <- as.matrix(user_similarity) + diag(100)

image(user_similarity, main = "100 Top active users' similarity")

a Histogram Showing the Frequency of Ratings of All Movies

rating_data %>% 
  group_by(rating) %>% 
  count()

## # A tibble: 10 x 2
## # Groups:   rating [10]
##    rating     n
##     <dbl> <int>
##  1    0.5  1370
##  2    1    2811
##  3    1.5  1791
##  4    2    7550
##  5    2.5  5550
##  6    3   20047
##  7    3.5 13134
##  8    4   26817
##  9    4.5  8551
## 10    5   13211

rating_data %>%  #Building a histogram
  ggplot(aes(rating)) + 
  geom_bar() +
  labs(title="Movies were most frequently rated for 4.0 ",subtitle = "Movie rating frequency",x="Rating",y="Count") + 
  theme_minimal() + 
  theme(plot.title = element_text(color = "gray60",size=15,face="bold", family= "Montserrat"),
        axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "plain"),
        plot.subtitle = element_text(color = "gray60", face="plain", ,size= 10,family= "Montserrat"),
        axis.title.x = element_text(size = 8, family="Montserrat", face = "plain"),
        axis.text.x=element_text(family="Montserrat", size=7),
        axis.text.y=element_text(family="Montserrat", size=7))+
  scale_x_continuous(limits=c(0,5.5)) +
  scale_y_continuous(limits=c(0,30000))

(rating_data)

## # A tibble: 100,832 x 3
##    userId movieId rating
##     <int>   <int>  <dbl>
##  1      1       1      4
##  2      1       3      4
##  3      1       6      4
##  4      1      47      5
##  5      1      50      5
##  6      1      70      3
##  7      1     101      5
##  8      1     110      4
##  9      1     151      5
## 10      1     157      5
## # … with 100,822 more rows

STEP 2 - SELECT MOST COMMON USERS AND MOVIES

I will select movies with more than ‘m’ ratings and users who have given more than ‘n’ ratings, where m <- c(10, 20, 50, 100, 200); n <- m

#Since the rows (users) and columns (movies) of my ratingMatrix are already ranked in decreasing order of frequency/popularity, we are just slicing the matrix

#movies rated more than 10 times and users who have rated more than 10 movies
top10_rate <- ratingMatrix[rowCounts(ratingMatrix) > 10,colCounts(ratingMatrix) > 10] 

#movies rated more than 20 times and users who have rated more than 20 movies
top20_rate <- ratingMatrix[rowCounts(ratingMatrix) > 20,colCounts(ratingMatrix) > 20] 

#movies rated more than 50 times and users who have rated more than 50 movies
top50_rate <- ratingMatrix[rowCounts(ratingMatrix) > 50,colCounts(ratingMatrix) > 50] 

#movies rated more than 100 times and users who have rated more than 100 movies
top100_rate <- ratingMatrix[rowCounts(ratingMatrix) > 100,colCounts(ratingMatrix) > 100] 
#top100_rate <- top100_rate[rowCounts(top100_rate) > 5, ]

#movies rated more than 200 times and users who have rated more than 200 movies
top200_rate <- ratingMatrix[rowCounts(ratingMatrix) > 200,colCounts(ratingMatrix) > 200]

STEP 3 - BUILDING RECOMMENDATION SYSTEMS

For building and testing my recommendation systems I built matrixes with movies containing more than m ratings and users who rated more than n movies, where n=m. I selected the following values of m & n to test: 10,20,50,100, 200. For each of the values I ran three models using methods: - User-user memory-based method (UBCF) - Item-item memory-based method (IBCF) Both of the above are memory based, collaborative filtering methods that learn from the past interactions between users/items to make new recommendations. The assumption behind these is that we have a sufficient number of interaction data in order for our predictions to a) establish similarities between users/items b) predict reliable outcomes. - Model-based approach using matrix factorisation (LIMBF) This method is based on user-item interactions, it allows for a level of approximation for groups or individuals, we can establish a mathematical level of confidence on the model. The drawback of this approach is that it works much slower and demands much bigger computing power to execute – it is much more costly to run.

Splitting each subset into test and training data

set.seed(1234) # setting seed
top10_split <- evaluationScheme(top10_rate, method="split", train=0.8, given=-5) # splitting all the data to 80% training data and 20% remaining for the testing
top10_split

## Evaluation scheme using all-but-5 items
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 610 x 2121 rating matrix of class 'realRatingMatrix' with 79636 ratings.

set.seed(1234)
top20_rate <- top20_rate[rowCounts(top20_rate) > 5, ]
top20_split <- evaluationScheme(top20_rate, method="split", train=0.8, given=-5)
top20_split

## Evaluation scheme using all-but-5 items
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 595 x 1235 rating matrix of class 'realRatingMatrix' with 66401 ratings.

set.seed(1234)
top50_split <- evaluationScheme(top50_rate, method="split", train=0.8, given=-5)
top50_split

## Evaluation scheme using all-but-5 items
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 378 x 436 rating matrix of class 'realRatingMatrix' with 36214 ratings.

set.seed(1234)
top100_rate <- top100_rate[rowCounts(top100_rate) > 5, ]
top100_split <- evaluationScheme(top100_rate, method="split", train=0.8, given=-5)

set.seed(1234)
top200_rate <- top200_rate[rowCounts(top200_rate) > 5, ]
top200_split <- evaluationScheme(top200_rate, method="split", train=0.8, given=-5)

I will build three separate recommendations systems for each of the values of ‘n’ and ‘m’ (5 versions for each type of RS)

Item Based CF

#TOP10
set.seed(1234) # setting seed
top10_IBCF <- Recommender(getData(top10_split, "train"), method = "IBCF", param=list(normalize = "center", method="Cosine",k=350))
#TOP20 
set.seed(1234) # setting seed
top20_IBCF <- Recommender(getData(top20_split, "train"), method = "IBCF", param=list(normalize = "center", method="Cosine",k=350))
#TOP50
set.seed(1234) # setting seed
top50_IBCF <- Recommender(getData(top50_split, "train"), method = "IBCF", param=list(normalize = "center", method="Cosine",k=350))
#TOP100
set.seed(1234) # setting seed
top100_IBCF <- Recommender(getData(top100_split, "train"), method = "IBCF", param=list(normalize = "center", method="Cosine",k=350))
#TOP200
set.seed(1234) # setting seed
top200_IBCF <- Recommender(getData(top200_split, "train"), method = "IBCF", param=list(normalize = "center", method="Cosine",k=350))

User Based CF

#TOP10
set.seed(1234) # setting seed
top10_UBCF <- Recommender(getData(top10_split, "train"), method = "UBCF", param=list(normalize = "center", method="Cosine", nn=25))
#TOP20
set.seed(1234) # setting seed
top20_UBCF <- Recommender(getData(top20_split, "train"), method = "UBCF", param=list(normalize = "center", method="Cosine", nn=25))
#TOP50
set.seed(1234) # setting seed
top50_UBCF <- Recommender(getData(top50_split, "train"), method = "UBCF", param=list(normalize = "center", method="Cosine", nn=25))
#TOP100
set.seed(1234) # setting seed
top100_UBCF <- Recommender(getData(top100_split, "train"), method = "UBCF", param=list(normalize = "center", method="Cosine", nn=25))
#TOP200
set.seed(1234) # setting seed
top200_UBCF <- Recommender(getData(top200_split, "train"), method = "UBCF", param=list(normalize = "center", method="Cosine", nn=25))

Model Based CF using Matrix Factorisation

#TOP10
set.seed(1234) # setting seed
top10_LIBMF <- Recommender(getData(top10_split, "train"), method = "LIBMF", param=list(normalize = "center", method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

#TOP20
set.seed(1234) # setting seed
top20_LIBMF <- Recommender(getData(top20_split, "train"), method = "LIBMF", param=list(normalize = "center", method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

#TOP50
set.seed(1234) # setting seed
top50_LIBMF <- Recommender(getData(top50_split, "train"), method = "LIBMF", param=list(normalize = "center", method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

#TOP100
set.seed(1234) # setting seed
top100_LIBMF <- Recommender(getData(top100_split, "train"), method = "LIBMF", param=list(normalize = "center", method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

#TOP200
set.seed(1234) # setting seed
top200_LIBMF <- Recommender(getData(top200_split, "train"), method = "LIBMF", param=list(normalize = "center", method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

In my work, I built in total 15 models. However, before I did it, I set the seed for ‘1234’ in order to split the data for each mxn matrix. It is important to mention, as one of the remarks from the analysis I realized is that the outcome and performance of the models is strongly dependent on the seed we choose. Using different seeds at the end led me to the same model choice, however, the outputs differed significantly from each other. Let me now explain how the models behave when we include only common movies/users first. When n and m= 200, we include in our model only the movies that were rated 200 or more times and the users that rated more than 200 movies. In such a manner, we significantly shrink the data. As a result, even though the RMSE is low in comparison to other models, our dataset does not have many values and it is not fully reliable. On the contrary, when we set our n and m=10 or 20, we include also rarely watched movies and not so active users. In this way, we obtain a large dataset including the majority of the observations. However, if we include such rare items/ users, our models have much less predictive power, and that ultimately lead us to higher RMSE. As we will see further from the RMSE analysis, the best choice lies in between, when n & m =100 or 50. The middle point seems to be the tradeoff of the two options mentioned above.

STEP 4 - Computing RMSE of each model and combination of m x n

I will assess performance of the above models for different number of ‘items’ and ‘users’. I need to remember CF methods are most appropriate for high volume items (often watched movies) and frequent users. So, if I use n,m == 20 then performance changes as opposed to when I use n, == 50.

For each recommendation system we expect to see 5 RMSE numbers, corresponding to n,m == {10,20,50,100,200}

RMSE_item_based_CF

set.seed(1234) # setting seed
#TOP10

#I compute predicted ratings by giving the model the known part of the test data (i.e. the data of the users for all but 5 movies for each user)
set.seed(1234) # setting seed
top10_IBCF_predict <- predict(top10_IBCF, getData(top10_split, "known"), type="ratings")

#Finally, I can calculate RMSE between the predictions and the unknown part of the test data (i.e. for the 5 movies that were held out)
set.seed(1234) # setting seed
top10_IBCF_RMSE <- calcPredictionAccuracy(top10_IBCF_predict, getData(top10_split, "unknown"))[1]
top10_IBCF_RMSE

## RMSE 
## 1.12

#TOP20

#I compute predicted ratings by giving the model the known part of the test data (i.e. the data of the users for all but 5 movies for each user)
set.seed(1234) # setting seed
top20_IBCF_predict <- predict(top20_IBCF, getData(top20_split, "known"), type="ratings")

#Finally, I can calculate RMSE between the predictions and the unknown part of the test data (i.e. for the 5 movies that were held out)
set.seed(1234) # setting seed
top20_IBCF_RMSE <- calcPredictionAccuracy(top20_IBCF_predict, getData(top20_split, "unknown"))[1]
top20_IBCF_RMSE

## RMSE 
## 1.05

#TOP20

#I compute predicted ratings by giving the model the known part of the test data (i.e. the data of the users for all but 5 movies for each user)
set.seed(1234) # setting seed
top50_IBCF_predict <- predict(top50_IBCF, getData(top50_split, "known"), type="ratings")

#Finally, I can calculate RMSE between the predictions and the unknown part of the test data (i.e. for the 5 movies that were held out)
set.seed(1234) # setting seed
top50_IBCF_RMSE <- calcPredictionAccuracy(top50_IBCF_predict, getData(top50_split, "unknown"))[1]
top50_IBCF_RMSE

##  RMSE 
## 0.848

#TOP20

#I compute predicted ratings by giving the model the known part of the test data (i.e. the data of the users for all but 5 movies for each user)
set.seed(1234) # setting seed
top100_IBCF_predict <- predict(top100_IBCF, getData(top100_split, "known"), type="ratings")

#Finally, I can calculate RMSE between the predictions and the unknown part of the test data (i.e. for the 5 movies that were held out)
set.seed(1234) # setting seed
top100_IBCF_RMSE <- calcPredictionAccuracy(top100_IBCF_predict, getData(top100_split, "unknown"))[1]
top100_IBCF_RMSE

##  RMSE 
## 0.823

#TOP200

#I compute predicted ratings by giving the model the known part of the test data (i.e. the data of the users for all but 5 movies for each user)
set.seed(1234) # setting seed
top200_IBCF_predict <- predict(top200_IBCF, getData(top200_split, "known"), type="ratings")

#Finally, I can calculate RMSE between the predictions and the unknown part of the test data (i.e. for the 5 movies that were held out)
set.seed(1234) # setting seed
top200_IBCF_RMSE <- calcPredictionAccuracy(top200_IBCF_predict, getData(top200_split, "unknown"))[1]
top200_IBCF_RMSE

##  RMSE 
## 0.815

RMSE_user_based_CF

set.seed(1234) # setting seed
#TOP10
top10_UBCF_predict <- predict(top10_UBCF, getData(top10_split, "known"), type="ratings")

set.seed(1234) # setting seed
top10_UBCF_RMSE <- calcPredictionAccuracy(top10_UBCF_predict, getData(top10_split, "unknown"))[1]
top10_UBCF_RMSE

## RMSE 
## 1.15

#TOP20
set.seed(1234) # setting seed
top20_UBCF_predict <- predict(top20_UBCF, getData(top20_split, "known"), type="ratings")

set.seed(1234) # setting seed
top20_UBCF_RMSE <- calcPredictionAccuracy(top20_UBCF_predict, getData(top20_split, "unknown"))[1]
top20_UBCF_RMSE

## RMSE 
## 1.24

#TOP50
set.seed(1234) # setting seed
top50_UBCF_predict <- predict(top50_UBCF, getData(top50_split, "known"), type="ratings")

set.seed(1234) # setting seed
top50_UBCF_RMSE <- calcPredictionAccuracy(top50_UBCF_predict, getData(top50_split, "unknown"))[1]
top50_UBCF_RMSE

##  RMSE 
## 0.975

#TOP100
set.seed(1234) # setting seed
top100_UBCF_predict <- predict(top100_UBCF, getData(top100_split, "known"), type="ratings")

set.seed(1234) # setting seed
top100_UBCF_RMSE <- calcPredictionAccuracy(top100_UBCF_predict, getData(top100_split, "unknown"))[1]
top100_UBCF_RMSE

##  RMSE 
## 0.868

#TOP200
set.seed(1234) # setting seed
top200_UBCF_predict <- predict(top200_UBCF, getData(top200_split, "known"), type="ratings")

set.seed(1234) # setting seed
top200_UBCF_RMSE <- calcPredictionAccuracy(top200_UBCF_predict, getData(top200_split, "unknown"))[1]
top200_UBCF_RMSE

##  RMSE 
## 0.827

RMSE_model_based_CF_using_MF

set.seed(1234) # setting seed
#TOP10
top10_LIBMF_predict <- predict(top10_LIBMF, getData(top10_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.1798   1.1613e+05
##    1       0.8518   6.3520e+04
##    2       0.8295   6.0723e+04
##    3       0.8056   5.7777e+04
##    4       0.7878   5.5632e+04
##    5       0.7759   5.4206e+04
##    6       0.7670   5.3179e+04
##    7       0.7604   5.2403e+04
##    8       0.7547   5.1748e+04
##    9       0.7497   5.1182e+04
##   10       0.7453   5.0694e+04
##   11       0.7411   5.0221e+04
##   12       0.7373   4.9807e+04
##   13       0.7335   4.9405e+04
##   14       0.7298   4.8997e+04
##   15       0.7261   4.8591e+04
##   16       0.7227   4.8231e+04
##   17       0.7192   4.7871e+04
##   18       0.7158   4.7514e+04
##   19       0.7124   4.7146e+04

set.seed(1234) # setting seed
top10_LIBMF_RMSE <- calcPredictionAccuracy(top10_LIBMF_predict, getData(top10_split, "unknown"))[1]
top10_LIBMF_RMSE

##  RMSE 
## 0.874

#TOP20
set.seed(1234) # setting seed
top20_LIBMF_predict <- predict(top20_LIBMF, getData(top20_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.1637   9.4129e+04
##    1       0.8500   5.2580e+04
##    2       0.8308   5.0597e+04
##    3       0.8094   4.8412e+04
##    4       0.7938   4.6822e+04
##    5       0.7826   4.5729e+04
##    6       0.7744   4.4932e+04
##    7       0.7685   4.4374e+04
##    8       0.7628   4.3822e+04
##    9       0.7583   4.3393e+04
##   10       0.7540   4.2986e+04
##   11       0.7501   4.2633e+04
##   12       0.7462   4.2265e+04
##   13       0.7425   4.1923e+04
##   14       0.7389   4.1610e+04
##   15       0.7355   4.1291e+04
##   16       0.7321   4.0990e+04
##   17       0.7288   4.0699e+04
##   18       0.7253   4.0381e+04
##   19       0.7221   4.0097e+04

set.seed(1234) # setting seed
top20_LIBMF_RMSE <- calcPredictionAccuracy(top20_LIBMF_predict, getData(top20_split, "unknown"))[1]
top20_LIBMF_RMSE

##  RMSE 
## 0.894

#TOP50
set.seed(1234) # setting seed
top50_LIBMF_predict <- predict(top50_LIBMF, getData(top50_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.1417   4.9467e+04
##    1       0.8429   2.8224e+04
##    2       0.8307   2.7527e+04
##    3       0.8154   2.6686e+04
##    4       0.7982   2.5750e+04
##    5       0.7846   2.5011e+04
##    6       0.7752   2.4517e+04
##    7       0.7683   2.4158e+04
##    8       0.7624   2.3845e+04
##    9       0.7580   2.3626e+04
##   10       0.7536   2.3398e+04
##   11       0.7500   2.3212e+04
##   12       0.7465   2.3037e+04
##   13       0.7436   2.2891e+04
##   14       0.7405   2.2732e+04
##   15       0.7377   2.2608e+04
##   16       0.7349   2.2465e+04
##   17       0.7321   2.2328e+04
##   18       0.7294   2.2192e+04
##   19       0.7268   2.2070e+04

set.seed(1234) # setting seed
top50_LIBMF_RMSE <- calcPredictionAccuracy(top50_LIBMF_predict, getData(top50_split, "unknown"))[1]
top50_LIBMF_RMSE

##  RMSE 
## 0.795

#TOP100
set.seed(1234) # setting seed
top100_LIBMF_predict <- predict(top100_LIBMF, getData(top100_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.1885   2.0024e+04
##    1       0.8100   9.8797e+03
##    2       0.7955   9.5802e+03
##    3       0.7718   9.1092e+03
##    4       0.7498   8.6914e+03
##    5       0.7362   8.4326e+03
##    6       0.7270   8.2630e+03
##    7       0.7217   8.1623e+03
##    8       0.7170   8.0813e+03
##    9       0.7129   8.0064e+03
##   10       0.7096   7.9432e+03
##   11       0.7067   7.8942e+03
##   12       0.7036   7.8378e+03
##   13       0.7008   7.7878e+03
##   14       0.6980   7.7401e+03
##   15       0.6954   7.6948e+03
##   16       0.6924   7.6407e+03
##   17       0.6898   7.5953e+03
##   18       0.6869   7.5460e+03
##   19       0.6844   7.5028e+03

set.seed(1234) # setting seed
top100_LIBMF_RMSE <- calcPredictionAccuracy(top100_LIBMF_predict, getData(top100_split, "unknown"))[1]
top100_LIBMF_RMSE

##  RMSE 
## 0.738

#TOP200
set.seed(1234) # setting seed
top200_LIBMF_predict <- predict(top200_LIBMF, getData(top200_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.6403   4.2879e+03
##    1       0.7882   1.1040e+03
##    2       0.7654   1.0478e+03
##    3       0.7472   1.0080e+03
##    4       0.7196   9.4929e+02
##    5       0.6924   8.9553e+02
##    6       0.6676   8.4433e+02
##    7       0.6521   8.1507e+02
##    8       0.6400   7.9177e+02
##    9       0.6307   7.7467e+02
##   10       0.6233   7.6104e+02
##   11       0.6168   7.4848e+02
##   12       0.6109   7.3810e+02
##   13       0.6059   7.2794e+02
##   14       0.6002   7.1799e+02
##   15       0.5951   7.0874e+02
##   16       0.5915   7.0253e+02
##   17       0.5863   6.9293e+02
##   18       0.5830   6.8768e+02
##   19       0.5777   6.7729e+02

set.seed(1234) # setting seed
top200_LIBMF_RMSE <- calcPredictionAccuracy(top200_LIBMF_predict, getData(top200_split, "unknown"))[1]
top200_LIBMF_RMSE

##  RMSE 
## 0.796

STEP 5 - REPORTING PLOTS SHOWING HOW RMSE CHANGES GIVEN THE MODEL AND VALUES OF ‘N’ AND ‘M’

For each of my models, I predicted the values on the testing set and then in the next step, I checked the RMSEs for all the models. As we can observe across all the models, when n & m=100, the RMSE turns out to be the lowest for each of the methods. The Visualisation of all the RMSEs can help us better understand the performance of all the models.

rmse_ibcf<-c()
rmse_ubcf<-c()
rmse_libmf<-c()

rmse_ibcf <- c(top10_IBCF_RMSE, top20_IBCF_RMSE, top50_IBCF_RMSE, top100_IBCF_RMSE, top200_IBCF_RMSE)
rmse_ubcf <- c(top10_UBCF_RMSE, top20_UBCF_RMSE, top50_UBCF_RMSE, top100_UBCF_RMSE, top200_UBCF_RMSE)
rmse_libmf <- c(top10_LIBMF_RMSE, top20_LIBMF_RMSE, top50_LIBMF_RMSE, top100_LIBMF_RMSE, top200_LIBMF_RMSE)

n <- c(10, 20, 50, 100, 200)

models<-c("IBCF","IBCF","IBCF","IBCF","IBCF",
          "UBCF","UBCF","UBCF","UBCF","UBCF",
          "LIBMF","LIBMF","LIBMF","LIBMF","LIBMF")

df_rmse<- tibble(rmse = c(rmse_ibcf, rmse_ubcf, rmse_libmf),
                 lambda = c(n, n, n),
                 model = models)

glimpse(df_rmse)

## Rows: 15
## Columns: 3
## $ rmse   <dbl> 1.124, 1.045, 0.848, 0.823, 0.815, 1.150, 1.241, 0.975, 0.868,…
## $ lambda <dbl> 10, 20, 50, 100, 200, 10, 20, 50, 100, 200, 10, 20, 50, 100, 2…
## $ model  <chr> "IBCF", "IBCF", "IBCF", "IBCF", "IBCF", "UBCF", "UBCF", "UBCF"…

ggplot(df_rmse, aes(x=lambda,y=rmse,colour=model)) +
  geom_line() +
geom_point() +
  theme_minimal()+
  theme(plot.title = element_text(size=15,face="bold", family= "Montserrat"),
        plot.subtitle = element_text(size=10,face="plain", family= "Montserrat"),
        axis.title.y = element_text(size = 8, angle = 90, family="Montserrat", face = "bold"),
        axis.text.y=element_text(family="Montserrat", size=7),
        axis.title.x = element_text(size = 8, family="Montserrat", face = "bold"),
        axis.text.x=element_text(family="Montserrat", size=7))+
  labs(title = "Recommendation performance works best for LIMBF model",
       subtitle = "RMSEs of particular recommendations system models",
       x = "n & m values",
       y = "RMSE",
       colour = "Model") +
  
  theme(title = element_text(family="Courier", face="bold"))

STEP 6 - CHOOSING THE OPTIMAL MODEL AND EVALUATING ITS RECOMMENDATIONS

From the above analysis, I can deduct that the best performing method and model I should choose for my recommendation system is the Matrix Factorisation Model with n & m= 100.

Such a model is based on the interactions between users and items. It is not only dependent on one of these variables. The big advantage is as well, that such a solution does not rely on the assumption of a large historical dataset. However, in the real world, such a model might not perform well for the rare movies. For these, we should try to use perhaps a different model of different m & n values and combine a mixed model that runs different analyses depending on the popularity of the movies and activeness of the users.

set.seed(1234) # setting seed
#this is our recommended model
recommen_model <- Recommender(getData(top100_split, "train"), 
                              method = "LIBMF", 
                              param=list(normalize = "center", 
                                         method="Cosine"))

## Available parameter (with default values):
## dim   =  10
## costp_l2  =  0.01
## costq_l2  =  0.01
## nthread   =  1
## verbose   =  FALSE

predition_LIBMF <- predict(recommen_model, getData(top100_split, "known"), type="ratings")

## iter      tr_rmse          obj
##    0       1.1885   2.0024e+04
##    1       0.8100   9.8797e+03
##    2       0.7955   9.5802e+03
##    3       0.7718   9.1092e+03
##    4       0.7498   8.6914e+03
##    5       0.7362   8.4326e+03
##    6       0.7270   8.2630e+03
##    7       0.7217   8.1623e+03
##    8       0.7170   8.0813e+03
##    9       0.7129   8.0064e+03
##   10       0.7096   7.9432e+03
##   11       0.7067   7.8942e+03
##   12       0.7036   7.8378e+03
##   13       0.7008   7.7878e+03
##   14       0.6980   7.7401e+03
##   15       0.6954   7.6948e+03
##   16       0.6924   7.6407e+03
##   17       0.6898   7.5953e+03
##   18       0.6869   7.5460e+03
##   19       0.6844   7.5028e+03

#building a matrix
prediction_2 <- predict(recommen_model, top100_rate[1:25, ], type="ratings")

## iter      tr_rmse          obj
##    0       1.1762   1.9580e+04
##    1       0.8039   9.7144e+03
##    2       0.7828   9.2965e+03
##    3       0.7584   8.8238e+03
##    4       0.7426   8.5223e+03
##    5       0.7323   8.3230e+03
##    6       0.7252   8.1958e+03
##    7       0.7206   8.1116e+03
##    8       0.7156   8.0203e+03
##    9       0.7120   7.9544e+03
##   10       0.7079   7.8818e+03
##   11       0.7046   7.8222e+03
##   12       0.7011   7.7617e+03
##   13       0.6974   7.6932e+03
##   14       0.6949   7.6527e+03
##   15       0.6911   7.5839e+03
##   16       0.6880   7.5297e+03
##   17       0.6850   7.4835e+03
##   18       0.6814   7.4205e+03
##   19       0.6785   7.3712e+03

as(prediction_2, "matrix")[,1:10]

##       356  318 296  593 2571 260  480  110  589  527
##  [1,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
##  [2,]  NA   NA  NA   NA   NA  NA   NA   NA   NA 4.01
##  [3,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
##  [4,]  NA 4.22  NA   NA   NA  NA   NA 3.77   NA 4.22
##  [5,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
##  [6,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
##  [7,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
##  [8,]  NA   NA  NA   NA   NA  NA   NA   NA   NA 4.77
##  [9,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [10,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [11,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [12,]  NA   NA  NA   NA   NA  NA   NA   NA   NA 4.22
## [13,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [14,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [15,]  NA 4.22  NA   NA   NA  NA 2.76   NA 3.27   NA
## [16,]  NA   NA  NA 3.48   NA  NA   NA   NA   NA 3.51
## [17,]  NA   NA  NA   NA   NA  NA   NA 4.00 3.73   NA
## [18,]  NA   NA  NA   NA   NA  NA   NA 3.79 3.63   NA
## [19,]  NA   NA  NA 4.35   NA  NA   NA   NA   NA   NA
## [20,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [21,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [22,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [23,]  NA   NA  NA 4.60   NA  NA   NA   NA   NA   NA
## [24,]  NA   NA  NA   NA   NA  NA   NA   NA   NA   NA
## [25,]  NA   NA  NA   NA   NA  NA 3.59   NA   NA   NA

#finding recommended 5 movies for user 9
top100_rate[1,]

## 1 x 134 rating matrix of class 'realRatingMatrix' with 133 ratings.

recommended.items.u9<- predict(recommen_model, top100_rate[9,], n=5)

## iter      tr_rmse          obj
##    0       1.2320   1.7357e+04
##    1       0.8243   8.2441e+03
##    2       0.8109   8.0137e+03
##    3       0.7990   7.8255e+03
##    4       0.7835   7.5721e+03
##    5       0.7702   7.3657e+03
##    6       0.7595   7.1981e+03
##    7       0.7505   7.0574e+03
##    8       0.7448   6.9676e+03
##    9       0.7384   6.8709e+03
##   10       0.7340   6.8041e+03
##   11       0.7292   6.7305e+03
##   12       0.7252   6.6733e+03
##   13       0.7213   6.6139e+03
##   14       0.7176   6.5630e+03
##   15       0.7143   6.5117e+03
##   16       0.7110   6.4643e+03
##   17       0.7076   6.4154e+03
##   18       0.7047   6.3734e+03
##   19       0.7017   6.3311e+03

as(recommended.items.u9, "list")[[1]]

## [1] "4878"  "608"   "1073"  "1213"  "68954"

# [1] "608"  "1213" "4878" "1265" "1197" movieIDs

movies_u9<- c()
movies_u9<- append(movies_u9,movie_data$title[movie_data$movieId %in% "608"])
# "Fargo (1996)"

movies_u9<- append(movies_u9,movie_data$title[movie_data$movieId %in% "1213"])
# "Goodfellas (1990)"

movies_u9<- append(movies_u9,movie_data$title[movie_data$movieId %in% "4878"])
# "Donnie Darko (2001)"

movies_u9<- append(movies_u9,movie_data$title[movie_data$movieId %in% "1265"])
# "Groundhog Day (1993)"

movies_u9<- append(movies_u9,movie_data$title[movie_data$movieId %in% "1197"])
# "Princess Bride, The (1987)"

movies_u9

## [1] "Fargo (1996)"               "Goodfellas (1990)"         
## [3] "Donnie Darko (2001)"        "Groundhog Day (1993)"      
## [5] "Princess Bride, The (1987)"

— END —

Next Project