In [8]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we’ll only need the sqrt function so let’s import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
#Reading the movie catalogue and the user ratings into two pandas dataframes
movies_df, ratings_df = (pd.read_csv(csv_path) for csv_path in ('movies.csv', 'ratings.csv'))

In [12]:
#Preview the first rows of the movie dataframe
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
#Using regular expressions to find a year stored between parentheses
#We specify the parentheses so we don't conflict with movies that have years in their titles
#Raw strings are used so the backslashes reach the regex engine untouched (no invalid-escape warnings)
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract(r'(\d\d\d\d)', expand=False)
#Removing the years from the 'title' column
#regex=True must be explicit: from pandas 2.0 on, str.replace treats the pattern as a literal string by default
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)
#Stripping any leading/trailing whitespace left behind; the .str accessor also tolerates missing titles
movies_df['title'] = movies_df['title'].str.strip()

In [15]:
#Preview the first rows of the movie dataframe again to inspect its current columns
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [16]:
#Dropping the genres column
#The target is named through columns= because passing the axis positionally to drop() was removed in pandas 2.0
movies_df = movies_df.drop(columns='genres')

In [17]:
#Preview the first rows of the ratings dataframe
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [19]:
#Movies rated by the user we want recommendations for, one dict per movie
userInput = [
    {'title': movie_title, 'rating': movie_rating}
    for movie_title, movie_rating in (
        ('Breakfast Club, The', 5),
        ('Toy Story', 3.5),
        ('Jumanji', 2),
        ('Pulp Fiction', 5),
        ('Akira', 4.5),
    )
]
#Turning the list of dicts into a dataframe with 'title' and 'rating' columns
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [20]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging on the columns both frames share.
#Note: a title that matches several catalogue rows yields one output row per matching movieId.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
#(columns= is used because passing the axis positionally to drop() was removed in pandas 2.0)
inputMovies = inputMovies.drop(columns='year')
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original
#dataframe or it might be spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,164600,Akira,4.5
5,1968,"Breakfast Club, The",5.0


In [21]:
#Keeping only the rating rows whose movieId is one of the movies in the input
userSubset = ratings_df.loc[
    ratings_df['movieId'].isin(inputMovies['movieId'].tolist())
]
userSubset.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
70,2,1,3.5,1141415820
141,2,1968,1.0,1141415850
254,3,1,4.0,1439472215
264,3,296,5.0,1439474476


In [22]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
#The key is passed as a plain string, not a one-element list: with a list, pandas 2.0+ yields
#1-tuples such as (43,) as group names when iterating, instead of the scalar userId
userSubsetGroup = userSubset.groupby('userId')

In [28]:
#On a GroupBy object, head() returns the first rows (5 by default) of every group, not of the whole frame
userSubsetGroup.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
70,2,1,3.5,1141415820
141,2,1968,1.0,1141415850
254,3,1,4.0,1439472215
264,3,296,5.0,1439474476
...,...,...,...,...
412634,2845,1,5.0,857822102
412724,2847,1,5.0,1489730761
412725,2847,2,4.5,1489731313
412750,2847,296,5.0,1489730655


In [29]:
#Materialise the (userId, ratings) pairs and order them by number of rated rows, largest group first,
#so users with the most movies in common with the input get priority
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

In [31]:
#Inspect the first three entries of the sorted list
userSubsetGroup[0:3]

[(43,
        userId  movieId  rating   timestamp
  5858      43        1     4.0  1170491388
  5859      43        2     3.5  1170494481
  5884      43      296     5.0  1169595659
  5974      43     1274     4.5  1170848041
  6018      43     1968     4.5  1169599043),
 (171,
         userId  movieId  rating   timestamp
  20794     171        1     4.5  1074594380
  20795     171        2     4.0  1074590433
  20833     171      296     5.0  1074591698
  20973     171     1274     4.0  1074590795
  21047     171     1968     4.0  1074595055),
 (440,
         userId  movieId  rating   timestamp
  54640     440        1     3.5  1230019553
  54641     440        2     2.0  1231475605
  54669     440      296     5.0  1230019011
  54756     440     1274     5.0  1231474568
  54798     440     1968     4.5  1230019885)]

In [32]:
#Keep at most the first 100 entries to bound the work done in the similarity computation
userSubsetGroup = userSubsetGroup[:100]

In [45]:
def _pearson_similarity(xs, ys):
    """Return the Pearson correlation between two aligned, equal-length rating lists.

    xs and ys must hold the ratings of the same movies in the same order.
    Returns 0 when either list has no variance, since the correlation is
    undefined in that case.
    """
    n = float(len(ys))
    sum_x = sum(xs)
    sum_y = sum(ys)
    Sxx = sum(x**2 for x in xs) - sum_x**2/n
    Syy = sum(y**2 for y in ys) - sum_y**2/n
    Sxy = sum(x*y for x, y in zip(xs, ys)) - sum_x*sum_y/n
    #Testing > 0 rather than != 0: floating-point rounding can leave a tiny negative
    #variance, which would make sqrt raise a ValueError
    if Sxx > 0 and Syy > 0:
        return Sxy/sqrt(Sxx*Syy)
    return 0


#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}
#Sorting the input once up front; it does not change between iterations
inputMovies = inputMovies.sort_values(by='movieId')
#For every user group in our subset
for name, group in userSubsetGroup:
    #Sorting the current user group by movieId so its ratings line up with the sorted input
    group = group.sort_values(by='movieId')
    #Get the input's review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #Compare the two rating lists (input user = x, current user = y)
    pearsonCorrelationDict[name] = _pearson_similarity(temp_df['rating'].tolist(), group['rating'].tolist())

In [46]:
#Turn the {userId: coefficient} dictionary into a two-column dataframe with a fresh 0..n-1 index
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
#Move the dictionary keys out of the index into a 'userId' column, keeping the original column order
pearsonDF = pearsonDF.rename_axis('userId').reset_index()[['similarityIndex', 'userId']]
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.946029,43
1,0.328897,171
2,0.961538,440
3,0.468807,597
4,0.877058,695


In [48]:
#Rank users by similarity, highest first, and keep the first 50 rows
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
46,1.0,836
88,1.0,2342
36,1.0,572
27,1.0,321
86,1.0,2294


In [49]:
#Attach every rating made by the selected users (inner join on userId)
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,1.0,836,1,4.0,940636758
1,1.0,836,2,3.0,941909202
2,1.0,836,4,3.0,941907470
3,1.0,836,6,2.0,941997766
4,1.0,836,11,2.0,940631209


In [50]:
#Weight each rating by the similarity of the user who gave it
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp,weightedRating
0,1.0,836,1,4.0,940636758,4.0
1,1.0,836,2,3.0,941909202,3.0
2,1.0,836,4,3.0,941907470,3.0
3,1.0,836,6,2.0,941997766,2.0
4,1.0,836,11,2.0,940631209,2.0


In [51]:
#Sums the similarity index and the weighted rating per movie (the grouping key is movieId)
#The two needed columns are selected before aggregating so the other columns are not summed needlessly
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,41.322084,156.851119
2,39.172949,113.157891
3,12.780225,37.025402
4,1.852803,6.411211
5,8.039279,21.63968


In [53]:
#Weighted average per movie: summed weighted ratings divided by summed similarity
weighted_average = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
#Wrap the result in a dataframe (the movieId index is kept) and expose the id as a regular column too
recommendation_df = weighted_average.to_frame('weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.795818,1
2,2.888674,2
3,2.897085,3
4,3.460277,4
5,2.691744,5


In [54]:
#Order the movies by their recommendation score, best first, and show the ten highest
recommendation_df = recommendation_df.sort_values(
    'weighted average recommendation score',
    ascending=False,
)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
26082,5.0,26082
4427,5.0,4427
7767,5.0,7767
309,5.0,309
4517,5.0,4517
1940,5.0,1940
1939,5.0,1939
1936,5.0,1936
1935,5.0,1935
6650,5.0,6650


In [55]:
#Look up the catalogue rows of the ten best-scored movie ids
#(the boolean filter returns them in movies_df row order, not ranked by score)
movies_df[movies_df['movieId'].isin(recommendation_df['movieId'].head(10).tolist())]

Unnamed: 0,movieId,title,year
305,309,"Red Firecracker, Green Firecracker (Pao Da Shu...",1994
1846,1935,How Green Was My Valley,1941
1847,1936,Mrs. Miniver,1942
1850,1939,"Best Years of Our Lives, The",1946
1851,1940,Gentleman's Agreement,1947
4322,4427,"Lion in Winter, The",1968
4412,4517,Lady in White (a.k.a. The Mystery of the Lady ...,1988
6527,6650,Kind Hearts and Coronets,1949
7447,7767,"Best of Youth, The (La meglio gioventù)",2003
8565,26082,Harakiri (Seppuku),1962


# New Section

# New Section