{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"#Dataframe manipulation library\n",
"import pandas as pd\n",
"#Math functions, we’ll only need the sqrt function so let’s import only that\n",
"from math import sqrt\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
],
"metadata": {
"id": "4qzht__fy-FI"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load the movie catalogue and the user ratings from CSV files in the working\n",
"# directory; each file becomes its own pandas dataframe.\n",
"movies_df, ratings_df = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))"
],
"metadata": {
"id": "2ZXm9nu77E2v"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Preview the first five catalogue rows to confirm the file loaded as expected.\n",
"movies_df.head(5)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "lj34wDdMC4st",
"outputId": "8b479ff3-e57d-46c6-f552-81c0a51124a6"
},
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"3 4 Waiting to Exhale (1995) \n",
"4 5 Father of the Bride Part II (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance \n",
"3 Comedy|Drama|Romance \n",
"4 Comedy "
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" movieId \n",
" title \n",
" genres \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" Toy Story (1995) \n",
" Adventure|Animation|Children|Comedy|Fantasy \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" Jumanji (1995) \n",
" Adventure|Children|Fantasy \n",
" \n",
" \n",
" 2 \n",
" 3 \n",
" Grumpier Old Men (1995) \n",
" Comedy|Romance \n",
" \n",
" \n",
" 3 \n",
" 4 \n",
" Waiting to Exhale (1995) \n",
" Comedy|Drama|Romance \n",
" \n",
" \n",
" 4 \n",
" 5 \n",
" Father of the Bride Part II (1995) \n",
" Comedy \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"# Extract the four-digit release year from the '(YYYY)' marker in the title.\n",
"# Matching the parentheses keeps us from picking up years that are part of the\n",
"# title itself. Raw strings avoid invalid-escape warnings for '\\(' and '\\d'.\n",
"movies_df['year'] = movies_df['title'].str.extract(r'\\((\\d\\d\\d\\d)\\)', expand=False)\n",
"# Remove the year marker from the title, then trim the whitespace it leaves behind.\n",
"# regex=True is required: since pandas 2.0 str.replace treats the pattern as a\n",
"# literal string by default, which would leave the years in the titles.\n",
"# NOTE: re-running this cell on already-cleaned titles resets 'year' to NaN,\n",
"# because the marker it looks for has been removed.\n",
"movies_df['title'] = (\n",
"    movies_df['title']\n",
"    .str.replace(r'\\(\\d\\d\\d\\d\\)', '', regex=True)\n",
"    .str.strip()\n",
")"
],
"metadata": {
"id": "TDnET0zPDEED"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Check the result of the clean-up: titles without the year, plus a 'year' column.\n",
"movies_df.head(5)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "EKM5kASyDLkP",
"outputId": "75ec1a36-fe87-4a7b-f085-ce7c4bc4c692"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story \n",
"1 2 Jumanji \n",
"2 3 Grumpier Old Men \n",
"3 4 Waiting to Exhale \n",
"4 5 Father of the Bride Part II \n",
"\n",
" genres year \n",
"0 Adventure|Animation|Children|Comedy|Fantasy 1995 \n",
"1 Adventure|Children|Fantasy 1995 \n",
"2 Comedy|Romance 1995 \n",
"3 Comedy|Drama|Romance 1995 \n",
"4 Comedy 1995 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" movieId \n",
" title \n",
" genres \n",
" year \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" Toy Story \n",
" Adventure|Animation|Children|Comedy|Fantasy \n",
" 1995 \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" Jumanji \n",
" Adventure|Children|Fantasy \n",
" 1995 \n",
" \n",
" \n",
" 2 \n",
" 3 \n",
" Grumpier Old Men \n",
" Comedy|Romance \n",
" 1995 \n",
" \n",
" \n",
" 3 \n",
" 4 \n",
" Waiting to Exhale \n",
" Comedy|Drama|Romance \n",
" 1995 \n",
" \n",
" \n",
" 4 \n",
" 5 \n",
" Father of the Bride Part II \n",
" Comedy \n",
" 1995 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"# Drop the genres column; none of the later cells in this notebook read it.\n",
"# The column is named via columns= because the positional axis form\n",
"# drop('genres', 1) was deprecated in pandas 1.3 and raises TypeError in 2.0+.\n",
"# errors='ignore' keeps the cell safe to re-run after the column is already gone.\n",
"movies_df = movies_df.drop(columns='genres', errors='ignore')"
],
"metadata": {
"id": "_b_ZgmR5Dc1P"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Preview the first five rating rows (userId, movieId, rating, timestamp).\n",
"ratings_df.head(5)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "BX3fUoOaDjLv",
"outputId": "4d167fc0-11b2-4e69-da8b-3e13e4d27f17"
},
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 5.0 1147880044\n",
"1 1 306 3.5 1147868817\n",
"2 1 307 5.0 1147868828\n",
"3 1 665 5.0 1147878820\n",
"4 1 899 3.5 1147868510"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" userId \n",
" movieId \n",
" rating \n",
" timestamp \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 296 \n",
" 5.0 \n",
" 1147880044 \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" 306 \n",
" 3.5 \n",
" 1147868817 \n",
" \n",
" \n",
" 2 \n",
" 1 \n",
" 307 \n",
" 5.0 \n",
" 1147868828 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" 665 \n",
" 5.0 \n",
" 1147878820 \n",
" \n",
" \n",
" 4 \n",
" 1 \n",
" 899 \n",
" 3.5 \n",
" 1147868510 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"# Ratings supplied by the user we want recommendations for.\n",
"# Titles are matched against the cleaned catalogue titles, so they must be\n",
"# spelled exactly as they appear there (without the year suffix).\n",
"userInput = [\n",
"    {'title': movie_title, 'rating': score}\n",
"    for movie_title, score in (\n",
"        ('Breakfast Club, The', 5),\n",
"        ('Toy Story', 3.5),\n",
"        ('Jumanji', 2),\n",
"        ('Pulp Fiction', 5),\n",
"        ('Akira', 4.5),\n",
"    )\n",
"]\n",
"inputMovies = pd.DataFrame(userInput)\n",
"inputMovies"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "_PwiTjRnDs1U",
"outputId": "f232ea92-b38c-457e-d334-5fd29128f2e2"
},
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title rating\n",
"0 Breakfast Club, The 5.0\n",
"1 Toy Story 3.5\n",
"2 Jumanji 2.0\n",
"3 Pulp Fiction 5.0\n",
"4 Akira 4.5"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" title \n",
" rating \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" Breakfast Club, The \n",
" 5.0 \n",
" \n",
" \n",
" 1 \n",
" Toy Story \n",
" 3.5 \n",
" \n",
" \n",
" 2 \n",
" Jumanji \n",
" 2.0 \n",
" \n",
" \n",
" 3 \n",
" Pulp Fiction \n",
" 5.0 \n",
" \n",
" \n",
" 4 \n",
" Akira \n",
" 4.5 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"source": [
"# Keep only the catalogue rows whose title appears in the user's input.\n",
"inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]\n",
"# Attach the movieId to each input rating. The join key is given explicitly and\n",
"# only title/rating are taken from the input, so the merge behaves the same even\n",
"# when this cell is re-run after inputMovies has already gained a movieId column.\n",
"inputMovies = pd.merge(inputId, inputMovies[['title', 'rating']], on='title')\n",
"# Drop the column we will not use. The column is named via columns= because the\n",
"# positional axis form drop('year', 1) raises TypeError on pandas 2.0+.\n",
"inputMovies = inputMovies.drop(columns='year')\n",
"# Final input dataframe.\n",
"# If a movie you added above is missing here, it is either not in the catalogue or\n",
"# spelled differently (check capitalisation). A title shared by several catalogue\n",
"# entries produces one row per matching movieId, each with the same rating.\n",
"inputMovies"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"id": "T3n9mvlKD_OM",
"outputId": "9c6bc35f-9c03-4988-a6a7-d0025d94e8a2"
},
"execution_count": 20,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" movieId title rating\n",
"0 1 Toy Story 3.5\n",
"1 2 Jumanji 2.0\n",
"2 296 Pulp Fiction 5.0\n",
"3 1274 Akira 4.5\n",
"4 164600 Akira 4.5\n",
"5 1968 Breakfast Club, The 5.0"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" movieId \n",
" title \n",
" rating \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" Toy Story \n",
" 3.5 \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" Jumanji \n",
" 2.0 \n",
" \n",
" \n",
" 2 \n",
" 296 \n",
" Pulp Fiction \n",
" 5.0 \n",
" \n",
" \n",
" 3 \n",
" 1274 \n",
" Akira \n",
" 4.5 \n",
" \n",
" \n",
" 4 \n",
" 164600 \n",
" Akira \n",
" 4.5 \n",
" \n",
" \n",
" 5 \n",
" 1968 \n",
" Breakfast Club, The \n",
" 5.0 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 20
}
]
},
{
"cell_type": "code",
"source": [
"# Restrict the ratings to the movies the input user has rated; every remaining row\n",
"# belongs to a user who shares at least one movie with the input.\n",
"userSubset = ratings_df.loc[ratings_df['movieId'].isin(inputMovies['movieId'])]\n",
"userSubset.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "aGx-3x09Eozj",
"outputId": "87cc81a7-6cfe-4f63-f05c-71122836564f"
},
"execution_count": 21,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 5.0 1147880044\n",
"70 2 1 3.5 1141415820\n",
"141 2 1968 1.0 1141415850\n",
"254 3 1 4.0 1439472215\n",
"264 3 296 5.0 1439474476"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" userId \n",
" movieId \n",
" rating \n",
" timestamp \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 296 \n",
" 5.0 \n",
" 1147880044 \n",
" \n",
" \n",
" 70 \n",
" 2 \n",
" 1 \n",
" 3.5 \n",
" 1141415820 \n",
" \n",
" \n",
" 141 \n",
" 2 \n",
" 1968 \n",
" 1.0 \n",
" 1141415850 \n",
" \n",
" \n",
" 254 \n",
" 3 \n",
" 1 \n",
" 4.0 \n",
" 1439472215 \n",
" \n",
" \n",
" 264 \n",
" 3 \n",
" 296 \n",
" 5.0 \n",
" 1439474476 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"source": [
"# Split the shared ratings into one sub-dataframe per user.\n",
"# The key is passed as a plain string rather than ['userId']: with a one-element\n",
"# list, pandas 2.x yields 1-tuples such as (43,) as the group names when the\n",
"# groups are iterated, and those names are later used as the userId values.\n",
"userSubsetGroup = userSubset.groupby('userId')"
],
"metadata": {
"id": "WS_Jx47DEx01"
},
"execution_count": 22,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Show up to five rows from each user's group.\n",
"# NOTE: this only works while userSubsetGroup is still the GroupBy object, i.e.\n",
"# before the sorting cell below rebinds the name to a plain list.\n",
"userSubsetGroup.head(5)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "jJWPtlcaE7ml",
"outputId": "27346d76-e73f-4c15-9b28-5eda8d4d6e26"
},
"execution_count": 28,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 296 5.0 1147880044\n",
"70 2 1 3.5 1141415820\n",
"141 2 1968 1.0 1141415850\n",
"254 3 1 4.0 1439472215\n",
"264 3 296 5.0 1439474476\n",
"... ... ... ... ...\n",
"412634 2845 1 5.0 857822102\n",
"412724 2847 1 5.0 1489730761\n",
"412725 2847 2 4.5 1489731313\n",
"412750 2847 296 5.0 1489730655\n",
"412840 2847 1968 4.5 1489730899\n",
"\n",
"[3285 rows x 4 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" userId \n",
" movieId \n",
" rating \n",
" timestamp \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 296 \n",
" 5.0 \n",
" 1147880044 \n",
" \n",
" \n",
" 70 \n",
" 2 \n",
" 1 \n",
" 3.5 \n",
" 1141415820 \n",
" \n",
" \n",
" 141 \n",
" 2 \n",
" 1968 \n",
" 1.0 \n",
" 1141415850 \n",
" \n",
" \n",
" 254 \n",
" 3 \n",
" 1 \n",
" 4.0 \n",
" 1439472215 \n",
" \n",
" \n",
" 264 \n",
" 3 \n",
" 296 \n",
" 5.0 \n",
" 1439474476 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 412634 \n",
" 2845 \n",
" 1 \n",
" 5.0 \n",
" 857822102 \n",
" \n",
" \n",
" 412724 \n",
" 2847 \n",
" 1 \n",
" 5.0 \n",
" 1489730761 \n",
" \n",
" \n",
" 412725 \n",
" 2847 \n",
" 2 \n",
" 4.5 \n",
" 1489731313 \n",
" \n",
" \n",
" 412750 \n",
" 2847 \n",
" 296 \n",
" 5.0 \n",
" 1489730655 \n",
" \n",
" \n",
" 412840 \n",
" 2847 \n",
" 1968 \n",
" 4.5 \n",
" 1489730899 \n",
" \n",
" \n",
"
\n",
"
3285 rows × 4 columns
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 28
}
]
},
{
"cell_type": "code",
"source": [
"# Materialise the (userId, ratings) pairs as a list and order them so that users\n",
"# sharing the most movies with the input come first.\n",
"userSubsetGroup = list(userSubsetGroup)\n",
"userSubsetGroup.sort(key=lambda userAndRatings: len(userAndRatings[1]), reverse=True)"
],
"metadata": {
"id": "th7AaRzzFB7K"
},
"execution_count": 29,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Inspect the three users with the largest overlap with the input.\n",
"userSubsetGroup[:3]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p8zpIt4uFgSR",
"outputId": "41c4ffb3-7d3d-48e0-d6e7-c6fa97a62c07"
},
"execution_count": 31,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(43,\n",
" userId movieId rating timestamp\n",
" 5858 43 1 4.0 1170491388\n",
" 5859 43 2 3.5 1170494481\n",
" 5884 43 296 5.0 1169595659\n",
" 5974 43 1274 4.5 1170848041\n",
" 6018 43 1968 4.5 1169599043),\n",
" (171,\n",
" userId movieId rating timestamp\n",
" 20794 171 1 4.5 1074594380\n",
" 20795 171 2 4.0 1074590433\n",
" 20833 171 296 5.0 1074591698\n",
" 20973 171 1274 4.0 1074590795\n",
" 21047 171 1968 4.0 1074595055),\n",
" (440,\n",
" userId movieId rating timestamp\n",
" 54640 440 1 3.5 1230019553\n",
" 54641 440 2 2.0 1231475605\n",
" 54669 440 296 5.0 1230019011\n",
" 54756 440 1274 5.0 1231474568\n",
" 54798 440 1968 4.5 1230019885)]"
]
},
"metadata": {},
"execution_count": 31
}
]
},
{
"cell_type": "code",
"source": [
"# Only the 100 users with the largest overlap are compared with the input user.\n",
"userSubsetGroup = userSubsetGroup[:100]"
],
"metadata": {
"id": "u52LTKGAFiLa"
},
"execution_count": 32,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Pearson correlation between the input user and every candidate user,\n",
"# stored as {userId: coefficient}.\n",
"pearsonCorrelationDict = {}\n",
"# Sorting the input is loop-invariant, so it is done once instead of per user.\n",
"inputMovies = inputMovies.sort_values(by='movieId')\n",
"for name, group in userSubsetGroup:\n",
"    # Group names are 1-tuples when the groupby key was given as a one-element list\n",
"    # on pandas 2.x; unwrap them so the dictionary keys are plain user ids.\n",
"    userId = name[0] if isinstance(name, tuple) else name\n",
"    # Pair the two users' ratings by movieId rather than relying on both rating\n",
"    # lists having the same length and order. The inner merge keeps the input's\n",
"    # movieId order.\n",
"    paired = inputMovies[['movieId', 'rating']].merge(\n",
"        group[['movieId', 'rating']], on='movieId', suffixes=('_input', '_user')\n",
"    )\n",
"    #Get the N for the formula\n",
"    nRatings = len(paired)\n",
"    if nRatings == 0:\n",
"        # No movie in common: nothing to correlate.\n",
"        pearsonCorrelationDict[userId] = 0\n",
"        continue\n",
"    tempRatingList = paired['rating_input'].tolist()\n",
"    tempGroupList = paired['rating_user'].tolist()\n",
"    sumInput = sum(tempRatingList)\n",
"    sumUser = sum(tempGroupList)\n",
"    # Sums of squares / cross products about the mean, for x (input) and y (user).\n",
"    Sxx = sum(i**2 for i in tempRatingList) - sumInput**2 / float(nRatings)\n",
"    Syy = sum(i**2 for i in tempGroupList) - sumUser**2 / float(nRatings)\n",
"    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumInput*sumUser / float(nRatings)\n",
"    # A constant rating vector has zero variance, so the coefficient is undefined;\n",
"    # it is recorded as 0. Testing the product with > 0 also keeps sqrt() from\n",
"    # receiving a negative value caused by floating-point rounding.\n",
"    # NOTE: with only two shared movies the coefficient can only be +1, -1 or 0.\n",
"    denominator = Sxx * Syy\n",
"    if denominator > 0:\n",
"        pearsonCorrelationDict[userId] = Sxy / sqrt(denominator)\n",
"    else:\n",
"        pearsonCorrelationDict[userId] = 0"
],
"metadata": {
"id": "sWe-SYu2Fxv8"
},
"execution_count": 45,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Turn {userId: coefficient} into a frame with columns similarityIndex and userId\n",
"# and a fresh 0..n-1 row index.\n",
"pearsonDF = (\n",
"    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')\n",
"    .set_axis(['similarityIndex'], axis=1)\n",
"    .assign(userId=lambda frame: frame.index)\n",
"    .reset_index(drop=True)\n",
")\n",
"pearsonDF.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "2XkBpiJpF6RT",
"outputId": "8169e1b1-a7d4-4968-82de-24d4f482bd00"
},
"execution_count": 46,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" similarityIndex userId\n",
"0 0.946029 43\n",
"1 0.328897 171\n",
"2 0.961538 440\n",
"3 0.468807 597\n",
"4 0.877058 695"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" similarityIndex \n",
" userId \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.946029 \n",
" 43 \n",
" \n",
" \n",
" 1 \n",
" 0.328897 \n",
" 171 \n",
" \n",
" \n",
" 2 \n",
" 0.961538 \n",
" 440 \n",
" \n",
" \n",
" 3 \n",
" 0.468807 \n",
" 597 \n",
" \n",
" \n",
" 4 \n",
" 0.877058 \n",
" 695 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 46
}
]
},
{
"cell_type": "code",
"source": [
"# Keep the 50 candidates most similar to the input user, highest coefficient first.\n",
"topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]\n",
"topUsers.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "bhTwW7A6GkS-",
"outputId": "51705e41-0b3f-45ef-e66e-2063e46051c6"
},
"execution_count": 48,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" similarityIndex userId\n",
"46 1.0 836\n",
"88 1.0 2342\n",
"36 1.0 572\n",
"27 1.0 321\n",
"86 1.0 2294"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" similarityIndex \n",
" userId \n",
" \n",
" \n",
" \n",
" \n",
" 46 \n",
" 1.0 \n",
" 836 \n",
" \n",
" \n",
" 88 \n",
" 1.0 \n",
" 2342 \n",
" \n",
" \n",
" 36 \n",
" 1.0 \n",
" 572 \n",
" \n",
" \n",
" 27 \n",
" 1.0 \n",
" 321 \n",
" \n",
" \n",
" 86 \n",
" 1.0 \n",
" 2294 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 48
}
]
},
{
"cell_type": "code",
"source": [
"# Bring in every rating made by the top users (one row per user/movie pair).\n",
"topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')\n",
"topUsersRating.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "oT6ri2K2HRVI",
"outputId": "4e1dff86-3e81-463a-e12c-0a97de839f11"
},
"execution_count": 49,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" similarityIndex userId movieId rating timestamp\n",
"0 1.0 836 1 4.0 940636758\n",
"1 1.0 836 2 3.0 941909202\n",
"2 1.0 836 4 3.0 941907470\n",
"3 1.0 836 6 2.0 941997766\n",
"4 1.0 836 11 2.0 940631209"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" similarityIndex \n",
" userId \n",
" movieId \n",
" rating \n",
" timestamp \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1.0 \n",
" 836 \n",
" 1 \n",
" 4.0 \n",
" 940636758 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 836 \n",
" 2 \n",
" 3.0 \n",
" 941909202 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" 836 \n",
" 4 \n",
" 3.0 \n",
" 941907470 \n",
" \n",
" \n",
" 3 \n",
" 1.0 \n",
" 836 \n",
" 6 \n",
" 2.0 \n",
" 941997766 \n",
" \n",
" \n",
" 4 \n",
" 1.0 \n",
" 836 \n",
" 11 \n",
" 2.0 \n",
" 940631209 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 49
}
]
},
{
"cell_type": "code",
"source": [
"# Weight each rating by how similar its author is to the input user.\n",
"topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])\n",
"topUsersRating.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "01iXMJSSHgbt",
"outputId": "6ab6c3d0-4010-4718-d0ca-2a6e3eaac0fd"
},
"execution_count": 50,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" similarityIndex userId movieId rating timestamp weightedRating\n",
"0 1.0 836 1 4.0 940636758 4.0\n",
"1 1.0 836 2 3.0 941909202 3.0\n",
"2 1.0 836 4 3.0 941907470 3.0\n",
"3 1.0 836 6 2.0 941997766 2.0\n",
"4 1.0 836 11 2.0 940631209 2.0"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" similarityIndex \n",
" userId \n",
" movieId \n",
" rating \n",
" timestamp \n",
" weightedRating \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1.0 \n",
" 836 \n",
" 1 \n",
" 4.0 \n",
" 940636758 \n",
" 4.0 \n",
" \n",
" \n",
" 1 \n",
" 1.0 \n",
" 836 \n",
" 2 \n",
" 3.0 \n",
" 941909202 \n",
" 3.0 \n",
" \n",
" \n",
" 2 \n",
" 1.0 \n",
" 836 \n",
" 4 \n",
" 3.0 \n",
" 941907470 \n",
" 3.0 \n",
" \n",
" \n",
" 3 \n",
" 1.0 \n",
" 836 \n",
" 6 \n",
" 2.0 \n",
" 941997766 \n",
" 2.0 \n",
" \n",
" \n",
" 4 \n",
" 1.0 \n",
" 836 \n",
" 11 \n",
" 2.0 \n",
" 940631209 \n",
" 2.0 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 50
}
]
},
{
"cell_type": "code",
"source": [
"# Per movie, total the similarity weights and the similarity-weighted ratings of\n",
"# the top users who rated it (the grouping key is movieId, not userId).\n",
"# The two columns are selected before sum() so that userId, rating and timestamp\n",
"# are not summed needlessly.\n",
"tempTopUsersRating = (\n",
"    topUsersRating.groupby('movieId')[['similarityIndex', 'weightedRating']]\n",
"    .sum()\n",
"    .rename(columns={\n",
"        'similarityIndex': 'sum_similarityIndex',\n",
"        'weightedRating': 'sum_weightedRating',\n",
"    })\n",
")\n",
"tempTopUsersRating.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"id": "wuHtE276H79k",
"outputId": "2d9eaa1e-e4c3-4833-b662-7c5f06b837ab"
},
"execution_count": 51,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" sum_similarityIndex sum_weightedRating\n",
"movieId \n",
"1 41.322084 156.851119\n",
"2 39.172949 113.157891\n",
"3 12.780225 37.025402\n",
"4 1.852803 6.411211\n",
"5 8.039279 21.639680"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" sum_similarityIndex \n",
" sum_weightedRating \n",
" \n",
" \n",
" movieId \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" 1 \n",
" 41.322084 \n",
" 156.851119 \n",
" \n",
" \n",
" 2 \n",
" 39.172949 \n",
" 113.157891 \n",
" \n",
" \n",
" 3 \n",
" 12.780225 \n",
" 37.025402 \n",
" \n",
" \n",
" 4 \n",
" 1.852803 \n",
" 6.411211 \n",
" \n",
" \n",
" 5 \n",
" 8.039279 \n",
" 21.639680 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 51
}
]
},
{
"cell_type": "code",
"source": [
"# Weighted-average score per movie: the similarity-weighted rating total divided\n",
"# by the similarity total. movieId is kept both as the index and as a column.\n",
"recommendation_df = (\n",
"    tempTopUsersRating['sum_weightedRating']\n",
"    .div(tempTopUsersRating['sum_similarityIndex'])\n",
"    .to_frame('weighted average recommendation score')\n",
")\n",
"recommendation_df['movieId'] = recommendation_df.index\n",
"recommendation_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"id": "xd7tozbdILjU",
"outputId": "b7d7828e-d142-423f-d176-90ffdf8b1ee2"
},
"execution_count": 53,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" weighted average recommendation score movieId\n",
"movieId \n",
"1 3.795818 1\n",
"2 2.888674 2\n",
"3 2.897085 3\n",
"4 3.460277 4\n",
"5 2.691744 5"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" weighted average recommendation score \n",
" movieId \n",
" \n",
" \n",
" movieId \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" 1 \n",
" 3.795818 \n",
" 1 \n",
" \n",
" \n",
" 2 \n",
" 2.888674 \n",
" 2 \n",
" \n",
" \n",
" 3 \n",
" 2.897085 \n",
" 3 \n",
" \n",
" \n",
" 4 \n",
" 3.460277 \n",
" 4 \n",
" \n",
" \n",
" 5 \n",
" 2.691744 \n",
" 5 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 53
}
]
},
{
"cell_type": "code",
"source": [
"# Rank the movies from highest to lowest predicted score and show the top ten.\n",
"recommendation_df = recommendation_df.sort_values(\n",
"    'weighted average recommendation score', ascending=False\n",
")\n",
"recommendation_df.head(10)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 394
},
"id": "s3NfvM__IXP5",
"outputId": "02dd0fe0-247e-48bf-a5b4-1e6b994a6340"
},
"execution_count": 54,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" weighted average recommendation score movieId\n",
"movieId \n",
"26082 5.0 26082\n",
"4427 5.0 4427\n",
"7767 5.0 7767\n",
"309 5.0 309\n",
"4517 5.0 4517\n",
"1940 5.0 1940\n",
"1939 5.0 1939\n",
"1936 5.0 1936\n",
"1935 5.0 1935\n",
"6650 5.0 6650"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" weighted average recommendation score \n",
" movieId \n",
" \n",
" \n",
" movieId \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" 26082 \n",
" 5.0 \n",
" 26082 \n",
" \n",
" \n",
" 4427 \n",
" 5.0 \n",
" 4427 \n",
" \n",
" \n",
" 7767 \n",
" 5.0 \n",
" 7767 \n",
" \n",
" \n",
" 309 \n",
" 5.0 \n",
" 309 \n",
" \n",
" \n",
" 4517 \n",
" 5.0 \n",
" 4517 \n",
" \n",
" \n",
" 1940 \n",
" 5.0 \n",
" 1940 \n",
" \n",
" \n",
" 1939 \n",
" 5.0 \n",
" 1939 \n",
" \n",
" \n",
" 1936 \n",
" 5.0 \n",
" 1936 \n",
" \n",
" \n",
" 1935 \n",
" 5.0 \n",
" 1935 \n",
" \n",
" \n",
" 6650 \n",
" 5.0 \n",
" 6650 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 54
}
]
},
{
"cell_type": "code",
"source": [
"# Show the ten best-scoring movies together with their scores, in ranked order.\n",
"# Filtering movies_df with isin() would list them in catalogue order and without\n",
"# the score. reset_index(drop=True) is needed because recommendation_df carries\n",
"# movieId both as its index name and as a column, which makes merging on\n",
"# 'movieId' ambiguous. The left join keeps the ranking order of the left frame.\n",
"(\n",
"    recommendation_df.head(10)\n",
"    .reset_index(drop=True)\n",
"    .merge(movies_df, on='movieId', how='left')\n",
")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"id": "R1FfYuO1Ihb1",
"outputId": "1d2ab447-5ddc-4543-fb19-47149b118a9d"
},
"execution_count": 55,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" movieId title year\n",
"305 309 Red Firecracker, Green Firecracker (Pao Da Shu... 1994\n",
"1846 1935 How Green Was My Valley 1941\n",
"1847 1936 Mrs. Miniver 1942\n",
"1850 1939 Best Years of Our Lives, The 1946\n",
"1851 1940 Gentleman's Agreement 1947\n",
"4322 4427 Lion in Winter, The 1968\n",
"4412 4517 Lady in White (a.k.a. The Mystery of the Lady ... 1988\n",
"6527 6650 Kind Hearts and Coronets 1949\n",
"7447 7767 Best of Youth, The (La meglio gioventù) 2003\n",
"8565 26082 Harakiri (Seppuku) 1962"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" movieId \n",
" title \n",
" year \n",
" \n",
" \n",
" \n",
" \n",
" 305 \n",
" 309 \n",
" Red Firecracker, Green Firecracker (Pao Da Shu... \n",
" 1994 \n",
" \n",
" \n",
" 1846 \n",
" 1935 \n",
" How Green Was My Valley \n",
" 1941 \n",
" \n",
" \n",
" 1847 \n",
" 1936 \n",
" Mrs. Miniver \n",
" 1942 \n",
" \n",
" \n",
" 1850 \n",
" 1939 \n",
" Best Years of Our Lives, The \n",
" 1946 \n",
" \n",
" \n",
" 1851 \n",
" 1940 \n",
" Gentleman's Agreement \n",
" 1947 \n",
" \n",
" \n",
" 4322 \n",
" 4427 \n",
" Lion in Winter, The \n",
" 1968 \n",
" \n",
" \n",
" 4412 \n",
" 4517 \n",
" Lady in White (a.k.a. The Mystery of the Lady ... \n",
" 1988 \n",
" \n",
" \n",
" 6527 \n",
" 6650 \n",
" Kind Hearts and Coronets \n",
" 1949 \n",
" \n",
" \n",
" 7447 \n",
" 7767 \n",
" Best of Youth, The (La meglio gioventù) \n",
" 2003 \n",
" \n",
" \n",
" 8565 \n",
" 26082 \n",
" Harakiri (Seppuku) \n",
" 1962 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 55
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "UiyVD0GtIqkV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# New Section"
],
"metadata": {
"id": "J1IjGp9jkOAq"
}
},
{
"cell_type": "markdown",
"source": [
"# New Section"
],
"metadata": {
"id": "marGl7D6kOh9"
}
}
]
}