{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "#Dataframe manipulation library\n", "import pandas as pd\n", "#Math functions, we’ll only need the sqrt function so let’s import only that\n", "from math import sqrt\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ], "metadata": { "id": "4qzht__fy-FI" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "#Load the movie catalogue (movieId, title, genres) into a pandas dataframe\n", "movies_df = pd.read_csv('movies.csv')\n", "#Load the ratings table (userId, movieId, rating, timestamp) into a pandas dataframe\n", "ratings_df = pd.read_csv('ratings.csv')" ], "metadata": { "id": "2ZXm9nu77E2v" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "movies_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "lj34wDdMC4st", "outputId": "8b479ff3-e57d-46c6-f552-81c0a51124a6" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres \n", "0 Adventure|Animation|Children|Comedy|Fantasy \n", "1 Adventure|Children|Fantasy \n", "2 Comedy|Romance \n", "3 Comedy|Drama|Romance \n", "4 Comedy " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "#Titles look like 'Toy Story (1995)': capture the four digits of a year written between parentheses.\n", "#Requiring the parentheses avoids matching years that are part of the title itself.\n", "#Raw strings keep the regex backslashes from being read as (invalid) Python string escapes.\n", "movies_df['year'] = movies_df['title'].str.extract(r'\\((\\d{4})\\)', expand=False)\n", "#Removing the parenthesised year from the 'title' column.\n", "#regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal string by default.\n", "movies_df['title'] = movies_df['title'].str.replace(r'\\(\\d{4}\\)', '', regex=True)\n", "#Stripping any leading/trailing whitespace left behind by the removal\n", "movies_df['title'] = movies_df['title'].str.strip()" ], "metadata": { "id": "TDnET0zPDEED" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "movies_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "EKM5kASyDLkP", "outputId": "75ec1a36-fe87-4a7b-f085-ce7c4bc4c692" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title \\\n", "0 1 Toy Story \n", "1 2 Jumanji \n", "2 3 Grumpier Old Men \n", "3 4 Waiting to Exhale \n", "4 5 Father of the Bride Part II \n", "\n", " genres year \n", "0 Adventure|Animation|Children|Comedy|Fantasy 1995 \n", "1 Adventure|Children|Fantasy 1995 \n", "2 Comedy|Romance 1995 \n", "3 Comedy|Drama|Romance 1995 \n", "4 Comedy 1995 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenresyear
01Toy StoryAdventure|Animation|Children|Comedy|Fantasy1995
12JumanjiAdventure|Children|Fantasy1995
23Grumpier Old MenComedy|Romance1995
34Waiting to ExhaleComedy|Drama|Romance1995
45Father of the Bride Part IIComedy1995
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "#Dropping the genres column, which is not used in the rest of this notebook.\n", "#The axis is named via columns= because the positional form drop('genres', 1) was removed in pandas 2.0;\n", "#errors='ignore' keeps the cell re-runnable once the column is already gone.\n", "movies_df = movies_df.drop(columns='genres', errors='ignore')" ], "metadata": { "id": "_b_ZgmR5Dc1P" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "ratings_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "BX3fUoOaDjLv", "outputId": "4d167fc0-11b2-4e69-da8b-3e13e4d27f17" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating timestamp\n", "0 1 296 5.0 1147880044\n", "1 1 306 3.5 1147868817\n", "2 1 307 5.0 1147868828\n", "3 1 665 5.0 1147878820\n", "4 1 899 3.5 1147868510" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
012965.01147880044
113063.51147868817
213075.01147868828
316655.01147878820
418993.51147868510
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "#Movies rated by the user we want recommendations for (title without the year, and the rating given)\n", "userInput = [\n", "    {'title': 'Breakfast Club, The', 'rating': 5},\n", "    {'title': 'Toy Story', 'rating': 3.5},\n", "    {'title': 'Jumanji', 'rating': 2},\n", "    {'title': 'Pulp Fiction', 'rating': 5},\n", "    {'title': 'Akira', 'rating': 4.5},\n", "]\n", "inputMovies = pd.DataFrame(userInput)\n", "inputMovies" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "_PwiTjRnDs1U", "outputId": "f232ea92-b38c-457e-d334-5fd29128f2e2" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title rating\n", "0 Breakfast Club, The 5.0\n", "1 Toy Story 3.5\n", "2 Jumanji 2.0\n", "3 Pulp Fiction 5.0\n", "4 Akira 4.5" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlerating
0Breakfast Club, The5.0
1Toy Story3.5
2Jumanji2.0
3Pulp Fiction5.0
4Akira4.5
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "#Keeping only the catalogue entries whose title appears in the input\n", "inputId = movies_df[movies_df['title'].isin(inputMovies['title'])]\n", "#Merging on the title (named explicitly) to attach the movieId to every input rating.\n", "#Only title/rating are taken from the input and duplicates are dropped, so re-running this cell gives the same result.\n", "#Note that a title shared by several catalogue entries (e.g. 'Akira') matches all of them.\n", "inputMovies = pd.merge(inputId, inputMovies[['title', 'rating']].drop_duplicates(), on='title')\n", "#Dropping information we will not use from the input dataframe.\n", "#columns= replaces the positional axis argument of drop('year', 1), which was removed in pandas 2.0.\n", "inputMovies = inputMovies.drop(columns='year')\n", "#Final input dataframe\n", "#If a movie you added above is not here, it might not be in the original\n", "#dataframe or it might be spelled differently; please check capitalisation.\n", "inputMovies" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "T3n9mvlKD_OM", "outputId": "9c6bc35f-9c03-4988-a6a7-d0025d94e8a2" }, "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title rating\n", "0 1 Toy Story 3.5\n", "1 2 Jumanji 2.0\n", "2 296 Pulp Fiction 5.0\n", "3 1274 Akira 4.5\n", "4 164600 Akira 4.5\n", "5 1968 Breakfast Club, The 5.0" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlerating
01Toy Story3.5
12Jumanji2.0
2296Pulp Fiction5.0
31274Akira4.5
4164600Akira4.5
51968Breakfast Club, The5.0
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "#Keep only the ratings given to movies that also appear in the input\n", "ratedInputMovie = ratings_df['movieId'].isin(inputMovies['movieId'])\n", "userSubset = ratings_df[ratedInputMovie]\n", "userSubset.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "aGx-3x09Eozj", "outputId": "87cc81a7-6cfe-4f63-f05c-71122836564f" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating timestamp\n", "0 1 296 5.0 1147880044\n", "70 2 1 3.5 1141415820\n", "141 2 1968 1.0 1141415850\n", "254 3 1 4.0 1439472215\n", "264 3 296 5.0 1439474476" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
012965.01147880044
70213.51141415820
141219681.01141415850
254314.01439472215
26432965.01439474476
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "#Groupby creates one sub dataframe per userId, each holding the rows of userSubset for that user.\n", "#The key is passed as a scalar rather than a one-element list: since pandas 2.0, iterating over\n", "#groupby(['userId']) yields 1-tuples such as (43,) as group names, which would later end up as userId values.\n", "userSubsetGroup = userSubset.groupby('userId')" ], "metadata": { "id": "WS_Jx47DEx01" }, "execution_count": 22, "outputs": [] }, { "cell_type": "code", "source": [ "userSubsetGroup.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "jJWPtlcaE7ml", "outputId": "27346d76-e73f-4c15-9b28-5eda8d4d6e26" }, "execution_count": 28, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating timestamp\n", "0 1 296 5.0 1147880044\n", "70 2 1 3.5 1141415820\n", "141 2 1968 1.0 1141415850\n", "254 3 1 4.0 1439472215\n", "264 3 296 5.0 1439474476\n", "... ... ... ... ...\n", "412634 2845 1 5.0 857822102\n", "412724 2847 1 5.0 1489730761\n", "412725 2847 2 4.5 1489731313\n", "412750 2847 296 5.0 1489730655\n", "412840 2847 1968 4.5 1489730899\n", "\n", "[3285 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
012965.01147880044
70213.51141415820
141219681.01141415850
254314.01439472215
26432965.01439474476
...............
412634284515.0857822102
412724284715.01489730761
412725284724.51489731313
41275028472965.01489730655
412840284719684.51489730899
\n", "

3285 rows × 4 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [ "#Sorting it so users with movie most in common with the input will have priority\n", "userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)" ], "metadata": { "id": "th7AaRzzFB7K" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "userSubsetGroup[0:3]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p8zpIt4uFgSR", "outputId": "41c4ffb3-7d3d-48e0-d6e7-c6fa97a62c07" }, "execution_count": 31, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[(43,\n", " userId movieId rating timestamp\n", " 5858 43 1 4.0 1170491388\n", " 5859 43 2 3.5 1170494481\n", " 5884 43 296 5.0 1169595659\n", " 5974 43 1274 4.5 1170848041\n", " 6018 43 1968 4.5 1169599043),\n", " (171,\n", " userId movieId rating timestamp\n", " 20794 171 1 4.5 1074594380\n", " 20795 171 2 4.0 1074590433\n", " 20833 171 296 5.0 1074591698\n", " 20973 171 1274 4.0 1074590795\n", " 21047 171 1968 4.0 1074595055),\n", " (440,\n", " userId movieId rating timestamp\n", " 54640 440 1 3.5 1230019553\n", " 54641 440 2 2.0 1231475605\n", " 54669 440 296 5.0 1230019011\n", " 54756 440 1274 5.0 1231474568\n", " 54798 440 1968 4.5 1230019885)]" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "code", "source": [ "userSubsetGroup = userSubsetGroup[0:100]" ], "metadata": { "id": "u52LTKGAFiLa" }, "execution_count": 32, "outputs": [] }, { "cell_type": "code", "source": [ "#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient\n", "pearsonCorrelationDict = {}\n", "#The input ratings are the same for every user, so they are selected once, outside the loop\n", "inputRatings = inputMovies[['movieId', 'rating']]\n", "#For every user group in our subset\n", "for name, group in userSubsetGroup:\n", "    #Group names are 1-tuples such as (43,) when the groupby key was given as a list (pandas >= 2.0);\n", "    #unwrap them so the dictionary is keyed by the plain user Id\n", "    if isinstance(name, tuple):\n", "        name = name[0]\n", "    #Pair each rating of the current user with the input rating of the same movie.\n", "    #Merging on movieId keeps both lists aligned and of equal length without relying on sort order.\n", "    paired = group.merge(inputRatings, on='movieId', suffixes=('_user', '_input'))\n", "    #Get the N for the formula\n", "    nRatings = len(paired)\n", "    if nRatings == 0:\n", "        #No movie in common: nothing to correlate (also avoids a division by zero below)\n", "        pearsonCorrelationDict[name] = 0\n", "        continue\n", "    #Review scores of the input for the movies that they both have in common\n", "    tempRatingList = paired['rating_input'].tolist()\n", "    #Review scores of the current user for the same movies\n", "    tempGroupList = paired['rating_user'].tolist()\n", "    #Now let's calculate the pearson correlation between two users, so called, x and y\n", "    Sxx = sum(i**2 for i in tempRatingList) - sum(tempRatingList)**2/float(nRatings)\n", "    Syy = sum(i**2 for i in tempGroupList) - sum(tempGroupList)**2/float(nRatings)\n", "    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)\n", "    #Divide only when both variances are strictly positive; otherwise (constant ratings, or a\n", "    #slightly negative value caused by floating point rounding) the correlation is set to 0.\n", "    if Sxx > 0 and Syy > 0:\n", "        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)\n", "    else:\n", "        pearsonCorrelationDict[name] = 0" ], "metadata": { "id": "sWe-SYu2Fxv8" }, "execution_count": 45, "outputs": [] }, { "cell_type": "code", "source": [ "pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')\n", "pearsonDF.columns = ['similarityIndex']\n", "pearsonDF['userId'] = pearsonDF.index\n", "pearsonDF.index = range(len(pearsonDF))\n", "pearsonDF.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "2XkBpiJpF6RT", "outputId": "8169e1b1-a7d4-4968-82de-24d4f482bd00" }, "execution_count": 46, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " similarityIndex userId\n", "0 0.946029 43\n", "1 0.328897 171\n", "2 0.961538 440\n", "3 0.468807 597\n", "4 0.877058 695" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
similarityIndexuserId
00.94602943
10.328897171
20.961538440
30.468807597
40.877058695
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 46 } ] }, { "cell_type": "code", "source": [ "#Rank the users from most to least similar and keep the first 50 rows\n", "rankedUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)\n", "topUsers = rankedUsers.iloc[:50]\n", "topUsers.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "bhTwW7A6GkS-", "outputId": "51705e41-0b3f-45ef-e66e-2063e46051c6" }, "execution_count": 48, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " similarityIndex userId\n", "46 1.0 836\n", "88 1.0 2342\n", "36 1.0 572\n", "27 1.0 321\n", "86 1.0 2294" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
similarityIndexuserId
461.0836
881.02342
361.0572
271.0321
861.02294
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 48 } ] }, { "cell_type": "code", "source": [ "#Attach every rating made by the selected users to their similarity score\n", "topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')\n", "topUsersRating.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "oT6ri2K2HRVI", "outputId": "4e1dff86-3e81-463a-e12c-0a97de839f11" }, "execution_count": 49, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " similarityIndex userId movieId rating timestamp\n", "0 1.0 836 1 4.0 940636758\n", "1 1.0 836 2 3.0 941909202\n", "2 1.0 836 4 3.0 941907470\n", "3 1.0 836 6 2.0 941997766\n", "4 1.0 836 11 2.0 940631209" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
similarityIndexuserIdmovieIdratingtimestamp
01.083614.0940636758
11.083623.0941909202
21.083643.0941907470
31.083662.0941997766
41.0836112.0940631209
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "code", "source": [ "#Weight each rating by the similarity of the user who gave it\n", "similarityWeights = topUsersRating['similarityIndex']\n", "topUsersRating['weightedRating'] = similarityWeights.mul(topUsersRating['rating'])\n", "topUsersRating.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "01iXMJSSHgbt", "outputId": "6ab6c3d0-4010-4718-d0ca-2a6e3eaac0fd" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " similarityIndex userId movieId rating timestamp weightedRating\n", "0 1.0 836 1 4.0 940636758 4.0\n", "1 1.0 836 2 3.0 941909202 3.0\n", "2 1.0 836 4 3.0 941907470 3.0\n", "3 1.0 836 6 2.0 941997766 2.0\n", "4 1.0 836 11 2.0 940631209 2.0" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
similarityIndexuserIdmovieIdratingtimestampweightedRating
01.083614.09406367584.0
11.083623.09419092023.0
21.083643.09419074703.0
31.083662.09419977662.0
41.0836112.09406312092.0
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "code", "source": [ "#Sums the similarity and the weighted rating per movie (the grouping key is movieId, not userId).\n", "#The two columns are selected before aggregating so userId, rating and timestamp are not summed needlessly.\n", "tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex', 'weightedRating']].sum()\n", "tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']\n", "tempTopUsersRating.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "wuHtE276H79k", "outputId": "2d9eaa1e-e4c3-4833-b662-7c5f06b837ab" }, "execution_count": 51, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " sum_similarityIndex sum_weightedRating\n", "movieId \n", "1 41.322084 156.851119\n", "2 39.172949 113.157891\n", "3 12.780225 37.025402\n", "4 1.852803 6.411211\n", "5 8.039279 21.639680" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sum_similarityIndexsum_weightedRating
movieId
141.322084156.851119
239.172949113.157891
312.78022537.025402
41.8528036.411211
58.03927921.639680
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 51 } ] }, { "cell_type": "code", "source": [ "#Weighted average: the summed weighted ratings divided by the summed similarities, per movie\n", "weightedAverage = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])\n", "#Build the recommendation dataframe from that score, keeping movieId both as index and as a column\n", "recommendation_df = weightedAverage.to_frame('weighted average recommendation score')\n", "recommendation_df['movieId'] = tempTopUsersRating.index\n", "recommendation_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "id": "xd7tozbdILjU", "outputId": "b7d7828e-d142-423f-d176-90ffdf8b1ee2" }, "execution_count": 53, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " weighted average recommendation score movieId\n", "movieId \n", "1 3.795818 1\n", "2 2.888674 2\n", "3 2.897085 3\n", "4 3.460277 4\n", "5 2.691744 5" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weighted average recommendation scoremovieId
movieId
13.7958181
22.8886742
32.8970853
43.4602774
52.6917445
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 53 } ] }, { "cell_type": "code", "source": [ "#Order the movies from the highest to the lowest recommendation score\n", "scoreColumn = 'weighted average recommendation score'\n", "recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)\n", "recommendation_df.head(10)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 394 }, "id": "s3NfvM__IXP5", "outputId": "02dd0fe0-247e-48bf-a5b4-1e6b994a6340" }, "execution_count": 54, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " weighted average recommendation score movieId\n", "movieId \n", "26082 5.0 26082\n", "4427 5.0 4427\n", "7767 5.0 7767\n", "309 5.0 309\n", "4517 5.0 4517\n", "1940 5.0 1940\n", "1939 5.0 1939\n", "1936 5.0 1936\n", "1935 5.0 1935\n", "6650 5.0 6650" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weighted average recommendation scoremovieId
movieId
260825.026082
44275.04427
77675.07767
3095.0309
45175.04517
19405.01940
19395.01939
19365.01936
19355.01935
66505.06650
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 54 } ] }, { "cell_type": "code", "source": [ "#Look up the catalogue entries of the ten best scored movies\n", "topMovieIds = recommendation_df['movieId'].head(10)\n", "movies_df[movies_df['movieId'].isin(topMovieIds)]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "R1FfYuO1Ihb1", "outputId": "1d2ab447-5ddc-4543-fb19-47149b118a9d" }, "execution_count": 55, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title year\n", "305 309 Red Firecracker, Green Firecracker (Pao Da Shu... 1994\n", "1846 1935 How Green Was My Valley 1941\n", "1847 1936 Mrs. Miniver 1942\n", "1850 1939 Best Years of Our Lives, The 1946\n", "1851 1940 Gentleman's Agreement 1947\n", "4322 4427 Lion in Winter, The 1968\n", "4412 4517 Lady in White (a.k.a. The Mystery of the Lady ... 1988\n", "6527 6650 Kind Hearts and Coronets 1949\n", "7447 7767 Best of Youth, The (La meglio gioventù) 2003\n", "8565 26082 Harakiri (Seppuku) 1962" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitleyear
305309Red Firecracker, Green Firecracker (Pao Da Shu...1994
18461935How Green Was My Valley1941
18471936Mrs. Miniver1942
18501939Best Years of Our Lives, The1946
18511940Gentleman's Agreement1947
43224427Lion in Winter, The1968
44124517Lady in White (a.k.a. The Mystery of the Lady ...1988
65276650Kind Hearts and Coronets1949
74477767Best of Youth, The (La meglio gioventù)2003
856526082Harakiri (Seppuku)1962
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 55 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "UiyVD0GtIqkV" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# New Section" ], "metadata": { "id": "J1IjGp9jkOAq" } }, { "cell_type": "markdown", "source": [ "# New Section" ], "metadata": { "id": "marGl7D6kOh9" } } ] }