||
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Collaborative Filtering\n",
- "\n",
- "## The Framework"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>user_id</th>\n",
- " <th>age</th>\n",
- " <th>sex</th>\n",
- " <th>occupation</th>\n",
- " <th>zip_code</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>1</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>85711</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>2</td>\n",
- " <td>53</td>\n",
- " <td>F</td>\n",
- " <td>other</td>\n",
- " <td>94043</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>3</td>\n",
- " <td>23</td>\n",
- " <td>M</td>\n",
- " <td>writer</td>\n",
- " <td>32067</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>4</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>43537</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>5</td>\n",
- " <td>33</td>\n",
- " <td>F</td>\n",
- " <td>other</td>\n",
- " <td>15213</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " user_id age sex occupation zip_code\n",
- "0 1 24 M technician 85711\n",
- "1 2 53 F other 94043\n",
- "2 3 23 M writer 32067\n",
- "3 4 24 M technician 43537\n",
- "4 5 33 F other 15213"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Column names to assign to the fields of u.user\n",
- "u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']\n",
- "\n",
- "#Read the pipe-separated u.user file into a dataframe\n",
- "users = pd.read_csv(\n",
- "    '../data/movielens/u.user',\n",
- "    sep='|',\n",
- "    names=u_cols,\n",
- "    encoding='latin-1',\n",
- ")\n",
- "\n",
- "#Preview the first five rows\n",
- "users.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>movie_id</th>\n",
- " <th>title</th>\n",
- " <th>release date</th>\n",
- " <th>video release date</th>\n",
- " <th>IMDb URL</th>\n",
- " <th>unknown</th>\n",
- " <th>Action</th>\n",
- " <th>Adventure</th>\n",
- " <th>Animation</th>\n",
- " <th>Children's</th>\n",
- " <th>...</th>\n",
- " <th>Fantasy</th>\n",
- " <th>Film-Noir</th>\n",
- " <th>Horror</th>\n",
- " <th>Musical</th>\n",
- " <th>Mystery</th>\n",
- " <th>Romance</th>\n",
- " <th>Sci-Fi</th>\n",
- " <th>Thriller</th>\n",
- " <th>War</th>\n",
- " <th>Western</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>1</td>\n",
- " <td>Toy Story (1995)</td>\n",
- " <td>01-Jan-1995</td>\n",
- " <td>NaN</td>\n",
- " <td>http://us.imdb.com/M/title-exact?Toy%20Story%2...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>2</td>\n",
- " <td>GoldenEye (1995)</td>\n",
- " <td>01-Jan-1995</td>\n",
- " <td>NaN</td>\n",
- " <td>http://us.imdb.com/M/title-exact?GoldenEye%20(...</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>3</td>\n",
- " <td>Four Rooms (1995)</td>\n",
- " <td>01-Jan-1995</td>\n",
- " <td>NaN</td>\n",
- " <td>http://us.imdb.com/M/title-exact?Four%20Rooms%...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>4</td>\n",
- " <td>Get Shorty (1995)</td>\n",
- " <td>01-Jan-1995</td>\n",
- " <td>NaN</td>\n",
- " <td>http://us.imdb.com/M/title-exact?Get%20Shorty%...</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>5</td>\n",
- " <td>Copycat (1995)</td>\n",
- " <td>01-Jan-1995</td>\n",
- " <td>NaN</td>\n",
- " <td>http://us.imdb.com/M/title-exact?Copycat%20(1995)</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>...</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " <td>1</td>\n",
- " <td>0</td>\n",
- " <td>0</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5 rows × 24 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " movie_id title release date video release date \\\n",
- "0 1 Toy Story (1995) 01-Jan-1995 NaN \n",
- "1 2 GoldenEye (1995) 01-Jan-1995 NaN \n",
- "2 3 Four Rooms (1995) 01-Jan-1995 NaN \n",
- "3 4 Get Shorty (1995) 01-Jan-1995 NaN \n",
- "4 5 Copycat (1995) 01-Jan-1995 NaN \n",
- "\n",
- " IMDb URL unknown Action \\\n",
- "0 http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 \n",
- "1 http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 \n",
- "2 http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 \n",
- "3 http://us.imdb.com/M/title-exact?Get%20Shorty%... 0 1 \n",
- "4 http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0 \n",
- "\n",
- " Adventure Animation Children's ... Fantasy Film-Noir Horror \\\n",
- "0 0 1 1 ... 0 0 0 \n",
- "1 1 0 0 ... 0 0 0 \n",
- "2 0 0 0 ... 0 0 0 \n",
- "3 0 0 0 ... 0 0 0 \n",
- "4 0 0 0 ... 0 0 0 \n",
- "\n",
- " Musical Mystery Romance Sci-Fi Thriller War Western \n",
- "0 0 0 0 0 0 0 0 \n",
- "1 0 0 0 0 1 0 0 \n",
- "2 0 0 0 0 1 0 0 \n",
- "3 0 0 0 0 0 0 0 \n",
- "4 0 0 0 0 1 0 0 \n",
- "\n",
- "[5 rows x 24 columns]"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Column names to assign to the fields of u.item (descriptive fields first, then one column per genre name)\n",
- "i_cols = [\n",
- "    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',\n",
- "    'unknown', 'Action', 'Adventure', 'Animation', 'Children\\'s', 'Comedy',\n",
- "    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',\n",
- "    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',\n",
- "]\n",
- "\n",
- "#Read the pipe-separated u.item file into a dataframe\n",
- "movies = pd.read_csv(\n",
- "    '../data/movielens/u.item',\n",
- "    sep='|',\n",
- "    names=i_cols,\n",
- "    encoding='latin-1',\n",
- ")\n",
- "\n",
- "#Preview the first five rows\n",
- "movies.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Keep only the movie_id and title columns; every other column is discarded\n",
- "movies = movies.loc[:, ['movie_id', 'title']]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>user_id</th>\n",
- " <th>movie_id</th>\n",
- " <th>rating</th>\n",
- " <th>timestamp</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>196</td>\n",
- " <td>242</td>\n",
- " <td>3</td>\n",
- " <td>881250949</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>186</td>\n",
- " <td>302</td>\n",
- " <td>3</td>\n",
- " <td>891717742</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>22</td>\n",
- " <td>377</td>\n",
- " <td>1</td>\n",
- " <td>878887116</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>244</td>\n",
- " <td>51</td>\n",
- " <td>2</td>\n",
- " <td>880606923</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>166</td>\n",
- " <td>346</td>\n",
- " <td>1</td>\n",
- " <td>886397596</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " user_id movie_id rating timestamp\n",
- "0 196 242 3 881250949\n",
- "1 186 302 3 891717742\n",
- "2 22 377 1 878887116\n",
- "3 244 51 2 880606923\n",
- "4 166 346 1 886397596"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Column names to assign to the fields of u.data\n",
- "r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
- "\n",
- "#Read the tab-separated u.data file into a dataframe\n",
- "ratings = pd.read_csv(\n",
- "    '../data/movielens/u.data',\n",
- "    sep='\\t',\n",
- "    names=r_cols,\n",
- "    encoding='latin-1',\n",
- ")\n",
- "\n",
- "#Preview the first five rows\n",
- "ratings.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Drop the timestamp column; errors='ignore' keeps this cell safe to re-run\n",
- "#once the column is already gone (a plain drop would raise KeyError)\n",
- "ratings = ratings.drop('timestamp', axis=1, errors='ignore')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Import the train_test_split function\n",
- "from sklearn.model_selection import train_test_split\n",
- "\n",
- "#X is a copy of the ratings dataframe; y is its user_id column, which doubles as the stratification key\n",
- "X = ratings.copy()\n",
- "y = ratings['user_id']\n",
- "\n",
- "#Hold out 25% of the rows for testing, stratified along user_id, with a fixed seed\n",
- "X_train, X_test, y_train, y_test = train_test_split(\n",
- "    X,\n",
- "    y,\n",
- "    test_size=0.25,\n",
- "    stratify=y,\n",
- "    random_state=42,\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Import the mean_squared_error function\n",
- "from sklearn.metrics import mean_squared_error\n",
- "\n",
- "#Function that computes the root mean squared error (or RMSE)\n",
- "def rmse(y_true, y_pred):\n",
- " return np.sqrt(mean_squared_error(y_true, y_pred))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Define the baseline model to always return 3.\n",
- "def baseline(user_id, movie_id):\n",
- " return 3.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Function to compute the RMSE score obtained on the testing set by a model\n",
- "def score(cf_model, test_data=None):\n",
- "    \"\"\"Compute the RMSE of a collaborative-filtering model on held-out ratings.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    cf_model : callable\n",
- "        Called as cf_model(user_id, movie_id); must return a predicted rating.\n",
- "    test_data : DataFrame, optional\n",
- "        Frame with 'user_id', 'movie_id' and 'rating' columns. Defaults to the\n",
- "        module-level X_test, so existing score(model) calls behave as before.\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    The RMSE between the actual and the predicted ratings, as computed by rmse.\n",
- "    \"\"\"\n",
- "    #Fall back to the global test split when no frame is supplied\n",
- "    if test_data is None:\n",
- "        test_data = X_test\n",
- "\n",
- "    #Construct the user-movie pairs from the evaluation data\n",
- "    id_pairs = zip(test_data['user_id'], test_data['movie_id'])\n",
- "\n",
- "    #Predict the rating for every user-movie pair\n",
- "    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])\n",
- "\n",
- "    #Extract the actual ratings given by the users\n",
- "    y_true = np.array(test_data['rating'])\n",
- "\n",
- "    #Return the final RMSE score\n",
- "    return rmse(y_true, y_pred)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1.2470926188539486"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "score(baseline)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": true
- },
- "source": [
- "## User Based Collaborative Filtering\n",
- "\n",
- "### Ratings Matrix"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th>movie_id</th>\n",
- " <th>1</th>\n",
- " <th>2</th>\n",
- " <th>3</th>\n",
- " <th>4</th>\n",
- " <th>5</th>\n",
- " <th>6</th>\n",
- " <th>7</th>\n",
- " <th>8</th>\n",
- " <th>9</th>\n",
- " <th>10</th>\n",
- " <th>...</th>\n",
- " <th>1669</th>\n",
- " <th>1670</th>\n",
- " <th>1671</th>\n",
- " <th>1673</th>\n",
- " <th>1674</th>\n",
- " <th>1675</th>\n",
- " <th>1676</th>\n",
- " <th>1679</th>\n",
- " <th>1681</th>\n",
- " <th>1682</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>user_id</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>5.0</td>\n",
- " <td>3.0</td>\n",
- " <td>4.0</td>\n",
- " <td>3.0</td>\n",
- " <td>3.0</td>\n",
- " <td>5.0</td>\n",
- " <td>4.0</td>\n",
- " <td>1.0</td>\n",
- " <td>5.0</td>\n",
- " <td>3.0</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>2.0</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>NaN</td>\n",
- " <td>3.0</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5 rows × 1647 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- "movie_id 1 2 3 4 5 6 7 8 9 10 ... \\\n",
- "user_id ... \n",
- "1 5.0 3.0 4.0 3.0 3.0 5.0 4.0 1.0 5.0 3.0 ... \n",
- "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 ... \n",
- "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... \n",
- "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... \n",
- "5 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN ... \n",
- "\n",
- "movie_id 1669 1670 1671 1673 1674 1675 1676 1679 1681 1682 \n",
- "user_id \n",
- "1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n",
- "5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n",
- "\n",
- "[5 rows x 1647 columns]"
- ]
- },
- "execution_count": 58,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Build the ratings matrix from the training data: one row per user_id,\n",
- "#one column per movie_id, cell values taken from rating\n",
- "r_matrix = pd.pivot_table(\n",
- "    X_train,\n",
- "    values='rating',\n",
- "    index='user_id',\n",
- "    columns='movie_id',\n",
- ")\n",
- "\n",
- "#Preview the first five rows\n",
- "r_matrix.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#User Based Collaborative Filter using Mean Ratings\n",
- "def cf_user_mean(user_id, movie_id):\n",
- "    \"\"\"Predict a rating as the mean of the r_matrix ratings given to movie_id.\n",
- "\n",
- "    user_id is part of the (user_id, movie_id) model signature but is not used\n",
- "    here. Returns 3.0 when movie_id is not a column of r_matrix.\n",
- "    \"\"\"\n",
- "    #Default to a rating of 3.0 in the absence of any information\n",
- "    if movie_id not in r_matrix:\n",
- "        return 3.0\n",
- "\n",
- "    #Mean of all the ratings given to the movie\n",
- "    return r_matrix[movie_id].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1.0234701463131335"
- ]
- },
- "execution_count": 89,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Compute RMSE for the Mean model\n",
- "score(cf_user_mean)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Weighted Mean"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Create a dummy ratings matrix with all null values imputed to 0\n",
- "#(fillna already returns a new frame, so r_matrix itself is left untouched)\n",
- "r_matrix_dummy = r_matrix.fillna(0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Import cosine_similarity\n",
- "from sklearn.metrics.pairwise import cosine_similarity\n",
- "\n",
- "#Compute the pairwise cosine similarity between the rows of the dummy ratings matrix\n",
- "#(with a single argument, the matrix is compared against itself)\n",
- "cosine_sim = cosine_similarity(r_matrix_dummy)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th>user_id</th>\n",
- " <th>1</th>\n",
- " <th>2</th>\n",
- " <th>3</th>\n",
- " <th>4</th>\n",
- " <th>5</th>\n",
- " <th>6</th>\n",
- " <th>7</th>\n",
- " <th>8</th>\n",
- " <th>9</th>\n",
- " <th>10</th>\n",
- " <th>...</th>\n",
- " <th>934</th>\n",
- " <th>935</th>\n",
- " <th>936</th>\n",
- " <th>937</th>\n",
- " <th>938</th>\n",
- " <th>939</th>\n",
- " <th>940</th>\n",
- " <th>941</th>\n",
- " <th>942</th>\n",
- " <th>943</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>user_id</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>1.000000</td>\n",
- " <td>0.118076</td>\n",
- " <td>0.029097</td>\n",
- " <td>0.011628</td>\n",
- " <td>0.264677</td>\n",
- " <td>0.312419</td>\n",
- " <td>0.308729</td>\n",
- " <td>0.224269</td>\n",
- " <td>0.026017</td>\n",
- " <td>0.286411</td>\n",
- " <td>...</td>\n",
- " <td>0.308475</td>\n",
- " <td>0.055872</td>\n",
- " <td>0.197862</td>\n",
- " <td>0.131367</td>\n",
- " <td>0.152449</td>\n",
- " <td>0.084456</td>\n",
- " <td>0.293293</td>\n",
- " <td>0.056765</td>\n",
- " <td>0.103536</td>\n",
- " <td>0.326491</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>0.118076</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.099097</td>\n",
- " <td>0.107680</td>\n",
- " <td>0.034279</td>\n",
- " <td>0.152789</td>\n",
- " <td>0.086705</td>\n",
- " <td>0.078864</td>\n",
- " <td>0.068940</td>\n",
- " <td>0.092399</td>\n",
- " <td>...</td>\n",
- " <td>0.086927</td>\n",
- " <td>0.259636</td>\n",
- " <td>0.289092</td>\n",
- " <td>0.318824</td>\n",
- " <td>0.149105</td>\n",
- " <td>0.186347</td>\n",
- " <td>0.168034</td>\n",
- " <td>0.106748</td>\n",
- " <td>0.136796</td>\n",
- " <td>0.080358</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>0.029097</td>\n",
- " <td>0.099097</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.252131</td>\n",
- " <td>0.026893</td>\n",
- " <td>0.062539</td>\n",
- " <td>0.039767</td>\n",
- " <td>0.089474</td>\n",
- " <td>0.078162</td>\n",
- " <td>0.037670</td>\n",
- " <td>...</td>\n",
- " <td>0.040918</td>\n",
- " <td>0.019031</td>\n",
- " <td>0.065417</td>\n",
- " <td>0.055373</td>\n",
- " <td>0.086503</td>\n",
- " <td>0.018418</td>\n",
- " <td>0.096993</td>\n",
- " <td>0.109631</td>\n",
- " <td>0.092574</td>\n",
- " <td>0.018987</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>0.011628</td>\n",
- " <td>0.107680</td>\n",
- " <td>0.252131</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.045543</td>\n",
- " <td>0.078812</td>\n",
- " <td>0.095354</td>\n",
- " <td>0.059498</td>\n",
- " <td>0.053879</td>\n",
- " <td>...</td>\n",
- " <td>0.024226</td>\n",
- " <td>0.050703</td>\n",
- " <td>0.056561</td>\n",
- " <td>0.107294</td>\n",
- " <td>0.098892</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.132900</td>\n",
- " <td>0.142798</td>\n",
- " <td>0.097066</td>\n",
- " <td>0.015176</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>0.264677</td>\n",
- " <td>0.034279</td>\n",
- " <td>0.026893</td>\n",
- " <td>0.000000</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.202843</td>\n",
- " <td>0.299619</td>\n",
- " <td>0.163724</td>\n",
- " <td>0.038474</td>\n",
- " <td>0.153021</td>\n",
- " <td>...</td>\n",
- " <td>0.262547</td>\n",
- " <td>0.048524</td>\n",
- " <td>0.048312</td>\n",
- " <td>0.022202</td>\n",
- " <td>0.091910</td>\n",
- " <td>0.066000</td>\n",
- " <td>0.156172</td>\n",
- " <td>0.115842</td>\n",
- " <td>0.124297</td>\n",
- " <td>0.267574</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>6</th>\n",
- " <td>0.312419</td>\n",
- " <td>0.152789</td>\n",
- " <td>0.062539</td>\n",
- " <td>0.045543</td>\n",
- " <td>0.202843</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.375963</td>\n",
- " <td>0.131795</td>\n",
- " <td>0.110944</td>\n",
- " <td>0.400758</td>\n",
- " <td>...</td>\n",
- " <td>0.287549</td>\n",
- " <td>0.080312</td>\n",
- " <td>0.162988</td>\n",
- " <td>0.182856</td>\n",
- " <td>0.114262</td>\n",
- " <td>0.092090</td>\n",
- " <td>0.261859</td>\n",
- " <td>0.097606</td>\n",
- " <td>0.206104</td>\n",
- " <td>0.187637</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7</th>\n",
- " <td>0.308729</td>\n",
- " <td>0.086705</td>\n",
- " <td>0.039767</td>\n",
- " <td>0.078812</td>\n",
- " <td>0.299619</td>\n",
- " <td>0.375963</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.211282</td>\n",
- " <td>0.107795</td>\n",
- " <td>0.328923</td>\n",
- " <td>...</td>\n",
- " <td>0.290002</td>\n",
- " <td>0.074170</td>\n",
- " <td>0.094619</td>\n",
- " <td>0.084235</td>\n",
- " <td>0.115620</td>\n",
- " <td>0.100625</td>\n",
- " <td>0.233843</td>\n",
- " <td>0.039199</td>\n",
- " <td>0.224227</td>\n",
- " <td>0.296332</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8</th>\n",
- " <td>0.224269</td>\n",
- " <td>0.078864</td>\n",
- " <td>0.089474</td>\n",
- " <td>0.095354</td>\n",
- " <td>0.163724</td>\n",
- " <td>0.131795</td>\n",
- " <td>0.211282</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.037040</td>\n",
- " <td>0.183375</td>\n",
- " <td>...</td>\n",
- " <td>0.165008</td>\n",
- " <td>0.066843</td>\n",
- " <td>0.058766</td>\n",
- " <td>0.068759</td>\n",
- " <td>0.087159</td>\n",
- " <td>0.129381</td>\n",
- " <td>0.188662</td>\n",
- " <td>0.121223</td>\n",
- " <td>0.083910</td>\n",
- " <td>0.273238</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>9</th>\n",
- " <td>0.026017</td>\n",
- " <td>0.068940</td>\n",
- " <td>0.078162</td>\n",
- " <td>0.059498</td>\n",
- " <td>0.038474</td>\n",
- " <td>0.110944</td>\n",
- " <td>0.107795</td>\n",
- " <td>0.037040</td>\n",
- " <td>1.000000</td>\n",
- " <td>0.155435</td>\n",
- " <td>...</td>\n",
- " <td>0.011708</td>\n",
- " <td>0.000000</td>\n",
- " <td>0.101710</td>\n",
- " <td>0.034568</td>\n",
- " <td>0.045002</td>\n",
- " <td>0.052699</td>\n",
- " <td>0.107486</td>\n",
- " <td>0.055766</td>\n",
- " <td>0.070065</td>\n",
- " <td>0.088281</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>10</th>\n",
- " <td>0.286411</td>\n",
- " <td>0.092399</td>\n",
- " <td>0.037670</td>\n",
- " <td>0.053879</td>\n",
- " <td>0.153021</td>\n",
- " <td>0.400758</td>\n",
- " <td>0.328923</td>\n",
- " <td>0.183375</td>\n",
- " <td>0.155435</td>\n",
- " <td>1.000000</td>\n",
- " <td>...</td>\n",
- " <td>0.278558</td>\n",
- " <td>0.049310</td>\n",
- " <td>0.153506</td>\n",
- " <td>0.065471</td>\n",
- " <td>0.060088</td>\n",
- " <td>0.033686</td>\n",
- " <td>0.197107</td>\n",
- " <td>0.085402</td>\n",
- " <td>0.118945</td>\n",
- " <td>0.162538</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>10 rows × 943 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- "user_id 1 2 3 4 5 6 7 \\\n",
- "user_id \n",
- "1 1.000000 0.118076 0.029097 0.011628 0.264677 0.312419 0.308729 \n",
- "2 0.118076 1.000000 0.099097 0.107680 0.034279 0.152789 0.086705 \n",
- "3 0.029097 0.099097 1.000000 0.252131 0.026893 0.062539 0.039767 \n",
- "4 0.011628 0.107680 0.252131 1.000000 0.000000 0.045543 0.078812 \n",
- "5 0.264677 0.034279 0.026893 0.000000 1.000000 0.202843 0.299619 \n",
- "6 0.312419 0.152789 0.062539 0.045543 0.202843 1.000000 0.375963 \n",
- "7 0.308729 0.086705 0.039767 0.078812 0.299619 0.375963 1.000000 \n",
- "8 0.224269 0.078864 0.089474 0.095354 0.163724 0.131795 0.211282 \n",
- "9 0.026017 0.068940 0.078162 0.059498 0.038474 0.110944 0.107795 \n",
- "10 0.286411 0.092399 0.037670 0.053879 0.153021 0.400758 0.328923 \n",
- "\n",
- "user_id 8 9 10 ... 934 935 936 \\\n",
- "user_id ... \n",
- "1 0.224269 0.026017 0.286411 ... 0.308475 0.055872 0.197862 \n",
- "2 0.078864 0.068940 0.092399 ... 0.086927 0.259636 0.289092 \n",
- "3 0.089474 0.078162 0.037670 ... 0.040918 0.019031 0.065417 \n",
- "4 0.095354 0.059498 0.053879 ... 0.024226 0.050703 0.056561 \n",
- "5 0.163724 0.038474 0.153021 ... 0.262547 0.048524 0.048312 \n",
- "6 0.131795 0.110944 0.400758 ... 0.287549 0.080312 0.162988 \n",
- "7 0.211282 0.107795 0.328923 ... 0.290002 0.074170 0.094619 \n",
- "8 1.000000 0.037040 0.183375 ... 0.165008 0.066843 0.058766 \n",
- "9 0.037040 1.000000 0.155435 ... 0.011708 0.000000 0.101710 \n",
- "10 0.183375 0.155435 1.000000 ... 0.278558 0.049310 0.153506 \n",
- "\n",
- "user_id 937 938 939 940 941 942 943 \n",
- "user_id \n",
- "1 0.131367 0.152449 0.084456 0.293293 0.056765 0.103536 0.326491 \n",
- "2 0.318824 0.149105 0.186347 0.168034 0.106748 0.136796 0.080358 \n",
- "3 0.055373 0.086503 0.018418 0.096993 0.109631 0.092574 0.018987 \n",
- "4 0.107294 0.098892 0.000000 0.132900 0.142798 0.097066 0.015176 \n",
- "5 0.022202 0.091910 0.066000 0.156172 0.115842 0.124297 0.267574 \n",
- "6 0.182856 0.114262 0.092090 0.261859 0.097606 0.206104 0.187637 \n",
- "7 0.084235 0.115620 0.100625 0.233843 0.039199 0.224227 0.296332 \n",
- "8 0.068759 0.087159 0.129381 0.188662 0.121223 0.083910 0.273238 \n",
- "9 0.034568 0.045002 0.052699 0.107486 0.055766 0.070065 0.088281 \n",
- "10 0.065471 0.060088 0.033686 0.197107 0.085402 0.118945 0.162538 \n",
- "\n",
- "[10 rows x 943 columns]"
- ]
- },
- "execution_count": 63,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Wrap the similarity matrix in a dataframe labelled by user_id on both axes\n",
- "cosine_sim = pd.DataFrame(\n",
- "    cosine_sim,\n",
- "    index=r_matrix.index,\n",
- "    columns=r_matrix.index,\n",
- ")\n",
- "\n",
- "#Preview the first ten rows\n",
- "cosine_sim.head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 140,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#User Based Collaborative Filter using Weighted Mean Ratings\n",
- "def cf_user_wmean(user_id, movie_id):\n",
- "    \"\"\"Predict a rating as the similarity-weighted mean of the movie's ratings.\n",
- "\n",
- "    Every non-null rating in r_matrix[movie_id] is weighted by the cosine_sim\n",
- "    entry between the user who gave it and user_id.\n",
- "\n",
- "    Fallbacks:\n",
- "    * movie_id is not a column of r_matrix -> 3.0.\n",
- "    * user_id is not a column of cosine_sim, or the weights sum to zero -> the\n",
- "      unweighted mean of the movie's ratings (instead of a KeyError or NaN).\n",
- "    \"\"\"\n",
- "    #Default to a rating of 3.0 in the absence of any information\n",
- "    if movie_id not in r_matrix:\n",
- "        return 3.0\n",
- "\n",
- "    #Keep only the ratings that actually exist for the movie in question\n",
- "    m_ratings = r_matrix[movie_id].dropna()\n",
- "\n",
- "    #Without a similarity column for this user there is nothing to weight by\n",
- "    if user_id not in cosine_sim:\n",
- "        return m_ratings.mean()\n",
- "\n",
- "    #Similarity of the user in question to exactly the users who rated the movie\n",
- "    sim_scores = cosine_sim[user_id].loc[m_ratings.index]\n",
- "\n",
- "    #A zero total weight would turn the division below into NaN\n",
- "    total_sim = sim_scores.sum()\n",
- "    if total_sim == 0:\n",
- "        return m_ratings.mean()\n",
- "\n",
- "    #Compute the final weighted mean\n",
- "    return np.dot(sim_scores, m_ratings) / total_sim"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 139,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1.0174483808407588"
- ]
- },
- "execution_count": 139,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "score(cf_user_wmean)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Demographics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 145,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>user_id</th>\n",
- " <th>movie_id</th>\n",
- " <th>rating</th>\n",
- " <th>age</th>\n",
- " <th>sex</th>\n",
- " <th>occupation</th>\n",
- " <th>zip_code</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>889</td>\n",
- " <td>684</td>\n",
- " <td>2</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>78704</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>889</td>\n",
- " <td>279</td>\n",
- " <td>2</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>78704</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>889</td>\n",
- " <td>29</td>\n",
- " <td>3</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>78704</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>889</td>\n",
- " <td>190</td>\n",
- " <td>3</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>78704</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>889</td>\n",
- " <td>232</td>\n",
- " <td>3</td>\n",
- " <td>24</td>\n",
- " <td>M</td>\n",
- " <td>technician</td>\n",
- " <td>78704</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " user_id movie_id rating age sex occupation zip_code\n",
- "0 889 684 2 24 M technician 78704\n",
- "1 889 279 2 24 M technician 78704\n",
- "2 889 29 3 24 M technician 78704\n",
- "3 889 190 3 24 M technician 78704\n",
- "4 889 232 3 24 M technician 78704"
- ]
- },
- "execution_count": 145,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Merge the original users dataframe with the training set\n",
- "#Join explicitly on user_id instead of on whatever columns the two frames happen to share\n",
- "merged_df = pd.merge(X_train, users, on='user_id')\n",
- "\n",
- "merged_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 150,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "sex\n",
- "F 3.827586\n",
- "M 3.918919\n",
- "Name: rating, dtype: float64"
- ]
- },
- "execution_count": 150,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Mean rating of every movie, split by gender\n",
- "#The result is a Series indexed by (movie_id, sex)\n",
- "gender_mean = merged_df.groupby(['movie_id', 'sex'])['rating'].mean()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Set the index of the users dataframe to the user_id\n",
- "#Guarded so that re-running this cell (user_id already the index) is a no-op instead of a KeyError\n",
- "if 'user_id' in users.columns:\n",
- "    users = users.set_index('user_id')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 165,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Gender Based Collaborative Filter using Mean Ratings\n",
- "def cf_gender(user_id, movie_id, default_rating=3.0):\n",
- "    \"\"\"Predict a rating as the mean rating the user's gender gave to the movie.\n",
- "\n",
- "    Reads the notebook-level objects `r_matrix`, `users` (indexed by user_id)\n",
- "    and `gender_mean` (a Series indexed by (movie_id, sex)).\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    user_id : label of the user in `users`\n",
- "    movie_id : label of the movie\n",
- "    default_rating : value returned when no information is available\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    The gender-specific mean rating of the movie, or `default_rating` when the\n",
- "    movie is not a column of `r_matrix` or has no entry for that gender in\n",
- "    `gender_mean`.\n",
- "    \"\"\"\n",
- "    #Check if movie_id exists in r_matrix (or training set)\n",
- "    if movie_id not in r_matrix:\n",
- "        #Fall back to the default in the absence of any information\n",
- "        return default_rating\n",
- "\n",
- "    #Identify the gender of the user with a single scalar lookup\n",
- "    gender = users.at[user_id, 'sex']\n",
- "\n",
- "    #Look the (movie, gender) pair up once instead of slicing the MultiIndex twice\n",
- "    key = (movie_id, gender)\n",
- "    if key in gender_mean.index:\n",
- "        return gender_mean.loc[key]\n",
- "\n",
- "    #That gender has not rated the movie\n",
- "    return default_rating"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 166,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.6/site-packages/pandas/core/indexes/multi.py:819: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
- " return self._engine.get_value(s, k)\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "1.0330308800874282"
- ]
- },
- "execution_count": 166,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "score(cf_gender)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 174,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr>\n",
- " <th>occupation</th>\n",
- " <th colspan=\"2\" halign=\"left\">administrator</th>\n",
- " <th colspan=\"2\" halign=\"left\">artist</th>\n",
- " <th>doctor</th>\n",
- " <th colspan=\"2\" halign=\"left\">educator</th>\n",
- " <th colspan=\"2\" halign=\"left\">engineer</th>\n",
- " <th>entertainment</th>\n",
- " <th>...</th>\n",
- " <th colspan=\"2\" halign=\"left\">salesman</th>\n",
- " <th colspan=\"2\" halign=\"left\">scientist</th>\n",
- " <th colspan=\"2\" halign=\"left\">student</th>\n",
- " <th colspan=\"2\" halign=\"left\">technician</th>\n",
- " <th colspan=\"2\" halign=\"left\">writer</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>sex</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>...</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " <th>F</th>\n",
- " <th>M</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>movie_id</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>4.0</td>\n",
- " <td>4.222222</td>\n",
- " <td>4.25</td>\n",
- " <td>3.500000</td>\n",
- " <td>3.666667</td>\n",
- " <td>3.50</td>\n",
- " <td>3.923077</td>\n",
- " <td>4.0</td>\n",
- " <td>3.970588</td>\n",
- " <td>5.0</td>\n",
- " <td>...</td>\n",
- " <td>4.0</td>\n",
- " <td>4.000000</td>\n",
- " <td>3.5</td>\n",
- " <td>3.888889</td>\n",
- " <td>3.833333</td>\n",
- " <td>3.709091</td>\n",
- " <td>4.0</td>\n",
- " <td>4.200000</td>\n",
- " <td>4.166667</td>\n",
- " <td>3.142857</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>3.0</td>\n",
- " <td>3.750000</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>3.250000</td>\n",
- " <td>NaN</td>\n",
- " <td>3.363636</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>2.333333</td>\n",
- " <td>3.333333</td>\n",
- " <td>NaN</td>\n",
- " <td>2.714286</td>\n",
- " <td>5.000000</td>\n",
- " <td>2.666667</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>3.5</td>\n",
- " <td>2.500000</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>4.00</td>\n",
- " <td>2.500000</td>\n",
- " <td>NaN</td>\n",
- " <td>3.625000</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>1.000000</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>2.000000</td>\n",
- " <td>3.217391</td>\n",
- " <td>NaN</td>\n",
- " <td>4.000000</td>\n",
- " <td>NaN</td>\n",
- " <td>1.000000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>3.0</td>\n",
- " <td>3.888889</td>\n",
- " <td>NaN</td>\n",
- " <td>4.666667</td>\n",
- " <td>3.000000</td>\n",
- " <td>2.75</td>\n",
- " <td>3.636364</td>\n",
- " <td>NaN</td>\n",
- " <td>3.555556</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>4.0</td>\n",
- " <td>3.666667</td>\n",
- " <td>NaN</td>\n",
- " <td>3.600000</td>\n",
- " <td>3.285714</td>\n",
- " <td>3.724138</td>\n",
- " <td>NaN</td>\n",
- " <td>3.200000</td>\n",
- " <td>4.250000</td>\n",
- " <td>3.500000</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5</th>\n",
- " <td>4.0</td>\n",
- " <td>2.333333</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>4.00</td>\n",
- " <td>1.500000</td>\n",
- " <td>NaN</td>\n",
- " <td>2.666667</td>\n",
- " <td>NaN</td>\n",
- " <td>...</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>NaN</td>\n",
- " <td>3.500000</td>\n",
- " <td>4.333333</td>\n",
- " <td>3.272727</td>\n",
- " <td>NaN</td>\n",
- " <td>3.333333</td>\n",
- " <td>4.000000</td>\n",
- " <td>2.666667</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5 rows × 41 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- "occupation administrator artist doctor educator \\\n",
- "sex F M F M M F \n",
- "movie_id \n",
- "1 4.0 4.222222 4.25 3.500000 3.666667 3.50 \n",
- "2 3.0 3.750000 NaN NaN NaN NaN \n",
- "3 3.5 2.500000 NaN NaN NaN 4.00 \n",
- "4 3.0 3.888889 NaN 4.666667 3.000000 2.75 \n",
- "5 4.0 2.333333 NaN NaN NaN 4.00 \n",
- "\n",
- "occupation engineer entertainment ... salesman \\\n",
- "sex M F M F ... F \n",
- "movie_id ... \n",
- "1 3.923077 4.0 3.970588 5.0 ... 4.0 \n",
- "2 3.250000 NaN 3.363636 NaN ... NaN \n",
- "3 2.500000 NaN 3.625000 NaN ... NaN \n",
- "4 3.636364 NaN 3.555556 NaN ... 4.0 \n",
- "5 1.500000 NaN 2.666667 NaN ... NaN \n",
- "\n",
- "occupation scientist student technician \\\n",
- "sex M F M F M F \n",
- "movie_id \n",
- "1 4.000000 3.5 3.888889 3.833333 3.709091 4.0 \n",
- "2 NaN NaN NaN 2.333333 3.333333 NaN \n",
- "3 1.000000 NaN NaN 2.000000 3.217391 NaN \n",
- "4 3.666667 NaN 3.600000 3.285714 3.724138 NaN \n",
- "5 NaN NaN 3.500000 4.333333 3.272727 NaN \n",
- "\n",
- "occupation writer \n",
- "sex M F M \n",
- "movie_id \n",
- "1 4.200000 4.166667 3.142857 \n",
- "2 2.714286 5.000000 2.666667 \n",
- "3 4.000000 NaN 1.000000 \n",
- "4 3.200000 4.250000 3.500000 \n",
- "5 3.333333 4.000000 2.666667 \n",
- "\n",
- "[5 rows x 41 columns]"
- ]
- },
- "execution_count": 174,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Mean rating of every movie for each (occupation, gender) group\n",
- "#Rows are movie_ids; columns are a two-level (occupation, sex) index\n",
- "gen_occ_mean = merged_df.pivot_table(\n",
- "    index='movie_id', columns=['occupation', 'sex'], values='rating', aggfunc='mean')\n",
- "\n",
- "gen_occ_mean.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 198,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "#Gender and Occupation Based Collaborative Filter using Mean Ratings\n",
- "def cf_gen_occ(user_id, movie_id, default_rating=3.0):\n",
- "    \"\"\"Predict a rating as the mean rating of the user's (occupation, gender) group.\n",
- "\n",
- "    Reads the notebook-level objects `users` (indexed by user_id) and\n",
- "    `gen_occ_mean` (rows: movie_id, columns: (occupation, sex)).\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    user_id : label of the user in `users`\n",
- "    movie_id : label of the movie\n",
- "    default_rating : value returned when no information is available\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    The group's mean rating of the movie, or `default_rating` when the movie or\n",
- "    the (occupation, gender) column is missing or the stored mean is NaN.\n",
- "    \"\"\"\n",
- "    #Check if movie_id exists in gen_occ_mean\n",
- "    if movie_id not in gen_occ_mean.index:\n",
- "        return default_rating\n",
- "\n",
- "    #Identify the user, then the occupation and gender\n",
- "    user = users.loc[user_id]\n",
- "    group = (user['occupation'], user['sex'])\n",
- "\n",
- "    #Check if this occupation/gender combination exists as a column at all\n",
- "    if group not in gen_occ_mean.columns:\n",
- "        return default_rating\n",
- "\n",
- "    #Extract the required rating with one lookup instead of three row extractions\n",
- "    rating = gen_occ_mean.loc[movie_id, group]\n",
- "\n",
- "    #NaN means the group exists but never rated this movie\n",
- "    if np.isnan(rating):\n",
- "        return default_rating\n",
- "\n",
- "    return rating"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 199,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1.1391976012043645"
- ]
- },
- "execution_count": 199,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "score(cf_gen_occ)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Model Based Approaches"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 231,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating RMSE of algorithm KNNBasic.\n",
- "\n",
- "------------\n",
- "Fold 1\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9776\n",
- "------------\n",
- "Fold 2\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9789\n",
- "------------\n",
- "Fold 3\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9695\n",
- "------------\n",
- "Fold 4\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9810\n",
- "------------\n",
- "Fold 5\n",
- "Computing the msd similarity matrix...\n",
- "Done computing similarity matrix.\n",
- "RMSE: 0.9849\n",
- "------------\n",
- "------------\n",
- "Mean RMSE: 0.9784\n",
- "------------\n",
- "------------\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "CaseInsensitiveDefaultDict(list,\n",
- " {'rmse': [0.97764007686097709,\n",
- " 0.97889035204999741,\n",
- " 0.9694859699934969,\n",
- " 0.98099811511904433,\n",
- " 0.98488926832497381]})"
- ]
- },
- "execution_count": 231,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Import the required classes and methods from the surprise library\n",
- "from surprise import Reader, Dataset, KNNBasic, evaluate\n",
- "\n",
- "#Define a Reader object\n",
- "#The Reader object helps in parsing the file or dataframe containing ratings\n",
- "reader = Reader()\n",
- "\n",
- "#Create the dataset to be used for building the filter\n",
- "data = Dataset.load_from_df(ratings, reader)\n",
- "\n",
- "#Define the algorithm object; in this case kNN\n",
- "knn = KNNBasic()\n",
- "\n",
- "#Evaluate the performance in terms of RMSE\n",
- "evaluate(knn, data, measures=['RMSE'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 232,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Evaluating RMSE of algorithm SVD.\n",
- "\n",
- "------------\n",
- "Fold 1\n",
- "RMSE: 0.9371\n",
- "------------\n",
- "Fold 2\n",
- "RMSE: 0.9417\n",
- "------------\n",
- "Fold 3\n",
- "RMSE: 0.9289\n",
- "------------\n",
- "Fold 4\n",
- "RMSE: 0.9379\n",
- "------------\n",
- "Fold 5\n",
- "RMSE: 0.9379\n",
- "------------\n",
- "------------\n",
- "Mean RMSE: 0.9367\n",
- "------------\n",
- "------------\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "CaseInsensitiveDefaultDict(list,\n",
- " {'rmse': [0.93714337825960081,\n",
- " 0.9417378198331483,\n",
- " 0.92893737314257874,\n",
- " 0.93793761103739881,\n",
- " 0.93789928866069328]})"
- ]
- },
- "execution_count": 232,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Import SVD\n",
- "from surprise import SVD\n",
- "\n",
- "#Define the SVD algorithm object\n",
- "svd = SVD()\n",
- "\n",
- "#Evaluate the performance in terms of RMSE\n",
- "evaluate(svd, data, measures=['RMSE'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|