| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Hybrid Recommenders"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# Load the precomputed cosine similarity matrix from disk\n",
- "cosine_sim = pd.read_csv(filepath_or_buffer='../data/cosine_sim.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Read the header-less mapping file and collapse it into a Pandas Series:\n",
- "# column 0 becomes the index, column 1 supplies the values\n",
- "cosine_sim_map = (\n",
- "    pd.read_csv('../data/cosine_sim_map.csv', header=None)\n",
- "    .set_index(0)[1]\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Build the SVD based collaborative filter\n",
- "from surprise import SVD, Reader, Dataset\n",
- "\n",
- "# NOTE(review): Reader() uses Surprise's default rating scale; confirm it\n",
- "# matches the range of ratings stored in ratings_small.csv\n",
- "reader = Reader()\n",
- "ratings = pd.read_csv('../data/ratings_small.csv')\n",
- "data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)\n",
- "\n",
- "# The model is trained on every available rating, so no fold split is made.\n",
- "# The legacy Dataset.split() / algo.train() calls are gone from current\n",
- "# Surprise releases; fit() on the full trainset is the supported equivalent.\n",
- "trainset = data.build_full_trainset()\n",
- "svd = SVD()\n",
- "svd.fit(trainset)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Build title to ID and ID to title mappings\n",
- "# Load the movie id table once, then derive two lookup frames from it:\n",
- "# one indexed by 'id' and one indexed by 'title'\n",
- "id_map = pd.read_csv('../data/movie_ids.csv')\n",
- "id_to_title, title_to_id = (id_map.set_index(key) for key in ('id', 'title'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Import or compute relevant metadata of the movies\n",
- "# Load the movie metadata table from disk\n",
- "smd = pd.read_csv(filepath_or_buffer='../data/metadata_small.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "def hybrid(userId, title, n_candidates=25, top_n=10):\n",
- "    \"\"\"Recommend movies similar to `title`, ranked for `userId`.\n",
- "\n",
- "    The `n_candidates` movies with the highest similarity to `title` in\n",
- "    `cosine_sim` are re-ranked by the rating `svd` predicts for `userId`.\n",
- "\n",
- "    Args:\n",
- "        userId: User id handed to `svd.predict`.\n",
- "        title: Movie title to look up in `cosine_sim_map`.\n",
- "        n_candidates: How many most-similar movies to score (default 25).\n",
- "        top_n: How many recommendations to return (default 10).\n",
- "\n",
- "    Returns:\n",
- "        DataFrame of at most `top_n` rows with the columns title,\n",
- "        vote_count, vote_average, year, id and est, sorted by est in\n",
- "        decreasing order.\n",
- "\n",
- "    Raises:\n",
- "        KeyError: If `title` is not in `cosine_sim_map`.\n",
- "    \"\"\"\n",
- "    # Extract the cosine_sim index of the movie\n",
- "    idx = cosine_sim_map[title]\n",
- "    if isinstance(idx, pd.Series):\n",
- "        # Several entries share this title; fall back to the first match\n",
- "        # instead of failing on int(Series)\n",
- "        idx = idx.iloc[0]\n",
- "    idx = int(idx)\n",
- "\n",
- "    # Pair every similarity score in the movie's column with its position,\n",
- "    # leaving the movie itself out explicitly rather than assuming that it\n",
- "    # always sorts first (ties in similarity would break that assumption)\n",
- "    sim_scores = [\n",
- "        (pos, score)\n",
- "        for pos, score in enumerate(cosine_sim[str(idx)])\n",
- "        if pos != idx\n",
- "    ]\n",
- "\n",
- "    # Sort the (position, score) tuples in decreasing order of similarity\n",
- "    sim_scores.sort(key=lambda pair: pair[1], reverse=True)\n",
- "\n",
- "    # Keep the positions of the n_candidates most similar movies\n",
- "    movie_indices = [pos for pos, _ in sim_scores[:n_candidates]]\n",
- "\n",
- "    # Extract the metadata of those movies; copy so that adding the 'est'\n",
- "    # column below never writes into a slice of smd\n",
- "    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']].copy()\n",
- "\n",
- "    # Compute the predicted ratings using the SVD filter\n",
- "    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, id_to_title.loc[x]['movieId']).est)\n",
- "\n",
- "    # Sort by predicted rating and return the best top_n movies\n",
- "    return movies.sort_values('est', ascending=False).head(top_n)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>vote_count</th>\n",
- " <th>vote_average</th>\n",
- " <th>year</th>\n",
- " <th>id</th>\n",
- " <th>est</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>1011</th>\n",
- " <td>The Terminator</td>\n",
- " <td>4208.0</td>\n",
- " <td>7.4</td>\n",
- " <td>1984</td>\n",
- " <td>218</td>\n",
- " <td>3.140748</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>974</th>\n",
- " <td>Aliens</td>\n",
- " <td>3282.0</td>\n",
- " <td>7.7</td>\n",
- " <td>1986</td>\n",
- " <td>679</td>\n",
- " <td>3.126947</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8401</th>\n",
- " <td>Star Trek Into Darkness</td>\n",
- " <td>4479.0</td>\n",
- " <td>7.4</td>\n",
- " <td>2013</td>\n",
- " <td>54138</td>\n",
- " <td>3.079551</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7705</th>\n",
- " <td>Alice in Wonderland</td>\n",
- " <td>8.0</td>\n",
- " <td>5.4</td>\n",
- " <td>1933</td>\n",
- " <td>25694</td>\n",
- " <td>3.054995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3060</th>\n",
- " <td>Sinbad and the Eye of the Tiger</td>\n",
- " <td>39.0</td>\n",
- " <td>6.3</td>\n",
- " <td>1977</td>\n",
- " <td>11940</td>\n",
- " <td>3.028386</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8658</th>\n",
- " <td>X-Men: Days of Future Past</td>\n",
- " <td>6155.0</td>\n",
- " <td>7.5</td>\n",
- " <td>2014</td>\n",
- " <td>127585</td>\n",
- " <td>2.997411</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2014</th>\n",
- " <td>Fantastic Planet</td>\n",
- " <td>140.0</td>\n",
- " <td>7.6</td>\n",
- " <td>1973</td>\n",
- " <td>16306</td>\n",
- " <td>2.957614</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>522</th>\n",
- " <td>Terminator 2: Judgment Day</td>\n",
- " <td>4274.0</td>\n",
- " <td>7.7</td>\n",
- " <td>1991</td>\n",
- " <td>280</td>\n",
- " <td>2.914548</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1621</th>\n",
- " <td>Darby O'Gill and the Little People</td>\n",
- " <td>35.0</td>\n",
- " <td>6.7</td>\n",
- " <td>1959</td>\n",
- " <td>18887</td>\n",
- " <td>2.844940</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1668</th>\n",
- " <td>Return from Witch Mountain</td>\n",
- " <td>38.0</td>\n",
- " <td>5.6</td>\n",
- " <td>1978</td>\n",
- " <td>14822</td>\n",
- " <td>2.804012</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title vote_count vote_average year \\\n",
- "1011 The Terminator 4208.0 7.4 1984 \n",
- "974 Aliens 3282.0 7.7 1986 \n",
- "8401 Star Trek Into Darkness 4479.0 7.4 2013 \n",
- "7705 Alice in Wonderland 8.0 5.4 1933 \n",
- "3060 Sinbad and the Eye of the Tiger 39.0 6.3 1977 \n",
- "8658 X-Men: Days of Future Past 6155.0 7.5 2014 \n",
- "2014 Fantastic Planet 140.0 7.6 1973 \n",
- "522 Terminator 2: Judgment Day 4274.0 7.7 1991 \n",
- "1621 Darby O'Gill and the Little People 35.0 6.7 1959 \n",
- "1668 Return from Witch Mountain 38.0 5.6 1978 \n",
- "\n",
- " id est \n",
- "1011 218 3.140748 \n",
- "974 679 3.126947 \n",
- "8401 54138 3.079551 \n",
- "7705 25694 3.054995 \n",
- "3060 11940 3.028386 \n",
- "8658 127585 2.997411 \n",
- "2014 16306 2.957614 \n",
- "522 280 2.914548 \n",
- "1621 18887 2.844940 \n",
- "1668 14822 2.804012 "
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "hybrid(userId=1, title='Avatar')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style>\n",
- " .dataframe thead tr:only-child th {\n",
- " text-align: right;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: left;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>vote_count</th>\n",
- " <th>vote_average</th>\n",
- " <th>year</th>\n",
- " <th>id</th>\n",
- " <th>est</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>522</th>\n",
- " <td>Terminator 2: Judgment Day</td>\n",
- " <td>4274.0</td>\n",
- " <td>7.7</td>\n",
- " <td>1991</td>\n",
- " <td>280</td>\n",
- " <td>3.943639</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2834</th>\n",
- " <td>Predator</td>\n",
- " <td>2129.0</td>\n",
- " <td>7.3</td>\n",
- " <td>1987</td>\n",
- " <td>106</td>\n",
- " <td>3.866272</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>8401</th>\n",
- " <td>Star Trek Into Darkness</td>\n",
- " <td>4479.0</td>\n",
- " <td>7.4</td>\n",
- " <td>2013</td>\n",
- " <td>54138</td>\n",
- " <td>3.858491</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1011</th>\n",
- " <td>The Terminator</td>\n",
- " <td>4208.0</td>\n",
- " <td>7.4</td>\n",
- " <td>1984</td>\n",
- " <td>218</td>\n",
- " <td>3.856029</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>7705</th>\n",
- " <td>Alice in Wonderland</td>\n",
- " <td>8.0</td>\n",
- " <td>5.4</td>\n",
- " <td>1933</td>\n",
- " <td>25694</td>\n",
- " <td>3.701565</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>922</th>\n",
- " <td>The Abyss</td>\n",
- " <td>822.0</td>\n",
- " <td>7.1</td>\n",
- " <td>1989</td>\n",
- " <td>2756</td>\n",
- " <td>3.676465</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>974</th>\n",
- " <td>Aliens</td>\n",
- " <td>3282.0</td>\n",
- " <td>7.7</td>\n",
- " <td>1986</td>\n",
- " <td>679</td>\n",
- " <td>3.672303</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1621</th>\n",
- " <td>Darby O'Gill and the Little People</td>\n",
- " <td>35.0</td>\n",
- " <td>6.7</td>\n",
- " <td>1959</td>\n",
- " <td>18887</td>\n",
- " <td>3.628234</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1668</th>\n",
- " <td>Return from Witch Mountain</td>\n",
- " <td>38.0</td>\n",
- " <td>5.6</td>\n",
- " <td>1978</td>\n",
- " <td>14822</td>\n",
- " <td>3.614118</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2014</th>\n",
- " <td>Fantastic Planet</td>\n",
- " <td>140.0</td>\n",
- " <td>7.6</td>\n",
- " <td>1973</td>\n",
- " <td>16306</td>\n",
- " <td>3.602051</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title vote_count vote_average year \\\n",
- "522 Terminator 2: Judgment Day 4274.0 7.7 1991 \n",
- "2834 Predator 2129.0 7.3 1987 \n",
- "8401 Star Trek Into Darkness 4479.0 7.4 2013 \n",
- "1011 The Terminator 4208.0 7.4 1984 \n",
- "7705 Alice in Wonderland 8.0 5.4 1933 \n",
- "922 The Abyss 822.0 7.1 1989 \n",
- "974 Aliens 3282.0 7.7 1986 \n",
- "1621 Darby O'Gill and the Little People 35.0 6.7 1959 \n",
- "1668 Return from Witch Mountain 38.0 5.6 1978 \n",
- "2014 Fantastic Planet 140.0 7.6 1973 \n",
- "\n",
- " id est \n",
- "522 280 3.943639 \n",
- "2834 106 3.866272 \n",
- "8401 54138 3.858491 \n",
- "1011 218 3.856029 \n",
- "7705 25694 3.701565 \n",
- "922 2756 3.676465 \n",
- "974 679 3.672303 \n",
- "1621 18887 3.628234 \n",
- "1668 14822 3.614118 \n",
- "2014 16306 3.602051 "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "hybrid(userId=2, title='Avatar')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|