| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Plot Description Based Recommender"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
- "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
- "Collecting matplotlib\n",
- " Downloading matplotlib-3.5.3-cp310-cp310-macosx_10_9_x86_64.whl (7.3 MB)\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
- "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
- "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
- "Collecting fonttools>=4.22.0\n",
- " Using cached fonttools-4.34.4-py3-none-any.whl (944 kB)\n",
- "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
- "Collecting cycler>=0.10\n",
- " Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)\n",
- "Collecting kiwisolver>=1.0.1\n",
- " Downloading kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl (65 kB)\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.5/65.5 KB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
- "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
- "Collecting pillow>=6.2.0\n",
- " Downloading Pillow-9.2.0-cp310-cp310-macosx_10_10_x86_64.whl (3.1 MB)\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m0m\n",
- "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
- "Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib\n",
- "Successfully installed cycler-0.11.0 fonttools-4.34.4 kiwisolver-1.4.4 matplotlib-3.5.3 pillow-9.2.0\n",
- "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
- "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
- "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "%pip install scikit-learn scipy matplotlib"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
- "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
- "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
- "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
- "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
- "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
- "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
- "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
- "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
- "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
- "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
- "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
- "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
- "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
- "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
- "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
- "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "%pip install scikit-learn scipy matplotlib"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>genres</th>\n",
- " <th>runtime</th>\n",
- " <th>vote_average</th>\n",
- " <th>vote_count</th>\n",
- " <th>year</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>['animation', 'comedy', 'family']</td>\n",
- " <td>81.0</td>\n",
- " <td>7.7</td>\n",
- " <td>5415.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>['adventure', 'fantasy', 'family']</td>\n",
- " <td>104.0</td>\n",
- " <td>6.9</td>\n",
- " <td>2413.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>['romance', 'comedy']</td>\n",
- " <td>101.0</td>\n",
- " <td>6.5</td>\n",
- " <td>92.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>['comedy', 'drama', 'romance']</td>\n",
- " <td>127.0</td>\n",
- " <td>6.1</td>\n",
- " <td>34.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>['comedy']</td>\n",
- " <td>106.0</td>\n",
- " <td>5.7</td>\n",
- " <td>173.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title genres runtime \\\n",
- "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
- "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
- "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
- "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
- "4 Father of the Bride Part II ['comedy'] 106.0 \n",
- "\n",
- " vote_average vote_count year \n",
- "0 7.7 5415.0 1995 \n",
- "1 6.9 2413.0 1995 \n",
- "2 6.5 92.0 1995 \n",
- "3 6.1 34.0 1995 \n",
- "4 5.7 173.0 1995 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "#Import data from the clean file \n",
- "df = pd.read_csv('../data/metadata_clean.csv')\n",
- "\n",
- "#Print the head of the cleaned DataFrame\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>genres</th>\n",
- " <th>runtime</th>\n",
- " <th>vote_average</th>\n",
- " <th>vote_count</th>\n",
- " <th>year</th>\n",
- " <th>overview</th>\n",
- " <th>id</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>['animation', 'comedy', 'family']</td>\n",
- " <td>81.0</td>\n",
- " <td>7.7</td>\n",
- " <td>5415.0</td>\n",
- " <td>1995</td>\n",
- " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
- " <td>862</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>['adventure', 'fantasy', 'family']</td>\n",
- " <td>104.0</td>\n",
- " <td>6.9</td>\n",
- " <td>2413.0</td>\n",
- " <td>1995</td>\n",
- " <td>When siblings Judy and Peter discover an encha...</td>\n",
- " <td>8844</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>['romance', 'comedy']</td>\n",
- " <td>101.0</td>\n",
- " <td>6.5</td>\n",
- " <td>92.0</td>\n",
- " <td>1995</td>\n",
- " <td>A family wedding reignites the ancient feud be...</td>\n",
- " <td>15602</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>['comedy', 'drama', 'romance']</td>\n",
- " <td>127.0</td>\n",
- " <td>6.1</td>\n",
- " <td>34.0</td>\n",
- " <td>1995</td>\n",
- " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
- " <td>31357</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>['comedy']</td>\n",
- " <td>106.0</td>\n",
- " <td>5.7</td>\n",
- " <td>173.0</td>\n",
- " <td>1995</td>\n",
- " <td>Just when George Banks has recovered from his ...</td>\n",
- " <td>11862</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title genres runtime \\\n",
- "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
- "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
- "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
- "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
- "4 Father of the Bride Part II ['comedy'] 106.0 \n",
- "\n",
- " vote_average vote_count year \\\n",
- "0 7.7 5415.0 1995 \n",
- "1 6.9 2413.0 1995 \n",
- "2 6.5 92.0 1995 \n",
- "3 6.1 34.0 1995 \n",
- "4 5.7 173.0 1995 \n",
- "\n",
- " overview id \n",
- "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
- "1 When siblings Judy and Peter discover an encha... 8844 \n",
- "2 A family wedding reignites the ancient feud be... 15602 \n",
- "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
- "4 Just when George Banks has recovered from his ... 11862 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Import the original file\n",
- "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
- "\n",
- "#Add the useful features into the cleaned dataframe\n",
- "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
- "\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "#Import TfidfVectorizer from the scikit-learn library\n",
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "\n",
- "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
- "tfidf = TfidfVectorizer(stop_words='english')\n",
- "\n",
- "#Replace NaN with an empty string\n",
- "df['overview'] = df['overview'].fillna('')\n",
- "\n",
- "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
- "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
- "\n",
- "#Output the shape of tfidf_matrix\n",
- "tfidf_matrix.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import linear_kernel to compute the dot product\n",
- "from sklearn.metrics.pairwise import linear_kernel\n",
- "\n",
- "# Compute the cosine similarity matrix\n",
- "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
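- "#Construct a reverse mapping from movie titles to row indices.\n",
- "#NOTE: drop_duplicates() compares the Series values (the row indices), not the title labels, so repeated titles are not removed here\n",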
- "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function that takes in movie title as input and gives recommendations \n",
- "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
- " # Obtain the index of the movie that matches the title\n",
- " idx = indices[title]\n",
- "\n",
- " # Get the pairwise similarity scores of all movies with that movie\n",
- " # And convert it into a list of tuples as described above\n",
- " sim_scores = list(enumerate(cosine_sim[idx]))\n",
- "\n",
- " # Sort the movies based on the cosine similarity scores\n",
- " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
- "\n",
- " # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
- " sim_scores = sim_scores[1:11]\n",
- "\n",
- " # Get the movie indices\n",
- " movie_indices = [i[0] for i in sim_scores]\n",
- "\n",
- " # Return the top 10 most similar movies\n",
- " return df['title'].iloc[movie_indices]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Get recommendations for The Lion King\n",
- "content_recommender('The Lion King')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Metadata Based Recommender"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load the keywords and credits files\n",
- "cred_df = pd.read_csv('../data/credits.csv')\n",
- "key_df = pd.read_csv('../data/keywords.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Print the head of the credit dataframe\n",
- "cred_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Print the head of the keywords dataframe\n",
- "key_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Convert the IDs of df into int\n",
- "df['id'] = df['id'].astype('int')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function to convert all non-integer IDs to NaN\n",
- "def clean_ids(x):\n",
- " try:\n",
- " return int(x)\n",
- " except:\n",
- " return np.nan"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Clean the ids of df\n",
- "df['id'] = df['id'].apply(clean_ids)\n",
- "\n",
- "#Filter all rows that have a null ID\n",
- "df = df[df['id'].notnull()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Convert IDs into integer\n",
- "df['id'] = df['id'].astype('int')\n",
- "key_df['id'] = key_df['id'].astype('int')\n",
- "cred_df['id'] = cred_df['id'].astype('int')\n",
- "\n",
- "# Merge keywords and credits into your main metadata dataframe\n",
- "df = df.merge(cred_df, on='id')\n",
- "df = df.merge(key_df, on='id')\n",
- "\n",
- "#Display the head of df\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Convert the stringified objects into the native python objects\n",
- "from ast import literal_eval\n",
- "\n",
- "features = ['cast', 'crew', 'keywords', 'genres']\n",
- "for feature in features:\n",
- " df[feature] = df[feature].apply(literal_eval)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Print the first crew member of the first movie in df\n",
- "df.iloc[0]['crew'][0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Extract the director's name. If director is not listed, return NaN\n",
- "def get_director(x):\n",
- " for crew_member in x:\n",
- " if crew_member['job'] == 'Director':\n",
- " return crew_member['name']\n",
- " return np.nan"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Define the new director feature\n",
- "df['director'] = df['crew'].apply(get_director)\n",
- "\n",
- "#Print the directors of the first five movies\n",
- "df['director'].head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Returns the first 3 elements of the list or the entire list, whichever is shorter.\n",
- "def generate_list(x):\n",
- " if isinstance(x, list):\n",
- " names = [i['name'] for i in x]\n",
- " #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
- " if len(names) > 3:\n",
- " names = names[:3]\n",
- " return names\n",
- "\n",
- " #Return empty list in case of missing/malformed data\n",
- " return []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Apply the generate_list function to cast and keywords\n",
- "df['cast'] = df['cast'].apply(generate_list)\n",
- "df['keywords'] = df['keywords'].apply(generate_list)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Only consider a maximum of 3 genres\n",
- "df['genres'] = df['genres'].apply(lambda x: x[:3])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Print the new features of the first 5 movies along with title\n",
- "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
- "def sanitize(x):\n",
- " if isinstance(x, list):\n",
- " #Strip spaces and convert to lowercase\n",
- " return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
- " else:\n",
- " #Check if director exists. If not, return empty string\n",
- " if isinstance(x, str):\n",
- " return str.lower(x.replace(\" \", \"\"))\n",
- " else:\n",
- " return ''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Apply the sanitize function to cast, director, genres and keywords\n",
- "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
- " df[feature] = df[feature].apply(sanitize)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "#Function that creates a soup out of the desired metadata\n",
- "def create_soup(x):\n",
- " return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create the new soup feature\n",
- "df['soup'] = df.apply(create_soup, axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Display the soup of the first movie\n",
- "df.iloc[0]['soup']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import CountVectorizer\n",
- "from sklearn.feature_extraction.text import CountVectorizer\n",
- "\n",
- "#Define a new CountVectorizer object and create vectors for the soup\n",
- "count = CountVectorizer(stop_words='english')\n",
- "count_matrix = count.fit_transform(df['soup'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Import cosine_similarity function\n",
- "from sklearn.metrics.pairwise import cosine_similarity\n",
- "\n",
- "#Compute the cosine similarity matrix (unlike the tf-idf vectors used earlier, raw count vectors are not normalized, so a plain dot product would not be equivalent)\n",
- "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Reset index of your df and construct reverse mapping again\n",
- "df = df.reset_index()\n",
- "indices2 = pd.Series(df.index, index=df['title'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "content_recommender('The Lion King', cosine_sim2, df, indices2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|