| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Plot Description Based Recommender"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
- "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
- "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
- "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
- "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
- "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
- "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
- "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
- "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
- "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
- "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
- "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
- "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
- "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
- "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
- "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
- "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "%pip install scikit-learn scipy matplotlib"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
- "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
- "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
- "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
- "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
- "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
- "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
- "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
- "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
- "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
- "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
- "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
- "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
- "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
- "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
- "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
- "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
- ]
- }
- ],
- "source": [
- "%pip install scikit-learn scipy matplotlib"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>genres</th>\n",
- " <th>runtime</th>\n",
- " <th>vote_average</th>\n",
- " <th>vote_count</th>\n",
- " <th>year</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>['animation', 'comedy', 'family']</td>\n",
- " <td>81.0</td>\n",
- " <td>7.7</td>\n",
- " <td>5415.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>['adventure', 'fantasy', 'family']</td>\n",
- " <td>104.0</td>\n",
- " <td>6.9</td>\n",
- " <td>2413.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>['romance', 'comedy']</td>\n",
- " <td>101.0</td>\n",
- " <td>6.5</td>\n",
- " <td>92.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>['comedy', 'drama', 'romance']</td>\n",
- " <td>127.0</td>\n",
- " <td>6.1</td>\n",
- " <td>34.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>['comedy']</td>\n",
- " <td>106.0</td>\n",
- " <td>5.7</td>\n",
- " <td>173.0</td>\n",
- " <td>1995</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title genres runtime \\\n",
- "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
- "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
- "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
- "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
- "4 Father of the Bride Part II ['comedy'] 106.0 \n",
- "\n",
- " vote_average vote_count year \n",
- "0 7.7 5415.0 1995 \n",
- "1 6.9 2413.0 1995 \n",
- "2 6.5 92.0 1995 \n",
- "3 6.1 34.0 1995 \n",
- "4 5.7 173.0 1995 "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "\n",
- "#Import data from the clean file \n",
- "df = pd.read_csv('../data/metadata_clean.csv')\n",
- "\n",
- "#Print the head of the cleaned DataFrame\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>genres</th>\n",
- " <th>runtime</th>\n",
- " <th>vote_average</th>\n",
- " <th>vote_count</th>\n",
- " <th>year</th>\n",
- " <th>overview</th>\n",
- " <th>id</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>['animation', 'comedy', 'family']</td>\n",
- " <td>81.0</td>\n",
- " <td>7.7</td>\n",
- " <td>5415.0</td>\n",
- " <td>1995</td>\n",
- " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
- " <td>862</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>['adventure', 'fantasy', 'family']</td>\n",
- " <td>104.0</td>\n",
- " <td>6.9</td>\n",
- " <td>2413.0</td>\n",
- " <td>1995</td>\n",
- " <td>When siblings Judy and Peter discover an encha...</td>\n",
- " <td>8844</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>['romance', 'comedy']</td>\n",
- " <td>101.0</td>\n",
- " <td>6.5</td>\n",
- " <td>92.0</td>\n",
- " <td>1995</td>\n",
- " <td>A family wedding reignites the ancient feud be...</td>\n",
- " <td>15602</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>['comedy', 'drama', 'romance']</td>\n",
- " <td>127.0</td>\n",
- " <td>6.1</td>\n",
- " <td>34.0</td>\n",
- " <td>1995</td>\n",
- " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
- " <td>31357</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>['comedy']</td>\n",
- " <td>106.0</td>\n",
- " <td>5.7</td>\n",
- " <td>173.0</td>\n",
- " <td>1995</td>\n",
- " <td>Just when George Banks has recovered from his ...</td>\n",
- " <td>11862</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title genres runtime \\\n",
- "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
- "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
- "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
- "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
- "4 Father of the Bride Part II ['comedy'] 106.0 \n",
- "\n",
- " vote_average vote_count year \\\n",
- "0 7.7 5415.0 1995 \n",
- "1 6.9 2413.0 1995 \n",
- "2 6.5 92.0 1995 \n",
- "3 6.1 34.0 1995 \n",
- "4 5.7 173.0 1995 \n",
- "\n",
- " overview id \n",
- "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
- "1 When siblings Judy and Peter discover an encha... 8844 \n",
- "2 A family wedding reignites the ancient feud be... 15602 \n",
- "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
- "4 Just when George Banks has recovered from his ... 11862 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Import the original file\n",
- "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
- "\n",
- "#Add the useful features into the cleaned dataframe\n",
- "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
- "\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(45466, 75827)"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Import TfidfVectorizer from the scikit-learn library\n",
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
- "\n",
- "#Define a TF-IDF Vectorizer object. Remove all English stop words\n",
- "tfidf = TfidfVectorizer(stop_words='english')\n",
- "\n",
- "#Replace NaN with an empty string\n",
- "df['overview'] = df['overview'].fillna('')\n",
- "\n",
- "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
- "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
- "\n",
- "#Output the shape of tfidf_matrix\n",
- "tfidf_matrix.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import linear_kernel to compute the dot product\n",
- "from sklearn.metrics.pairwise import linear_kernel\n",
- "\n",
- "# Compute the cosine similarity matrix\n",
- "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Construct a reverse mapping from movie titles to DataFrame indices. Note: drop_duplicates() compares the Series values (the indices), not the titles, so repeated titles are kept\n",
- "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function that takes in movie title as input and gives recommendations \n",
- "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
- " # Obtain the index of the movie that matches the title\n",
- " idx = indices[title]\n",
- "\n",
- " # Get the pairwise similarity scores of all movies with that movie\n",
- " # And convert it into a list of (index, score) tuples\n",
- " sim_scores = list(enumerate(cosine_sim[idx]))\n",
- "\n",
- " # Sort the movies based on the cosine similarity scores\n",
- " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
- "\n",
- " # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
- " sim_scores = sim_scores[1:11]\n",
- "\n",
- " # Get the movie indices\n",
- " movie_indices = [i[0] for i in sim_scores]\n",
- "\n",
- " # Return the top 10 most similar movies\n",
- " return df['title'].iloc[movie_indices]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "34682 How the Lion Cub and the Turtle Sang a Song\n",
- "9353 The Lion King 1½\n",
- "9115 The Lion King 2: Simba's Pride\n",
- "42829 Prey\n",
- "25654 Fearless Fagan\n",
- "17041 African Cats\n",
- "27933 Massaï, les guerriers de la pluie\n",
- "6094 Born Free\n",
- "37409 Sour Grape\n",
- "3203 The Waiting Game\n",
- "Name: title, dtype: object"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Get recommendations for The Lion King\n",
- "content_recommender('The Lion King')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Metadata Based Recommender"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Load the keywords and credits files\n",
- "cred_df = pd.read_csv('../data/credits.csv')\n",
- "key_df = pd.read_csv('../data/keywords.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>cast</th>\n",
- " <th>crew</th>\n",
- " <th>id</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
- " <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
- " <td>862</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
- " <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
- " <td>8844</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
- " <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
- " <td>15602</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
- " <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
- " <td>31357</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
- " <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
- " <td>11862</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " cast \\\n",
- "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
- "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
- "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
- "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
- "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
- "\n",
- " crew id \n",
- "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
- "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
- "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
- "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
- "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Print the head of the credit dataframe\n",
- "cred_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>id</th>\n",
- " <th>keywords</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>862</td>\n",
- " <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>8844</td>\n",
- " <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>15602</td>\n",
- " <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>31357</td>\n",
- " <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>11862</td>\n",
- " <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " id keywords\n",
- "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
- "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
- "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
- "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
- "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Print the head of the keywords dataframe\n",
- "key_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "ename": "ValueError",
- "evalue": "invalid literal for int() with base 10: '1997-08-20'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "Input \u001b[0;32mIn [13]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#Convert the IDs of df into int\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/generic.py:5912\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 5905\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 5906\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miloc[:, i]\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m 5907\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns))\n\u001b[1;32m 5908\u001b[0m ]\n\u001b[1;32m 5910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 5911\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[0;32m-> 5912\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor(new_data)\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5915\u001b[0m \u001b[38;5;66;03m# GH 33113: handle empty frame or series\u001b[39;00m\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:419\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mastype\u001b[39m(\u001b[38;5;28mself\u001b[39m: T, dtype, copy: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, errors: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m T:\n\u001b[0;32m--> 419\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:304\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 304\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mTypeError\u001b[39;00m, \u001b[38;5;167;01mNotImplementedError\u001b[39;00m):\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ignore_failures:\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/blocks.py:580\u001b[0m, in \u001b[0;36mBlock.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;124;03mCoerce to the new dtype.\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[38;5;124;03mBlock\u001b[39;00m\n\u001b[1;32m 577\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 578\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m--> 580\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[1;32m 583\u001b[0m newb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmake_block(new_values)\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1292\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[0;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 1289\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[1;32m 1291\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1292\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1293\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 1294\u001b[0m \u001b[38;5;66;03m# e.g. astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[1;32m 1295\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[1;32m 1296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1237\u001b[0m, in \u001b[0;36mastype_array\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m 1234\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m 1236\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1237\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1154\u001b[0m, in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_object_dtype(arr\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[1;32m 1151\u001b[0m \n\u001b[1;32m 1152\u001b[0m \u001b[38;5;66;03m# work around NumPy brokenness, #1987\u001b[39;00m\n\u001b[1;32m 1153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(dtype\u001b[38;5;241m.\u001b[39mtype, np\u001b[38;5;241m.\u001b[39minteger):\n\u001b[0;32m-> 1154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype_intsafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[1;32m 1157\u001b[0m \u001b[38;5;66;03m# then coerce to a proper dtype and recall astype_nansafe\u001b[39;00m\n\u001b[1;32m 1159\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_datetime64_dtype(dtype):\n",
- "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/_libs/lib.pyx:668\u001b[0m, in \u001b[0;36mpandas._libs.lib.astype_intsafe\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1997-08-20'"
- ]
- }
- ],
- "source": [
- "#Convert the IDs of df into int\n",
- "df['id'] = df['id'].astype('int')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function to convert all non-integer IDs to NaN\n",
- "def clean_ids(x):\n",
- " try:\n",
- " return int(x)\n",
- " except:\n",
- " return np.nan"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Clean the ids of df\n",
- "df['id'] = df['id'].apply(clean_ids)\n",
- "\n",
- "#Filter all rows that have a null ID\n",
- "df = df[df['id'].notnull()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_21647/2304563750.py:2: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df['id'] = df['id'].astype('int')\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>genres</th>\n",
- " <th>runtime</th>\n",
- " <th>vote_average</th>\n",
- " <th>vote_count</th>\n",
- " <th>year</th>\n",
- " <th>overview</th>\n",
- " <th>id</th>\n",
- " <th>cast</th>\n",
- " <th>crew</th>\n",
- " <th>keywords</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>['animation', 'comedy', 'family']</td>\n",
- " <td>81.0</td>\n",
- " <td>7.7</td>\n",
- " <td>5415.0</td>\n",
- " <td>1995</td>\n",
- " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
- " <td>862</td>\n",
- " <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
- " <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
- " <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>['adventure', 'fantasy', 'family']</td>\n",
- " <td>104.0</td>\n",
- " <td>6.9</td>\n",
- " <td>2413.0</td>\n",
- " <td>1995</td>\n",
- " <td>When siblings Judy and Peter discover an encha...</td>\n",
- " <td>8844</td>\n",
- " <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
- " <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
- " <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>['romance', 'comedy']</td>\n",
- " <td>101.0</td>\n",
- " <td>6.5</td>\n",
- " <td>92.0</td>\n",
- " <td>1995</td>\n",
- " <td>A family wedding reignites the ancient feud be...</td>\n",
- " <td>15602</td>\n",
- " <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
- " <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
- " <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>['comedy', 'drama', 'romance']</td>\n",
- " <td>127.0</td>\n",
- " <td>6.1</td>\n",
- " <td>34.0</td>\n",
- " <td>1995</td>\n",
- " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
- " <td>31357</td>\n",
- " <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
- " <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
- " <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>['comedy']</td>\n",
- " <td>106.0</td>\n",
- " <td>5.7</td>\n",
- " <td>173.0</td>\n",
- " <td>1995</td>\n",
- " <td>Just when George Banks has recovered from his ...</td>\n",
- " <td>11862</td>\n",
- " <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
- " <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
- " <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title genres runtime \\\n",
- "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
- "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
- "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
- "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
- "4 Father of the Bride Part II ['comedy'] 106.0 \n",
- "\n",
- " vote_average vote_count year \\\n",
- "0 7.7 5415.0 1995 \n",
- "1 6.9 2413.0 1995 \n",
- "2 6.5 92.0 1995 \n",
- "3 6.1 34.0 1995 \n",
- "4 5.7 173.0 1995 \n",
- "\n",
- " overview id \\\n",
- "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
- "1 When siblings Judy and Peter discover an encha... 8844 \n",
- "2 A family wedding reignites the ancient feud be... 15602 \n",
- "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
- "4 Just when George Banks has recovered from his ... 11862 \n",
- "\n",
- " cast \\\n",
- "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
- "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
- "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
- "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
- "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
- "\n",
- " crew \\\n",
- "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... \n",
- "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... \n",
- "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... \n",
- "3 [{'credit_id': '52fe44779251416c91011acb', 'de... \n",
- "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... \n",
- "\n",
- " keywords \n",
- "0 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... \n",
- "1 [{'id': 10090, 'name': 'board game'}, {'id': 1... \n",
- "2 [{'id': 1495, 'name': 'fishing'}, {'id': 12392... \n",
- "3 [{'id': 818, 'name': 'based on novel'}, {'id':... \n",
- "4 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... "
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Convert IDs into integer\n",
- "df['id'] = df['id'].astype('int')\n",
- "key_df['id'] = key_df['id'].astype('int')\n",
- "cred_df['id'] = cred_df['id'].astype('int')\n",
- "\n",
- "# Merge keywords and credits into your main metadata dataframe\n",
- "df = df.merge(cred_df, on='id')\n",
- "df = df.merge(key_df, on='id')\n",
- "\n",
- "#Display the head of df\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Convert the stringified objects into the native python objects\n",
- "from ast import literal_eval\n",
- "\n",
- "features = ['cast', 'crew', 'keywords', 'genres']\n",
- "for feature in features:\n",
- " df[feature] = df[feature].apply(literal_eval)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'credit_id': '52fe4284c3a36847f8024f49',\n",
- " 'department': 'Directing',\n",
- " 'gender': 2,\n",
- " 'id': 7879,\n",
- " 'job': 'Director',\n",
- " 'name': 'John Lasseter',\n",
- " 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Print the first crew member of the first movie in df\n",
- "df.iloc[0]['crew'][0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Extract the director's name. If director is not listed, return NaN\n",
- "def get_director(x):\n",
- " for crew_member in x:\n",
- " if crew_member['job'] == 'Director':\n",
- " return crew_member['name']\n",
- " return np.nan"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 John Lasseter\n",
- "1 Joe Johnston\n",
- "2 Howard Deutch\n",
- "3 Forest Whitaker\n",
- "4 Charles Shyer\n",
- "Name: director, dtype: object"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Define the new director feature\n",
- "df['director'] = df['crew'].apply(get_director)\n",
- "\n",
- "#Print the directors of the first five movies\n",
- "df['director'].head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Returns the names of the first 3 elements of the list, or of the entire list if it has 3 or fewer.\n",
- "def generate_list(x):\n",
- " if isinstance(x, list):\n",
- " names = [i['name'] for i in x]\n",
- " #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
- " if len(names) > 3:\n",
- " names = names[:3]\n",
- " return names\n",
- "\n",
- " #Return empty list in case of missing/malformed data\n",
- " return []"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Apply the generate_list function to cast and keywords\n",
- "df['cast'] = df['cast'].apply(generate_list)\n",
- "df['keywords'] = df['keywords'].apply(generate_list)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Only consider a maximum of 3 genres\n",
- "df['genres'] = df['genres'].apply(lambda x: x[:3])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>title</th>\n",
- " <th>cast</th>\n",
- " <th>director</th>\n",
- " <th>keywords</th>\n",
- " <th>genres</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>Toy Story</td>\n",
- " <td>[Tom Hanks, Tim Allen, Don Rickles]</td>\n",
- " <td>John Lasseter</td>\n",
- " <td>[jealousy, toy, boy]</td>\n",
- " <td>[animation, comedy, family]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>Jumanji</td>\n",
- " <td>[Robin Williams, Jonathan Hyde, Kirsten Dunst]</td>\n",
- " <td>Joe Johnston</td>\n",
- " <td>[board game, disappearance, based on children'...</td>\n",
- " <td>[adventure, fantasy, family]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>Grumpier Old Men</td>\n",
- " <td>[Walter Matthau, Jack Lemmon, Ann-Margret]</td>\n",
- " <td>Howard Deutch</td>\n",
- " <td>[fishing, best friend, duringcreditsstinger]</td>\n",
- " <td>[romance, comedy]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>Waiting to Exhale</td>\n",
- " <td>[Whitney Houston, Angela Bassett, Loretta Devine]</td>\n",
- " <td>Forest Whitaker</td>\n",
- " <td>[based on novel, interracial relationship, sin...</td>\n",
- " <td>[comedy, drama, romance]</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>Father of the Bride Part II</td>\n",
- " <td>[Steve Martin, Diane Keaton, Martin Short]</td>\n",
- " <td>Charles Shyer</td>\n",
- " <td>[baby, midlife crisis, confidence]</td>\n",
- " <td>[comedy]</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "</div>"
- ],
- "text/plain": [
- " title \\\n",
- "0 Toy Story \n",
- "1 Jumanji \n",
- "2 Grumpier Old Men \n",
- "3 Waiting to Exhale \n",
- "4 Father of the Bride Part II \n",
- "\n",
- " cast director \\\n",
- "0 [Tom Hanks, Tim Allen, Don Rickles] John Lasseter \n",
- "1 [Robin Williams, Jonathan Hyde, Kirsten Dunst] Joe Johnston \n",
- "2 [Walter Matthau, Jack Lemmon, Ann-Margret] Howard Deutch \n",
- "3 [Whitney Houston, Angela Bassett, Loretta Devine] Forest Whitaker \n",
- "4 [Steve Martin, Diane Keaton, Martin Short] Charles Shyer \n",
- "\n",
- " keywords \\\n",
- "0 [jealousy, toy, boy] \n",
- "1 [board game, disappearance, based on children'... \n",
- "2 [fishing, best friend, duringcreditsstinger] \n",
- "3 [based on novel, interracial relationship, sin... \n",
- "4 [baby, midlife crisis, confidence] \n",
- "\n",
- " genres \n",
- "0 [animation, comedy, family] \n",
- "1 [adventure, fantasy, family] \n",
- "2 [romance, comedy] \n",
- "3 [comedy, drama, romance] \n",
- "4 [comedy] "
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Print the new features of the first 5 movies along with title\n",
- "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
- "def sanitize(x):\n",
- " if isinstance(x, list):\n",
- " #Strip spaces and convert to lowercase\n",
- " return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
- " else:\n",
- " #Check if director exists. If not, return empty string\n",
- " if isinstance(x, str):\n",
- " return str.lower(x.replace(\" \", \"\"))\n",
- " else:\n",
- " return ''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Apply the sanitize function to cast, director, genres and keywords\n",
- "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
- " df[feature] = df[feature].apply(sanitize)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "#Function that creates a soup out of the desired metadata\n",
- "def create_soup(x):\n",
- " return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create the new soup feature\n",
- "df['soup'] = df.apply(create_soup, axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "#Display the soup of the first movie\n",
- "df.iloc[0]['soup']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import CountVectorizer\n",
- "from sklearn.feature_extraction.text import CountVectorizer\n",
- "\n",
- "#Define a new CountVectorizer object and create vectors for the soup\n",
- "count = CountVectorizer(stop_words='english')\n",
- "count_matrix = count.fit_transform(df['soup'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Import cosine_similarity function\n",
- "from sklearn.metrics.pairwise import cosine_similarity\n",
- "\n",
- "#Compute the cosine similarity score (count vectors are not unit-normalized, so a plain dot product would not be equivalent here)\n",
- "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Reset index of your df and construct reverse mapping again\n",
- "df = df.reset_index()\n",
- "indices2 = pd.Series(df.index, index=df['title'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "29607 Cheburashka\n",
- "40904 VeggieTales: Josh and the Big Wall\n",
- "40913 VeggieTales: Minnesota Cuke and the Search for...\n",
- "27768 The Little Matchgirl\n",
- "15209 Spiderman: The Ultimate Villain Showdown\n",
- "16613 Cirque du Soleil: Varekai\n",
- "24654 The Seventh Brother\n",
- "29198 Superstar Goofy\n",
- "30244 My Love\n",
- "31179 Pokémon: Arceus and the Jewel of Life\n",
- "Name: title, dtype: object"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "content_recommender('The Lion King', cosine_sim2, df, indices2)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|