tum
/
rec-code


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292
							{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Plot Description Based Recommender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
      "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
      "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
      "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
      "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
      "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
      "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
      "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
      "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install scikit-learn scipy matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
      "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
      "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
      "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
      "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
      "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
      "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
      "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
      "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
      "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install scikit-learn scipy matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>['animation', 'comedy', 'family']</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>['adventure', 'fantasy', 'family']</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>['romance', 'comedy']</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>['comedy', 'drama', 'romance']</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>['comedy']</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title                              genres  runtime  \\\n",
       "0                    Toy Story   ['animation', 'comedy', 'family']     81.0   \n",
       "1                      Jumanji  ['adventure', 'fantasy', 'family']    104.0   \n",
       "2             Grumpier Old Men               ['romance', 'comedy']    101.0   \n",
       "3            Waiting to Exhale      ['comedy', 'drama', 'romance']    127.0   \n",
       "4  Father of the Bride Part II                          ['comedy']    106.0   \n",
       "\n",
       "   vote_average  vote_count  year  \n",
       "0           7.7      5415.0  1995  \n",
       "1           6.9      2413.0  1995  \n",
       "2           6.5        92.0  1995  \n",
       "3           6.1        34.0  1995  \n",
       "4           5.7       173.0  1995  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "#Import data from the clean file \n",
    "df = pd.read_csv('../data/metadata_clean.csv')\n",
    "\n",
    "#Print the head of the cleaned DataFrame\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "      <th>overview</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>['animation', 'comedy', 'family']</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
       "      <td>862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>['adventure', 'fantasy', 'family']</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
       "      <td>8844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>['romance', 'comedy']</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>A family wedding reignites the ancient feud be...</td>\n",
       "      <td>15602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>['comedy', 'drama', 'romance']</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
       "      <td>31357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>['comedy']</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Just when George Banks has recovered from his ...</td>\n",
       "      <td>11862</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title                              genres  runtime  \\\n",
       "0                    Toy Story   ['animation', 'comedy', 'family']     81.0   \n",
       "1                      Jumanji  ['adventure', 'fantasy', 'family']    104.0   \n",
       "2             Grumpier Old Men               ['romance', 'comedy']    101.0   \n",
       "3            Waiting to Exhale      ['comedy', 'drama', 'romance']    127.0   \n",
       "4  Father of the Bride Part II                          ['comedy']    106.0   \n",
       "\n",
       "   vote_average  vote_count  year  \\\n",
       "0           7.7      5415.0  1995   \n",
       "1           6.9      2413.0  1995   \n",
       "2           6.5        92.0  1995   \n",
       "3           6.1        34.0  1995   \n",
       "4           5.7       173.0  1995   \n",
       "\n",
       "                                            overview     id  \n",
       "0  Led by Woody, Andy's toys live happily in his ...    862  \n",
       "1  When siblings Judy and Peter discover an encha...   8844  \n",
       "2  A family wedding reignites the ancient feud be...  15602  \n",
       "3  Cheated on, mistreated and stepped on, the wom...  31357  \n",
       "4  Just when George Banks has recovered from his ...  11862  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Import the original file\n",
    "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
    "\n",
    "#Add the useful features into the cleaned dataframe\n",
    "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(45466, 75827)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Import TfIdfVectorizer from the scikit-learn library\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
    "tfidf = TfidfVectorizer(stop_words='english')\n",
    "\n",
    "#Replace NaN with an empty string\n",
    "df['overview'] = df['overview'].fillna('')\n",
    "\n",
    "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
    "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
    "\n",
    "#Output the shape of tfidf_matrix\n",
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import linear_kernel to compute the dot product\n",
    "from sklearn.metrics.pairwise import linear_kernel\n",
    "\n",
    "# Compute the cosine similarity matrix\n",
    "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any\n",
    "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function that takes in movie title as input and gives recommendations \n",
    "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
    "    # Obtain the index of the movie that matches the title\n",
    "    idx = indices[title]\n",
    "\n",
    "    # Get the pairwsie similarity scores of all movies with that movie\n",
    "    # And convert it into a list of tuples as described above\n",
    "    sim_scores = list(enumerate(cosine_sim[idx]))\n",
    "\n",
    "    # Sort the movies based on the cosine similarity scores\n",
    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
    "\n",
    "    # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
    "    sim_scores = sim_scores[1:11]\n",
    "\n",
    "    # Get the movie indices\n",
    "    movie_indices = [i[0] for i in sim_scores]\n",
    "\n",
    "    # Return the top 10 most similar movies\n",
    "    return df['title'].iloc[movie_indices]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34682    How the Lion Cub and the Turtle Sang a Song\n",
       "9353                                The Lion King 1½\n",
       "9115                  The Lion King 2: Simba's Pride\n",
       "42829                                           Prey\n",
       "25654                                 Fearless Fagan\n",
       "17041                                   African Cats\n",
       "27933              Massaï, les guerriers de la pluie\n",
       "6094                                       Born Free\n",
       "37409                                     Sour Grape\n",
       "3203                                The Waiting Game\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Get recommendations for The Lion King\n",
    "content_recommender('The Lion King')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metadata Based Recommender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the keywords and credits files\n",
    "cred_df = pd.read_csv('../data/credits.csv')\n",
    "key_df = pd.read_csv('../data/keywords.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cast</th>\n",
       "      <th>crew</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
       "      <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
       "      <td>862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
       "      <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
       "      <td>8844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
       "      <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
       "      <td>15602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
       "      <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
       "      <td>31357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
       "      <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
       "      <td>11862</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                cast  \\\n",
       "0  [{'cast_id': 14, 'character': 'Woody (voice)',...   \n",
       "1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   \n",
       "2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   \n",
       "3  [{'cast_id': 1, 'character': \"Savannah 'Vannah...   \n",
       "4  [{'cast_id': 1, 'character': 'George Banks', '...   \n",
       "\n",
       "                                                crew     id  \n",
       "0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862  \n",
       "1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844  \n",
       "2  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602  \n",
       "3  [{'credit_id': '52fe44779251416c91011acb', 'de...  31357  \n",
       "4  [{'credit_id': '52fe44959251416c75039ed7', 'de...  11862  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Print the head of the credit dataframe\n",
    "cred_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>keywords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>862</td>\n",
       "      <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8844</td>\n",
       "      <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15602</td>\n",
       "      <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>31357</td>\n",
       "      <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11862</td>\n",
       "      <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id                                           keywords\n",
       "0    862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
       "1   8844  [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
       "2  15602  [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
       "3  31357  [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
       "4  11862  [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Print the head of the keywords dataframe\n",
    "key_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "invalid literal for int() with base 10: '1997-08-20'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Input \u001b[0;32mIn [13]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m#Convert the IDs of df into int\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/generic.py:5912\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m   5905\u001b[0m     results \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m   5906\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miloc[:, i]\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m   5907\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns))\n\u001b[1;32m   5908\u001b[0m     ]\n\u001b[1;32m   5910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   5911\u001b[0m     \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[0;32m-> 5912\u001b[0m     new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   5913\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor(new_data)\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   5915\u001b[0m \u001b[38;5;66;03m# GH 33113: handle empty frame or series\u001b[39;00m\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:419\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m    418\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mastype\u001b[39m(\u001b[38;5;28mself\u001b[39m: T, dtype, copy: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, errors: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m T:\n\u001b[0;32m--> 419\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:304\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m    302\u001b[0m         applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    303\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 304\u001b[0m         applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    305\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mTypeError\u001b[39;00m, \u001b[38;5;167;01mNotImplementedError\u001b[39;00m):\n\u001b[1;32m    306\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ignore_failures:\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/blocks.py:580\u001b[0m, in \u001b[0;36mBlock.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m    562\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    563\u001b[0m \u001b[38;5;124;03mCoerce to the new dtype.\u001b[39;00m\n\u001b[1;32m    564\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    576\u001b[0m \u001b[38;5;124;03mBlock\u001b[39;00m\n\u001b[1;32m    577\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    578\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m--> 580\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    582\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[1;32m    583\u001b[0m newb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmake_block(new_values)\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1292\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[0;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[1;32m   1289\u001b[0m     dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[1;32m   1291\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1292\u001b[0m     new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1293\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m   1294\u001b[0m     \u001b[38;5;66;03m# e.g. astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[1;32m   1295\u001b[0m     \u001b[38;5;66;03m#  trying to convert to float\u001b[39;00m\n\u001b[1;32m   1296\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1237\u001b[0m, in \u001b[0;36mastype_array\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m   1234\u001b[0m     values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m   1236\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1237\u001b[0m     values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1239\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[1;32m   1240\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1154\u001b[0m, in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[1;32m   1150\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_object_dtype(arr\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[1;32m   1151\u001b[0m \n\u001b[1;32m   1152\u001b[0m     \u001b[38;5;66;03m# work around NumPy brokenness, #1987\u001b[39;00m\n\u001b[1;32m   1153\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(dtype\u001b[38;5;241m.\u001b[39mtype, np\u001b[38;5;241m.\u001b[39minteger):\n\u001b[0;32m-> 1154\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype_intsafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1156\u001b[0m     \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[1;32m   1157\u001b[0m     \u001b[38;5;66;03m# then coerce to a proper dtype and recall astype_nansafe\u001b[39;00m\n\u001b[1;32m   1159\u001b[0m     \u001b[38;5;28;01melif\u001b[39;00m is_datetime64_dtype(dtype):\n",
      "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/_libs/lib.pyx:668\u001b[0m, in \u001b[0;36mpandas._libs.lib.astype_intsafe\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1997-08-20'"
     ]
    }
   ],
   "source": [
    "#Convert the IDs of df into int\n",
    "df['id'] = df['id'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to convert all non-integer IDs to NaN\n",
    "def clean_ids(x):\n",
    "    try:\n",
    "        return int(x)\n",
    "    except:\n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Clean the ids of df\n",
    "df['id'] = df['id'].apply(clean_ids)\n",
    "\n",
    "#Filter all rows that have a null ID\n",
    "df = df[df['id'].notnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_21647/2304563750.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['id'] = df['id'].astype('int')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "      <th>overview</th>\n",
       "      <th>id</th>\n",
       "      <th>cast</th>\n",
       "      <th>crew</th>\n",
       "      <th>keywords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>['animation', 'comedy', 'family']</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
       "      <td>862</td>\n",
       "      <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
       "      <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
       "      <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>['adventure', 'fantasy', 'family']</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
       "      <td>8844</td>\n",
       "      <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
       "      <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
       "      <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>['romance', 'comedy']</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>A family wedding reignites the ancient feud be...</td>\n",
       "      <td>15602</td>\n",
       "      <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
       "      <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
       "      <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>['comedy', 'drama', 'romance']</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
       "      <td>31357</td>\n",
       "      <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
       "      <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
       "      <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>['comedy']</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>Just when George Banks has recovered from his ...</td>\n",
       "      <td>11862</td>\n",
       "      <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
       "      <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
       "      <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title                              genres  runtime  \\\n",
       "0                    Toy Story   ['animation', 'comedy', 'family']     81.0   \n",
       "1                      Jumanji  ['adventure', 'fantasy', 'family']    104.0   \n",
       "2             Grumpier Old Men               ['romance', 'comedy']    101.0   \n",
       "3            Waiting to Exhale      ['comedy', 'drama', 'romance']    127.0   \n",
       "4  Father of the Bride Part II                          ['comedy']    106.0   \n",
       "\n",
       "   vote_average  vote_count  year  \\\n",
       "0           7.7      5415.0  1995   \n",
       "1           6.9      2413.0  1995   \n",
       "2           6.5        92.0  1995   \n",
       "3           6.1        34.0  1995   \n",
       "4           5.7       173.0  1995   \n",
       "\n",
       "                                            overview     id  \\\n",
       "0  Led by Woody, Andy's toys live happily in his ...    862   \n",
       "1  When siblings Judy and Peter discover an encha...   8844   \n",
       "2  A family wedding reignites the ancient feud be...  15602   \n",
       "3  Cheated on, mistreated and stepped on, the wom...  31357   \n",
       "4  Just when George Banks has recovered from his ...  11862   \n",
       "\n",
       "                                                cast  \\\n",
       "0  [{'cast_id': 14, 'character': 'Woody (voice)',...   \n",
       "1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   \n",
       "2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   \n",
       "3  [{'cast_id': 1, 'character': \"Savannah 'Vannah...   \n",
       "4  [{'cast_id': 1, 'character': 'George Banks', '...   \n",
       "\n",
       "                                                crew  \\\n",
       "0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...   \n",
       "1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   \n",
       "2  [{'credit_id': '52fe466a9251416c75077a89', 'de...   \n",
       "3  [{'credit_id': '52fe44779251416c91011acb', 'de...   \n",
       "4  [{'credit_id': '52fe44959251416c75039ed7', 'de...   \n",
       "\n",
       "                                            keywords  \n",
       "0  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...  \n",
       "1  [{'id': 10090, 'name': 'board game'}, {'id': 1...  \n",
       "2  [{'id': 1495, 'name': 'fishing'}, {'id': 12392...  \n",
       "3  [{'id': 818, 'name': 'based on novel'}, {'id':...  \n",
       "4  [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert IDs into integer\n",
    "df['id'] = df['id'].astype('int')\n",
    "key_df['id'] = key_df['id'].astype('int')\n",
    "cred_df['id'] = cred_df['id'].astype('int')\n",
    "\n",
    "# Merge keywords and credits into your main metadata dataframe\n",
    "df = df.merge(cred_df, on='id')\n",
    "df = df.merge(key_df, on='id')\n",
    "\n",
    "#Display the head of df\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the stringified objects into the native python objects\n",
    "from ast import literal_eval\n",
    "\n",
    "features = ['cast', 'crew', 'keywords', 'genres']\n",
    "for feature in features:\n",
    "    df[feature] = df[feature].apply(literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'credit_id': '52fe4284c3a36847f8024f49',\n",
       " 'department': 'Directing',\n",
       " 'gender': 2,\n",
       " 'id': 7879,\n",
       " 'job': 'Director',\n",
       " 'name': 'John Lasseter',\n",
       " 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Print the first cast member of the first movie in df\n",
    "df.iloc[0]['crew'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the director's name. If director is not listed, return NaN\n",
    "def get_director(x):\n",
    "    for crew_member in x:\n",
    "        if crew_member['job'] == 'Director':\n",
    "            return crew_member['name']\n",
    "    return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      John Lasseter\n",
       "1       Joe Johnston\n",
       "2      Howard Deutch\n",
       "3    Forest Whitaker\n",
       "4      Charles Shyer\n",
       "Name: director, dtype: object"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Define the new director feature\n",
    "df['director'] = df['crew'].apply(get_director)\n",
    "\n",
    "#Print the directors of the first five movies\n",
    "df['director'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Returns the list top 3 elements or entire list; whichever is more.\n",
    "def generate_list(x):\n",
    "    if isinstance(x, list):\n",
    "        names = [i['name'] for i in x]\n",
    "        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
    "        if len(names) > 3:\n",
    "            names = names[:3]\n",
    "        return names\n",
    "\n",
    "    #Return empty list in case of missing/malformed data\n",
    "    return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Apply the generate_list function to cast and keywords\n",
    "df['cast'] = df['cast'].apply(generate_list)\n",
    "df['keywords'] = df['keywords'].apply(generate_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Only consider a maximum of 3 genres\n",
    "df['genres'] = df['genres'].apply(lambda x: x[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>cast</th>\n",
       "      <th>director</th>\n",
       "      <th>keywords</th>\n",
       "      <th>genres</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>[Tom Hanks, Tim Allen, Don Rickles]</td>\n",
       "      <td>John Lasseter</td>\n",
       "      <td>[jealousy, toy, boy]</td>\n",
       "      <td>[animation, comedy, family]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>[Robin Williams, Jonathan Hyde, Kirsten Dunst]</td>\n",
       "      <td>Joe Johnston</td>\n",
       "      <td>[board game, disappearance, based on children'...</td>\n",
       "      <td>[adventure, fantasy, family]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>[Walter Matthau, Jack Lemmon, Ann-Margret]</td>\n",
       "      <td>Howard Deutch</td>\n",
       "      <td>[fishing, best friend, duringcreditsstinger]</td>\n",
       "      <td>[romance, comedy]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>[Whitney Houston, Angela Bassett, Loretta Devine]</td>\n",
       "      <td>Forest Whitaker</td>\n",
       "      <td>[based on novel, interracial relationship, sin...</td>\n",
       "      <td>[comedy, drama, romance]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>[Steve Martin, Diane Keaton, Martin Short]</td>\n",
       "      <td>Charles Shyer</td>\n",
       "      <td>[baby, midlife crisis, confidence]</td>\n",
       "      <td>[comedy]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title  \\\n",
       "0                    Toy Story   \n",
       "1                      Jumanji   \n",
       "2             Grumpier Old Men   \n",
       "3            Waiting to Exhale   \n",
       "4  Father of the Bride Part II   \n",
       "\n",
       "                                                cast         director  \\\n",
       "0                [Tom Hanks, Tim Allen, Don Rickles]    John Lasseter   \n",
       "1     [Robin Williams, Jonathan Hyde, Kirsten Dunst]     Joe Johnston   \n",
       "2         [Walter Matthau, Jack Lemmon, Ann-Margret]    Howard Deutch   \n",
       "3  [Whitney Houston, Angela Bassett, Loretta Devine]  Forest Whitaker   \n",
       "4         [Steve Martin, Diane Keaton, Martin Short]    Charles Shyer   \n",
       "\n",
       "                                            keywords  \\\n",
       "0                               [jealousy, toy, boy]   \n",
       "1  [board game, disappearance, based on children'...   \n",
       "2       [fishing, best friend, duringcreditsstinger]   \n",
       "3  [based on novel, interracial relationship, sin...   \n",
       "4                 [baby, midlife crisis, confidence]   \n",
       "\n",
       "                         genres  \n",
       "0   [animation, comedy, family]  \n",
       "1  [adventure, fantasy, family]  \n",
       "2             [romance, comedy]  \n",
       "3      [comedy, drama, romance]  \n",
       "4                      [comedy]  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print the new features of the first 5 movies along with title\n",
    "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
    "def sanitize(x):\n",
    "    if isinstance(x, list):\n",
    "        #Strip spaces and convert to lowercase\n",
    "        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
    "    else:\n",
    "        #Check if director exists. If not, return empty string\n",
    "        if isinstance(x, str):\n",
    "            return str.lower(x.replace(\" \", \"\"))\n",
    "        else:\n",
    "            return ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Apply the generate_list function to cast, keywords, director and genres\n",
    "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
    "    df[feature] = df[feature].apply(sanitize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Function that creates a soup out of the desired metadata\n",
    "def create_soup(x):\n",
    "    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the new soup feature\n",
    "df['soup'] = df.apply(create_soup, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Display the soup of the first movie\n",
    "df.iloc[0]['soup']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import CountVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "#Define a new CountVectorizer object and create vectors for the soup\n",
    "count = CountVectorizer(stop_words='english')\n",
    "count_matrix = count.fit_transform(df['soup'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import cosine_similarity function\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "#Compute the cosine similarity score (equivalent to dot product for tf-idf vectors)\n",
    "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reset index of your df and construct reverse mapping again\n",
    "df = df.reset_index()\n",
    "indices2 = pd.Series(df.index, index=df['title'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "29607                                          Cheburashka\n",
       "40904                   VeggieTales: Josh and the Big Wall\n",
       "40913    VeggieTales: Minnesota Cuke and the Search for...\n",
       "27768                                 The Little Matchgirl\n",
       "15209             Spiderman: The Ultimate Villain Showdown\n",
       "16613                            Cirque du Soleil: Varekai\n",
       "24654                                  The Seventh Brother\n",
       "29198                                      Superstar Goofy\n",
       "30244                                              My Love\n",
       "31179                Pokémon: Arceus and the Jewel of Life\n",
       "Name: title, dtype: object"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "content_recommender('The Lion King', cosine_sim2, df, indices2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}