Aucune description

Content Based Recommenders.ipynb 55KB


  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Plot Description Based Recommender"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 1,
  13. "metadata": {},
  14. "outputs": [
  15. {
  16. "name": "stdout",
  17. "output_type": "stream",
  18. "text": [
  19. "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
  20. "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
  21. "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
  22. "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
  23. "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
  24. "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
  25. "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
  26. "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
  27. "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
  28. "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
  29. "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
  30. "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
  31. "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
  32. "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
  33. "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
  34. "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
  35. "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
  36. ]
  37. }
  38. ],
  39. "source": [
  40. "%pip install scikit-learn scipy matplotlib"
  41. ]
  42. },
  43. {
  44. "cell_type": "code",
  45. "execution_count": 2,
  46. "metadata": {},
  47. "outputs": [
  48. {
  49. "name": "stdout",
  50. "output_type": "stream",
  51. "text": [
  52. "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
  53. "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
  54. "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
  55. "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
  56. "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
  57. "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
  58. "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
  59. "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
  60. "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
  61. "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
  62. "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
  63. "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
  64. "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
  65. "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
  66. "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
  67. "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
  68. "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
  69. ]
  70. }
  71. ],
  72. "source": [
  73. "%pip install scikit-learn scipy matplotlib"
  74. ]
  75. },
  76. {
  77. "cell_type": "code",
  78. "execution_count": 3,
  79. "metadata": {},
  80. "outputs": [
  81. {
  82. "data": {
  83. "text/html": [
  84. "<div>\n",
  85. "<style scoped>\n",
  86. " .dataframe tbody tr th:only-of-type {\n",
  87. " vertical-align: middle;\n",
  88. " }\n",
  89. "\n",
  90. " .dataframe tbody tr th {\n",
  91. " vertical-align: top;\n",
  92. " }\n",
  93. "\n",
  94. " .dataframe thead th {\n",
  95. " text-align: right;\n",
  96. " }\n",
  97. "</style>\n",
  98. "<table border=\"1\" class=\"dataframe\">\n",
  99. " <thead>\n",
  100. " <tr style=\"text-align: right;\">\n",
  101. " <th></th>\n",
  102. " <th>title</th>\n",
  103. " <th>genres</th>\n",
  104. " <th>runtime</th>\n",
  105. " <th>vote_average</th>\n",
  106. " <th>vote_count</th>\n",
  107. " <th>year</th>\n",
  108. " </tr>\n",
  109. " </thead>\n",
  110. " <tbody>\n",
  111. " <tr>\n",
  112. " <th>0</th>\n",
  113. " <td>Toy Story</td>\n",
  114. " <td>['animation', 'comedy', 'family']</td>\n",
  115. " <td>81.0</td>\n",
  116. " <td>7.7</td>\n",
  117. " <td>5415.0</td>\n",
  118. " <td>1995</td>\n",
  119. " </tr>\n",
  120. " <tr>\n",
  121. " <th>1</th>\n",
  122. " <td>Jumanji</td>\n",
  123. " <td>['adventure', 'fantasy', 'family']</td>\n",
  124. " <td>104.0</td>\n",
  125. " <td>6.9</td>\n",
  126. " <td>2413.0</td>\n",
  127. " <td>1995</td>\n",
  128. " </tr>\n",
  129. " <tr>\n",
  130. " <th>2</th>\n",
  131. " <td>Grumpier Old Men</td>\n",
  132. " <td>['romance', 'comedy']</td>\n",
  133. " <td>101.0</td>\n",
  134. " <td>6.5</td>\n",
  135. " <td>92.0</td>\n",
  136. " <td>1995</td>\n",
  137. " </tr>\n",
  138. " <tr>\n",
  139. " <th>3</th>\n",
  140. " <td>Waiting to Exhale</td>\n",
  141. " <td>['comedy', 'drama', 'romance']</td>\n",
  142. " <td>127.0</td>\n",
  143. " <td>6.1</td>\n",
  144. " <td>34.0</td>\n",
  145. " <td>1995</td>\n",
  146. " </tr>\n",
  147. " <tr>\n",
  148. " <th>4</th>\n",
  149. " <td>Father of the Bride Part II</td>\n",
  150. " <td>['comedy']</td>\n",
  151. " <td>106.0</td>\n",
  152. " <td>5.7</td>\n",
  153. " <td>173.0</td>\n",
  154. " <td>1995</td>\n",
  155. " </tr>\n",
  156. " </tbody>\n",
  157. "</table>\n",
  158. "</div>"
  159. ],
  160. "text/plain": [
  161. " title genres runtime \\\n",
  162. "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
  163. "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
  164. "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
  165. "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
  166. "4 Father of the Bride Part II ['comedy'] 106.0 \n",
  167. "\n",
  168. " vote_average vote_count year \n",
  169. "0 7.7 5415.0 1995 \n",
  170. "1 6.9 2413.0 1995 \n",
  171. "2 6.5 92.0 1995 \n",
  172. "3 6.1 34.0 1995 \n",
  173. "4 5.7 173.0 1995 "
  174. ]
  175. },
  176. "execution_count": 3,
  177. "metadata": {},
  178. "output_type": "execute_result"
  179. }
  180. ],
  181. "source": [
  182. "import pandas as pd\n",
  183. "import numpy as np\n",
  184. "\n",
  185. "#Import data from the clean file \n",
  186. "df = pd.read_csv('../data/metadata_clean.csv')\n",
  187. "\n",
  188. "#Print the head of the cleaned DataFrame\n",
  189. "df.head()"
  190. ]
  191. },
  192. {
  193. "cell_type": "code",
  194. "execution_count": 4,
  195. "metadata": {},
  196. "outputs": [
  197. {
  198. "data": {
  199. "text/html": [
  200. "<div>\n",
  201. "<style scoped>\n",
  202. " .dataframe tbody tr th:only-of-type {\n",
  203. " vertical-align: middle;\n",
  204. " }\n",
  205. "\n",
  206. " .dataframe tbody tr th {\n",
  207. " vertical-align: top;\n",
  208. " }\n",
  209. "\n",
  210. " .dataframe thead th {\n",
  211. " text-align: right;\n",
  212. " }\n",
  213. "</style>\n",
  214. "<table border=\"1\" class=\"dataframe\">\n",
  215. " <thead>\n",
  216. " <tr style=\"text-align: right;\">\n",
  217. " <th></th>\n",
  218. " <th>title</th>\n",
  219. " <th>genres</th>\n",
  220. " <th>runtime</th>\n",
  221. " <th>vote_average</th>\n",
  222. " <th>vote_count</th>\n",
  223. " <th>year</th>\n",
  224. " <th>overview</th>\n",
  225. " <th>id</th>\n",
  226. " </tr>\n",
  227. " </thead>\n",
  228. " <tbody>\n",
  229. " <tr>\n",
  230. " <th>0</th>\n",
  231. " <td>Toy Story</td>\n",
  232. " <td>['animation', 'comedy', 'family']</td>\n",
  233. " <td>81.0</td>\n",
  234. " <td>7.7</td>\n",
  235. " <td>5415.0</td>\n",
  236. " <td>1995</td>\n",
  237. " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
  238. " <td>862</td>\n",
  239. " </tr>\n",
  240. " <tr>\n",
  241. " <th>1</th>\n",
  242. " <td>Jumanji</td>\n",
  243. " <td>['adventure', 'fantasy', 'family']</td>\n",
  244. " <td>104.0</td>\n",
  245. " <td>6.9</td>\n",
  246. " <td>2413.0</td>\n",
  247. " <td>1995</td>\n",
  248. " <td>When siblings Judy and Peter discover an encha...</td>\n",
  249. " <td>8844</td>\n",
  250. " </tr>\n",
  251. " <tr>\n",
  252. " <th>2</th>\n",
  253. " <td>Grumpier Old Men</td>\n",
  254. " <td>['romance', 'comedy']</td>\n",
  255. " <td>101.0</td>\n",
  256. " <td>6.5</td>\n",
  257. " <td>92.0</td>\n",
  258. " <td>1995</td>\n",
  259. " <td>A family wedding reignites the ancient feud be...</td>\n",
  260. " <td>15602</td>\n",
  261. " </tr>\n",
  262. " <tr>\n",
  263. " <th>3</th>\n",
  264. " <td>Waiting to Exhale</td>\n",
  265. " <td>['comedy', 'drama', 'romance']</td>\n",
  266. " <td>127.0</td>\n",
  267. " <td>6.1</td>\n",
  268. " <td>34.0</td>\n",
  269. " <td>1995</td>\n",
  270. " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
  271. " <td>31357</td>\n",
  272. " </tr>\n",
  273. " <tr>\n",
  274. " <th>4</th>\n",
  275. " <td>Father of the Bride Part II</td>\n",
  276. " <td>['comedy']</td>\n",
  277. " <td>106.0</td>\n",
  278. " <td>5.7</td>\n",
  279. " <td>173.0</td>\n",
  280. " <td>1995</td>\n",
  281. " <td>Just when George Banks has recovered from his ...</td>\n",
  282. " <td>11862</td>\n",
  283. " </tr>\n",
  284. " </tbody>\n",
  285. "</table>\n",
  286. "</div>"
  287. ],
  288. "text/plain": [
  289. " title genres runtime \\\n",
  290. "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
  291. "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
  292. "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
  293. "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
  294. "4 Father of the Bride Part II ['comedy'] 106.0 \n",
  295. "\n",
  296. " vote_average vote_count year \\\n",
  297. "0 7.7 5415.0 1995 \n",
  298. "1 6.9 2413.0 1995 \n",
  299. "2 6.5 92.0 1995 \n",
  300. "3 6.1 34.0 1995 \n",
  301. "4 5.7 173.0 1995 \n",
  302. "\n",
  303. " overview id \n",
  304. "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
  305. "1 When siblings Judy and Peter discover an encha... 8844 \n",
  306. "2 A family wedding reignites the ancient feud be... 15602 \n",
  307. "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
  308. "4 Just when George Banks has recovered from his ... 11862 "
  309. ]
  310. },
  311. "execution_count": 4,
  312. "metadata": {},
  313. "output_type": "execute_result"
  314. }
  315. ],
  316. "source": [
  317. "#Import the original file\n",
  318. "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
  319. "\n",
  320. "#Copy the overview and id columns into the cleaned dataframe. NOTE(review): column assignment matches rows by index label, so this presumes metadata_clean.csv kept the row order of movies_metadata.csv - confirm\n",
  321. "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
  322. "\n",
  323. "df.head()"
  324. ]
  325. },
  326. {
  327. "cell_type": "code",
  328. "execution_count": 5,
  329. "metadata": {
  330. "scrolled": true
  331. },
  332. "outputs": [
  333. {
  334. "data": {
  335. "text/plain": [
  336. "(45466, 75827)"
  337. ]
  338. },
  339. "execution_count": 5,
  340. "metadata": {},
  341. "output_type": "execute_result"
  342. }
  343. ],
  344. "source": [
  345. "#Import TfIdfVectorizer from the scikit-learn library\n",
  346. "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  347. "\n",
  348. "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
  349. "tfidf = TfidfVectorizer(stop_words='english')\n",
  350. "\n",
  351. "#Replace NaN with an empty string\n",
  352. "df['overview'] = df['overview'].fillna('')\n",
  353. "\n",
  354. "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
  355. "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
  356. "\n",
  357. "#Output the shape of tfidf_matrix\n",
  358. "tfidf_matrix.shape"
  359. ]
  360. },
  361. {
  362. "cell_type": "code",
  363. "execution_count": null,
  364. "metadata": {},
  365. "outputs": [],
  366. "source": []
  367. },
  368. {
  369. "cell_type": "code",
  370. "execution_count": 6,
  371. "metadata": {},
  372. "outputs": [],
  373. "source": [
  374. "# Import linear_kernel to compute pairwise dot products\n",
  375. "from sklearn.metrics.pairwise import linear_kernel\n",
  376. "\n",
  377. "# Compute the cosine similarity matrix. A plain dot product equals cosine similarity only if the TF-IDF rows are L2-normalised (presumably TfidfVectorizer's default norm - confirm). NOTE(review): with 45466 rows this presumably builds a dense 45466 x 45466 array, which needs a lot of memory\n",
  378. "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
  379. ]
  380. },
  381. {
  382. "cell_type": "code",
  383. "execution_count": 7,
  384. "metadata": {},
  385. "outputs": [],
  386. "source": [
  387. "#Construct a reverse mapping from movie title to row position; for a duplicated title keep only its first row (Series.drop_duplicates would compare the values, i.e. the unique row positions, and drop nothing)\n",
  388. "indices = pd.Series(df.index, index=df['title']).loc[lambda s: ~s.index.duplicated(keep='first')]"
  389. ]
  390. },
  391. {
  392. "cell_type": "code",
  393. "execution_count": 8,
  394. "metadata": {},
  395. "outputs": [],
  396. "source": [
  397. "# Function that takes in movie title as input and gives recommendations \n",
  398. "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
  399. "    \"\"\"Return the titles of the 10 movies most similar to `title`.\n",
  400. "\n",
  401. "    `indices` maps a title to its row position in `cosine_sim` and `df`; an unknown\n",
  402. "    title raises KeyError. The query movie itself is never part of the result.\n",
  403. "    \"\"\"\n",
  404. "    # Obtain the index of the movie that matches the title\n",
  405. "    idx = indices[title]\n",
  406. "    # A title shared by several movies yields a Series; use its first match\n",
  407. "    if isinstance(idx, pd.Series):\n",
  408. "        idx = idx.iloc[0]\n",
  409. "\n",
  410. "    # Pair every movie position with its similarity score to that movie, skipping\n",
  411. "    # the movie itself (it is not guaranteed to sort first when scores are tied)\n",
  412. "    sim_scores = [(i, s) for i, s in enumerate(cosine_sim[idx]) if i != idx]\n",
  413. "\n",
  414. "    # Keep the 10 highest-scoring movies (stable sort, so ties keep row order)\n",
  415. "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]\n",
  416. "    return df['title'].iloc[[i for i, _ in sim_scores]]"
  417. ]
  418. },
  419. {
  420. "cell_type": "code",
  421. "execution_count": 9,
  422. "metadata": {},
  423. "outputs": [
  424. {
  425. "data": {
  426. "text/plain": [
  427. "34682 How the Lion Cub and the Turtle Sang a Song\n",
  428. "9353 The Lion King 1½\n",
  429. "9115 The Lion King 2: Simba's Pride\n",
  430. "42829 Prey\n",
  431. "25654 Fearless Fagan\n",
  432. "17041 African Cats\n",
  433. "27933 Massaï, les guerriers de la pluie\n",
  434. "6094 Born Free\n",
  435. "37409 Sour Grape\n",
  436. "3203 The Waiting Game\n",
  437. "Name: title, dtype: object"
  438. ]
  439. },
  440. "execution_count": 9,
  441. "metadata": {},
  442. "output_type": "execute_result"
  443. }
  444. ],
  445. "source": [
  446. "#Get recommendations for The Lion King\n",
  447. "content_recommender('The Lion King')"
  448. ]
  449. },
  450. {
  451. "cell_type": "markdown",
  452. "metadata": {},
  453. "source": [
  454. "# Metadata Based Recommender"
  455. ]
  456. },
  457. {
  458. "cell_type": "code",
  459. "execution_count": 10,
  460. "metadata": {},
  461. "outputs": [],
  462. "source": [
  463. "# Load the keywords and credits files\n",
  464. "cred_df = pd.read_csv('../data/credits.csv')\n",
  465. "key_df = pd.read_csv('../data/keywords.csv')"
  466. ]
  467. },
  468. {
  469. "cell_type": "code",
  470. "execution_count": 11,
  471. "metadata": {},
  472. "outputs": [
  473. {
  474. "data": {
  475. "text/html": [
  476. "<div>\n",
  477. "<style scoped>\n",
  478. " .dataframe tbody tr th:only-of-type {\n",
  479. " vertical-align: middle;\n",
  480. " }\n",
  481. "\n",
  482. " .dataframe tbody tr th {\n",
  483. " vertical-align: top;\n",
  484. " }\n",
  485. "\n",
  486. " .dataframe thead th {\n",
  487. " text-align: right;\n",
  488. " }\n",
  489. "</style>\n",
  490. "<table border=\"1\" class=\"dataframe\">\n",
  491. " <thead>\n",
  492. " <tr style=\"text-align: right;\">\n",
  493. " <th></th>\n",
  494. " <th>cast</th>\n",
  495. " <th>crew</th>\n",
  496. " <th>id</th>\n",
  497. " </tr>\n",
  498. " </thead>\n",
  499. " <tbody>\n",
  500. " <tr>\n",
  501. " <th>0</th>\n",
  502. " <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
  503. " <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
  504. " <td>862</td>\n",
  505. " </tr>\n",
  506. " <tr>\n",
  507. " <th>1</th>\n",
  508. " <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
  509. " <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
  510. " <td>8844</td>\n",
  511. " </tr>\n",
  512. " <tr>\n",
  513. " <th>2</th>\n",
  514. " <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
  515. " <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
  516. " <td>15602</td>\n",
  517. " </tr>\n",
  518. " <tr>\n",
  519. " <th>3</th>\n",
  520. " <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
  521. " <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
  522. " <td>31357</td>\n",
  523. " </tr>\n",
  524. " <tr>\n",
  525. " <th>4</th>\n",
  526. " <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
  527. " <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
  528. " <td>11862</td>\n",
  529. " </tr>\n",
  530. " </tbody>\n",
  531. "</table>\n",
  532. "</div>"
  533. ],
  534. "text/plain": [
  535. " cast \\\n",
  536. "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
  537. "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
  538. "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
  539. "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
  540. "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
  541. "\n",
  542. " crew id \n",
  543. "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
  544. "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
  545. "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
  546. "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
  547. "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
  548. ]
  549. },
  550. "execution_count": 11,
  551. "metadata": {},
  552. "output_type": "execute_result"
  553. }
  554. ],
  555. "source": [
  556. "#Print the head of the credit dataframe\n",
  557. "cred_df.head()"
  558. ]
  559. },
  560. {
  561. "cell_type": "code",
  562. "execution_count": 12,
  563. "metadata": {},
  564. "outputs": [
  565. {
  566. "data": {
  567. "text/html": [
  568. "<div>\n",
  569. "<style scoped>\n",
  570. " .dataframe tbody tr th:only-of-type {\n",
  571. " vertical-align: middle;\n",
  572. " }\n",
  573. "\n",
  574. " .dataframe tbody tr th {\n",
  575. " vertical-align: top;\n",
  576. " }\n",
  577. "\n",
  578. " .dataframe thead th {\n",
  579. " text-align: right;\n",
  580. " }\n",
  581. "</style>\n",
  582. "<table border=\"1\" class=\"dataframe\">\n",
  583. " <thead>\n",
  584. " <tr style=\"text-align: right;\">\n",
  585. " <th></th>\n",
  586. " <th>id</th>\n",
  587. " <th>keywords</th>\n",
  588. " </tr>\n",
  589. " </thead>\n",
  590. " <tbody>\n",
  591. " <tr>\n",
  592. " <th>0</th>\n",
  593. " <td>862</td>\n",
  594. " <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
  595. " </tr>\n",
  596. " <tr>\n",
  597. " <th>1</th>\n",
  598. " <td>8844</td>\n",
  599. " <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
  600. " </tr>\n",
  601. " <tr>\n",
  602. " <th>2</th>\n",
  603. " <td>15602</td>\n",
  604. " <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
  605. " </tr>\n",
  606. " <tr>\n",
  607. " <th>3</th>\n",
  608. " <td>31357</td>\n",
  609. " <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
  610. " </tr>\n",
  611. " <tr>\n",
  612. " <th>4</th>\n",
  613. " <td>11862</td>\n",
  614. " <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
  615. " </tr>\n",
  616. " </tbody>\n",
  617. "</table>\n",
  618. "</div>"
  619. ],
  620. "text/plain": [
  621. " id keywords\n",
  622. "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
  623. "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
  624. "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
  625. "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
  626. "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
  627. ]
  628. },
  629. "execution_count": 12,
  630. "metadata": {},
  631. "output_type": "execute_result"
  632. }
  633. ],
  634. "source": [
  635. "#Print the head of the keywords dataframe\n",
  636. "key_df.head()"
  637. ]
  638. },
  639. {
  640. "cell_type": "code",
  641. "execution_count": 13,
  642. "metadata": {},
  643. "outputs": [
  644. {
  645. "ename": "ValueError",
  646. "evalue": "invalid literal for int() with base 10: '1997-08-20'",
  647. "output_type": "error",
  648. "traceback": [
  649. "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  650. "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
  651. "Input \u001b[0;32mIn [13]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#Convert the IDs of df into int\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mint\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
  652. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/generic.py:5912\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 5905\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 5906\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miloc[:, i]\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m 5907\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns))\n\u001b[1;32m 5908\u001b[0m ]\n\u001b[1;32m 5910\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 5911\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[0;32m-> 5912\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5913\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor(new_data)\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 5915\u001b[0m \u001b[38;5;66;03m# GH 33113: handle empty frame or series\u001b[39;00m\n",
  653. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:419\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mastype\u001b[39m(\u001b[38;5;28mself\u001b[39m: T, dtype, copy: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, errors: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraise\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m T:\n\u001b[0;32m--> 419\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n",
  654. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/managers.py:304\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 302\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 304\u001b[0m applied \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mTypeError\u001b[39;00m, \u001b[38;5;167;01mNotImplementedError\u001b[39;00m):\n\u001b[1;32m 306\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m ignore_failures:\n",
  655. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/internals/blocks.py:580\u001b[0m, in \u001b[0;36mBlock.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;124;03mCoerce to the new dtype.\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 576\u001b[0m \u001b[38;5;124;03mBlock\u001b[39;00m\n\u001b[1;32m 577\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 578\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m--> 580\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[1;32m 583\u001b[0m newb \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmake_block(new_values)\n",
  656. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1292\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[0;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 1289\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[1;32m 1291\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1292\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1293\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 1294\u001b[0m \u001b[38;5;66;03m# e.g. astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[1;32m 1295\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[1;32m 1296\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
  657. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1237\u001b[0m, in \u001b[0;36mastype_array\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m 1234\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m 1236\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1237\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1239\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[1;32m 1240\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n",
  658. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1154\u001b[0m, in \u001b[0;36mastype_nansafe\u001b[0;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_object_dtype(arr\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[1;32m 1151\u001b[0m \n\u001b[1;32m 1152\u001b[0m \u001b[38;5;66;03m# work around NumPy brokenness, #1987\u001b[39;00m\n\u001b[1;32m 1153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(dtype\u001b[38;5;241m.\u001b[39mtype, np\u001b[38;5;241m.\u001b[39minteger):\n\u001b[0;32m-> 1154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype_intsafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1156\u001b[0m \u001b[38;5;66;03m# if we have a datetime/timedelta array of objects\u001b[39;00m\n\u001b[1;32m 1157\u001b[0m \u001b[38;5;66;03m# then coerce to a proper dtype and recall astype_nansafe\u001b[39;00m\n\u001b[1;32m 1159\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_datetime64_dtype(dtype):\n",
  659. "File \u001b[0;32m/usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages/pandas/_libs/lib.pyx:668\u001b[0m, in \u001b[0;36mpandas._libs.lib.astype_intsafe\u001b[0;34m()\u001b[0m\n",
  660. "\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: '1997-08-20'"
  661. ]
  662. }
  663. ],
  664. "source": [
  665. "#Convert the IDs of df into int (this raises ValueError: some ids are not numeric, e.g. '1997-08-20')\n",
  666. "df['id'] = df['id'].astype('int')"
  667. ]
  668. },
  669. {
  670. "cell_type": "code",
  671. "execution_count": 14,
  672. "metadata": {},
  673. "outputs": [],
  674. "source": [
  675. "# Function to convert all non-integer IDs to NaN\n",
  676. "def clean_ids(x):\n",
  677. " try:\n",
  678. " return int(x)\n",
  679. " except:\n",
  680. " return np.nan"
  681. ]
  682. },
  683. {
  684. "cell_type": "code",
  685. "execution_count": 15,
  686. "metadata": {},
  687. "outputs": [],
  688. "source": [
  689. "#Clean the ids of df\n",
  690. "df['id'] = df['id'].apply(clean_ids)\n",
  691. "\n",
  692. "#Filter all rows that have a null ID\n",
  693. "df = df[df['id'].notnull()]"
  694. ]
  695. },
  696. {
  697. "cell_type": "code",
  698. "execution_count": 16,
  699. "metadata": {},
  700. "outputs": [
  701. {
  702. "name": "stderr",
  703. "output_type": "stream",
  704. "text": [
  705. "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_21647/2304563750.py:2: SettingWithCopyWarning: \n",
  706. "A value is trying to be set on a copy of a slice from a DataFrame.\n",
  707. "Try using .loc[row_indexer,col_indexer] = value instead\n",
  708. "\n",
  709. "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
  710. " df['id'] = df['id'].astype('int')\n"
  711. ]
  712. },
  713. {
  714. "data": {
  715. "text/html": [
  716. "<div>\n",
  717. "<style scoped>\n",
  718. " .dataframe tbody tr th:only-of-type {\n",
  719. " vertical-align: middle;\n",
  720. " }\n",
  721. "\n",
  722. " .dataframe tbody tr th {\n",
  723. " vertical-align: top;\n",
  724. " }\n",
  725. "\n",
  726. " .dataframe thead th {\n",
  727. " text-align: right;\n",
  728. " }\n",
  729. "</style>\n",
  730. "<table border=\"1\" class=\"dataframe\">\n",
  731. " <thead>\n",
  732. " <tr style=\"text-align: right;\">\n",
  733. " <th></th>\n",
  734. " <th>title</th>\n",
  735. " <th>genres</th>\n",
  736. " <th>runtime</th>\n",
  737. " <th>vote_average</th>\n",
  738. " <th>vote_count</th>\n",
  739. " <th>year</th>\n",
  740. " <th>overview</th>\n",
  741. " <th>id</th>\n",
  742. " <th>cast</th>\n",
  743. " <th>crew</th>\n",
  744. " <th>keywords</th>\n",
  745. " </tr>\n",
  746. " </thead>\n",
  747. " <tbody>\n",
  748. " <tr>\n",
  749. " <th>0</th>\n",
  750. " <td>Toy Story</td>\n",
  751. " <td>['animation', 'comedy', 'family']</td>\n",
  752. " <td>81.0</td>\n",
  753. " <td>7.7</td>\n",
  754. " <td>5415.0</td>\n",
  755. " <td>1995</td>\n",
  756. " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
  757. " <td>862</td>\n",
  758. " <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n",
  759. " <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n",
  760. " <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n",
  761. " </tr>\n",
  762. " <tr>\n",
  763. " <th>1</th>\n",
  764. " <td>Jumanji</td>\n",
  765. " <td>['adventure', 'fantasy', 'family']</td>\n",
  766. " <td>104.0</td>\n",
  767. " <td>6.9</td>\n",
  768. " <td>2413.0</td>\n",
  769. " <td>1995</td>\n",
  770. " <td>When siblings Judy and Peter discover an encha...</td>\n",
  771. " <td>8844</td>\n",
  772. " <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n",
  773. " <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n",
  774. " <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n",
  775. " </tr>\n",
  776. " <tr>\n",
  777. " <th>2</th>\n",
  778. " <td>Grumpier Old Men</td>\n",
  779. " <td>['romance', 'comedy']</td>\n",
  780. " <td>101.0</td>\n",
  781. " <td>6.5</td>\n",
  782. " <td>92.0</td>\n",
  783. " <td>1995</td>\n",
  784. " <td>A family wedding reignites the ancient feud be...</td>\n",
  785. " <td>15602</td>\n",
  786. " <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n",
  787. " <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n",
  788. " <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n",
  789. " </tr>\n",
  790. " <tr>\n",
  791. " <th>3</th>\n",
  792. " <td>Waiting to Exhale</td>\n",
  793. " <td>['comedy', 'drama', 'romance']</td>\n",
  794. " <td>127.0</td>\n",
  795. " <td>6.1</td>\n",
  796. " <td>34.0</td>\n",
  797. " <td>1995</td>\n",
  798. " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
  799. " <td>31357</td>\n",
  800. " <td>[{'cast_id': 1, 'character': \"Savannah 'Vannah...</td>\n",
  801. " <td>[{'credit_id': '52fe44779251416c91011acb', 'de...</td>\n",
  802. " <td>[{'id': 818, 'name': 'based on novel'}, {'id':...</td>\n",
  803. " </tr>\n",
  804. " <tr>\n",
  805. " <th>4</th>\n",
  806. " <td>Father of the Bride Part II</td>\n",
  807. " <td>['comedy']</td>\n",
  808. " <td>106.0</td>\n",
  809. " <td>5.7</td>\n",
  810. " <td>173.0</td>\n",
  811. " <td>1995</td>\n",
  812. " <td>Just when George Banks has recovered from his ...</td>\n",
  813. " <td>11862</td>\n",
  814. " <td>[{'cast_id': 1, 'character': 'George Banks', '...</td>\n",
  815. " <td>[{'credit_id': '52fe44959251416c75039ed7', 'de...</td>\n",
  816. " <td>[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...</td>\n",
  817. " </tr>\n",
  818. " </tbody>\n",
  819. "</table>\n",
  820. "</div>"
  821. ],
  822. "text/plain": [
  823. " title genres runtime \\\n",
  824. "0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
  825. "1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
  826. "2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
  827. "3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
  828. "4 Father of the Bride Part II ['comedy'] 106.0 \n",
  829. "\n",
  830. " vote_average vote_count year \\\n",
  831. "0 7.7 5415.0 1995 \n",
  832. "1 6.9 2413.0 1995 \n",
  833. "2 6.5 92.0 1995 \n",
  834. "3 6.1 34.0 1995 \n",
  835. "4 5.7 173.0 1995 \n",
  836. "\n",
  837. " overview id \\\n",
  838. "0 Led by Woody, Andy's toys live happily in his ... 862 \n",
  839. "1 When siblings Judy and Peter discover an encha... 8844 \n",
  840. "2 A family wedding reignites the ancient feud be... 15602 \n",
  841. "3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
  842. "4 Just when George Banks has recovered from his ... 11862 \n",
  843. "\n",
  844. " cast \\\n",
  845. "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
  846. "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
  847. "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
  848. "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
  849. "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
  850. "\n",
  851. " crew \\\n",
  852. "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... \n",
  853. "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... \n",
  854. "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... \n",
  855. "3 [{'credit_id': '52fe44779251416c91011acb', 'de... \n",
  856. "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... \n",
  857. "\n",
  858. " keywords \n",
  859. "0 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... \n",
  860. "1 [{'id': 10090, 'name': 'board game'}, {'id': 1... \n",
  861. "2 [{'id': 1495, 'name': 'fishing'}, {'id': 12392... \n",
  862. "3 [{'id': 818, 'name': 'based on novel'}, {'id':... \n",
  863. "4 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... "
  864. ]
  865. },
  866. "execution_count": 16,
  867. "metadata": {},
  868. "output_type": "execute_result"
  869. }
  870. ],
  871. "source": [
  872. "# Convert IDs into integer\n",
  873. "df['id'] = df['id'].astype('int')\n",
  874. "key_df['id'] = key_df['id'].astype('int')\n",
  875. "cred_df['id'] = cred_df['id'].astype('int')\n",
  876. "\n",
  877. "# Merge keywords and credits into your main metadata dataframe\n",
  878. "df = df.merge(cred_df, on='id')\n",
  879. "df = df.merge(key_df, on='id')\n",
  880. "\n",
  881. "#Display the head of df\n",
  882. "df.head()"
  883. ]
  884. },
  885. {
  886. "cell_type": "code",
  887. "execution_count": 17,
  888. "metadata": {},
  889. "outputs": [],
  890. "source": [
  891. "# Convert the stringified objects into the native python objects\n",
  892. "from ast import literal_eval\n",
  893. "\n",
  894. "features = ['cast', 'crew', 'keywords', 'genres']\n",
  895. "for feature in features:\n",
  896. " df[feature] = df[feature].apply(literal_eval)"
  897. ]
  898. },
  899. {
  900. "cell_type": "code",
  901. "execution_count": 18,
  902. "metadata": {},
  903. "outputs": [
  904. {
  905. "data": {
  906. "text/plain": [
  907. "{'credit_id': '52fe4284c3a36847f8024f49',\n",
  908. " 'department': 'Directing',\n",
  909. " 'gender': 2,\n",
  910. " 'id': 7879,\n",
  911. " 'job': 'Director',\n",
  912. " 'name': 'John Lasseter',\n",
  913. " 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}"
  914. ]
  915. },
  916. "execution_count": 18,
  917. "metadata": {},
  918. "output_type": "execute_result"
  919. }
  920. ],
  921. "source": [
  922. "#Print the first crew member of the first movie in df\n",
  923. "df.iloc[0]['crew'][0]"
  924. ]
  925. },
  926. {
  927. "cell_type": "code",
  928. "execution_count": 19,
  929. "metadata": {},
  930. "outputs": [],
  931. "source": [
  932. "# Extract the director's name. If director is not listed, return NaN\n",
  933. "def get_director(x):\n",
  934. " for crew_member in x:\n",
  935. " if crew_member['job'] == 'Director':\n",
  936. " return crew_member['name']\n",
  937. " return np.nan"
  938. ]
  939. },
  940. {
  941. "cell_type": "code",
  942. "execution_count": 20,
  943. "metadata": {},
  944. "outputs": [
  945. {
  946. "data": {
  947. "text/plain": [
  948. "0 John Lasseter\n",
  949. "1 Joe Johnston\n",
  950. "2 Howard Deutch\n",
  951. "3 Forest Whitaker\n",
  952. "4 Charles Shyer\n",
  953. "Name: director, dtype: object"
  954. ]
  955. },
  956. "execution_count": 20,
  957. "metadata": {},
  958. "output_type": "execute_result"
  959. }
  960. ],
  961. "source": [
  962. "#Define the new director feature\n",
  963. "df['director'] = df['crew'].apply(get_director)\n",
  964. "\n",
  965. "#Print the directors of the first five movies\n",
  966. "df['director'].head()"
  967. ]
  968. },
  969. {
  970. "cell_type": "code",
  971. "execution_count": 21,
  972. "metadata": {},
  973. "outputs": [],
  974. "source": [
  975. "# Returns the names of the first 3 elements, or of all elements if the list has 3 or fewer.\n",
  976. "def generate_list(x):\n",
  977. " if isinstance(x, list):\n",
  978. " names = [i['name'] for i in x]\n",
  979. " #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
  980. " if len(names) > 3:\n",
  981. " names = names[:3]\n",
  982. " return names\n",
  983. "\n",
  984. " #Return empty list in case of missing/malformed data\n",
  985. " return []"
  986. ]
  987. },
  988. {
  989. "cell_type": "code",
  990. "execution_count": 22,
  991. "metadata": {},
  992. "outputs": [],
  993. "source": [
  994. "#Apply the generate_list function to cast and keywords\n",
  995. "df['cast'] = df['cast'].apply(generate_list)\n",
  996. "df['keywords'] = df['keywords'].apply(generate_list)"
  997. ]
  998. },
  999. {
  1000. "cell_type": "code",
  1001. "execution_count": 23,
  1002. "metadata": {},
  1003. "outputs": [],
  1004. "source": [
  1005. "#Only consider a maximum of 3 genres\n",
  1006. "df['genres'] = df['genres'].apply(lambda x: x[:3])"
  1007. ]
  1008. },
  1009. {
  1010. "cell_type": "code",
  1011. "execution_count": 24,
  1012. "metadata": {},
  1013. "outputs": [
  1014. {
  1015. "data": {
  1016. "text/html": [
  1017. "<div>\n",
  1018. "<style scoped>\n",
  1019. " .dataframe tbody tr th:only-of-type {\n",
  1020. " vertical-align: middle;\n",
  1021. " }\n",
  1022. "\n",
  1023. " .dataframe tbody tr th {\n",
  1024. " vertical-align: top;\n",
  1025. " }\n",
  1026. "\n",
  1027. " .dataframe thead th {\n",
  1028. " text-align: right;\n",
  1029. " }\n",
  1030. "</style>\n",
  1031. "<table border=\"1\" class=\"dataframe\">\n",
  1032. " <thead>\n",
  1033. " <tr style=\"text-align: right;\">\n",
  1034. " <th></th>\n",
  1035. " <th>title</th>\n",
  1036. " <th>cast</th>\n",
  1037. " <th>director</th>\n",
  1038. " <th>keywords</th>\n",
  1039. " <th>genres</th>\n",
  1040. " </tr>\n",
  1041. " </thead>\n",
  1042. " <tbody>\n",
  1043. " <tr>\n",
  1044. " <th>0</th>\n",
  1045. " <td>Toy Story</td>\n",
  1046. " <td>[Tom Hanks, Tim Allen, Don Rickles]</td>\n",
  1047. " <td>John Lasseter</td>\n",
  1048. " <td>[jealousy, toy, boy]</td>\n",
  1049. " <td>[animation, comedy, family]</td>\n",
  1050. " </tr>\n",
  1051. " <tr>\n",
  1052. " <th>1</th>\n",
  1053. " <td>Jumanji</td>\n",
  1054. " <td>[Robin Williams, Jonathan Hyde, Kirsten Dunst]</td>\n",
  1055. " <td>Joe Johnston</td>\n",
  1056. " <td>[board game, disappearance, based on children'...</td>\n",
  1057. " <td>[adventure, fantasy, family]</td>\n",
  1058. " </tr>\n",
  1059. " <tr>\n",
  1060. " <th>2</th>\n",
  1061. " <td>Grumpier Old Men</td>\n",
  1062. " <td>[Walter Matthau, Jack Lemmon, Ann-Margret]</td>\n",
  1063. " <td>Howard Deutch</td>\n",
  1064. " <td>[fishing, best friend, duringcreditsstinger]</td>\n",
  1065. " <td>[romance, comedy]</td>\n",
  1066. " </tr>\n",
  1067. " <tr>\n",
  1068. " <th>3</th>\n",
  1069. " <td>Waiting to Exhale</td>\n",
  1070. " <td>[Whitney Houston, Angela Bassett, Loretta Devine]</td>\n",
  1071. " <td>Forest Whitaker</td>\n",
  1072. " <td>[based on novel, interracial relationship, sin...</td>\n",
  1073. " <td>[comedy, drama, romance]</td>\n",
  1074. " </tr>\n",
  1075. " <tr>\n",
  1076. " <th>4</th>\n",
  1077. " <td>Father of the Bride Part II</td>\n",
  1078. " <td>[Steve Martin, Diane Keaton, Martin Short]</td>\n",
  1079. " <td>Charles Shyer</td>\n",
  1080. " <td>[baby, midlife crisis, confidence]</td>\n",
  1081. " <td>[comedy]</td>\n",
  1082. " </tr>\n",
  1083. " </tbody>\n",
  1084. "</table>\n",
  1085. "</div>"
  1086. ],
  1087. "text/plain": [
  1088. " title \\\n",
  1089. "0 Toy Story \n",
  1090. "1 Jumanji \n",
  1091. "2 Grumpier Old Men \n",
  1092. "3 Waiting to Exhale \n",
  1093. "4 Father of the Bride Part II \n",
  1094. "\n",
  1095. " cast director \\\n",
  1096. "0 [Tom Hanks, Tim Allen, Don Rickles] John Lasseter \n",
  1097. "1 [Robin Williams, Jonathan Hyde, Kirsten Dunst] Joe Johnston \n",
  1098. "2 [Walter Matthau, Jack Lemmon, Ann-Margret] Howard Deutch \n",
  1099. "3 [Whitney Houston, Angela Bassett, Loretta Devine] Forest Whitaker \n",
  1100. "4 [Steve Martin, Diane Keaton, Martin Short] Charles Shyer \n",
  1101. "\n",
  1102. " keywords \\\n",
  1103. "0 [jealousy, toy, boy] \n",
  1104. "1 [board game, disappearance, based on children'... \n",
  1105. "2 [fishing, best friend, duringcreditsstinger] \n",
  1106. "3 [based on novel, interracial relationship, sin... \n",
  1107. "4 [baby, midlife crisis, confidence] \n",
  1108. "\n",
  1109. " genres \n",
  1110. "0 [animation, comedy, family] \n",
  1111. "1 [adventure, fantasy, family] \n",
  1112. "2 [romance, comedy] \n",
  1113. "3 [comedy, drama, romance] \n",
  1114. "4 [comedy] "
  1115. ]
  1116. },
  1117. "execution_count": 24,
  1118. "metadata": {},
  1119. "output_type": "execute_result"
  1120. }
  1121. ],
  1122. "source": [
  1123. "# Print the new features of the first 5 movies along with title\n",
  1124. "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
  1125. ]
  1126. },
  1127. {
  1128. "cell_type": "code",
  1129. "execution_count": 25,
  1130. "metadata": {},
  1131. "outputs": [],
  1132. "source": [
  1133. "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
  1134. "def sanitize(x):\n",
  1135. " if isinstance(x, list):\n",
  1136. " #Strip spaces and convert to lowercase\n",
  1137. " return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
  1138. " else:\n",
  1139. " #Check if director exists. If not, return empty string\n",
  1140. " if isinstance(x, str):\n",
  1141. " return str.lower(x.replace(\" \", \"\"))\n",
  1142. " else:\n",
  1143. " return ''"
  1144. ]
  1145. },
  1146. {
  1147. "cell_type": "code",
  1148. "execution_count": 26,
  1149. "metadata": {},
  1150. "outputs": [],
  1151. "source": [
  1152. "#Apply the sanitize function to cast, keywords, director and genres\n",
  1153. "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
  1154. " df[feature] = df[feature].apply(sanitize)"
  1155. ]
  1156. },
  1157. {
  1158. "cell_type": "code",
  1159. "execution_count": 27,
  1160. "metadata": {
  1161. "scrolled": true
  1162. },
  1163. "outputs": [],
  1164. "source": [
  1165. "#Function that creates a soup out of the desired metadata\n",
  1166. "def create_soup(x):\n",
  1167. " return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
  1168. ]
  1169. },
  1170. {
  1171. "cell_type": "code",
  1172. "execution_count": 28,
  1173. "metadata": {},
  1174. "outputs": [],
  1175. "source": [
  1176. "# Create the new soup feature\n",
  1177. "df['soup'] = df.apply(create_soup, axis=1)"
  1178. ]
  1179. },
  1180. {
  1181. "cell_type": "code",
  1182. "execution_count": 29,
  1183. "metadata": {},
  1184. "outputs": [
  1185. {
  1186. "data": {
  1187. "text/plain": [
  1188. "'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'"
  1189. ]
  1190. },
  1191. "execution_count": 29,
  1192. "metadata": {},
  1193. "output_type": "execute_result"
  1194. }
  1195. ],
  1196. "source": [
  1197. "#Display the soup of the first movie\n",
  1198. "df.iloc[0]['soup']"
  1199. ]
  1200. },
  1201. {
  1202. "cell_type": "code",
  1203. "execution_count": 30,
  1204. "metadata": {},
  1205. "outputs": [],
  1206. "source": [
  1207. "# Import CountVectorizer\n",
  1208. "from sklearn.feature_extraction.text import CountVectorizer\n",
  1209. "\n",
  1210. "#Define a new CountVectorizer object and create vectors for the soup\n",
  1211. "count = CountVectorizer(stop_words='english')\n",
  1212. "count_matrix = count.fit_transform(df['soup'])"
  1213. ]
  1214. },
  1215. {
  1216. "cell_type": "code",
  1217. "execution_count": 31,
  1218. "metadata": {},
  1219. "outputs": [],
  1220. "source": [
  1221. "#Import cosine_similarity function\n",
  1222. "from sklearn.metrics.pairwise import cosine_similarity\n",
  1223. "\n",
  1224. "#Compute the cosine similarity score between the count vectors (raw counts are not L2-normalized, so this is not just a dot product)\n",
  1225. "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
  1226. ]
  1227. },
  1228. {
  1229. "cell_type": "code",
  1230. "execution_count": 32,
  1231. "metadata": {},
  1232. "outputs": [],
  1233. "source": [
  1234. "# Reset index of your df and construct reverse mapping again\n",
  1235. "df = df.reset_index()\n",
  1236. "indices2 = pd.Series(df.index, index=df['title'])"
  1237. ]
  1238. },
  1239. {
  1240. "cell_type": "code",
  1241. "execution_count": 33,
  1242. "metadata": {},
  1243. "outputs": [
  1244. {
  1245. "data": {
  1246. "text/plain": [
  1247. "29607 Cheburashka\n",
  1248. "40904 VeggieTales: Josh and the Big Wall\n",
  1249. "40913 VeggieTales: Minnesota Cuke and the Search for...\n",
  1250. "27768 The Little Matchgirl\n",
  1251. "15209 Spiderman: The Ultimate Villain Showdown\n",
  1252. "16613 Cirque du Soleil: Varekai\n",
  1253. "24654 The Seventh Brother\n",
  1254. "29198 Superstar Goofy\n",
  1255. "30244 My Love\n",
  1256. "31179 Pokémon: Arceus and the Jewel of Life\n",
  1257. "Name: title, dtype: object"
  1258. ]
  1259. },
  1260. "execution_count": 33,
  1261. "metadata": {},
  1262. "output_type": "execute_result"
  1263. }
  1264. ],
  1265. "source": [
  1266. "content_recommender('The Lion King', cosine_sim2, df, indices2)"
  1267. ]
  1268. }
  1269. ],
  1270. "metadata": {
  1271. "kernelspec": {
  1272. "display_name": "Python 3 (ipykernel)",
  1273. "language": "python",
  1274. "name": "python3"
  1275. },
  1276. "language_info": {
  1277. "codemirror_mode": {
  1278. "name": "ipython",
  1279. "version": 3
  1280. },
  1281. "file_extension": ".py",
  1282. "mimetype": "text/x-python",
  1283. "name": "python",
  1284. "nbconvert_exporter": "python",
  1285. "pygments_lexer": "ipython3",
  1286. "version": "3.10.4"
  1287. }
  1288. },
  1289. "nbformat": 4,
  1290. "nbformat_minor": 2
  1291. }