No description

Content Based Recommenders-checkpoint.ipynb 26KB

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Plot Description Based Recommender"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
"Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
"Collecting matplotlib\n",
" Downloading matplotlib-3.5.3-cp310-cp310-macosx_10_9_x86_64.whl (7.3 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
"Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
"Collecting fonttools>=4.22.0\n",
" Using cached fonttools-4.34.4-py3-none-any.whl (944 kB)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
"Collecting cycler>=0.10\n",
" Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)\n",
"Collecting kiwisolver>=1.0.1\n",
" Downloading kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl (65 kB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.5/65.5 KB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
"Collecting pillow>=6.2.0\n",
" Downloading Pillow-9.2.0-cp310-cp310-macosx_10_10_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
"Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib\n",
"Successfully installed cycler-0.11.0 fonttools-4.34.4 kiwisolver-1.4.4 matplotlib-3.5.3 pillow-9.2.0\n",
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
"You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install scikit-learn scipy matplotlib"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
"Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
"Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
"Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
"\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
"You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
"\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install scikit-learn scipy matplotlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>runtime</th>\n",
" <th>vote_average</th>\n",
" <th>vote_count</th>\n",
" <th>year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toy Story</td>\n",
" <td>['animation', 'comedy', 'family']</td>\n",
" <td>81.0</td>\n",
" <td>7.7</td>\n",
" <td>5415.0</td>\n",
" <td>1995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Jumanji</td>\n",
" <td>['adventure', 'fantasy', 'family']</td>\n",
" <td>104.0</td>\n",
" <td>6.9</td>\n",
" <td>2413.0</td>\n",
" <td>1995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Grumpier Old Men</td>\n",
" <td>['romance', 'comedy']</td>\n",
" <td>101.0</td>\n",
" <td>6.5</td>\n",
" <td>92.0</td>\n",
" <td>1995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Waiting to Exhale</td>\n",
" <td>['comedy', 'drama', 'romance']</td>\n",
" <td>127.0</td>\n",
" <td>6.1</td>\n",
" <td>34.0</td>\n",
" <td>1995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Father of the Bride Part II</td>\n",
" <td>['comedy']</td>\n",
" <td>106.0</td>\n",
" <td>5.7</td>\n",
" <td>173.0</td>\n",
" <td>1995</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genres runtime \\\n",
"0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
"1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
"2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
"3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
"4 Father of the Bride Part II ['comedy'] 106.0 \n",
"\n",
" vote_average vote_count year \n",
"0 7.7 5415.0 1995 \n",
"1 6.9 2413.0 1995 \n",
"2 6.5 92.0 1995 \n",
"3 6.1 34.0 1995 \n",
"4 5.7 173.0 1995 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"#Import data from the clean file \n",
"df = pd.read_csv('../data/metadata_clean.csv')\n",
"\n",
"#Print the head of the cleaned DataFrame\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>runtime</th>\n",
" <th>vote_average</th>\n",
" <th>vote_count</th>\n",
" <th>year</th>\n",
" <th>overview</th>\n",
" <th>id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toy Story</td>\n",
" <td>['animation', 'comedy', 'family']</td>\n",
" <td>81.0</td>\n",
" <td>7.7</td>\n",
" <td>5415.0</td>\n",
" <td>1995</td>\n",
" <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
" <td>862</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Jumanji</td>\n",
" <td>['adventure', 'fantasy', 'family']</td>\n",
" <td>104.0</td>\n",
" <td>6.9</td>\n",
" <td>2413.0</td>\n",
" <td>1995</td>\n",
" <td>When siblings Judy and Peter discover an encha...</td>\n",
" <td>8844</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Grumpier Old Men</td>\n",
" <td>['romance', 'comedy']</td>\n",
" <td>101.0</td>\n",
" <td>6.5</td>\n",
" <td>92.0</td>\n",
" <td>1995</td>\n",
" <td>A family wedding reignites the ancient feud be...</td>\n",
" <td>15602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Waiting to Exhale</td>\n",
" <td>['comedy', 'drama', 'romance']</td>\n",
" <td>127.0</td>\n",
" <td>6.1</td>\n",
" <td>34.0</td>\n",
" <td>1995</td>\n",
" <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
" <td>31357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Father of the Bride Part II</td>\n",
" <td>['comedy']</td>\n",
" <td>106.0</td>\n",
" <td>5.7</td>\n",
" <td>173.0</td>\n",
" <td>1995</td>\n",
" <td>Just when George Banks has recovered from his ...</td>\n",
" <td>11862</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title genres runtime \\\n",
"0 Toy Story ['animation', 'comedy', 'family'] 81.0 \n",
"1 Jumanji ['adventure', 'fantasy', 'family'] 104.0 \n",
"2 Grumpier Old Men ['romance', 'comedy'] 101.0 \n",
"3 Waiting to Exhale ['comedy', 'drama', 'romance'] 127.0 \n",
"4 Father of the Bride Part II ['comedy'] 106.0 \n",
"\n",
" vote_average vote_count year \\\n",
"0 7.7 5415.0 1995 \n",
"1 6.9 2413.0 1995 \n",
"2 6.5 92.0 1995 \n",
"3 6.1 34.0 1995 \n",
"4 5.7 173.0 1995 \n",
"\n",
" overview id \n",
"0 Led by Woody, Andy's toys live happily in his ... 862 \n",
"1 When siblings Judy and Peter discover an encha... 8844 \n",
"2 A family wedding reignites the ancient feud be... 15602 \n",
"3 Cheated on, mistreated and stepped on, the wom... 31357 \n",
"4 Just when George Banks has recovered from his ... 11862 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Import the original file\n",
"orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
"\n",
"#Add the useful features into the cleaned dataframe\n",
"df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#Import TfidfVectorizer from scikit-learn\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"#Define a TF-IDF Vectorizer object and remove all English stop words\n",
"tfidf = TfidfVectorizer(stop_words='english')\n",
"\n",
"#Replace NaN with an empty string\n",
"df['overview'] = df['overview'].fillna('')\n",
"\n",
"#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
"tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
"\n",
"#Output the shape of tfidf_matrix\n",
"tfidf_matrix.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import linear_kernel to compute the dot product\n",
"from sklearn.metrics.pairwise import linear_kernel\n",
"\n",
"# Compute the cosine similarity matrix\n",
"cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
]
},
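{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative aside (not part of the original workflow): TfidfVectorizer\n",
"# L2-normalizes each row by default (norm='l2'), so the plain dot product\n",
"# computed by linear_kernel above is already the cosine similarity.\n",
"# This sketch checks that claim on a small slice of tfidf_matrix.\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"sample = cosine_similarity(tfidf_matrix[:5], tfidf_matrix[:5])\n",
"np.allclose(sample, cosine_sim[:5, :5]) #Expected to be True"
]
},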
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any\n",
"indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function that takes in a movie title as input and returns the top 10 recommendations\n",
"def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
"    # Obtain the index of the movie that matches the title\n",
"    idx = indices[title]\n",
"\n",
"    # Get the pairwise similarity scores of all movies with that movie\n",
"    # and convert them into a list of (index, score) tuples\n",
"    sim_scores = list(enumerate(cosine_sim[idx]))\n",
"\n",
"    # Sort the movies based on the cosine similarity scores\n",
"    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
"\n",
"    # Get the scores of the 10 most similar movies. Ignore the first entry (the movie itself).\n",
"    sim_scores = sim_scores[1:11]\n",
"\n",
"    # Get the movie indices\n",
"    movie_indices = [i[0] for i in sim_scores]\n",
"\n",
"    # Return the top 10 most similar movies\n",
"    return df['title'].iloc[movie_indices]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Get recommendations for The Lion King\n",
"content_recommender('The Lion King')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Metadata Based Recommender"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the keywords and credits files\n",
"cred_df = pd.read_csv('../data/credits.csv')\n",
"key_df = pd.read_csv('../data/keywords.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Print the head of the credit dataframe\n",
"cred_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Print the head of the keywords dataframe\n",
"key_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Convert the IDs of df into int (malformed IDs are handled by clean_ids below)\n",
"df['id'] = df['id'].astype('int')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to convert all non-integer IDs to NaN\n",
"def clean_ids(x):\n",
"    try:\n",
"        return int(x)\n",
"    except (ValueError, TypeError):\n",
"        return np.nan"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Clean the ids of df\n",
"df['id'] = df['id'].apply(clean_ids)\n",
"\n",
"#Filter all rows that have a null ID\n",
"df = df[df['id'].notnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert IDs into integer\n",
"df['id'] = df['id'].astype('int')\n",
"key_df['id'] = key_df['id'].astype('int')\n",
"cred_df['id'] = cred_df['id'].astype('int')\n",
"\n",
"# Merge keywords and credits into your main metadata dataframe\n",
"df = df.merge(cred_df, on='id')\n",
"df = df.merge(key_df, on='id')\n",
"\n",
"#Display the head of df\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the stringified objects into the native python objects\n",
"from ast import literal_eval\n",
"\n",
"features = ['cast', 'crew', 'keywords', 'genres']\n",
"for feature in features:\n",
"    df[feature] = df[feature].apply(literal_eval)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Print the first crew member of the first movie in df\n",
"df.iloc[0]['crew'][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract the director's name. If director is not listed, return NaN\n",
"def get_director(x):\n",
"    for crew_member in x:\n",
"        if crew_member['job'] == 'Director':\n",
"            return crew_member['name']\n",
"    return np.nan"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Define the new director feature\n",
"df['director'] = df['crew'].apply(get_director)\n",
"\n",
"#Print the directors of the first five movies\n",
"df['director'].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Return the top 3 elements of the list, or the entire list if it has fewer than 3 elements\n",
"def generate_list(x):\n",
"    if isinstance(x, list):\n",
"        names = [i['name'] for i in x]\n",
"        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
"        if len(names) > 3:\n",
"            names = names[:3]\n",
"        return names\n",
"\n",
"    #Return empty list in case of missing/malformed data\n",
"    return []"
]
},
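{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Quick illustrative check on hypothetical input (not from the dataset):\n",
"#generate_list keeps at most the first three 'name' values and falls back\n",
"#to an empty list for missing/malformed data such as NaN\n",
"print(generate_list([{'name': 'A'}, {'name': 'B'}, {'name': 'C'}, {'name': 'D'}]))\n",
"print(generate_list(np.nan))"
]
},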
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Apply the generate_list function to cast and keywords\n",
"df['cast'] = df['cast'].apply(generate_list)\n",
"df['keywords'] = df['keywords'].apply(generate_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Only consider a maximum of 3 genres\n",
"df['genres'] = df['genres'].apply(lambda x: x[:3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the new features of the first 5 movies along with title\n",
"df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
"def sanitize(x):\n",
"    if isinstance(x, list):\n",
"        #Strip spaces and convert to lowercase\n",
"        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
"    else:\n",
"        #If x is a string (e.g. the director), sanitize it; otherwise return an empty string\n",
"        if isinstance(x, str):\n",
"            return str.lower(x.replace(\" \", \"\"))\n",
"        else:\n",
"            return ''"
]
},
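{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Illustrative example (hypothetical names, not taken from the dataset):\n",
"#stripping spaces keeps multi-word names as single tokens, so 'Tom Hanks'\n",
"#and 'Tom Cruise' do not spuriously overlap on the token 'tom' once the\n",
"#soup is vectorized\n",
"print(sanitize(['Tom Hanks', 'Tom Cruise']))\n",
"print(sanitize('John Lasseter'))"
]
},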
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Apply the sanitize function to cast, keywords, director and genres\n",
"for feature in ['cast', 'director', 'genres', 'keywords']:\n",
"    df[feature] = df[feature].apply(sanitize)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"#Function that creates a soup out of the desired metadata\n",
"def create_soup(x):\n",
"    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create the new soup feature\n",
"df['soup'] = df.apply(create_soup, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Display the soup of the first movie\n",
"df.iloc[0]['soup']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import CountVectorizer\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"#Define a new CountVectorizer object and create vectors for the soup\n",
"count = CountVectorizer(stop_words='english')\n",
"count_matrix = count.fit_transform(df['soup'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Import cosine_similarity function\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"#Compute the cosine similarity matrix (count vectors are not normalized, so we use cosine_similarity rather than reusing linear_kernel)\n",
"cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
]
},
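{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Illustrative aside: unlike the tf-idf vectors used earlier, CountVectorizer\n",
"#output is not L2-normalized, so a raw dot product would favour movies with\n",
"#longer soups; that is why cosine_similarity is used above instead of\n",
"#linear_kernel. The varying row norms below demonstrate this.\n",
"row_norms = np.sqrt(count_matrix.multiply(count_matrix).sum(axis=1))\n",
"row_norms[:5]"
]
},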
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reset index of your df and construct reverse mapping again\n",
"df = df.reset_index()\n",
"indices2 = pd.Series(df.index, index=df['title'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"content_recommender('The Lion King', cosine_sim2, df, indices2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}