Nav apraksta

Knowledge Recommender.ipynb 26KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stderr",
  10. "output_type": "stream",
  11. "text": [
  12. "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/2038691245.py:4: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
  13. " df = pd.read_csv('../data/movies_metadata.csv')\n"
  14. ]
  15. },
  16. {
  17. "data": {
  18. "text/plain": [
  19. "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
  20. " 'imdb_id', 'original_language', 'original_title', 'overview',\n",
  21. " 'popularity', 'poster_path', 'production_companies',\n",
  22. " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
  23. " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
  24. " 'vote_average', 'vote_count'],\n",
  25. " dtype='object')"
  26. ]
  27. },
  28. "execution_count": 1,
  29. "metadata": {},
  30. "output_type": "execute_result"
  31. }
  32. ],
  33. "source": [
  34. "import pandas as pd\n",
  35. "import numpy as np\n",
  36. "\n",
  37. "df = pd.read_csv('../data/movies_metadata.csv')\n",
  38. "\n",
  39. "#Print all the features (or columns) of the DataFrame\n",
  40. "df.columns"
  41. ]
  42. },
  43. {
  44. "cell_type": "code",
  45. "execution_count": 2,
  46. "metadata": {},
  47. "outputs": [
  48. {
  49. "data": {
  50. "text/html": [
  51. "<div>\n",
  52. "<style scoped>\n",
  53. " .dataframe tbody tr th:only-of-type {\n",
  54. " vertical-align: middle;\n",
  55. " }\n",
  56. "\n",
  57. " .dataframe tbody tr th {\n",
  58. " vertical-align: top;\n",
  59. " }\n",
  60. "\n",
  61. " .dataframe thead th {\n",
  62. " text-align: right;\n",
  63. " }\n",
  64. "</style>\n",
  65. "<table border=\"1\" class=\"dataframe\">\n",
  66. " <thead>\n",
  67. " <tr style=\"text-align: right;\">\n",
  68. " <th></th>\n",
  69. " <th>title</th>\n",
  70. " <th>genres</th>\n",
  71. " <th>release_date</th>\n",
  72. " <th>runtime</th>\n",
  73. " <th>vote_average</th>\n",
  74. " <th>vote_count</th>\n",
  75. " </tr>\n",
  76. " </thead>\n",
  77. " <tbody>\n",
  78. " <tr>\n",
  79. " <th>0</th>\n",
  80. " <td>Toy Story</td>\n",
  81. " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
  82. " <td>1995-10-30</td>\n",
  83. " <td>81.0</td>\n",
  84. " <td>7.7</td>\n",
  85. " <td>5415.0</td>\n",
  86. " </tr>\n",
  87. " <tr>\n",
  88. " <th>1</th>\n",
  89. " <td>Jumanji</td>\n",
  90. " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
  91. " <td>1995-12-15</td>\n",
  92. " <td>104.0</td>\n",
  93. " <td>6.9</td>\n",
  94. " <td>2413.0</td>\n",
  95. " </tr>\n",
  96. " <tr>\n",
  97. " <th>2</th>\n",
  98. " <td>Grumpier Old Men</td>\n",
  99. " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
  100. " <td>1995-12-22</td>\n",
  101. " <td>101.0</td>\n",
  102. " <td>6.5</td>\n",
  103. " <td>92.0</td>\n",
  104. " </tr>\n",
  105. " <tr>\n",
  106. " <th>3</th>\n",
  107. " <td>Waiting to Exhale</td>\n",
  108. " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
  109. " <td>1995-12-22</td>\n",
  110. " <td>127.0</td>\n",
  111. " <td>6.1</td>\n",
  112. " <td>34.0</td>\n",
  113. " </tr>\n",
  114. " <tr>\n",
  115. " <th>4</th>\n",
  116. " <td>Father of the Bride Part II</td>\n",
  117. " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
  118. " <td>1995-02-10</td>\n",
  119. " <td>106.0</td>\n",
  120. " <td>5.7</td>\n",
  121. " <td>173.0</td>\n",
  122. " </tr>\n",
  123. " </tbody>\n",
  124. "</table>\n",
  125. "</div>"
  126. ],
  127. "text/plain": [
  128. " title \\\n",
  129. "0 Toy Story \n",
  130. "1 Jumanji \n",
  131. "2 Grumpier Old Men \n",
  132. "3 Waiting to Exhale \n",
  133. "4 Father of the Bride Part II \n",
  134. "\n",
  135. " genres release_date runtime \\\n",
  136. "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1995-10-30 81.0 \n",
  137. "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 1995-12-15 104.0 \n",
  138. "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 1995-12-22 101.0 \n",
  139. "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 1995-12-22 127.0 \n",
  140. "4 [{'id': 35, 'name': 'Comedy'}] 1995-02-10 106.0 \n",
  141. "\n",
  142. " vote_average vote_count \n",
  143. "0 7.7 5415.0 \n",
  144. "1 6.9 2413.0 \n",
  145. "2 6.5 92.0 \n",
  146. "3 6.1 34.0 \n",
  147. "4 5.7 173.0 "
  148. ]
  149. },
  150. "execution_count": 2,
  151. "metadata": {},
  152. "output_type": "execute_result"
  153. }
  154. ],
  155. "source": [
  156. "#Only keep those features that we require \n",
  157. "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
  158. "\n",
  159. "df.head()"
  160. ]
  161. },
  162. {
  163. "cell_type": "code",
  164. "execution_count": 3,
  165. "metadata": {},
  166. "outputs": [],
  167. "source": [
  168. "#Convert release_date into pandas datetime format\n",
  169. "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
  170. "\n",
  171. "#Extract the year from the datetime; note the x != np.nan test below is always True, so NaT rows yield the string 'NaT'\n",
  172. "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
  173. ]
  174. },
  175. {
  176. "cell_type": "code",
  177. "execution_count": 4,
  178. "metadata": {},
  179. "outputs": [],
  180. "source": [
  181. "#Helper function to convert year strings to integers; any value that cannot be parsed (such as 'NaT') becomes 0.\n",
  182. "def convert_int(x):\n",
  183. " try:\n",
  184. " return int(x)\n",
  185. " except:\n",
  186. " return 0"
  187. ]
  188. },
  189. {
  190. "cell_type": "code",
  191. "execution_count": 5,
  192. "metadata": {},
  193. "outputs": [],
  194. "source": [
  195. "#Apply convert_int to the year feature\n",
  196. "df['year'] = df['year'].apply(convert_int)"
  197. ]
  198. },
  199. {
  200. "cell_type": "code",
  201. "execution_count": 6,
  202. "metadata": {},
  203. "outputs": [
  204. {
  205. "data": {
  206. "text/html": [
  207. "<div>\n",
  208. "<style scoped>\n",
  209. " .dataframe tbody tr th:only-of-type {\n",
  210. " vertical-align: middle;\n",
  211. " }\n",
  212. "\n",
  213. " .dataframe tbody tr th {\n",
  214. " vertical-align: top;\n",
  215. " }\n",
  216. "\n",
  217. " .dataframe thead th {\n",
  218. " text-align: right;\n",
  219. " }\n",
  220. "</style>\n",
  221. "<table border=\"1\" class=\"dataframe\">\n",
  222. " <thead>\n",
  223. " <tr style=\"text-align: right;\">\n",
  224. " <th></th>\n",
  225. " <th>title</th>\n",
  226. " <th>genres</th>\n",
  227. " <th>runtime</th>\n",
  228. " <th>vote_average</th>\n",
  229. " <th>vote_count</th>\n",
  230. " <th>year</th>\n",
  231. " </tr>\n",
  232. " </thead>\n",
  233. " <tbody>\n",
  234. " <tr>\n",
  235. " <th>0</th>\n",
  236. " <td>Toy Story</td>\n",
  237. " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
  238. " <td>81.0</td>\n",
  239. " <td>7.7</td>\n",
  240. " <td>5415.0</td>\n",
  241. " <td>1995</td>\n",
  242. " </tr>\n",
  243. " <tr>\n",
  244. " <th>1</th>\n",
  245. " <td>Jumanji</td>\n",
  246. " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
  247. " <td>104.0</td>\n",
  248. " <td>6.9</td>\n",
  249. " <td>2413.0</td>\n",
  250. " <td>1995</td>\n",
  251. " </tr>\n",
  252. " <tr>\n",
  253. " <th>2</th>\n",
  254. " <td>Grumpier Old Men</td>\n",
  255. " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
  256. " <td>101.0</td>\n",
  257. " <td>6.5</td>\n",
  258. " <td>92.0</td>\n",
  259. " <td>1995</td>\n",
  260. " </tr>\n",
  261. " <tr>\n",
  262. " <th>3</th>\n",
  263. " <td>Waiting to Exhale</td>\n",
  264. " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
  265. " <td>127.0</td>\n",
  266. " <td>6.1</td>\n",
  267. " <td>34.0</td>\n",
  268. " <td>1995</td>\n",
  269. " </tr>\n",
  270. " <tr>\n",
  271. " <th>4</th>\n",
  272. " <td>Father of the Bride Part II</td>\n",
  273. " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
  274. " <td>106.0</td>\n",
  275. " <td>5.7</td>\n",
  276. " <td>173.0</td>\n",
  277. " <td>1995</td>\n",
  278. " </tr>\n",
  279. " </tbody>\n",
  280. "</table>\n",
  281. "</div>"
  282. ],
  283. "text/plain": [
  284. " title \\\n",
  285. "0 Toy Story \n",
  286. "1 Jumanji \n",
  287. "2 Grumpier Old Men \n",
  288. "3 Waiting to Exhale \n",
  289. "4 Father of the Bride Part II \n",
  290. "\n",
  291. " genres runtime vote_average \\\n",
  292. "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 81.0 7.7 \n",
  293. "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 104.0 6.9 \n",
  294. "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 101.0 6.5 \n",
  295. "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 127.0 6.1 \n",
  296. "4 [{'id': 35, 'name': 'Comedy'}] 106.0 5.7 \n",
  297. "\n",
  298. " vote_count year \n",
  299. "0 5415.0 1995 \n",
  300. "1 2413.0 1995 \n",
  301. "2 92.0 1995 \n",
  302. "3 34.0 1995 \n",
  303. "4 173.0 1995 "
  304. ]
  305. },
  306. "execution_count": 6,
  307. "metadata": {},
  308. "output_type": "execute_result"
  309. }
  310. ],
  311. "source": [
  312. "#Drop the release_date column\n",
  313. "df = df.drop('release_date', axis=1)\n",
  314. "\n",
  315. "#Display the dataframe\n",
  316. "df.head()"
  317. ]
  318. },
  319. {
  320. "cell_type": "code",
  321. "execution_count": 7,
  322. "metadata": {},
  323. "outputs": [
  324. {
  325. "data": {
  326. "text/plain": [
  327. "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
  328. ]
  329. },
  330. "execution_count": 7,
  331. "metadata": {},
  332. "output_type": "execute_result"
  333. }
  334. ],
  335. "source": [
  336. "#Print genres of the first movie\n",
  337. "df.iloc[0]['genres']"
  338. ]
  339. },
  340. {
  341. "cell_type": "code",
  342. "execution_count": 8,
  343. "metadata": {},
  344. "outputs": [
  345. {
  346. "name": "stdout",
  347. "output_type": "stream",
  348. "text": [
  349. "<class 'str'>\n",
  350. "<class 'list'>\n"
  351. ]
  352. }
  353. ],
  354. "source": [
  355. "#Import the literal_eval function from ast\n",
  356. "from ast import literal_eval\n",
  357. "\n",
  358. "#Define a stringified list and output its type\n",
  359. "a = \"[1,2,3]\"\n",
  360. "print(type(a))\n",
  361. "\n",
  362. "#Apply literal_eval and output type\n",
  363. "b = literal_eval(a)\n",
  364. "print(type(b))"
  365. ]
  366. },
  367. {
  368. "cell_type": "code",
  369. "execution_count": 9,
  370. "metadata": {},
  371. "outputs": [],
  372. "source": [
  373. "#Convert all NaN into stringified empty lists\n",
  374. "df['genres'] = df['genres'].fillna('[]')\n",
  375. "\n",
  376. "#Apply literal_eval to convert the stringified lists into list objects\n",
  377. "df['genres'] = df['genres'].apply(literal_eval)\n",
  378. "\n",
  379. "#Convert list of dictionaries to a list of strings\n",
  380. "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
  381. ]
  382. },
  383. {
  384. "cell_type": "code",
  385. "execution_count": 10,
  386. "metadata": {},
  387. "outputs": [
  388. {
  389. "data": {
  390. "text/html": [
  391. "<div>\n",
  392. "<style scoped>\n",
  393. " .dataframe tbody tr th:only-of-type {\n",
  394. " vertical-align: middle;\n",
  395. " }\n",
  396. "\n",
  397. " .dataframe tbody tr th {\n",
  398. " vertical-align: top;\n",
  399. " }\n",
  400. "\n",
  401. " .dataframe thead th {\n",
  402. " text-align: right;\n",
  403. " }\n",
  404. "</style>\n",
  405. "<table border=\"1\" class=\"dataframe\">\n",
  406. " <thead>\n",
  407. " <tr style=\"text-align: right;\">\n",
  408. " <th></th>\n",
  409. " <th>title</th>\n",
  410. " <th>genres</th>\n",
  411. " <th>runtime</th>\n",
  412. " <th>vote_average</th>\n",
  413. " <th>vote_count</th>\n",
  414. " <th>year</th>\n",
  415. " </tr>\n",
  416. " </thead>\n",
  417. " <tbody>\n",
  418. " <tr>\n",
  419. " <th>0</th>\n",
  420. " <td>Toy Story</td>\n",
  421. " <td>[animation, comedy, family]</td>\n",
  422. " <td>81.0</td>\n",
  423. " <td>7.7</td>\n",
  424. " <td>5415.0</td>\n",
  425. " <td>1995</td>\n",
  426. " </tr>\n",
  427. " <tr>\n",
  428. " <th>1</th>\n",
  429. " <td>Jumanji</td>\n",
  430. " <td>[adventure, fantasy, family]</td>\n",
  431. " <td>104.0</td>\n",
  432. " <td>6.9</td>\n",
  433. " <td>2413.0</td>\n",
  434. " <td>1995</td>\n",
  435. " </tr>\n",
  436. " <tr>\n",
  437. " <th>2</th>\n",
  438. " <td>Grumpier Old Men</td>\n",
  439. " <td>[romance, comedy]</td>\n",
  440. " <td>101.0</td>\n",
  441. " <td>6.5</td>\n",
  442. " <td>92.0</td>\n",
  443. " <td>1995</td>\n",
  444. " </tr>\n",
  445. " <tr>\n",
  446. " <th>3</th>\n",
  447. " <td>Waiting to Exhale</td>\n",
  448. " <td>[comedy, drama, romance]</td>\n",
  449. " <td>127.0</td>\n",
  450. " <td>6.1</td>\n",
  451. " <td>34.0</td>\n",
  452. " <td>1995</td>\n",
  453. " </tr>\n",
  454. " <tr>\n",
  455. " <th>4</th>\n",
  456. " <td>Father of the Bride Part II</td>\n",
  457. " <td>[comedy]</td>\n",
  458. " <td>106.0</td>\n",
  459. " <td>5.7</td>\n",
  460. " <td>173.0</td>\n",
  461. " <td>1995</td>\n",
  462. " </tr>\n",
  463. " </tbody>\n",
  464. "</table>\n",
  465. "</div>"
  466. ],
  467. "text/plain": [
  468. " title genres runtime \\\n",
  469. "0 Toy Story [animation, comedy, family] 81.0 \n",
  470. "1 Jumanji [adventure, fantasy, family] 104.0 \n",
  471. "2 Grumpier Old Men [romance, comedy] 101.0 \n",
  472. "3 Waiting to Exhale [comedy, drama, romance] 127.0 \n",
  473. "4 Father of the Bride Part II [comedy] 106.0 \n",
  474. "\n",
  475. " vote_average vote_count year \n",
  476. "0 7.7 5415.0 1995 \n",
  477. "1 6.9 2413.0 1995 \n",
  478. "2 6.5 92.0 1995 \n",
  479. "3 6.1 34.0 1995 \n",
  480. "4 5.7 173.0 1995 "
  481. ]
  482. },
  483. "execution_count": 10,
  484. "metadata": {},
  485. "output_type": "execute_result"
  486. }
  487. ],
  488. "source": [
  489. "df.head()"
  490. ]
  491. },
  492. {
  493. "cell_type": "code",
  494. "execution_count": 11,
  495. "metadata": {},
  496. "outputs": [
  497. {
  498. "name": "stderr",
  499. "output_type": "stream",
  500. "text": [
  501. "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/328443552.py:2: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.\n",
  502. " s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n"
  503. ]
  504. },
  505. {
  506. "data": {
  507. "text/html": [
  508. "<div>\n",
  509. "<style scoped>\n",
  510. " .dataframe tbody tr th:only-of-type {\n",
  511. " vertical-align: middle;\n",
  512. " }\n",
  513. "\n",
  514. " .dataframe tbody tr th {\n",
  515. " vertical-align: top;\n",
  516. " }\n",
  517. "\n",
  518. " .dataframe thead th {\n",
  519. " text-align: right;\n",
  520. " }\n",
  521. "</style>\n",
  522. "<table border=\"1\" class=\"dataframe\">\n",
  523. " <thead>\n",
  524. " <tr style=\"text-align: right;\">\n",
  525. " <th></th>\n",
  526. " <th>title</th>\n",
  527. " <th>runtime</th>\n",
  528. " <th>vote_average</th>\n",
  529. " <th>vote_count</th>\n",
  530. " <th>year</th>\n",
  531. " <th>genre</th>\n",
  532. " </tr>\n",
  533. " </thead>\n",
  534. " <tbody>\n",
  535. " <tr>\n",
  536. " <th>0</th>\n",
  537. " <td>Toy Story</td>\n",
  538. " <td>81.0</td>\n",
  539. " <td>7.7</td>\n",
  540. " <td>5415.0</td>\n",
  541. " <td>1995</td>\n",
  542. " <td>animation</td>\n",
  543. " </tr>\n",
  544. " <tr>\n",
  545. " <th>0</th>\n",
  546. " <td>Toy Story</td>\n",
  547. " <td>81.0</td>\n",
  548. " <td>7.7</td>\n",
  549. " <td>5415.0</td>\n",
  550. " <td>1995</td>\n",
  551. " <td>comedy</td>\n",
  552. " </tr>\n",
  553. " <tr>\n",
  554. " <th>0</th>\n",
  555. " <td>Toy Story</td>\n",
  556. " <td>81.0</td>\n",
  557. " <td>7.7</td>\n",
  558. " <td>5415.0</td>\n",
  559. " <td>1995</td>\n",
  560. " <td>family</td>\n",
  561. " </tr>\n",
  562. " <tr>\n",
  563. " <th>1</th>\n",
  564. " <td>Jumanji</td>\n",
  565. " <td>104.0</td>\n",
  566. " <td>6.9</td>\n",
  567. " <td>2413.0</td>\n",
  568. " <td>1995</td>\n",
  569. " <td>adventure</td>\n",
  570. " </tr>\n",
  571. " <tr>\n",
  572. " <th>1</th>\n",
  573. " <td>Jumanji</td>\n",
  574. " <td>104.0</td>\n",
  575. " <td>6.9</td>\n",
  576. " <td>2413.0</td>\n",
  577. " <td>1995</td>\n",
  578. " <td>fantasy</td>\n",
  579. " </tr>\n",
  580. " </tbody>\n",
  581. "</table>\n",
  582. "</div>"
  583. ],
  584. "text/plain": [
  585. " title runtime vote_average vote_count year genre\n",
  586. "0 Toy Story 81.0 7.7 5415.0 1995 animation\n",
  587. "0 Toy Story 81.0 7.7 5415.0 1995 comedy\n",
  588. "0 Toy Story 81.0 7.7 5415.0 1995 family\n",
  589. "1 Jumanji 104.0 6.9 2413.0 1995 adventure\n",
  590. "1 Jumanji 104.0 6.9 2413.0 1995 fantasy"
  591. ]
  592. },
  593. "execution_count": 11,
  594. "metadata": {},
  595. "output_type": "execute_result"
  596. }
  597. ],
  598. "source": [
  599. "#Create a new feature by exploding genres\n",
  600. "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
  601. "\n",
  602. "#Name the new feature as 'genre'\n",
  603. "s.name = 'genre'\n",
  604. "\n",
  605. "#Create a new dataframe gen_df by dropping the old 'genres' feature and joining the new 'genre' feature.\n",
  606. "gen_df = df.drop('genres', axis=1).join(s)\n",
  607. "\n",
  608. "#Print the head of the new gen_df\n",
  609. "gen_df.head()"
  610. ]
  611. },
  612. {
  613. "cell_type": "code",
  614. "execution_count": 12,
  615. "metadata": {},
  616. "outputs": [],
  617. "source": [
  618. "def build_chart(gen_df, percentile=0.8):\n",
  619. " #Ask for preferred genres\n",
  620. " print(\"Input preferred genre\")\n",
  621. " genre = input()\n",
  622. " \n",
  623. " #Ask for lower limit of duration\n",
  624. " print(\"Input shortest duration\")\n",
  625. " low_time = int(input())\n",
  626. " \n",
  627. " #Ask for upper limit of duration\n",
  628. " print(\"Input longest duration\")\n",
  629. " high_time = int(input())\n",
  630. " \n",
  631. " #Ask for lower limit of timeline\n",
  632. " print(\"Input earliest year\")\n",
  633. " low_year = int(input())\n",
  634. " \n",
  635. " #Ask for upper limit of timeline\n",
  636. " print(\"Input latest year\")\n",
  637. " high_year = int(input())\n",
  638. " \n",
  639. " #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
  640. " movies = gen_df.copy()\n",
  641. " \n",
  642. " #Filter based on the condition\n",
  643. " movies = movies[(movies['genre'] == genre) & \n",
  644. " (movies['runtime'] >= low_time) & \n",
  645. " (movies['runtime'] <= high_time) & \n",
  646. " (movies['year'] >= low_year) & \n",
  647. " (movies['year'] <= high_year)]\n",
  648. " \n",
  649. " #Compute the values of C and m for the filtered movies\n",
  650. " C = movies['vote_average'].mean()\n",
  651. " m = movies['vote_count'].quantile(percentile)\n",
  652. " \n",
  653. " #Only consider movies that have at least m votes. Save this in a new dataframe q_movies\n",
  654. " q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
  655. " \n",
  656. " #Calculate score using the IMDB formula\n",
  657. " q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
  658. " + (m/(m+x['vote_count']) * C)\n",
  659. " ,axis=1)\n",
  660. "\n",
  661. " #Sort movies in descending order of their scores\n",
  662. " q_movies = q_movies.sort_values('score', ascending=False)\n",
  663. " \n",
  664. " return q_movies"
  665. ]
  666. },
  667. {
  668. "cell_type": "code",
  669. "execution_count": 13,
  670. "metadata": {},
  671. "outputs": [
  672. {
  673. "name": "stdout",
  674. "output_type": "stream",
  675. "text": [
  676. "Input preferred genre\n",
  677. "horror\n",
  678. "Input shortest duration\n",
  679. "60\n",
  680. "Input longest duration\n",
  681. "120\n",
  682. "Input earliest year\n",
  683. "1990\n",
  684. "Input latest year\n",
  685. "2022\n"
  686. ]
  687. },
  688. {
  689. "data": {
  690. "text/html": [
  691. "<div>\n",
  692. "<style scoped>\n",
  693. " .dataframe tbody tr th:only-of-type {\n",
  694. " vertical-align: middle;\n",
  695. " }\n",
  696. "\n",
  697. " .dataframe tbody tr th {\n",
  698. " vertical-align: top;\n",
  699. " }\n",
  700. "\n",
  701. " .dataframe thead th {\n",
  702. " text-align: right;\n",
  703. " }\n",
  704. "</style>\n",
  705. "<table border=\"1\" class=\"dataframe\">\n",
  706. " <thead>\n",
  707. " <tr style=\"text-align: right;\">\n",
  708. " <th></th>\n",
  709. " <th>title</th>\n",
  710. " <th>runtime</th>\n",
  711. " <th>vote_average</th>\n",
  712. " <th>vote_count</th>\n",
  713. " <th>year</th>\n",
  714. " <th>genre</th>\n",
  715. " <th>score</th>\n",
  716. " </tr>\n",
  717. " </thead>\n",
  718. " <tbody>\n",
  719. " <tr>\n",
  720. " <th>39821</th>\n",
  721. " <td>Train to Busan</td>\n",
  722. " <td>118.0</td>\n",
  723. " <td>7.7</td>\n",
  724. " <td>984.0</td>\n",
  725. " <td>2016</td>\n",
  726. " <td>horror</td>\n",
  727. " <td>7.424441</td>\n",
  728. " </tr>\n",
  729. " <tr>\n",
  730. " <th>8147</th>\n",
  731. " <td>Shaun of the Dead</td>\n",
  732. " <td>99.0</td>\n",
  733. " <td>7.5</td>\n",
  734. " <td>2479.0</td>\n",
  735. " <td>2004</td>\n",
  736. " <td>horror</td>\n",
  737. " <td>7.392081</td>\n",
  738. " </tr>\n",
  739. " <tr>\n",
  740. " <th>21276</th>\n",
  741. " <td>The Conjuring</td>\n",
  742. " <td>112.0</td>\n",
  743. " <td>7.4</td>\n",
  744. " <td>3169.0</td>\n",
  745. " <td>2013</td>\n",
  746. " <td>horror</td>\n",
  747. " <td>7.318185</td>\n",
  748. " </tr>\n",
  749. " <tr>\n",
  750. " <th>4591</th>\n",
  751. " <td>The Others</td>\n",
  752. " <td>101.0</td>\n",
  753. " <td>7.4</td>\n",
  754. " <td>1708.0</td>\n",
  755. " <td>2001</td>\n",
  756. " <td>horror</td>\n",
  757. " <td>7.252502</td>\n",
  758. " </tr>\n",
  759. " <tr>\n",
  760. " <th>12891</th>\n",
  761. " <td>Let the Right One In</td>\n",
  762. " <td>115.0</td>\n",
  763. " <td>7.5</td>\n",
  764. " <td>997.0</td>\n",
  765. " <td>2008</td>\n",
  766. " <td>horror</td>\n",
  767. " <td>7.247838</td>\n",
  768. " </tr>\n",
  769. " </tbody>\n",
  770. "</table>\n",
  771. "</div>"
  772. ],
  773. "text/plain": [
  774. " title runtime vote_average vote_count year genre \\\n",
  775. "39821 Train to Busan 118.0 7.7 984.0 2016 horror \n",
  776. "8147 Shaun of the Dead 99.0 7.5 2479.0 2004 horror \n",
  777. "21276 The Conjuring 112.0 7.4 3169.0 2013 horror \n",
  778. "4591 The Others 101.0 7.4 1708.0 2001 horror \n",
  779. "12891 Let the Right One In 115.0 7.5 997.0 2008 horror \n",
  780. "\n",
  781. " score \n",
  782. "39821 7.424441 \n",
  783. "8147 7.392081 \n",
  784. "21276 7.318185 \n",
  785. "4591 7.252502 \n",
  786. "12891 7.247838 "
  787. ]
  788. },
  789. "execution_count": 13,
  790. "metadata": {},
  791. "output_type": "execute_result"
  792. }
  793. ],
  794. "source": [
  795. "#Generate the chart for the genre, duration range and year range entered at the prompts and display the top 5.\n",
  796. "build_chart(gen_df).head()"
  797. ]
  798. },
  799. {
  800. "cell_type": "code",
  801. "execution_count": 14,
  802. "metadata": {},
  803. "outputs": [],
  804. "source": [
  805. "#Convert the cleaned (non-exploded) dataframe df into a CSV file and save it in the data folder\n",
  806. "#Set parameter index to False as the index of the DataFrame has no inherent meaning.\n",
  807. "df.to_csv('../data/metadata_clean.csv', index=False)"
  808. ]
  809. },
  810. {
  811. "cell_type": "code",
  812. "execution_count": null,
  813. "metadata": {
  814. "collapsed": true
  815. },
  816. "outputs": [],
  817. "source": []
  818. }
  819. ],
  820. "metadata": {
  821. "kernelspec": {
  822. "display_name": "Python 3 (ipykernel)",
  823. "language": "python",
  824. "name": "python3"
  825. },
  826. "language_info": {
  827. "codemirror_mode": {
  828. "name": "ipython",
  829. "version": 3
  830. },
  831. "file_extension": ".py",
  832. "mimetype": "text/x-python",
  833. "name": "python",
  834. "nbconvert_exporter": "python",
  835. "pygments_lexer": "ipython3",
  836. "version": "3.10.4"
  837. }
  838. },
  839. "nbformat": 4,
  840. "nbformat_minor": 2
  841. }