Aucune description

Knowledge Recommender-checkpoint.ipynb 25KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [
  8. {
  9. "name": "stderr",
  10. "output_type": "stream",
  11. "text": [
  12. "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
  13. " interactivity=interactivity, compiler=compiler, result=result)\n"
  14. ]
  15. },
  16. {
  17. "data": {
  18. "text/plain": [
  19. "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
  20. " 'imdb_id', 'original_language', 'original_title', 'overview',\n",
  21. " 'popularity', 'poster_path', 'production_companies',\n",
  22. " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
  23. " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
  24. " 'vote_average', 'vote_count'],\n",
  25. " dtype='object')"
  26. ]
  27. },
  28. "execution_count": 1,
  29. "metadata": {},
  30. "output_type": "execute_result"
  31. }
  32. ],
  33. "source": [
  34. "import pandas as pd\n",
  35. "import numpy as np\n",
  36. "\n",
  37. "df = pd.read_csv('../data/movies_metadata.csv')\n",
  38. "\n",
  39. "#Print all the features (or columns) of the DataFrame\n",
  40. "df.columns"
  41. ]
  42. },
  43. {
  44. "cell_type": "code",
  45. "execution_count": 2,
  46. "metadata": {},
  47. "outputs": [
  48. {
  49. "data": {
  50. "text/html": [
  51. "<div>\n",
  52. "<style>\n",
  53. " .dataframe thead tr:only-child th {\n",
  54. " text-align: right;\n",
  55. " }\n",
  56. "\n",
  57. " .dataframe thead th {\n",
  58. " text-align: left;\n",
  59. " }\n",
  60. "\n",
  61. " .dataframe tbody tr th {\n",
  62. " vertical-align: top;\n",
  63. " }\n",
  64. "</style>\n",
  65. "<table border=\"1\" class=\"dataframe\">\n",
  66. " <thead>\n",
  67. " <tr style=\"text-align: right;\">\n",
  68. " <th></th>\n",
  69. " <th>title</th>\n",
  70. " <th>genres</th>\n",
  71. " <th>release_date</th>\n",
  72. " <th>runtime</th>\n",
  73. " <th>vote_average</th>\n",
  74. " <th>vote_count</th>\n",
  75. " </tr>\n",
  76. " </thead>\n",
  77. " <tbody>\n",
  78. " <tr>\n",
  79. " <th>0</th>\n",
  80. " <td>Toy Story</td>\n",
  81. " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
  82. " <td>1995-10-30</td>\n",
  83. " <td>81.0</td>\n",
  84. " <td>7.7</td>\n",
  85. " <td>5415.0</td>\n",
  86. " </tr>\n",
  87. " <tr>\n",
  88. " <th>1</th>\n",
  89. " <td>Jumanji</td>\n",
  90. " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
  91. " <td>1995-12-15</td>\n",
  92. " <td>104.0</td>\n",
  93. " <td>6.9</td>\n",
  94. " <td>2413.0</td>\n",
  95. " </tr>\n",
  96. " <tr>\n",
  97. " <th>2</th>\n",
  98. " <td>Grumpier Old Men</td>\n",
  99. " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
  100. " <td>1995-12-22</td>\n",
  101. " <td>101.0</td>\n",
  102. " <td>6.5</td>\n",
  103. " <td>92.0</td>\n",
  104. " </tr>\n",
  105. " <tr>\n",
  106. " <th>3</th>\n",
  107. " <td>Waiting to Exhale</td>\n",
  108. " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
  109. " <td>1995-12-22</td>\n",
  110. " <td>127.0</td>\n",
  111. " <td>6.1</td>\n",
  112. " <td>34.0</td>\n",
  113. " </tr>\n",
  114. " <tr>\n",
  115. " <th>4</th>\n",
  116. " <td>Father of the Bride Part II</td>\n",
  117. " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
  118. " <td>1995-02-10</td>\n",
  119. " <td>106.0</td>\n",
  120. " <td>5.7</td>\n",
  121. " <td>173.0</td>\n",
  122. " </tr>\n",
  123. " </tbody>\n",
  124. "</table>\n",
  125. "</div>"
  126. ],
  127. "text/plain": [
  128. " title \\\n",
  129. "0 Toy Story \n",
  130. "1 Jumanji \n",
  131. "2 Grumpier Old Men \n",
  132. "3 Waiting to Exhale \n",
  133. "4 Father of the Bride Part II \n",
  134. "\n",
  135. " genres release_date runtime \\\n",
  136. "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1995-10-30 81.0 \n",
  137. "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 1995-12-15 104.0 \n",
  138. "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 1995-12-22 101.0 \n",
  139. "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 1995-12-22 127.0 \n",
  140. "4 [{'id': 35, 'name': 'Comedy'}] 1995-02-10 106.0 \n",
  141. "\n",
  142. " vote_average vote_count \n",
  143. "0 7.7 5415.0 \n",
  144. "1 6.9 2413.0 \n",
  145. "2 6.5 92.0 \n",
  146. "3 6.1 34.0 \n",
  147. "4 5.7 173.0 "
  148. ]
  149. },
  150. "execution_count": 2,
  151. "metadata": {},
  152. "output_type": "execute_result"
  153. }
  154. ],
  155. "source": [
  156. "#Only keep those features that we require \n",
  157. "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
  158. "\n",
  159. "df.head()"
  160. ]
  161. },
  162. {
  163. "cell_type": "code",
  164. "execution_count": 3,
  165. "metadata": {
  166. "collapsed": true
  167. },
  168. "outputs": [],
  169. "source": [
  170. "#Convert release_date into pandas datetime format\n",
  171. "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
  172. "\n",
  173. "#Extract the year from the datetime as a string; missing dates (NaT) yield the string 'NaT'\n",
  174. "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
  175. ]
  176. },
  177. {
  178. "cell_type": "code",
  179. "execution_count": 4,
  180. "metadata": {
  181. "collapsed": true
  182. },
  183. "outputs": [],
  184. "source": [
  185. "#Helper function to convert a year to an integer; any value that cannot be parsed (e.g. NaT) becomes 0.\n",
  186. "def convert_int(x):\n",
  187. " try:\n",
  188. " return int(x)\n",
  189. " except:\n",
  190. " return 0"
  191. ]
  192. },
  193. {
  194. "cell_type": "code",
  195. "execution_count": 5,
  196. "metadata": {
  197. "collapsed": true
  198. },
  199. "outputs": [],
  200. "source": [
  201. "#Apply convert_int to the year feature\n",
  202. "df['year'] = df['year'].apply(convert_int)"
  203. ]
  204. },
  205. {
  206. "cell_type": "code",
  207. "execution_count": 6,
  208. "metadata": {},
  209. "outputs": [
  210. {
  211. "data": {
  212. "text/html": [
  213. "<div>\n",
  214. "<style>\n",
  215. " .dataframe thead tr:only-child th {\n",
  216. " text-align: right;\n",
  217. " }\n",
  218. "\n",
  219. " .dataframe thead th {\n",
  220. " text-align: left;\n",
  221. " }\n",
  222. "\n",
  223. " .dataframe tbody tr th {\n",
  224. " vertical-align: top;\n",
  225. " }\n",
  226. "</style>\n",
  227. "<table border=\"1\" class=\"dataframe\">\n",
  228. " <thead>\n",
  229. " <tr style=\"text-align: right;\">\n",
  230. " <th></th>\n",
  231. " <th>title</th>\n",
  232. " <th>genres</th>\n",
  233. " <th>runtime</th>\n",
  234. " <th>vote_average</th>\n",
  235. " <th>vote_count</th>\n",
  236. " <th>year</th>\n",
  237. " </tr>\n",
  238. " </thead>\n",
  239. " <tbody>\n",
  240. " <tr>\n",
  241. " <th>0</th>\n",
  242. " <td>Toy Story</td>\n",
  243. " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
  244. " <td>81.0</td>\n",
  245. " <td>7.7</td>\n",
  246. " <td>5415.0</td>\n",
  247. " <td>1995</td>\n",
  248. " </tr>\n",
  249. " <tr>\n",
  250. " <th>1</th>\n",
  251. " <td>Jumanji</td>\n",
  252. " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
  253. " <td>104.0</td>\n",
  254. " <td>6.9</td>\n",
  255. " <td>2413.0</td>\n",
  256. " <td>1995</td>\n",
  257. " </tr>\n",
  258. " <tr>\n",
  259. " <th>2</th>\n",
  260. " <td>Grumpier Old Men</td>\n",
  261. " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
  262. " <td>101.0</td>\n",
  263. " <td>6.5</td>\n",
  264. " <td>92.0</td>\n",
  265. " <td>1995</td>\n",
  266. " </tr>\n",
  267. " <tr>\n",
  268. " <th>3</th>\n",
  269. " <td>Waiting to Exhale</td>\n",
  270. " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
  271. " <td>127.0</td>\n",
  272. " <td>6.1</td>\n",
  273. " <td>34.0</td>\n",
  274. " <td>1995</td>\n",
  275. " </tr>\n",
  276. " <tr>\n",
  277. " <th>4</th>\n",
  278. " <td>Father of the Bride Part II</td>\n",
  279. " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
  280. " <td>106.0</td>\n",
  281. " <td>5.7</td>\n",
  282. " <td>173.0</td>\n",
  283. " <td>1995</td>\n",
  284. " </tr>\n",
  285. " </tbody>\n",
  286. "</table>\n",
  287. "</div>"
  288. ],
  289. "text/plain": [
  290. " title \\\n",
  291. "0 Toy Story \n",
  292. "1 Jumanji \n",
  293. "2 Grumpier Old Men \n",
  294. "3 Waiting to Exhale \n",
  295. "4 Father of the Bride Part II \n",
  296. "\n",
  297. " genres runtime vote_average \\\n",
  298. "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 81.0 7.7 \n",
  299. "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 104.0 6.9 \n",
  300. "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 101.0 6.5 \n",
  301. "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 127.0 6.1 \n",
  302. "4 [{'id': 35, 'name': 'Comedy'}] 106.0 5.7 \n",
  303. "\n",
  304. " vote_count year \n",
  305. "0 5415.0 1995 \n",
  306. "1 2413.0 1995 \n",
  307. "2 92.0 1995 \n",
  308. "3 34.0 1995 \n",
  309. "4 173.0 1995 "
  310. ]
  311. },
  312. "execution_count": 6,
  313. "metadata": {},
  314. "output_type": "execute_result"
  315. }
  316. ],
  317. "source": [
  318. "#Drop the release_date column\n",
  319. "df = df.drop('release_date', axis=1)\n",
  320. "\n",
  321. "#Display the dataframe\n",
  322. "df.head()"
  323. ]
  324. },
  325. {
  326. "cell_type": "code",
  327. "execution_count": 7,
  328. "metadata": {},
  329. "outputs": [
  330. {
  331. "data": {
  332. "text/plain": [
  333. "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
  334. ]
  335. },
  336. "execution_count": 7,
  337. "metadata": {},
  338. "output_type": "execute_result"
  339. }
  340. ],
  341. "source": [
  342. "#Print genres of the first movie\n",
  343. "df.iloc[0]['genres']"
  344. ]
  345. },
  346. {
  347. "cell_type": "code",
  348. "execution_count": 8,
  349. "metadata": {},
  350. "outputs": [
  351. {
  352. "name": "stdout",
  353. "output_type": "stream",
  354. "text": [
  355. "<class 'str'>\n",
  356. "<class 'list'>\n"
  357. ]
  358. }
  359. ],
  360. "source": [
  361. "#Import the literal_eval function from ast\n",
  362. "from ast import literal_eval\n",
  363. "\n",
  364. "#Define a stringified list and output its type\n",
  365. "a = \"[1,2,3]\"\n",
  366. "print(type(a))\n",
  367. "\n",
  368. "#Apply literal_eval and output type\n",
  369. "b = literal_eval(a)\n",
  370. "print(type(b))"
  371. ]
  372. },
  373. {
  374. "cell_type": "code",
  375. "execution_count": 9,
  376. "metadata": {
  377. "collapsed": true
  378. },
  379. "outputs": [],
  380. "source": [
  381. "#Convert all NaN into stringified empty lists\n",
  382. "df['genres'] = df['genres'].fillna('[]')\n",
  383. "\n",
  384. "#Apply literal_eval to convert the stringified lists into list objects\n",
  385. "df['genres'] = df['genres'].apply(literal_eval)\n",
  386. "\n",
  387. "#Convert list of dictionaries to a list of strings\n",
  388. "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
  389. ]
  390. },
  391. {
  392. "cell_type": "code",
  393. "execution_count": 10,
  394. "metadata": {},
  395. "outputs": [
  396. {
  397. "data": {
  398. "text/html": [
  399. "<div>\n",
  400. "<style>\n",
  401. " .dataframe thead tr:only-child th {\n",
  402. " text-align: right;\n",
  403. " }\n",
  404. "\n",
  405. " .dataframe thead th {\n",
  406. " text-align: left;\n",
  407. " }\n",
  408. "\n",
  409. " .dataframe tbody tr th {\n",
  410. " vertical-align: top;\n",
  411. " }\n",
  412. "</style>\n",
  413. "<table border=\"1\" class=\"dataframe\">\n",
  414. " <thead>\n",
  415. " <tr style=\"text-align: right;\">\n",
  416. " <th></th>\n",
  417. " <th>title</th>\n",
  418. " <th>genres</th>\n",
  419. " <th>runtime</th>\n",
  420. " <th>vote_average</th>\n",
  421. " <th>vote_count</th>\n",
  422. " <th>year</th>\n",
  423. " </tr>\n",
  424. " </thead>\n",
  425. " <tbody>\n",
  426. " <tr>\n",
  427. " <th>0</th>\n",
  428. " <td>Toy Story</td>\n",
  429. " <td>[animation, comedy, family]</td>\n",
  430. " <td>81.0</td>\n",
  431. " <td>7.7</td>\n",
  432. " <td>5415.0</td>\n",
  433. " <td>1995</td>\n",
  434. " </tr>\n",
  435. " <tr>\n",
  436. " <th>1</th>\n",
  437. " <td>Jumanji</td>\n",
  438. " <td>[adventure, fantasy, family]</td>\n",
  439. " <td>104.0</td>\n",
  440. " <td>6.9</td>\n",
  441. " <td>2413.0</td>\n",
  442. " <td>1995</td>\n",
  443. " </tr>\n",
  444. " <tr>\n",
  445. " <th>2</th>\n",
  446. " <td>Grumpier Old Men</td>\n",
  447. " <td>[romance, comedy]</td>\n",
  448. " <td>101.0</td>\n",
  449. " <td>6.5</td>\n",
  450. " <td>92.0</td>\n",
  451. " <td>1995</td>\n",
  452. " </tr>\n",
  453. " <tr>\n",
  454. " <th>3</th>\n",
  455. " <td>Waiting to Exhale</td>\n",
  456. " <td>[comedy, drama, romance]</td>\n",
  457. " <td>127.0</td>\n",
  458. " <td>6.1</td>\n",
  459. " <td>34.0</td>\n",
  460. " <td>1995</td>\n",
  461. " </tr>\n",
  462. " <tr>\n",
  463. " <th>4</th>\n",
  464. " <td>Father of the Bride Part II</td>\n",
  465. " <td>[comedy]</td>\n",
  466. " <td>106.0</td>\n",
  467. " <td>5.7</td>\n",
  468. " <td>173.0</td>\n",
  469. " <td>1995</td>\n",
  470. " </tr>\n",
  471. " </tbody>\n",
  472. "</table>\n",
  473. "</div>"
  474. ],
  475. "text/plain": [
  476. " title genres runtime \\\n",
  477. "0 Toy Story [animation, comedy, family] 81.0 \n",
  478. "1 Jumanji [adventure, fantasy, family] 104.0 \n",
  479. "2 Grumpier Old Men [romance, comedy] 101.0 \n",
  480. "3 Waiting to Exhale [comedy, drama, romance] 127.0 \n",
  481. "4 Father of the Bride Part II [comedy] 106.0 \n",
  482. "\n",
  483. " vote_average vote_count year \n",
  484. "0 7.7 5415.0 1995 \n",
  485. "1 6.9 2413.0 1995 \n",
  486. "2 6.5 92.0 1995 \n",
  487. "3 6.1 34.0 1995 \n",
  488. "4 5.7 173.0 1995 "
  489. ]
  490. },
  491. "execution_count": 10,
  492. "metadata": {},
  493. "output_type": "execute_result"
  494. }
  495. ],
  496. "source": [
  497. "df.head()"
  498. ]
  499. },
  500. {
  501. "cell_type": "code",
  502. "execution_count": 11,
  503. "metadata": {},
  504. "outputs": [
  505. {
  506. "data": {
  507. "text/html": [
  508. "<div>\n",
  509. "<style>\n",
  510. " .dataframe thead tr:only-child th {\n",
  511. " text-align: right;\n",
  512. " }\n",
  513. "\n",
  514. " .dataframe thead th {\n",
  515. " text-align: left;\n",
  516. " }\n",
  517. "\n",
  518. " .dataframe tbody tr th {\n",
  519. " vertical-align: top;\n",
  520. " }\n",
  521. "</style>\n",
  522. "<table border=\"1\" class=\"dataframe\">\n",
  523. " <thead>\n",
  524. " <tr style=\"text-align: right;\">\n",
  525. " <th></th>\n",
  526. " <th>title</th>\n",
  527. " <th>runtime</th>\n",
  528. " <th>vote_average</th>\n",
  529. " <th>vote_count</th>\n",
  530. " <th>year</th>\n",
  531. " <th>genre</th>\n",
  532. " </tr>\n",
  533. " </thead>\n",
  534. " <tbody>\n",
  535. " <tr>\n",
  536. " <th>0</th>\n",
  537. " <td>Toy Story</td>\n",
  538. " <td>81.0</td>\n",
  539. " <td>7.7</td>\n",
  540. " <td>5415.0</td>\n",
  541. " <td>1995</td>\n",
  542. " <td>animation</td>\n",
  543. " </tr>\n",
  544. " <tr>\n",
  545. " <th>0</th>\n",
  546. " <td>Toy Story</td>\n",
  547. " <td>81.0</td>\n",
  548. " <td>7.7</td>\n",
  549. " <td>5415.0</td>\n",
  550. " <td>1995</td>\n",
  551. " <td>comedy</td>\n",
  552. " </tr>\n",
  553. " <tr>\n",
  554. " <th>0</th>\n",
  555. " <td>Toy Story</td>\n",
  556. " <td>81.0</td>\n",
  557. " <td>7.7</td>\n",
  558. " <td>5415.0</td>\n",
  559. " <td>1995</td>\n",
  560. " <td>family</td>\n",
  561. " </tr>\n",
  562. " <tr>\n",
  563. " <th>1</th>\n",
  564. " <td>Jumanji</td>\n",
  565. " <td>104.0</td>\n",
  566. " <td>6.9</td>\n",
  567. " <td>2413.0</td>\n",
  568. " <td>1995</td>\n",
  569. " <td>adventure</td>\n",
  570. " </tr>\n",
  571. " <tr>\n",
  572. " <th>1</th>\n",
  573. " <td>Jumanji</td>\n",
  574. " <td>104.0</td>\n",
  575. " <td>6.9</td>\n",
  576. " <td>2413.0</td>\n",
  577. " <td>1995</td>\n",
  578. " <td>fantasy</td>\n",
  579. " </tr>\n",
  580. " </tbody>\n",
  581. "</table>\n",
  582. "</div>"
  583. ],
  584. "text/plain": [
  585. " title runtime vote_average vote_count year genre\n",
  586. "0 Toy Story 81.0 7.7 5415.0 1995 animation\n",
  587. "0 Toy Story 81.0 7.7 5415.0 1995 comedy\n",
  588. "0 Toy Story 81.0 7.7 5415.0 1995 family\n",
  589. "1 Jumanji 104.0 6.9 2413.0 1995 adventure\n",
  590. "1 Jumanji 104.0 6.9 2413.0 1995 fantasy"
  591. ]
  592. },
  593. "execution_count": 11,
  594. "metadata": {},
  595. "output_type": "execute_result"
  596. }
  597. ],
  598. "source": [
  599. "#Create a new feature by exploding genres\n",
  600. "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
  601. "\n",
  602. "#Name the new feature as 'genre'\n",
  603. "s.name = 'genre'\n",
  604. "\n",
  605. "#Create a new dataframe gen_df by dropping the old 'genres' feature and adding the new 'genre'.\n",
  606. "gen_df = df.drop('genres', axis=1).join(s)\n",
  607. "\n",
  608. "#Print the head of the new gen_df\n",
  609. "gen_df.head()"
  610. ]
  611. },
  612. {
  613. "cell_type": "code",
  614. "execution_count": 12,
  615. "metadata": {
  616. "collapsed": true
  617. },
  618. "outputs": [],
  619. "source": [
  620. "def build_chart(gen_df, percentile=0.8):\n",
  621. " #Ask for preferred genre\n",
  622. " print(\"Input preferred genre\")\n",
  623. " genre = input()\n",
  624. " \n",
  625. " #Ask for lower limit of duration\n",
  626. " print(\"Input shortest duration\")\n",
  627. " low_time = int(input())\n",
  628. " \n",
  629. " #Ask for upper limit of duration\n",
  630. " print(\"Input longest duration\")\n",
  631. " high_time = int(input())\n",
  632. " \n",
  633. " #Ask for lower limit of timeline\n",
  634. " print(\"Input earliest year\")\n",
  635. " low_year = int(input())\n",
  636. " \n",
  637. " #Ask for upper limit of timeline\n",
  638. " print(\"Input latest year\")\n",
  639. " high_year = int(input())\n",
  640. " \n",
  641. " #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
  642. " movies = gen_df.copy()\n",
  643. " \n",
  644. " #Filter based on the condition\n",
  645. " movies = movies[(movies['genre'] == genre) & \n",
  646. " (movies['runtime'] >= low_time) & \n",
  647. " (movies['runtime'] <= high_time) & \n",
  648. " (movies['year'] >= low_year) & \n",
  649. " (movies['year'] <= high_year)]\n",
  650. " \n",
  651. " #Compute the values of C and m for the filtered movies\n",
  652. " C = movies['vote_average'].mean()\n",
  653. " m = movies['vote_count'].quantile(percentile)\n",
  654. " \n",
  655. " #Only consider movies that have at least m votes. Save this in a new dataframe q_movies\n",
  656. " q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
  657. " \n",
  658. " #Calculate score using the IMDB formula\n",
  659. " q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
  660. " + (m/(m+x['vote_count']) * C)\n",
  661. " ,axis=1)\n",
  662. "\n",
  663. " #Sort movies in descending order of their scores\n",
  664. " q_movies = q_movies.sort_values('score', ascending=False)\n",
  665. " \n",
  666. " return q_movies"
  667. ]
  668. },
  669. {
  670. "cell_type": "code",
  671. "execution_count": 13,
  672. "metadata": {},
  673. "outputs": [
  674. {
  675. "name": "stdout",
  676. "output_type": "stream",
  677. "text": [
  678. "Input preferred genre\n",
  679. "action\n",
  680. "Input shortest duration\n",
  681. "80\n",
  682. "Input longest duration\n",
  683. "120\n",
  684. "Input earliest year\n",
  685. "1990\n",
  686. "Input latest year\n",
  687. "2000\n"
  688. ]
  689. },
  690. {
  691. "data": {
  692. "text/html": [
  693. "<div>\n",
  694. "<style>\n",
  695. " .dataframe thead tr:only-child th {\n",
  696. " text-align: right;\n",
  697. " }\n",
  698. "\n",
  699. " .dataframe thead th {\n",
  700. " text-align: left;\n",
  701. " }\n",
  702. "\n",
  703. " .dataframe tbody tr th {\n",
  704. " vertical-align: top;\n",
  705. " }\n",
  706. "</style>\n",
  707. "<table border=\"1\" class=\"dataframe\">\n",
  708. " <thead>\n",
  709. " <tr style=\"text-align: right;\">\n",
  710. " <th></th>\n",
  711. " <th>title</th>\n",
  712. " <th>runtime</th>\n",
  713. " <th>vote_average</th>\n",
  714. " <th>vote_count</th>\n",
  715. " <th>year</th>\n",
  716. " <th>genre</th>\n",
  717. " <th>score</th>\n",
  718. " </tr>\n",
  719. " </thead>\n",
  720. " <tbody>\n",
  721. " <tr>\n",
  722. " <th>723</th>\n",
  723. " <td>Ghost in the Shell</td>\n",
  724. " <td>83.0</td>\n",
  725. " <td>7.8</td>\n",
  726. " <td>854.0</td>\n",
  727. " <td>1995</td>\n",
  728. " <td>action</td>\n",
  729. " <td>7.521643</td>\n",
  730. " </tr>\n",
  731. " <tr>\n",
  732. " <th>550</th>\n",
  733. " <td>True Romance</td>\n",
  734. " <td>120.0</td>\n",
  735. " <td>7.5</td>\n",
  736. " <td>762.0</td>\n",
  737. " <td>1993</td>\n",
  738. " <td>action</td>\n",
  739. " <td>7.231980</td>\n",
  740. " </tr>\n",
  741. " <tr>\n",
  742. " <th>3902</th>\n",
  743. " <td>O Brother, Where Art Thou?</td>\n",
  744. " <td>106.0</td>\n",
  745. " <td>7.3</td>\n",
  746. " <td>1144.0</td>\n",
  747. " <td>2000</td>\n",
  748. " <td>action</td>\n",
  749. " <td>7.131617</td>\n",
  750. " </tr>\n",
  751. " <tr>\n",
  752. " <th>348</th>\n",
  753. " <td>The Crow</td>\n",
  754. " <td>102.0</td>\n",
  755. " <td>7.3</td>\n",
  756. " <td>980.0</td>\n",
  757. " <td>1994</td>\n",
  758. " <td>action</td>\n",
  759. " <td>7.106412</td>\n",
  760. " </tr>\n",
  761. " <tr>\n",
  762. " <th>3871</th>\n",
  763. " <td>Crouching Tiger, Hidden Dragon</td>\n",
  764. " <td>120.0</td>\n",
  765. " <td>7.2</td>\n",
  766. " <td>949.0</td>\n",
  767. " <td>2000</td>\n",
  768. " <td>action</td>\n",
  769. " <td>7.011634</td>\n",
  770. " </tr>\n",
  771. " </tbody>\n",
  772. "</table>\n",
  773. "</div>"
  774. ],
  775. "text/plain": [
  776. " title runtime vote_average vote_count year \\\n",
  777. "723 Ghost in the Shell 83.0 7.8 854.0 1995 \n",
  778. "550 True Romance 120.0 7.5 762.0 1993 \n",
  779. "3902 O Brother, Where Art Thou? 106.0 7.3 1144.0 2000 \n",
  780. "348 The Crow 102.0 7.3 980.0 1994 \n",
  781. "3871 Crouching Tiger, Hidden Dragon 120.0 7.2 949.0 2000 \n",
  782. "\n",
  783. " genre score \n",
  784. "723 action 7.521643 \n",
  785. "550 action 7.231980 \n",
  786. "3902 action 7.131617 \n",
  787. "348 action 7.106412 \n",
  788. "3871 action 7.011634 "
  789. ]
  790. },
  791. "execution_count": 13,
  792. "metadata": {},
  793. "output_type": "execute_result"
  794. }
  795. ],
  796. "source": [
  797. "#Generate the chart for the genre and ranges entered at the prompts and display the top 5.\n",
  798. "build_chart(gen_df).head()"
  799. ]
  800. },
  801. {
  802. "cell_type": "code",
  803. "execution_count": null,
  804. "metadata": {},
  805. "outputs": [],
  806. "source": []
  807. }
  808. ],
  809. "metadata": {
  810. "kernelspec": {
  811. "display_name": "Python 3",
  812. "language": "python",
  813. "name": "python3"
  814. },
  815. "language_info": {
  816. "codemirror_mode": {
  817. "name": "ipython",
  818. "version": 3
  819. },
  820. "file_extension": ".py",
  821. "mimetype": "text/x-python",
  822. "name": "python",
  823. "nbconvert_exporter": "python",
  824. "pygments_lexer": "ipython3",
  825. "version": "3.6.0"
  826. }
  827. },
  828. "nbformat": 4,
  829. "nbformat_minor": 2
  830. }