Browse Source

first commit

tum 3 years ago
commit
8788366aea

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
1
+data

File diff suppressed because it is too large
+ 4789 - 0
.ipynb_checkpoints/contentbase-checkpoint.ipynb


File diff suppressed because it is too large
+ 34807 - 0
.ipynb_checkpoints/mongoconnect-checkpoint.ipynb


+ 6 - 0
Chapter2/.ipynb_checkpoints/Chapter2-checkpoint.ipynb

@@ -0,0 +1,6 @@
1
+{
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 2
6
+}

File diff suppressed because it is too large
+ 1873 - 0
Chapter2/Chapter2.ipynb


+ 830 - 0
Chapter3/.ipynb_checkpoints/Knowledge Recommender-checkpoint.ipynb

@@ -0,0 +1,830 @@
1
+{
2
+ "cells": [
3
+  {
4
+   "cell_type": "code",
5
+   "execution_count": 1,
6
+   "metadata": {},
7
+   "outputs": [
8
+    {
9
+     "name": "stderr",
10
+     "output_type": "stream",
11
+     "text": [
12
+      "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13
+      "  interactivity=interactivity, compiler=compiler, result=result)\n"
14
+     ]
15
+    },
16
+    {
17
+     "data": {
18
+      "text/plain": [
19
+       "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
20
+       "       'imdb_id', 'original_language', 'original_title', 'overview',\n",
21
+       "       'popularity', 'poster_path', 'production_companies',\n",
22
+       "       'production_countries', 'release_date', 'revenue', 'runtime',\n",
23
+       "       'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
24
+       "       'vote_average', 'vote_count'],\n",
25
+       "      dtype='object')"
26
+      ]
27
+     },
28
+     "execution_count": 1,
29
+     "metadata": {},
30
+     "output_type": "execute_result"
31
+    }
32
+   ],
33
+   "source": [
34
+    "import pandas as pd\n",
35
+    "import numpy as np\n",
36
+    "\n",
37
+    "df = pd.read_csv('../data/movies_metadata.csv')\n",
38
+    "\n",
39
+    "#Print all the features (or columns) of the DataFrame\n",
40
+    "df.columns"
41
+   ]
42
+  },
43
+  {
44
+   "cell_type": "code",
45
+   "execution_count": 2,
46
+   "metadata": {},
47
+   "outputs": [
48
+    {
49
+     "data": {
50
+      "text/html": [
51
+       "<div>\n",
52
+       "<style>\n",
53
+       "    .dataframe thead tr:only-child th {\n",
54
+       "        text-align: right;\n",
55
+       "    }\n",
56
+       "\n",
57
+       "    .dataframe thead th {\n",
58
+       "        text-align: left;\n",
59
+       "    }\n",
60
+       "\n",
61
+       "    .dataframe tbody tr th {\n",
62
+       "        vertical-align: top;\n",
63
+       "    }\n",
64
+       "</style>\n",
65
+       "<table border=\"1\" class=\"dataframe\">\n",
66
+       "  <thead>\n",
67
+       "    <tr style=\"text-align: right;\">\n",
68
+       "      <th></th>\n",
69
+       "      <th>title</th>\n",
70
+       "      <th>genres</th>\n",
71
+       "      <th>release_date</th>\n",
72
+       "      <th>runtime</th>\n",
73
+       "      <th>vote_average</th>\n",
74
+       "      <th>vote_count</th>\n",
75
+       "    </tr>\n",
76
+       "  </thead>\n",
77
+       "  <tbody>\n",
78
+       "    <tr>\n",
79
+       "      <th>0</th>\n",
80
+       "      <td>Toy Story</td>\n",
81
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
82
+       "      <td>1995-10-30</td>\n",
83
+       "      <td>81.0</td>\n",
84
+       "      <td>7.7</td>\n",
85
+       "      <td>5415.0</td>\n",
86
+       "    </tr>\n",
87
+       "    <tr>\n",
88
+       "      <th>1</th>\n",
89
+       "      <td>Jumanji</td>\n",
90
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
91
+       "      <td>1995-12-15</td>\n",
92
+       "      <td>104.0</td>\n",
93
+       "      <td>6.9</td>\n",
94
+       "      <td>2413.0</td>\n",
95
+       "    </tr>\n",
96
+       "    <tr>\n",
97
+       "      <th>2</th>\n",
98
+       "      <td>Grumpier Old Men</td>\n",
99
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
100
+       "      <td>1995-12-22</td>\n",
101
+       "      <td>101.0</td>\n",
102
+       "      <td>6.5</td>\n",
103
+       "      <td>92.0</td>\n",
104
+       "    </tr>\n",
105
+       "    <tr>\n",
106
+       "      <th>3</th>\n",
107
+       "      <td>Waiting to Exhale</td>\n",
108
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
109
+       "      <td>1995-12-22</td>\n",
110
+       "      <td>127.0</td>\n",
111
+       "      <td>6.1</td>\n",
112
+       "      <td>34.0</td>\n",
113
+       "    </tr>\n",
114
+       "    <tr>\n",
115
+       "      <th>4</th>\n",
116
+       "      <td>Father of the Bride Part II</td>\n",
117
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
118
+       "      <td>1995-02-10</td>\n",
119
+       "      <td>106.0</td>\n",
120
+       "      <td>5.7</td>\n",
121
+       "      <td>173.0</td>\n",
122
+       "    </tr>\n",
123
+       "  </tbody>\n",
124
+       "</table>\n",
125
+       "</div>"
126
+      ],
127
+      "text/plain": [
128
+       "                         title  \\\n",
129
+       "0                    Toy Story   \n",
130
+       "1                      Jumanji   \n",
131
+       "2             Grumpier Old Men   \n",
132
+       "3            Waiting to Exhale   \n",
133
+       "4  Father of the Bride Part II   \n",
134
+       "\n",
135
+       "                                              genres release_date  runtime  \\\n",
136
+       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   1995-10-30     81.0   \n",
137
+       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   1995-12-15    104.0   \n",
138
+       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   1995-12-22    101.0   \n",
139
+       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   1995-12-22    127.0   \n",
140
+       "4                     [{'id': 35, 'name': 'Comedy'}]   1995-02-10    106.0   \n",
141
+       "\n",
142
+       "   vote_average  vote_count  \n",
143
+       "0           7.7      5415.0  \n",
144
+       "1           6.9      2413.0  \n",
145
+       "2           6.5        92.0  \n",
146
+       "3           6.1        34.0  \n",
147
+       "4           5.7       173.0  "
148
+      ]
149
+     },
150
+     "execution_count": 2,
151
+     "metadata": {},
152
+     "output_type": "execute_result"
153
+    }
154
+   ],
155
+   "source": [
156
+    "#Only keep those features that we require \n",
157
+    "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
158
+    "\n",
159
+    "df.head()"
160
+   ]
161
+  },
162
+  {
163
+   "cell_type": "code",
164
+   "execution_count": 3,
165
+   "metadata": {
166
+    "collapsed": true
167
+   },
168
+   "outputs": [],
169
+   "source": [
170
+    "#Convert release_date into pandas datetime format\n",
171
+    "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
172
+    "\n",
173
+    "#Extract year from the datetime\n",
174
+    "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
175
+   ]
176
+  },
177
+  {
178
+   "cell_type": "code",
179
+   "execution_count": 4,
180
+   "metadata": {
181
+    "collapsed": true
182
+   },
183
+   "outputs": [],
184
+   "source": [
185
+    "#Helper function to convert NaT to 0 and all other years to integers.\n",
186
+    "def convert_int(x):\n",
187
+    "    try:\n",
188
+    "        return int(x)\n",
189
+    "    except:\n",
190
+    "        return 0"
191
+   ]
192
+  },
193
+  {
194
+   "cell_type": "code",
195
+   "execution_count": 5,
196
+   "metadata": {
197
+    "collapsed": true
198
+   },
199
+   "outputs": [],
200
+   "source": [
201
+    "#Apply convert_int to the year feature\n",
202
+    "df['year'] = df['year'].apply(convert_int)"
203
+   ]
204
+  },
205
+  {
206
+   "cell_type": "code",
207
+   "execution_count": 6,
208
+   "metadata": {},
209
+   "outputs": [
210
+    {
211
+     "data": {
212
+      "text/html": [
213
+       "<div>\n",
214
+       "<style>\n",
215
+       "    .dataframe thead tr:only-child th {\n",
216
+       "        text-align: right;\n",
217
+       "    }\n",
218
+       "\n",
219
+       "    .dataframe thead th {\n",
220
+       "        text-align: left;\n",
221
+       "    }\n",
222
+       "\n",
223
+       "    .dataframe tbody tr th {\n",
224
+       "        vertical-align: top;\n",
225
+       "    }\n",
226
+       "</style>\n",
227
+       "<table border=\"1\" class=\"dataframe\">\n",
228
+       "  <thead>\n",
229
+       "    <tr style=\"text-align: right;\">\n",
230
+       "      <th></th>\n",
231
+       "      <th>title</th>\n",
232
+       "      <th>genres</th>\n",
233
+       "      <th>runtime</th>\n",
234
+       "      <th>vote_average</th>\n",
235
+       "      <th>vote_count</th>\n",
236
+       "      <th>year</th>\n",
237
+       "    </tr>\n",
238
+       "  </thead>\n",
239
+       "  <tbody>\n",
240
+       "    <tr>\n",
241
+       "      <th>0</th>\n",
242
+       "      <td>Toy Story</td>\n",
243
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
244
+       "      <td>81.0</td>\n",
245
+       "      <td>7.7</td>\n",
246
+       "      <td>5415.0</td>\n",
247
+       "      <td>1995</td>\n",
248
+       "    </tr>\n",
249
+       "    <tr>\n",
250
+       "      <th>1</th>\n",
251
+       "      <td>Jumanji</td>\n",
252
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
253
+       "      <td>104.0</td>\n",
254
+       "      <td>6.9</td>\n",
255
+       "      <td>2413.0</td>\n",
256
+       "      <td>1995</td>\n",
257
+       "    </tr>\n",
258
+       "    <tr>\n",
259
+       "      <th>2</th>\n",
260
+       "      <td>Grumpier Old Men</td>\n",
261
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
262
+       "      <td>101.0</td>\n",
263
+       "      <td>6.5</td>\n",
264
+       "      <td>92.0</td>\n",
265
+       "      <td>1995</td>\n",
266
+       "    </tr>\n",
267
+       "    <tr>\n",
268
+       "      <th>3</th>\n",
269
+       "      <td>Waiting to Exhale</td>\n",
270
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
271
+       "      <td>127.0</td>\n",
272
+       "      <td>6.1</td>\n",
273
+       "      <td>34.0</td>\n",
274
+       "      <td>1995</td>\n",
275
+       "    </tr>\n",
276
+       "    <tr>\n",
277
+       "      <th>4</th>\n",
278
+       "      <td>Father of the Bride Part II</td>\n",
279
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
280
+       "      <td>106.0</td>\n",
281
+       "      <td>5.7</td>\n",
282
+       "      <td>173.0</td>\n",
283
+       "      <td>1995</td>\n",
284
+       "    </tr>\n",
285
+       "  </tbody>\n",
286
+       "</table>\n",
287
+       "</div>"
288
+      ],
289
+      "text/plain": [
290
+       "                         title  \\\n",
291
+       "0                    Toy Story   \n",
292
+       "1                      Jumanji   \n",
293
+       "2             Grumpier Old Men   \n",
294
+       "3            Waiting to Exhale   \n",
295
+       "4  Father of the Bride Part II   \n",
296
+       "\n",
297
+       "                                              genres  runtime  vote_average  \\\n",
298
+       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...     81.0           7.7   \n",
299
+       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...    104.0           6.9   \n",
300
+       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...    101.0           6.5   \n",
301
+       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    127.0           6.1   \n",
302
+       "4                     [{'id': 35, 'name': 'Comedy'}]    106.0           5.7   \n",
303
+       "\n",
304
+       "   vote_count  year  \n",
305
+       "0      5415.0  1995  \n",
306
+       "1      2413.0  1995  \n",
307
+       "2        92.0  1995  \n",
308
+       "3        34.0  1995  \n",
309
+       "4       173.0  1995  "
310
+      ]
311
+     },
312
+     "execution_count": 6,
313
+     "metadata": {},
314
+     "output_type": "execute_result"
315
+    }
316
+   ],
317
+   "source": [
318
+    "#Drop the release_date column\n",
319
+    "df = df.drop('release_date', axis=1)\n",
320
+    "\n",
321
+    "#Display the dataframe\n",
322
+    "df.head()"
323
+   ]
324
+  },
325
+  {
326
+   "cell_type": "code",
327
+   "execution_count": 7,
328
+   "metadata": {},
329
+   "outputs": [
330
+    {
331
+     "data": {
332
+      "text/plain": [
333
+       "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
334
+      ]
335
+     },
336
+     "execution_count": 7,
337
+     "metadata": {},
338
+     "output_type": "execute_result"
339
+    }
340
+   ],
341
+   "source": [
342
+    "#Print genres of the first movie\n",
343
+    "df.iloc[0]['genres']"
344
+   ]
345
+  },
346
+  {
347
+   "cell_type": "code",
348
+   "execution_count": 8,
349
+   "metadata": {},
350
+   "outputs": [
351
+    {
352
+     "name": "stdout",
353
+     "output_type": "stream",
354
+     "text": [
355
+      "<class 'str'>\n",
356
+      "<class 'list'>\n"
357
+     ]
358
+    }
359
+   ],
360
+   "source": [
361
+    "#Import the literal_eval function from ast\n",
362
+    "from ast import literal_eval\n",
363
+    "\n",
364
+    "#Define a stringified list and output its type\n",
365
+    "a = \"[1,2,3]\"\n",
366
+    "print(type(a))\n",
367
+    "\n",
368
+    "#Apply literal_eval and output type\n",
369
+    "b = literal_eval(a)\n",
370
+    "print(type(b))"
371
+   ]
372
+  },
373
+  {
374
+   "cell_type": "code",
375
+   "execution_count": 9,
376
+   "metadata": {
377
+    "collapsed": true
378
+   },
379
+   "outputs": [],
380
+   "source": [
381
+    "#Convert all NaN into stringified empty lists\n",
382
+    "df['genres'] = df['genres'].fillna('[]')\n",
383
+    "\n",
384
+    "#Apply literal_eval to convert each stringified list into a list object\n",
385
+    "df['genres'] = df['genres'].apply(literal_eval)\n",
386
+    "\n",
387
+    "#Convert list of dictionaries to a list of strings\n",
388
+    "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
389
+   ]
390
+  },
391
+  {
392
+   "cell_type": "code",
393
+   "execution_count": 10,
394
+   "metadata": {},
395
+   "outputs": [
396
+    {
397
+     "data": {
398
+      "text/html": [
399
+       "<div>\n",
400
+       "<style>\n",
401
+       "    .dataframe thead tr:only-child th {\n",
402
+       "        text-align: right;\n",
403
+       "    }\n",
404
+       "\n",
405
+       "    .dataframe thead th {\n",
406
+       "        text-align: left;\n",
407
+       "    }\n",
408
+       "\n",
409
+       "    .dataframe tbody tr th {\n",
410
+       "        vertical-align: top;\n",
411
+       "    }\n",
412
+       "</style>\n",
413
+       "<table border=\"1\" class=\"dataframe\">\n",
414
+       "  <thead>\n",
415
+       "    <tr style=\"text-align: right;\">\n",
416
+       "      <th></th>\n",
417
+       "      <th>title</th>\n",
418
+       "      <th>genres</th>\n",
419
+       "      <th>runtime</th>\n",
420
+       "      <th>vote_average</th>\n",
421
+       "      <th>vote_count</th>\n",
422
+       "      <th>year</th>\n",
423
+       "    </tr>\n",
424
+       "  </thead>\n",
425
+       "  <tbody>\n",
426
+       "    <tr>\n",
427
+       "      <th>0</th>\n",
428
+       "      <td>Toy Story</td>\n",
429
+       "      <td>[animation, comedy, family]</td>\n",
430
+       "      <td>81.0</td>\n",
431
+       "      <td>7.7</td>\n",
432
+       "      <td>5415.0</td>\n",
433
+       "      <td>1995</td>\n",
434
+       "    </tr>\n",
435
+       "    <tr>\n",
436
+       "      <th>1</th>\n",
437
+       "      <td>Jumanji</td>\n",
438
+       "      <td>[adventure, fantasy, family]</td>\n",
439
+       "      <td>104.0</td>\n",
440
+       "      <td>6.9</td>\n",
441
+       "      <td>2413.0</td>\n",
442
+       "      <td>1995</td>\n",
443
+       "    </tr>\n",
444
+       "    <tr>\n",
445
+       "      <th>2</th>\n",
446
+       "      <td>Grumpier Old Men</td>\n",
447
+       "      <td>[romance, comedy]</td>\n",
448
+       "      <td>101.0</td>\n",
449
+       "      <td>6.5</td>\n",
450
+       "      <td>92.0</td>\n",
451
+       "      <td>1995</td>\n",
452
+       "    </tr>\n",
453
+       "    <tr>\n",
454
+       "      <th>3</th>\n",
455
+       "      <td>Waiting to Exhale</td>\n",
456
+       "      <td>[comedy, drama, romance]</td>\n",
457
+       "      <td>127.0</td>\n",
458
+       "      <td>6.1</td>\n",
459
+       "      <td>34.0</td>\n",
460
+       "      <td>1995</td>\n",
461
+       "    </tr>\n",
462
+       "    <tr>\n",
463
+       "      <th>4</th>\n",
464
+       "      <td>Father of the Bride Part II</td>\n",
465
+       "      <td>[comedy]</td>\n",
466
+       "      <td>106.0</td>\n",
467
+       "      <td>5.7</td>\n",
468
+       "      <td>173.0</td>\n",
469
+       "      <td>1995</td>\n",
470
+       "    </tr>\n",
471
+       "  </tbody>\n",
472
+       "</table>\n",
473
+       "</div>"
474
+      ],
475
+      "text/plain": [
476
+       "                         title                        genres  runtime  \\\n",
477
+       "0                    Toy Story   [animation, comedy, family]     81.0   \n",
478
+       "1                      Jumanji  [adventure, fantasy, family]    104.0   \n",
479
+       "2             Grumpier Old Men             [romance, comedy]    101.0   \n",
480
+       "3            Waiting to Exhale      [comedy, drama, romance]    127.0   \n",
481
+       "4  Father of the Bride Part II                      [comedy]    106.0   \n",
482
+       "\n",
483
+       "   vote_average  vote_count  year  \n",
484
+       "0           7.7      5415.0  1995  \n",
485
+       "1           6.9      2413.0  1995  \n",
486
+       "2           6.5        92.0  1995  \n",
487
+       "3           6.1        34.0  1995  \n",
488
+       "4           5.7       173.0  1995  "
489
+      ]
490
+     },
491
+     "execution_count": 10,
492
+     "metadata": {},
493
+     "output_type": "execute_result"
494
+    }
495
+   ],
496
+   "source": [
497
+    "df.head()"
498
+   ]
499
+  },
500
+  {
501
+   "cell_type": "code",
502
+   "execution_count": 11,
503
+   "metadata": {},
504
+   "outputs": [
505
+    {
506
+     "data": {
507
+      "text/html": [
508
+       "<div>\n",
509
+       "<style>\n",
510
+       "    .dataframe thead tr:only-child th {\n",
511
+       "        text-align: right;\n",
512
+       "    }\n",
513
+       "\n",
514
+       "    .dataframe thead th {\n",
515
+       "        text-align: left;\n",
516
+       "    }\n",
517
+       "\n",
518
+       "    .dataframe tbody tr th {\n",
519
+       "        vertical-align: top;\n",
520
+       "    }\n",
521
+       "</style>\n",
522
+       "<table border=\"1\" class=\"dataframe\">\n",
523
+       "  <thead>\n",
524
+       "    <tr style=\"text-align: right;\">\n",
525
+       "      <th></th>\n",
526
+       "      <th>title</th>\n",
527
+       "      <th>runtime</th>\n",
528
+       "      <th>vote_average</th>\n",
529
+       "      <th>vote_count</th>\n",
530
+       "      <th>year</th>\n",
531
+       "      <th>genre</th>\n",
532
+       "    </tr>\n",
533
+       "  </thead>\n",
534
+       "  <tbody>\n",
535
+       "    <tr>\n",
536
+       "      <th>0</th>\n",
537
+       "      <td>Toy Story</td>\n",
538
+       "      <td>81.0</td>\n",
539
+       "      <td>7.7</td>\n",
540
+       "      <td>5415.0</td>\n",
541
+       "      <td>1995</td>\n",
542
+       "      <td>animation</td>\n",
543
+       "    </tr>\n",
544
+       "    <tr>\n",
545
+       "      <th>0</th>\n",
546
+       "      <td>Toy Story</td>\n",
547
+       "      <td>81.0</td>\n",
548
+       "      <td>7.7</td>\n",
549
+       "      <td>5415.0</td>\n",
550
+       "      <td>1995</td>\n",
551
+       "      <td>comedy</td>\n",
552
+       "    </tr>\n",
553
+       "    <tr>\n",
554
+       "      <th>0</th>\n",
555
+       "      <td>Toy Story</td>\n",
556
+       "      <td>81.0</td>\n",
557
+       "      <td>7.7</td>\n",
558
+       "      <td>5415.0</td>\n",
559
+       "      <td>1995</td>\n",
560
+       "      <td>family</td>\n",
561
+       "    </tr>\n",
562
+       "    <tr>\n",
563
+       "      <th>1</th>\n",
564
+       "      <td>Jumanji</td>\n",
565
+       "      <td>104.0</td>\n",
566
+       "      <td>6.9</td>\n",
567
+       "      <td>2413.0</td>\n",
568
+       "      <td>1995</td>\n",
569
+       "      <td>adventure</td>\n",
570
+       "    </tr>\n",
571
+       "    <tr>\n",
572
+       "      <th>1</th>\n",
573
+       "      <td>Jumanji</td>\n",
574
+       "      <td>104.0</td>\n",
575
+       "      <td>6.9</td>\n",
576
+       "      <td>2413.0</td>\n",
577
+       "      <td>1995</td>\n",
578
+       "      <td>fantasy</td>\n",
579
+       "    </tr>\n",
580
+       "  </tbody>\n",
581
+       "</table>\n",
582
+       "</div>"
583
+      ],
584
+      "text/plain": [
585
+       "       title  runtime  vote_average  vote_count  year      genre\n",
586
+       "0  Toy Story     81.0           7.7      5415.0  1995  animation\n",
587
+       "0  Toy Story     81.0           7.7      5415.0  1995     comedy\n",
588
+       "0  Toy Story     81.0           7.7      5415.0  1995     family\n",
589
+       "1    Jumanji    104.0           6.9      2413.0  1995  adventure\n",
590
+       "1    Jumanji    104.0           6.9      2413.0  1995    fantasy"
591
+      ]
592
+     },
593
+     "execution_count": 11,
594
+     "metadata": {},
595
+     "output_type": "execute_result"
596
+    }
597
+   ],
598
+   "source": [
599
+    "#Create a new feature by exploding genres\n",
600
+    "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
601
+    "\n",
602
+    "#Name the new feature as 'genre'\n",
603
+    "s.name = 'genre'\n",
604
+    "\n",
605
+    "#Create a new dataframe gen_df by dropping the old 'genres' feature and adding the new 'genre'.\n",
606
+    "gen_df = df.drop('genres', axis=1).join(s)\n",
607
+    "\n",
608
+    "#Print the head of the new gen_df\n",
609
+    "gen_df.head()"
610
+   ]
611
+  },
612
+  {
613
+   "cell_type": "code",
614
+   "execution_count": 12,
615
+   "metadata": {
616
+    "collapsed": true
617
+   },
618
+   "outputs": [],
619
+   "source": [
620
+    "def build_chart(gen_df, percentile=0.8):\n",
621
+    "    #Ask for preferred genres\n",
622
+    "    print(\"Input preferred genre\")\n",
623
+    "    genre = input()\n",
624
+    "    \n",
625
+    "    #Ask for lower limit of duration\n",
626
+    "    print(\"Input shortest duration\")\n",
627
+    "    low_time = int(input())\n",
628
+    "    \n",
629
+    "    #Ask for upper limit of duration\n",
630
+    "    print(\"Input longest duration\")\n",
631
+    "    high_time = int(input())\n",
632
+    "    \n",
633
+    "    #Ask for lower limit of timeline\n",
634
+    "    print(\"Input earliest year\")\n",
635
+    "    low_year = int(input())\n",
636
+    "    \n",
637
+    "    #Ask for upper limit of timeline\n",
638
+    "    print(\"Input latest year\")\n",
639
+    "    high_year = int(input())\n",
640
+    "    \n",
641
+    "    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
642
+    "    movies = gen_df.copy()\n",
643
+    "    \n",
644
+    "    #Filter based on the condition\n",
645
+    "    movies = movies[(movies['genre'] == genre) & \n",
646
+    "                    (movies['runtime'] >= low_time) & \n",
647
+    "                    (movies['runtime'] <= high_time) & \n",
648
+    "                    (movies['year'] >= low_year) & \n",
649
+    "                    (movies['year'] <= high_year)]\n",
650
+    "    \n",
651
+    "    #Compute the values of C and m for the filtered movies\n",
652
+    "    C = movies['vote_average'].mean()\n",
653
+    "    m = movies['vote_count'].quantile(percentile)\n",
654
+    "    \n",
655
+    "    #Only consider movies that have at least m votes. Save this in a new dataframe q_movies\n",
656
+    "    q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
657
+    "    \n",
658
+    "    #Calculate score using the IMDB formula\n",
659
+    "    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
660
+    "                                       + (m/(m+x['vote_count']) * C)\n",
661
+    "                                       ,axis=1)\n",
662
+    "\n",
663
+    "    #Sort movies in descending order of their scores\n",
664
+    "    q_movies = q_movies.sort_values('score', ascending=False)\n",
665
+    "    \n",
666
+    "    return q_movies"
667
+   ]
668
+  },
669
+  {
670
+   "cell_type": "code",
671
+   "execution_count": 13,
672
+   "metadata": {},
673
+   "outputs": [
674
+    {
675
+     "name": "stdout",
676
+     "output_type": "stream",
677
+     "text": [
678
+      "Input preferred genre\n",
679
+      "action\n",
680
+      "Input shortest duration\n",
681
+      "80\n",
682
+      "Input longest duration\n",
683
+      "120\n",
684
+      "Input earliest year\n",
685
+      "1990\n",
686
+      "Input latest year\n",
687
+      "2000\n"
688
+     ]
689
+    },
690
+    {
691
+     "data": {
692
+      "text/html": [
693
+       "<div>\n",
694
+       "<style>\n",
695
+       "    .dataframe thead tr:only-child th {\n",
696
+       "        text-align: right;\n",
697
+       "    }\n",
698
+       "\n",
699
+       "    .dataframe thead th {\n",
700
+       "        text-align: left;\n",
701
+       "    }\n",
702
+       "\n",
703
+       "    .dataframe tbody tr th {\n",
704
+       "        vertical-align: top;\n",
705
+       "    }\n",
706
+       "</style>\n",
707
+       "<table border=\"1\" class=\"dataframe\">\n",
708
+       "  <thead>\n",
709
+       "    <tr style=\"text-align: right;\">\n",
710
+       "      <th></th>\n",
711
+       "      <th>title</th>\n",
712
+       "      <th>runtime</th>\n",
713
+       "      <th>vote_average</th>\n",
714
+       "      <th>vote_count</th>\n",
715
+       "      <th>year</th>\n",
716
+       "      <th>genre</th>\n",
717
+       "      <th>score</th>\n",
718
+       "    </tr>\n",
719
+       "  </thead>\n",
720
+       "  <tbody>\n",
721
+       "    <tr>\n",
722
+       "      <th>723</th>\n",
723
+       "      <td>Ghost in the Shell</td>\n",
724
+       "      <td>83.0</td>\n",
725
+       "      <td>7.8</td>\n",
726
+       "      <td>854.0</td>\n",
727
+       "      <td>1995</td>\n",
728
+       "      <td>action</td>\n",
729
+       "      <td>7.521643</td>\n",
730
+       "    </tr>\n",
731
+       "    <tr>\n",
732
+       "      <th>550</th>\n",
733
+       "      <td>True Romance</td>\n",
734
+       "      <td>120.0</td>\n",
735
+       "      <td>7.5</td>\n",
736
+       "      <td>762.0</td>\n",
737
+       "      <td>1993</td>\n",
738
+       "      <td>action</td>\n",
739
+       "      <td>7.231980</td>\n",
740
+       "    </tr>\n",
741
+       "    <tr>\n",
742
+       "      <th>3902</th>\n",
743
+       "      <td>O Brother, Where Art Thou?</td>\n",
744
+       "      <td>106.0</td>\n",
745
+       "      <td>7.3</td>\n",
746
+       "      <td>1144.0</td>\n",
747
+       "      <td>2000</td>\n",
748
+       "      <td>action</td>\n",
749
+       "      <td>7.131617</td>\n",
750
+       "    </tr>\n",
751
+       "    <tr>\n",
752
+       "      <th>348</th>\n",
753
+       "      <td>The Crow</td>\n",
754
+       "      <td>102.0</td>\n",
755
+       "      <td>7.3</td>\n",
756
+       "      <td>980.0</td>\n",
757
+       "      <td>1994</td>\n",
758
+       "      <td>action</td>\n",
759
+       "      <td>7.106412</td>\n",
760
+       "    </tr>\n",
761
+       "    <tr>\n",
762
+       "      <th>3871</th>\n",
763
+       "      <td>Crouching Tiger, Hidden Dragon</td>\n",
764
+       "      <td>120.0</td>\n",
765
+       "      <td>7.2</td>\n",
766
+       "      <td>949.0</td>\n",
767
+       "      <td>2000</td>\n",
768
+       "      <td>action</td>\n",
769
+       "      <td>7.011634</td>\n",
770
+       "    </tr>\n",
771
+       "  </tbody>\n",
772
+       "</table>\n",
773
+       "</div>"
774
+      ],
775
+      "text/plain": [
776
+       "                               title  runtime  vote_average  vote_count  year  \\\n",
777
+       "723               Ghost in the Shell     83.0           7.8       854.0  1995   \n",
778
+       "550                     True Romance    120.0           7.5       762.0  1993   \n",
779
+       "3902      O Brother, Where Art Thou?    106.0           7.3      1144.0  2000   \n",
780
+       "348                         The Crow    102.0           7.3       980.0  1994   \n",
781
+       "3871  Crouching Tiger, Hidden Dragon    120.0           7.2       949.0  2000   \n",
782
+       "\n",
783
+       "       genre     score  \n",
784
+       "723   action  7.521643  \n",
785
+       "550   action  7.231980  \n",
786
+       "3902  action  7.131617  \n",
787
+       "348   action  7.106412  \n",
788
+       "3871  action  7.011634  "
789
+      ]
790
+     },
791
+     "execution_count": 13,
792
+     "metadata": {},
793
+     "output_type": "execute_result"
794
+    }
795
+   ],
796
+   "source": [
797
+    "#Generate the chart for the genre entered at the prompt and display the top 5.\n",
798
+    "build_chart(gen_df).head()"
799
+   ]
800
+  },
801
+  {
802
+   "cell_type": "code",
803
+   "execution_count": null,
804
+   "metadata": {},
805
+   "outputs": [],
806
+   "source": []
807
+  }
808
+ ],
809
+ "metadata": {
810
+  "kernelspec": {
811
+   "display_name": "Python 3",
812
+   "language": "python",
813
+   "name": "python3"
814
+  },
815
+  "language_info": {
816
+   "codemirror_mode": {
817
+    "name": "ipython",
818
+    "version": 3
819
+   },
820
+   "file_extension": ".py",
821
+   "mimetype": "text/x-python",
822
+   "name": "python",
823
+   "nbconvert_exporter": "python",
824
+   "pygments_lexer": "ipython3",
825
+   "version": "3.6.0"
826
+  }
827
+ },
828
+ "nbformat": 4,
829
+ "nbformat_minor": 2
830
+}

+ 6 - 0
Chapter3/.ipynb_checkpoints/Simple Recommender-checkpoint.ipynb

@@ -0,0 +1,6 @@
1
+{
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 2
6
+}

+ 841 - 0
Chapter3/Knowledge Recommender.ipynb

@@ -0,0 +1,841 @@
1
+{
2
+ "cells": [
3
+  {
4
+   "cell_type": "code",
5
+   "execution_count": 1,
6
+   "metadata": {},
7
+   "outputs": [
8
+    {
9
+     "name": "stderr",
10
+     "output_type": "stream",
11
+     "text": [
12
+      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/2038691245.py:4: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13
+      "  df = pd.read_csv('../data/movies_metadata.csv')\n"
14
+     ]
15
+    },
16
+    {
17
+     "data": {
18
+      "text/plain": [
19
+       "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
20
+       "       'imdb_id', 'original_language', 'original_title', 'overview',\n",
21
+       "       'popularity', 'poster_path', 'production_companies',\n",
22
+       "       'production_countries', 'release_date', 'revenue', 'runtime',\n",
23
+       "       'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
24
+       "       'vote_average', 'vote_count'],\n",
25
+       "      dtype='object')"
26
+      ]
27
+     },
28
+     "execution_count": 1,
29
+     "metadata": {},
30
+     "output_type": "execute_result"
31
+    }
32
+   ],
33
+   "source": [
34
+    "import pandas as pd\n",
35
+    "import numpy as np\n",
36
+    "\n",
37
+    "df = pd.read_csv('../data/movies_metadata.csv')\n",
38
+    "\n",
39
+    "#Print all the features (or columns) of the DataFrame\n",
40
+    "df.columns"
41
+   ]
42
+  },
43
+  {
44
+   "cell_type": "code",
45
+   "execution_count": 2,
46
+   "metadata": {},
47
+   "outputs": [
48
+    {
49
+     "data": {
50
+      "text/html": [
51
+       "<div>\n",
52
+       "<style scoped>\n",
53
+       "    .dataframe tbody tr th:only-of-type {\n",
54
+       "        vertical-align: middle;\n",
55
+       "    }\n",
56
+       "\n",
57
+       "    .dataframe tbody tr th {\n",
58
+       "        vertical-align: top;\n",
59
+       "    }\n",
60
+       "\n",
61
+       "    .dataframe thead th {\n",
62
+       "        text-align: right;\n",
63
+       "    }\n",
64
+       "</style>\n",
65
+       "<table border=\"1\" class=\"dataframe\">\n",
66
+       "  <thead>\n",
67
+       "    <tr style=\"text-align: right;\">\n",
68
+       "      <th></th>\n",
69
+       "      <th>title</th>\n",
70
+       "      <th>genres</th>\n",
71
+       "      <th>release_date</th>\n",
72
+       "      <th>runtime</th>\n",
73
+       "      <th>vote_average</th>\n",
74
+       "      <th>vote_count</th>\n",
75
+       "    </tr>\n",
76
+       "  </thead>\n",
77
+       "  <tbody>\n",
78
+       "    <tr>\n",
79
+       "      <th>0</th>\n",
80
+       "      <td>Toy Story</td>\n",
81
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
82
+       "      <td>1995-10-30</td>\n",
83
+       "      <td>81.0</td>\n",
84
+       "      <td>7.7</td>\n",
85
+       "      <td>5415.0</td>\n",
86
+       "    </tr>\n",
87
+       "    <tr>\n",
88
+       "      <th>1</th>\n",
89
+       "      <td>Jumanji</td>\n",
90
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
91
+       "      <td>1995-12-15</td>\n",
92
+       "      <td>104.0</td>\n",
93
+       "      <td>6.9</td>\n",
94
+       "      <td>2413.0</td>\n",
95
+       "    </tr>\n",
96
+       "    <tr>\n",
97
+       "      <th>2</th>\n",
98
+       "      <td>Grumpier Old Men</td>\n",
99
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
100
+       "      <td>1995-12-22</td>\n",
101
+       "      <td>101.0</td>\n",
102
+       "      <td>6.5</td>\n",
103
+       "      <td>92.0</td>\n",
104
+       "    </tr>\n",
105
+       "    <tr>\n",
106
+       "      <th>3</th>\n",
107
+       "      <td>Waiting to Exhale</td>\n",
108
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
109
+       "      <td>1995-12-22</td>\n",
110
+       "      <td>127.0</td>\n",
111
+       "      <td>6.1</td>\n",
112
+       "      <td>34.0</td>\n",
113
+       "    </tr>\n",
114
+       "    <tr>\n",
115
+       "      <th>4</th>\n",
116
+       "      <td>Father of the Bride Part II</td>\n",
117
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
118
+       "      <td>1995-02-10</td>\n",
119
+       "      <td>106.0</td>\n",
120
+       "      <td>5.7</td>\n",
121
+       "      <td>173.0</td>\n",
122
+       "    </tr>\n",
123
+       "  </tbody>\n",
124
+       "</table>\n",
125
+       "</div>"
126
+      ],
127
+      "text/plain": [
128
+       "                         title  \\\n",
129
+       "0                    Toy Story   \n",
130
+       "1                      Jumanji   \n",
131
+       "2             Grumpier Old Men   \n",
132
+       "3            Waiting to Exhale   \n",
133
+       "4  Father of the Bride Part II   \n",
134
+       "\n",
135
+       "                                              genres release_date  runtime  \\\n",
136
+       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   1995-10-30     81.0   \n",
137
+       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   1995-12-15    104.0   \n",
138
+       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   1995-12-22    101.0   \n",
139
+       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   1995-12-22    127.0   \n",
140
+       "4                     [{'id': 35, 'name': 'Comedy'}]   1995-02-10    106.0   \n",
141
+       "\n",
142
+       "   vote_average  vote_count  \n",
143
+       "0           7.7      5415.0  \n",
144
+       "1           6.9      2413.0  \n",
145
+       "2           6.5        92.0  \n",
146
+       "3           6.1        34.0  \n",
147
+       "4           5.7       173.0  "
148
+      ]
149
+     },
150
+     "execution_count": 2,
151
+     "metadata": {},
152
+     "output_type": "execute_result"
153
+    }
154
+   ],
155
+   "source": [
156
+    "#Only keep those features that we require \n",
157
+    "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
158
+    "\n",
159
+    "df.head()"
160
+   ]
161
+  },
162
+  {
163
+   "cell_type": "code",
164
+   "execution_count": 3,
165
+   "metadata": {},
166
+   "outputs": [],
167
+   "source": [
168
+    "#Convert release_date into pandas datetime format\n",
169
+    "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
170
+    "\n",
171
+    "#Extract year from the datetime. NaT rows yield the string 'NaT' because x != np.nan is always True\n",
172
+    "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
173
+   ]
174
+  },
175
+  {
176
+   "cell_type": "code",
177
+   "execution_count": 4,
178
+   "metadata": {},
179
+   "outputs": [],
180
+   "source": [
181
+    "#Helper function to convert a year to an integer; anything int() cannot parse (e.g. NaT) becomes 0.\n",
182
+    "def convert_int(x):\n",
183
+    "    try:\n",
184
+    "        return int(x)\n",
185
+    "    except:\n",
186
+    "        return 0"
187
+   ]
188
+  },
189
+  {
190
+   "cell_type": "code",
191
+   "execution_count": 5,
192
+   "metadata": {},
193
+   "outputs": [],
194
+   "source": [
195
+    "#Apply convert_int to the year feature\n",
196
+    "df['year'] = df['year'].apply(convert_int)"
197
+   ]
198
+  },
199
+  {
200
+   "cell_type": "code",
201
+   "execution_count": 6,
202
+   "metadata": {},
203
+   "outputs": [
204
+    {
205
+     "data": {
206
+      "text/html": [
207
+       "<div>\n",
208
+       "<style scoped>\n",
209
+       "    .dataframe tbody tr th:only-of-type {\n",
210
+       "        vertical-align: middle;\n",
211
+       "    }\n",
212
+       "\n",
213
+       "    .dataframe tbody tr th {\n",
214
+       "        vertical-align: top;\n",
215
+       "    }\n",
216
+       "\n",
217
+       "    .dataframe thead th {\n",
218
+       "        text-align: right;\n",
219
+       "    }\n",
220
+       "</style>\n",
221
+       "<table border=\"1\" class=\"dataframe\">\n",
222
+       "  <thead>\n",
223
+       "    <tr style=\"text-align: right;\">\n",
224
+       "      <th></th>\n",
225
+       "      <th>title</th>\n",
226
+       "      <th>genres</th>\n",
227
+       "      <th>runtime</th>\n",
228
+       "      <th>vote_average</th>\n",
229
+       "      <th>vote_count</th>\n",
230
+       "      <th>year</th>\n",
231
+       "    </tr>\n",
232
+       "  </thead>\n",
233
+       "  <tbody>\n",
234
+       "    <tr>\n",
235
+       "      <th>0</th>\n",
236
+       "      <td>Toy Story</td>\n",
237
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
238
+       "      <td>81.0</td>\n",
239
+       "      <td>7.7</td>\n",
240
+       "      <td>5415.0</td>\n",
241
+       "      <td>1995</td>\n",
242
+       "    </tr>\n",
243
+       "    <tr>\n",
244
+       "      <th>1</th>\n",
245
+       "      <td>Jumanji</td>\n",
246
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
247
+       "      <td>104.0</td>\n",
248
+       "      <td>6.9</td>\n",
249
+       "      <td>2413.0</td>\n",
250
+       "      <td>1995</td>\n",
251
+       "    </tr>\n",
252
+       "    <tr>\n",
253
+       "      <th>2</th>\n",
254
+       "      <td>Grumpier Old Men</td>\n",
255
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
256
+       "      <td>101.0</td>\n",
257
+       "      <td>6.5</td>\n",
258
+       "      <td>92.0</td>\n",
259
+       "      <td>1995</td>\n",
260
+       "    </tr>\n",
261
+       "    <tr>\n",
262
+       "      <th>3</th>\n",
263
+       "      <td>Waiting to Exhale</td>\n",
264
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
265
+       "      <td>127.0</td>\n",
266
+       "      <td>6.1</td>\n",
267
+       "      <td>34.0</td>\n",
268
+       "      <td>1995</td>\n",
269
+       "    </tr>\n",
270
+       "    <tr>\n",
271
+       "      <th>4</th>\n",
272
+       "      <td>Father of the Bride Part II</td>\n",
273
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
274
+       "      <td>106.0</td>\n",
275
+       "      <td>5.7</td>\n",
276
+       "      <td>173.0</td>\n",
277
+       "      <td>1995</td>\n",
278
+       "    </tr>\n",
279
+       "  </tbody>\n",
280
+       "</table>\n",
281
+       "</div>"
282
+      ],
283
+      "text/plain": [
284
+       "                         title  \\\n",
285
+       "0                    Toy Story   \n",
286
+       "1                      Jumanji   \n",
287
+       "2             Grumpier Old Men   \n",
288
+       "3            Waiting to Exhale   \n",
289
+       "4  Father of the Bride Part II   \n",
290
+       "\n",
291
+       "                                              genres  runtime  vote_average  \\\n",
292
+       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...     81.0           7.7   \n",
293
+       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...    104.0           6.9   \n",
294
+       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...    101.0           6.5   \n",
295
+       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    127.0           6.1   \n",
296
+       "4                     [{'id': 35, 'name': 'Comedy'}]    106.0           5.7   \n",
297
+       "\n",
298
+       "   vote_count  year  \n",
299
+       "0      5415.0  1995  \n",
300
+       "1      2413.0  1995  \n",
301
+       "2        92.0  1995  \n",
302
+       "3        34.0  1995  \n",
303
+       "4       173.0  1995  "
304
+      ]
305
+     },
306
+     "execution_count": 6,
307
+     "metadata": {},
308
+     "output_type": "execute_result"
309
+    }
310
+   ],
311
+   "source": [
312
+    "#Drop the release_date column\n",
313
+    "df = df.drop('release_date', axis=1)\n",
314
+    "\n",
315
+    "#Display the dataframe\n",
316
+    "df.head()"
317
+   ]
318
+  },
319
+  {
320
+   "cell_type": "code",
321
+   "execution_count": 7,
322
+   "metadata": {},
323
+   "outputs": [
324
+    {
325
+     "data": {
326
+      "text/plain": [
327
+       "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
328
+      ]
329
+     },
330
+     "execution_count": 7,
331
+     "metadata": {},
332
+     "output_type": "execute_result"
333
+    }
334
+   ],
335
+   "source": [
336
+    "#Print genres of the first movie\n",
337
+    "df.iloc[0]['genres']"
338
+   ]
339
+  },
340
+  {
341
+   "cell_type": "code",
342
+   "execution_count": 8,
343
+   "metadata": {},
344
+   "outputs": [
345
+    {
346
+     "name": "stdout",
347
+     "output_type": "stream",
348
+     "text": [
349
+      "<class 'str'>\n",
350
+      "<class 'list'>\n"
351
+     ]
352
+    }
353
+   ],
354
+   "source": [
355
+    "#Import the literal_eval function from ast\n",
356
+    "from ast import literal_eval\n",
357
+    "\n",
358
+    "#Define a stringified list and output its type\n",
359
+    "a = \"[1,2,3]\"\n",
360
+    "print(type(a))\n",
361
+    "\n",
362
+    "#Apply literal_eval and output type\n",
363
+    "b = literal_eval(a)\n",
364
+    "print(type(b))"
365
+   ]
366
+  },
367
+  {
368
+   "cell_type": "code",
369
+   "execution_count": 9,
370
+   "metadata": {},
371
+   "outputs": [],
372
+   "source": [
373
+    "#Convert all NaN into stringified empty lists\n",
374
+    "df['genres'] = df['genres'].fillna('[]')\n",
375
+    "\n",
376
+    "#Apply literal_eval to convert the stringified lists (including empty ones) into list objects\n",
377
+    "df['genres'] = df['genres'].apply(literal_eval)\n",
378
+    "\n",
379
+    "#Convert list of dictionaries to a list of strings\n",
380
+    "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
381
+   ]
382
+  },
383
+  {
384
+   "cell_type": "code",
385
+   "execution_count": 10,
386
+   "metadata": {},
387
+   "outputs": [
388
+    {
389
+     "data": {
390
+      "text/html": [
391
+       "<div>\n",
392
+       "<style scoped>\n",
393
+       "    .dataframe tbody tr th:only-of-type {\n",
394
+       "        vertical-align: middle;\n",
395
+       "    }\n",
396
+       "\n",
397
+       "    .dataframe tbody tr th {\n",
398
+       "        vertical-align: top;\n",
399
+       "    }\n",
400
+       "\n",
401
+       "    .dataframe thead th {\n",
402
+       "        text-align: right;\n",
403
+       "    }\n",
404
+       "</style>\n",
405
+       "<table border=\"1\" class=\"dataframe\">\n",
406
+       "  <thead>\n",
407
+       "    <tr style=\"text-align: right;\">\n",
408
+       "      <th></th>\n",
409
+       "      <th>title</th>\n",
410
+       "      <th>genres</th>\n",
411
+       "      <th>runtime</th>\n",
412
+       "      <th>vote_average</th>\n",
413
+       "      <th>vote_count</th>\n",
414
+       "      <th>year</th>\n",
415
+       "    </tr>\n",
416
+       "  </thead>\n",
417
+       "  <tbody>\n",
418
+       "    <tr>\n",
419
+       "      <th>0</th>\n",
420
+       "      <td>Toy Story</td>\n",
421
+       "      <td>[animation, comedy, family]</td>\n",
422
+       "      <td>81.0</td>\n",
423
+       "      <td>7.7</td>\n",
424
+       "      <td>5415.0</td>\n",
425
+       "      <td>1995</td>\n",
426
+       "    </tr>\n",
427
+       "    <tr>\n",
428
+       "      <th>1</th>\n",
429
+       "      <td>Jumanji</td>\n",
430
+       "      <td>[adventure, fantasy, family]</td>\n",
431
+       "      <td>104.0</td>\n",
432
+       "      <td>6.9</td>\n",
433
+       "      <td>2413.0</td>\n",
434
+       "      <td>1995</td>\n",
435
+       "    </tr>\n",
436
+       "    <tr>\n",
437
+       "      <th>2</th>\n",
438
+       "      <td>Grumpier Old Men</td>\n",
439
+       "      <td>[romance, comedy]</td>\n",
440
+       "      <td>101.0</td>\n",
441
+       "      <td>6.5</td>\n",
442
+       "      <td>92.0</td>\n",
443
+       "      <td>1995</td>\n",
444
+       "    </tr>\n",
445
+       "    <tr>\n",
446
+       "      <th>3</th>\n",
447
+       "      <td>Waiting to Exhale</td>\n",
448
+       "      <td>[comedy, drama, romance]</td>\n",
449
+       "      <td>127.0</td>\n",
450
+       "      <td>6.1</td>\n",
451
+       "      <td>34.0</td>\n",
452
+       "      <td>1995</td>\n",
453
+       "    </tr>\n",
454
+       "    <tr>\n",
455
+       "      <th>4</th>\n",
456
+       "      <td>Father of the Bride Part II</td>\n",
457
+       "      <td>[comedy]</td>\n",
458
+       "      <td>106.0</td>\n",
459
+       "      <td>5.7</td>\n",
460
+       "      <td>173.0</td>\n",
461
+       "      <td>1995</td>\n",
462
+       "    </tr>\n",
463
+       "  </tbody>\n",
464
+       "</table>\n",
465
+       "</div>"
466
+      ],
467
+      "text/plain": [
468
+       "                         title                        genres  runtime  \\\n",
469
+       "0                    Toy Story   [animation, comedy, family]     81.0   \n",
470
+       "1                      Jumanji  [adventure, fantasy, family]    104.0   \n",
471
+       "2             Grumpier Old Men             [romance, comedy]    101.0   \n",
472
+       "3            Waiting to Exhale      [comedy, drama, romance]    127.0   \n",
473
+       "4  Father of the Bride Part II                      [comedy]    106.0   \n",
474
+       "\n",
475
+       "   vote_average  vote_count  year  \n",
476
+       "0           7.7      5415.0  1995  \n",
477
+       "1           6.9      2413.0  1995  \n",
478
+       "2           6.5        92.0  1995  \n",
479
+       "3           6.1        34.0  1995  \n",
480
+       "4           5.7       173.0  1995  "
481
+      ]
482
+     },
483
+     "execution_count": 10,
484
+     "metadata": {},
485
+     "output_type": "execute_result"
486
+    }
487
+   ],
488
+   "source": [
489
+    "df.head()"
490
+   ]
491
+  },
492
+  {
493
+   "cell_type": "code",
494
+   "execution_count": 11,
495
+   "metadata": {},
496
+   "outputs": [
497
+    {
498
+     "name": "stderr",
499
+     "output_type": "stream",
500
+     "text": [
501
+      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/328443552.py:2: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.\n",
502
+      "  s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n"
503
+     ]
504
+    },
505
+    {
506
+     "data": {
507
+      "text/html": [
508
+       "<div>\n",
509
+       "<style scoped>\n",
510
+       "    .dataframe tbody tr th:only-of-type {\n",
511
+       "        vertical-align: middle;\n",
512
+       "    }\n",
513
+       "\n",
514
+       "    .dataframe tbody tr th {\n",
515
+       "        vertical-align: top;\n",
516
+       "    }\n",
517
+       "\n",
518
+       "    .dataframe thead th {\n",
519
+       "        text-align: right;\n",
520
+       "    }\n",
521
+       "</style>\n",
522
+       "<table border=\"1\" class=\"dataframe\">\n",
523
+       "  <thead>\n",
524
+       "    <tr style=\"text-align: right;\">\n",
525
+       "      <th></th>\n",
526
+       "      <th>title</th>\n",
527
+       "      <th>runtime</th>\n",
528
+       "      <th>vote_average</th>\n",
529
+       "      <th>vote_count</th>\n",
530
+       "      <th>year</th>\n",
531
+       "      <th>genre</th>\n",
532
+       "    </tr>\n",
533
+       "  </thead>\n",
534
+       "  <tbody>\n",
535
+       "    <tr>\n",
536
+       "      <th>0</th>\n",
537
+       "      <td>Toy Story</td>\n",
538
+       "      <td>81.0</td>\n",
539
+       "      <td>7.7</td>\n",
540
+       "      <td>5415.0</td>\n",
541
+       "      <td>1995</td>\n",
542
+       "      <td>animation</td>\n",
543
+       "    </tr>\n",
544
+       "    <tr>\n",
545
+       "      <th>0</th>\n",
546
+       "      <td>Toy Story</td>\n",
547
+       "      <td>81.0</td>\n",
548
+       "      <td>7.7</td>\n",
549
+       "      <td>5415.0</td>\n",
550
+       "      <td>1995</td>\n",
551
+       "      <td>comedy</td>\n",
552
+       "    </tr>\n",
553
+       "    <tr>\n",
554
+       "      <th>0</th>\n",
555
+       "      <td>Toy Story</td>\n",
556
+       "      <td>81.0</td>\n",
557
+       "      <td>7.7</td>\n",
558
+       "      <td>5415.0</td>\n",
559
+       "      <td>1995</td>\n",
560
+       "      <td>family</td>\n",
561
+       "    </tr>\n",
562
+       "    <tr>\n",
563
+       "      <th>1</th>\n",
564
+       "      <td>Jumanji</td>\n",
565
+       "      <td>104.0</td>\n",
566
+       "      <td>6.9</td>\n",
567
+       "      <td>2413.0</td>\n",
568
+       "      <td>1995</td>\n",
569
+       "      <td>adventure</td>\n",
570
+       "    </tr>\n",
571
+       "    <tr>\n",
572
+       "      <th>1</th>\n",
573
+       "      <td>Jumanji</td>\n",
574
+       "      <td>104.0</td>\n",
575
+       "      <td>6.9</td>\n",
576
+       "      <td>2413.0</td>\n",
577
+       "      <td>1995</td>\n",
578
+       "      <td>fantasy</td>\n",
579
+       "    </tr>\n",
580
+       "  </tbody>\n",
581
+       "</table>\n",
582
+       "</div>"
583
+      ],
584
+      "text/plain": [
585
+       "       title  runtime  vote_average  vote_count  year      genre\n",
586
+       "0  Toy Story     81.0           7.7      5415.0  1995  animation\n",
587
+       "0  Toy Story     81.0           7.7      5415.0  1995     comedy\n",
588
+       "0  Toy Story     81.0           7.7      5415.0  1995     family\n",
589
+       "1    Jumanji    104.0           6.9      2413.0  1995  adventure\n",
590
+       "1    Jumanji    104.0           6.9      2413.0  1995    fantasy"
591
+      ]
592
+     },
593
+     "execution_count": 11,
594
+     "metadata": {},
595
+     "output_type": "execute_result"
596
+    }
597
+   ],
598
+   "source": [
599
+    "#Create a new feature by exploding genres\n",
600
+    "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
601
+    "\n",
602
+    "#Name the new feature as 'genre'\n",
603
+    "s.name = 'genre'\n",
604
+    "\n",
605
+    "#Create a new dataframe gen_df by dropping the old 'genres' feature and joining the new 'genre' feature.\n",
606
+    "gen_df = df.drop('genres', axis=1).join(s)\n",
607
+    "\n",
608
+    "#Print the head of the new gen_df\n",
609
+    "gen_df.head()"
610
+   ]
611
+  },
612
+  {
613
+   "cell_type": "code",
614
+   "execution_count": 12,
615
+   "metadata": {},
616
+   "outputs": [],
617
+   "source": [
618
+    "def build_chart(gen_df, percentile=0.8):\n",
619
+    "    #Ask for preferred genres\n",
620
+    "    print(\"Input preferred genre\")\n",
621
+    "    genre = input()\n",
622
+    "    \n",
623
+    "    #Ask for lower limit of duration\n",
624
+    "    print(\"Input shortest duration\")\n",
625
+    "    low_time = int(input())\n",
626
+    "    \n",
627
+    "    #Ask for upper limit of duration\n",
628
+    "    print(\"Input longest duration\")\n",
629
+    "    high_time = int(input())\n",
630
+    "    \n",
631
+    "    #Ask for lower limit of timeline\n",
632
+    "    print(\"Input earliest year\")\n",
633
+    "    low_year = int(input())\n",
634
+    "    \n",
635
+    "    #Ask for upper limit of timeline\n",
636
+    "    print(\"Input latest year\")\n",
637
+    "    high_year = int(input())\n",
638
+    "    \n",
639
+    "    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
640
+    "    movies = gen_df.copy()\n",
641
+    "    \n",
642
+    "    #Filter based on the condition\n",
643
+    "    movies = movies[(movies['genre'] == genre) & \n",
644
+    "                    (movies['runtime'] >= low_time) & \n",
645
+    "                    (movies['runtime'] <= high_time) & \n",
646
+    "                    (movies['year'] >= low_year) & \n",
647
+    "                    (movies['year'] <= high_year)]\n",
648
+    "    \n",
649
+    "    #Compute the values of C and m for the filtered movies\n",
650
+    "    C = movies['vote_average'].mean()\n",
651
+    "    m = movies['vote_count'].quantile(percentile)\n",
652
+    "    \n",
653
+    "    #Only consider movies that have at least m votes. Save this in a new dataframe q_movies\n",
654
+    "    q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
655
+    "    \n",
656
+    "    #Calculate score using the IMDB formula\n",
657
+    "    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
658
+    "                                       + (m/(m+x['vote_count']) * C)\n",
659
+    "                                       ,axis=1)\n",
660
+    "\n",
661
+    "    #Sort movies in descending order of their scores\n",
662
+    "    q_movies = q_movies.sort_values('score', ascending=False)\n",
663
+    "    \n",
664
+    "    return q_movies"
665
+   ]
666
+  },
667
+  {
668
+   "cell_type": "code",
669
+   "execution_count": 13,
670
+   "metadata": {},
671
+   "outputs": [
672
+    {
673
+     "name": "stdout",
674
+     "output_type": "stream",
675
+     "text": [
676
+      "Input preferred genre\n",
677
+      "horror\n",
678
+      "Input shortest duration\n",
679
+      "60\n",
680
+      "Input longest duration\n",
681
+      "120\n",
682
+      "Input earliest year\n",
683
+      "1990\n",
684
+      "Input latest year\n",
685
+      "2022\n"
686
+     ]
687
+    },
688
+    {
689
+     "data": {
690
+      "text/html": [
691
+       "<div>\n",
692
+       "<style scoped>\n",
693
+       "    .dataframe tbody tr th:only-of-type {\n",
694
+       "        vertical-align: middle;\n",
695
+       "    }\n",
696
+       "\n",
697
+       "    .dataframe tbody tr th {\n",
698
+       "        vertical-align: top;\n",
699
+       "    }\n",
700
+       "\n",
701
+       "    .dataframe thead th {\n",
702
+       "        text-align: right;\n",
703
+       "    }\n",
704
+       "</style>\n",
705
+       "<table border=\"1\" class=\"dataframe\">\n",
706
+       "  <thead>\n",
707
+       "    <tr style=\"text-align: right;\">\n",
708
+       "      <th></th>\n",
709
+       "      <th>title</th>\n",
710
+       "      <th>runtime</th>\n",
711
+       "      <th>vote_average</th>\n",
712
+       "      <th>vote_count</th>\n",
713
+       "      <th>year</th>\n",
714
+       "      <th>genre</th>\n",
715
+       "      <th>score</th>\n",
716
+       "    </tr>\n",
717
+       "  </thead>\n",
718
+       "  <tbody>\n",
719
+       "    <tr>\n",
720
+       "      <th>39821</th>\n",
721
+       "      <td>Train to Busan</td>\n",
722
+       "      <td>118.0</td>\n",
723
+       "      <td>7.7</td>\n",
724
+       "      <td>984.0</td>\n",
725
+       "      <td>2016</td>\n",
726
+       "      <td>horror</td>\n",
727
+       "      <td>7.424441</td>\n",
728
+       "    </tr>\n",
729
+       "    <tr>\n",
730
+       "      <th>8147</th>\n",
731
+       "      <td>Shaun of the Dead</td>\n",
732
+       "      <td>99.0</td>\n",
733
+       "      <td>7.5</td>\n",
734
+       "      <td>2479.0</td>\n",
735
+       "      <td>2004</td>\n",
736
+       "      <td>horror</td>\n",
737
+       "      <td>7.392081</td>\n",
738
+       "    </tr>\n",
739
+       "    <tr>\n",
740
+       "      <th>21276</th>\n",
741
+       "      <td>The Conjuring</td>\n",
742
+       "      <td>112.0</td>\n",
743
+       "      <td>7.4</td>\n",
744
+       "      <td>3169.0</td>\n",
745
+       "      <td>2013</td>\n",
746
+       "      <td>horror</td>\n",
747
+       "      <td>7.318185</td>\n",
748
+       "    </tr>\n",
749
+       "    <tr>\n",
750
+       "      <th>4591</th>\n",
751
+       "      <td>The Others</td>\n",
752
+       "      <td>101.0</td>\n",
753
+       "      <td>7.4</td>\n",
754
+       "      <td>1708.0</td>\n",
755
+       "      <td>2001</td>\n",
756
+       "      <td>horror</td>\n",
757
+       "      <td>7.252502</td>\n",
758
+       "    </tr>\n",
759
+       "    <tr>\n",
760
+       "      <th>12891</th>\n",
761
+       "      <td>Let the Right One In</td>\n",
762
+       "      <td>115.0</td>\n",
763
+       "      <td>7.5</td>\n",
764
+       "      <td>997.0</td>\n",
765
+       "      <td>2008</td>\n",
766
+       "      <td>horror</td>\n",
767
+       "      <td>7.247838</td>\n",
768
+       "    </tr>\n",
769
+       "  </tbody>\n",
770
+       "</table>\n",
771
+       "</div>"
772
+      ],
773
+      "text/plain": [
774
+       "                      title  runtime  vote_average  vote_count  year   genre  \\\n",
775
+       "39821        Train to Busan    118.0           7.7       984.0  2016  horror   \n",
776
+       "8147      Shaun of the Dead     99.0           7.5      2479.0  2004  horror   \n",
777
+       "21276         The Conjuring    112.0           7.4      3169.0  2013  horror   \n",
778
+       "4591             The Others    101.0           7.4      1708.0  2001  horror   \n",
779
+       "12891  Let the Right One In    115.0           7.5       997.0  2008  horror   \n",
780
+       "\n",
781
+       "          score  \n",
782
+       "39821  7.424441  \n",
783
+       "8147   7.392081  \n",
784
+       "21276  7.318185  \n",
785
+       "4591   7.252502  \n",
786
+       "12891  7.247838  "
787
+      ]
788
+     },
789
+     "execution_count": 13,
790
+     "metadata": {},
791
+     "output_type": "execute_result"
792
+    }
793
+   ],
794
+   "source": [
795
+    "#Generate the chart for the genre, duration and year range entered at the prompts and display the top 5.\n",
796
+    "build_chart(gen_df).head()"
797
+   ]
798
+  },
799
+  {
800
+   "cell_type": "code",
801
+   "execution_count": 14,
802
+   "metadata": {},
803
+   "outputs": [],
804
+   "source": [
805
+    "#Convert the cleaned (non-exploded) dataframe df into a CSV file and save it in the data folder\n",
806
+    "#Set parameter index to False as the index of the DataFrame has no inherent meaning.\n",
807
+    "df.to_csv('../data/metadata_clean.csv', index=False)"
808
+   ]
809
+  },
810
+  {
811
+   "cell_type": "code",
812
+   "execution_count": null,
813
+   "metadata": {
814
+    "collapsed": true
815
+   },
816
+   "outputs": [],
817
+   "source": []
818
+  }
819
+ ],
820
+ "metadata": {
821
+  "kernelspec": {
822
+   "display_name": "Python 3 (ipykernel)",
823
+   "language": "python",
824
+   "name": "python3"
825
+  },
826
+  "language_info": {
827
+   "codemirror_mode": {
828
+    "name": "ipython",
829
+    "version": 3
830
+   },
831
+   "file_extension": ".py",
832
+   "mimetype": "text/x-python",
833
+   "name": "python",
834
+   "nbconvert_exporter": "python",
835
+   "pygments_lexer": "ipython3",
836
+   "version": "3.10.4"
837
+  }
838
+ },
839
+ "nbformat": 4,
840
+ "nbformat_minor": 2
841
+}

+ 697 - 0
Chapter3/Simple Recommender.ipynb

@@ -0,0 +1,697 @@
1
+{
2
+ "cells": [
3
+  {
4
+   "cell_type": "code",
5
+   "execution_count": 1,
6
+   "metadata": {},
7
+   "outputs": [
8
+    {
9
+     "name": "stderr",
10
+     "output_type": "stream",
11
+     "text": [
12
+      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_18673/1882081200.py:4: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
13
+      "  df = pd.read_csv('../data/movies_metadata.csv')\n"
14
+     ]
15
+    },
16
+    {
17
+     "data": {
18
+      "text/html": [
19
+       "<div>\n",
20
+       "<style scoped>\n",
21
+       "    .dataframe tbody tr th:only-of-type {\n",
22
+       "        vertical-align: middle;\n",
23
+       "    }\n",
24
+       "\n",
25
+       "    .dataframe tbody tr th {\n",
26
+       "        vertical-align: top;\n",
27
+       "    }\n",
28
+       "\n",
29
+       "    .dataframe thead th {\n",
30
+       "        text-align: right;\n",
31
+       "    }\n",
32
+       "</style>\n",
33
+       "<table border=\"1\" class=\"dataframe\">\n",
34
+       "  <thead>\n",
35
+       "    <tr style=\"text-align: right;\">\n",
36
+       "      <th></th>\n",
37
+       "      <th>adult</th>\n",
38
+       "      <th>belongs_to_collection</th>\n",
39
+       "      <th>budget</th>\n",
40
+       "      <th>genres</th>\n",
41
+       "      <th>homepage</th>\n",
42
+       "      <th>id</th>\n",
43
+       "      <th>imdb_id</th>\n",
44
+       "      <th>original_language</th>\n",
45
+       "      <th>original_title</th>\n",
46
+       "      <th>overview</th>\n",
47
+       "      <th>...</th>\n",
48
+       "      <th>release_date</th>\n",
49
+       "      <th>revenue</th>\n",
50
+       "      <th>runtime</th>\n",
51
+       "      <th>spoken_languages</th>\n",
52
+       "      <th>status</th>\n",
53
+       "      <th>tagline</th>\n",
54
+       "      <th>title</th>\n",
55
+       "      <th>video</th>\n",
56
+       "      <th>vote_average</th>\n",
57
+       "      <th>vote_count</th>\n",
58
+       "    </tr>\n",
59
+       "  </thead>\n",
60
+       "  <tbody>\n",
61
+       "    <tr>\n",
62
+       "      <th>0</th>\n",
63
+       "      <td>False</td>\n",
64
+       "      <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n",
65
+       "      <td>30000000</td>\n",
66
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
67
+       "      <td>http://toystory.disney.com/toy-story</td>\n",
68
+       "      <td>862</td>\n",
69
+       "      <td>tt0114709</td>\n",
70
+       "      <td>en</td>\n",
71
+       "      <td>Toy Story</td>\n",
72
+       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
73
+       "      <td>...</td>\n",
74
+       "      <td>1995-10-30</td>\n",
75
+       "      <td>373554033.0</td>\n",
76
+       "      <td>81.0</td>\n",
77
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
78
+       "      <td>Released</td>\n",
79
+       "      <td>NaN</td>\n",
80
+       "      <td>Toy Story</td>\n",
81
+       "      <td>False</td>\n",
82
+       "      <td>7.7</td>\n",
83
+       "      <td>5415.0</td>\n",
84
+       "    </tr>\n",
85
+       "    <tr>\n",
86
+       "      <th>1</th>\n",
87
+       "      <td>False</td>\n",
88
+       "      <td>NaN</td>\n",
89
+       "      <td>65000000</td>\n",
90
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
91
+       "      <td>NaN</td>\n",
92
+       "      <td>8844</td>\n",
93
+       "      <td>tt0113497</td>\n",
94
+       "      <td>en</td>\n",
95
+       "      <td>Jumanji</td>\n",
96
+       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
97
+       "      <td>...</td>\n",
98
+       "      <td>1995-12-15</td>\n",
99
+       "      <td>262797249.0</td>\n",
100
+       "      <td>104.0</td>\n",
101
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n",
102
+       "      <td>Released</td>\n",
103
+       "      <td>Roll the dice and unleash the excitement!</td>\n",
104
+       "      <td>Jumanji</td>\n",
105
+       "      <td>False</td>\n",
106
+       "      <td>6.9</td>\n",
107
+       "      <td>2413.0</td>\n",
108
+       "    </tr>\n",
109
+       "    <tr>\n",
110
+       "      <th>2</th>\n",
111
+       "      <td>False</td>\n",
112
+       "      <td>{'id': 119050, 'name': 'Grumpy Old Men Collect...</td>\n",
113
+       "      <td>0</td>\n",
114
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
115
+       "      <td>NaN</td>\n",
116
+       "      <td>15602</td>\n",
117
+       "      <td>tt0113228</td>\n",
118
+       "      <td>en</td>\n",
119
+       "      <td>Grumpier Old Men</td>\n",
120
+       "      <td>A family wedding reignites the ancient feud be...</td>\n",
121
+       "      <td>...</td>\n",
122
+       "      <td>1995-12-22</td>\n",
123
+       "      <td>0.0</td>\n",
124
+       "      <td>101.0</td>\n",
125
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
126
+       "      <td>Released</td>\n",
127
+       "      <td>Still Yelling. Still Fighting. Still Ready for...</td>\n",
128
+       "      <td>Grumpier Old Men</td>\n",
129
+       "      <td>False</td>\n",
130
+       "      <td>6.5</td>\n",
131
+       "      <td>92.0</td>\n",
132
+       "    </tr>\n",
133
+       "    <tr>\n",
134
+       "      <th>3</th>\n",
135
+       "      <td>False</td>\n",
136
+       "      <td>NaN</td>\n",
137
+       "      <td>16000000</td>\n",
138
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
139
+       "      <td>NaN</td>\n",
140
+       "      <td>31357</td>\n",
141
+       "      <td>tt0114885</td>\n",
142
+       "      <td>en</td>\n",
143
+       "      <td>Waiting to Exhale</td>\n",
144
+       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
145
+       "      <td>...</td>\n",
146
+       "      <td>1995-12-22</td>\n",
147
+       "      <td>81452156.0</td>\n",
148
+       "      <td>127.0</td>\n",
149
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
150
+       "      <td>Released</td>\n",
151
+       "      <td>Friends are the people who let you be yourself...</td>\n",
152
+       "      <td>Waiting to Exhale</td>\n",
153
+       "      <td>False</td>\n",
154
+       "      <td>6.1</td>\n",
155
+       "      <td>34.0</td>\n",
156
+       "    </tr>\n",
157
+       "    <tr>\n",
158
+       "      <th>4</th>\n",
159
+       "      <td>False</td>\n",
160
+       "      <td>{'id': 96871, 'name': 'Father of the Bride Col...</td>\n",
161
+       "      <td>0</td>\n",
162
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
163
+       "      <td>NaN</td>\n",
164
+       "      <td>11862</td>\n",
165
+       "      <td>tt0113041</td>\n",
166
+       "      <td>en</td>\n",
167
+       "      <td>Father of the Bride Part II</td>\n",
168
+       "      <td>Just when George Banks has recovered from his ...</td>\n",
169
+       "      <td>...</td>\n",
170
+       "      <td>1995-02-10</td>\n",
171
+       "      <td>76578911.0</td>\n",
172
+       "      <td>106.0</td>\n",
173
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
174
+       "      <td>Released</td>\n",
175
+       "      <td>Just When His World Is Back To Normal... He's ...</td>\n",
176
+       "      <td>Father of the Bride Part II</td>\n",
177
+       "      <td>False</td>\n",
178
+       "      <td>5.7</td>\n",
179
+       "      <td>173.0</td>\n",
180
+       "    </tr>\n",
181
+       "  </tbody>\n",
182
+       "</table>\n",
183
+       "<p>5 rows × 24 columns</p>\n",
184
+       "</div>"
185
+      ],
186
+      "text/plain": [
187
+       "   adult                              belongs_to_collection    budget  \\\n",
188
+       "0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   \n",
189
+       "1  False                                                NaN  65000000   \n",
190
+       "2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   \n",
191
+       "3  False                                                NaN  16000000   \n",
192
+       "4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   \n",
193
+       "\n",
194
+       "                                              genres  \\\n",
195
+       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   \n",
196
+       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   \n",
197
+       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   \n",
198
+       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   \n",
199
+       "4                     [{'id': 35, 'name': 'Comedy'}]   \n",
200
+       "\n",
201
+       "                               homepage     id    imdb_id original_language  \\\n",
202
+       "0  http://toystory.disney.com/toy-story    862  tt0114709                en   \n",
203
+       "1                                   NaN   8844  tt0113497                en   \n",
204
+       "2                                   NaN  15602  tt0113228                en   \n",
205
+       "3                                   NaN  31357  tt0114885                en   \n",
206
+       "4                                   NaN  11862  tt0113041                en   \n",
207
+       "\n",
208
+       "                original_title  \\\n",
209
+       "0                    Toy Story   \n",
210
+       "1                      Jumanji   \n",
211
+       "2             Grumpier Old Men   \n",
212
+       "3            Waiting to Exhale   \n",
213
+       "4  Father of the Bride Part II   \n",
214
+       "\n",
215
+       "                                            overview  ... release_date  \\\n",
216
+       "0  Led by Woody, Andy's toys live happily in his ...  ...   1995-10-30   \n",
217
+       "1  When siblings Judy and Peter discover an encha...  ...   1995-12-15   \n",
218
+       "2  A family wedding reignites the ancient feud be...  ...   1995-12-22   \n",
219
+       "3  Cheated on, mistreated and stepped on, the wom...  ...   1995-12-22   \n",
220
+       "4  Just when George Banks has recovered from his ...  ...   1995-02-10   \n",
221
+       "\n",
222
+       "       revenue runtime                                   spoken_languages  \\\n",
223
+       "0  373554033.0    81.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
224
+       "1  262797249.0   104.0  [{'iso_639_1': 'en', 'name': 'English'}, {'iso...   \n",
225
+       "2          0.0   101.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
226
+       "3   81452156.0   127.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
227
+       "4   76578911.0   106.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
228
+       "\n",
229
+       "     status                                            tagline  \\\n",
230
+       "0  Released                                                NaN   \n",
231
+       "1  Released          Roll the dice and unleash the excitement!   \n",
232
+       "2  Released  Still Yelling. Still Fighting. Still Ready for...   \n",
233
+       "3  Released  Friends are the people who let you be yourself...   \n",
234
+       "4  Released  Just When His World Is Back To Normal... He's ...   \n",
235
+       "\n",
236
+       "                         title  video vote_average vote_count  \n",
237
+       "0                    Toy Story  False          7.7     5415.0  \n",
238
+       "1                      Jumanji  False          6.9     2413.0  \n",
239
+       "2             Grumpier Old Men  False          6.5       92.0  \n",
240
+       "3            Waiting to Exhale  False          6.1       34.0  \n",
241
+       "4  Father of the Bride Part II  False          5.7      173.0  \n",
242
+       "\n",
243
+       "[5 rows x 24 columns]"
244
+      ]
245
+     },
246
+     "execution_count": 1,
247
+     "metadata": {},
248
+     "output_type": "execute_result"
249
+    }
250
+   ],
251
+   "source": [
252
+    "import pandas as pd\n",
253
+    "import numpy as np\n",
254
+    "\n",
255
+    "df = pd.read_csv('../data/movies_metadata.csv')\n",
256
+    "df.head()"
257
+   ]
258
+  },
259
+  {
260
+   "cell_type": "code",
261
+   "execution_count": 2,
262
+   "metadata": {},
263
+   "outputs": [
264
+    {
265
+     "data": {
266
+      "text/plain": [
267
+       "50.0"
268
+      ]
269
+     },
270
+     "execution_count": 2,
271
+     "metadata": {},
272
+     "output_type": "execute_result"
273
+    }
274
+   ],
275
+   "source": [
276
+    "#Calculate the number of votes garnered by the 80th percentile movie\n",
277
+    "m = df['vote_count'].quantile(0.80)\n",
278
+    "m"
279
+   ]
280
+  },
281
+  {
282
+   "cell_type": "code",
283
+   "execution_count": 3,
284
+   "metadata": {},
285
+   "outputs": [
286
+    {
287
+     "data": {
288
+      "text/plain": [
289
+       "(8963, 24)"
290
+      ]
291
+     },
292
+     "execution_count": 3,
293
+     "metadata": {},
294
+     "output_type": "execute_result"
295
+    }
296
+   ],
297
+   "source": [
298
+    "#Only consider movies longer than 45 minutes and shorter than 300 minutes\n",
299
+    "q_movies = df[(df['runtime'] >= 45) & (df['runtime'] <= 300)]\n",
300
+    "\n",
301
+    "#Only consider movies that have garnered more than m votes\n",
302
+    "q_movies = q_movies[q_movies['vote_count'] >= m]\n",
303
+    "\n",
304
+    "#Inspect the number of movies that made the cut\n",
305
+    "q_movies.shape"
306
+   ]
307
+  },
308
+  {
309
+   "cell_type": "code",
310
+   "execution_count": 4,
311
+   "metadata": {},
312
+   "outputs": [
313
+    {
314
+     "data": {
315
+      "text/plain": [
316
+       "5.618207215134185"
317
+      ]
318
+     },
319
+     "execution_count": 4,
320
+     "metadata": {},
321
+     "output_type": "execute_result"
322
+    }
323
+   ],
324
+   "source": [
325
+    "# Calculate C\n",
326
+    "C = df['vote_average'].mean()\n",
327
+    "C"
328
+   ]
329
+  },
330
+  {
331
+   "cell_type": "code",
332
+   "execution_count": 5,
333
+   "metadata": {},
334
+   "outputs": [],
335
+   "source": [
336
+    "# Function to compute the IMDB weighted rating for each movie\n",
337
+    "def weighted_rating(x, m=m, C=C):\n",
338
+    "    v = x['vote_count']\n",
339
+    "    R = x['vote_average']\n",
340
+    "    # Compute the weighted score\n",
341
+    "    return (v/(v+m) * R) + (m/(m+v) * C)"
342
+   ]
343
+  },
344
+  {
345
+   "cell_type": "code",
346
+   "execution_count": 6,
347
+   "metadata": {},
348
+   "outputs": [],
349
+   "source": [
350
+    "# Compute the score using the weighted_rating function defined above\n",
351
+    "q_movies['score'] = q_movies.apply(weighted_rating, axis=1)"
352
+   ]
353
+  },
354
+  {
355
+   "cell_type": "code",
356
+   "execution_count": 7,
357
+   "metadata": {},
358
+   "outputs": [
359
+    {
360
+     "data": {
361
+      "text/html": [
362
+       "<div>\n",
363
+       "<style scoped>\n",
364
+       "    .dataframe tbody tr th:only-of-type {\n",
365
+       "        vertical-align: middle;\n",
366
+       "    }\n",
367
+       "\n",
368
+       "    .dataframe tbody tr th {\n",
369
+       "        vertical-align: top;\n",
370
+       "    }\n",
371
+       "\n",
372
+       "    .dataframe thead th {\n",
373
+       "        text-align: right;\n",
374
+       "    }\n",
375
+       "</style>\n",
376
+       "<table border=\"1\" class=\"dataframe\">\n",
377
+       "  <thead>\n",
378
+       "    <tr style=\"text-align: right;\">\n",
379
+       "      <th></th>\n",
380
+       "      <th>title</th>\n",
381
+       "      <th>vote_count</th>\n",
382
+       "      <th>vote_average</th>\n",
383
+       "      <th>score</th>\n",
384
+       "      <th>runtime</th>\n",
385
+       "    </tr>\n",
386
+       "  </thead>\n",
387
+       "  <tbody>\n",
388
+       "    <tr>\n",
389
+       "      <th>10309</th>\n",
390
+       "      <td>Dilwale Dulhania Le Jayenge</td>\n",
391
+       "      <td>661.0</td>\n",
392
+       "      <td>9.1</td>\n",
393
+       "      <td>8.855148</td>\n",
394
+       "      <td>190.0</td>\n",
395
+       "    </tr>\n",
396
+       "    <tr>\n",
397
+       "      <th>314</th>\n",
398
+       "      <td>The Shawshank Redemption</td>\n",
399
+       "      <td>8358.0</td>\n",
400
+       "      <td>8.5</td>\n",
401
+       "      <td>8.482863</td>\n",
402
+       "      <td>142.0</td>\n",
403
+       "    </tr>\n",
404
+       "    <tr>\n",
405
+       "      <th>834</th>\n",
406
+       "      <td>The Godfather</td>\n",
407
+       "      <td>6024.0</td>\n",
408
+       "      <td>8.5</td>\n",
409
+       "      <td>8.476278</td>\n",
410
+       "      <td>175.0</td>\n",
411
+       "    </tr>\n",
412
+       "    <tr>\n",
413
+       "      <th>40251</th>\n",
414
+       "      <td>Your Name.</td>\n",
415
+       "      <td>1030.0</td>\n",
416
+       "      <td>8.5</td>\n",
417
+       "      <td>8.366584</td>\n",
418
+       "      <td>106.0</td>\n",
419
+       "    </tr>\n",
420
+       "    <tr>\n",
421
+       "      <th>12481</th>\n",
422
+       "      <td>The Dark Knight</td>\n",
423
+       "      <td>12269.0</td>\n",
424
+       "      <td>8.3</td>\n",
425
+       "      <td>8.289115</td>\n",
426
+       "      <td>152.0</td>\n",
427
+       "    </tr>\n",
428
+       "    <tr>\n",
429
+       "      <th>2843</th>\n",
430
+       "      <td>Fight Club</td>\n",
431
+       "      <td>9678.0</td>\n",
432
+       "      <td>8.3</td>\n",
433
+       "      <td>8.286216</td>\n",
434
+       "      <td>139.0</td>\n",
435
+       "    </tr>\n",
436
+       "    <tr>\n",
437
+       "      <th>292</th>\n",
438
+       "      <td>Pulp Fiction</td>\n",
439
+       "      <td>8670.0</td>\n",
440
+       "      <td>8.3</td>\n",
441
+       "      <td>8.284623</td>\n",
442
+       "      <td>154.0</td>\n",
443
+       "    </tr>\n",
444
+       "    <tr>\n",
445
+       "      <th>522</th>\n",
446
+       "      <td>Schindler's List</td>\n",
447
+       "      <td>4436.0</td>\n",
448
+       "      <td>8.3</td>\n",
449
+       "      <td>8.270109</td>\n",
450
+       "      <td>195.0</td>\n",
451
+       "    </tr>\n",
452
+       "    <tr>\n",
453
+       "      <th>23673</th>\n",
454
+       "      <td>Whiplash</td>\n",
455
+       "      <td>4376.0</td>\n",
456
+       "      <td>8.3</td>\n",
457
+       "      <td>8.269704</td>\n",
458
+       "      <td>105.0</td>\n",
459
+       "    </tr>\n",
460
+       "    <tr>\n",
461
+       "      <th>5481</th>\n",
462
+       "      <td>Spirited Away</td>\n",
463
+       "      <td>3968.0</td>\n",
464
+       "      <td>8.3</td>\n",
465
+       "      <td>8.266628</td>\n",
466
+       "      <td>125.0</td>\n",
467
+       "    </tr>\n",
468
+       "    <tr>\n",
469
+       "      <th>2211</th>\n",
470
+       "      <td>Life Is Beautiful</td>\n",
471
+       "      <td>3643.0</td>\n",
472
+       "      <td>8.3</td>\n",
473
+       "      <td>8.263691</td>\n",
474
+       "      <td>116.0</td>\n",
475
+       "    </tr>\n",
476
+       "    <tr>\n",
477
+       "      <th>1178</th>\n",
478
+       "      <td>The Godfather: Part II</td>\n",
479
+       "      <td>3418.0</td>\n",
480
+       "      <td>8.3</td>\n",
481
+       "      <td>8.261335</td>\n",
482
+       "      <td>200.0</td>\n",
483
+       "    </tr>\n",
484
+       "    <tr>\n",
485
+       "      <th>1152</th>\n",
486
+       "      <td>One Flew Over the Cuckoo's Nest</td>\n",
487
+       "      <td>3001.0</td>\n",
488
+       "      <td>8.3</td>\n",
489
+       "      <td>8.256051</td>\n",
490
+       "      <td>133.0</td>\n",
491
+       "    </tr>\n",
492
+       "    <tr>\n",
493
+       "      <th>1176</th>\n",
494
+       "      <td>Psycho</td>\n",
495
+       "      <td>2405.0</td>\n",
496
+       "      <td>8.3</td>\n",
497
+       "      <td>8.245381</td>\n",
498
+       "      <td>109.0</td>\n",
499
+       "    </tr>\n",
500
+       "    <tr>\n",
501
+       "      <th>351</th>\n",
502
+       "      <td>Forrest Gump</td>\n",
503
+       "      <td>8147.0</td>\n",
504
+       "      <td>8.2</td>\n",
505
+       "      <td>8.184252</td>\n",
506
+       "      <td>142.0</td>\n",
507
+       "    </tr>\n",
508
+       "    <tr>\n",
509
+       "      <th>1184</th>\n",
510
+       "      <td>Once Upon a Time in America</td>\n",
511
+       "      <td>1104.0</td>\n",
512
+       "      <td>8.3</td>\n",
513
+       "      <td>8.183804</td>\n",
514
+       "      <td>229.0</td>\n",
515
+       "    </tr>\n",
516
+       "    <tr>\n",
517
+       "      <th>1154</th>\n",
518
+       "      <td>The Empire Strikes Back</td>\n",
519
+       "      <td>5998.0</td>\n",
520
+       "      <td>8.2</td>\n",
521
+       "      <td>8.178656</td>\n",
522
+       "      <td>124.0</td>\n",
523
+       "    </tr>\n",
524
+       "    <tr>\n",
525
+       "      <th>18465</th>\n",
526
+       "      <td>The Intouchables</td>\n",
527
+       "      <td>5410.0</td>\n",
528
+       "      <td>8.2</td>\n",
529
+       "      <td>8.176357</td>\n",
530
+       "      <td>112.0</td>\n",
531
+       "    </tr>\n",
532
+       "    <tr>\n",
533
+       "      <th>289</th>\n",
534
+       "      <td>Leon: The Professional</td>\n",
535
+       "      <td>4293.0</td>\n",
536
+       "      <td>8.2</td>\n",
537
+       "      <td>8.170276</td>\n",
538
+       "      <td>110.0</td>\n",
539
+       "    </tr>\n",
540
+       "    <tr>\n",
541
+       "      <th>3030</th>\n",
542
+       "      <td>The Green Mile</td>\n",
543
+       "      <td>4166.0</td>\n",
544
+       "      <td>8.2</td>\n",
545
+       "      <td>8.169381</td>\n",
546
+       "      <td>189.0</td>\n",
547
+       "    </tr>\n",
548
+       "    <tr>\n",
549
+       "      <th>1170</th>\n",
550
+       "      <td>GoodFellas</td>\n",
551
+       "      <td>3211.0</td>\n",
552
+       "      <td>8.2</td>\n",
553
+       "      <td>8.160414</td>\n",
554
+       "      <td>145.0</td>\n",
555
+       "    </tr>\n",
556
+       "    <tr>\n",
557
+       "      <th>2216</th>\n",
558
+       "      <td>American History X</td>\n",
559
+       "      <td>3120.0</td>\n",
560
+       "      <td>8.2</td>\n",
561
+       "      <td>8.159278</td>\n",
562
+       "      <td>119.0</td>\n",
563
+       "    </tr>\n",
564
+       "    <tr>\n",
565
+       "      <th>1161</th>\n",
566
+       "      <td>12 Angry Men</td>\n",
567
+       "      <td>2130.0</td>\n",
568
+       "      <td>8.2</td>\n",
569
+       "      <td>8.140785</td>\n",
570
+       "      <td>96.0</td>\n",
571
+       "    </tr>\n",
572
+       "    <tr>\n",
573
+       "      <th>9698</th>\n",
574
+       "      <td>Howl's Moving Castle</td>\n",
575
+       "      <td>2049.0</td>\n",
576
+       "      <td>8.2</td>\n",
577
+       "      <td>8.138499</td>\n",
578
+       "      <td>119.0</td>\n",
579
+       "    </tr>\n",
580
+       "    <tr>\n",
581
+       "      <th>2884</th>\n",
582
+       "      <td>Princess Mononoke</td>\n",
583
+       "      <td>2041.0</td>\n",
584
+       "      <td>8.2</td>\n",
585
+       "      <td>8.138264</td>\n",
586
+       "      <td>134.0</td>\n",
587
+       "    </tr>\n",
588
+       "  </tbody>\n",
589
+       "</table>\n",
590
+       "</div>"
591
+      ],
592
+      "text/plain": [
593
+       "                                 title  vote_count  vote_average     score  \\\n",
594
+       "10309      Dilwale Dulhania Le Jayenge       661.0           9.1  8.855148   \n",
595
+       "314           The Shawshank Redemption      8358.0           8.5  8.482863   \n",
596
+       "834                      The Godfather      6024.0           8.5  8.476278   \n",
597
+       "40251                       Your Name.      1030.0           8.5  8.366584   \n",
598
+       "12481                  The Dark Knight     12269.0           8.3  8.289115   \n",
599
+       "2843                        Fight Club      9678.0           8.3  8.286216   \n",
600
+       "292                       Pulp Fiction      8670.0           8.3  8.284623   \n",
601
+       "522                   Schindler's List      4436.0           8.3  8.270109   \n",
602
+       "23673                         Whiplash      4376.0           8.3  8.269704   \n",
603
+       "5481                     Spirited Away      3968.0           8.3  8.266628   \n",
604
+       "2211                 Life Is Beautiful      3643.0           8.3  8.263691   \n",
605
+       "1178            The Godfather: Part II      3418.0           8.3  8.261335   \n",
606
+       "1152   One Flew Over the Cuckoo's Nest      3001.0           8.3  8.256051   \n",
607
+       "1176                            Psycho      2405.0           8.3  8.245381   \n",
608
+       "351                       Forrest Gump      8147.0           8.2  8.184252   \n",
609
+       "1184       Once Upon a Time in America      1104.0           8.3  8.183804   \n",
610
+       "1154           The Empire Strikes Back      5998.0           8.2  8.178656   \n",
611
+       "18465                 The Intouchables      5410.0           8.2  8.176357   \n",
612
+       "289             Leon: The Professional      4293.0           8.2  8.170276   \n",
613
+       "3030                    The Green Mile      4166.0           8.2  8.169381   \n",
614
+       "1170                        GoodFellas      3211.0           8.2  8.160414   \n",
615
+       "2216                American History X      3120.0           8.2  8.159278   \n",
616
+       "1161                      12 Angry Men      2130.0           8.2  8.140785   \n",
617
+       "9698              Howl's Moving Castle      2049.0           8.2  8.138499   \n",
618
+       "2884                 Princess Mononoke      2041.0           8.2  8.138264   \n",
619
+       "\n",
620
+       "       runtime  \n",
621
+       "10309    190.0  \n",
622
+       "314      142.0  \n",
623
+       "834      175.0  \n",
624
+       "40251    106.0  \n",
625
+       "12481    152.0  \n",
626
+       "2843     139.0  \n",
627
+       "292      154.0  \n",
628
+       "522      195.0  \n",
629
+       "23673    105.0  \n",
630
+       "5481     125.0  \n",
631
+       "2211     116.0  \n",
632
+       "1178     200.0  \n",
633
+       "1152     133.0  \n",
634
+       "1176     109.0  \n",
635
+       "351      142.0  \n",
636
+       "1184     229.0  \n",
637
+       "1154     124.0  \n",
638
+       "18465    112.0  \n",
639
+       "289      110.0  \n",
640
+       "3030     189.0  \n",
641
+       "1170     145.0  \n",
642
+       "2216     119.0  \n",
643
+       "1161      96.0  \n",
644
+       "9698     119.0  \n",
645
+       "2884     134.0  "
646
+      ]
647
+     },
648
+     "execution_count": 7,
649
+     "metadata": {},
650
+     "output_type": "execute_result"
651
+    }
652
+   ],
653
+   "source": [
654
+    "#Sort movies in descending order of their scores\n",
655
+    "q_movies = q_movies.sort_values('score', ascending=False)\n",
656
+    "\n",
657
+    "#Print the top 25 movies\n",
658
+    "q_movies[['title', 'vote_count', 'vote_average', 'score', 'runtime']].head(25)"
659
+   ]
660
+  },
661
+  {
662
+   "cell_type": "code",
663
+   "execution_count": null,
664
+   "metadata": {},
665
+   "outputs": [],
666
+   "source": []
667
+  },
668
+  {
669
+   "cell_type": "code",
670
+   "execution_count": null,
671
+   "metadata": {},
672
+   "outputs": [],
673
+   "source": []
674
+  }
675
+ ],
676
+ "metadata": {
677
+  "kernelspec": {
678
+   "display_name": "Python 3 (ipykernel)",
679
+   "language": "python",
680
+   "name": "python3"
681
+  },
682
+  "language_info": {
683
+   "codemirror_mode": {
684
+    "name": "ipython",
685
+    "version": 3
686
+   },
687
+   "file_extension": ".py",
688
+   "mimetype": "text/x-python",
689
+   "name": "python",
690
+   "nbconvert_exporter": "python",
691
+   "pygments_lexer": "ipython3",
692
+   "version": "3.10.4"
693
+  }
694
+ },
695
+ "nbformat": 4,
696
+ "nbformat_minor": 2
697
+}

+ 783 - 0
Chapter4/.ipynb_checkpoints/Content Based Recommenders-checkpoint.ipynb

@@ -0,0 +1,783 @@
1
+{
2
+ "cells": [
3
+  {
4
+   "cell_type": "markdown",
5
+   "metadata": {},
6
+   "source": [
7
+    "# Plot Description Based Recommender"
8
+   ]
9
+  },
10
+  {
11
+   "cell_type": "code",
12
+   "execution_count": 1,
13
+   "metadata": {},
14
+   "outputs": [
15
+    {
16
+     "name": "stdout",
17
+     "output_type": "stream",
18
+     "text": [
19
+      "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
20
+      "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
21
+      "Collecting matplotlib\n",
22
+      "  Downloading matplotlib-3.5.3-cp310-cp310-macosx_10_9_x86_64.whl (7.3 MB)\n",
23
+      "\u001b[2K     \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
24
+      "\u001b[?25hRequirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
25
+      "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
26
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
27
+      "Collecting fonttools>=4.22.0\n",
28
+      "  Using cached fonttools-4.34.4-py3-none-any.whl (944 kB)\n",
29
+      "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
30
+      "Collecting cycler>=0.10\n",
31
+      "  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)\n",
32
+      "Collecting kiwisolver>=1.0.1\n",
33
+      "  Downloading kiwisolver-1.4.4-cp310-cp310-macosx_10_9_x86_64.whl (65 kB)\n",
34
+      "\u001b[2K     \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.5/65.5 KB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
35
+      "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
36
+      "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
37
+      "Collecting pillow>=6.2.0\n",
38
+      "  Downloading Pillow-9.2.0-cp310-cp310-macosx_10_10_x86_64.whl (3.1 MB)\n",
39
+      "\u001b[2K     \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m0m\n",
40
+      "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
41
+      "Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib\n",
42
+      "Successfully installed cycler-0.11.0 fonttools-4.34.4 kiwisolver-1.4.4 matplotlib-3.5.3 pillow-9.2.0\n",
43
+      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
44
+      "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
45
+      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
46
+     ]
47
+    }
48
+   ],
49
+   "source": [
50
+    "%pip install scikit-learn scipy matplotlib"
51
+   ]
52
+  },
53
+  {
54
+   "cell_type": "code",
55
+   "execution_count": 2,
56
+   "metadata": {},
57
+   "outputs": [
58
+    {
59
+     "name": "stdout",
60
+     "output_type": "stream",
61
+     "text": [
62
+      "Requirement already satisfied: scikit-learn in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.1.2)\n",
63
+      "Requirement already satisfied: scipy in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (1.9.0)\n",
64
+      "Requirement already satisfied: matplotlib in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (3.5.3)\n",
65
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (3.1.0)\n",
66
+      "Requirement already satisfied: joblib>=1.0.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from scikit-learn) (1.1.0)\n",
67
+      "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/site-packages (from scikit-learn) (1.22.4)\n",
68
+      "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (3.0.9)\n",
69
+      "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n",
70
+      "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (1.4.4)\n",
71
+      "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (4.34.4)\n",
72
+      "Requirement already satisfied: cycler>=0.10 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n",
73
+      "Requirement already satisfied: packaging>=20.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (21.3)\n",
74
+      "Requirement already satisfied: pillow>=6.2.0 in /usr/local/Cellar/ipython/8.4.0/libexec/lib/python3.10/site-packages (from matplotlib) (9.2.0)\n",
75
+      "Requirement already satisfied: six>=1.5 in /usr/local/Cellar/six/1.16.0_2/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
76
+      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.2.2 is available.\n",
77
+      "You should consider upgrading via the '/usr/local/Cellar/ipython/8.4.0/libexec/bin/python3.10 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
78
+      "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n"
79
+     ]
80
+    }
81
+   ],
82
+   "source": [
83
+    "%pip install scikit-learn scipy matplotlib"
84
+   ]
85
+  },
86
+  {
87
+   "cell_type": "code",
88
+   "execution_count": 3,
89
+   "metadata": {},
90
+   "outputs": [
91
+    {
92
+     "data": {
93
+      "text/html": [
94
+       "<div>\n",
95
+       "<style scoped>\n",
96
+       "    .dataframe tbody tr th:only-of-type {\n",
97
+       "        vertical-align: middle;\n",
98
+       "    }\n",
99
+       "\n",
100
+       "    .dataframe tbody tr th {\n",
101
+       "        vertical-align: top;\n",
102
+       "    }\n",
103
+       "\n",
104
+       "    .dataframe thead th {\n",
105
+       "        text-align: right;\n",
106
+       "    }\n",
107
+       "</style>\n",
108
+       "<table border=\"1\" class=\"dataframe\">\n",
109
+       "  <thead>\n",
110
+       "    <tr style=\"text-align: right;\">\n",
111
+       "      <th></th>\n",
112
+       "      <th>title</th>\n",
113
+       "      <th>genres</th>\n",
114
+       "      <th>runtime</th>\n",
115
+       "      <th>vote_average</th>\n",
116
+       "      <th>vote_count</th>\n",
117
+       "      <th>year</th>\n",
118
+       "    </tr>\n",
119
+       "  </thead>\n",
120
+       "  <tbody>\n",
121
+       "    <tr>\n",
122
+       "      <th>0</th>\n",
123
+       "      <td>Toy Story</td>\n",
124
+       "      <td>['animation', 'comedy', 'family']</td>\n",
125
+       "      <td>81.0</td>\n",
126
+       "      <td>7.7</td>\n",
127
+       "      <td>5415.0</td>\n",
128
+       "      <td>1995</td>\n",
129
+       "    </tr>\n",
130
+       "    <tr>\n",
131
+       "      <th>1</th>\n",
132
+       "      <td>Jumanji</td>\n",
133
+       "      <td>['adventure', 'fantasy', 'family']</td>\n",
134
+       "      <td>104.0</td>\n",
135
+       "      <td>6.9</td>\n",
136
+       "      <td>2413.0</td>\n",
137
+       "      <td>1995</td>\n",
138
+       "    </tr>\n",
139
+       "    <tr>\n",
140
+       "      <th>2</th>\n",
141
+       "      <td>Grumpier Old Men</td>\n",
142
+       "      <td>['romance', 'comedy']</td>\n",
143
+       "      <td>101.0</td>\n",
144
+       "      <td>6.5</td>\n",
145
+       "      <td>92.0</td>\n",
146
+       "      <td>1995</td>\n",
147
+       "    </tr>\n",
148
+       "    <tr>\n",
149
+       "      <th>3</th>\n",
150
+       "      <td>Waiting to Exhale</td>\n",
151
+       "      <td>['comedy', 'drama', 'romance']</td>\n",
152
+       "      <td>127.0</td>\n",
153
+       "      <td>6.1</td>\n",
154
+       "      <td>34.0</td>\n",
155
+       "      <td>1995</td>\n",
156
+       "    </tr>\n",
157
+       "    <tr>\n",
158
+       "      <th>4</th>\n",
159
+       "      <td>Father of the Bride Part II</td>\n",
160
+       "      <td>['comedy']</td>\n",
161
+       "      <td>106.0</td>\n",
162
+       "      <td>5.7</td>\n",
163
+       "      <td>173.0</td>\n",
164
+       "      <td>1995</td>\n",
165
+       "    </tr>\n",
166
+       "  </tbody>\n",
167
+       "</table>\n",
168
+       "</div>"
169
+      ],
170
+      "text/plain": [
171
+       "                         title                              genres  runtime  \\\n",
172
+       "0                    Toy Story   ['animation', 'comedy', 'family']     81.0   \n",
173
+       "1                      Jumanji  ['adventure', 'fantasy', 'family']    104.0   \n",
174
+       "2             Grumpier Old Men               ['romance', 'comedy']    101.0   \n",
175
+       "3            Waiting to Exhale      ['comedy', 'drama', 'romance']    127.0   \n",
176
+       "4  Father of the Bride Part II                          ['comedy']    106.0   \n",
177
+       "\n",
178
+       "   vote_average  vote_count  year  \n",
179
+       "0           7.7      5415.0  1995  \n",
180
+       "1           6.9      2413.0  1995  \n",
181
+       "2           6.5        92.0  1995  \n",
182
+       "3           6.1        34.0  1995  \n",
183
+       "4           5.7       173.0  1995  "
184
+      ]
185
+     },
186
+     "execution_count": 3,
187
+     "metadata": {},
188
+     "output_type": "execute_result"
189
+    }
190
+   ],
191
+   "source": [
192
+    "import pandas as pd\n",
193
+    "import numpy as np\n",
194
+    "\n",
195
+    "#Import data from the clean file \n",
196
+    "df = pd.read_csv('../data/metadata_clean.csv')\n",
197
+    "\n",
198
+    "#Print the head of the cleaned DataFrame\n",
199
+    "df.head()"
200
+   ]
201
+  },
202
+  {
203
+   "cell_type": "code",
204
+   "execution_count": 4,
205
+   "metadata": {},
206
+   "outputs": [
207
+    {
208
+     "data": {
209
+      "text/html": [
210
+       "<div>\n",
211
+       "<style scoped>\n",
212
+       "    .dataframe tbody tr th:only-of-type {\n",
213
+       "        vertical-align: middle;\n",
214
+       "    }\n",
215
+       "\n",
216
+       "    .dataframe tbody tr th {\n",
217
+       "        vertical-align: top;\n",
218
+       "    }\n",
219
+       "\n",
220
+       "    .dataframe thead th {\n",
221
+       "        text-align: right;\n",
222
+       "    }\n",
223
+       "</style>\n",
224
+       "<table border=\"1\" class=\"dataframe\">\n",
225
+       "  <thead>\n",
226
+       "    <tr style=\"text-align: right;\">\n",
227
+       "      <th></th>\n",
228
+       "      <th>title</th>\n",
229
+       "      <th>genres</th>\n",
230
+       "      <th>runtime</th>\n",
231
+       "      <th>vote_average</th>\n",
232
+       "      <th>vote_count</th>\n",
233
+       "      <th>year</th>\n",
234
+       "      <th>overview</th>\n",
235
+       "      <th>id</th>\n",
236
+       "    </tr>\n",
237
+       "  </thead>\n",
238
+       "  <tbody>\n",
239
+       "    <tr>\n",
240
+       "      <th>0</th>\n",
241
+       "      <td>Toy Story</td>\n",
242
+       "      <td>['animation', 'comedy', 'family']</td>\n",
243
+       "      <td>81.0</td>\n",
244
+       "      <td>7.7</td>\n",
245
+       "      <td>5415.0</td>\n",
246
+       "      <td>1995</td>\n",
247
+       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
248
+       "      <td>862</td>\n",
249
+       "    </tr>\n",
250
+       "    <tr>\n",
251
+       "      <th>1</th>\n",
252
+       "      <td>Jumanji</td>\n",
253
+       "      <td>['adventure', 'fantasy', 'family']</td>\n",
254
+       "      <td>104.0</td>\n",
255
+       "      <td>6.9</td>\n",
256
+       "      <td>2413.0</td>\n",
257
+       "      <td>1995</td>\n",
258
+       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
259
+       "      <td>8844</td>\n",
260
+       "    </tr>\n",
261
+       "    <tr>\n",
262
+       "      <th>2</th>\n",
263
+       "      <td>Grumpier Old Men</td>\n",
264
+       "      <td>['romance', 'comedy']</td>\n",
265
+       "      <td>101.0</td>\n",
266
+       "      <td>6.5</td>\n",
267
+       "      <td>92.0</td>\n",
268
+       "      <td>1995</td>\n",
269
+       "      <td>A family wedding reignites the ancient feud be...</td>\n",
270
+       "      <td>15602</td>\n",
271
+       "    </tr>\n",
272
+       "    <tr>\n",
273
+       "      <th>3</th>\n",
274
+       "      <td>Waiting to Exhale</td>\n",
275
+       "      <td>['comedy', 'drama', 'romance']</td>\n",
276
+       "      <td>127.0</td>\n",
277
+       "      <td>6.1</td>\n",
278
+       "      <td>34.0</td>\n",
279
+       "      <td>1995</td>\n",
280
+       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
281
+       "      <td>31357</td>\n",
282
+       "    </tr>\n",
283
+       "    <tr>\n",
284
+       "      <th>4</th>\n",
285
+       "      <td>Father of the Bride Part II</td>\n",
286
+       "      <td>['comedy']</td>\n",
287
+       "      <td>106.0</td>\n",
288
+       "      <td>5.7</td>\n",
289
+       "      <td>173.0</td>\n",
290
+       "      <td>1995</td>\n",
291
+       "      <td>Just when George Banks has recovered from his ...</td>\n",
292
+       "      <td>11862</td>\n",
293
+       "    </tr>\n",
294
+       "  </tbody>\n",
295
+       "</table>\n",
296
+       "</div>"
297
+      ],
298
+      "text/plain": [
299
+       "                         title                              genres  runtime  \\\n",
300
+       "0                    Toy Story   ['animation', 'comedy', 'family']     81.0   \n",
301
+       "1                      Jumanji  ['adventure', 'fantasy', 'family']    104.0   \n",
302
+       "2             Grumpier Old Men               ['romance', 'comedy']    101.0   \n",
303
+       "3            Waiting to Exhale      ['comedy', 'drama', 'romance']    127.0   \n",
304
+       "4  Father of the Bride Part II                          ['comedy']    106.0   \n",
305
+       "\n",
306
+       "   vote_average  vote_count  year  \\\n",
307
+       "0           7.7      5415.0  1995   \n",
308
+       "1           6.9      2413.0  1995   \n",
309
+       "2           6.5        92.0  1995   \n",
310
+       "3           6.1        34.0  1995   \n",
311
+       "4           5.7       173.0  1995   \n",
312
+       "\n",
313
+       "                                            overview     id  \n",
314
+       "0  Led by Woody, Andy's toys live happily in his ...    862  \n",
315
+       "1  When siblings Judy and Peter discover an encha...   8844  \n",
316
+       "2  A family wedding reignites the ancient feud be...  15602  \n",
317
+       "3  Cheated on, mistreated and stepped on, the wom...  31357  \n",
318
+       "4  Just when George Banks has recovered from his ...  11862  "
319
+      ]
320
+     },
321
+     "execution_count": 4,
322
+     "metadata": {},
323
+     "output_type": "execute_result"
324
+    }
325
+   ],
326
+   "source": [
327
+    "#Import the original file\n",
328
+    "orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)\n",
329
+    "\n",
330
+    "#Add the useful features into the cleaned dataframe\n",
331
+    "df['overview'], df['id'] = orig_df['overview'], orig_df['id']\n",
332
+    "\n",
333
+    "df.head()"
334
+   ]
335
+  },
336
+  {
337
+   "cell_type": "code",
338
+   "execution_count": null,
339
+   "metadata": {
340
+    "scrolled": true
341
+   },
342
+   "outputs": [],
343
+   "source": [
344
+    "#Import TfidfVectorizer from the scikit-learn library\n",
345
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
346
+    "\n",
347
+    "#Define a TF-IDF Vectorizer Object. Remove all english stopwords\n",
348
+    "tfidf = TfidfVectorizer(stop_words='english')\n",
349
+    "\n",
350
+    "#Replace NaN with an empty string\n",
351
+    "df['overview'] = df['overview'].fillna('')\n",
352
+    "\n",
353
+    "#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature\n",
354
+    "tfidf_matrix = tfidf.fit_transform(df['overview'])\n",
355
+    "\n",
356
+    "#Output the shape of tfidf_matrix\n",
357
+    "tfidf_matrix.shape"
358
+   ]
359
+  },
360
+  {
361
+   "cell_type": "code",
362
+   "execution_count": null,
363
+   "metadata": {},
364
+   "outputs": [],
365
+   "source": []
366
+  },
367
+  {
368
+   "cell_type": "code",
369
+   "execution_count": null,
370
+   "metadata": {},
371
+   "outputs": [],
372
+   "source": [
373
+    "# Import linear_kernel to compute the dot product\n",
374
+    "from sklearn.metrics.pairwise import linear_kernel\n",
375
+    "\n",
376
+    "# Compute the cosine similarity matrix (TfidfVectorizer L2-normalizes rows by default, so the dot product equals the cosine similarity)\n",
377
+    "cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)"
378
+   ]
379
+  },
380
+  {
381
+   "cell_type": "code",
382
+   "execution_count": null,
383
+   "metadata": {},
384
+   "outputs": [],
385
+   "source": [
386
+    "#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any\n",
387
+    "indices = pd.Series(df.index, index=df['title']).drop_duplicates()"
388
+   ]
389
+  },
390
+  {
391
+   "cell_type": "code",
392
+   "execution_count": null,
393
+   "metadata": {},
394
+   "outputs": [],
395
+   "source": [
396
+    "# Function that takes in movie title as input and gives recommendations \n",
397
+    "def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):\n",
398
+    "    # Obtain the index of the movie that matches the title\n",
399
+    "    idx = indices[title]\n",
400
+    "\n",
401
+    "    # Get the pairwise similarity scores of all movies with that movie\n",
402
+    "    # And convert it into a list of tuples as described above\n",
403
+    "    sim_scores = list(enumerate(cosine_sim[idx]))\n",
404
+    "\n",
405
+    "    # Sort the movies based on the cosine similarity scores\n",
406
+    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
407
+    "\n",
408
+    "    # Get the scores of the 10 most similar movies. Ignore the first movie.\n",
409
+    "    sim_scores = sim_scores[1:11]\n",
410
+    "\n",
411
+    "    # Get the movie indices\n",
412
+    "    movie_indices = [i[0] for i in sim_scores]\n",
413
+    "\n",
414
+    "    # Return the top 10 most similar movies\n",
415
+    "    return df['title'].iloc[movie_indices]"
416
+   ]
417
+  },
418
+  {
419
+   "cell_type": "code",
420
+   "execution_count": null,
421
+   "metadata": {},
422
+   "outputs": [],
423
+   "source": [
424
+    "#Get recommendations for The Lion King\n",
425
+    "content_recommender('The Lion King')"
426
+   ]
427
+  },
428
+  {
429
+   "cell_type": "markdown",
430
+   "metadata": {},
431
+   "source": [
432
+    "# Metadata Based Recommender"
433
+   ]
434
+  },
435
+  {
436
+   "cell_type": "code",
437
+   "execution_count": null,
438
+   "metadata": {},
439
+   "outputs": [],
440
+   "source": [
441
+    "# Load the keywords and credits files\n",
442
+    "cred_df = pd.read_csv('../data/credits.csv')\n",
443
+    "key_df = pd.read_csv('../data/keywords.csv')"
444
+   ]
445
+  },
446
+  {
447
+   "cell_type": "code",
448
+   "execution_count": null,
449
+   "metadata": {},
450
+   "outputs": [],
451
+   "source": [
452
+    "#Print the head of the credit dataframe\n",
453
+    "cred_df.head()"
454
+   ]
455
+  },
456
+  {
457
+   "cell_type": "code",
458
+   "execution_count": null,
459
+   "metadata": {},
460
+   "outputs": [],
461
+   "source": [
462
+    "#Print the head of the keywords dataframe\n",
463
+    "key_df.head()"
464
+   ]
465
+  },
466
+  {
467
+   "cell_type": "code",
468
+   "execution_count": null,
469
+   "metadata": {},
470
+   "outputs": [],
471
+   "source": [
472
+    "#Convert the IDs of df into int\n",
473
+    "df['id'] = df['id'].astype('int')"
474
+   ]
475
+  },
476
+  {
477
+   "cell_type": "code",
478
+   "execution_count": null,
479
+   "metadata": {},
480
+   "outputs": [],
481
+   "source": [
482
+    "# Function to convert all non-integer IDs to NaN\n",
483
+    "def clean_ids(x):\n",
484
+    "    try:\n",
485
+    "        return int(x)\n",
486
+    "    except:\n",
487
+    "        return np.nan"
488
+   ]
489
+  },
490
+  {
491
+   "cell_type": "code",
492
+   "execution_count": null,
493
+   "metadata": {},
494
+   "outputs": [],
495
+   "source": [
496
+    "#Clean the ids of df\n",
497
+    "df['id'] = df['id'].apply(clean_ids)\n",
498
+    "\n",
499
+    "#Filter all rows that have a null ID\n",
500
+    "df = df[df['id'].notnull()]"
501
+   ]
502
+  },
503
+  {
504
+   "cell_type": "code",
505
+   "execution_count": null,
506
+   "metadata": {},
507
+   "outputs": [],
508
+   "source": [
509
+    "# Convert IDs into integer\n",
510
+    "df['id'] = df['id'].astype('int')\n",
511
+    "key_df['id'] = key_df['id'].astype('int')\n",
512
+    "cred_df['id'] = cred_df['id'].astype('int')\n",
513
+    "\n",
514
+    "# Merge keywords and credits into your main metadata dataframe\n",
515
+    "df = df.merge(cred_df, on='id')\n",
516
+    "df = df.merge(key_df, on='id')\n",
517
+    "\n",
518
+    "#Display the head of df\n",
519
+    "df.head()"
520
+   ]
521
+  },
522
+  {
523
+   "cell_type": "code",
524
+   "execution_count": null,
525
+   "metadata": {},
526
+   "outputs": [],
527
+   "source": [
528
+    "# Convert the stringified objects into the native python objects\n",
529
+    "from ast import literal_eval\n",
530
+    "\n",
531
+    "features = ['cast', 'crew', 'keywords', 'genres']\n",
532
+    "for feature in features:\n",
533
+    "    df[feature] = df[feature].apply(literal_eval)"
534
+   ]
535
+  },
536
+  {
537
+   "cell_type": "code",
538
+   "execution_count": null,
539
+   "metadata": {},
540
+   "outputs": [],
541
+   "source": [
542
+    "#Print the first crew member of the first movie in df\n",
543
+    "df.iloc[0]['crew'][0]"
544
+   ]
545
+  },
546
+  {
547
+   "cell_type": "code",
548
+   "execution_count": null,
549
+   "metadata": {},
550
+   "outputs": [],
551
+   "source": [
552
+    "# Extract the director's name. If director is not listed, return NaN\n",
553
+    "def get_director(x):\n",
554
+    "    for crew_member in x:\n",
555
+    "        if crew_member['job'] == 'Director':\n",
556
+    "            return crew_member['name']\n",
557
+    "    return np.nan"
558
+   ]
559
+  },
560
+  {
561
+   "cell_type": "code",
562
+   "execution_count": null,
563
+   "metadata": {},
564
+   "outputs": [],
565
+   "source": [
566
+    "#Define the new director feature\n",
567
+    "df['director'] = df['crew'].apply(get_director)\n",
568
+    "\n",
569
+    "#Print the directors of the first five movies\n",
570
+    "df['director'].head()"
571
+   ]
572
+  },
573
+  {
574
+   "cell_type": "code",
575
+   "execution_count": null,
576
+   "metadata": {},
577
+   "outputs": [],
578
+   "source": [
579
+    "# Returns the first 3 elements of the list, or the entire list if it has 3 or fewer elements.\n",
580
+    "def generate_list(x):\n",
581
+    "    if isinstance(x, list):\n",
582
+    "        names = [i['name'] for i in x]\n",
583
+    "        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.\n",
584
+    "        if len(names) > 3:\n",
585
+    "            names = names[:3]\n",
586
+    "        return names\n",
587
+    "\n",
588
+    "    #Return empty list in case of missing/malformed data\n",
589
+    "    return []"
590
+   ]
591
+  },
592
+  {
593
+   "cell_type": "code",
594
+   "execution_count": null,
595
+   "metadata": {},
596
+   "outputs": [],
597
+   "source": [
598
+    "#Apply the generate_list function to cast and keywords\n",
599
+    "df['cast'] = df['cast'].apply(generate_list)\n",
600
+    "df['keywords'] = df['keywords'].apply(generate_list)"
601
+   ]
602
+  },
603
+  {
604
+   "cell_type": "code",
605
+   "execution_count": null,
606
+   "metadata": {},
607
+   "outputs": [],
608
+   "source": [
609
+    "#Only consider a maximum of 3 genres\n",
610
+    "df['genres'] = df['genres'].apply(lambda x: x[:3])"
611
+   ]
612
+  },
613
+  {
614
+   "cell_type": "code",
615
+   "execution_count": null,
616
+   "metadata": {},
617
+   "outputs": [],
618
+   "source": [
619
+    "# Print the new features of the first 5 movies along with title\n",
620
+    "df[['title', 'cast', 'director', 'keywords', 'genres']].head()"
621
+   ]
622
+  },
623
+  {
624
+   "cell_type": "code",
625
+   "execution_count": null,
626
+   "metadata": {},
627
+   "outputs": [],
628
+   "source": [
629
+    "# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase\n",
630
+    "def sanitize(x):\n",
631
+    "    if isinstance(x, list):\n",
632
+    "        #Strip spaces and convert to lowercase\n",
633
+    "        return [str.lower(i.replace(\" \", \"\")) for i in x]\n",
634
+    "    else:\n",
635
+    "        #Check if director exists. If not, return empty string\n",
636
+    "        if isinstance(x, str):\n",
637
+    "            return str.lower(x.replace(\" \", \"\"))\n",
638
+    "        else:\n",
639
+    "            return ''"
640
+   ]
641
+  },
642
+  {
643
+   "cell_type": "code",
644
+   "execution_count": null,
645
+   "metadata": {},
646
+   "outputs": [],
647
+   "source": [
648
+    "#Apply the sanitize function to cast, director, genres and keywords\n",
649
+    "for feature in ['cast', 'director', 'genres', 'keywords']:\n",
650
+    "    df[feature] = df[feature].apply(sanitize)"
651
+   ]
652
+  },
653
+  {
654
+   "cell_type": "code",
655
+   "execution_count": null,
656
+   "metadata": {
657
+    "scrolled": true
658
+   },
659
+   "outputs": [],
660
+   "source": [
661
+    "#Function that creates a soup out of the desired metadata\n",
662
+    "def create_soup(x):\n",
663
+    "    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])"
664
+   ]
665
+  },
666
+  {
667
+   "cell_type": "code",
668
+   "execution_count": null,
669
+   "metadata": {},
670
+   "outputs": [],
671
+   "source": [
672
+    "# Create the new soup feature\n",
673
+    "df['soup'] = df.apply(create_soup, axis=1)"
674
+   ]
675
+  },
676
+  {
677
+   "cell_type": "code",
678
+   "execution_count": null,
679
+   "metadata": {},
680
+   "outputs": [],
681
+   "source": [
682
+    "#Display the soup of the first movie\n",
683
+    "df.iloc[0]['soup']"
684
+   ]
685
+  },
686
+  {
687
+   "cell_type": "code",
688
+   "execution_count": null,
689
+   "metadata": {},
690
+   "outputs": [],
691
+   "source": [
692
+    "# Import CountVectorizer\n",
693
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
694
+    "\n",
695
+    "#Define a new CountVectorizer object and create vectors for the soup\n",
696
+    "count = CountVectorizer(stop_words='english')\n",
697
+    "count_matrix = count.fit_transform(df['soup'])"
698
+   ]
699
+  },
700
+  {
701
+   "cell_type": "code",
702
+   "execution_count": null,
703
+   "metadata": {},
704
+   "outputs": [],
705
+   "source": [
706
+    "#Import cosine_similarity function\n",
707
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
708
+    "\n",
709
+    "#Compute the cosine similarity score (count vectors are not normalized, so a plain dot product would not be equivalent here)\n",
710
+    "cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
711
+   ]
712
+  },
713
+  {
714
+   "cell_type": "code",
715
+   "execution_count": null,
716
+   "metadata": {},
717
+   "outputs": [],
718
+   "source": [
719
+    "# Reset index of your df and construct reverse mapping again\n",
720
+    "df = df.reset_index()\n",
721
+    "indices2 = pd.Series(df.index, index=df['title'])"
722
+   ]
723
+  },
724
+  {
725
+   "cell_type": "code",
726
+   "execution_count": null,
727
+   "metadata": {},
728
+   "outputs": [],
729
+   "source": [
730
+    "content_recommender('The Lion King', cosine_sim2, df, indices2)"
731
+   ]
732
+  },
733
+  {
734
+   "cell_type": "code",
735
+   "execution_count": null,
736
+   "metadata": {},
737
+   "outputs": [],
738
+   "source": []
739
+  },
740
+  {
741
+   "cell_type": "code",
742
+   "execution_count": null,
743
+   "metadata": {},
744
+   "outputs": [],
745
+   "source": []
746
+  },
747
+  {
748
+   "cell_type": "code",
749
+   "execution_count": null,
750
+   "metadata": {},
751
+   "outputs": [],
752
+   "source": []
753
+  },
754
+  {
755
+   "cell_type": "code",
756
+   "execution_count": null,
757
+   "metadata": {},
758
+   "outputs": [],
759
+   "source": []
760
+  }
761
+ ],
762
+ "metadata": {
763
+  "kernelspec": {
764
+   "display_name": "Python 3 (ipykernel)",
765
+   "language": "python",
766
+   "name": "python3"
767
+  },
768
+  "language_info": {
769
+   "codemirror_mode": {
770
+    "name": "ipython",
771
+    "version": 3
772
+   },
773
+   "file_extension": ".py",
774
+   "mimetype": "text/x-python",
775
+   "name": "python",
776
+   "nbconvert_exporter": "python",
777
+   "pygments_lexer": "ipython3",
778
+   "version": "3.10.4"
779
+  }
780
+ },
781
+ "nbformat": 4,
782
+ "nbformat_minor": 2
783
+}

File diff suppressed because it is too large
+ 1291 - 0
Chapter4/Content Based Recommenders.ipynb


File diff suppressed because it is too large
+ 1011 - 0
Chapter5/.ipynb_checkpoints/Data Mining-checkpoint.ipynb


File diff suppressed because it is too large
+ 1011 - 0
Chapter5/Data Mining.ipynb


File diff suppressed because it is too large
+ 2037 - 0
Chapter6/.ipynb_checkpoints/Collaborative Filtering-checkpoint.ipynb


File diff suppressed because it is too large
+ 2037 - 0
Chapter6/Collaborative Filtering.ipynb


+ 6 - 0
Chapter7/.ipynb_checkpoints/Hybrid Recommender-checkpoint.ipynb

@@ -0,0 +1,6 @@
1
+{
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 2
6
+}

+ 496 - 0
Chapter7/Hybrid Recommender.ipynb

@@ -0,0 +1,496 @@
1
+{
2
+ "cells": [
3
+  {
4
+   "cell_type": "markdown",
5
+   "metadata": {},
6
+   "source": [
7
+    "# Hybrid Recommenders"
8
+   ]
9
+  },
10
+  {
11
+   "cell_type": "code",
12
+   "execution_count": 1,
13
+   "metadata": {
14
+    "collapsed": true
15
+   },
16
+   "outputs": [],
17
+   "source": [
18
+    "import numpy as np\n",
19
+    "import pandas as pd"
20
+   ]
21
+  },
22
+  {
23
+   "cell_type": "code",
24
+   "execution_count": 2,
25
+   "metadata": {
26
+    "collapsed": true
27
+   },
28
+   "outputs": [],
29
+   "source": [
30
+    "#Import or compute the cosine_sim matrix\n",
31
+    "cosine_sim = pd.read_csv('../data/cosine_sim.csv')"
32
+   ]
33
+  },
34
+  {
35
+   "cell_type": "code",
36
+   "execution_count": 3,
37
+   "metadata": {},
38
+   "outputs": [],
39
+   "source": [
40
+    "#Import or compute the cosine sim mapping matrix\n",
41
+    "cosine_sim_map = pd.read_csv('../data/cosine_sim_map.csv', header=None)\n",
42
+    "\n",
43
+    "#Convert cosine_sim_map into a Pandas Series\n",
44
+    "cosine_sim_map = cosine_sim_map.set_index(0)\n",
45
+    "cosine_sim_map = cosine_sim_map[1]"
46
+   ]
47
+  },
48
+  {
49
+   "cell_type": "code",
50
+   "execution_count": 4,
51
+   "metadata": {},
52
+   "outputs": [],
53
+   "source": [
54
+    "#Build the SVD based Collaborative filter\n",
55
+    "from surprise import SVD, Reader, Dataset\n",
56
+    "\n",
57
+    "reader = Reader()\n",
58
+    "ratings = pd.read_csv('../data/ratings_small.csv')\n",
59
+    "data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)\n",
60
+    "data.split(n_folds=5)\n",
61
+    "svd = SVD()\n",
62
+    "trainset = data.build_full_trainset()\n",
63
+    "svd.train(trainset)"
64
+   ]
65
+  },
66
+  {
67
+   "cell_type": "code",
68
+   "execution_count": 5,
69
+   "metadata": {},
70
+   "outputs": [],
71
+   "source": [
72
+    "#Build title to ID and ID to title mappings\n",
73
+    "id_map = pd.read_csv('../data/movie_ids.csv')\n",
74
+    "id_to_title = id_map.set_index('id')\n",
75
+    "title_to_id = id_map.set_index('title')"
76
+   ]
77
+  },
78
+  {
79
+   "cell_type": "code",
80
+   "execution_count": 6,
81
+   "metadata": {},
82
+   "outputs": [],
83
+   "source": [
84
+    "#Import or compute relevant metadata of the movies\n",
85
+    "smd = pd.read_csv('../data/metadata_small.csv')"
86
+   ]
87
+  },
88
+  {
89
+   "cell_type": "code",
90
+   "execution_count": 7,
91
+   "metadata": {},
92
+   "outputs": [],
93
+   "source": [
94
+    "def hybrid(userId, title):\n",
95
+    "    #Extract the cosine_sim index of the movie\n",
96
+    "    idx = cosine_sim_map[title]\n",
97
+    "    \n",
98
+    "    #Extract the TMDB ID of the movie\n",
99
+    "    tmdbId = title_to_id.loc[title]['id']\n",
100
+    "    \n",
101
+    "    #Extract the movie ID internally assigned by the dataset\n",
102
+    "    movie_id = title_to_id.loc[title]['movieId']\n",
103
+    "    \n",
104
+    "    #Extract the similarity scores and their corresponding index for every movie from the cosine_sim matrix\n",
105
+    "    sim_scores = list(enumerate(cosine_sim[str(int(idx))]))\n",
106
+    "    \n",
107
+    "    #Sort the (index, score) tuples in decreasing order of similarity scores\n",
108
+    "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n",
109
+    "    \n",
110
+    "    #Select the top 25 tuples, excluding the first \n",
111
+    "    #(as it is the similarity score of the movie with itself)\n",
112
+    "    sim_scores = sim_scores[1:26]\n",
113
+    "    \n",
114
+    "    #Store the cosine_sim indices of the top 25 movies in a list\n",
115
+    "    movie_indices = [i[0] for i in sim_scores]\n",
116
+    "\n",
117
+    "    #Extract the metadata of the aforementioned movies\n",
118
+    "    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]\n",
119
+    "    \n",
120
+    "    #Compute the predicted ratings using the SVD filter\n",
121
+    "    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, id_to_title.loc[x]['movieId']).est)\n",
122
+    "    \n",
123
+    "    #Sort the movies in decreasing order of predicted rating\n",
124
+    "    movies = movies.sort_values('est', ascending=False)\n",
125
+    "    \n",
126
+    "    #Return the top 10 movies as recommendations\n",
127
+    "    return movies.head(10)"
128
+   ]
129
+  },
130
+  {
131
+   "cell_type": "code",
132
+   "execution_count": 8,
133
+   "metadata": {},
134
+   "outputs": [
135
+    {
136
+     "data": {
137
+      "text/html": [
138
+       "<div>\n",
139
+       "<style>\n",
140
+       "    .dataframe thead tr:only-child th {\n",
141
+       "        text-align: right;\n",
142
+       "    }\n",
143
+       "\n",
144
+       "    .dataframe thead th {\n",
145
+       "        text-align: left;\n",
146
+       "    }\n",
147
+       "\n",
148
+       "    .dataframe tbody tr th {\n",
149
+       "        vertical-align: top;\n",
150
+       "    }\n",
151
+       "</style>\n",
152
+       "<table border=\"1\" class=\"dataframe\">\n",
153
+       "  <thead>\n",
154
+       "    <tr style=\"text-align: right;\">\n",
155
+       "      <th></th>\n",
156
+       "      <th>title</th>\n",
157
+       "      <th>vote_count</th>\n",
158
+       "      <th>vote_average</th>\n",
159
+       "      <th>year</th>\n",
160
+       "      <th>id</th>\n",
161
+       "      <th>est</th>\n",
162
+       "    </tr>\n",
163
+       "  </thead>\n",
164
+       "  <tbody>\n",
165
+       "    <tr>\n",
166
+       "      <th>1011</th>\n",
167
+       "      <td>The Terminator</td>\n",
168
+       "      <td>4208.0</td>\n",
169
+       "      <td>7.4</td>\n",
170
+       "      <td>1984</td>\n",
171
+       "      <td>218</td>\n",
172
+       "      <td>3.140748</td>\n",
173
+       "    </tr>\n",
174
+       "    <tr>\n",
175
+       "      <th>974</th>\n",
176
+       "      <td>Aliens</td>\n",
177
+       "      <td>3282.0</td>\n",
178
+       "      <td>7.7</td>\n",
179
+       "      <td>1986</td>\n",
180
+       "      <td>679</td>\n",
181
+       "      <td>3.126947</td>\n",
182
+       "    </tr>\n",
183
+       "    <tr>\n",
184
+       "      <th>8401</th>\n",
185
+       "      <td>Star Trek Into Darkness</td>\n",
186
+       "      <td>4479.0</td>\n",
187
+       "      <td>7.4</td>\n",
188
+       "      <td>2013</td>\n",
189
+       "      <td>54138</td>\n",
190
+       "      <td>3.079551</td>\n",
191
+       "    </tr>\n",
192
+       "    <tr>\n",
193
+       "      <th>7705</th>\n",
194
+       "      <td>Alice in Wonderland</td>\n",
195
+       "      <td>8.0</td>\n",
196
+       "      <td>5.4</td>\n",
197
+       "      <td>1933</td>\n",
198
+       "      <td>25694</td>\n",
199
+       "      <td>3.054995</td>\n",
200
+       "    </tr>\n",
201
+       "    <tr>\n",
202
+       "      <th>3060</th>\n",
203
+       "      <td>Sinbad and the Eye of the Tiger</td>\n",
204
+       "      <td>39.0</td>\n",
205
+       "      <td>6.3</td>\n",
206
+       "      <td>1977</td>\n",
207
+       "      <td>11940</td>\n",
208
+       "      <td>3.028386</td>\n",
209
+       "    </tr>\n",
210
+       "    <tr>\n",
211
+       "      <th>8658</th>\n",
212
+       "      <td>X-Men: Days of Future Past</td>\n",
213
+       "      <td>6155.0</td>\n",
214
+       "      <td>7.5</td>\n",
215
+       "      <td>2014</td>\n",
216
+       "      <td>127585</td>\n",
217
+       "      <td>2.997411</td>\n",
218
+       "    </tr>\n",
219
+       "    <tr>\n",
220
+       "      <th>2014</th>\n",
221
+       "      <td>Fantastic Planet</td>\n",
222
+       "      <td>140.0</td>\n",
223
+       "      <td>7.6</td>\n",
224
+       "      <td>1973</td>\n",
225
+       "      <td>16306</td>\n",
226
+       "      <td>2.957614</td>\n",
227
+       "    </tr>\n",
228
+       "    <tr>\n",
229
+       "      <th>522</th>\n",
230
+       "      <td>Terminator 2: Judgment Day</td>\n",
231
+       "      <td>4274.0</td>\n",
232
+       "      <td>7.7</td>\n",
233
+       "      <td>1991</td>\n",
234
+       "      <td>280</td>\n",
235
+       "      <td>2.914548</td>\n",
236
+       "    </tr>\n",
237
+       "    <tr>\n",
238
+       "      <th>1621</th>\n",
239
+       "      <td>Darby O'Gill and the Little People</td>\n",
240
+       "      <td>35.0</td>\n",
241
+       "      <td>6.7</td>\n",
242
+       "      <td>1959</td>\n",
243
+       "      <td>18887</td>\n",
244
+       "      <td>2.844940</td>\n",
245
+       "    </tr>\n",
246
+       "    <tr>\n",
247
+       "      <th>1668</th>\n",
248
+       "      <td>Return from Witch Mountain</td>\n",
249
+       "      <td>38.0</td>\n",
250
+       "      <td>5.6</td>\n",
251
+       "      <td>1978</td>\n",
252
+       "      <td>14822</td>\n",
253
+       "      <td>2.804012</td>\n",
254
+       "    </tr>\n",
255
+       "  </tbody>\n",
256
+       "</table>\n",
257
+       "</div>"
258
+      ],
259
+      "text/plain": [
260
+       "                                   title  vote_count  vote_average  year  \\\n",
261
+       "1011                      The Terminator      4208.0           7.4  1984   \n",
262
+       "974                               Aliens      3282.0           7.7  1986   \n",
263
+       "8401             Star Trek Into Darkness      4479.0           7.4  2013   \n",
264
+       "7705                 Alice in Wonderland         8.0           5.4  1933   \n",
265
+       "3060     Sinbad and the Eye of the Tiger        39.0           6.3  1977   \n",
266
+       "8658          X-Men: Days of Future Past      6155.0           7.5  2014   \n",
267
+       "2014                    Fantastic Planet       140.0           7.6  1973   \n",
268
+       "522           Terminator 2: Judgment Day      4274.0           7.7  1991   \n",
269
+       "1621  Darby O'Gill and the Little People        35.0           6.7  1959   \n",
270
+       "1668          Return from Witch Mountain        38.0           5.6  1978   \n",
271
+       "\n",
272
+       "          id       est  \n",
273
+       "1011     218  3.140748  \n",
274
+       "974      679  3.126947  \n",
275
+       "8401   54138  3.079551  \n",
276
+       "7705   25694  3.054995  \n",
277
+       "3060   11940  3.028386  \n",
278
+       "8658  127585  2.997411  \n",
279
+       "2014   16306  2.957614  \n",
280
+       "522      280  2.914548  \n",
281
+       "1621   18887  2.844940  \n",
282
+       "1668   14822  2.804012  "
283
+      ]
284
+     },
285
+     "execution_count": 8,
286
+     "metadata": {},
287
+     "output_type": "execute_result"
288
+    }
289
+   ],
290
+   "source": [
291
+    "hybrid(1, 'Avatar')"
292
+   ]
293
+  },
294
+  {
295
+   "cell_type": "code",
296
+   "execution_count": 9,
297
+   "metadata": {},
298
+   "outputs": [
299
+    {
300
+     "data": {
301
+      "text/html": [
302
+       "<div>\n",
303
+       "<style>\n",
304
+       "    .dataframe thead tr:only-child th {\n",
305
+       "        text-align: right;\n",
306
+       "    }\n",
307
+       "\n",
308
+       "    .dataframe thead th {\n",
309
+       "        text-align: left;\n",
310
+       "    }\n",
311
+       "\n",
312
+       "    .dataframe tbody tr th {\n",
313
+       "        vertical-align: top;\n",
314
+       "    }\n",
315
+       "</style>\n",
316
+       "<table border=\"1\" class=\"dataframe\">\n",
317
+       "  <thead>\n",
318
+       "    <tr style=\"text-align: right;\">\n",
319
+       "      <th></th>\n",
320
+       "      <th>title</th>\n",
321
+       "      <th>vote_count</th>\n",
322
+       "      <th>vote_average</th>\n",
323
+       "      <th>year</th>\n",
324
+       "      <th>id</th>\n",
325
+       "      <th>est</th>\n",
326
+       "    </tr>\n",
327
+       "  </thead>\n",
328
+       "  <tbody>\n",
329
+       "    <tr>\n",
330
+       "      <th>522</th>\n",
331
+       "      <td>Terminator 2: Judgment Day</td>\n",
332
+       "      <td>4274.0</td>\n",
333
+       "      <td>7.7</td>\n",
334
+       "      <td>1991</td>\n",
335
+       "      <td>280</td>\n",
336
+       "      <td>3.943639</td>\n",
337
+       "    </tr>\n",
338
+       "    <tr>\n",
339
+       "      <th>2834</th>\n",
340
+       "      <td>Predator</td>\n",
341
+       "      <td>2129.0</td>\n",
342
+       "      <td>7.3</td>\n",
343
+       "      <td>1987</td>\n",
344
+       "      <td>106</td>\n",
345
+       "      <td>3.866272</td>\n",
346
+       "    </tr>\n",
347
+       "    <tr>\n",
348
+       "      <th>8401</th>\n",
349
+       "      <td>Star Trek Into Darkness</td>\n",
350
+       "      <td>4479.0</td>\n",
351
+       "      <td>7.4</td>\n",
352
+       "      <td>2013</td>\n",
353
+       "      <td>54138</td>\n",
354
+       "      <td>3.858491</td>\n",
355
+       "    </tr>\n",
356
+       "    <tr>\n",
357
+       "      <th>1011</th>\n",
358
+       "      <td>The Terminator</td>\n",
359
+       "      <td>4208.0</td>\n",
360
+       "      <td>7.4</td>\n",
361
+       "      <td>1984</td>\n",
362
+       "      <td>218</td>\n",
363
+       "      <td>3.856029</td>\n",
364
+       "    </tr>\n",
365
+       "    <tr>\n",
366
+       "      <th>7705</th>\n",
367
+       "      <td>Alice in Wonderland</td>\n",
368
+       "      <td>8.0</td>\n",
369
+       "      <td>5.4</td>\n",
370
+       "      <td>1933</td>\n",
371
+       "      <td>25694</td>\n",
372
+       "      <td>3.701565</td>\n",
373
+       "    </tr>\n",
374
+       "    <tr>\n",
375
+       "      <th>922</th>\n",
376
+       "      <td>The Abyss</td>\n",
377
+       "      <td>822.0</td>\n",
378
+       "      <td>7.1</td>\n",
379
+       "      <td>1989</td>\n",
380
+       "      <td>2756</td>\n",
381
+       "      <td>3.676465</td>\n",
382
+       "    </tr>\n",
383
+       "    <tr>\n",
384
+       "      <th>974</th>\n",
385
+       "      <td>Aliens</td>\n",
386
+       "      <td>3282.0</td>\n",
387
+       "      <td>7.7</td>\n",
388
+       "      <td>1986</td>\n",
389
+       "      <td>679</td>\n",
390
+       "      <td>3.672303</td>\n",
391
+       "    </tr>\n",
392
+       "    <tr>\n",
393
+       "      <th>1621</th>\n",
394
+       "      <td>Darby O'Gill and the Little People</td>\n",
395
+       "      <td>35.0</td>\n",
396
+       "      <td>6.7</td>\n",
397
+       "      <td>1959</td>\n",
398
+       "      <td>18887</td>\n",
399
+       "      <td>3.628234</td>\n",
400
+       "    </tr>\n",
401
+       "    <tr>\n",
402
+       "      <th>1668</th>\n",
403
+       "      <td>Return from Witch Mountain</td>\n",
404
+       "      <td>38.0</td>\n",
405
+       "      <td>5.6</td>\n",
406
+       "      <td>1978</td>\n",
407
+       "      <td>14822</td>\n",
408
+       "      <td>3.614118</td>\n",
409
+       "    </tr>\n",
410
+       "    <tr>\n",
411
+       "      <th>2014</th>\n",
412
+       "      <td>Fantastic Planet</td>\n",
413
+       "      <td>140.0</td>\n",
414
+       "      <td>7.6</td>\n",
415
+       "      <td>1973</td>\n",
416
+       "      <td>16306</td>\n",
417
+       "      <td>3.602051</td>\n",
418
+       "    </tr>\n",
419
+       "  </tbody>\n",
420
+       "</table>\n",
421
+       "</div>"
422
+      ],
423
+      "text/plain": [
424
+       "                                   title  vote_count  vote_average  year  \\\n",
425
+       "522           Terminator 2: Judgment Day      4274.0           7.7  1991   \n",
426
+       "2834                            Predator      2129.0           7.3  1987   \n",
427
+       "8401             Star Trek Into Darkness      4479.0           7.4  2013   \n",
428
+       "1011                      The Terminator      4208.0           7.4  1984   \n",
429
+       "7705                 Alice in Wonderland         8.0           5.4  1933   \n",
430
+       "922                            The Abyss       822.0           7.1  1989   \n",
431
+       "974                               Aliens      3282.0           7.7  1986   \n",
432
+       "1621  Darby O'Gill and the Little People        35.0           6.7  1959   \n",
433
+       "1668          Return from Witch Mountain        38.0           5.6  1978   \n",
434
+       "2014                    Fantastic Planet       140.0           7.6  1973   \n",
435
+       "\n",
436
+       "         id       est  \n",
437
+       "522     280  3.943639  \n",
438
+       "2834    106  3.866272  \n",
439
+       "8401  54138  3.858491  \n",
440
+       "1011    218  3.856029  \n",
441
+       "7705  25694  3.701565  \n",
442
+       "922    2756  3.676465  \n",
443
+       "974     679  3.672303  \n",
444
+       "1621  18887  3.628234  \n",
445
+       "1668  14822  3.614118  \n",
446
+       "2014  16306  3.602051  "
447
+      ]
448
+     },
449
+     "execution_count": 9,
450
+     "metadata": {},
451
+     "output_type": "execute_result"
452
+    }
453
+   ],
454
+   "source": [
455
+    "hybrid(2, 'Avatar')"
456
+   ]
457
+  },
458
+  {
459
+   "cell_type": "code",
460
+   "execution_count": null,
461
+   "metadata": {},
462
+   "outputs": [],
463
+   "source": []
464
+  },
465
+  {
466
+   "cell_type": "code",
467
+   "execution_count": null,
468
+   "metadata": {
469
+    "collapsed": true
470
+   },
471
+   "outputs": [],
472
+   "source": []
473
+  }
474
+ ],
475
+ "metadata": {
476
+  "kernelspec": {
477
+   "display_name": "Python 3",
478
+   "language": "python",
479
+   "name": "python3"
480
+  },
481
+  "language_info": {
482
+   "codemirror_mode": {
483
+    "name": "ipython",
484
+    "version": 3
485
+   },
486
+   "file_extension": ".py",
487
+   "mimetype": "text/x-python",
488
+   "name": "python",
489
+   "nbconvert_exporter": "python",
490
+   "pygments_lexer": "ipython3",
491
+   "version": "3.6.0"
492
+  }
493
+ },
494
+ "nbformat": 4,
495
+ "nbformat_minor": 2
496
+}

+ 21 - 0
LICENSE

@@ -0,0 +1,21 @@
1
+MIT License
2
+
3
+Copyright (c) 2018 Packt
4
+
5
+Permission is hereby granted, free of charge, to any person obtaining a copy
6
+of this software and associated documentation files (the "Software"), to deal
7
+in the Software without restriction, including without limitation the rights
8
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+copies of the Software, and to permit persons to whom the Software is
10
+furnished to do so, subject to the following conditions:
11
+
12
+The above copyright notice and this permission notice shall be included in all
13
+copies or substantial portions of the Software.
14
+
15
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+SOFTWARE.

+ 73 - 0
README.md

@@ -0,0 +1,73 @@
1
+# Hands-On Recommendation Systems with Python
2
+
3
+<a href="https://www.packtpub.com/big-data-and-business-intelligence/hands-recommendation-systems-python?utm_source=github&utm_medium=repository&utm_campaign=9781788993753"><img src="https://d255esdrn735hr.cloudfront.net/sites/default/files/imagecache/ppv4_main_book_cover/B10302_cover_New.png" alt="Hands-On Recommendation Systems with Python" height="256px" align="right"></a>
4
+
5
+This is the code repository for [Hands-On Recommendation Systems with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-recommendation-systems-python?utm_source=github&utm_medium=repository&utm_campaign=9781788993753), published by Packt.
6
+
7
+**Start building powerful and personalized recommendation engines with Python**
8
+
9
+## What is this book about?
10
+First Paragraph from the Long Description
11
+
12
+This book covers the following exciting features:
13
+* The different kinds of recommender systems
14
+* Data wrangling techniques using the pandas library
15
+* Building an IMDB Top 250 Clone
16
+* Building a content-based engine to recommend movies based on movie metadata
17
+* Data mining techniques used in building recommenders
18
+
19
+If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1788993756) today!
20
+
21
+<a href="https://www.packtpub.com/?utm_source=github&utm_medium=banner&utm_campaign=GitHubBanner"><img src="https://raw.githubusercontent.com/PacktPublishing/GitHub/master/GitHub.png" 
22
+alt="https://www.packtpub.com/" border="5" /></a>
23
+
24
+
25
+## Instructions and Navigations
26
+All of the code is organized into folders, one per chapter. For example, Chapter2.
27
+
28
+The code will look like the following:
29
+```
30
+#Import SVD
31
+from surprise import SVD
32
+
33
+#Define the SVD algorithm object
34
+svd = SVD()
35
+
36
+#Evaluate the performance in terms of RMSE
37
+evaluate(svd, data, measures=['RMSE'])
38
+```
39
+
40
+**Following is what you need for this book:**
41
+If you are a Python developer and want to develop applications for social networking, news personalization or smart advertising, this is the book for you. Basic knowledge of machine learning techniques will be helpful, but not mandatory.
42
+
43
+With the following software and hardware list, you can run all of the code files present in the book (Chapters 1-7).
44
+
45
+### Software and Hardware List
46
+
47
+| Chapter  | Software required                   | OS required                        |
48
+| -------- | ------------------------------------| -----------------------------------|
49
+| 1        | Samba 4.x Server Software           | Windows                            |
50
+
51
+
52
+
53
+We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/HandsOnRecommendationSystemswithPython_ColorImages.pdf).
54
+
55
+## Code in Action
56
+
57
+Click on the following link to see the Code in Action:
58
+
59
+[http://bit.ly/2JV4oeu](http://bit.ly/2JV4oeu)
60
+
61
+### Related products
62
+* Statistics for Machine Learning [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/statistics-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781788295758) [[Amazon]](https://www.amazon.com/dp/1788295757)
63
+
64
+* Feature Engineering Made Easy [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/feature-engineering-made-easy?utm_source=github&utm_medium=repository&utm_campaign=9781787287600) [[Amazon]](https://www.amazon.com/dp/1787287602)
65
+
66
+## Get to Know the Author
67
+**Rounak Banik**
68
+Rounak Banik is a Young India Fellow and an ECE graduate from IIT Roorkee. He has worked as a software engineer at Parceed, a New York start-up, and Springboard, an EdTech start-up based in San Francisco and Bangalore. He has also served as a backend development instructor at Acadview, teaching Python and Django to around 35 college students from Delhi and Dehradun.
69
+
70
+He is an alumnus of Springboard's data science career track. He has given talks at the SciPy India Conference and published popular tutorials on Kaggle and DataCamp.
71
+
72
+### Suggestions and Feedback
73
+[Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.

File diff suppressed because it is too large
+ 4789 - 0
contentbase.ipynb


File diff suppressed because it is too large
+ 34807 - 0
mongoconnect.ipynb