{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "data": { "text/plain": [ "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n", " 'imdb_id', 'original_language', 'original_title', 'overview',\n", " 'popularity', 'poster_path', 'production_companies',\n", " 'production_countries', 'release_date', 'revenue', 'runtime',\n", " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n", " 'vote_average', 'vote_count'],\n", " dtype='object')" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "df = pd.read_csv('../data/movies_metadata.csv')\n", "\n", "#Print all the features (or columns) of the DataFrame\n", "df.columns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenresrelease_dateruntimevote_averagevote_count
0Toy Story[{'id': 16, 'name': 'Animation'}, {'id': 35, '...1995-10-3081.07.75415.0
1Jumanji[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...1995-12-15104.06.92413.0
2Grumpier Old Men[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...1995-12-22101.06.592.0
3Waiting to Exhale[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...1995-12-22127.06.134.0
4Father of the Bride Part II[{'id': 35, 'name': 'Comedy'}]1995-02-10106.05.7173.0
\n", "
" ], "text/plain": [ " title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " genres release_date runtime \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1995-10-30 81.0 \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 1995-12-15 104.0 \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 1995-12-22 101.0 \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 1995-12-22 127.0 \n", "4 [{'id': 35, 'name': 'Comedy'}] 1995-02-10 106.0 \n", "\n", " vote_average vote_count \n", "0 7.7 5415.0 \n", "1 6.9 2413.0 \n", "2 6.5 92.0 \n", "3 6.1 34.0 \n", "4 5.7 173.0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Only keep those features that we require \n", "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Convert release_date into pandas datetime format\n", "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n", "\n", "#Extract year from the datetime\n", "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Helper function to convert NaT to 0 and all other years to integers.\n", "def convert_int(x):\n", " try:\n", " return int(x)\n", " except:\n", " return 0" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Apply convert_int to the year feature\n", "df['year'] = df['year'].apply(convert_int)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenresruntimevote_averagevote_countyear
0Toy Story[{'id': 16, 'name': 'Animation'}, {'id': 35, '...81.07.75415.01995
1Jumanji[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...104.06.92413.01995
2Grumpier Old Men[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...101.06.592.01995
3Waiting to Exhale[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...127.06.134.01995
4Father of the Bride Part II[{'id': 35, 'name': 'Comedy'}]106.05.7173.01995
\n", "
" ], "text/plain": [ " title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " genres runtime vote_average \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 81.0 7.7 \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 104.0 6.9 \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 101.0 6.5 \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 127.0 6.1 \n", "4 [{'id': 35, 'name': 'Comedy'}] 106.0 5.7 \n", "\n", " vote_count year \n", "0 5415.0 1995 \n", "1 2413.0 1995 \n", "2 92.0 1995 \n", "3 34.0 1995 \n", "4 173.0 1995 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Drop the release_date column\n", "df = df.drop('release_date', axis=1)\n", "\n", "#Display the dataframe\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\"" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Print genres of the first movie\n", "df.iloc[0]['genres']" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] } ], "source": [ "#Import the literal_eval function from ast\n", "from ast import literal_eval\n", "\n", "#Define a stringified list and output its type\n", "a = \"[1,2,3]\"\n", "print(type(a))\n", "\n", "#Apply literal_eval and output type\n", "b = literal_eval(a)\n", "print(type(b))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Convert all NaN into stringified empty lists\n", "df['genres'] = df['genres'].fillna('[]')\n", "\n", "#Apply literal_eval to convert stringified empty lists to the list object\n", "df['genres'] = df['genres'].apply(literal_eval)\n", "\n", "#Convert list of dictionaries to a list of strings\n", "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenresruntimevote_averagevote_countyear
0Toy Story[animation, comedy, family]81.07.75415.01995
1Jumanji[adventure, fantasy, family]104.06.92413.01995
2Grumpier Old Men[romance, comedy]101.06.592.01995
3Waiting to Exhale[comedy, drama, romance]127.06.134.01995
4Father of the Bride Part II[comedy]106.05.7173.01995
\n", "
" ], "text/plain": [ " title genres runtime \\\n", "0 Toy Story [animation, comedy, family] 81.0 \n", "1 Jumanji [adventure, fantasy, family] 104.0 \n", "2 Grumpier Old Men [romance, comedy] 101.0 \n", "3 Waiting to Exhale [comedy, drama, romance] 127.0 \n", "4 Father of the Bride Part II [comedy] 106.0 \n", "\n", " vote_average vote_count year \n", "0 7.7 5415.0 1995 \n", "1 6.9 2413.0 1995 \n", "2 6.5 92.0 1995 \n", "3 6.1 34.0 1995 \n", "4 5.7 173.0 1995 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleruntimevote_averagevote_countyeargenre
0Toy Story81.07.75415.01995animation
0Toy Story81.07.75415.01995comedy
0Toy Story81.07.75415.01995family
1Jumanji104.06.92413.01995adventure
1Jumanji104.06.92413.01995fantasy
\n", "
" ], "text/plain": [ " title runtime vote_average vote_count year genre\n", "0 Toy Story 81.0 7.7 5415.0 1995 animation\n", "0 Toy Story 81.0 7.7 5415.0 1995 comedy\n", "0 Toy Story 81.0 7.7 5415.0 1995 family\n", "1 Jumanji 104.0 6.9 2413.0 1995 adventure\n", "1 Jumanji 104.0 6.9 2413.0 1995 fantasy" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Create a new feature by exploding genres\n", "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n", "\n", "#Name the new feature as 'genre'\n", "s.name = 'genre'\n", "\n", "#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.\n", "gen_df = df.drop('genres', axis=1).join(s)\n", "\n", "#Print the head of the new gen_df\n", "gen_df.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def build_chart(gen_df, percentile=0.8):\n", " #Ask for preferred genres\n", " print(\"Input preferred genre\")\n", " genre = input()\n", " \n", " #Ask for lower limit of duration\n", " print(\"Input shortest duration\")\n", " low_time = int(input())\n", " \n", " #Ask for upper limit of duration\n", " print(\"Input longest duration\")\n", " high_time = int(input())\n", " \n", " #Ask for lower limit of timeline\n", " print(\"Input earliest year\")\n", " low_year = int(input())\n", " \n", " #Ask for upper limit of timeline\n", " print(\"Input latest year\")\n", " high_year = int(input())\n", " \n", " #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n", " movies = gen_df.copy()\n", " \n", " #Filter based on the condition\n", " movies = movies[(movies['genre'] == genre) & \n", " (movies['runtime'] >= low_time) & \n", " (movies['runtime'] <= high_time) & \n", " (movies['year'] >= low_year) & \n", " (movies['year'] <= high_year)]\n", " \n", " #Compute the values of C and m for the filtered movies\n", " C = movies['vote_average'].mean()\n", " m = movies['vote_count'].quantile(percentile)\n", " \n", " #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies\n", " q_movies = movies.copy().loc[movies['vote_count'] >= m]\n", " \n", " #Calculate score using the IMDB formula\n", " q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n", " + (m/(m+x['vote_count']) * C)\n", " ,axis=1)\n", "\n", " #Sort movies in descending order of their scores\n", " q_movies = q_movies.sort_values('score', ascending=False)\n", " \n", " return q_movies" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Input preferred genre\n", "action\n", "Input shortest duration\n", "80\n", "Input longest duration\n", "120\n", "Input earliest year\n", "1990\n", "Input latest year\n", "2000\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleruntimevote_averagevote_countyeargenrescore
723Ghost in the Shell83.07.8854.01995action7.521643
550True Romance120.07.5762.01993action7.231980
3902O Brother, Where Art Thou?106.07.31144.02000action7.131617
348The Crow102.07.3980.01994action7.106412
3871Crouching Tiger, Hidden Dragon120.07.2949.02000action7.011634
\n", "
" ], "text/plain": [ " title runtime vote_average vote_count year \\\n", "723 Ghost in the Shell 83.0 7.8 854.0 1995 \n", "550 True Romance 120.0 7.5 762.0 1993 \n", "3902 O Brother, Where Art Thou? 106.0 7.3 1144.0 2000 \n", "348 The Crow 102.0 7.3 980.0 1994 \n", "3871 Crouching Tiger, Hidden Dragon 120.0 7.2 949.0 2000 \n", "\n", " genre score \n", "723 action 7.521643 \n", "550 action 7.231980 \n", "3902 action 7.131617 \n", "348 action 7.106412 \n", "3871 action 7.011634 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Generate the chart for top animation movies and display top 5.\n", "build_chart(gen_df).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }