tum
/
rec-code


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842
							{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/2038691245.py:4: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv('../data/movies_metadata.csv')\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
       "       'imdb_id', 'original_language', 'original_title', 'overview',\n",
       "       'popularity', 'poster_path', 'production_companies',\n",
       "       'production_countries', 'release_date', 'revenue', 'runtime',\n",
       "       'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
       "       'vote_average', 'vote_count'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df = pd.read_csv('../data/movies_metadata.csv')\n",
    "\n",
    "#Print all the features (or columns) of the DataFrame\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>release_date</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
       "      <td>1995-10-30</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
       "      <td>1995-12-15</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
       "      <td>1995-12-22</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
       "      <td>1995-12-22</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
       "      <td>1995-02-10</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title  \\\n",
       "0                    Toy Story   \n",
       "1                      Jumanji   \n",
       "2             Grumpier Old Men   \n",
       "3            Waiting to Exhale   \n",
       "4  Father of the Bride Part II   \n",
       "\n",
       "                                              genres release_date  runtime  \\\n",
       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   1995-10-30     81.0   \n",
       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   1995-12-15    104.0   \n",
       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   1995-12-22    101.0   \n",
       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   1995-12-22    127.0   \n",
       "4                     [{'id': 35, 'name': 'Comedy'}]   1995-02-10    106.0   \n",
       "\n",
       "   vote_average  vote_count  \n",
       "0           7.7      5415.0  \n",
       "1           6.9      2413.0  \n",
       "2           6.5        92.0  \n",
       "3           6.1        34.0  \n",
       "4           5.7       173.0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Only keep those features that we require \n",
    "df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Convert release_date into pandas datetime format\n",
    "df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
    "\n",
    "#Extract year from the datetime\n",
    "df['year'] = df['release_date'].apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Helper function to convert NaT to 0 and all other years to integers.\n",
    "def convert_int(x):\n",
    "    try:\n",
    "        return int(x)\n",
    "    except:\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Apply convert_int to the year feature\n",
    "df['year'] = df['year'].apply(convert_int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title  \\\n",
       "0                    Toy Story   \n",
       "1                      Jumanji   \n",
       "2             Grumpier Old Men   \n",
       "3            Waiting to Exhale   \n",
       "4  Father of the Bride Part II   \n",
       "\n",
       "                                              genres  runtime  vote_average  \\\n",
       "0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...     81.0           7.7   \n",
       "1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...    104.0           6.9   \n",
       "2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...    101.0           6.5   \n",
       "3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...    127.0           6.1   \n",
       "4                     [{'id': 35, 'name': 'Comedy'}]    106.0           5.7   \n",
       "\n",
       "   vote_count  year  \n",
       "0      5415.0  1995  \n",
       "1      2413.0  1995  \n",
       "2        92.0  1995  \n",
       "3        34.0  1995  \n",
       "4       173.0  1995  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Drop the release_date column\n",
    "df = df.drop('release_date', axis=1)\n",
    "\n",
    "#Display the dataframe\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]\""
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Print genres of the first movie\n",
    "df.iloc[0]['genres']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'str'>\n",
      "<class 'list'>\n"
     ]
    }
   ],
   "source": [
    "#Import the literal_eval function from ast\n",
    "from ast import literal_eval\n",
    "\n",
    "#Define a stringified list and output its type\n",
    "a = \"[1,2,3]\"\n",
    "print(type(a))\n",
    "\n",
    "#Apply literal_eval and output type\n",
    "b = literal_eval(a)\n",
    "print(type(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Convert all NaN into stringified empty lists\n",
    "df['genres'] = df['genres'].fillna('[]')\n",
    "\n",
    "#Apply literal_eval to convert stringified empty lists to the list object\n",
    "df['genres'] = df['genres'].apply(literal_eval)\n",
    "\n",
    "#Convert list of dictionaries to a list of strings\n",
    "df['genres'] = df['genres'].apply(lambda x: [i['name'].lower() for i in x] if isinstance(x, list) else [])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>[animation, comedy, family]</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>[adventure, fantasy, family]</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Grumpier Old Men</td>\n",
       "      <td>[romance, comedy]</td>\n",
       "      <td>101.0</td>\n",
       "      <td>6.5</td>\n",
       "      <td>92.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Waiting to Exhale</td>\n",
       "      <td>[comedy, drama, romance]</td>\n",
       "      <td>127.0</td>\n",
       "      <td>6.1</td>\n",
       "      <td>34.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Father of the Bride Part II</td>\n",
       "      <td>[comedy]</td>\n",
       "      <td>106.0</td>\n",
       "      <td>5.7</td>\n",
       "      <td>173.0</td>\n",
       "      <td>1995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title                        genres  runtime  \\\n",
       "0                    Toy Story   [animation, comedy, family]     81.0   \n",
       "1                      Jumanji  [adventure, fantasy, family]    104.0   \n",
       "2             Grumpier Old Men             [romance, comedy]    101.0   \n",
       "3            Waiting to Exhale      [comedy, drama, romance]    127.0   \n",
       "4  Father of the Bride Part II                      [comedy]    106.0   \n",
       "\n",
       "   vote_average  vote_count  year  \n",
       "0           7.7      5415.0  1995  \n",
       "1           6.9      2413.0  1995  \n",
       "2           6.5        92.0  1995  \n",
       "3           6.1        34.0  1995  \n",
       "4           5.7       173.0  1995  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/dv/107570dd01b_m3wvmvv0g9lm0000gn/T/ipykernel_4569/328443552.py:2: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.\n",
      "  s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "      <th>genre</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>animation</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>comedy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Toy Story</td>\n",
       "      <td>81.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>5415.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>family</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>adventure</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jumanji</td>\n",
       "      <td>104.0</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2413.0</td>\n",
       "      <td>1995</td>\n",
       "      <td>fantasy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       title  runtime  vote_average  vote_count  year      genre\n",
       "0  Toy Story     81.0           7.7      5415.0  1995  animation\n",
       "0  Toy Story     81.0           7.7      5415.0  1995     comedy\n",
       "0  Toy Story     81.0           7.7      5415.0  1995     family\n",
       "1    Jumanji    104.0           6.9      2413.0  1995  adventure\n",
       "1    Jumanji    104.0           6.9      2413.0  1995    fantasy"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Create a new feature by exploding genres\n",
    "s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)\n",
    "\n",
    "#Name the new feature as 'genre'\n",
    "s.name = 'genre'\n",
    "\n",
    "#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.\n",
    "gen_df = df.drop('genres', axis=1).join(s)\n",
    "\n",
    "#Print the head of the new gen_df\n",
    "gen_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_chart(gen_df, percentile=0.8):\n",
    "    #Ask for preferred genres\n",
    "    print(\"Input preferred genre\")\n",
    "    genre = input()\n",
    "    \n",
    "    #Ask for lower limit of duration\n",
    "    print(\"Input shortest duration\")\n",
    "    low_time = int(input())\n",
    "    \n",
    "    #Ask for upper limit of duration\n",
    "    print(\"Input longest duration\")\n",
    "    high_time = int(input())\n",
    "    \n",
    "    #Ask for lower limit of timeline\n",
    "    print(\"Input earliest year\")\n",
    "    low_year = int(input())\n",
    "    \n",
    "    #Ask for upper limit of timeline\n",
    "    print(\"Input latest year\")\n",
    "    high_year = int(input())\n",
    "    \n",
    "    #Define a new movies variable to store the preferred movies. Copy the contents of gen_df to movies\n",
    "    movies = gen_df.copy()\n",
    "    \n",
    "    #Filter based on the condition\n",
    "    movies = movies[(movies['genre'] == genre) & \n",
    "                    (movies['runtime'] >= low_time) & \n",
    "                    (movies['runtime'] <= high_time) & \n",
    "                    (movies['year'] >= low_year) & \n",
    "                    (movies['year'] <= high_year)]\n",
    "    \n",
    "    #Compute the values of C and m for the filtered movies\n",
    "    C = movies['vote_average'].mean()\n",
    "    m = movies['vote_count'].quantile(percentile)\n",
    "    \n",
    "    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies\n",
    "    q_movies = movies.copy().loc[movies['vote_count'] >= m]\n",
    "    \n",
    "    #Calculate score using the IMDB formula\n",
    "    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) \n",
    "                                       + (m/(m+x['vote_count']) * C)\n",
    "                                       ,axis=1)\n",
    "\n",
    "    #Sort movies in descending order of their scores\n",
    "    q_movies = q_movies.sort_values('score', ascending=False)\n",
    "    \n",
    "    return q_movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input preferred genre\n",
      "horror\n",
      "Input shortest duration\n",
      "60\n",
      "Input longest duration\n",
      "120\n",
      "Input earliest year\n",
      "1990\n",
      "Input latest year\n",
      "2022\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>runtime</th>\n",
       "      <th>vote_average</th>\n",
       "      <th>vote_count</th>\n",
       "      <th>year</th>\n",
       "      <th>genre</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>39821</th>\n",
       "      <td>Train to Busan</td>\n",
       "      <td>118.0</td>\n",
       "      <td>7.7</td>\n",
       "      <td>984.0</td>\n",
       "      <td>2016</td>\n",
       "      <td>horror</td>\n",
       "      <td>7.424441</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8147</th>\n",
       "      <td>Shaun of the Dead</td>\n",
       "      <td>99.0</td>\n",
       "      <td>7.5</td>\n",
       "      <td>2479.0</td>\n",
       "      <td>2004</td>\n",
       "      <td>horror</td>\n",
       "      <td>7.392081</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21276</th>\n",
       "      <td>The Conjuring</td>\n",
       "      <td>112.0</td>\n",
       "      <td>7.4</td>\n",
       "      <td>3169.0</td>\n",
       "      <td>2013</td>\n",
       "      <td>horror</td>\n",
       "      <td>7.318185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4591</th>\n",
       "      <td>The Others</td>\n",
       "      <td>101.0</td>\n",
       "      <td>7.4</td>\n",
       "      <td>1708.0</td>\n",
       "      <td>2001</td>\n",
       "      <td>horror</td>\n",
       "      <td>7.252502</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12891</th>\n",
       "      <td>Let the Right One In</td>\n",
       "      <td>115.0</td>\n",
       "      <td>7.5</td>\n",
       "      <td>997.0</td>\n",
       "      <td>2008</td>\n",
       "      <td>horror</td>\n",
       "      <td>7.247838</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      title  runtime  vote_average  vote_count  year   genre  \\\n",
       "39821        Train to Busan    118.0           7.7       984.0  2016  horror   \n",
       "8147      Shaun of the Dead     99.0           7.5      2479.0  2004  horror   \n",
       "21276         The Conjuring    112.0           7.4      3169.0  2013  horror   \n",
       "4591             The Others    101.0           7.4      1708.0  2001  horror   \n",
       "12891  Let the Right One In    115.0           7.5       997.0  2008  horror   \n",
       "\n",
       "          score  \n",
       "39821  7.424441  \n",
       "8147   7.392081  \n",
       "21276  7.318185  \n",
       "4591   7.252502  \n",
       "12891  7.247838  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Generate the chart for top animation movies and display top 5.\n",
    "build_chart(gen_df).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Convert the cleaned (non-exploded) dataframe df into a CSV file and save it in the data folder\n",
    "#Set parameter index to False as the index of the DataFrame has no inherent meaning.\n",
    "df.to_csv('../data/metadata_clean.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}