Importing the libraries¶

In [1]:
import pandas as pd

Reading the dataset as a dataframe¶

In [8]:
# Reading dataframe
titles_df = pd.read_csv("titles.csv")
In [3]:
titles_df.head()
Out[3]:
id title type description release_year age_certification runtime genres production_countries seasons imdb_id imdb_score imdb_votes tmdb_popularity tmdb_score
0 ts300399 Five Came Back: The Reference Films SHOW This collection includes 12 World War II-era p... 1945 TV-MA 51 ['documentation'] ['US'] 1.0 NaN NaN NaN 0.600 NaN
1 tm84618 Taxi Driver MOVIE A mentally unstable Vietnam War veteran works ... 1976 R 114 ['drama', 'crime'] ['US'] NaN tt0075314 8.2 808582.0 40.965 8.179
2 tm154986 Deliverance MOVIE Intent on seeing the Cahulawassee River before... 1972 R 109 ['drama', 'action', 'thriller', 'european'] ['US'] NaN tt0068473 7.7 107673.0 10.010 7.300
3 tm127384 Monty Python and the Holy Grail MOVIE King Arthur, accompanied by his squire, recrui... 1975 PG 91 ['fantasy', 'action', 'comedy'] ['GB'] NaN tt0071853 8.2 534486.0 15.461 7.811
4 tm120801 The Dirty Dozen MOVIE 12 American military prisoners in World War II... 1967 NaN 150 ['war', 'action'] ['GB', 'US'] NaN tt0061578 7.7 72662.0 20.398 7.600

Discovering the dataset¶

In [4]:
# Show the number of movies and shows
print("Number of movies and shows: ", titles_df.size)

# Show the first year Netflix produced a movie or a show
print("The first time Netflix  produced a movie or a show was in: ", titles_df['release_year'][0])
Number of movies and shows:  87750
The first time Netflix  produced a movie or a show was in:  1945
In [5]:
titles_df.dtypes
Out[5]:
id                       object
title                    object
type                     object
description              object
release_year              int64
age_certification        object
runtime                   int64
genres                   object
production_countries     object
seasons                 float64
imdb_id                  object
imdb_score              float64
imdb_votes              float64
tmdb_popularity         float64
tmdb_score              float64
dtype: object
1. Drop records where the imdb_score column has missing values (NaN)¶
In [9]:
# Try your code here
titles_df.dropna(subset=['imdb_score'], inplace=True)
2. For each TV show or movie which has NaN value in the age certification column, replace it to be No certification¶
In [11]:
# Try your solution
titles_df['age_certification'].fillna('No certification', inplace=True)
3. For each TV show or movie which has NaN value in the seasons column, replace it to be the most occured value in the seasons¶
In [13]:
most_common_season = titles_df['seasons'].mode()[0]
In [ ]:
 
In [ ]:
 
In [14]:
# Try your solution
titles_df['seasons'].fillna(most_common_season, inplace=True)
4. Count the number of movies or TV shows for each age certification.¶
In [15]:
# Try your solution here
certification_counts = titles_df.groupby('age_certification').size().reset_index(name='count')
In [ ]:
 
In [ ]:
 
5. Count the number of movies and TV shows (seperately) produced in each release year.¶
In [16]:
# Try your solution here
count_by_release_year = titles_df.groupby(['release_year', 'type']).size().reset_index(name='count')
6. Calculate the average runtime and imdb score of movies and TV shows for each release year.¶
In [17]:
average_duration_imdb_score = titles_df.groupby('release_year')[['runtime', 'imdb_score']].mean().reset_index()
In [28]:
average_duration_imdb_score
Out[28]:
release_year runtime imdb_score
0 1954 107.500000 7.450000
1 1956 120.000000 6.700000
2 1958 77.000000 7.500000
3 1959 142.000000 6.600000
4 1960 158.000000 6.400000
... ... ... ...
57 2018 75.150273 6.502186
58 2019 75.697861 6.518316
59 2020 74.625551 6.316300
60 2021 74.990251 6.302089
61 2022 73.220238 6.194345

62 rows × 3 columns

7. Count the number of movies and TV shows for each genre.¶
In [34]:
titles_df['genres']
Out[34]:
1                                ['drama', 'crime']
2       ['drama', 'action', 'thriller', 'european']
3                   ['fantasy', 'action', 'comedy']
4                                 ['war', 'action']
5                            ['comedy', 'european']
                           ...                     
5843                 ['romance', 'comedy', 'drama']
5845                           ['romance', 'drama']
5846                                      ['drama']
5847                                     ['comedy']
5849              ['family', 'animation', 'comedy']
Name: genres, Length: 5368, dtype: object
In [22]:
genre_counts = titles_df.groupby('genres').size().reset_index(name='count')
In [23]:
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genre_counts = exploded_df.groupby('genres').size().reset_index(name='count')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
8. Calculate the standard deviation of movies and TV shows imdb ratings for each release year.¶
In [24]:
imdb_score_std = titles_df.groupby('release_year')['imdb_score'].std().reset_index(name='imdb_score')
9. Calculate the maximum TMDB popularity and minimum IMDb score for each production country¶
In [27]:
new_titles_df = titles_df.copy()
import ast
new_titles_df['production_countries'] = new_titles_df['production_countries'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('production_countries')
TMDB_popularity = exploded_df.groupby("production_countries").agg({"tmdb_popularity": "max", "imdb_score": "min"}).reset_index()
In [ ]:
 
In [ ]:
 
10. Calculate the sum of IMDb votes for each genre and find the average TMDB score¶
In [29]:
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genres_votes_scores = exploded_df.groupby("genres").agg({"imdb_votes": "sum", "tmdb_score": "mean"}).reset_index()
11. Calculate the average rating deviation from the mean for each genre (use custom defined function)¶
In [32]:
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genre_avg_deviation = exploded_df.groupby('genres').apply(lambda x: (x['imdb_score'] - x['imdb_score'].mean()).mean()).reset_index(name="average_deviation")

def rating_deviation(row):
    mean_rating = row['imdb_score'].mean()  # Calculate the mean rating
    deviation = row['imdb_score'] - mean_rating  # Calculate the deviation from the mean
    return deviation

genre_avg_deviation = exploded_df.groupby('genres').apply(lambda x: rating_deviation(x).mean()).reset_index(name="average_deviation")
12. Calculate the standardized score for TMDB popularity for each movie or TV show within its respective genre (use custom defined function)¶
In [35]:
# Try your solution here
def standardize_score(x):
    return (x - x.mean()) / x.std()
titles_df['standardized_tmdb_popularity'] = titles_df.groupby('genres')['tmdb_popularity'].transform(standardize_score)
13. Find the minimum and maximum release year for each type (movie or TV show):¶
In [45]:
min_max_year = titles_df.groupby('type').agg({'release_year': ['min', 'max']}).reset_index()
min_max_year = titles_df.groupby('type').agg(min_release_year=('release_year', 'min'), 
                                              max_release_year=('release_year', 'max')).reset_index()
min_max_year = titles_df.groupby('type')['release_year'].agg(['min', 'max'])
In [46]:
min_max_year
Out[46]:
min max
type
MOVIE 1954 2022
SHOW 1969 2022
14. Calculate the average IMDb score and the max TMDB score for each genre and release year combination¶
In [48]:
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
In [53]:
genre_year_scores = exploded_df.groupby(['genres', 'release_year']).agg({'imdb_score': 'mean', 'tmdb_score': 'max'})
In [54]:
genre_year_scores
Out[54]:
imdb_score tmdb_score
genres release_year
action 1956 6.700 5.9
1961 7.500 7.3
1963 7.600 7.1
1966 7.300 7.1
1967 7.700 7.6
... ... ... ...
western 2018 5.350 7.3
2019 5.800 6.5
2020 6.200 8.7
2021 5.925 6.9
2022 6.100 6.8

666 rows × 2 columns

15. Calculate the average length of titles (number of characters) for each genre (use custom defined function)¶¶
In [ ]:
genre_average_length = ...
16. Find the count and average IMDb score for each age certification category (use custom defined function)¶
In [ ]:
certification_stats = ...

The End!¶