import pandas as pd
# Reading dataframe
titles_df = pd.read_csv("titles.csv")
titles_df.head()
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV-MA | 51 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | R | 114 | ['drama', 'crime'] | ['US'] | NaN | tt0075314 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | R | 109 | ['drama', 'action', 'thriller', 'european'] | ['US'] | NaN | tt0068473 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | PG | 91 | ['fantasy', 'action', 'comedy'] | ['GB'] | NaN | tt0071853 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | NaN | 150 | ['war', 'action'] | ['GB', 'US'] | NaN | tt0061578 | 7.7 | 72662.0 | 20.398 | 7.600 |
# Show the number of movies and shows
print("Number of movies and shows: ", titles_df.size)
# Show the first year Netflix produced a movie or a show
print("The first time Netflix produced a movie or a show was in: ", titles_df['release_year'][0])
Number of movies and shows: 87750 The first time Netflix produced a movie or a show was in: 1945
titles_df.dtypes
id object title object type object description object release_year int64 age_certification object runtime int64 genres object production_countries object seasons float64 imdb_id object imdb_score float64 imdb_votes float64 tmdb_popularity float64 tmdb_score float64 dtype: object
# Try your code here
titles_df.dropna(subset=['imdb_score'], inplace=True)
No certification¶# Try your solution
titles_df['age_certification'].fillna('No certification', inplace=True)
most_common_season = titles_df['seasons'].mode()[0]
# Try your solution
titles_df['seasons'].fillna(most_common_season, inplace=True)
# Try your solution here
certification_counts = titles_df.groupby('age_certification').size().reset_index(name='count')
# Try your solution here
count_by_release_year = titles_df.groupby(['release_year', 'type']).size().reset_index(name='count')
average_duration_imdb_score = titles_df.groupby('release_year')[['runtime', 'imdb_score']].mean().reset_index()
average_duration_imdb_score
| release_year | runtime | imdb_score | |
|---|---|---|---|
| 0 | 1954 | 107.500000 | 7.450000 |
| 1 | 1956 | 120.000000 | 6.700000 |
| 2 | 1958 | 77.000000 | 7.500000 |
| 3 | 1959 | 142.000000 | 6.600000 |
| 4 | 1960 | 158.000000 | 6.400000 |
| ... | ... | ... | ... |
| 57 | 2018 | 75.150273 | 6.502186 |
| 58 | 2019 | 75.697861 | 6.518316 |
| 59 | 2020 | 74.625551 | 6.316300 |
| 60 | 2021 | 74.990251 | 6.302089 |
| 61 | 2022 | 73.220238 | 6.194345 |
62 rows × 3 columns
titles_df['genres']
1 ['drama', 'crime']
2 ['drama', 'action', 'thriller', 'european']
3 ['fantasy', 'action', 'comedy']
4 ['war', 'action']
5 ['comedy', 'european']
...
5843 ['romance', 'comedy', 'drama']
5845 ['romance', 'drama']
5846 ['drama']
5847 ['comedy']
5849 ['family', 'animation', 'comedy']
Name: genres, Length: 5368, dtype: object
genre_counts = titles_df.groupby('genres').size().reset_index(name='count')
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genre_counts = exploded_df.groupby('genres').size().reset_index(name='count')
imdb_score_std = titles_df.groupby('release_year')['imdb_score'].std().reset_index(name='imdb_score')
new_titles_df = titles_df.copy()
import ast
new_titles_df['production_countries'] = new_titles_df['production_countries'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('production_countries')
TMDB_popularity = exploded_df.groupby("production_countries").agg({"tmdb_popularity": "max", "imdb_score": "min"}).reset_index()
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genres_votes_scores = exploded_df.groupby("genres").agg({"imdb_votes": "sum", "tmdb_score": "mean"}).reset_index()
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genre_avg_deviation = exploded_df.groupby('genres').apply(lambda x: (x['imdb_score'] - x['imdb_score'].mean()).mean()).reset_index(name="average_deviation")
def rating_deviation(row):
mean_rating = row['imdb_score'].mean() # Calculate the mean rating
deviation = row['imdb_score'] - mean_rating # Calculate the deviation from the mean
return deviation
genre_avg_deviation = exploded_df.groupby('genres').apply(lambda x: rating_deviation(x).mean()).reset_index(name="average_deviation")
# Try your solution here
def standardize_score(x):
return (x - x.mean()) / x.std()
titles_df['standardized_tmdb_popularity'] = titles_df.groupby('genres')['tmdb_popularity'].transform(standardize_score)
min_max_year = titles_df.groupby('type').agg({'release_year': ['min', 'max']}).reset_index()
min_max_year = titles_df.groupby('type').agg(min_release_year=('release_year', 'min'),
max_release_year=('release_year', 'max')).reset_index()
min_max_year = titles_df.groupby('type')['release_year'].agg(['min', 'max'])
min_max_year
| min | max | |
|---|---|---|
| type | ||
| MOVIE | 1954 | 2022 |
| SHOW | 1969 | 2022 |
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')
genre_year_scores = exploded_df.groupby(['genres', 'release_year']).agg({'imdb_score': 'mean', 'tmdb_score': 'max'})
genre_year_scores
| imdb_score | tmdb_score | ||
|---|---|---|---|
| genres | release_year | ||
| action | 1956 | 6.700 | 5.9 |
| 1961 | 7.500 | 7.3 | |
| 1963 | 7.600 | 7.1 | |
| 1966 | 7.300 | 7.1 | |
| 1967 | 7.700 | 7.6 | |
| ... | ... | ... | ... |
| western | 2018 | 5.350 | 7.3 |
| 2019 | 5.800 | 6.5 | |
| 2020 | 6.200 | 8.7 | |
| 2021 | 5.925 | 6.9 | |
| 2022 | 6.100 | 6.8 |
666 rows × 2 columns
genre_average_length = ...
certification_stats = ...