import pandas as pd
# Load the titles dataset from the CSV file "titles.csv"
titles_df = pd.read_csv(filepath_or_buffer="titles.csv")
# Preview the first rows of the loaded table
titles_df.head()
|  | id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV-MA | 51 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | R | 114 | ['drama', 'crime'] | ['US'] | NaN | tt0075314 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | R | 109 | ['drama', 'action', 'thriller', 'european'] | ['US'] | NaN | tt0068473 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | PG | 91 | ['fantasy', 'action', 'comedy'] | ['GB'] | NaN | tt0071853 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | NaN | 150 | ['war', 'action'] | ['GB', 'US'] | NaN | tt0061578 | 7.7 | 72662.0 | 20.398 | 7.600 |
# Show the number of movies and shows.
# Each row is one title, so count rows: DataFrame.size would instead return
# rows * columns (the total number of cells).
print("Number of movies and shows: ", len(titles_df))
# Show the first year Netflix produced a movie or a show.
# Take the minimum release year rather than the value in the first row, so the
# answer does not depend on how the rows happen to be ordered in the file.
print("The first time Netflix produced a movie or a show was in: ", titles_df['release_year'].min())
Number of movies and shows: 87750 The first time Netflix produced a movie or a show was in: 1945
# Inspect the data type of each column of titles_df
titles_df.dtypes
id                       object
title                    object
type                     object
description              object
release_year              int64
age_certification        object
runtime                   int64
genres                   object
production_countries     object
seasons                 float64
imdb_id                  object
imdb_score              float64
imdb_votes              float64
tmdb_popularity         float64
tmdb_score              float64
dtype: object
# Keep only the titles that have an IMDb score
titles_df = titles_df[titles_df['imdb_score'].notna()]
# No certification
# Try your solution
# Give titles without an age certification an explicit label
certification = titles_df['age_certification']
titles_df['age_certification'] = certification.fillna(value='No certification')
# Replace missing season counts with the most frequent season count
most_common_seasons = titles_df['seasons'].mode()[0]
titles_df['seasons'] = titles_df['seasons'].fillna(value=most_common_seasons)
# Number of titles per age certification: one row per certification, with the
# group size in the "count" column.
certification_counts = (
    titles_df.groupby("age_certification")
    .size()
    .reset_index(name="count")
)
# Number of titles for every (release year, type) combination
count_by_release_year = (
    titles_df
    .groupby(by=["release_year", "type"])
    .size()
    .reset_index(name="count")
)
# Mean runtime and mean IMDb score of the titles released in each year
average_duration_imdb_score = (
    titles_df
    .groupby(by='release_year')[['runtime', 'imdb_score']]
    .mean()
    .reset_index()
)
# Imported here because this is the first use of `ast` in the file; the other
# `import ast` statements only appear further down.
import ast

# Work on a copy so titles_df keeps the raw `genres` text
new_titles_df = titles_df.copy()
# `genres` holds the text of a Python list (e.g. "['drama', 'crime']");
# parse it into a real list so it can be exploded
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
# One row per (title, genre) pair
exploded_df = new_titles_df.explode('genres')
# Number of titles per genre, with the group size in the "count" column
genre_counts = exploded_df.groupby('genres').size().reset_index(name='count')
# Standard deviation of the IMDb score among the titles of each release year
imdb_score_std = (
    titles_df
    .groupby(by="release_year")["imdb_score"]
    .std()
    .reset_index()
)
import ast

# Build a copy with one row per (title, production country) pair
new_titles_df = titles_df.copy()
parsed_countries = new_titles_df['production_countries'].apply(ast.literal_eval)
new_titles_df['production_countries'] = parsed_countries
exploded_df = new_titles_df.explode('production_countries')

# Per production country: highest TMDB popularity and lowest IMDb score
country_groups = exploded_df.groupby("production_countries")
TMDB_popularity = country_groups.agg(
    {"tmdb_popularity": "max", "imdb_score": "min"}
).reset_index()
import ast

# Build a copy with one row per (title, genre) pair
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')

# Per genre: total number of IMDb votes and mean TMDB score
genre_aggregations = {'imdb_votes': 'sum', 'tmdb_score': 'mean'}
genres_votes_scores = exploded_df.groupby('genres').agg(genre_aggregations).reset_index()
def rating_deviation(row: pd.DataFrame) -> pd.Series:
    """Return each IMDb score in `row` minus the mean IMDb score of `row`.

    Despite the parameter name, `row` is treated as a table of titles: its
    'imdb_score' column is averaged and that mean is subtracted from every
    score in the same column.

    Args:
        row: Table with an 'imdb_score' column.

    Returns:
        The per-title deviation from the mean score of `row`, in the same
        order and with the same index as `row['imdb_score']`.
    """
    scores = row['imdb_score']
    return scores - scores.mean()
import ast

# Build a copy with one row per (title, genre) pair
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)
exploded_df = new_titles_df.explode('genres')


def _mean_rating_deviation(genre_rows):
    """Average the per-title rating deviations of one genre group."""
    return rating_deviation(genre_rows).mean()


# NOTE(review): rating_deviation measures each score against the mean of the
# group it is given, so averaging those deviations per genre is expected to be
# ~0 for every genre -- confirm this is the intended metric.
genre_avg_deviation = (
    exploded_df.groupby('genres')
    .apply(_mean_rating_deviation)
    .reset_index(name="average_deviation")
)
def standardize_score(x: pd.Series) -> pd.Series:
    """Return the z-score of every value in `x`.

    Each value has the mean of `x` subtracted and is then divided by the
    standard deviation of `x`, as computed by `x.std()`.

    Args:
        x: Numeric values to standardize.

    Returns:
        The standardized values, aligned with `x`. There is no guard for a
        zero or undefined standard deviation; the division result is returned
        as-is in that case.
    """
    return (x - x.mean()) / x.std()
# Standardize TMDB popularity within each `genres` group.
# NOTE(review): titles_df['genres'] is the unparsed column here (only copies of
# the frame are parsed with ast.literal_eval), so each group is one exact
# genre-list value rather than one individual genre -- confirm this is intended.
popularity_by_genres = titles_df.groupby('genres')['tmdb_popularity']
titles_df['standardized_tmdb_popularity'] = popularity_by_genres.transform(standardize_score)
# Earliest and latest release year for each title type
min_max_year = (
    titles_df
    .groupby(by='type')['release_year']
    .agg(['min', 'max'])
)
# Fresh working copy of titles_df
new_titles_df = titles_df.copy()
# Placeholders: both names are bound to the Ellipsis literal, not to a result.
# TODO: presumably these are unfinished exercises to be replaced with the
# actual computations -- confirm the intended definitions.
genre_average_length = ...
certification_stats = ...