# Importing necessary libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Loading the dataset
data = pd.read_csv('flavors_of_cocoa.csv')

# Preview the first rows of the loaded table
data.head()

# Print a summary of the columns: names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Company 
(Maker-if known)         1795 non-null   object 
 1   Specific Bean Origin
or Bar Name  1795 non-null   object 
 2   REF                               1795 non-null   int64  
 3   Review
Date                       1795 non-null   int64  
 4   Cocoa
Percent                     1795 non-null   object 
 5   Company
Location                  1795 non-null   object 
 6   Rating                            1795 non-null   float64
 7   Bean
Type                         1794 non-null   object 
 8   Broad Bean
Origin                 1794 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 126.3+ KB

# Note: the column names contain `\n`, so run the following line to see the exact column names

data.columns

Index(['Company \n(Maker-if known)', 'Specific Bean Origin\nor Bar Name',
       'REF', 'Review\nDate', 'Cocoa\nPercent', 'Company\nLocation', 'Rating',
       'Bean\nType', 'Broad Bean\nOrigin'],
      dtype='object')

data.describe()

# Identify the top 5 chocolates based on their rating
# (no code accompanies that prompt here; the statement below handles low ratings)

# Per-column non-null counts over the rows whose rating is below 2
low_rated_count = data.loc[data['Rating'].lt(2)].count()
low_rated_count

Company \n(Maker-if known)           17
Specific Bean Origin\nor Bar Name    17
REF                                  17
Review\nDate                         17
Cocoa\nPercent                       17
Company\nLocation                    17
Rating                               17
Bean\nType                           17
Broad Bean\nOrigin                   17
dtype: int64

# Convert the cocoa percentage column from strings such as '63%' to floats.
# The dtype guard makes this step safe to re-run: once the column is numeric,
# the .str accessor would raise AttributeError on a second execution.
if not pd.api.types.is_numeric_dtype(data['Cocoa\nPercent']):
    data['Cocoa\nPercent'] = data['Cocoa\nPercent'].str.rstrip('%').astype(float)

# Rows whose cocoa percentage is strictly above 70
high_cocoa_chocolates = data[data['Cocoa\nPercent'] > 70]
high_cocoa_chocolates

# Overall mean rating, then per-column non-null counts of the rows strictly above it
mean_rating = data['Rating'].mean()
above_avg_chocolates = data.loc[data['Rating'].gt(mean_rating)].count()
above_avg_chocolates

Company \n(Maker-if known)           1005
Specific Bean Origin\nor Bar Name    1005
REF                                  1005
Review\nDate                         1005
Cocoa\nPercent                       1005
Company\nLocation                    1005
Rating                               1005
Bean\nType                           1004
Broad Bean\nOrigin                   1005
dtype: int64

# Scatter plot of cocoa percentage (x) against rating (y); the hue is driven by
# the same cocoa percentage column, so point color follows the x position.
plt.figure(figsize=(16, 12))
ax = sns.scatterplot(
    data=data,
    x='Cocoa\nPercent',
    y='Rating',
    hue='Cocoa\nPercent',
    palette='coolwarm',
)
# Label through the Axes returned by seaborn
ax.set(
    title='Cocoa Percentage vs. Chocolate Rating',
    xlabel='Cocoa Percent',
    ylabel='Rating',
)
plt.show()

# Bar/origin names of the chocolates with more than 60% cocoa and a rating of at least 4.0
filtered_chocolates_series = data.loc[
    data['Cocoa\nPercent'].gt(60) & data['Rating'].ge(4.0),
    'Specific Bean Origin\nor Bar Name',
]

# Per-column non-null counts of rows that are rated below 2 or have more than 90% cocoa
extreme_chocolates = data.loc[
    data['Rating'].lt(2) | data['Cocoa\nPercent'].gt(90)
].count()
extreme_chocolates

Company \n(Maker-if known)           34
Specific Bean Origin\nor Bar Name    34
REF                                  34
Review\nDate                         34
Cocoa\nPercent                       34
Company\nLocation                    34
Rating                               34
Bean\nType                           34
Broad Bean\nOrigin                   34
dtype: int64

# Provide the correct syntax to filter chocolates with a rating greater than `4.5` and a cocoa percentage less than `70%`.

# Per-column non-null counts of rows with a Venezuelan broad bean origin rated above 3.5
venezuela_chocolates = data.loc[
    data['Broad Bean\nOrigin'].eq('Venezuela') & data['Rating'].gt(3.5)
].count()
venezuela_chocolates

Company \n(Maker-if known)           54
Specific Bean Origin\nor Bar Name    54
REF                                  54
Review\nDate                         54
Cocoa\nPercent                       54
Company\nLocation                    54
Rating                               54
Bean\nType                           54
Broad Bean\nOrigin                   54
dtype: int64

# Group by 'Company Location' and calculate the mean rating, keeping the 10 highest
top_countries = data.groupby('Company\nLocation')['Rating'].mean().sort_values(ascending=False).head(10)

# Plot the top 10 countries with highest-rated chocolates
plt.figure(figsize=(10, 6))
# seaborn deprecates passing `palette` without `hue`; assigning the y variable
# to `hue` with the legend turned off keeps one palette color per bar.
sns.barplot(
    x=top_countries.values,
    y=top_countries.index,
    hue=top_countries.index,
    palette='viridis',
    legend=False,
)
plt.title('Top 10 Countries Producing Highest-Rated Chocolate Bars')
plt.xlabel('Average Rating')
plt.ylabel('Company Location')
plt.show()

/tmp/ipykernel_18/2659750790.py:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis')

# Per-column non-null counts of bars reviewed after 2015 with a rating of 4 or higher
recent_high_rated_count = data.loc[
    data['Review\nDate'].gt(2015) & data['Rating'].ge(4)
].count()
recent_high_rated_count

Company \n(Maker-if known)           9
Specific Bean Origin\nor Bar Name    9
REF                                  9
Review\nDate                         9
Cocoa\nPercent                       9
Company\nLocation                    9
Rating                               9
Bean\nType                           9
Broad Bean\nOrigin                   9
dtype: int64

# NOTE(review): unfilled placeholder — this binds the Ellipsis object, not a computed result.
top_rated_common_origin = ...

# Get unique company locations
unique_locations = data['Company\nLocation'].unique()

# Only consider locations with at least this many reviews
MIN_REVIEWS = 10

# Mean rating per qualifying location, computed with a single grouping pass
# instead of re-filtering the whole frame once per location. sort=False keeps
# locations in first-appearance order. No module-level `mean_rating` is
# assigned here, so the overall mean rating stored under that name is kept.
location_stats = {
    location: ratings.mean()
    for location, ratings in data.groupby('Company\nLocation', sort=False)['Rating']
    if len(ratings) >= MIN_REVIEWS
}

# Convert results to a Series and sort from highest to lowest average
avg_rating_by_location = pd.Series(location_stats).sort_values(ascending=False)
avg_rating_by_location

Vietnam        3.409091
Brazil         3.397059
Australia      3.357143
Guatemala      3.350000
Switzerland    3.342105
Italy          3.325397
Scotland       3.325000
Canada         3.324000
Denmark        3.283333
Spain          3.270000
France         3.251603
Austria        3.240385
Hungary        3.204545
New Zealand    3.191176
Germany        3.178571
Venezuela      3.175000
Colombia       3.173913
U.S.A.         3.154123
Madagascar     3.147059
Belgium        3.093750
Japan          3.088235
U.K.           3.054688
Ecuador        3.009259
Peru           2.897059
dtype: float64

# Convert results to a Series and sort
# NOTE(review): unfilled placeholder — executing this line rebinds
# avg_rating_by_location to the Ellipsis object, replacing any Series
# previously stored under that name.
avg_rating_by_location = ...

	Company \n(Maker-if known)	Specific Bean Origin\nor Bar Name	REF	Review\nDate	Cocoa\nPercent	Company\nLocation	Rating	Broad Bean\nOrigin
0	A. Morin	Agua Grande	1876	2016	63%	France	3.75	Sao Tome
1	A. Morin	Kpime	1676	2015	70%	France	2.75	Togo
2	A. Morin	Atsane	1676	2015	70%	France	3.00	Togo
3	A. Morin	Akata	1680	2015	70%	France	3.50	Togo
4	A. Morin	Quilla	1704	2015	70%	France	3.50	Peru

	REF	Review\nDate	Rating
count	1795.000000	1795.000000	1795.000000
mean	1035.904735	2012.325348	3.185933
std	552.886365	2.927210	0.478062
min	5.000000	2006.000000	1.000000
25%	576.000000	2010.000000	2.875000
50%	1069.000000	2013.000000	3.250000
75%	1502.000000	2015.000000	3.500000
max	1952.000000	2017.000000	5.000000

	Company \n(Maker-if known)	Specific Bean Origin\nor Bar Name	REF	Review\nDate	Cocoa\nPercent	Company\nLocation	Rating	Bean\nType	Broad Bean\nOrigin
26	Adi	Vanua Levu, Toto-A	705	2011	80.0	Fiji	3.25	Trinitario	Fiji
27	Adi	Vanua Levu	705	2011	88.0	Fiji	3.50	Trinitario	Fiji
28	Adi	Vanua Levu, Ami-Ami-CA	705	2011	72.0	Fiji	3.50	Trinitario	Fiji
32	Akesson's (Pralus)	Bali (west), Sukrama Family, Melaya area	636	2011	75.0	Switzerland	3.75	Trinitario	Indonesia
33	Akesson's (Pralus)	Madagascar, Ambolikapiky P.	502	2010	75.0	Switzerland	2.75	Criollo	Madagascar
...	...	...	...	...	...	...	...	...	...
1778	Zotter	Raw	1205	2014	80.0	Austria	2.75
1779	Zotter	Bocas del Toro, Cocabo Co-op	801	2012	72.0	Austria	3.50		Panama
1784	Zotter	El Oro	879	2012	75.0	Austria	3.00	Forastero (Nacional)	Ecuador
1785	Zotter	Huiwani Coop	879	2012	75.0	Austria	3.00	Criollo, Trinitario	Papua New Guinea
1786	Zotter	El Ceibo Coop	879	2012	90.0	Austria	3.25		Bolivia

Cocoa Curations: Series Filtering with Chocolate Ratings¶

Let's Start¶

1. Which of the following methods is used to find the `top 5` chocolates based on their `Rating`?¶

2. Identify Low-Rated Chocolate Bars¶

3. High Cocoa Percent Chocolates¶

4. Count Chocolates Above Average Rating¶

Just for Exploration¶

5. Identify Beans with High Cocoa and High Rating!¶

6. Count Extreme Chocolates¶

7. What is the correct syntax to filter chocolates with a rating greater than `4.5` and a cocoa percentage less than `70%`?¶

8. Count High-Rated Venezuelan Chocolates¶

Just for Exploration¶

9. Recent High-Rated Bars¶

10. Most Common Bean Origin for Highly Rated Chocolates¶

11. Average Rating by Company Location¶

Cocoa Curations: Series Filtering with Chocolate Ratings¶

Let's Start¶

1. Which of the following methods is used to find the top 5 chocolates based on their Rating?¶

2. Identify Low-Rated Chocolate Bars¶

3. High Cocoa Percent Chocolates¶

4. Count Chocolates Above Average Rating¶

Just for Exploration¶

5. Identify Beans with High Cocoa and High Rating!¶

6. Count Extreme Chocolates¶

7. What is the correct syntax to filter chocolates with a rating greater than 4.5 and a cocoa percentage less than 70%?¶

8. Count High-Rated Venezuelan Chocolates¶

Just for Exploration¶

9. Recent High-Rated Bars¶

10. Most Common Bean Origin for Highly Rated Chocolates¶

11. Average Rating by Company Location¶

1. Which of the following methods is used to find the `top 5` chocolates based on their `Rating`?¶

7. What is the correct syntax to filter chocolates with a rating greater than `4.5` and a cocoa percentage less than `70%`?¶