In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv('spaceship_passengers.csv')
In [3]:
df.head()
Out[3]:
| | PassengerId | Name | HomePlanet | CryoSleep | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | CabinDeck | CabinNumber | CabinSide | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001_01 | Maham Ofracculy | Europa | False | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | B | 0.0 | P | False |
| 1 | 0002_01 | Juanna Vines | Earth | False | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | F | 0.0 | S | True |
| 2 | 0003_01 | Altark Susent | Europa | False | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | A | 0.0 | S | False |
| 3 | 0003_02 | Solam Susent | Europa | False | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | A | 0.0 | S | False |
| 4 | 0004_01 | Willy Santantines | Earth | False | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | F | 1.0 | S | True |
Data Cleaning¶
1. How many null/missing values does the column CryoSleep have?¶
In [4]:
df['CryoSleep'].isnull().sum()
Out[4]:
217
2. How many null/missing values does the column FoodCourt have?¶
In [6]:
df['FoodCourt'].isnull().sum()
Out[6]:
183
3. How many null/missing values does the column PassengerId have?¶
In [7]:
df['PassengerId'].isnull().sum()
Out[7]:
0
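A single call can also report the missing counts for every column at once; an optional sketch, not required by the exercise:
In [ ]:
# Optional check: null counts for all columns in one pass
df.isnull().sum()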
4. Fill the null/missing values in Destination with the most common choice¶
In [15]:
df['Destination'].value_counts()
Out[15]:
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64
In [16]:
# assign back instead of fillna(..., inplace=True) to avoid the chained-assignment FutureWarning
df['Destination'] = df['Destination'].fillna('TRAPPIST-1e')
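To avoid hard-coding 'TRAPPIST-1e', the most common value can also be derived from the data itself; a minimal sketch (the most_common_destination name is just for illustration):
In [ ]:
# Optional alternative: compute the mode instead of hard-coding the fill value
most_common_destination = df['Destination'].mode()[0]  # 'TRAPPIST-1e' for this dataset
df['Destination'] = df['Destination'].fillna(most_common_destination)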
5. Fill the null/missing values in RoomService, FoodCourt, ShoppingMall, Spa and VRDeck with the median¶
In [18]:
# assign back instead of fillna(..., inplace=True) to avoid the chained-assignment FutureWarning
df['RoomService'] = df['RoomService'].fillna(df['RoomService'].median())
df['FoodCourt'] = df['FoodCourt'].fillna(df['FoodCourt'].median())
df['ShoppingMall'] = df['ShoppingMall'].fillna(df['ShoppingMall'].median())
df['Spa'] = df['Spa'].fillna(df['Spa'].median())
df['VRDeck'] = df['VRDeck'].fillna(df['VRDeck'].median())
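The five spend columns can also be filled in one pass, letting pandas match each column with its own median; a minimal sketch (the spend_cols name is just for illustration):
In [ ]:
# Optional alternative: fill every spend column with its own median in one call
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[spend_cols] = df[spend_cols].fillna(df[spend_cols].median())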
In [25]:
type(df['VIP'][0])
Out[25]:
bool
6. Fill the null/missing values in VIP with the most common value¶
In [26]:
# assign back instead of fillna(..., inplace=True) to avoid the fillna downcasting FutureWarning
df['VIP'] = df['VIP'].fillna(False)
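An optional check that False dominates the column and was indeed the most common value to fill with:
In [ ]:
# Optional check: False should be by far the most frequent VIP value
df['VIP'].value_counts(dropna=False)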
8. Drop the previously defined columns (PassengerId and Name)¶
In [28]:
df.drop(columns=['Name', 'PassengerId'], inplace=True)
9. Drop any other row that contains a null value¶
In [30]:
df.dropna(inplace=True)
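An optional check that no missing values remain after the drop:
In [ ]:
# Optional check: total remaining nulls should be 0; shape shows how many rows survived
print(df.isnull().sum().sum())
print(df.shape)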
Feature Engineering¶
Encode the features previously identified in its own dataframe¶
In [32]:
features_to_encode = ['VIP', 'CryoSleep', 'HomePlanet', 'Destination', 'CabinDeck', 'CabinSide']
In [33]:
df[features_to_encode].info()
<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 0 to 8692
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   VIP          7919 non-null   bool
 1   CryoSleep    7919 non-null   object
 2   HomePlanet   7919 non-null   object
 3   Destination  7919 non-null   object
 4   CabinDeck    7919 non-null   object
 5   CabinSide    7919 non-null   object
dtypes: bool(1), object(5)
memory usage: 378.9+ KB
In [34]:
df_encoded = pd.get_dummies(df[features_to_encode].astype(str))
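pd.get_dummies produces one indicator column per category value; an optional look at the result (the exact column names depend on the data):
In [ ]:
# Optional inspection: shape and names of the one-hot encoded columns
print(df_encoded.shape)
df_encoded.columns.tolist()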
Remove the original encoded features from df¶
In [36]:
df_no_categorical = df.drop(['VIP', 'CryoSleep', 'HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'], axis=1)
Create a new dataframe combining df_no_categorical and df_encoded¶
In [44]:
df_final = pd.concat([df_no_categorical,df_encoded],axis=1)
Finally, separate the target variable Transported from the training data¶
In [46]:
df_train = df_final.drop('Transported', axis=1)
transported = df_final['Transported']
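An optional check of the class balance of the target, which helps interpret the precision and recall scores later:
In [ ]:
# Optional check: proportion of transported vs. not transported passengers
transported.value_counts(normalize=True)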
Model selection and tuning¶
In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
These splits are given and set by us; don't change them:
In [53]:
X_train, X_test, y_train, y_test = train_test_split(df_train, transported, test_size=0.2, random_state=0)
Random Forest¶
Use a GridSearchCV to find the best max_depth parameter for a RandomForestClassifier¶
Given the RandomForestClassifier created (with random_state=42; important, don't change it!), instantiate a GridSearchCV to find the best possible value for max_depth, in the range 5 to 25.
In [54]:
rf = RandomForestClassifier(random_state=42)
In [55]:
# your grid of parameters
param_grid = {'max_depth':range(5,26)}
In [56]:
grid_search = GridSearchCV(rf, param_grid, cv=5)
In [57]:
grid_search.fit(X_train, y_train)
Out[57]:
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), param_grid={'max_depth': range(5, 26)})
What's the best hyperparameter value for max_depth?
In [59]:
grid_search.best_params_
Out[59]:
{'max_depth': 20}
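Beyond best_params_, the fitted grid search also exposes the mean cross-validated score of the winning setting; an optional look:
In [ ]:
# Optional inspection: mean cross-validated accuracy of the best max_depth
grid_search.best_score_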
Create a RandomForestClassifier that achieves at least 0.75 in precision and at least 0.80 in recall¶
In [61]:
model = RandomForestClassifier(random_state=42,max_depth=20)
In [62]:
model.fit(X_train, y_train)
Out[62]:
RandomForestClassifier(max_depth=20, random_state=42)
In [63]:
y_pred = model.predict(X_test)
In [67]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score
In [68]:
print(f"Model precision: {precision_score(y_test, y_pred)}")
print(f"Model recall: {recall_score(y_test, y_pred)}")
Model precision: 0.8462549277266754
Model recall: 0.7777777777777778
In [69]:
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
Out[69]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7070fae436d0>
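For a fuller picture than the two scores above, classification_report summarizes precision, recall and F1 for both classes; an optional addition:
In [ ]:
# Optional summary: per-class precision, recall and F1 on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))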
In [ ]: