In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv('spaceship_passengers.csv')
In [3]:
df.head()
Out[3]:
PassengerId Name HomePlanet CryoSleep Destination Age VIP RoomService FoodCourt ShoppingMall Spa VRDeck CabinDeck CabinNumber CabinSide Transported
0 0001_01 Maham Ofracculy Europa False TRAPPIST-1e 39.0 False 0.0 0.0 0.0 0.0 0.0 B 0.0 P False
1 0002_01 Juanna Vines Earth False TRAPPIST-1e 24.0 False 109.0 9.0 25.0 549.0 44.0 F 0.0 S True
2 0003_01 Altark Susent Europa False TRAPPIST-1e 58.0 True 43.0 3576.0 0.0 6715.0 49.0 A 0.0 S False
3 0003_02 Solam Susent Europa False TRAPPIST-1e 33.0 False 0.0 1283.0 371.0 3329.0 193.0 A 0.0 S False
4 0004_01 Willy Santantines Earth False TRAPPIST-1e 16.0 False 303.0 70.0 151.0 565.0 2.0 F 1.0 S True

Data Cleaning¶

1. How many null/missing values has the column CryoSleep?¶

In [4]:
df['CryoSleep'].isnull().sum()
Out[4]:
217

2. How many null/missing values has the column FoodCourt?¶

In [6]:
df['FoodCourt'].isnull().sum()
Out[6]:
183

3. How many null/missing values has the column PassengerId?¶

In [7]:
df['PassengerId'].isnull().sum()
Out[7]:
0

4. Fill the null/missing values in Destination with the most common choice¶

In [15]:
df['Destination'].value_counts()
Out[15]:
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64
In [16]:
df['Destination'].fillna('TRAPPIST-1e' , inplace=True)
/tmp/ipykernel_18/767106531.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Destination'].fillna('TRAPPIST-1e' , inplace=True)

5. Fill the null/missing values in RoomService, FoodCourt, ShoppingMall, Spa and VRDeck with the median¶

In [18]:
df['RoomService'].fillna(df['RoomService'].median() , inplace=True)
df['FoodCourt'].fillna(df['FoodCourt'].median() , inplace=True)
df['ShoppingMall'].fillna(df['ShoppingMall'].median() , inplace=True)
df['Spa'].fillna(df['Spa'].median() , inplace=True)
df['VRDeck'].fillna(df['VRDeck'].median() , inplace=True)
/tmp/ipykernel_18/1974137612.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['RoomService'].fillna(df['RoomService'].median() , inplace=True)
/tmp/ipykernel_18/1974137612.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FoodCourt'].fillna(df['FoodCourt'].median() , inplace=True)
/tmp/ipykernel_18/1974137612.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ShoppingMall'].fillna(df['ShoppingMall'].median() , inplace=True)
/tmp/ipykernel_18/1974137612.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Spa'].fillna(df['Spa'].median() , inplace=True)
/tmp/ipykernel_18/1974137612.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['VRDeck'].fillna(df['VRDeck'].median() , inplace=True)
In [25]:
type(df['VIP'][0])
Out[25]:
bool

6. Fill the null/missing values in VIP with the most common value¶

In [26]:
df['VIP'].fillna(False , inplace = True)
/tmp/ipykernel_18/2530139235.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df['VIP'].fillna(False , inplace = True)

8. Drop the previously defined columns (PassengerId and Name)¶

In [28]:
df.drop(columns=['Name' , 'PassengerId'], inplace=True)

9. Drop any other row that contains a null value¶

In [30]:
df.dropna(inplace=True)

Feature Engineering¶

Encode the features previously identified in its own dataframe¶

In [32]:
features_to_encode = ['VIP' , 'CryoSleep' , 'HomePlanet' , 'Destination' , 'CabinDeck' , 'CabinSide']
In [33]:
df[features_to_encode].info()
<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 0 to 8692
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   VIP          7919 non-null   bool  
 1   CryoSleep    7919 non-null   object
 2   HomePlanet   7919 non-null   object
 3   Destination  7919 non-null   object
 4   CabinDeck    7919 non-null   object
 5   CabinSide    7919 non-null   object
dtypes: bool(1), object(5)
memory usage: 378.9+ KB
In [34]:
df_encoded = pd.get_dummies(df[features_to_encode].astype(str))

Remove the original encoded features from df¶

In [36]:
df_no_categorical = df.drop(['VIP' , 'CryoSleep' , 'HomePlanet' , 'Destination' , 'CabinDeck' , 'CabinSide'] ,axis=1)

Create a new dataframe combining df_no_categorical and df_encoded¶

In [44]:
df_final = pd.concat([df_no_categorical,df_encoded],axis=1)

Finally, separate the target variable Transported from the training data¶

In [46]:
df_train = df_final.drop('Transported' , axis=1)
transported = df_final['Transported']

Model selection and tuning¶

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

These splits are given and set by us, don't change them:

In [53]:
X_train, X_test, y_train, y_test = train_test_split(df_train, transported, test_size=0.2, random_state=0)

Random Forest¶

Use a GridSearchCV to find the best max_depth parameter for a RandomForestClassifier¶

Given the RandomForestClassifier created (with random_state=42, important, don't change it!) instantiate a GridSearchCV to find the best possible parameter for max_depth, in the range 5 to 25.

In [54]:
rf = RandomForestClassifier(random_state=42)
In [55]:
# your grid of parameters
param_grid = {'max_depth':range(5,26)}
In [56]:
grid_search = GridSearchCV(rf, param_grid, cv=5)
In [57]:
grid_search.fit(X_train, y_train)
Out[57]:
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': range(5, 26)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': range(5, 26)})
RandomForestClassifier(max_depth=20, random_state=42)
RandomForestClassifier(max_depth=20, random_state=42)

What's the best hyperparameter value for max_depth?

In [59]:
grid_search.best_params_
Out[59]:
{'max_depth': 20}

Create a RandomForestClassifier that achieves at least 0.75 in precision and at least 0.80 in recall¶

In [61]:
model = RandomForestClassifier(random_state=42,max_depth=20)
In [62]:
model.fit(X_train, y_train)
Out[62]:
RandomForestClassifier(max_depth=20, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=20, random_state=42)
In [63]:
y_pred = model.predict(X_test)
In [68]:
print(f"Model precision: {precision_score(y_test, y_pred)}")
print(f"Model recall: {recall_score(y_test, y_pred)}")
Model precision: 0.8462549277266754
Model recall: 0.7777777777777778
In [67]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay , recall_score,precision_score
In [69]:
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
Out[69]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7070fae436d0>
No description has been provided for this image
In [ ]: