The sinking of the Titanic on April 15th, 1912 is one of the most infamous maritime tragedies in history. The Titanic sank after colliding with an iceberg, killing 1502 of the 2224 passengers and crew on board. The number of survivors was low because there were not enough lifeboats for all passengers and crew. Some groups were more likely to survive than others, such as women, children, and upper-class passengers. This case study analyzes what sorts of people were likely to survive this tragedy. The dataset includes the following:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

# Read the training CSV (Train_Titanic.csv) straight from its hosted URL.
url = "https://datascienceschools.github.io/Machine_Learning/Classification_Models_CaseStudies/Train_Titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)
- Cabin & Embarked are unnecessary columns -> drop them after data visualisation
# Per-column count of missing values.
df.isna().sum()

# Heatmap of the missing-value mask: one row per passenger, one column per
# field; cells that are True (missing) stand out from the rest.
sns.heatmap(df.isna(), cmap='viridis', cbar=False, yticklabels=False)
plt.show()
def Fill_Age(data):
    """Return a passenger's age, substituting a default when it is missing.

    Args:
        data: Positional pair whose first element is the age and whose
            second element is the sex, e.g. one row of
            ``df[['Age', 'Sex']]`` as passed by ``DataFrame.apply(axis=1)``.
            Plain sequences such as lists or tuples are accepted as well.

    Returns:
        The original age when it is present; otherwise 29 when the sex is
        ``'male'`` and 27 for any other sex value.
    """
    # Read the two fields by position explicitly: plain ``row[0]`` on a
    # label-indexed Series is deprecated in recent pandas versions.
    positional = getattr(data, "iloc", data)
    age, sex = positional[0], positional[1]

    if not pd.isnull(age):
        return age

    # Compare by value; ``is`` tests object identity, which is unreliable
    # for strings that were parsed from a file.
    return 29 if sex == 'male' else 27
# Fill in missing ages row by row from each passenger's (Age, Sex) pair,
# then re-check the per-column missing counts.
df['Age'] = df[['Age', 'Sex']].apply(Fill_Age, axis=1)
df.isnull().sum()

# Split the passengers by outcome.
survived = df.loc[df['Survived'] == 1]
not_survived = df.loc[df['Survived'] == 0]

# Report head-counts and each group's share of all passengers.
print("Total =", df.shape[0])
print("\nNumber of Survived passengers =", survived.shape[0])
print("Percentage Survived = {:.2f}%".format(survived.shape[0] * 100 / df.shape[0]))
print("\nDid not Survive =", not_survived.shape[0])
print("Percentage who did not survive = {:.2f}%".format(not_survived.shape[0] * 100 / df.shape[0]))
# Notebook shell escape: runs `pip install cufflinks`. This line is only valid
# inside IPython/Jupyter, not in a plain Python script.
!pip install cufflinks
import cufflinks as cf
# Must run before the .iplot() calls below.
# NOTE(review): presumably switches cufflinks/plotly to offline rendering — confirm in the cufflinks docs.
cf.go_offline()
# Count passengers per outcome; each Series holds a single entry keyed by the
# Survived code (1 or 0).
survived = df.loc[df['Survived'] == 1, 'Survived'].value_counts()
dead = df.loc[df['Survived'] == 0, 'Survived'].value_counts()

# One row per outcome, drawn as a stacked bar chart.
df1 = pd.DataFrame([survived, dead], index=['Survived', 'Dead'])
df1.iplot(kind='bar', barmode='stack', title='Number of Survived & Dead')
- If you are a female,
- you have a higher chance of survival
# Tally each sex separately among survivors and among the dead.
survived_sex = df['Sex'][df['Survived'] == 1].value_counts()
dead_sex = df['Sex'][df['Survived'] == 0].value_counts()

# Rows are the two outcomes, columns are the sexes; draw as stacked bars.
df1 = pd.DataFrame([survived_sex, dead_sex], index=['Survived', 'Dead'])
df1.iplot(kind='bar', barmode='stack', title='Survival by Sex')
- If you are a first-class passenger (Pclass = 1)
- you have a higher chance of survival
# Tally ticket classes separately among survivors and among the dead.
survived_pclass = df.loc[df['Survived'] == 1, 'Pclass'].value_counts()
dead_pclass = df.loc[df['Survived'] == 0, 'Pclass'].value_counts()

# Rows are the two outcomes, columns are the ticket classes.
df1 = pd.DataFrame([survived_pclass, dead_pclass], index=['Survived', 'Dead'])
df1.iplot(kind='bar', barmode='stack', title='Survival by Pclass')
- If you have 1 sibling or spouse aboard (SibSp = 1)
- you have a higher chance of survival compared to having none (SibSp = 0)
# Tally sibling/spouse counts separately among survivors and among the dead.
survived_SibSp = df['SibSp'][df['Survived'] == 1].value_counts()
dead_SibSp = df['SibSp'][df['Survived'] == 0].value_counts()

# Rows are the two outcomes, columns are the distinct SibSp values.
df1 = pd.DataFrame([survived_SibSp, dead_SibSp], index=['Survived', 'Dead'])
df1.iplot(
    kind='bar',
    barmode='stack',
    title='Survival by Number of siblings / spouses aboard the Titanic',
)
- If you have 1 parent or child aboard (Parch = 1)
- you have a higher chance of survival compared to having none (Parch = 0)
# Tally parent/child counts separately among survivors and among the dead.
survived_Parch = df.loc[df['Survived'] == 1, 'Parch'].value_counts()
dead_Parch = df.loc[df['Survived'] == 0, 'Parch'].value_counts()

# Rows are the two outcomes, columns are the distinct Parch values.
df1 = pd.DataFrame([survived_Parch, dead_Parch], index=['Survived', 'Dead'])
df1.iplot(
    kind='bar',
    barmode='stack',
    title='Survival by Number of parents / children aboard the Titanic',
)
- Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
- If you embarked from port "C"
- you have a higher chance of survival compared to other ports!
# Tally embarkation ports separately among survivors and among the dead.
survived_Embarked = df['Embarked'][df['Survived'] == 1].value_counts()
dead_Embarked = df['Embarked'][df['Survived'] == 0].value_counts()

# Rows are the two outcomes, columns are the port codes.
df1 = pd.DataFrame([survived_Embarked, dead_Embarked], index=['Survived', 'Dead'])
df1.iplot(
    kind='bar',
    barmode='stack',
    title='Survival by Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton',
)
- If you are a young child (aged 5 or under)
- you have a higher chance of survival
# Bucket ages into the intervals 0-5, 5-10 and then one per decade up to 80.
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 5, *range(10, 81, 10)])
df.head()

# Overall age distribution: histogram first, then a box plot.
df['Age'].iplot(kind='hist', bins=30, xTitle='Age', color='skyblue')
df['Age'].iplot(kind='box', xTitle='Age', color='lightgreen')

# Tally age groups separately among survivors and among the dead, then draw
# the two outcomes as stacked bars.
survived_Age_Group = df['Age_Group'][df['Survived'] == 1].value_counts()
dead_Age_Group = df['Age_Group'][df['Survived'] == 0].value_counts()
df1 = pd.DataFrame([survived_Age_Group, dead_Age_Group], index=['Survived', 'Dead'])
df1.iplot(kind='bar', barmode='stack', title='Survival by Age Group')
- If you pay a higher fare
- you have a higher chance of survival
# Bucket fares into intervals. include_lowest=True closes the first bin on the
# left, so a fare of exactly 0 lands in the first group instead of becoming
# NaN and dropping out of the chart.
df['Fare_Group'] = pd.cut(
    df['Fare'],
    bins=[0, 50, 100, 200, 300, 600],
    include_lowest=True,
)
df.head()

# Tally fare groups separately among survivors and among the dead.
survived_Fare_Group = df[df['Survived'] == 1]['Fare_Group'].value_counts()
dead_Fare_Group = df[df['Survived'] == 0]['Fare_Group'].value_counts()
df1 = pd.DataFrame([survived_Fare_Group, dead_Fare_Group])
df1.index = ['Survived', 'Dead']

# Overall fare distribution: histogram first, then a box plot. Both plots show
# the Fare column, so both axis titles read 'Fare'.
df['Fare'].iplot(kind='hist', bins=30, xTitle='Fare', color='lightgreen')
df['Fare'].iplot(kind='box', xTitle='Fare', color='lightgreen')

# Stacked bars: one bar per fare group, split by outcome.
df1.iplot(kind='bar', barmode='stack', title='Survival by Fare Group')
- male: 1
- female: 0
# Encode sex as an integer flag: male -> 1, anything else (female) -> 0.
# A direct comparison gives a plain int column on every pandas version;
# pd.get_dummies returns bool columns by default from pandas 2.0 onward, which
# would show True/False here and turn the feature matrix below into an object
# array.
df['Male'] = (df['Sex'] == 'male').astype(int)
df.head()

# Remove identifiers, free-text fields, the raw Sex column and the helper
# columns that were only needed for the plots.
df.drop(
    ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age_Group', 'Fare_Group'],
    axis=1,
    inplace=True,
)
df.head()

# Fix the column order with the target last, so features and target can be
# separated by position.
df = df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Male', 'Survived']]
df.head()

# X: every column except the last; y: the last column (Survived).
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Standardise the features. The scaler is fitted on the training rows only and
# then reused unchanged for the test rows.
sc = StandardScaler()
X_train = sc.fit(X_train).transform(X_train)
X_test = sc.transform(X_test)

# Fit a logistic-regression classifier and predict the held-out rows.
model = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Hold-out evaluation: confusion matrix and overall accuracy on the test rows.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is: {:.2f}%".format(accuracy * 100))

# Draw the confusion matrix with integer cell annotations.
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred))

# 10-fold cross-validation on the training rows. X_train was already
# standardised above with a scaler fitted on the whole training split.
accuracies = cross_val_score(model, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(np.mean(accuracies) * 100))
print("Standard Deviation: {:.2f} %".format(np.std(accuracies) * 100))
# Read the second CSV (Test_Titanic.csv), which the fitted model will score.
url = "https://datascienceschools.github.io/Machine_Learning/Classification_Models_CaseStudies/Test_Titanic.csv"
new_data = pd.read_csv(filepath_or_buffer=url)
new_data.head()

# Remove the identifier, free-text and port columns in place.
new_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)
new_data.head()

# Per-column count of missing values in the remaining columns.
new_data.isna().sum()
def Fill_Age(data):
    """Return a passenger's age, substituting a default when it is missing.

    Args:
        data: Positional pair whose first element is the age and whose
            second element is the sex, e.g. one row of
            ``new_data[['Age', 'Sex']]`` as passed by
            ``DataFrame.apply(axis=1)``. Plain sequences such as lists or
            tuples are accepted as well.

    Returns:
        The original age when it is present; otherwise 29 when the sex is
        ``'male'`` and 27 for any other sex value.
    """
    # Read the two fields by position explicitly: plain ``row[0]`` on a
    # label-indexed Series is deprecated in recent pandas versions.
    positional = getattr(data, "iloc", data)
    age, sex = positional[0], positional[1]

    if not pd.isnull(age):
        return age

    # Compare by value; ``is`` tests object identity, which is unreliable
    # for strings that were parsed from a file.
    return 29 if sex == 'male' else 27
# Fill in missing ages row by row from each passenger's (Age, Sex) pair.
new_data['Age'] = new_data[['Age', 'Sex']].apply(Fill_Age, axis=1)

# Discard any row that still has a missing value in a remaining column.
new_data = new_data.dropna(axis=0)
new_data.isnull().sum()

# Encode sex as an integer flag: male -> 1, anything else (female) -> 0.
# A direct comparison gives a plain int column on every pandas version
# (pd.get_dummies returns bool columns by default from pandas 2.0 onward).
new_data['Male'] = (new_data['Sex'] == 'male').astype(int)
new_data.drop(['Sex'], axis=1, inplace=True)
new_data.head()

# Select the features by name, in the same order as the training features,
# rather than taking "all remaining columns in file order": the scaler and the
# model are positional, so a reordered or extra column would otherwise be
# mis-scaled without any error.
new_data_X = new_data[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Male']].values
new_data_X = sc.transform(new_data_X)

# Predict with the fitted model and attach the result to each passenger row.
new_data_y_pred = model.predict(new_data_X)
new_data['predicted_Survive'] = new_data_y_pred
new_data.head()
# Count predictions per class; each Series holds a single entry keyed by the
# predicted code (1 or 0).
survive = new_data.loc[new_data['predicted_Survive'] == 1, 'predicted_Survive'].value_counts()
not_survive = new_data.loc[new_data['predicted_Survive'] == 0, 'predicted_Survive'].value_counts()

# One row per predicted outcome, drawn as a stacked bar chart.
df1 = pd.DataFrame([survive, not_survive], index=['Survive', 'Not Survive'])
df1.iplot(kind='bar', barmode='stack', title='Number of Predicted Survive & Not Survive')