# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set_style('whitegrid')
plt.style.use('ggplot')
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
# Show which files are present in the input directory.
_input_listing = check_output(["ls", "../input"])
print(_input_listing.decode("utf8"))
# Any results you write to the current directory are saved as output.
To preprocess the dataset, we need:
# Load the three CSV files: training set, test set and sample submission.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
gender_submission = pd.read_csv('../input/gender_submission.csv')
# First and last five rows of the training dataset
train.head()
train.tail()
# Column dtypes and non-null counts of the training dataset
train.info()
# Summary statistics of the numerical columns of the training dataset
train.describe()
# Summary statistics of the object-typed (categorical) columns
train.describe(include=['O'])
From the column information, we can conclude two results:
Basically, there are around 11 columns except for the output label, so it is reasonable to select the features from these 11 columns for the baseline model at the beginning.
Here, I list several possible features:
We need to check the distribution of these features.
It is necessary to observe the relation between the output label and each feature to obtain some initial observations.
# For every candidate feature, draw its distribution in two facets (one per
# value of 'Survived') and, for the discrete features, also compute the mean
# survival rate per feature value.

# Age
grid = sns.FacetGrid(train, col='Survived')
grid.map(plt.hist, 'Age', bins=20)
# Fare
grid = sns.FacetGrid(train, col='Survived')
grid.map(plt.hist, 'Fare', bins=20)
# Pclass
train[['Pclass', 'Survived']].groupby('Pclass').mean()
grid = sns.FacetGrid(train, col='Survived')
grid.map(plt.hist, 'Pclass')
# Sex
train[['Sex', 'Survived']].groupby('Sex').mean()
grid = sns.FacetGrid(train, col='Survived')
# A categorical plot mapped over a FacetGrid needs an explicit `order`;
# otherwise every facet orders the categories from its own subset of the data
# and the bars of the two facets may not line up.
grid.map(sns.countplot, 'Sex', order=['female', 'male'])
# SibSp
train[['SibSp', 'Survived']].groupby('SibSp').mean()
grid = sns.FacetGrid(train, col='Survived')
grid.map(plt.hist, 'SibSp')
# Parch
train[['Parch', 'Survived']].groupby('Parch').mean()
grid = sns.FacetGrid(train, col='Survived')
grid.map(plt.hist, 'Parch')
# Embarked
train[['Embarked', 'Survived']].groupby('Embarked').mean()
grid = sns.FacetGrid(train, col='Survived')
# Same reason as for 'Sex': pin the category order for both facets.
grid.map(sns.countplot, 'Embarked', order=['C', 'Q', 'S'])
Three columns have missing values:
Here, since we only take the 'Age' and the 'Embarked' as the features, we can ignore the feature 'Cabin' here for our baseline model.
When dealing with the missing values, we have two ways:
Here, I use a regression model to estimate the age, because I believe that the age is related to other features such as 'Pclass' and 'Parch'. For 'Embarked', since only 2 records are missing, I will use the mode to fill in the missing values.
# Replace the missing 'Embarked' entries with the most frequent value.
mode = train['Embarked'].mode()
train['Embarked'] = train['Embarked'].fillna(mode[0])
train.describe(include=['O'])
There are many different regression models I could choose for estimating the value, and I decided to use the random forest at the beginning for the baseline model.
from sklearn.ensemble import RandomForestRegressor
# Build the random forest regressor to estimate the age
def Estimate_Age(df):
    """Fill the missing 'Age' values of *df* with random-forest predictions.

    A RandomForestRegressor is fitted on the rows whose age is known, using
    'Pclass', 'SibSp', 'Parch' and 'Fare' as predictors, and then used to
    predict the age of the remaining rows.

    Parameters:
        df: DataFrame with the columns 'Age', 'Pclass', 'SibSp', 'Parch'
            and 'Fare'. It is modified in place.

    Returns:
        A tuple ``(df, regr)``: the same DataFrame object with 'Age' filled
        in, and the fitted regressor so it can be re-used on other data.
    """
    predictors = ['Pclass', 'SibSp', 'Parch', 'Fare']
    age_missing = df['Age'].isnull()

    # Fit on the rows with a known age. `.values` replaces the removed
    # DataFrame.as_matrix().
    known = df.loc[~age_missing]
    regr = RandomForestRegressor(n_estimators=1000, random_state=0)
    regr.fit(known[predictors].values, known['Age'].values)

    # Operate on the frame that was passed in (not on a global), and skip the
    # prediction when nothing is missing: predict() rejects an empty array.
    if age_missing.any():
        df.loc[age_missing, 'Age'] = regr.predict(df.loc[age_missing, predictors].values)
    return df, regr
# Impute the missing ages of the training data and keep the fitted regressor
# (`regr_age`); it is used again further down to impute the test-set ages.
train_set, regr_age = Estimate_Age(train)
# Summary statistics after imputation.
train_set.describe()
Here, we use the logistic regression as the baseline model and we need to transform the features like 'Embarked' into features with numbers.
The value of 'Pclass' is 1, 2, 3, thus we can convert the feature 'Pclass' into a feature with 3 dimensions (or 3 features):
def Transform_Feature(df):
    """Return *df* with one-hot columns for 'Pclass', 'Sex' and 'Embarked' appended.

    The original columns are kept; the new columns are named
    ``<column>_<value>``. The input frame itself is not modified.
    """
    pieces = [df]
    for column in ('Pclass', 'Sex', 'Embarked'):
        pieces.append(pd.get_dummies(df[column], prefix=column))
    return pd.concat(pieces, axis=1)
# Append the one-hot columns for 'Pclass', 'Sex' and 'Embarked'. From here on
# `train` is the widened frame, while `train_set` still refers to the frame
# returned by Estimate_Age.
train = Transform_Feature(train_set)
train.head(5)
Four features need to be scaled:
Here, we will use the scaling methods from sklearn.
We use `StandardScaler` rather than sklearn's `scale` function because a fitted `StandardScaler` can transform the test data with the statistics learned from the training data.
import sklearn.preprocessing as preprocessing
# Standardise the numeric features of the training set. The scaler is fitted
# exactly once, on the training data, so that it can later be applied
# unchanged to the test data.
scaler = preprocessing.StandardScaler()
# fit() returns the fitted scaler itself; `scale_param` is kept because later
# cells refer to that name.
scale_param = scaler.fit(train[['Age', 'SibSp', 'Parch', 'Fare']].values)
# transform() only applies the fitted statistics. The previous
# fit_transform(X, scale_param) fitted a second time and passed the scaler as
# the ignored `y` argument.
scaled_features = scaler.transform(train[['Age', 'SibSp', 'Parch', 'Fare']].values)
train[['Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled']] = pd.DataFrame(scaled_features, index=train.index)
train.head(5)
# Check the test set
# Summary statistics of the numerical columns
test.describe()
# Summary statistics of the object-typed (categorical) columns
test.describe(include=['O'])
According to the observation, for the selected features, we need to fill in estimated values for 'Age' and 'Fare'. For 'Age', we use the Random Forest regression model trained on the training set.
There is only one record that doesn't have a value for 'Fare'. To keep the process simple, we decided to use the mean 'Fare' of the training set.
# Replace the missing test-set fare with the mean fare of the training set.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())
We use the regression model obtained from the training set to predict the missing age in the test set.
# Predict the missing ages of the test set with the regressor that was fitted
# on the training set. The column order must match the one used for fitting:
# 'Age' first (dropped below), then the four predictors.
df = test[['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
# `.values` replaces DataFrame.as_matrix(), which no longer exists in pandas.
unknown_age = df[df.Age.isnull()].values
estimated_age = regr_age.predict(unknown_age[:, 1:])
test.loc[test.Age.isnull(), 'Age'] = estimated_age
test.head(5)
# One-hot encode 'Pclass', 'Sex' and 'Embarked' of the test set.
test = Transform_Feature(test)
test.head(5)
# Scale the test features with the statistics learned from the training set.
# Calling fit_transform() here would re-fit the scaler on the test data and
# put train and test features on different scales.
scaled_features = scaler.transform(test[['Age', 'SibSp', 'Parch', 'Fare']].values)
test[['Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled']] = pd.DataFrame(scaled_features, index=test.index)
test.head(5)
Here we use the logistic regression model for classification.
There are several steps we can do here:
# build the dataset
from sklearn.linear_model import LogisticRegression
train_data = train[['Survived', 'Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
test_data = test[['Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
train_data.head(5)
test_data.head(5)
# train the model
# `.values` replaces the removed DataFrame.as_matrix(). The features are cast
# to float explicitly so that mixed float/boolean dummy columns do not end up
# in an object array, and the label is taken by name rather than by position.
X_train = train_data.drop('Survived', axis=1).values.astype(float)
Y_train = train_data['Survived'].values
X_test = test_data.values.astype(float)
# The L1 penalty is only supported by some solvers; name one explicitly
# instead of relying on the library default.
clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/submission.csv', index=False)
The accuracy for the baseline model is 75.598% and it's time to improve it!
# One row per feature with its fitted coefficient. ravel() flattens the
# (1, n_features) coefficient matrix so the 'coef' column holds plain numbers
# rather than one-element arrays.
pd.DataFrame({'features': list(train_data.columns)[1:], 'coef': clf.coef_.ravel()})
According to the above table, there is no group of features whose coefficients are close to zero. Thus, I decided to keep all of these features (if the coefficient of 'Embarked_S' were very close to zero as well, I might consider removing the features 'Embarked_C', 'Embarked_Q', and 'Embarked_S').
# split the data
from sklearn.model_selection import train_test_split
training_set, validation_set = train_test_split(train, test_size=0.4, random_state=0)
model_columns = ['Survived', 'Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
train_data = training_set[model_columns]
validation_data = validation_set[model_columns]
# train the model and predict the validation set
# (`.values` replaces the removed DataFrame.as_matrix(); features are cast to
# float so that boolean dummy columns do not produce an object array)
X_train = train_data.drop('Survived', axis=1).values.astype(float)
Y_train = train_data['Survived'].values
X_validation = validation_data.drop('Survived', axis=1).values.astype(float)
Y_validation = validation_data['Survived'].values
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_validation)
# Rows of the validation set that the model got wrong; the boolean mask is
# positional and has one entry per validation row.
bad_case = validation_set.loc[Y_pred != Y_validation]
bad_case.head(10)
bad_case.describe()
There are several features needed to be added into the model and some features should be removed.
Features needed to be added:
# Extract the title: the word that directly precedes a '.' in the name.
# A raw string is used so that `\.` reaches the regex engine as written
# instead of being an invalid string escape.
train['Title'] = train_set.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train['Title'], train['Survived'])
from collections import defaultdict
# Merge alternative spellings into the common titles. 'Mlle' and 'Ms' are
# treated as 'Miss'; 'Mme' (Madame) denotes a married woman and therefore
# belongs to 'Mrs', not 'Miss'.
train['Title'] = train['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
# Encode the four common titles as 1-4; every other title falls back to 5
# through the defaultdict.
tmpDict = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4}
TitleDict = defaultdict(lambda: 5, tmpDict)
train['Title'] = train['Title'].map(TitleDict)
train['Title'] = train['Title'].fillna(0)
FamilySize = SibSp + Parch
# Family size = siblings/spouses + parents/children travelling along;
# a passenger with family size 0 is flagged as travelling alone.
train['FamilySize'] = train['SibSp'].add(train['Parch'])
train['IsAlone'] = train['FamilySize'].eq(0)
According to other solutions, the age is often divided into 5 groups, and that is what we are going to do.
# Split the age range of the training set into five equal-width bands,
# labelled 1 (youngest) to 5 (oldest).
train['AgeBand'] = pd.cut(
    train['Age'],
    bins=5,
    labels=[1, 2, 3, 4, 5],
)
To improve the model, we have several steps to do:
def new_transform(df):
    """Return *df* with one-hot columns for 'Title', 'IsAlone' and 'AgeBand' appended.

    The source columns are kept; the new columns are named
    ``<column>_<value>``. The input frame itself is not modified.
    """
    pieces = [df]
    for column in ('Title', 'IsAlone', 'AgeBand'):
        pieces.append(pd.get_dummies(df[column], prefix=column))
    return pd.concat(pieces, axis=1)
# Append the one-hot columns for 'Title', 'IsAlone' and 'AgeBand' to the training frame.
train = new_transform(train)
New feature: FamilySize
# Re-create the scaler, now including 'FamilySize', and fit it exactly once on
# the training data.
scaler = preprocessing.StandardScaler()
# fit() returns the fitted scaler itself; `scale_param` is kept because later
# cells refer to that name.
scale_param = scaler.fit(train[['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']].values)
# transform() only applies the fitted statistics. The previous
# fit_transform(X, scale_param) fitted a second time and passed the scaler as
# the ignored `y` argument.
scaled_features = scaler.transform(train[['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']].values)
train[['Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'FamilySize_scaled']] = pd.DataFrame(scaled_features, index=train.index)
# train.drop(['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize'], inplace=True, axis=1)
train.head(5)
Now we have 36 features in total for each record, and it's time to rerun the model and predict the test result again. But we also need to preprocess the test dataset at first.
# Build the engineered features for the test set with the same rules and the
# same learned parameters as for the training set.

# Title: the word that directly precedes a '.' in the name (raw string so that
# `\.` is not an invalid string escape).
test['Title'] = test.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Merge alternative spellings before encoding; without this step a title such
# as 'Ms' would fall into the catch-all code 5 instead of the 'Miss' code.
# 'Mme' (Madame) denotes a married woman, hence 'Mrs'.
test['Title'] = test['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
test['Title'] = test['Title'].map(TitleDict)
test['Title'] = test['Title'].fillna(0)
test['FamilySize'] = test['SibSp'] + test['Parch']
test['IsAlone'] = (test['FamilySize'] == 0)
# Age bands: re-use the bin edges of the training ages. Cutting the test ages
# into five bins of their own would give band k a different age range in
# train and test. Ages outside the training range are clipped to the outermost
# edges so that they land in band 1 or 5 instead of becoming NaN.
age_bins = pd.cut(train['Age'], 5, retbins=True)[1]
test['AgeBand'] = pd.cut(
    test['Age'].clip(age_bins[0], age_bins[-1]),
    bins=age_bins,
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
)
test = new_transform(test)
# Apply the scaler fitted on the training set; fit_transform() would re-fit it
# on the test data.
scaled_features = scaler.transform(test[['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']].values)
test[['Age_scaled', 'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'FamilySize_scaled']] = pd.DataFrame(scaled_features, index=test.index)
# build the dataset from the complete dataset
train_data = train.filter(regex='Survived|Age_.*|SibSp_.*|Parch_.*|Fare_.*|FamilySize_.*|Pclass_.*|Embarked_.*|Sex_.*|Title_.*|IsAlone_.*|AgeBand_.*')
test_data = test.filter(regex='Age_.*|SibSp_.*|Parch_.*|Fare_.*|FamilySize_.*|Pclass_.*|Embarked_.*|Sex_.*|Title_.*|IsAlone_.*|AgeBand_.*')
# Give the test matrix exactly the feature columns of the training matrix, in
# the same order. A dummy column that is absent from the test set (category
# not present there) is added as all zeros.
feature_columns = train_data.columns.drop('Survived')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)
# `.values` replaces the removed DataFrame.as_matrix(); features are cast to
# float so that boolean dummy columns do not produce an object array.
X_train = train_data[feature_columns].values.astype(float)
Y_train = train_data['Survived'].values
X_test = test_data.values.astype(float)
# train the model
# The L1 penalty is only supported by some solvers; name one explicitly
# instead of relying on the library default.
clf = LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/new_lr.csv', index=False)
The accuracy in Kaggle.com is 77.51%. Now we need to draw the learning curve to make sure there is no overfitting.
# Learning curve
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, X, y, train_sizes):
    """Plot training and cross-validation scores against the training-set size.

    The scores come from sklearn's ``learning_curve``. For each of the two
    score sets the mean over the folds is drawn as a line with markers, and a
    translucent band of one standard deviation is drawn around it (red for
    training, green for cross-validation). Everything is drawn on the current
    matplotlib axes.

    Parameters:
        estimator: the model passed to ``learning_curve``.
        X, y: feature matrix and labels.
        train_sizes: training-set sizes forwarded to ``learning_curve``.

    Returns:
        The tuple ``(train_sizes, train_scores, validation_scores)`` exactly
        as returned by ``learning_curve``.
    """
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (validation_scores, "g", "Cross-validation score"),
        )
    ]

    # Draw both bands first and the lines afterwards.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return train_sizes, train_scores, validation_scores
# Learning curve of the L1-regularised logistic regression. The solver is
# named explicitly because the L1 penalty is only supported by some solvers.
train_sizes, train_scores, validation_scores = plot_learning_curve(
    LogisticRegression(penalty='l1', solver='liblinear'),
    X_train,
    Y_train,
    np.linspace(.05, 1., 20),
)
# train the model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
# `.values` replaces the removed Series.as_matrix().
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/new_randomForest.csv', index=False)
The accuracy in Kaggle.com is 0.75598. Now we need to draw the learning curve to make sure there is no overfitting.
# Learning curve of the random forest, evaluated at 10 training-set sizes
train_sizes, train_scores, validation_scores = plot_learning_curve(
    estimator=RandomForestClassifier(n_estimators=100),
    X=X_train,
    y=Y_train,
    train_sizes=np.linspace(.05, 1., 10),
)
Overfitting is much more obvious for the random forest than for the logistic regression.
# train the model
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
# `.values` replaces the removed Series.as_matrix().
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/new_knn.csv', index=False)
The accuracy in Kaggle.com is 0.74162
# Learning curve of the k-nearest-neighbours model, evaluated at 20 training-set sizes
train_sizes, train_scores, validation_scores = plot_learning_curve(
    estimator=KNeighborsClassifier(n_neighbors=5),
    X=X_train,
    y=Y_train,
    train_sizes=np.linspace(.05, 1., 20),
)
# train the model
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
# `.values` replaces the removed Series.as_matrix().
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/new_svm.csv', index=False)
The accuracy in Kaggle.com is 0.78947
# Learning curve of the support vector classifier, evaluated at 20 training-set sizes
train_sizes, train_scores, validation_scores = plot_learning_curve(
    estimator=svm.SVC(),
    X=X_train,
    y=Y_train,
    train_sizes=np.linspace(.05, 1., 20),
)
# train the model
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, Y_train)
# predict the result of the test set
Y_test = clf.predict(X_test)
# `.values` replaces the removed Series.as_matrix().
result = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': Y_test.astype(np.int32)})
# result.to_csv('../input/new_tree.csv', index=False)
The accuracy in Kaggle.com is 0.74162
# Learning curve of the decision tree, evaluated at 20 training-set sizes
train_sizes, train_scores, validation_scores = plot_learning_curve(
    estimator=tree.DecisionTreeClassifier(),
    X=X_train,
    y=Y_train,
    train_sizes=np.linspace(.05, 1., 20),
)
Both the decision tree and the random forest overfit easily.
After checking the learning curve and comparing the prediction result, knn, logistic regression and svm are selected to predict the labels together by voting.
# Fit five classifiers on the full training data, let each of them predict the
# test set, and combine the 0/1 predictions by majority vote.
# `.values` replaces the removed Series.as_matrix() throughout.
passenger_ids = test['PassengerId'].values

# lr model (the L1 penalty is only supported by some solvers, so one is named
# explicitly)
lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
lr_clf.fit(X_train, Y_train)
Y_test = lr_clf.predict(X_test)
lr_result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test.astype(np.int32)})
# knn model
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, Y_train)
Y_test = knn_clf.predict(X_test)
knn_result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test.astype(np.int32)})
# svm model
svm_clf = svm.SVC()
svm_clf.fit(X_train, Y_train)
Y_test = svm_clf.predict(X_test)
svm_result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test.astype(np.int32)})
# random forest
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, Y_train)
Y_test = rf_clf.predict(X_test)
rf_result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test.astype(np.int32)})
# decision tree
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_train, Y_train)
Y_test = dt_clf.predict(X_test)
dt_result = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': Y_test.astype(np.int32)})
# voting predict
result = pd.DataFrame({'PassengerId': passenger_ids})
# Number of models (out of five) that predict survival for each passenger
result['Survived_Vote'] = lr_result['Survived'] + knn_result['Survived'] + svm_result['Survived'] + rf_result['Survived'] + dt_result['Survived']
# Majority vote: survived when at least three of the five models say so.
result['Survived'] = result['Survived_Vote'].apply(lambda x: 1 if x > 2 else 0)
result.drop(['Survived_Vote'], inplace=True, axis=1)
# Write to the current directory: that is where results are saved as output,
# whereas '../input' is the directory the input data files are read from.
result.to_csv("voting.csv", index=False)
The accuracies of the voting model with 3 models and with 5 models on Kaggle.com are both 0.78468.