# Imports

In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Load Dataset

In [32]:
# Load the Titanic dataset
df = sns.load_dataset('titanic')

# Deal With Missing Values
* When continuous column is approximately normal use mean imputation
* When continuous column is skewed or has outliers use median imputation
* Use mode imputation for categorical
* If the column contains too high a percentage of missing values, remove the column

### Mean Imputation

In [45]:
# Calculate the mean of the column
col_names = ['age', 'fare'] # replace this

print(df[col_names].mean)


# Replace missing values in the 'age' column with the mean age
df[col_names] = df[col_names].fillna(df[col_names].mean())

<bound method DataFrame.mean of       age     fare
0    22.0   7.2500
1    38.0  71.2833
2    26.0   7.9250
3    35.0  53.1000
4    35.0   8.0500
..    ...      ...
886  27.0  13.0000
887  19.0  30.0000
888  28.0  23.4500
889  26.0  30.0000
890  32.0   7.7500

[891 rows x 2 columns]>


### Median Imputation

In [33]:
# Calculate the mean of the column
col_names = ['age', 'fare'] # replace this

print(df[col_names].median())


# Replace missing values in the 'age' column with the mean age
df[col_names] = df[col_names].fillna(df[col_names].median())

fare    14.4542
dtype: float64


### Mode Imputation

In [34]:
# Calculate the mean of the column
col_names = ['embarked', 'embark_town'] # replace this

print(df[col_names].mode().iloc[0])

# Replace missing values in the 'age' column with the mean age
df[col_names] = df[col_names].fillna(df[col_names].mode().iloc[0])

embarked                 S
embark_town    Southampton
Name: 0, dtype: object


### Imputation Based on Groups

In [35]:
imp_col = ['age']
group_col = ['who']

# Calculate the mean age for each group in the 'who' column
mean_age_per_group = df.groupby(group_col)[imp_col].mean()

# Impute the missing values in the 'age' column with the mean age for each group
df[imp_col] = df.groupby(group_col)[imp_col].transform(lambda x: x.fillna(x.mean()))

### Remove Columns With Too Many Missing Values

In [36]:
# Remove the columns
col_names = ['deck']
df.drop(col_names, axis=1, inplace=True)

### Check Missing Values are Fixed

In [37]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Calculate the percentage of missing values in each column
missing_percentage = round((df.isnull().sum() / len(df)) * 100, 2)

# Print the number of missing values and percentages in each column
print(pd.concat([missing_values, missing_percentage], axis=1))

            0    1
survived    0  0.0
pclass      0  0.0
sex         0  0.0
age         0  0.0
sibsp       0  0.0
parch       0  0.0
fare        0  0.0
embarked    0  0.0
adult_male  0  0.0
alone       0  0.0


# Column Type Altering

In [None]:
# Change the 'age' column to numeric (float)
df['age'] = df['age'].astype(float)

# Change the 'who' column to categorical (pandas categorical data type)
df['who'] = df['who'].astype('category')

# Change the 'embarked' column to string (object)
df['embarked'] = df['embarked'].astype(str)

# Replace Values

In [13]:
df['embarked'] = df['embarked'].replace('S', 's')
df.loc[df['age'] > 50, 'age'] = df['age'].mean()

In [38]:
df.to_csv('C:/Users/casey/OneDrive/Documents/Data_Science/Machine_Learning/Supervised_Learning/Data/Clean_Data_Titanic.csv', index=False)