import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import math
import re
import optuna
import lightgbm as lgb
import plotly.express as px
import statsmodels.api as sm
import h2o
from tqdm.auto import tqdm
from copy import deepcopy
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score, roc_curve,
                             confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay)
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
df = pd.read_csv('Telco-Customer-Churn.csv')
pd.options.display.float_format = '{:,.2f}'.format
def summary(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df) * 100
    summ['#duplicated'] = df.duplicated().sum()  # scalar count, not a per-row Series
    summ['%duplicated'] = df.duplicated().sum() / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['average'] = desc['mean'].values
    summ['standard_deviation'] = desc['std'].values
    summ['first value'] = df.iloc[0].values
    summ['second value'] = df.iloc[1].values
    summ['third value'] = df.iloc[2].values
    return summ
summary(df).style.background_gradient(cmap='YlOrBr')
From the summary one can deduce that customerID should be dropped; MonthlyCharges, TotalCharges and tenure are numerical, and the rest are categorical. Given their low cardinality, the categorical columns will be one-hot encoded. PCA could then be applied to reduce the dimensionality of the resulting columns.
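As a rough sketch of that PCA idea (hypothetical: `encoded` stands for a frame holding only the one-hot-encoded dummy columns, and the 95% variance target is illustrative):
from sklearn.decomposition import PCA
# Hypothetical sketch: compress the dummy columns while keeping 95% of the variance
pca = PCA(n_components=0.95)
reduced = pca.fit_transform(encoded)
print(f'{encoded.shape[1]} dummy columns -> {reduced.shape[1]} components')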
Identify Categorical and Numerical Features
target_col = df['Churn']
# Unique value counts for each column
unique_counts = df.nunique()
# Threshold to distinguish continuous and categorical
threshold = 10
num_cols = unique_counts[unique_counts > threshold].index.tolist()
cat_vars = unique_counts[unique_counts <= threshold].index.tolist()
# Removing the 'outcome' from categorical since it's our target variable
if 'Churn' in cat_vars:
    cat_vars.remove('Churn')
if 'customerID' in num_cols:
    num_cols.remove('customerID')
cat_vars
num_cols
def toNum(num_cols, df):
    # Coerce to numeric; non-numeric strings (e.g. blank TotalCharges entries) become NaN
    for col_name in num_cols:
        df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
toNum(num_cols, df)
df.isnull().sum()
Remove Missing Values and Duplicates
train_missing_values = df.isna()
def plot_missing(data, set_type='Training'):
    plt.figure(figsize=(8, 6))
    sns.heatmap(data.T, cmap=['#5F68B9', '#EAECFE'], cbar=False, yticklabels=True)
    plt.title(f'Missing Values | {set_type} Set')
    plt.show()
plot_missing(train_missing_values)
Given the low number of missing values (11), we will drop the affected rows instead of imputing them with a statistic (mean/mode/median).
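Had the gap been larger, imputation would have been the safer route; a minimal sketch of that alternative with the already-imported SimpleImputer (median strategy chosen purely for illustration, not what we run here):
# Sketch of the imputation alternative (not executed in this notebook)
imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])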
def drop_missing_rows(df):
    if df.isnull().any().any():
        print("Missing values found in the dataframe.")
        df_original = df
        df = df.dropna(axis=0)
        dropped_rows = len(df_original) - len(df)
        print(f"Dropped {dropped_rows} rows.")
    else:
        print("No missing values found in the dataframe.")
    return df
df=drop_missing_rows(df)
def count_duplicate_rows(df):
    duplicate_rows = df[df.duplicated()]
    if not duplicate_rows.empty:
        print("Duplicate rows found in the dataframe.")
        count = len(duplicate_rows)
        print(f"Count of duplicate rows: {count}.")
    else:
        print("No duplicate rows found in the dataframe.")
    return df
df = count_duplicate_rows(df)
# Get target count and percentage per unique value
outcome_counts = df['Churn'].value_counts()
outcome_percentages = df['Churn'].value_counts(normalize=True) * 100
# Plotting
plt.figure(figsize=(8, 6))
plt.pie(outcome_counts, labels=outcome_counts.index, autopct=lambda p: f'{p:.1f}% ({int(p*len(df)/100)})', startangle=90)
plt.title('Target Count and % per Unique Value')
plt.legend(outcome_counts.index, loc="best")
plt.show()
We could potentially apply SMOTE to increase the proportion of 'Yes' churn samples.
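A lighter alternative to resampling is to reweight the minority class; a minimal sketch assuming an XGBoost model and a target `y` already encoded as 0/1 (neither exists yet at this point in the notebook):
from xgboost import XGBClassifier
# Hypothetical sketch: weight positives by the negative/positive class ratio
ratio = (y == 0).sum() / (y == 1).sum()
clf = XGBClassifier(scale_pos_weight=ratio, random_state=1234)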
Distribution of Target in Numerical Attributes
df_with_churn = df[num_cols].copy()
df_with_churn['Churn'] = df['Churn']
# Plot pairplot
sns.pairplot(data=df_with_churn, hue='Churn', corner=True)
plt.figure(figsize=(14, len(num_cols) * 2.5))
for idx, column in enumerate(num_cols):
    # One distribution plot per numerical column, split by the target
    plt.subplot(len(num_cols), 1, idx + 1)
    sns.histplot(x=column, hue="Churn", data=df, bins=30, kde=True, palette='YlOrRd')
    plt.title(f"{column} Distribution for Churn")
plt.tight_layout()
plt.show()
Distribution of Categorical Features
plt.figure(figsize=(14, len(cat_vars) * 2.5))
for idx, column in enumerate(cat_vars):
    plt.subplot(len(cat_vars) // 2 + len(cat_vars) % 2, 2, idx + 1)
    sns.countplot(x=column, hue="Churn", data=df, palette='YlOrRd')
    plt.title(f"{column} Countplot by Churn")
    plt.ylim(0, df[column].value_counts().max() + 10)
plt.tight_layout()
plt.show()
Correlation of Numerical Columns
corr_matrix = df[num_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(corr_matrix, mask=mask, annot=True, square=True, cmap='OrRd', cbar=False, linewidths=1)
title = heatmap.set_title("Correlation Heatmap of Numerical Features", weight='bold', size=14)
title.set_position([0.35, 1.05])
plt.show()
# Encode binary variables
df['Churn'] = df['Churn'].map({'No':0, 'Yes':1})
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], drop_first=True)
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res
for feature in cat_vars:
    df = encode_and_bind(df, feature)
df.head()
Let's train a baseline model with XGBoost on the raw (unresampled) data.
df.drop('customerID', axis=1, inplace=True)
# Generate x and y sets
x = df.drop('Churn', axis=1).values
y = df['Churn']
# Splitting the dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
from xgboost import XGBClassifier
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_train,y_train)
The evaluation metric is based on the Area Under the ROC Curve (AUC).
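Since a single train/test split can be noisy, a cross-validated estimate is a useful sanity check; a minimal sketch with the already-imported StratifiedKFold (5 folds chosen for illustration):
from sklearn.model_selection import cross_val_score
# Sketch: stratified 5-fold AUC for the same classifier on the training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
scores = cross_val_score(XGBClassifier(random_state=1234), x_train, y_train, cv=cv, scoring='roc_auc')
print(f'CV AUC: {scores.mean():.3f} +/- {scores.std():.3f}')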
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Note: an ROC built from hard class labels reflects a single operating point;
# the probability-based curve follows below
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
def show_confusion(y_pred, y_true):
    labels = sorted(list(set(y_true)))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(df_cm, annot=True, fmt='g', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()
show_confusion(y_pred, y_test)
Now using predicted probabilities instead of hard labels to draw the ROC curve:
y_pred_prob = classifier.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='XGBoost', color='r')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve (AUC = {:.2f})'.format(roc_auc), fontsize=16)
plt.legend(loc="lower right")
plt.show()
Let's apply SMOTE
SMOTE
# Importing packages for SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
sm = SMOTE(sampling_strategy='auto', random_state=1234)
x_sm, y_sm = sm.fit_resample(x_train, y_train)
print(Counter(y_train))
print(Counter(y_sm))
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm, y_sm)
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
SMOTE alone does not improve the results.
SMOTE and Undersampling
over = BorderlineSMOTE(sampling_strategy=0.4, random_state=1234)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=1234)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
x_sm_us, y_sm_us = pipeline.fit_resample(x_train, y_train)
print(Counter(y_train))
print(Counter(y_sm_us))
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm_us, y_sm_us)
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
y_pred_prob = classifier.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='XGBoost', color='r')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve (AUC = {:.2f})'.format(roc_auc), fontsize=16)
plt.legend(loc="lower right")
plt.show()
SMOTE combined with undersampling gives a better result, so we will generate a new DataFrame from this data.
# Names of the independent variables
feature_names = list(df.drop('Churn', axis=1).columns)
# Concatenate train (with resampling) and test sets to build the new dataframe
sm_us_x = np.concatenate((x_sm_us, x_test))
sm_us_y = np.concatenate((y_sm_us, y_test))
sm_us_df = pd.DataFrame(np.column_stack([sm_us_y, sm_us_x]), columns=['Churn'] + feature_names)
sm_us_df.head()
# Get correlation of 'Churn' with the other variables
plt.figure(figsize=(15, 8))
sm_us_df.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')
plt.show()
Now we are going to select the top features for the model, because one-hot encoding has left us with too many variables. There are several ways to do this: manually selecting by correlation (for example tenure, two-year contract, internet service; see above), applying PCA to extract the top components, or training a model and keeping its most important variables.
For this instance we are going to train a Random Forest model and use its feature importances to select the best columns.
Feature Importance with Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=1234)
rf_clf.fit(x_sm_us, y_sm_us)
rf_clf.feature_importances_
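Impurity-based importances from tree ensembles can be biased toward high-cardinality features; as an alternative sketch (not used below), scikit-learn's permutation importance measures the AUC drop when each feature is shuffled on held-out data:
from sklearn.inspection import permutation_importance
# Sketch: permutation importance of the fitted forest on the test split
perm = permutation_importance(rf_clf, x_test, y_test, n_repeats=10, random_state=1234, scoring='roc_auc')
top = np.argsort(perm.importances_mean)[-10:]
print(np.array(feature_names)[top])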
features_to_plot = 25
importances = rf_clf.feature_importances_
indices = np.argsort(importances)
best_vars = np.array(feature_names)[indices][-features_to_plot:]
values = importances[indices][-features_to_plot:]
y_ticks = np.arange(0, features_to_plot)
fig, ax = plt.subplots()
ax.barh(y_ticks, values)
ax.set_yticks(y_ticks)  # set tick positions before labels
ax.set_yticklabels(best_vars)
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()
We select the 10 best features.
best_vars = best_vars[-10:]
best_vars
Now, with the resampled dataset and the selected features, we are going to train an XGBoost model and, due to the lack of time, search for the best parameters with H2O AutoML. We used XGBoost, but with more time we could try other supervised classification algorithms such as logistic regression, Random Forest, CatBoost, LightGBM, SVM, or AdaBoost; a quick LightGBM baseline is sketched below.
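For reference, a minimal sketch of one of those alternatives with the already-imported lightgbm, trained on the resampled data with default parameters (purely illustrative, using all features):
lgb_clf = lgb.LGBMClassifier(random_state=1234)
lgb_clf.fit(x_sm_us, y_sm_us)
lgb_auc = roc_auc_score(y_test, lgb_clf.predict_proba(x_test)[:, 1])
print(f'LightGBM AUC: {lgb_auc:.3f}')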
from h2o.automl import H2OAutoML
h2o.init()
hf = h2o.H2OFrame(sm_us_df[['Churn'] + list(best_vars)])
hf.head()
hf['Churn'] = hf['Churn'].asfactor()
predictors = hf.drop('Churn').columns
response = 'Churn'
# Split into train and test
train, valid = hf.split_frame(ratios=[.8], seed=1234)
We execute AutoML with H2O on the resampled dataset, using only the final variables.
# Add stopping criteria: max number of models and max runtime
# We exclude DeepLearning algorithms because they are too slow
aml = H2OAutoML(
    max_models=20,
    max_runtime_secs=300,
    seed=1234,
    exclude_algos=["DeepLearning"]
)
# Train the model
aml.train(x=predictors,
          y=response,
          training_frame=train,
          validation_frame=valid)
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=5) # Print the first 5 rows
lb = lb.as_data_frame()
lb['model_type'] = lb['model_id'].apply(lambda x: x.split('_')[0])
fig = px.bar(
    lb,
    x='model_id',
    y='auc',
    color='model_type'
)
fig.show()
print('The model performance in Accuracy: {}'.format(aml.leader.accuracy(valid=True)))
print('The model performance in AUC: {}'.format(aml.leader.auc(valid=True)))
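From here the leader model could be used for scoring or persisted; a minimal sketch with H2O's standard API (output path illustrative):
# Score the validation frame with the leading model and save it to disk
preds = aml.leader.predict(valid)
preds.head()
model_path = h2o.save_model(model=aml.leader, path='.', force=True)
print(model_path)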