import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import math
import re
import optuna
import lightgbm as lgb
import plotly.express as px
import statsmodels.api as sm
import h2o
from tqdm.auto import tqdm
from copy import deepcopy
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import (f1_score, roc_auc_score, accuracy_score, roc_curve,
                             confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay)
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
df = pd.read_csv('Telco-Customer-Churn.csv')
pd.options.display.float_format = '{:,.2f}'.format
def summary(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df) * 100
    summ['#duplicated'] = df.duplicated().sum()  # scalar count, not a per-row Series
    summ['%duplicated'] = df.duplicated().sum() / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['average'] = desc['mean'].values
    summ['standard_deviation'] = desc['std'].values
    summ['first value'] = df.iloc[0].values
    summ['second value'] = df.iloc[1].values
    summ['third value'] = df.iloc[2].values
    return summ
summary(df).style.background_gradient(cmap='YlOrBr')
From the summary one can deduce that customerID should be dropped; MonthlyCharges, TotalCharges and tenure are numerical, and the rest are categorical. Given their low cardinality, the categorical columns will be one-hot encoded. PCA could then be applied to reduce the dimensionality of the resulting columns.
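As a rough sketch of that PCA idea (hypothetical: `encoded` stands for a frame holding only the one-hot-encoded dummy columns, and the 95% variance target is illustrative):
from sklearn.decomposition import PCA
# Hypothetical sketch: compress the dummy columns while keeping 95% of the variance
pca = PCA(n_components=0.95)
reduced = pca.fit_transform(encoded)
print(f'{encoded.shape[1]} dummy columns -> {reduced.shape[1]} components')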
Identify Categorical and Numerical Features
target_col = df['Churn']
# Unique value counts for each column
unique_counts = df.nunique()
# Threshold to distinguish continuous and categorical
threshold = 10
num_cols = unique_counts[unique_counts > threshold].index.tolist()
cat_vars = unique_counts[unique_counts <= threshold].index.tolist()
# Removing the 'outcome' from categorical since it's our target variable
if 'Churn' in cat_vars:
    cat_vars.remove('Churn')
if 'customerID' in num_cols:
    num_cols.remove('customerID')
cat_vars
num_cols
def toNum(num_cols, df):
    # Coerce to numeric; non-numeric strings (e.g. blank TotalCharges entries) become NaN
    for col_name in num_cols:
        df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
toNum(num_cols, df)
df.isnull().sum()
Remove Missing Values and Duplicates
train_missing_values = df.isna()
def plot_missing(data, set_type='Training'):
    plt.figure(figsize=(8, 6))
    sns.heatmap(data.T, cmap=['#5F68B9', '#EAECFE'], cbar=False, yticklabels=True)
    plt.title(f'Missing Values | {set_type} Set')
    plt.show()
plot_missing(train_missing_values)
Given the low number of missing values (11), we will drop the affected rows instead of imputing them with a statistic (mean/mode/median).
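Had the gap been larger, imputation would have been the safer route; a minimal sketch of that alternative with the already-imported SimpleImputer (median strategy chosen purely for illustration, not what we run here):
# Sketch of the imputation alternative (not executed in this notebook)
imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])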
def drop_missing_rows(df):
    if df.isnull().any().any():
        print("Missing values found in the dataframe.")
        df_original = df
        df = df.dropna(axis=0)
        dropped_rows = len(df_original) - len(df)
        print(f"Dropped {dropped_rows} rows.")
    else:
        print("No missing values found in the dataframe.")
    return df
df=drop_missing_rows(df)
def count_duplicate_rows(df):
    duplicate_rows = df[df.duplicated()]
    if not duplicate_rows.empty:
        print("Duplicate rows found in the dataframe.")
        count = len(duplicate_rows)
        print(f"Count of duplicate rows: {count}.")
    else:
        print("No duplicate rows found in the dataframe.")
    return df
df = count_duplicate_rows(df)
# Get target count and percentage per unique value
outcome_counts = df['Churn'].value_counts()
outcome_percentages = df['Churn'].value_counts(normalize=True) * 100
# Plotting
plt.figure(figsize=(8, 6))
plt.pie(outcome_counts, labels=outcome_counts.index, autopct=lambda p: f'{p:.1f}% ({int(p*len(df)/100)})', startangle=90)
plt.title('Target Count and % per Unique Value')
plt.legend(outcome_counts.index, loc="best")
plt.show()
We could potentially apply SMOTE to increase the proportion of 'Yes' churn samples.
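A lighter alternative to resampling is to reweight the minority class; a minimal sketch assuming an XGBoost model and a target `y` already encoded as 0/1 (neither exists yet at this point in the notebook):
from xgboost import XGBClassifier
# Hypothetical sketch: weight positives by the negative/positive class ratio
ratio = (y == 0).sum() / (y == 1).sum()
clf = XGBClassifier(scale_pos_weight=ratio, random_state=1234)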
Distribution of Target in Numerical Attributes
df_with_churn = df[num_cols].copy()
df_with_churn['Churn'] = df['Churn']
# Plot pairplot
sns.pairplot(data=df_with_churn, hue='Churn', corner=True)
plt.figure(figsize=(14, len(num_cols) * 2.5))
for idx, column in enumerate(num_cols):
    # One distribution plot per numerical column, split by the target
    plt.subplot(len(num_cols), 1, idx + 1)
    sns.histplot(x=column, hue="Churn", data=df, bins=30, kde=True, palette='YlOrRd')
    plt.title(f"{column} Distribution for Churn")
plt.tight_layout()
plt.show()
Distribution of Categorical Features
plt.figure(figsize=(14, len(cat_vars) * 2.5))
for idx, column in enumerate(cat_vars):
    plt.subplot(len(cat_vars) // 2 + len(cat_vars) % 2, 2, idx + 1)
    sns.countplot(x=column, hue="Churn", data=df, palette='YlOrRd')
    plt.title(f"{column} Countplot by Churn")
    plt.ylim(0, df[column].value_counts().max() + 10)
plt.tight_layout()
plt.show()
Correlation of Numerical Columns
corr_matrix = df[num_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(corr_matrix, mask=mask, annot=True, square=True, cmap='OrRd', cbar=False, linewidths=1)
title = heatmap.set_title("Correlation Heatmap of Numerical Features", weight='bold', size=14)
title.set_position([0.35, 1.05])
plt.show()
# Encode binary variables
df['Churn'] = df['Churn'].map({'No':0, 'Yes':1})
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], drop_first=True)
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return res
for feature in cat_vars:
    df = encode_and_bind(df, feature)
df.head()
Let's train a baseline model with XGBoost on the raw (unresampled) data.
df.drop('customerID', axis=1, inplace=True)
# Generate x and y sets
x = df.drop('Churn', axis=1).values
y = df['Churn']
# Splitting the dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
from xgboost import XGBClassifier
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_train,y_train)
The evaluation metric is based on the Area Under the ROC Curve (AUC).
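Since a single train/test split can be noisy, a cross-validated estimate is a useful sanity check; a minimal sketch with the already-imported StratifiedKFold (5 folds chosen for illustration):
from sklearn.model_selection import cross_val_score
# Sketch: stratified 5-fold AUC for the same classifier on the training data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
scores = cross_val_score(XGBClassifier(random_state=1234), x_train, y_train, cv=cv, scoring='roc_auc')
print(f'CV AUC: {scores.mean():.3f} +/- {scores.std():.3f}')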
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Note: an ROC built from hard class labels reflects a single operating point;
# the probability-based curve follows below
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
def show_confusion(y_pred, y_true):
    labels = sorted(list(set(y_true)))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(df_cm, annot=True, fmt='g', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()
show_confusion(y_pred, y_test)
Now using predicted probabilities instead of hard labels to draw the ROC curve:
y_pred_prob = classifier.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='XGBoost', color='r')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve (AUC = {:.2f})'.format(roc_auc), fontsize=16)
plt.legend(loc="lower right")
plt.show()
Let's apply SMOTE
SMOTE
# Importing packages for SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
sm = SMOTE(sampling_strategy='auto', random_state=1234)
x_sm, y_sm = sm.fit_resample(x_train, y_train)
print(Counter(y_train))
print(Counter(y_sm))
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm, y_sm)
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
SMOTE alone does not improve the results.
SMOTE and Undersampling
over = BorderlineSMOTE(sampling_strategy=0.4, random_state=1234)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=1234)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
x_sm_us, y_sm_us = pipeline.fit_resample(x_train, y_train)
print(Counter(y_train))
print(Counter(y_sm_us))
classifier = XGBClassifier(random_state=1234)
classifier.fit(x_sm_us, y_sm_us)
# Predicting the test set
y_pred = classifier.predict(x_test)
# Making the confusion matrix and calculating accuracy score
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.auc(fpr, tpr)
print(f'Area Under the ROC Curve: {auc:.2f}')
y_pred_prob = classifier.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='XGBoost', color='r')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve (AUC = {:.2f})'.format(roc_auc), fontsize=16)
plt.legend(loc="lower right")
plt.show()
SMOTE combined with undersampling gives a better result, so we will generate a new DataFrame from this data.
# Names of the independent variables
feature_names = list(df.drop('Churn', axis=1).columns)
# Concatenate train (with resampling) and test sets to build the new dataframe
sm_us_x = np.concatenate((x_sm_us, x_test))
sm_us_y = np.concatenate((y_sm_us, y_test))
sm_us_df = pd.DataFrame(np.column_stack([sm_us_y, sm_us_x]), columns=['Churn'] + feature_names)
sm_us_df.head()
# Get correlation of 'Churn' with the other variables
plt.figure(figsize=(15, 8))
sm_us_df.corr()['Churn'].sort_values(ascending=False).plot(kind='bar')
plt.show()
Now we are going to select the top features for the model, because one-hot encoding has left us with too many variables. There are several ways to do this: manually selecting by correlation (for example tenure, two-year contract, internet service; see above), applying PCA to extract the top components, or training a model and keeping its most important variables.
For this instance we are going to train a Random Forest model and use its feature importances to select the best columns.
Feature Importance with Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=1234)
rf_clf.fit(x_sm_us, y_sm_us)
rf_clf.feature_importances_
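Impurity-based importances from tree ensembles can be biased toward high-cardinality features; as an alternative sketch (not used below), scikit-learn's permutation importance measures the AUC drop when each feature is shuffled on held-out data:
from sklearn.inspection import permutation_importance
# Sketch: permutation importance of the fitted forest on the test split
perm = permutation_importance(rf_clf, x_test, y_test, n_repeats=10, random_state=1234, scoring='roc_auc')
top = np.argsort(perm.importances_mean)[-10:]
print(np.array(feature_names)[top])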
features_to_plot = 25
importances = rf_clf.feature_importances_
indices = np.argsort(importances)
best_vars = np.array(feature_names)[indices][-features_to_plot:]
values = importances[indices][-features_to_plot:]
y_ticks = np.arange(0, features_to_plot)
fig, ax = plt.subplots()
ax.barh(y_ticks, values)
ax.set_yticks(y_ticks)  # set tick positions before labels
ax.set_yticklabels(best_vars)
ax.set_title("Random Forest Feature Importances")
fig.tight_layout()
plt.show()
We select the 10 best features.
best_vars = best_vars[-10:]
best_vars
Now, with the resampled dataset and the selected features, we are going to train an XGBoost model and, due to the lack of time, search for the best parameters with H2O AutoML. We used XGBoost, but with more time we could try other supervised classification algorithms such as logistic regression, Random Forest, CatBoost, LightGBM, SVM, or AdaBoost; a quick LightGBM baseline is sketched below.
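For reference, a minimal sketch of one of those alternatives with the already-imported lightgbm, trained on the resampled data with default parameters (purely illustrative, using all features):
lgb_clf = lgb.LGBMClassifier(random_state=1234)
lgb_clf.fit(x_sm_us, y_sm_us)
lgb_auc = roc_auc_score(y_test, lgb_clf.predict_proba(x_test)[:, 1])
print(f'LightGBM AUC: {lgb_auc:.3f}')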
from h2o.automl import H2OAutoML
h2o.init()
hf = h2o.H2OFrame(sm_us_df[['Churn'] + list(best_vars)])
hf.head()
hf['Churn'] = hf['Churn'].asfactor()
predictors = hf.drop('Churn').columns
response = 'Churn'
# Split into train and test
train, valid = hf.split_frame(ratios=[.8], seed=1234)
We execute AutoML with H2O on the resampled dataset, using only the final variables.
# Add stopping criteria: max number of models and max runtime
# We exclude DeepLearning algorithms because they are too slow
aml = H2OAutoML(
    max_models=20,
    max_runtime_secs=300,
    seed=1234,
    exclude_algos=["DeepLearning"]
)
# Train the model
aml.train(x=predictors,
          y=response,
          training_frame=train,
          validation_frame=valid)
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=5) # Print the first 5 rows
lb = lb.as_data_frame()
lb['model_type'] = lb['model_id'].apply(lambda x: x.split('_')[0])
fig = px.bar(
    lb,
    x='model_id',
    y='auc',
    color='model_type'
)
fig.show()
print('The model performance in Accuracy: {}'.format(aml.leader.accuracy(valid=True)))
print('The model performance in AUC: {}'.format(aml.leader.auc(valid=True)))
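From here the leader model could be used for scoring or persisted; a minimal sketch with H2O's standard API (output path illustrative):
# Score the validation frame with the leading model and save it to disk
preds = aml.leader.predict(valid)
preds.head()
model_path = h2o.save_model(model=aml.leader, path='.', force=True)
print(model_path)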