## Data Science and Big Data
## Machine Learning
## Python for Data Science
## Machine Learning and Deep Learning
## Data Visualization
## Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objects as go
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
data = pd.read_csv('C:/Users/Droidex/Dropbox/Documentos/Diplomado/Proyecto Final/Churn_Modelling.csv')
print(data.shape)
data.head()
(10000, 14)
| | RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
# Data types
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   RowNumber        10000 non-null  int64
 1   CustomerId       10000 non-null  int64
 2   Surname          10000 non-null  object
 3   CreditScore      10000 non-null  int64
 4   Geography        10000 non-null  object
 5   Gender           10000 non-null  object
 6   Age              10000 non-null  int64
 7   Tenure           10000 non-null  int64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64
 10  HasCrCard        10000 non-null  int64
 11  IsActiveMember   10000 non-null  int64
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
There are three column dtypes: int64 (9 columns), object (3), and float64 (2).
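As a quick sanity check, that dtype mix can be tallied directly (a minimal sketch over the `data` frame loaded above):
# Tally of column dtypes; expected: int64 x9, object x3, float64 x2
print(data.dtypes.value_counts())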
# Percentage of null values per column
print(round(((data.isnull().sum() / len(data))*100),2))
RowNumber          0.0
CustomerId         0.0
Surname            0.0
CreditScore        0.0
Geography          0.0
Gender             0.0
Age                0.0
Tenure             0.0
Balance            0.0
NumOfProducts      0.0
HasCrCard          0.0
IsActiveMember     0.0
EstimatedSalary    0.0
Exited             0.0
dtype: float64
The percentage of nulls is 0.0 for every column: there is no missing data.
The only categorical attributes are Geography and Gender. Surname, CustomerId, and RowNumber are unique identifiers, so they are dropped from the analysis.
data = data.drop(['RowNumber', 'Surname', 'CustomerId' ], axis=1)
data.head()
| | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
data.describe()
| | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
|---|---|---|---|---|---|---|---|---|---|
| count | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 |
| mean | 650.528800 | 38.921800 | 5.012800 | 76485.889288 | 1.530200 | 0.70550 | 0.515100 | 100090.239881 | 0.203700 |
| std | 96.653299 | 10.487806 | 2.892174 | 62397.405202 | 0.581654 | 0.45584 | 0.499797 | 57510.492818 | 0.402769 |
| min | 350.000000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 11.580000 | 0.000000 |
| 25% | 584.000000 | 32.000000 | 3.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 51002.110000 | 0.000000 |
| 50% | 652.000000 | 37.000000 | 5.000000 | 97198.540000 | 1.000000 | 1.00000 | 1.000000 | 100193.915000 | 0.000000 |
| 75% | 718.000000 | 44.000000 | 7.000000 | 127644.240000 | 2.000000 | 1.00000 | 1.000000 | 149388.247500 | 0.000000 |
| max | 850.000000 | 92.000000 | 10.000000 | 250898.090000 | 4.000000 | 1.00000 | 1.000000 | 199992.480000 | 1.000000 |
data.hist(edgecolor='black', linewidth=1.2, figsize=(18,12))
(Output: a 3x3 grid of histograms titled CreditScore, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary, and Exited.)
The distribution of Balance is not quite normal (the 25th percentile is 0, so a large mass of accounts sits at zero balance), and EstimatedSalary and Tenure also depart from normality.
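That visual impression can be checked numerically; as a minimal sketch, the sample skewness of the three columns (values far from 0 indicate asymmetry):
# Skewness of the distributions flagged above; 0 would indicate symmetry
print(data[['Balance', 'EstimatedSalary', 'Tenure']].skew())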
fig = go.Figure(data=[go.Pie(labels= data['Geography'], title='Lugares')])
fig.show()
Most of these records are customers located in France, counting both those who churned and those who stayed.
churn = data.loc[data['Exited'] == 1]
stayed = data.loc[data['Exited'] == 0]
# Overlay the per-country counts of churned vs. retained customers
churn['Geography'].hist(color='r', alpha=0.8, label='Left')
stayed['Geography'].hist(color='g', alpha=0.2, label='Stayed')
plt.legend()
In every country there are far more customers who stayed than who left. As an early read, the bank did not do badly by normal standards.
fig = go.Figure(data=[go.Pie(labels= churn['Geography'], title='Lugares - Clientes que se Fueron')])
fig.show()
Among the customers who churned, the counts per country are fairly similar.
fig = go.Figure(data=[go.Pie(labels= stayed['Geography'], title='Lugares - Clientes que se Quedaron')])
fig.show()
Among the customers who stayed, France holds the largest share.
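To put exact numbers on these pie charts, the churn rate within each country can be tabulated; a minimal sketch with pandas:
# Share of retained (0) vs. churned (1) customers within each country
print(pd.crosstab(data['Geography'], data['Exited'], normalize='index').round(3))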
# Separate the independent columns from the dependent one to continue the exploratory analysis
X_data = data.drop(['Exited'], axis= 1)
Y_data = data['Exited']
plt.subplot(2, 3, 1)
plt.pie(Y_data.value_counts(), autopct = '%.2f')
plt.title('Exited')
plt.subplot(2, 3, 2)
plt.pie(X_data['Geography'].value_counts(), autopct = '%.2f')
plt.title('Geography')
plt.subplot(2, 3, 3)
plt.pie(X_data['Gender'].value_counts(), autopct = '%.2f')
plt.title('Gender')
plt.tight_layout(pad=0.4, w_pad=1, h_pad=1.0)
plt.show()
# Output classes and their respective counts for the Exited column
Y_data.value_counts().plot.pie(autopct = '%.2f')
Y_data.value_counts()
0    7963
1    2037
Name: Exited, dtype: int64
The target classes are imbalanced: 7,963 customers stayed versus 2,037 who churned (roughly 80/20), so the dataset is not balanced for modelling.
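Expressed as proportions (a minimal sketch):
# Class proportions of the target: roughly 0.80 stayed vs. 0.20 churned
print(Y_data.value_counts(normalize=True).round(4))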
Before balancing the data, keep in mind that the categorical columns 'Gender' and 'Geography' should be numeric so they can be related more easily to the other columns; this is done with the get_dummies function:
X_data= pd.get_dummies(X_data, columns= ['Geography', 'Gender'], drop_first=True)
Now we apply oversampling to balance the two classes of the Exited column:
from imblearn.over_sampling import RandomOverSampler
# sampling_strategy=1 oversamples the minority class up to a 1:1 ratio
ros = RandomOverSampler(sampling_strategy=1)
X_data, Y_data = ros.fit_resample(X_data, Y_data)
Y_data.value_counts().plot.pie(autopct = '%.2f')
Y_data.value_counts()
X_data.shape
X_data.head()
| | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Geography_Germany | Geography_Spain | Gender_Male |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 0 | 0 | 0 |
| 1 | 608 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 | 1 | 0 |
| 2 | 502 | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 0 | 0 | 0 |
| 3 | 699 | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 | 0 | 0 |
| 4 | 850 | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 | 1 | 0 |
ax = plt.axes()
plt.gcf().set_size_inches(3, 10)
# numeric_only=True skips the object columns (Geography, Gender) that corr() cannot handle
sns.heatmap(data.corr(numeric_only=True)[['Exited']].sort_values('Exited', ascending=False), cmap='Blues', annot=True)
ax.set_title('Features Correlations')
plt.show()
This correlation matrix suggests that Age and Balance probably have the greatest effect on whether a customer churns.
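The same ranking can be read off as exact values; a minimal sketch that keeps the sign of each correlation so the direction of the relationship stays visible:
# Correlation of every numeric feature with Exited, strongest first by magnitude
corr_exited = data.corr(numeric_only=True)['Exited'].drop('Exited')
print(corr_exited.reindex(corr_exited.abs().sort_values(ascending=False).index))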
datanum= pd.get_dummies(data, columns= ['Geography', 'Gender'], drop_first=True)
ax = plt.axes()
plt.gcf().set_size_inches(5,10)
sns.heatmap(datanum.corr()[['Exited']].sort_values('Exited', ascending=False), cmap='Blues', annot=True)
ax.set_title('Features Correlations')
plt.show()
datanum.shape
(10000, 12)
After converting the categorical data to numeric, Age and Balance still show an effect, now joined by Geography_Germany (customers located in Germany).
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=1)
print(X_data.shape, Y_data.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(15926, 11) (15926,)
(12740, 11) (12740,)
(3186, 11) (3186,)
Since some columns hold 0/1 values while others hold much larger values, it is advisable to standardize them so that the deep learning algorithm converges quickly:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training set only, then apply it to the test set to avoid leakage
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Logistic regression with unscaled data:
from sklearn.linear_model import LogisticRegression
model_le = LogisticRegression(random_state=42, max_iter=10000)
model_le.fit(X_train, y_train)
parameters = {'C': [0.8, 0.9, 1, 1.1, 1.2]}
model_le_grid = GridSearchCV(model_le, parameters, cv=10, verbose=1, n_jobs=-1).fit(X_train, y_train)
print('Best parameters:', model_le_grid.best_params_)
print('Logistic Regression Train score:', model_le.score(X_train, y_train)*100)
print('Logistic Regression Cross-validation score:', model_le_grid.best_score_*100)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters: {'C': 0.8}
Logistic Regression Train score: 65.47880690737834
Logistic Regression Cross-validation score: 65.5337519623234
Logistic regression with scaled data:
# Note: this rebinds model_le to a fit on the scaled features
model_le = LogisticRegression(random_state=42, max_iter=10000)
model_le.fit(X_train_scaled, y_train)
parameters = {'C': [0.8, 0.9, 1, 1.1, 1.2]}
model_le_grid = GridSearchCV(model_le, parameters, cv=10, verbose=1, n_jobs=-1).fit(X_train_scaled, y_train)
print('Best parameters:', model_le_grid.best_params_)
print('Logistic Regression Train score:', model_le.score(X_train_scaled, y_train)*100)
print('Logistic Regression Cross-validation score:', model_le_grid.best_score_*100)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best parameters: {'C': 0.8}
Logistic Regression Train score: 70.42386185243328
Logistic Regression Cross-validation score: 70.27472527472527
Neural network:
model_dnn=tf.keras.Sequential()
model_dnn.add(tf.keras.layers.Dense(25,activation='relu',input_dim=11))
model_dnn.add(tf.keras.layers.Dropout(0.5))
model_dnn.add(tf.keras.layers.Dense(10,activation='relu'))
model_dnn.add(tf.keras.layers.Dropout(0.3))
model_dnn.add(tf.keras.layers.Dense(1,activation='sigmoid'))
model_dnn.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),loss="binary_crossentropy",metrics=['accuracy'])
model_dnn.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_3 (Dense) (None, 25) 300 dropout_2 (Dropout) (None, 25) 0 dense_4 (Dense) (None, 10) 260 dropout_3 (Dropout) (None, 10) 0 dense_5 (Dense) (None, 1) 11 ================================================================= Total params: 571 Trainable params: 571 Non-trainable params: 0 _________________________________________________________________
epoch=200
history=model_dnn.fit(X_train_scaled,y_train,epochs=epoch,verbose=0)
print('Loss:',history.history['loss'][-1])
print('Accuracy: %',history.history['accuracy'][-1]*100)
Loss: 0.4946763217449188
Accuracy: % 76.28728151321411
plt.plot(history.history['loss'], label='loss')
plt.xlim([0,epoch])
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend()
plt.grid(True)
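The network above trains for a fixed 200 epochs with no held-out data, so the accuracy printed earlier is training accuracy only. A common refinement, shown here as a sketch (the patience value is illustrative, and in practice the model would be rebuilt before refitting), is to reserve a validation split and stop once validation loss stops improving:
# Sketch: early stopping on a 20% validation split
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history_val = model_dnn.fit(X_train_scaled, y_train, epochs=epoch, validation_split=0.2, callbacks=[early_stop], verbose=0)
print('Best validation loss:', min(history_val.history['val_loss']))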
from sklearn.metrics import classification_report
def test_score(model_name):
    # Evaluate each model on the unscaled test features
    for i in model_name:
        print(f'{i.__class__} \n{classification_report(y_test, i.predict(X_test))}')
def test_score_ss(model_name):
    # Evaluate each model on the scaled test features
    for i in model_name:
        print(f'{i.__class__} \n{classification_report(y_test, i.predict(X_test_scaled))}')
# model_le was last fitted on the scaled data, so test_score (which feeds it the
# unscaled X_test) is a feature mismatch; the degenerate first report below reflects that.
liste_test = [model_le]
test_score(liste_test)
test_score_ss(liste_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard class labels
liste_dnn = (model_dnn.predict(X_test_scaled) >= 0.5).astype(int).ravel()
print(f'{model_dnn.__class__} \n{classification_report(y_test,liste_dnn)}')
<class 'sklearn.linear_model._logistic.LogisticRegression'>
              precision    recall  f1-score   support
           0       0.00      0.00      0.00      1534
           1       0.52      1.00      0.68      1652
    accuracy                           0.52      3186
   macro avg       0.26      0.50      0.34      3186
weighted avg       0.27      0.52      0.35      3186

<class 'sklearn.linear_model._logistic.LogisticRegression'>
              precision    recall  f1-score   support
           0       0.70      0.72      0.71      1534
           1       0.73      0.71      0.72      1652
    accuracy                           0.71      3186
   macro avg       0.71      0.71      0.71      3186
weighted avg       0.71      0.71      0.71      3186

<class 'keras.engine.sequential.Sequential'>
              precision    recall  f1-score   support
           0       0.72      0.86      0.78      1534
           1       0.84      0.68      0.75      1652
    accuracy                           0.77      3186
   macro avg       0.78      0.77      0.77      3186
weighted avg       0.78      0.77      0.77      3186
(Warnings elided: scikit-learn reported "X has feature names, but LogisticRegression was fitted without feature names" and "Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples". Both stem from the first report: the logistic regression was last fitted on scaled NumPy arrays, so feeding it the unscaled DataFrame yields a model that never predicts class 0, leaving that class's precision undefined.)
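To complement the classification reports, confusion matrices make the error types explicit; a minimal sketch for the scaled logistic regression and the network:
from sklearn.metrics import confusion_matrix
# Rows are true classes (0 = stayed, 1 = churned); columns are predicted classes
print('Logistic regression (scaled):')
print(confusion_matrix(y_test, model_le.predict(X_test_scaled)))
print('Neural network:')
print(confusion_matrix(y_test, liste_dnn))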