In [1]:
#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
import os
import matplotlib.pyplot as plt#visualization
from PIL import  Image
%matplotlib inline
import pandas as pd
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization

DATA

In [2]:
telcom = pd.read_csv('Customer Churn.csv')
#first few rows
telcom.head()
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

DATA OVERVIEW

In [3]:
print ("Rows     : " ,telcom.shape[0])
print ("Columns  : " ,telcom.shape[1])
print ("\nFeatures : \n" ,telcom.columns.tolist())
print ("\nMissing values :  ", telcom.isnull().sum().values.sum())
print ("\nUnique values :  \n",telcom.nunique())
Rows     :  7043
Columns  :  21

Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

DATA MANUPULATION

In [4]:
#Data Manipulation

#Replacing spaces with null values in total charges column
telcom['TotalCharges'] = telcom["TotalCharges"].replace(" ",np.nan)

#Dropping null values from total charges column which contain .15% missing data 
telcom = telcom[telcom["TotalCharges"].notnull()]
telcom = telcom.reset_index()[telcom.columns]

#convert to float type
telcom["TotalCharges"] = telcom["TotalCharges"].astype(float)

#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    telcom[i]  = telcom[i].replace({'No internet service' : 'No'})
    
#replace values
telcom["SeniorCitizen"] = telcom["SeniorCitizen"].replace({1:"Yes",0:"No"})

#Tenure to categorical column
def tenure_lab(telcom) :
    
    if telcom["tenure"] <= 12 :
        return "Tenure_0-12"
    elif (telcom["tenure"] > 12) & (telcom["tenure"] <= 24 ):
        return "Tenure_12-24"
    elif (telcom["tenure"] > 24) & (telcom["tenure"] <= 48) :
        return "Tenure_24-48"
    elif (telcom["tenure"] > 48) & (telcom["tenure"] <= 60) :
        return "Tenure_48-60"
    elif telcom["tenure"] > 60 :
        return "Tenure_gt_60"
telcom["tenure_group"] = telcom.apply(lambda telcom:tenure_lab(telcom),
                                      axis = 1)

#Separating churn and non churn customers
churn     = telcom[telcom["Churn"] == "Yes"]
not_churn = telcom[telcom["Churn"] == "No"]

#Separating catagorical and numerical columns
Id_col     = ['customerID']
target_col = ["Churn"]
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]

EXPLORATORY DATA ANALYSIS

CUSTOMER ATTRITION IN DATA

In [5]:
#labels
lab = telcom["Churn"].value_counts().keys().tolist()
#values
val = telcom["Churn"].value_counts().values.tolist()

trace = go.Pie(labels = lab ,
               values = val ,
               marker = dict(colors =  [ 'royalblue' ,'lime'],
                             line = dict(color = "white",
                                         width =  1.3)
                            ),
               rotation = 90,
               hoverinfo = "label+value+text",
               hole = .5
              )
layout = go.Layout(dict(title = "Customer attrition in data",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                       )
                  )

data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)
In [ ]:
 

VARIABLES DISTRIBUTION IN CUSTOMER ATTRITION

In [6]:
#function  for pie plot for customer attrition types
def plot_pie(column) :
    
    trace1 = go.Pie(values  = churn[column].value_counts().values.tolist(),
                    labels  = churn[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [0,.48]),
                    name    = "Churn Customers",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    hole    = .6
                   )
    trace2 = go.Pie(values  = not_churn[column].value_counts().values.tolist(),
                    labels  = not_churn[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    domain  = dict(x = [.52,1]),
                    hole    = .6,
                    name    = "Non churn customers" 
                   )


    layout = go.Layout(dict(title = column + " distribution in customer attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .15, y = .5),
                                           dict(text = "Non churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .88,y = .5
                                               )
                                          ]
                           )
                      )
    data = [trace1,trace2]
    fig  = go.Figure(data = data,layout = layout)
    py.iplot(fig)


#function  for histogram for customer attrition types
def histogram(column) :
    trace1 = go.Histogram(x  = churn[column],
                          histnorm= "percent",
                          name = "Churn Customers",
                          marker = dict(line = dict(width = .5,
                                                    color = "black"
                                                    )
                                        ),
                         opacity = .9 
                         ) 
    
    trace2 = go.Histogram(x  = not_churn[column],
                          histnorm = "percent",
                          name = "Non churn customers",
                          marker = dict(line = dict(width = .5,
                                              color = "black"
                                             )
                                 ),
                          opacity = .9
                         )
    
    data = [trace1,trace2]
    layout = go.Layout(dict(title =column + " distribution in customer attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = column,
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = "percent",
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                           )
                      )
    fig  = go.Figure(data=data,layout=layout)
    
    py.iplot(fig)
    
#function  for scatter plot matrix  for numerical columns in data
def scatter_matrix(df)  :
    
    df  = df.sort_values(by = "Churn" ,ascending = True)
    classes = df["Churn"].unique().tolist()
    classes
    
    class_code  = {classes[k] : k for k in range(2)}
    class_code

    color_vals = [class_code[cl] for cl in df["Churn"]]
    color_vals

    pl_colorscale = "Portland"

    pl_colorscale

    text = [df.loc[k,"Churn"] for k in range(len(df))]
    text

    trace = go.Splom(dimensions = [dict(label  = "tenure",
                                       values = df["tenure"]),
                                  dict(label  = 'MonthlyCharges',
                                       values = df['MonthlyCharges']),
                                  dict(label  = 'TotalCharges',
                                       values = df['TotalCharges'])],
                     text = text,
                     marker = dict(color = color_vals,
                                   colorscale = pl_colorscale,
                                   size = 3,
                                   showscale = False,
                                   line = dict(width = .1,
                                               color='rgb(230,230,230)'
                                              )
                                  )
                    )
    axis = dict(showline  = True,
                zeroline  = False,
                gridcolor = "#fff",
                ticklen   = 4
               )
    
    layout = go.Layout(dict(title  = 
                            "Scatter plot matrix for Numerical columns for customer attrition",
                            autosize = False,
                            height = 800,
                            width  = 800,
                            dragmode = "select",
                            hovermode = "closest",
                            plot_bgcolor  = 'rgba(240,240,240, 0.95)',
                            xaxis1 = dict(axis),
                            yaxis1 = dict(axis),
                            xaxis2 = dict(axis),
                            yaxis2 = dict(axis),
                            xaxis3 = dict(axis),
                            yaxis3 = dict(axis),
                           )
                      )
    data   = [trace]
    fig = go.Figure(data = data,layout = layout )
    py.iplot(fig)

#for all categorical columns plot pie
for i in cat_cols :
    plot_pie(i)

#for all categorical columns plot histogram    
for i in num_cols :
    histogram(i)

#scatter plot matrix
scatter_matrix(telcom)

CUSTOMER ATTRITION IN TENURE GROUPS

In [7]:
#cusomer attrition in tenure groups
tg_ch  =  churn["tenure_group"].value_counts().reset_index()
tg_ch.columns  = ["tenure_group","count"]
tg_nch =  not_churn["tenure_group"].value_counts().reset_index()
tg_nch.columns = ["tenure_group","count"]

#bar - churn
trace1 = go.Bar(x = tg_ch["tenure_group"]  , y = tg_ch["count"],
                name = "Churn Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

#bar - not churn
trace2 = go.Bar(x = tg_nch["tenure_group"] , y = tg_nch["count"],
                name = "Non Churn Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

layout = go.Layout(dict(title = "Customer attrition in tenure groups",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "tenure group",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                       )
                  )
data = [trace1,trace2]
fig  = go.Figure(data=data,layout=layout)
py.iplot(fig)

MONTHLY CHARGES AND TOTAL CHARGES BY TENURE AND CHURN GROUPS

In [8]:
telcom[['MonthlyCharges', 'TotalCharges','tenure',"tenure_group"]]

#scatter plot monthly charges & total charges by tenure group

def plot_tenure_scatter(tenure_group,color) :
    tracer = go.Scatter(x = telcom[telcom["tenure_group"] == tenure_group]["MonthlyCharges"],
                        y = telcom[telcom["tenure_group"] == tenure_group]["TotalCharges"],
                        mode = "markers",marker = dict(line = dict(color = "black",
                                                                   width = .2),
                                                       size = 4 , color = color,
                                                       symbol = "diamond-dot",
                                                      ),
                        name = tenure_group,
                        opacity = .9
                       )
    return tracer

#scatter plot monthly charges & total charges by churn group
def plot_churncharges_scatter(churn,color) :
    tracer = go.Scatter(x = telcom[telcom["Churn"] == churn]["MonthlyCharges"],
                        y = telcom[telcom["Churn"] == churn]["TotalCharges"],
                        mode = "markers",marker = dict(line = dict(color = "black",
                                                                   width = .2),
                                                       size = 4 , color = color,
                                                       symbol = "diamond-dot",
                                                      ),
                        name = "Churn - " + churn,
                        opacity = .9
                       )
    return tracer

trace1 = plot_tenure_scatter("Tenure_0-12","#FF3300")
trace2 = plot_tenure_scatter("Tenure_12-24","#6666FF")
trace3 = plot_tenure_scatter("Tenure_24-48","#99FF00")
trace4 = plot_tenure_scatter("Tenure_48-60","#996600")
trace5 = plot_tenure_scatter("Tenure_gt_60","grey")
trace6 = plot_churncharges_scatter("Yes","red")
trace7 = plot_churncharges_scatter("No","blue")

data1   = [trace1,trace2,trace3,trace4,trace5] 
data2   = [trace7,trace6]

#layout
def layout_title(title) :
    layout = go.Layout(dict(title = title,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                         title = "Monthly charges",
                                         zerolinewidth=1,ticklen=5,gridwidth=2),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                         title = "Total Charges",
                                         zerolinewidth=1,ticklen=5,gridwidth=2),
                            height = 600
                           )
                      )
    return layout

layout1  = layout_title("Monthly Charges & Total Charges by Tenure group")
layout2  = layout_title("Monthly Charges & Total Charges by Churn group")
fig1 = go.Figure(data = data1,layout = layout1)
fig2 = go.Figure(data = data2,layout = layout2)
py.iplot(fig1)
py.iplot(fig2)

AVERAGE CHARGES BY TENURE GROUPS

In [9]:
avg_tgc = telcom.groupby(["tenure_group","Churn"])[["MonthlyCharges",
                                                    "TotalCharges"]].mean().reset_index()

#function for tracing 
def mean_charges(column,aggregate) :
    tracer = go.Bar(x = avg_tgc[avg_tgc["Churn"] == aggregate]["tenure_group"],
                    y = avg_tgc[avg_tgc["Churn"] == aggregate][column],
                    name = aggregate,marker = dict(line = dict(width = 1)),
                    text = "Churn"
                   )
    return tracer

#function for layout
def layout_plot(title,xaxis_lab,yaxis_lab) :
    layout = go.Layout(dict(title = title,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = xaxis_lab,
                                         zerolinewidth=1,ticklen=5,gridwidth=2),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',title = yaxis_lab,
                                         zerolinewidth=1,ticklen=5,gridwidth=2),
                           )
                      )
    return layout
    

#plot1 - mean monthly charges by tenure groups
trace1  = mean_charges("MonthlyCharges","Yes")
trace2  = mean_charges("MonthlyCharges","No")
layout1 = layout_plot("Average Monthly Charges by Tenure groups",
                      "Tenure group","Monthly Charges")
data1   = [trace1,trace2]
fig1    = go.Figure(data=data1,layout=layout1)

#plot2 - mean total charges by tenure groups
trace3  = mean_charges("TotalCharges","Yes")
trace4  = mean_charges("TotalCharges","No")
layout2 = layout_plot("Average Total Charges by Tenure groups",
                      "Tenure group","Total Charges")
data2   = [trace3,trace4]
fig2    = go.Figure(data=data2,layout=layout2)

py.iplot(fig1)
py.iplot(fig2)

MONTHLY CHARGES,TOTAL CHARGES AND TENURE IN CUSTOMER ATTRITION

In [10]:
##copy data
tel_df = telcom.copy()
#Drop tenure column
telcom = telcom.drop(columns = "tenure_group",axis = 1)

trace1 = go.Scatter3d(x = churn["MonthlyCharges"],
                      y = churn["TotalCharges"],
                      z = churn["tenure"],
                      mode = "markers",
                      name = "Churn customers",
                      text = "Id : " + churn["customerID"],
                      marker = dict(size = 1,color = "red")
                     )
trace2 = go.Scatter3d(x = not_churn["MonthlyCharges"],
                      y = not_churn["TotalCharges"],
                      z = not_churn["tenure"],
                      name = "Non churn customers",
                      text = "Id : " + not_churn["customerID"],
                      mode = "markers",
                      marker = dict(size = 1,color= "green")
                     )



layout = go.Layout(dict(title = "Monthly charges,total charges & tenure in customer attrition",
                        scene = dict(camera = dict(up=dict(x= 0 , y=0, z=0),
                                                   center=dict(x=0, y=0, z=0),
                                                   eye=dict(x=1.25, y=1.25, z=1.25)),
                                     xaxis  = dict(title = "monthly charges",
                                                   gridcolor='rgb(255, 255, 255)',
                                                   zerolinecolor='rgb(255, 255, 255)',
                                                   showbackground=True,
                                                   backgroundcolor='rgb(230, 230,230)'),
                                     yaxis  = dict(title = "total charges",
                                                   gridcolor='rgb(255, 255, 255)',
                                                   zerolinecolor='rgb(255, 255, 255)',
                                                   showbackground=True,
                                                   backgroundcolor='rgb(230, 230,230)'
                                                  ),
                                     zaxis  = dict(title = "tenure",
                                                   gridcolor='rgb(255, 255, 255)',
                                                   zerolinecolor='rgb(255, 255, 255)',
                                                   showbackground=True,
                                                   backgroundcolor='rgb(230, 230,230)'
                                                  )
                                    ),
                        height = 700,
                       )
                  )
                  

data = [trace1,trace2]
fig  = go.Figure(data = data,layout = layout)
py.iplot(fig)

DATA PREPROCESSING

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]
#categorical columns
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = telcom.nunique()[telcom.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    telcom[i] = le.fit_transform(telcom[i])
    
#Duplicating columns for multi value columns
telcom = pd.get_dummies(data = telcom,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(telcom[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_telcom_og = telcom.copy()
telcom = telcom.drop(columns = num_cols,axis = 1)
telcom = telcom.merge(scaled,left_index=True,right_index=True,how = "left")

VARIABLE SUMMARY

In [12]:
summary = (df_telcom_og[[i for i in df_telcom_og.columns if i not in Id_col]].
           describe().transpose().reset_index())

summary = summary.rename(columns = {"index" : "feature"})
summary = np.around(summary,3)

val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]

trace  = go.Table(header = dict(values = summary.columns.tolist(),
                                line = dict(color = ['#506784']),
                                fill = dict(color = ['#119DFF']),
                               ),
                  cells  = dict(values = val_lst,
                                line = dict(color = ['#506784']),
                                fill = dict(color = ["lightgrey",'#F5F8FF'])
                               ),
                  columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)
py.iplot(figure)

CORRELATON MATRIX

In [13]:
#correlation
correlation = telcom.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

VISUALISING DATA WITH PRINCIPAL COMPONENTS

In [14]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)

X = telcom[[i for i in telcom.columns if i not in Id_col + target_col]]
Y = telcom[target_col + Id_col]

principal_components = pca.fit_transform(X)
pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"])
pca_data = pca_data.merge(Y,left_index=True,right_index=True,how="left")
pca_data["Churn"] = pca_data["Churn"].replace({1:"Churn",0:"Not Churn"})

def pca_scatter(target,color) :
    tracer = go.Scatter(x = pca_data[pca_data["Churn"] == target]["PC1"] ,
                        y = pca_data[pca_data["Churn"] == target]["PC2"],
                        name = target,mode = "markers",
                        marker = dict(color = color,
                                      line = dict(width = .5),
                                      symbol =  "diamond-open"),
                        text = ("Customer Id : " + 
                                pca_data[pca_data["Churn"] == target]['customerID'])
                       )
    return tracer

layout = go.Layout(dict(title = "Visualising data with principal components",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 1",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "principal component 2",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        height = 600
                       )
                  )
trace1 = pca_scatter("Churn",'red')
trace2 = pca_scatter("Not Churn",'royalblue')
data = [trace2,trace1]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

BINARY VARIABLES DISTRIBUTION IN CUSTOMER ATTRITION(RADAR CHART)

In [15]:
#separating binary columns
bi_cs = telcom.nunique()[telcom.nunique() == 2].keys()
dat_rad = telcom[bi_cs]

#plotting radar chart for churn and non churn customers(binary variables)
def plot_radar(df,aggregate,title) :
    data_frame = df[df["Churn"] == aggregate] 
    data_frame_x = data_frame[bi_cs].sum().reset_index()
    data_frame_x.columns  = ["feature","yes"]
    data_frame_x["no"]    = data_frame.shape[0]  - data_frame_x["yes"]
    data_frame_x  = data_frame_x[data_frame_x["feature"] != "Churn"]
    
    #count of 1's(yes)
    trace1 = go.Scatterpolar(r = data_frame_x["yes"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill  = "toself",name = "count of 1's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            )
    #count of 0's(No)
    trace2 = go.Scatterpolar(r = data_frame_x["no"].values.tolist(),
                             theta = data_frame_x["feature"].tolist(),
                             fill  = "toself",name = "count of 0's",
                             mode = "markers+lines",
                             marker = dict(size = 5)
                            ) 
    layout = go.Layout(dict(polar = dict(radialaxis = dict(visible = True,
                                                           side = "counterclockwise",
                                                           showline = True,
                                                           linewidth = 2,
                                                           tickwidth = 2,
                                                           gridcolor = "white",
                                                           gridwidth = 2),
                                         angularaxis = dict(tickfont = dict(size = 10),
                                                            layer = "below traces"
                                                           ),
                                         bgcolor  = "rgb(243,243,243)",
                                        ),
                            paper_bgcolor = "rgb(243,243,243)",
                            title = title,height = 700))
    
    data = [trace2,trace1]
    fig = go.Figure(data=data,layout=layout)
    py.iplot(fig)

#plot
plot_radar(dat_rad,1,"Churn -  Customers")
plot_radar(dat_rad,0,"Non Churn - Customers")

MODEL BUILDING

LOGISTIC REGRESSION BASELINE MODEL

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
#from yellowbrick.classifier import DiscriminationThreshold
#splitting train and test data 
train,test = train_test_split(telcom,test_size = .25 ,random_state = 111)
    
##seperating dependent and independent variables
cols    = [i for i in telcom.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X  = test[cols]
test_Y  = test[target_col]

#Function attributes
#dataframe     - processed dataframe
#Algorithm     - Algorithm used 
#training_x    - predictor variables dataframe(training)
#testing_x     - predictor variables dataframe(testing)
#training_y    - target variable(training)
#training_y    - target variable(testing)
#cf - ["coefficients","features"](cooefficients for logistic 
                                 #regression,features for tree based models)

#threshold_plot - if True returns threshold plot for model
    
def telecom_churn_prediction(algorithm,training_x,testing_x,
                             training_y,testing_y,cols,cf,threshold_plot) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if   cf == "coefficients" :
        coefficients  = pd.DataFrame(algorithm.coef_.ravel())
    elif cf == "features" :
        coefficients  = pd.DataFrame(algorithm.feature_importances_)
        
    column_df     = pd.DataFrame(cols)
    coef_sumry    = (pd.merge(coefficients,column_df,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy   Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc,"\n")
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
    
    #plot confusion matrix
    trace1 = go.Heatmap(z = conf_matrix ,
                        x = ["Not churn","Churn"],
                        y = ["Not churn","Churn"],
                        showscale  = False,colorscale = "Picnic",
                        name = "matrix")
    
    #plot roc curve
    trace2 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2))
    trace3 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))
    
    #plot coeffs
    trace4 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Picnic",
                                  line = dict(width = .6,color = "black")))
    
    #subplots
    fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Receiver operating characteristic',
                                            'Feature Importances'))
    
    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,1,2)
    fig.append_trace(trace4,2,1)
    
    fig['layout'].update(showlegend=False, title="Model performance" ,
                         autosize = False,height = 900,width = 800,
                         plot_bgcolor = 'rgba(240,240,240, 0.95)',
                         paper_bgcolor = 'rgba(240,240,240, 0.95)',
                         margin = dict(b = 195))
    fig["layout"]["xaxis2"].update(dict(title = "false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title = "true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),
                                        tickangle = 90))
    py.iplot(fig)
        
logit  = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

telecom_churn_prediction(logit,train_X,test_X,train_Y,test_Y,
                         cols,"coefficients",threshold_plot = True)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.83      0.90      0.87      1268
           1       0.68      0.54      0.60       490

   micro avg       0.80      0.80      0.80      1758
   macro avg       0.76      0.72      0.73      1758
weighted avg       0.79      0.80      0.79      1758

Accuracy   Score :  0.8003412969283277
Area under curve :  0.7194714478851477 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

LOGISTIC REGRESSION-SMOTE

In [17]:
from imblearn.over_sampling import SMOTE

cols    = [i for i in telcom.columns if i not in Id_col+target_col]

smote_X = telcom[cols]
smote_Y = telcom[target_col]

#Split train and test data
smote_train_X,smote_test_X,smote_train_Y,smote_test_Y = train_test_split(smote_X,smote_Y,
                                                                         test_size = .25 ,
                                                                         random_state = 111)

#oversampling minority class using smote
os = SMOTE(random_state = 0)
os_smote_X,os_smote_Y = os.fit_sample(smote_train_X,smote_train_Y)
os_smote_X = pd.DataFrame(data = os_smote_X,columns=cols)
os_smote_Y = pd.DataFrame(data = os_smote_Y,columns=target_col)
###



logit_smote = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

telecom_churn_prediction(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"coefficients",threshold_plot = True)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.90      0.75      0.82      1268
           1       0.55      0.79      0.65       490

   micro avg       0.76      0.76      0.76      1758
   macro avg       0.73      0.77      0.73      1758
weighted avg       0.80      0.76      0.77      1758

Accuracy   Score :  0.7605233219567691
Area under curve :  0.769503637417112 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

RECURSIVE FEATURE ELIMINATION

Recursive Feature Elimination (RFE) is based on the idea to repeatedly construct a model and choose either the best or worst performing feature, setting the feature aside and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.

In [18]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

rfe = RFE(logit,10)
rfe = rfe.fit(os_smote_X,os_smote_Y.values.ravel())

rfe.support_
rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in telcom.columns if i not in Id_col + target_col],
                       "ranking" : rfe.ranking_,
                      })
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


#separating train and test data
train_rf_X = os_smote_X[cols]
train_rf_Y = os_smote_Y
test_rf_X  = test[cols]
test_rf_Y  = test[target_col]

logit_rfe = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
#applying model
telecom_churn_prediction(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                         cols,"coefficients",threshold_plot = True)

tab_rk = ff.create_table(idc_rfe)
py.iplot(tab_rk)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.91      0.73      0.81      1268
           1       0.53      0.80      0.64       490

   micro avg       0.75      0.75      0.75      1758
   macro avg       0.72      0.77      0.73      1758
weighted avg       0.80      0.75      0.76      1758

Accuracy   Score :  0.7502844141069397
Area under curve :  0.7667884503959312 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

UNIVARIATE SELECTION

In [19]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

#select columns
cols = [i for i in telcom.columns if i not in Id_col + target_col ]

#dataframe with non negative values
df_x = df_telcom_og[cols]
df_y = df_telcom_og[target_col]

#fit model with k= 3
select = SelectKBest(score_func = chi2,k = 3)
fit    = select.fit(df_x,df_y)

#Summerize scores
print ("scores")
print (fit.scores_)
print ("P - Values")
print (fit.pvalues_)

#create dataframe
score = pd.DataFrame({"features":cols,"scores":fit.scores_,"p_values":fit.pvalues_ })
score = score.sort_values(by = "scores" ,ascending =False)


#createing new label for categorical and numerical columns
score["feature_type"] = np.where(score["features"].isin(num_cols),"Numerical","Categorical")

#plot
trace  = go.Scatter(x = score[score["feature_type"] == "Categorical"]["features"],
                    y = score[score["feature_type"] == "Categorical"]["scores"],
                    name = "Categorial",mode = "lines+markers",
                    marker = dict(color = "red",
                                  line = dict(width =1))
                   )

trace1 = go.Bar(x = score[score["feature_type"] == "Numerical"]["features"],
                y = score[score["feature_type"] == "Numerical"]["scores"],name = "Numerical",
                marker = dict(color = "royalblue",
                              line = dict(width =1)),
                xaxis = "x2",yaxis = "y2"
               )
layout = go.Layout(dict(title = "Scores for Categorical & Numerical features",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     tickfont = dict(size =10),
                                     domain=[0, 0.7],
                                     tickangle = 90,zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "scores",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(b=200),
                        xaxis2=dict(domain=[0.8, 1],tickangle = 90,
                                    gridcolor = 'rgb(255, 255, 255)'),
                        yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                        )
                  )

data=[trace,trace1]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
scores
[2.54297062e-01 1.33482766e+02 8.18577694e+01 1.31271509e+02
 9.29483891e-02 1.47165601e+02 3.12098318e+01 2.02160070e+01
 1.35439602e+02 1.73206148e+01 1.59306111e+01 1.04979224e+02
 3.88864216e+00 8.68247305e-01 6.51465136e+00 7.11376111e+01
 3.72082851e+02 2.85475152e+02 5.16714004e+02 1.76608724e+02
 4.86223101e+02 7.66190658e+01 9.99725387e+01 4.24113152e+02
 4.47251434e+01 1.63773281e+04 3.65307468e+03 6.29630810e+05]
P - Values
[6.14065505e-001 7.08954608e-031 1.46240915e-019 2.15953960e-030
 7.60461827e-001 7.21988253e-034 2.31590182e-008 6.91717063e-006
 2.64595220e-031 3.15742928e-005 6.57073922e-005 1.23423173e-024
 4.86137123e-002 3.51440986e-001 1.06989295e-002 3.33158163e-017
 6.58713045e-083 4.81399951e-064 2.19511926e-114 2.66631661e-040
 9.45428638e-108 2.07328356e-018 1.54524820e-023 3.10584857e-094
 2.26727030e-011 0.00000000e+000 0.00000000e+000 0.00000000e+000]

DECISION TREE

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from graphviz import Source
from IPython.display import SVG,display

#top 3 categorical features
features_cat  = score[score["feature_type"] == "Categorical"]["features"][:3].tolist()

#top 3 numerical features
features_num  = score[score["feature_type"] == "Numerical"]["features"][:3].tolist()


#Function attributes
#columns        - selected columns
#maximum_depth  - depth of tree
#criterion_type - ["gini" or "entropy"]
#split_type     - ["best" or "random"]
#Model Performance - True (gives model output)

def plot_decision_tree(columns,maximum_depth,criterion_type,
                       split_type,model_performance = None) :
    
    #separating dependent and in dependent variables
    dtc_x = df_x[columns]
    dtc_y = df_y[target_col]
    
    #model
    dt_classifier = DecisionTreeClassifier(max_depth = maximum_depth,
                                           splitter  = split_type,
                                           criterion = criterion_type,
                                          )
    dt_classifier.fit(dtc_x,dtc_y)
    
    #plot decision tree
    graph = Source(tree.export_graphviz(dt_classifier,out_file=None,
                                        rounded=True,proportion = False,
                                        feature_names = columns, 
                                        precision  = 2,
                                        class_names=["Not churn","Churn"],
                                        filled = True                         
                                       )
                  )
    
    #model performance
    if model_performance == True :
        telecom_churn_prediction(dt_classifier,
                                 dtc_x,test_X[columns],
                                 dtc_y,test_Y,
                                 columns,"features",threshold_plot = True)
    display(graph)
    
plot_decision_tree(features_num,3,"gini","best")
Tree 0 tenure <= 16.5 gini = 0.39 samples = 7032 value = [5163, 1869] class = Not churn 1 MonthlyCharges <= 68.62 gini = 0.5 samples = 2539 value = [1375, 1164] class = Not churn 0->1 True 8 MonthlyCharges <= 69.97 gini = 0.26 samples = 4493 value = [3788, 705] class = Not churn 0->8 False 2 tenure <= 3.5 gini = 0.42 samples = 1386 value = [975, 411] class = Not churn 1->2 5 TotalCharges <= 120.0 gini = 0.45 samples = 1153 value = [400, 753] class = Churn 1->5 3 gini = 0.49 samples = 605 value = [342, 263] class = Not churn 2->3 4 gini = 0.31 samples = 781 value = [633, 148] class = Not churn 2->4 6 gini = 0.25 samples = 239 value = [35, 204] class = Churn 5->6 7 gini = 0.48 samples = 914 value = [365, 549] class = Churn 5->7 9 MonthlyCharges <= 28.55 gini = 0.12 samples = 1957 value = [1832, 125] class = Not churn 8->9 12 tenure <= 43.5 gini = 0.35 samples = 2536 value = [1956, 580] class = Not churn 8->12 10 gini = 0.04 samples = 949 value = [932, 17] class = Not churn 9->10 11 gini = 0.19 samples = 1008 value = [900, 108] class = Not churn 9->11 13 gini = 0.46 samples = 979 value = [621, 358] class = Not churn 12->13 14 gini = 0.24 samples = 1557 value = [1335, 222] class = Not churn 12->14
In [21]:
plot_decision_tree(features_cat,3,"entropy","best",
                   model_performance = True ,)
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

 Classification report : 
               precision    recall  f1-score   support

           0       0.83      0.84      0.84      1268
           1       0.58      0.54      0.56       490

   micro avg       0.76      0.76      0.76      1758
   macro avg       0.70      0.69      0.70      1758
weighted avg       0.76      0.76      0.76      1758

Accuracy   Score :  0.7610921501706485
Area under curve :  0.6947675915792185 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

Tree 0 Contract_Month-to-month <= 0.5 entropy = 0.84 samples = 7032 value = [5163, 1869] class = Not churn 1 Contract_Two year <= 0.5 entropy = 0.36 samples = 3157 value = [2943, 214] class = Not churn 0->1 True 8 PaymentMethod_Electronic check <= 0.5 entropy = 0.98 samples = 3875 value = [2220, 1655] class = Not churn 0->8 False 2 PaymentMethod_Electronic check <= 0.5 entropy = 0.51 samples = 1472 value = [1306, 166] class = Not churn 1->2 5 PaymentMethod_Electronic check <= 0.5 entropy = 0.19 samples = 1685 value = [1637, 48] class = Not churn 1->5 3 entropy = 0.44 samples = 1125 value = [1023, 102] class = Not churn 2->3 4 entropy = 0.69 samples = 347 value = [283, 64] class = Not churn 2->4 6 entropy = 0.16 samples = 1517 value = [1482, 35] class = Not churn 5->6 7 entropy = 0.39 samples = 168 value = [155, 13] class = Not churn 5->7 9 entropy = 0.91 samples = 2025 value = [1364, 661] class = Not churn 8->9 10 entropy = 1.0 samples = 1850 value = [856, 994] class = Churn 8->10
In [22]:
#using contract,tenure and paperless billing variables
columns = ['tenure','Contract_Month-to-month', 'PaperlessBilling',
           'Contract_One year', 'Contract_Two year']

plot_decision_tree(columns,3,"gini","best",model_performance= True)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

 Classification report : 
               precision    recall  f1-score   support

           0       0.85      0.74      0.79      1268
           1       0.50      0.66      0.57       490

   micro avg       0.72      0.72      0.72      1758
   macro avg       0.68      0.70      0.68      1758
weighted avg       0.75      0.72      0.73      1758

Accuracy   Score :  0.7218430034129693
Area under curve :  0.7038724006952939 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

Tree 0 Contract_Month-to-month <= 0.5 gini = 0.39 samples = 7032 value = [5163, 1869] class = Not churn 1 Contract_One year <= 0.5 gini = 0.13 samples = 3157 value = [2943, 214] class = Not churn 0->1 True 8 tenure <= 5.5 gini = 0.49 samples = 3875 value = [2220, 1655] class = Not churn 0->8 False 2 PaperlessBilling <= 0.5 gini = 0.06 samples = 1685 value = [1637, 48] class = Not churn 1->2 5 PaperlessBilling <= 0.5 gini = 0.2 samples = 1472 value = [1306, 166] class = Not churn 1->5 3 gini = 0.03 samples = 902 value = [887, 15] class = Not churn 2->3 4 gini = 0.08 samples = 783 value = [750, 33] class = Not churn 2->4 6 gini = 0.13 samples = 673 value = [625, 48] class = Not churn 5->6 7 gini = 0.25 samples = 799 value = [681, 118] class = Not churn 5->7 9 PaperlessBilling <= 0.5 gini = 0.49 samples = 1318 value = [578, 740] class = Churn 8->9 12 PaperlessBilling <= 0.5 gini = 0.46 samples = 2557 value = [1642, 915] class = Not churn 8->12 10 gini = 0.49 samples = 535 value = [309, 226] class = Not churn 9->10 11 gini = 0.45 samples = 783 value = [269, 514] class = Churn 9->11 13 gini = 0.36 samples = 754 value = [574, 180] class = Not churn 12->13 14 gini = 0.48 samples = 1803 value = [1068, 735] class = Not churn 12->14

K NEAREST NEIGHBOUR CLASSIFIER

In [23]:
def telecom_churn_prediction_alg(algorithm,training_x,testing_x,
                                 training_y,testing_y,threshold_plot = True) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    
    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score   : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])
     
    #plot roc curve
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    trace2 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))
    
    #plot confusion matrix
    trace3 = go.Heatmap(z = conf_matrix ,x = ["Not churn","Churn"],
                        y = ["Not churn","Churn"],
                        showscale  = False,colorscale = "Blues",name = "matrix",
                        xaxis = "x2",yaxis = "y2"
                       )
    
    layout = go.Layout(dict(title="Model performance" ,
                            autosize = False,height = 500,width = 800,
                            showlegend = False,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = "false positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         domain=[0, 0.6],
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(title = "true positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            margin = dict(b=200),
                            xaxis2=dict(domain=[0.7, 1],tickangle = 90,
                                        gridcolor = 'rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                           )
                  )
    data = [trace1,trace2,trace3]
    fig = go.Figure(data=data,layout=layout)
    
    py.iplot(fig)
    
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
telecom_churn_prediction_alg(knn,os_smote_X,test_X,
                             os_smote_Y,test_Y,threshold_plot = True)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

 Classification report : 
               precision    recall  f1-score   support

           0       0.86      0.69      0.76      1268
           1       0.47      0.71      0.56       490

   micro avg       0.69      0.69      0.69      1758
   macro avg       0.66      0.70      0.66      1758
weighted avg       0.75      0.69      0.71      1758

Accuracy Score   :  0.6939704209328783
Area under curve :  0.6989506212579669

VIZUALISING A DECISION TREE FROM RANDOM FOREST CLASSIFIER

In [24]:
from sklearn.ensemble import RandomForestClassifier

#function attributes
#columns  - column used
#nf_estimators   - The number of trees in the forest.
#estimated_tree  - tree number to be displayed
#maximum_depth   - depth of the tree
#criterion_type  - split criterion type ["gini" or "entropy"]
#Model performance - prints performance of model

def plot_tree_randomforest(columns,nf_estimators,
                           estimated_tree,maximum_depth,
                           criterion_type,model_performance = None) :
    
    dataframe = df_telcom_og[columns + target_col].copy()
    
    #train and test datasets
    rf_x     = dataframe[[i for i in columns if i not in target_col]]
    rf_y     = dataframe[target_col]
    
    #random forest classifier
    rfc   = RandomForestClassifier(n_estimators = nf_estimators,
                                   max_depth = maximum_depth,
                                   criterion = criterion_type,
                                  )
    rfc.fit(rf_x,rf_y)
    
    estimated_tree = rfc.estimators_[estimated_tree]
    
    graph = Source(tree.export_graphviz(estimated_tree,out_file=None,
                                        rounded=True,proportion = False,
                            feature_names = columns, 
                            precision  = 2,
                            class_names=["Not churn","Churn"],
                            filled = True))
    display(graph)
    
    #model performance
    if model_performance == True :
        telecom_churn_prediction(rfc,
                                 rf_x,test_X[columns],
                                 rf_y,test_Y,
                                 columns,"features",threshold_plot = True)
        

cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
plot_tree_randomforest(cols1,100,99,3,"entropy",True)
Tree 0 Contract_Month-to-month <= 0.5 entropy = 0.84 samples = 4432 value = [5163, 1869] class = Not churn 1 InternetService_No <= 0.5 entropy = 0.36 samples = 1993 value = [2985, 221] class = Not churn 0->1 True 8 MonthlyCharges <= 67.88 entropy = 0.99 samples = 2439 value = [2178, 1648] class = Not churn 0->8 False 2 Contract_Two year <= 0.5 entropy = 0.45 samples = 1373 value = [1990, 209] class = Not churn 1->2 5 PaperlessBilling <= 0.5 entropy = 0.09 samples = 620 value = [995, 12] class = Not churn 1->5 3 entropy = 0.61 samples = 712 value = [984, 173] class = Not churn 2->3 4 entropy = 0.22 samples = 661 value = [1006, 36] class = Not churn 2->4 6 entropy = 0.08 samples = 456 value = [734, 7] class = Not churn 5->6 7 entropy = 0.13 samples = 164 value = [261, 5] class = Not churn 5->7 9 PaymentMethod_Electronic check <= 0.5 entropy = 0.88 samples = 1025 value = [1144, 482] class = Not churn 8->9 12 tenure <= 15.5 entropy = 1.0 samples = 1414 value = [1034, 1166] class = Churn 8->12 10 entropy = 0.83 samples = 719 value = [848, 298] class = Not churn 9->10 11 entropy = 0.96 samples = 306 value = [296, 184] class = Not churn 9->11 13 entropy = 0.9 samples = 703 value = [347, 745] class = Churn 12->13 14 entropy = 0.96 samples = 711 value = [687, 421] class = Not churn 12->14
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.79      0.94      0.86      1268
           1       0.70      0.33      0.45       490

   micro avg       0.77      0.77      0.77      1758
   macro avg       0.74      0.64      0.65      1758
weighted avg       0.76      0.77      0.74      1758

Accuracy   Score :  0.7741751990898749
Area under curve :  0.6387240069529388 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

RANDOM FOREST CLASSIFIER

A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size but the samples are drawn with replacement . Below are the trees produced by random forest model with 10 estimated trees with maximum depth of three for each tree. Each tree produced is slightly different from other.

In [25]:
#making 10 trees with random forest.
n = np.arange(0,10).tolist()
cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
for i in n :
    plot_tree_randomforest(cols1,10,i,3,"entropy",model_performance=False)
Tree 0 InternetService_No <= 0.5 entropy = 0.84 samples = 4430 value = [5124, 1908] class = Not churn 1 tenure <= 22.5 entropy = 0.91 samples = 3446 value = [3696, 1801] class = Not churn 0->1 True 8 Partner <= 0.5 entropy = 0.36 samples = 984 value = [1428, 107] class = Not churn 0->8 False 2 InternetService_DSL <= 0.5 entropy = 1.0 samples = 1475 value = [1119, 1214] class = Churn 1->2 5 tenure <= 54.5 entropy = 0.69 samples = 1971 value = [2577, 587] class = Not churn 1->5 3 entropy = 0.93 samples = 812 value = [440, 850] class = Churn 2->3 4 entropy = 0.93 samples = 663 value = [679, 364] class = Not churn 2->4 6 entropy = 0.81 samples = 1077 value = [1312, 441] class = Not churn 5->6 7 entropy = 0.48 samples = 894 value = [1265, 146] class = Not churn 5->7 9 TotalCharges <= 38.98 entropy = 0.44 samples = 524 value = [760, 76] class = Not churn 8->9 12 gender <= 0.5 entropy = 0.26 samples = 460 value = [668, 31] class = Not churn 8->12 10 entropy = 0.84 samples = 100 value = [111, 41] class = Not churn 9->10 11 entropy = 0.29 samples = 424 value = [649, 35] class = Not churn 9->11 13 entropy = 0.34 samples = 265 value = [378, 26] class = Not churn 12->13 14 entropy = 0.12 samples = 195 value = [290, 5] class = Not churn 12->14
Tree 0 Contract_Two year <= 0.5 entropy = 0.83 samples = 4448 value = [5171, 1861] class = Not churn 1 Contract_Month-to-month <= 0.5 entropy = 0.92 samples = 3383 value = [3558, 1820] class = Not churn 0->1 True 8 MonthlyCharges <= 24.48 entropy = 0.17 samples = 1065 value = [1613, 41] class = Not churn 0->8 False 2 StreamingMovies <= 0.5 entropy = 0.51 samples = 932 value = [1334, 170] class = Not churn 1->2 5 Dependents <= 0.5 entropy = 0.98 samples = 2451 value = [2224, 1650] class = Not churn 1->5 3 entropy = 0.25 samples = 510 value = [783, 35] class = Not churn 2->3 4 entropy = 0.72 samples = 422 value = [551, 135] class = Not churn 2->4 6 entropy = 0.99 samples = 1974 value = [1705, 1407] class = Not churn 5->6 7 entropy = 0.9 samples = 477 value = [519, 243] class = Not churn 5->7 9 gender <= 0.5 entropy = 0.02 samples = 299 value = [480, 1] class = Not churn 8->9 12 MonthlyCharges <= 24.52 entropy = 0.21 samples = 766 value = [1133, 40] class = Not churn 8->12 10 entropy = 0.0 samples = 160 value = [266, 0] class = Not churn 9->10 11 entropy = 0.04 samples = 139 value = [214, 1] class = Not churn 9->11 13 entropy = 1.0 samples = 2 value = [2, 2] class = Not churn 12->13 14 entropy = 0.21 samples = 764 value = [1131, 38] class = Not churn 12->14
Tree 0 Contract_Two year <= 0.5 entropy = 0.83 samples = 4482 value = [5198, 1834] class = Not churn 1 InternetService_No <= 0.5 entropy = 0.92 samples = 3409 value = [3579, 1791] class = Not churn 0->1 True 8 InternetService_No <= 0.5 entropy = 0.17 samples = 1073 value = [1619, 43] class = Not churn 0->8 False 2 Contract_Month-to-month <= 0.5 entropy = 0.96 samples = 2841 value = [2776, 1680] class = Not churn 1->2 5 TotalCharges <= 36.35 entropy = 0.53 samples = 568 value = [803, 111] class = Not churn 1->5 3 entropy = 0.59 samples = 718 value = [970, 161] class = Not churn 2->3 4 entropy = 0.99 samples = 2123 value = [1806, 1519] class = Not churn 2->4 6 entropy = 0.93 samples = 112 value = [120, 64] class = Not churn 5->6 7 entropy = 0.34 samples = 456 value = [683, 47] class = Not churn 5->7 9 OnlineSecurity <= 0.5 entropy = 0.2 samples = 676 value = [1010, 33] class = Not churn 8->9 12 SeniorCitizen <= 0.5 entropy = 0.12 samples = 397 value = [609, 10] class = Not churn 8->12 10 entropy = 0.31 samples = 193 value = [295, 17] class = Not churn 9->10 11 entropy = 0.15 samples = 483 value = [715, 16] class = Not churn 9->11 13 entropy = 0.12 samples = 384 value = [593, 10] class = Not churn 12->13 14 entropy = 0.0 samples = 13 value = [16, 0] class = Not churn 12->14
Tree 0 InternetService_No <= 0.5 entropy = 0.83 samples = 4460 value = [5178, 1854] class = Not churn 1 OnlineSecurity <= 0.5 entropy = 0.9 samples = 3510 value = [3785, 1756] class = Not churn 0->1 True 8 Contract_One year <= 0.5 entropy = 0.35 samples = 950 value = [1393, 98] class = Not churn 0->8 False 2 TechSupport <= 0.5 entropy = 0.98 samples = 2226 value = [2068, 1439] class = Not churn 1->2 5 tenure <= 51.5 entropy = 0.62 samples = 1284 value = [1717, 317] class = Not churn 1->5 3 entropy = 1.0 samples = 1636 value = [1338, 1232] class = Not churn 2->3 4 entropy = 0.76 samples = 590 value = [730, 207] class = Not churn 2->4 6 entropy = 0.79 samples = 656 value = [789, 249] class = Not churn 5->6 7 entropy = 0.36 samples = 628 value = [928, 68] class = Not churn 5->7 9 Contract_Month-to-month <= 0.5 entropy = 0.42 samples = 713 value = [1018, 93] class = Not churn 8->9 12 TotalCharges <= 1022.38 entropy = 0.1 samples = 237 value = [375, 5] class = Not churn 8->12 10 entropy = 0.05 samples = 405 value = [633, 4] class = Not churn 9->10 11 entropy = 0.7 samples = 308 value = [385, 89] class = Not churn 9->11 13 entropy = 0.12 samples = 190 value = [295, 5] class = Not churn 12->13 14 entropy = 0.0 samples = 47 value = [80, 0] class = Not churn 12->14
Tree 0 tenure <= 16.5 entropy = 0.83 samples = 4471 value = [5186, 1846] class = Not churn 1 InternetService_No <= 0.5 entropy = 0.99 samples = 1578 value = [1370, 1113] class = Not churn 0->1 True 8 Contract_Month-to-month <= 0.5 entropy = 0.64 samples = 2893 value = [3816, 733] class = Not churn 0->8 False 2 PaymentMethod_Electronic check <= 0.5 entropy = 1.0 samples = 1202 value = [873, 1009] class = Churn 1->2 5 tenure <= 1.5 entropy = 0.66 samples = 376 value = [497, 104] class = Not churn 1->5 3 entropy = 0.99 samples = 553 value = [498, 377] class = Not churn 2->3 4 entropy = 0.95 samples = 649 value = [375, 632] class = Churn 2->4 6 entropy = 0.94 samples = 95 value = [102, 57] class = Not churn 5->6 7 entropy = 0.49 samples = 281 value = [395, 47] class = Not churn 5->7 9 Contract_Two year <= 0.5 entropy = 0.38 samples = 1854 value = [2650, 208] class = Not churn 8->9 12 InternetService_Fiber optic <= 0.5 entropy = 0.89 samples = 1039 value = [1166, 525] class = Not churn 8->12 10 entropy = 0.52 samples = 842 value = [1151, 153] class = Not churn 9->10 11 entropy = 0.22 samples = 1012 value = [1499, 55] class = Not churn 9->11 13 entropy = 0.66 samples = 359 value = [481, 100] class = Not churn 12->13 14 entropy = 0.96 samples = 680 value = [685, 425] class = Not churn 12->14
Tree 0 Contract_Month-to-month <= 0.5 entropy = 0.84 samples = 4449 value = [5154, 1878] class = Not churn 1 InternetService_Fiber optic <= 0.5 entropy = 0.36 samples = 2018 value = [2957, 220] class = Not churn 0->1 True 8 InternetService_Fiber optic <= 0.5 entropy = 0.99 samples = 2431 value = [2197, 1658] class = Not churn 0->8 False 2 PaymentMethod_Electronic check <= 0.5 entropy = 0.24 samples = 1423 value = [2161, 87] class = Not churn 1->2 5 PaymentMethod_Credit card (automatic) <= 0.5 entropy = 0.59 samples = 595 value = [796, 133] class = Not churn 1->5 3 entropy = 0.2 samples = 1269 value = [1945, 63] class = Not churn 2->3 4 entropy = 0.47 samples = 154 value = [216, 24] class = Not churn 2->4 6 entropy = 0.64 samples = 406 value = [532, 103] class = Not churn 5->6 7 entropy = 0.48 samples = 189 value = [264, 30] class = Not churn 5->7 9 tenure <= 5.5 entropy = 0.85 samples = 1082 value = [1237, 476] class = Not churn 8->9 12 tenure <= 15.5 entropy = 0.99 samples = 1349 value = [960, 1182] class = Churn 8->12 10 entropy = 0.96 samples = 446 value = [436, 277] class = Not churn 9->10 11 entropy = 0.72 samples = 636 value = [801, 199] class = Not churn 9->11 13 entropy = 0.89 samples = 667 value = [322, 728] class = Churn 12->13 14 entropy = 0.98 samples = 682 value = [638, 454] class = Not churn 12->14
Tree 0 tenure <= 10.5 entropy = 0.84 samples = 4415 value = [5163, 1869] class = Not churn 1 InternetService_Fiber optic <= 0.5 entropy = 1.0 samples = 1236 value = [977, 994] class = Churn 0->1 True 8 MonthlyCharges <= 29.18 entropy = 0.66 samples = 3179 value = [4186, 875] class = Not churn 0->8 False 2 InternetService_DSL <= 0.5 entropy = 0.94 samples = 707 value = [726, 408] class = Not churn 1->2 5 gender <= 0.5 entropy = 0.88 samples = 529 value = [251, 586] class = Churn 1->5 3 entropy = 0.76 samples = 279 value = [346, 97] class = Not churn 2->3 4 entropy = 0.99 samples = 428 value = [380, 311] class = Not churn 2->4 6 entropy = 0.86 samples = 265 value = [116, 288] class = Churn 5->6 7 entropy = 0.9 samples = 264 value = [135, 298] class = Churn 5->7 9 Contract_Month-to-month <= 0.5 entropy = 0.12 samples = 665 value = [1012, 17] class = Not churn 8->9 12 Contract_Month-to-month <= 0.5 entropy = 0.75 samples = 2514 value = [3174, 858] class = Not churn 8->12 10 entropy = 0.07 samples = 563 value = [873, 8] class = Not churn 9->10 11 entropy = 0.33 samples = 102 value = [139, 9] class = Not churn 9->11 13 entropy = 0.45 samples = 1358 value = [1996, 211] class = Not churn 12->13 14 entropy = 0.94 samples = 1156 value = [1178, 647] class = Not churn 12->14
Tree 0 MonthlyCharges <= 68.8 entropy = 0.82 samples = 4419 value = [5225, 1807] class = Not churn 1 Contract_Two year <= 0.5 entropy = 0.63 samples = 2063 value = [2774, 522] class = Not churn 0->1 True 8 InternetService_DSL <= 0.5 entropy = 0.93 samples = 2356 value = [2451, 1285] class = Not churn 0->8 False 2 PaymentMethod_Credit card (automatic) <= 0.5 entropy = 0.75 samples = 1497 value = [1895, 510] class = Not churn 1->2 5 tenure <= 24.5 entropy = 0.1 samples = 566 value = [879, 12] class = Not churn 1->5 3 entropy = 0.78 samples = 1249 value = [1535, 462] class = Not churn 2->3 4 entropy = 0.52 samples = 248 value = [360, 48] class = Not churn 2->4 6 entropy = 0.0 samples = 90 value = [139, 0] class = Not churn 5->6 7 entropy = 0.12 samples = 476 value = [740, 12] class = Not churn 5->7 9 TechSupport <= 0.5 entropy = 0.97 samples = 1919 value = [1812, 1233] class = Not churn 8->9 12 MonthlyCharges <= 80.35 entropy = 0.39 samples = 437 value = [639, 52] class = Not churn 8->12 10 entropy = 1.0 samples = 1388 value = [1157, 1039] class = Not churn 9->10 11 entropy = 0.78 samples = 531 value = [655, 194] class = Not churn 9->11 13 entropy = 0.53 samples = 252 value = [358, 49] class = Not churn 12->13 14 entropy = 0.08 samples = 185 value = [281, 3] class = Not churn 12->14
Tree 0 TechSupport <= 0.5 entropy = 0.83 samples = 4448 value = [5187, 1845] class = Not churn 1 Contract_Month-to-month <= 0.5 entropy = 0.89 samples = 3146 value = [3441, 1540] class = Not churn 0->1 True 8 InternetService_Fiber optic <= 0.5 entropy = 0.61 samples = 1302 value = [1746, 305] class = Not churn 0->8 False 2 StreamingMovies <= 0.5 entropy = 0.33 samples = 1145 value = [1722, 111] class = Not churn 1->2 5 PaperlessBilling <= 0.5 entropy = 0.99 samples = 2001 value = [1719, 1429] class = Not churn 1->5 3 entropy = 0.14 samples = 827 value = [1281, 25] class = Not churn 2->3 4 entropy = 0.64 samples = 318 value = [441, 86] class = Not churn 2->4 6 entropy = 0.9 samples = 680 value = [727, 337] class = Not churn 5->6 7 entropy = 1.0 samples = 1321 value = [992, 1092] class = Churn 5->7 9 Contract_Month-to-month <= 0.5 entropy = 0.42 samples = 754 value = [1080, 99] class = Not churn 8->9 12 Contract_Two year <= 0.5 entropy = 0.79 samples = 548 value = [666, 206] class = Not churn 8->12 10 entropy = 0.24 samples = 534 value = [800, 33] class = Not churn 9->10 11 entropy = 0.7 samples = 220 value = [280, 66] class = Not churn 9->11 13 entropy = 0.89 samples = 356 value = [403, 178] class = Not churn 12->13 14 entropy = 0.46 samples = 192 value = [263, 28] class = Not churn 12->14
Tree 0 Contract_Two year <= 0.5 entropy = 0.83 samples = 4392 value = [5184, 1848] class = Not churn 1 PaperlessBilling <= 0.5 entropy = 0.92 samples = 3337 value = [3555, 1797] class = Not churn 0->1 True 8 PaymentMethod_Electronic check <= 0.5 entropy = 0.2 samples = 1055 value = [1629, 51] class = Not churn 0->8 False 2 Contract_Month-to-month <= 0.5 entropy = 0.77 samples = 1213 value = [1539, 444] class = Not churn 1->2 5 InternetService_DSL <= 0.5 entropy = 0.97 samples = 2124 value = [2016, 1353] class = Not churn 1->5 3 entropy = 0.39 samples = 422 value = [657, 54] class = Not churn 2->3 4 entropy = 0.89 samples = 791 value = [882, 390] class = Not churn 2->4 6 entropy = 0.99 samples = 1503 value = [1287, 1070] class = Not churn 5->6 7 entropy = 0.85 samples = 621 value = [729, 283] class = Not churn 5->7 9 InternetService_No <= 0.5 entropy = 0.16 samples = 956 value = [1480, 34] class = Not churn 8->9 12 PhoneService <= 0.5 entropy = 0.48 samples = 99 value = [149, 17] class = Not churn 8->12 10 entropy = 0.21 samples = 577 value = [879, 30] class = Not churn 9->10 11 entropy = 0.06 samples = 379 value = [601, 4] class = Not churn 9->11 13 entropy = 0.95 samples = 8 value = [7, 4] class = Not churn 12->13 14 entropy = 0.42 samples = 91 value = [142, 13] class = Not churn 12->14
In [26]:
#making 10 trees with random forest for columns 
#selected from recursive feature elimination

n = np.arange(0,10).tolist()
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist() 
for i in n :
    plot_tree_randomforest(cols,10,i,3,"gini",model_performance=False)
Tree 0 tenure <= 17.5 gini = 0.39 samples = 4403 value = [5139, 1893] class = Not churn 1 InternetService_No <= 0.5 gini = 0.5 samples = 1630 value = [1396, 1199] class = Not churn 0->1 True 8 Contract_Two year <= 0.5 gini = 0.26 samples = 2773 value = [3743, 694] class = Not churn 0->8 False 2 TechSupport <= 0.5 gini = 0.49 samples = 1249 value = [888, 1101] class = Churn 1->2 5 Contract_Two year <= 0.5 gini = 0.27 samples = 381 value = [508, 98] class = Not churn 1->5 3 gini = 0.48 samples = 1030 value = [684, 980] class = Churn 2->3 4 gini = 0.47 samples = 219 value = [204, 121] class = Not churn 2->4 6 gini = 0.31 samples = 330 value = [423, 98] class = Not churn 5->6 7 gini = 0.0 samples = 51 value = [85, 0] class = Not churn 5->7 9 TotalCharges <= 1447.8 gini = 0.35 samples = 1764 value = [2207, 643] class = Not churn 8->9 12 TotalCharges <= 4627.82 gini = 0.06 samples = 1009 value = [1536, 51] class = Not churn 8->12 10 gini = 0.17 samples = 375 value = [547, 58] class = Not churn 9->10 11 gini = 0.39 samples = 1389 value = [1660, 585] class = Not churn 9->11 13 gini = 0.02 samples = 567 value = [865, 8] class = Not churn 12->13 14 gini = 0.11 samples = 442 value = [671, 43] class = Not churn 12->14
Tree 0 InternetService_Fiber optic <= 0.5 gini = 0.39 samples = 4485 value = [5198, 1834] class = Not churn 1 Contract_Two year <= 0.5 gini = 0.25 samples = 2513 value = [3381, 576] class = Not churn 0->1 True 8 Contract_Month-to-month <= 0.5 gini = 0.48 samples = 1972 value = [1817, 1258] class = Not churn 0->8 False 2 Contract_Month-to-month <= 0.5 gini = 0.33 samples = 1726 value = [2169, 566] class = Not churn 1->2 5 InternetService_No <= 0.5 gini = 0.02 samples = 787 value = [1212, 10] class = Not churn 1->5 3 gini = 0.12 samples = 589 value = [893, 61] class = Not churn 2->3 4 gini = 0.41 samples = 1137 value = [1276, 505] class = Not churn 2->4 6 gini = 0.02 samples = 376 value = [549, 7] class = Not churn 5->6 7 gini = 0.01 samples = 411 value = [663, 3] class = Not churn 5->7 9 Contract_Two year <= 0.5 gini = 0.23 samples = 628 value = [846, 131] class = Not churn 8->9 12 tenure <= 15.5 gini = 0.5 samples = 1344 value = [971, 1127] class = Churn 8->12 10 gini = 0.29 samples = 359 value = [449, 98] class = Not churn 9->10 11 gini = 0.14 samples = 269 value = [397, 33] class = Not churn 9->11 13 gini = 0.43 samples = 656 value = [326, 701] class = Churn 12->13 14 gini = 0.48 samples = 688 value = [645, 426] class = Not churn 12->14
Tree 0 Contract_Two year <= 0.5 gini = 0.39 samples = 4466 value = [5144, 1888] class = Not churn 1 OnlineSecurity <= 0.5 gini = 0.45 samples = 3407 value = [3507, 1817] class = Not churn 0->1 True 8 TotalCharges <= 6867.55 gini = 0.08 samples = 1059 value = [1637, 71] class = Not churn 0->8 False 2 InternetService_No <= 0.5 gini = 0.47 samples = 2603 value = [2472, 1567] class = Not churn 1->2 5 tenure <= 22.5 gini = 0.31 samples = 804 value = [1035, 250] class = Not churn 1->5 3 gini = 0.5 samples = 2020 value = [1694, 1451] class = Not churn 2->3 4 gini = 0.23 samples = 583 value = [778, 116] class = Not churn 2->4 6 gini = 0.42 samples = 259 value = [288, 121] class = Not churn 5->6 7 gini = 0.25 samples = 545 value = [747, 129] class = Not churn 5->7 9 tenure <= 67.5 gini = 0.05 samples = 908 value = [1424, 39] class = Not churn 8->9 12 tenure <= 71.5 gini = 0.23 samples = 151 value = [213, 32] class = Not churn 8->12 10 gini = 0.07 samples = 604 value = [941, 35] class = Not churn 9->10 11 gini = 0.02 samples = 304 value = [483, 4] class = Not churn 9->11 13 gini = 0.35 samples = 74 value = [93, 27] class = Not churn 12->13 14 gini = 0.08 samples = 77 value = [120, 5] class = Not churn 12->14
Tree 0 tenure <= 16.5 gini = 0.39 samples = 4438 value = [5164, 1868] class = Not churn 1 Contract_Month-to-month <= 0.5 gini = 0.49 samples = 1615 value = [1441, 1158] class = Not churn 0->1 True 8 InternetService_No <= 0.5 gini = 0.27 samples = 2823 value = [3723, 710] class = Not churn 0->8 False 2 TotalCharges <= 37.08 gini = 0.07 samples = 180 value = [279, 11] class = Not churn 1->2 5 InternetService_No <= 0.5 gini = 0.5 samples = 1435 value = [1162, 1147] class = Not churn 1->5 3 gini = 0.28 samples = 11 value = [15, 3] class = Not churn 2->3 4 gini = 0.06 samples = 169 value = [264, 8] class = Not churn 2->4 6 gini = 0.49 samples = 1172 value = [837, 1069] class = Churn 5->6 7 gini = 0.31 samples = 263 value = [325, 78] class = Not churn 5->7 9 InternetService_Fiber optic <= 0.5 gini = 0.32 samples = 2254 value = [2838, 693] class = Not churn 8->9 12 Contract_Month-to-month <= 0.5 gini = 0.04 samples = 569 value = [885, 17] class = Not churn 8->12 10 gini = 0.18 samples = 1013 value = [1436, 156] class = Not churn 9->10 11 gini = 0.4 samples = 1241 value = [1402, 537] class = Not churn 9->11 13 gini = 0.03 samples = 511 value = [785, 13] class = Not churn 12->13 14 gini = 0.07 samples = 58 value = [100, 4] class = Not churn 12->14
Tree 0 tenure <= 17.5 gini = 0.4 samples = 4454 value = [5127, 1905] class = Not churn 1 InternetService_No <= 0.5 gini = 0.5 samples = 1668 value = [1432, 1199] class = Not churn 0->1 True 8 InternetService_No <= 0.5 gini = 0.27 samples = 2786 value = [3695, 706] class = Not churn 0->8 False 2 TotalCharges <= 106.97 gini = 0.49 samples = 1258 value = [873, 1088] class = Churn 1->2 5 tenure <= 1.5 gini = 0.28 samples = 410 value = [559, 111] class = Not churn 1->5 3 gini = 0.41 samples = 323 value = [141, 345] class = Churn 2->3 4 gini = 0.5 samples = 935 value = [732, 743] class = Churn 2->4 6 gini = 0.45 samples = 118 value = [131, 70] class = Not churn 5->6 7 gini = 0.16 samples = 292 value = [428, 41] class = Not churn 5->7 9 Contract_Month-to-month <= 0.5 gini = 0.32 samples = 2229 value = [2830, 692] class = Not churn 8->9 12 TotalCharges <= 392.0 gini = 0.03 samples = 557 value = [865, 14] class = Not churn 8->12 10 gini = 0.18 samples = 1297 value = [1833, 203] class = Not churn 9->10 11 gini = 0.44 samples = 932 value = [997, 489] class = Not churn 9->11 13 gini = 0.15 samples = 17 value = [22, 2] class = Not churn 12->13 14 gini = 0.03 samples = 540 value = [843, 12] class = Not churn 12->14
Tree 0 Contract_Month-to-month <= 0.5 gini = 0.39 samples = 4444 value = [5170, 1862] class = Not churn 1 InternetService_No <= 0.5 gini = 0.11 samples = 1972 value = [2957, 191] class = Not churn 0->1 True 8 InternetService_No <= 0.5 gini = 0.49 samples = 2472 value = [2213, 1671] class = Not churn 0->8 False 2 InternetService_Fiber optic <= 0.5 gini = 0.16 samples = 1366 value = [1994, 189] class = Not churn 1->2 5 TotalCharges <= 929.43 gini = 0.0 samples = 606 value = [963, 2] class = Not churn 1->5 3 gini = 0.1 samples = 732 value = [1120, 62] class = Not churn 2->3 4 gini = 0.22 samples = 634 value = [874, 127] class = Not churn 2->4 6 gini = 0.0 samples = 302 value = [491, 0] class = Not churn 5->6 7 gini = 0.01 samples = 304 value = [472, 2] class = Not churn 5->7 9 TotalCharges <= 211.57 gini = 0.5 samples = 2149 value = [1799, 1561] class = Not churn 8->9 12 TotalCharges <= 36.52 gini = 0.33 samples = 323 value = [414, 110] class = Not churn 8->12 10 gini = 0.45 samples = 500 value = [270, 509] class = Churn 9->10 11 gini = 0.48 samples = 1649 value = [1529, 1052] class = Not churn 9->11 13 gini = 0.49 samples = 109 value = [94, 73] class = Not churn 12->13 14 gini = 0.19 samples = 214 value = [320, 37] class = Not churn 12->14
Tree 0 InternetService_Fiber optic <= 0.5 gini = 0.39 samples = 4428 value = [5195, 1837] class = Not churn 1 TotalCharges <= 223.6 gini = 0.23 samples = 2450 value = [3405, 532] class = Not churn 0->1 True 8 PaperlessBilling <= 0.5 gini = 0.49 samples = 1978 value = [1790, 1305] class = Not churn 0->8 False 2 Contract_Month-to-month <= 0.5 gini = 0.46 samples = 533 value = [523, 294] class = Not churn 1->2 5 Contract_Month-to-month <= 0.5 gini = 0.14 samples = 1917 value = [2882, 238] class = Not churn 1->5 3 gini = 0.08 samples = 68 value = [92, 4] class = Not churn 2->3 4 gini = 0.48 samples = 465 value = [431, 290] class = Not churn 2->4 6 gini = 0.06 samples = 1324 value = [2077, 69] class = Not churn 5->6 7 gini = 0.29 samples = 593 value = [805, 169] class = Not churn 5->7 9 TotalCharges <= 1961.67 gini = 0.44 samples = 464 value = [499, 235] class = Not churn 8->9 12 tenure <= 39.5 gini = 0.5 samples = 1514 value = [1291, 1070] class = Not churn 8->12 10 gini = 0.49 samples = 188 value = [130, 161] class = Churn 9->10 11 gini = 0.28 samples = 276 value = [369, 74] class = Not churn 9->11 13 gini = 0.47 samples = 887 value = [505, 867] class = Churn 12->13 14 gini = 0.33 samples = 627 value = [786, 203] class = Not churn 12->14
Tree 0 InternetService_Fiber optic <= 0.5 gini = 0.38 samples = 4460 value = [5250, 1782] class = Not churn 1 tenure <= 5.5 gini = 0.24 samples = 2501 value = [3389, 551] class = Not churn 0->1 True 8 tenure <= 17.5 gini = 0.48 samples = 1959 value = [1861, 1231] class = Not churn 0->8 False 2 InternetService_No <= 0.5 gini = 0.49 samples = 483 value = [444, 321] class = Not churn 1->2 5 PhoneService <= 0.5 gini = 0.13 samples = 2018 value = [2945, 230] class = Not churn 1->5 3 gini = 0.5 samples = 281 value = [208, 239] class = Churn 2->3 4 gini = 0.38 samples = 202 value = [236, 82] class = Not churn 2->4 6 gini = 0.26 samples = 356 value = [469, 84] class = Not churn 5->6 7 gini = 0.11 samples = 1662 value = [2476, 146] class = Not churn 5->7 9 OnlineSecurity <= 0.5 gini = 0.45 samples = 712 value = [372, 741] class = Churn 8->9 12 Contract_Month-to-month <= 0.5 gini = 0.37 samples = 1247 value = [1489, 490] class = Not churn 8->12 10 gini = 0.43 samples = 633 value = [308, 679] class = Churn 9->10 11 gini = 0.5 samples = 79 value = [64, 62] class = Not churn 9->11 13 gini = 0.24 samples = 618 value = [843, 134] class = Not churn 12->13 14 gini = 0.46 samples = 629 value = [646, 356] class = Not churn 12->14
Tree 0 InternetService_Fiber optic <= 0.5 gini = 0.39 samples = 4488 value = [5189, 1843] class = Not churn 1 tenure <= 5.5 gini = 0.25 samples = 2480 value = [3346, 561] class = Not churn 0->1 True 8 TotalCharges <= 1537.4 gini = 0.48 samples = 2008 value = [1843, 1282] class = Not churn 0->8 False 2 InternetService_No <= 0.5 gini = 0.48 samples = 488 value = [458, 313] class = Not churn 1->2 5 Contract_Month-to-month <= 0.5 gini = 0.15 samples = 1992 value = [2888, 248] class = Not churn 1->5 3 gini = 0.5 samples = 289 value = [230, 228] class = Not churn 2->3 4 gini = 0.4 samples = 199 value = [228, 85] class = Not churn 2->4 6 gini = 0.07 samples = 1364 value = [2072, 74] class = Not churn 5->6 7 gini = 0.29 samples = 628 value = [816, 174] class = Not churn 5->7 9 TotalCharges <= 121.78 gini = 0.45 samples = 715 value = [388, 746] class = Churn 8->9 12 tenure <= 54.5 gini = 0.39 samples = 1293 value = [1455, 536] class = Not churn 8->12 10 gini = 0.23 samples = 147 value = [29, 189] class = Churn 9->10 11 gini = 0.48 samples = 568 value = [359, 557] class = Churn 9->11 13 gini = 0.46 samples = 745 value = [758, 411] class = Not churn 12->13 14 gini = 0.26 samples = 548 value = [697, 125] class = Not churn 12->14
Tree 0 InternetService_Fiber optic <= 0.5 gini = 0.39 samples = 4475 value = [5163, 1869] class = Not churn 1 tenure <= 5.5 gini = 0.24 samples = 2549 value = [3454, 554] class = Not churn 0->1 True 8 TechSupport <= 0.5 gini = 0.49 samples = 1926 value = [1709, 1315] class = Not churn 0->8 False 2 PaperlessBilling <= 0.5 gini = 0.48 samples = 496 value = [458, 305] class = Not churn 1->2 5 tenure <= 22.5 gini = 0.14 samples = 2053 value = [2996, 249] class = Not churn 1->5 3 gini = 0.45 samples = 264 value = [279, 142] class = Not churn 2->3 4 gini = 0.5 samples = 232 value = [179, 163] class = Not churn 2->4 6 gini = 0.26 samples = 616 value = [813, 150] class = Not churn 5->6 7 gini = 0.08 samples = 1437 value = [2183, 99] class = Not churn 5->7 9 tenure <= 42.5 gini = 0.5 samples = 1396 value = [1081, 1120] class = Churn 8->9 12 Contract_Two year <= 0.5 gini = 0.36 samples = 530 value = [628, 195] class = Not churn 8->12 10 gini = 0.47 samples = 992 value = [588, 979] class = Churn 9->10 11 gini = 0.35 samples = 404 value = [493, 141] class = Not churn 9->11 13 gini = 0.44 samples = 348 value = [372, 179] class = Not churn 12->13 14 gini = 0.11 samples = 182 value = [256, 16] class = Not churn 12->14

GAUSSIAN NAIVE BAYES

In [27]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(priors=None)

telecom_churn_prediction_alg(gnb,os_smote_X,test_X,os_smote_Y,test_Y)
GaussianNB(priors=None, var_smoothing=1e-09)

 Classification report : 
               precision    recall  f1-score   support

           0       0.90      0.74      0.81      1268
           1       0.54      0.79      0.64       490

   micro avg       0.75      0.75      0.75      1758
   macro avg       0.72      0.77      0.73      1758
weighted avg       0.80      0.75      0.77      1758

Accuracy Score   :  0.7542662116040956
Area under curve :  0.7657921843816391

SUPPORT VECTOR MACHINE

Support Vector Machine” (SVM) is a supervised machine learning algorithm which can be used for both classification or regression challenges. it is mostly used in classification problems. In this algorithm, we plot each data item as a point in n-dimensional space .where n is number of features you have) with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyper-plane that differentiate the two classes

In [28]:
from sklearn.svm import SVC

#Support vector classifier
#using linear hyper plane
svc_lin  = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
               decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
               max_iter=-1, probability=True, random_state=None, shrinking=True,
               tol=0.001, verbose=False)

cols = [i for i in telcom.columns if i not in Id_col + target_col]
telecom_churn_prediction(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"coefficients",threshold_plot = False)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.91      0.73      0.81      1268
           1       0.54      0.81      0.64       490

   micro avg       0.75      0.75      0.75      1758
   macro avg       0.72      0.77      0.73      1758
weighted avg       0.80      0.75      0.76      1758

Accuracy   Score :  0.7508532423208191
Area under curve :  0.7684349449559004 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

TUNING PARAMETERS FOR SUPPORT VECTOR MACHINE

In [29]:
#tuning parameters
#Support vector classifier
#using non-linear hyper plane("rbf")

svc_rbf  = SVC(C=1.0, kernel='rbf', 
               degree= 3, gamma=1.0, 
               coef0=0.0, shrinking=True,
               probability=True,tol=0.001,
               cache_size=200, class_weight=None,
               verbose=False,max_iter= -1,
               random_state=None)

telecom_churn_prediction_alg(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,threshold_plot = False)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

 Classification report : 
               precision    recall  f1-score   support

           0       0.82      0.86      0.84      1268
           1       0.58      0.51      0.55       490

   micro avg       0.76      0.76      0.76      1758
   macro avg       0.70      0.69      0.69      1758
weighted avg       0.75      0.76      0.76      1758

Accuracy Score   :  0.7622298065984073
Area under curve :  0.6855388527650809

LightGBM CLASSIFIER

In [30]:
from lightgbm import LGBMClassifier

lgbm_c = LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.5, max_depth=7, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=500, objective='binary', random_state=None,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=0)

cols = [i for i in telcom.columns if i not in Id_col + target_col]
telecom_churn_prediction(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"features",threshold_plot = True)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.5, max_depth=7,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=500, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

 Classification report : 
               precision    recall  f1-score   support

           0       0.82      0.85      0.84      1268
           1       0.58      0.53      0.55       490

   micro avg       0.76      0.76      0.76      1758
   macro avg       0.70      0.69      0.70      1758
weighted avg       0.76      0.76      0.76      1758

Accuracy   Score :  0.7627986348122867
Area under curve :  0.6909418657052726 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

XGBoost CLASSIFIER

In [31]:
from xgboost import XGBClassifier

xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)


telecom_churn_prediction(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                         cols,"features",threshold_plot = True)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

 Classification report : 
               precision    recall  f1-score   support

           0       0.83      0.85      0.84      1268
           1       0.59      0.53      0.56       490

   micro avg       0.76      0.76      0.76      1758
   macro avg       0.71      0.69      0.70      1758
weighted avg       0.76      0.76      0.76      1758

Accuracy   Score :  0.764505119453925
Area under curve :  0.6933770037983648 

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3           -      ]

MODEL PERFORMANCES

MODEL PERFORMANCE METRICS

In [32]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)
    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    roc_auc      = roc_auc_score(testing_y,predictions)
    f1score      = f1_score(testing_y,predictions) 
    kappa_metric = cohen_kappa_score(testing_y,predictions)
    
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                       "Area_under_curve": [roc_auc],
                       "Kappa_metric"    : [kappa_metric],
                      })
    return df

#outputs for every model
model1 = model_report(logit,train_X,test_X,train_Y,test_Y,
                      "Logistic Regression(Baseline_model)")
model2 = model_report(logit_smote,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Logistic Regression(SMOTE)")
model3 = model_report(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                      "Logistic Regression(RFE)")
decision_tree = DecisionTreeClassifier(max_depth = 9,
                                       random_state = 123,
                                       splitter  = "best",
                                       criterion = "gini",
                                      )
model4 = model_report(decision_tree,train_X,test_X,train_Y,test_Y,
                      "Decision Tree")
model5 = model_report(knn,os_smote_X,test_X,os_smote_Y,test_Y,
                      "KNN Classifier")
rfc = RandomForestClassifier(n_estimators = 1000,
                             random_state = 123,
                             max_depth = 9,
                             criterion = "gini")
model6 = model_report(rfc,train_X,test_X,train_Y,test_Y,
                      "Random Forest Classifier")
model7 = model_report(gnb,os_smote_X,test_X,os_smote_Y,test_Y,
                      "Naive Bayes")
model8 = model_report(svc_lin,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier Linear")
model9 = model_report(svc_rbf,os_smote_X,test_X,os_smote_Y,test_Y,
                      "SVM Classifier RBF")
model10 = model_report(lgbm_c,os_smote_X,test_X,os_smote_Y,test_Y,
                      "LGBM Classifier")
model11 = model_report(xgc,os_smote_X,test_X,os_smote_Y,test_Y,
                      "XGBoost Classifier")

#concat all models
model_performances = pd.concat([model1,model2,model3,
                                model4,model5,model6,
                                model7,model8,model9,
                                model10,model11],axis = 0).reset_index()

model_performances = model_performances.drop(columns = "index",axis =1)

table  = ff.create_table(np.round(model_performances,4))

py.iplot(table)

COMPARISION OF MODEL METRICS

In [33]:
model_performances
def output_tracer(metric,color) :
    tracer = go.Bar(y = model_performances["Model"] ,
                    x = model_performances[metric],
                    orientation = "h",name = metric ,
                    marker = dict(line = dict(width =.7),
                                  color = color)
                   )
    return tracer

layout = go.Layout(dict(title = "Model performances",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "metric",
                                     zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(l = 250),
                        height = 780
                       )
                  )


trace1  = output_tracer("Accuracy_score","#6699FF")
trace2  = output_tracer('Recall_score',"red")
trace3  = output_tracer('Precision',"#33CC99")
trace4  = output_tracer('f1_score',"lightgrey")
trace5  = output_tracer('Kappa_metric',"#FFCC99")

data = [trace1,trace2,trace3,trace4,trace5]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

CONFUSION MATRICES FOR MODELS

In [34]:
lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

fig = plt.figure(figsize=(13,15))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    plt.subplot(4,3,j+1)
    predictions = i.predict(test_X)
    conf_matrix = confusion_matrix(predictions,test_Y)
    sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True,
                xticklabels=["not churn","churn"],
                yticklabels=["not churn","churn"],
                linewidths = 2,linecolor = "w",cmap = "Set1")
    plt.title(k,color = "b")
    plt.subplots_adjust(wspace = .3,hspace = .3)

ROC CURVES FOR MODELS

In [35]:
lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

plt.style.use("classic")
fig = plt.figure(figsize=(12,16))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    predictions   = i.predict(test_X)
    fpr,tpr,thresholds = roc_curve(test_Y,probabilities[:,1])
    plt.plot(fpr,tpr,linestyle = "dotted",
             color = "royalblue",linewidth = 2,
             label = "AUC = " + str(np.around(roc_auc_score(test_Y,predictions),3)))
    plt.plot([0,1],[0,1],linestyle = "dashed",
             color = "orangered",linewidth = 1.5)
    plt.fill_between(fpr,tpr,alpha = .4)
    plt.fill_between([0,1],[0,1],color = "k")
    plt.legend(loc = "lower right",
               prop = {"size" : 12})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xticks(np.arange(0,1,.3))
    plt.yticks(np.arange(0,1,.3))

PRECISION RECALL CURVES

In [36]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score


lst    = [logit,logit_smote,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(SMOTE)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

fig = plt.figure(figsize=(13,17))
fig.set_facecolor("#F3F3F3")
for i,j,k in itertools.zip_longest(lst,range(length),mods) :
    
    qx = plt.subplot(4,3,j+1)
    probabilities = i.predict_proba(test_X)
    predictions   = i.predict(test_X)
    recall,precision,thresholds = precision_recall_curve(test_Y,probabilities[:,1])
    plt.plot(recall,precision,linewidth = 1.5,
             label = ("avg_pcn : " + 
                      str(np.around(average_precision_score(test_Y,predictions),3))))
    plt.plot([0,1],[0,0],linestyle = "dashed")
    plt.fill_between(recall,precision,alpha = .2)
    plt.legend(loc = "lower left",
               prop = {"size" : 10})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(k,color = "b")
    plt.xlabel("recall",fontsize =7)
    plt.ylabel("precision",fontsize =7)
    plt.xlim([0.25,1])
    plt.yticks(np.arange(0,1,.3))