import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import chi2_contingency
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE


%matplotlib inline


os.chdir(r'D:\MyDrive2\pythonprojects\class2\DataMining\homework\ChunrAnalysis')
workdir = os.getcwd()
data_dir = os.path.join(workdir, 'data')

train = pd.read_csv(os.path.join(data_dir, 'Telco-Customer-Churn.csv'))
train.drop('customerID', axis=1, inplace=True)
train0 = train.copy()

train

y = train['Churn']
train.drop(['Churn'], inplace=True, axis=1)
train_y = train.copy()
train_y.drop(['TotalCharges', 'MonthlyCharges', 'tenure'], inplace=True, axis=1)
dup_list = train_y.duplicated()
print(f"{dup_list.sum()} rows are duplicated if do not consider target")
train_y = train.join(y).copy()
total_dup_list = train_y.duplicated()
print(f"{total_dup_list.sum()} rows are duplicated if consider target")
conflict_list = dup_list & ~total_dup_list
print(f"{conflict_list.sum()} rows are conflict")
train_y.drop(train[conflict_list].index, inplace=True)
y = train_y['Churn']
train = train_y.drop(columns=['Churn'])

train = train0.copy()

2620 rows are duplicated if do not consider target
22 rows are duplicated if consider target
2598 rows are conflict

print(len(train[train.duplicated()]))
print(len(train[train.drop(['Churn'], axis=1).duplicated()]))

total_dup_list = train.duplicated()
feat_dup_list = train.drop(['Churn'], axis=1).duplicated()

conflict_list = feat_dup_list & ~total_dup_list
conflict_list.sum()

22
40

np.int64(18)

len(train[conflict_list].index)

18

train.drop(train[conflict_list].index, inplace=True)

train

train['Contract'].value_counts()

Contract
Month-to-month    3857
Two year          1695
One year          1473
Name: count, dtype: int64

train['PaymentMethod'].value_counts()

PaymentMethod
Electronic check             2363
Mailed check                 1596
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

train.nunique()

gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

train.describe()

train.isnull().sum().sum()

np.int64(0)

plt.figure(figsize=(20, 5))
plt.hist(train['tenure'], bins=100);

train['tenure'].value_counts().sort_index()

tenure
0      11
1     595
2     238
3     200
4     176
     ... 
68    100
69     95
70    119
71    170
72    362
Name: count, Length: 73, dtype: int64

plt.bar(train['SeniorCitizen'].value_counts().index, train['SeniorCitizen'].value_counts())

<BarContainer object of 2 artists>

plt.hist(train['MonthlyCharges'], bins=50);

train['TotalCharges_as_float'] = pd.to_numeric(train['TotalCharges'], errors='coerce')
non_float_entries = train[train['TotalCharges_as_float'].isna()]['TotalCharges']
non_float_entries

488      
753      
936      
1082     
1340     
3331     
3826     
4380     
5218     
6670     
6754     
Name: TotalCharges, dtype: object

train.dropna(inplace=True)
train.drop('TotalCharges_as_float', axis=1, inplace=True)

train['TotalCharges'] = train['TotalCharges'].astype(float)

for f in train.columns:
    print(f, '\t\t', train[f].dtype, '\t\t', train[f].nunique(), '\t', end="")
    if train[f].unique().size <= 4:
        print(f"{train[f].unique()}")
    else:
        print(f"[]")

gender 		 object 		 2 	['Female' 'Male']
SeniorCitizen 		 int64 		 2 	[0 1]
Partner 		 object 		 2 	['Yes' 'No']
Dependents 		 object 		 2 	['No' 'Yes']
tenure 		 int64 		 72 	[]
PhoneService 		 object 		 2 	['No' 'Yes']
MultipleLines 		 object 		 3 	['No phone service' 'No' 'Yes']
InternetService 		 object 		 3 	['DSL' 'Fiber optic' 'No']
OnlineSecurity 		 object 		 3 	['No' 'Yes' 'No internet service']
OnlineBackup 		 object 		 3 	['Yes' 'No' 'No internet service']
DeviceProtection 		 object 		 3 	['No' 'Yes' 'No internet service']
TechSupport 		 object 		 3 	['No' 'Yes' 'No internet service']
StreamingTV 		 object 		 3 	['No' 'Yes' 'No internet service']
StreamingMovies 		 object 		 3 	['No' 'Yes' 'No internet service']
Contract 		 object 		 3 	['Month-to-month' 'One year' 'Two year']
PaperlessBilling 		 object 		 2 	['Yes' 'No']
PaymentMethod 		 object 		 4 	['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges 		 float64 		 1584 	[]
TotalCharges 		 float64 		 6530 	[]
Churn 		 object 		 2 	['No' 'Yes']

#Convert SeniorCitizen to categorical
train['SeniorCitizen'] = train['SeniorCitizen'].astype('O')
train['SeniorCitizen'] = train['SeniorCitizen'].apply(lambda x: 'Yes' if x == 1 else 'No')

train['month'] = train['tenure'] % 12
train['month'] = train['month'].apply(lambda x: 12 if x == 0 else x)
train['month'] = train['month'].astype('category')

train['avg_charge'] = train['TotalCharges'] / train['tenure']
df = train[['tenure', 'avg_charge', 'MonthlyCharges', 'TotalCharges']]
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', cbar=False)
plt.yticks(rotation=0)

(array([0.5, 1.5, 2.5, 3.5]),
 [Text(0, 0.5, 'tenure'),
  Text(0, 1.5, 'avg_charge'),
  Text(0, 2.5, 'MonthlyCharges'),
  Text(0, 3.5, 'TotalCharges')])

# plt.scatter(x='MonthlyCharges', y='avg_charge', data=df)
df['Churn'] = train['Churn']
sns.scatterplot(x='MonthlyCharges', y='avg_charge', data=df, hue='Churn')
plt.xlabel('Monthly Charges')
plt.ylabel('Average Charge')
plt.title('Monthly Charges vs Average Charge')

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\868448621.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Churn'] = train['Churn']

Text(0.5, 1.0, 'Monthly Charges vs Average Charge')

train['Churn'] = train['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
train['Churn'] = train['Churn'].astype('int')

train['mc_vs_avg'] = train['MonthlyCharges'] / train['avg_charge']
plt.hist(train['mc_vs_avg'], bins=100)

(array([1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
        0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 1.000e+00, 2.000e+00, 4.000e+00, 0.000e+00,
        1.000e+00, 3.000e+00, 8.000e+00, 4.000e+00, 1.000e+00, 6.000e+00,
        8.000e+00, 8.000e+00, 8.000e+00, 1.400e+01, 1.300e+01, 8.000e+00,
        2.300e+01, 2.400e+01, 2.900e+01, 4.400e+01, 5.400e+01, 7.500e+01,
        8.800e+01, 1.140e+02, 1.590e+02, 2.210e+02, 2.890e+02, 3.760e+02,
        5.180e+02, 6.390e+02, 1.237e+03, 6.180e+02, 5.450e+02, 3.880e+02,
        3.030e+02, 2.770e+02, 1.730e+02, 1.540e+02, 1.060e+02, 6.200e+01,
        6.400e+01, 6.900e+01, 3.600e+01, 3.400e+01, 3.200e+01, 1.500e+01,
        2.000e+01, 1.500e+01, 1.300e+01, 1.000e+01, 1.900e+01, 6.000e+00,
        1.200e+01, 3.000e+00, 4.000e+00, 9.000e+00, 5.000e+00, 5.000e+00,
        5.000e+00, 2.000e+00, 1.000e+00, 5.000e+00, 2.000e+00, 1.000e+00,
        2.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 2.000e+00, 1.000e+00,
        1.000e+00, 1.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 2.000e+00]),
 array([0.63554464, 0.64369547, 0.65184631, 0.65999715, 0.66814798,
        0.67629882, 0.68444966, 0.6926005 , 0.70075133, 0.70890217,
        0.71705301, 0.72520385, 0.73335468, 0.74150552, 0.74965636,
        0.7578072 , 0.76595803, 0.77410887, 0.78225971, 0.79041054,
        0.79856138, 0.80671222, 0.81486306, 0.82301389, 0.83116473,
        0.83931557, 0.84746641, 0.85561724, 0.86376808, 0.87191892,
        0.88006975, 0.88822059, 0.89637143, 0.90452227, 0.9126731 ,
        0.92082394, 0.92897478, 0.93712562, 0.94527645, 0.95342729,
        0.96157813, 0.96972897, 0.9778798 , 0.98603064, 0.99418148,
        1.00233231, 1.01048315, 1.01863399, 1.02678483, 1.03493566,
        1.0430865 , 1.05123734, 1.05938818, 1.06753901, 1.07568985,
        1.08384069, 1.09199152, 1.10014236, 1.1082932 , 1.11644404,
        1.12459487, 1.13274571, 1.14089655, 1.14904739, 1.15719822,
        1.16534906, 1.1734999 , 1.18165074, 1.18980157, 1.19795241,
        1.20610325, 1.21425408, 1.22240492, 1.23055576, 1.2387066 ,
        1.24685743, 1.25500827, 1.26315911, 1.27130995, 1.27946078,
        1.28761162, 1.29576246, 1.30391329, 1.31206413, 1.32021497,
        1.32836581, 1.33651664, 1.34466748, 1.35281832, 1.36096916,
        1.36911999, 1.37727083, 1.38542167, 1.39357251, 1.40172334,
        1.40987418, 1.41802502, 1.42617585, 1.43432669, 1.44247753,
        1.45062837]),
 <BarContainer object of 100 artists>)

train.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'month', 'avg_charge',
       'mc_vs_avg'],
      dtype='object')

for f in train.columns:
    if train[f].nunique() > 12:
        continue
    if f == 'Churn':
        continue
    plt.figure()
    fig, ax = plt.subplots(figsize=(20, 5))
    # Calculate the percentage of target=1 per category value

    cat_perc = train[[f, 'Churn']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by='Churn', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% Churn', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

train['Churn'].value_counts()

Churn
0    5155
1    1859
Name: count, dtype: int64

numeric_columns = []
for f in train.columns:
    if train[f].nunique() > 12:
        numeric_columns.append(f)
numeric_columns

['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_charge', 'mc_vs_avg']

train[numeric_columns]

sns.pairplot(train[numeric_columns])

<seaborn.axisgrid.PairGrid at 0x1cd11743200>

numeric_columns

['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_charge', 'mc_vs_avg']

sns.heatmap(train[numeric_columns].corr(), annot=True)

<Axes: >

plt.figure(figsize=(30, 5))
sns.boxplot(x=train['tenure'], y=train['MonthlyCharges'])

<Axes: xlabel='tenure', ylabel='MonthlyCharges'>

train.describe()

scaler = StandardScaler()
df_std = train[numeric_columns].copy()
df_std[numeric_columns] = scaler.fit_transform(df_std)

train['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7014, dtype: int64

df_std['target'] = train['Churn'].astype('O')
df = df_std
sns.pairplot(df, hue='target')

<seaborn.axisgrid.PairGrid at 0x1cd0cbf2960>

df = df_std[df_std['target'] == 0].copy()
df.drop('target', axis=1, inplace=True)
pairplot = sns.pairplot(df)
pairplot.fig.suptitle('Existing Customer', y=1.02)

Text(0.5, 1.02, 'Existing Customer')

df = df_std[df_std['target'] == 1].copy()
df.drop('target', axis=1, inplace=True)
pairplot = sns.pairplot(df)
pairplot.fig.suptitle('Churned Customer', y=1.02)

Text(0.5, 1.02, 'Churned Customer')

sns.heatmap(train[numeric_columns].corr(), annot=True)

<Axes: >

scaler = StandardScaler()

train[numeric_columns] = scaler.fit_transform(train[numeric_columns])

y = train['Churn']

correlation_with_target = train[numeric_columns].join(y).corr()
correlation_with_target['Churn']

tenure           -0.353405
MonthlyCharges    0.195623
TotalCharges     -0.198494
avg_charge        0.194784
mc_vs_avg         0.011741
Churn             1.000000
Name: Churn, dtype: float64

plt.figure(figsize=(5, 8))
sns.heatmap(correlation_with_target[['Churn']], annot=True, cmap='coolwarm', cbar=True)

<Axes: >

train['binned_MonthlyCharges'] = pd.cut(train['MonthlyCharges'], bins=[i for i in
                                                                       np.arange(min(train['MonthlyCharges']),
                                                                                 max(train['MonthlyCharges']), (max(
                                                                               train['MonthlyCharges']) - min(
                                                                               train['MonthlyCharges'])) / 73)])
train['binned_TotalCharges'] = pd.cut(train['TotalCharges'], bins=[i for i in np.arange(min(train['TotalCharges']),
                                                                                        max(train['TotalCharges']),
                                                                                        (max(train['TotalCharges']) -
                                                                                         min(train[
                                                                                                 'TotalCharges'])) / 73)])

train['binned_mc_vs_avg'] = pd.cut(train['mc_vs_avg'], bins=[i for i in np.arange(min(train['mc_vs_avg']),
                                                                                  max(train['mc_vs_avg']),
                                                                                  (max(train['mc_vs_avg']) -
                                                                                   min(train['mc_vs_avg'])) / 73)])
train = train.dropna(subset=['binned_mc_vs_avg'])

train['binned_avg_charge'] = pd.cut(train['avg_charge'], bins=[i for i in np.arange(min(train['avg_charge']),
                                                                                    max(train['avg_charge']),
                                                                                    (max(train['avg_charge']) -
                                                                                     min(train['avg_charge'])) / 73)])

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\1908494919.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['binned_avg_charge'] = pd.cut(train['avg_charge'], bins=[i for i in np.arange(min(train['avg_charge']),

train['binned_MonthlyCharges'].describe()

count                7000
unique                 72
top       (-1.506, -1.46]
freq                  829
Name: binned_MonthlyCharges, dtype: object

from matplotlib.ticker import MaxNLocator

# from pandas.api.types import is_interval_dtype

interval_columns_bin = ['tenure', 'binned_MonthlyCharges', 'binned_TotalCharges', 'binned_mc_vs_avg',
                        'binned_avg_charge']
for f in interval_columns_bin:
    data = train[[f]].join(y)
    plt.figure()
    fig, ax = plt.subplots(figsize=(25, 5))

    # Calculate the percentage of target=1 per category value
    cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by=f, ascending=True, inplace=True)

    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f], color='red')
    # plt.xticks(rotation=90, ticks=[i for i in np.arange(min(train[f]),max(train[f]))])
    # plt.xticks(rotation=90, labels=[i for i in range(0, train['tenure'].nunique())],
    #            ticks=[i for i in range(0, train['tenure'].nunique())])
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.xticks(rotation=90)
    # ax.xaxis.set_major_locator(MaxNLocator(integer=True, nbins=73))
    # plt.ylim(0, 0.1)
    plt.show()

<Figure size 640x480 with 0 Axes>

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\4273029473.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()

<Figure size 640x480 with 0 Axes>

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\4273029473.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()

<Figure size 640x480 with 0 Axes>

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\4273029473.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()

<Figure size 640x480 with 0 Axes>

C:\Users\tropi\AppData\Local\Temp\ipykernel_13468\4273029473.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()

<Figure size 640x480 with 0 Axes>

a = [i for i in range(0, train['tenure'].nunique())]
b = a.copy()
b.remove(0)

b.extend([0])
np.subtract(np.array(b), np.array(a))
len(train0['tenure'].unique())
train['tenure'].nunique()

72

interval_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'mc_vs_avg', 'avg_charge']
for f in interval_columns:
    data = train[[f]].join(y)
    plt.figure()
    fig, ax = plt.subplots(figsize=(25, 5))
    # Calculate the percentage of target=1 per category value

    cat_perc = data[[f, 'Churn']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by='Churn', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    # sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f], color='blue')
    sns.kdeplot(data=data, x=f, hue='Churn')
    plt.xticks(rotation=90)
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

y.unique()

array([0, 1])

train

print(train['binned_MonthlyCharges'])

0       (-1.185, -1.139]
1       (-0.269, -0.223]
2       (-0.407, -0.361]
3       (-0.773, -0.727]
4         (0.189, 0.235]
              ...       
7038      (0.647, 0.693]
7039      (1.242, 1.288]
7040    (-1.185, -1.139]
7041      (0.281, 0.326]
7042       (1.334, 1.38]
Name: binned_MonthlyCharges, Length: 7011, dtype: category
Categories (72, interval[float64, right]): [(-1.552, -1.506] < (-1.506, -1.46] < (-1.46, -1.414] < (-1.414, -1.369] ... (1.563, 1.609] < (1.609, 1.655] < (1.655, 1.701] < (1.701, 1.746]]

# Identify categorical features (excluding target variable)
categorical_features = [col for col in train.columns if train[col].dtype == 'O' or train[col].nunique() <= 10]

# Store results
chi2_results = []

# Perform Chi-Square test for each categorical feature
for feature in categorical_features:
    print(f"Testing feature: {feature}")

    # Create a contingency table
    contingency_table = pd.crosstab(train[feature], y)

    # Perform Chi-Square test
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

    # Collect results
    chi2_results.append({
        'Feature': feature,
        'Chi2_Stat': chi2_stat,
        'p-value': p_value,
        'Degrees_of_Freedom': dof,
        'Significant': p_value < 0.05  # True if p-value < 0.05
    })

    print(f"Chi-Square Statistic: {chi2_stat}, p-value: {p_value}, Significant: {p_value < 0.05}\n")

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(chi2_results)

# Filter significant results
significant_features = results_df[results_df['Significant']]
print("Significant Features (p < 0.05):")
print(significant_features)

Testing feature: gender
Chi-Square Statistic: 0.5062156219588821, p-value: 0.4767816894213255, Significant: False

Testing feature: SeniorCitizen
Chi-Square Statistic: 160.10513402866923, p-value: 1.073193079735253e-36, Significant: True

Testing feature: Partner
Chi-Square Statistic: 155.28517991505026, p-value: 1.2130356962926863e-35, Significant: True

Testing feature: Dependents
Chi-Square Statistic: 184.33692924368518, p-value: 5.4771349693371126e-42, Significant: True

Testing feature: PhoneService
Chi-Square Statistic: 0.8035620385800925, p-value: 0.37003050788642045, Significant: False

Testing feature: MultipleLines
Chi-Square Statistic: 12.011745358588566, p-value: 0.0024642379205827634, Significant: True

Testing feature: InternetService
Chi-Square Statistic: 738.2754140566219, p-value: 4.847644630207988e-161, Significant: True

Testing feature: OnlineSecurity
Chi-Square Statistic: 857.1546402159107, p-value: 7.434209716515333e-187, Significant: True

Testing feature: OnlineBackup
Chi-Square Statistic: 609.3459819464206, p-value: 4.810622025549891e-133, Significant: True

Testing feature: DeviceProtection
Chi-Square Statistic: 566.0198983673617, p-value: 1.2312344138065643e-123, Significant: True

Testing feature: TechSupport
Chi-Square Statistic: 835.3742851533483, p-value: 3.988217336538509e-182, Significant: True

Testing feature: StreamingTV
Chi-Square Statistic: 382.6326915770964, p-value: 8.172721995373581e-84, Significant: True

Testing feature: StreamingMovies
Chi-Square Statistic: 384.4343348376759, p-value: 3.3200518247196693e-84, Significant: True

Testing feature: Contract
Chi-Square Statistic: 1175.9415045841956, p-value: 4.441674808222638e-256, Significant: True

Testing feature: PaperlessBilling
Chi-Square Statistic: 261.71587157481474, p-value: 7.252734065747982e-59, Significant: True

Testing feature: PaymentMethod
Chi-Square Statistic: 650.3389796080692, p-value: 1.2298753935495455e-140, Significant: True

Testing feature: Churn
Chi-Square Statistic: 7005.868726015489, p-value: 0.0, Significant: True

Significant Features (p < 0.05):
             Feature    Chi2_Stat        p-value  Degrees_of_Freedom  \
1      SeniorCitizen   160.105134   1.073193e-36                   1   
2            Partner   155.285180   1.213036e-35                   1   
3         Dependents   184.336929   5.477135e-42                   1   
5      MultipleLines    12.011745   2.464238e-03                   2   
6    InternetService   738.275414  4.847645e-161                   2   
7     OnlineSecurity   857.154640  7.434210e-187                   2   
8       OnlineBackup   609.345982  4.810622e-133                   2   
9   DeviceProtection   566.019898  1.231234e-123                   2   
10       TechSupport   835.374285  3.988217e-182                   2   
11       StreamingTV   382.632692   8.172722e-84                   2   
12   StreamingMovies   384.434335   3.320052e-84                   2   
13          Contract  1175.941505  4.441675e-256                   2   
14  PaperlessBilling   261.715872   7.252734e-59                   1   
15     PaymentMethod   650.338980  1.229875e-140                   3   
16             Churn  7005.868726   0.000000e+00                   1   

    Significant  
1          True  
2          True  
3          True  
5          True  
6          True  
7          True  
8          True  
9          True  
10         True  
11         True  
12         True  
13         True  
14         True  
15         True  
16         True

plt.hist(train.tenure)

(array([1703.,  735.,  561.,  538.,  473.,  444.,  452.,  495.,  501.,
        1109.]),
 array([-1.28459424, -0.99507305, -0.70555187, -0.41603068, -0.12650949,
         0.16301169,  0.45253288,  0.74205407,  1.03157525,  1.32109644,
         1.61061763]),
 <BarContainer object of 10 artists>)

plt.hist(train['month'], bins=12)

(array([990., 589., 567., 536., 508., 501., 495., 494., 467., 531., 570.,
        763.]),
 array([ 1.        ,  1.91666667,  2.83333333,  3.75      ,  4.66666667,
         5.58333333,  6.5       ,  7.41666667,  8.33333333,  9.25      ,
        10.16666667, 11.08333333, 12.        ]),
 <BarContainer object of 12 artists>)

train['binned_MonthlyCharges'].describe()

count                7000
unique                 72
top       (-1.506, -1.46]
freq                  829
Name: binned_MonthlyCharges, dtype: object

a = pd.DataFrame(train['binned_MonthlyCharges'].value_counts())
a.reset_index(inplace=True)
a['binned_MonthlyCharges'] = a['binned_MonthlyCharges'].astype(str)
plt.figure(figsize=(20, 5))
plt.bar(a['binned_MonthlyCharges'], a['count'])
plt.xticks(rotation=90);
# plt.hist(train['binned_MonthlyCharges'].value_counts().sort_index(), bins=72)

train['binned_MonthlyCharges'].value_counts().sort_index()

binned_MonthlyCharges
(-1.552, -1.506]    313
(-1.506, -1.46]     829
(-1.46, -1.414]      20
(-1.414, -1.369]     15
(-1.369, -1.323]    240
                   ... 
(1.517, 1.563]       59
(1.563, 1.609]       25
(1.609, 1.655]       34
(1.655, 1.701]       39
(1.701, 1.746]       25
Name: count, Length: 72, dtype: int64

a = pd.DataFrame(train['binned_mc_vs_avg'].value_counts())
a.reset_index(inplace=True)
a['binned_mc_vs_avg'] = a['binned_mc_vs_avg'].astype(str)
plt.figure(figsize=(20, 5))
# Ensure the data is sorted
# sorted_data = a.sort_values(by='binned_mc_vs_avg')  # Sort data by 'binned_mc_vs_avg'
# # Plot the data
# plt.bar(sorted_data['binned_mc_vs_avg'], sorted_data['count'])
# # Sort x-ticks
# plt.xticks(sorted_data['binned_mc_vs_avg'], sorted(sorted_data['binned_mc_vs_avg']))

plt.scatter(a['binned_mc_vs_avg'], a['count'])
plt.xticks(rotation=90);



# sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f], color='red')

# plt.plot(train['binned_mc_vs_avg'].value_counts().sort_index())

df = a.copy()
df['binned_mc_vs_avg'] = pd.Categorical(
    df['binned_mc_vs_avg'],
    categories=sorted(df['binned_mc_vs_avg'].unique(), key=lambda x: float(x.split(',')[0][1:])),
    ordered=True
)
df = df.sort_values(by='binned_mc_vs_avg').reset_index(drop=True)
plt.figure(figsize=(20, 5))
plt.scatter(df['binned_mc_vs_avg'], df['count'])
plt.xticks(rotation=90);

cat_meaningful_list = ['month', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                       'TechSupport', 'PaymentMethod', 'PaperlessBilling', 'Contract', 'Dependents', 'Partner',
                       'SeniorCitizen', 'StreamingMovies', 'PhoneService', 'StreamingTV', 'MultipleLines']


def set_spines_visibility(ax, is_visible):
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(is_visible)


fig = plt.figure(figsize=(16, 250))
gs = fig.add_gridspec(55, 2)

for i in range(len(cat_meaningful_list) + 1):
    if i == 0:
        ax = fig.add_subplot(gs[0, 0])
        ax.text(x=0.5, y=0.5, s="Bar Plot for \nCategorical Variables",
                horizontalalignment='center', verticalalignment='center',
                fontweight='bold', fontsize='24')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.tick_params(left=False, bottom=False)
        set_spines_visibility(ax, False)
    else:
        ax = fig.add_subplot(gs[(i) // 2, i % 2])
        sns.countplot(x=train[cat_meaningful_list[i - 1]], ax=ax)
        set_spines_visibility(ax, False)
        plt.xticks(rotation=0)
plt.show()

cat_meaningful_list.extend(['MonthlyCharges', 'mc_vs_avg'])

cat_meaningful_list

['month',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'PaymentMethod',
 'PaperlessBilling',
 'Contract',
 'Dependents',
 'Partner',
 'SeniorCitizen',
 'StreamingMovies',
 'PhoneService',
 'StreamingTV',
 'MultipleLines',
 'MonthlyCharges',
 'mc_vs_avg']

interval_columns_bin

['tenure',
 'binned_MonthlyCharges',
 'binned_TotalCharges',
 'binned_mc_vs_avg',
 'binned_avg_charge']

train_list = cat_meaningful_list.copy()
train_list.extend(interval_columns_bin)
train_list.remove('binned_TotalCharges')
# train_list.remove('binned_MonthlyCharges')
train_list.remove('binned_avg_charge')
train_list

['month',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'PaymentMethod',
 'PaperlessBilling',
 'Contract',
 'Dependents',
 'Partner',
 'SeniorCitizen',
 'StreamingMovies',
 'PhoneService',
 'StreamingTV',
 'MultipleLines',
 'MonthlyCharges',
 'mc_vs_avg',
 'tenure',
 'binned_MonthlyCharges',
 'binned_mc_vs_avg']

train = train[train_list]

multi_value_cat_volumns = []
for f in train.columns:
    if ((train[f].dtype == 'O' or train[f].dtype == 'category') and
            (train[f].nunique() <= 12) and (train[f].nunique() > 0)):
        multi_value_cat_volumns.append(f)
# print(multi_value_cat_volumns)

train = pd.get_dummies(train, columns=multi_value_cat_volumns)

train.columns

Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
       'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'Dependents_No', 'Dependents_Yes', 'Partner_No', 'Partner_Yes',
       'SeniorCitizen_No', 'SeniorCitizen_Yes', 'StreamingMovies_No',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'PhoneService_No', 'PhoneService_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes'],
      dtype='object')

drop_list = ['InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_No internet service',
             'OnlineBackup_No internet service', 'DeviceProtection_No internet service',
             'TechSupport_No internet service', 'StreamingMovies_No internet service', 'MultipleLines_No phone service',
             'StreamingTV_No internet service',
             'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Bank transfer (automatic)',
             'PaymentMethod_Mailed check',
             'PaperlessBilling_No', 'Contract_Two year', 'Dependents_Yes', 'Partner_Yes', 'SeniorCitizen_No',
             'PhoneService_No', 'month_7']
train.drop(columns=drop_list, inplace=True)

train

translation_dict = {}

for f in train.columns:
    if (train[f].dtype == 'O' or train[f].dtype == 'bool') and (train[f].nunique() == 2):
        translation_dict[f] = {train[f].unique()[0]: 0, train[f].unique()[1]: 1}
        train[f] = train[f].map({train[f].unique()[0]: 0, train[f].unique()[1]: 1})

translation_dict

{'month_1': {np.True_: 0, np.False_: 1},
 'month_2': {np.False_: 0, np.True_: 1},
 'month_3': {np.False_: 0, np.True_: 1},
 'month_4': {np.False_: 0, np.True_: 1},
 'month_5': {np.False_: 0, np.True_: 1},
 'month_6': {np.False_: 0, np.True_: 1},
 'month_8': {np.False_: 0, np.True_: 1},
 'month_9': {np.False_: 0, np.True_: 1},
 'month_10': {np.False_: 0, np.True_: 1},
 'month_11': {np.False_: 0, np.True_: 1},
 'month_12': {np.False_: 0, np.True_: 1},
 'InternetService_DSL': {np.True_: 0, np.False_: 1},
 'InternetService_Fiber optic': {np.False_: 0, np.True_: 1},
 'OnlineSecurity_No': {np.True_: 0, np.False_: 1},
 'OnlineSecurity_Yes': {np.False_: 0, np.True_: 1},
 'OnlineBackup_No': {np.False_: 0, np.True_: 1},
 'OnlineBackup_Yes': {np.True_: 0, np.False_: 1},
 'DeviceProtection_No': {np.True_: 0, np.False_: 1},
 'DeviceProtection_Yes': {np.False_: 0, np.True_: 1},
 'TechSupport_No': {np.True_: 0, np.False_: 1},
 'TechSupport_Yes': {np.False_: 0, np.True_: 1},
 'PaymentMethod_Electronic check': {np.True_: 0, np.False_: 1},
 'PaperlessBilling_Yes': {np.True_: 0, np.False_: 1},
 'Contract_Month-to-month': {np.True_: 0, np.False_: 1},
 'Contract_One year': {np.False_: 0, np.True_: 1},
 'Dependents_No': {np.True_: 0, np.False_: 1},
 'Partner_No': {np.False_: 0, np.True_: 1},
 'SeniorCitizen_Yes': {np.False_: 0, np.True_: 1},
 'StreamingMovies_No': {np.True_: 0, np.False_: 1},
 'StreamingMovies_Yes': {np.False_: 0, np.True_: 1},
 'PhoneService_Yes': {np.False_: 0, np.True_: 1},
 'StreamingTV_No': {np.True_: 0, np.False_: 1},
 'StreamingTV_Yes': {np.False_: 0, np.True_: 1},
 'MultipleLines_No': {np.False_: 0, np.True_: 1},
 'MultipleLines_Yes': {np.False_: 0, np.True_: 1}}

translation_dict_reverse = {}
for feat_key in translation_dict:
    translation_dict_reverse[feat_key] = {value: key for key, value in translation_dict[feat_key].items()}
translation_dict_reverse

{'month_1': {0: np.True_, 1: np.False_},
 'month_2': {0: np.False_, 1: np.True_},
 'month_3': {0: np.False_, 1: np.True_},
 'month_4': {0: np.False_, 1: np.True_},
 'month_5': {0: np.False_, 1: np.True_},
 'month_6': {0: np.False_, 1: np.True_},
 'month_8': {0: np.False_, 1: np.True_},
 'month_9': {0: np.False_, 1: np.True_},
 'month_10': {0: np.False_, 1: np.True_},
 'month_11': {0: np.False_, 1: np.True_},
 'month_12': {0: np.False_, 1: np.True_},
 'InternetService_DSL': {0: np.True_, 1: np.False_},
 'InternetService_Fiber optic': {0: np.False_, 1: np.True_},
 'OnlineSecurity_No': {0: np.True_, 1: np.False_},
 'OnlineSecurity_Yes': {0: np.False_, 1: np.True_},
 'OnlineBackup_No': {0: np.False_, 1: np.True_},
 'OnlineBackup_Yes': {0: np.True_, 1: np.False_},
 'DeviceProtection_No': {0: np.True_, 1: np.False_},
 'DeviceProtection_Yes': {0: np.False_, 1: np.True_},
 'TechSupport_No': {0: np.True_, 1: np.False_},
 'TechSupport_Yes': {0: np.False_, 1: np.True_},
 'PaymentMethod_Electronic check': {0: np.True_, 1: np.False_},
 'PaperlessBilling_Yes': {0: np.True_, 1: np.False_},
 'Contract_Month-to-month': {0: np.True_, 1: np.False_},
 'Contract_One year': {0: np.False_, 1: np.True_},
 'Dependents_No': {0: np.True_, 1: np.False_},
 'Partner_No': {0: np.False_, 1: np.True_},
 'SeniorCitizen_Yes': {0: np.False_, 1: np.True_},
 'StreamingMovies_No': {0: np.True_, 1: np.False_},
 'StreamingMovies_Yes': {0: np.False_, 1: np.True_},
 'PhoneService_Yes': {0: np.False_, 1: np.True_},
 'StreamingTV_No': {0: np.True_, 1: np.False_},
 'StreamingTV_Yes': {0: np.False_, 1: np.True_},
 'MultipleLines_No': {0: np.False_, 1: np.True_},
 'MultipleLines_Yes': {0: np.False_, 1: np.True_}}

for f in train.columns:
    # print(train[f].nunique())
    print(f, '\t\t', train[f].dtype, '\t\t', train[f].nunique(), '\t', end="")
    if train[f].unique().size <= 12:
        print(f"{train[f].unique()}")
    else:
        print(f"[]")

MonthlyCharges 		 float64 		 1584 	[]
mc_vs_avg 		 float64 		 6411 	[]
tenure 		 float64 		 72 	[]
binned_MonthlyCharges 		 category 		 72 	[]
binned_mc_vs_avg 		 category 		 62 	[]
month_1 		 int64 		 2 	[0 1]
month_2 		 int64 		 2 	[0 1]
month_3 		 int64 		 2 	[0 1]
month_4 		 int64 		 2 	[0 1]
month_5 		 int64 		 2 	[0 1]
month_6 		 int64 		 2 	[0 1]
month_8 		 int64 		 2 	[0 1]
month_9 		 int64 		 2 	[0 1]
month_10 		 int64 		 2 	[0 1]
month_11 		 int64 		 2 	[0 1]
month_12 		 int64 		 2 	[0 1]
InternetService_DSL 		 int64 		 2 	[0 1]
InternetService_Fiber optic 		 int64 		 2 	[0 1]
OnlineSecurity_No 		 int64 		 2 	[0 1]
OnlineSecurity_Yes 		 int64 		 2 	[0 1]
OnlineBackup_No 		 int64 		 2 	[0 1]
OnlineBackup_Yes 		 int64 		 2 	[0 1]
DeviceProtection_No 		 int64 		 2 	[0 1]
DeviceProtection_Yes 		 int64 		 2 	[0 1]
TechSupport_No 		 int64 		 2 	[0 1]
TechSupport_Yes 		 int64 		 2 	[0 1]
PaymentMethod_Electronic check 		 int64 		 2 	[0 1]
PaperlessBilling_Yes 		 int64 		 2 	[0 1]
Contract_Month-to-month 		 int64 		 2 	[0 1]
Contract_One year 		 int64 		 2 	[0 1]
Dependents_No 		 int64 		 2 	[0 1]
Partner_No 		 int64 		 2 	[0 1]
SeniorCitizen_Yes 		 int64 		 2 	[0 1]
StreamingMovies_No 		 int64 		 2 	[0 1]
StreamingMovies_Yes 		 int64 		 2 	[0 1]
PhoneService_Yes 		 int64 		 2 	[0 1]
StreamingTV_No 		 int64 		 2 	[0 1]
StreamingTV_Yes 		 int64 		 2 	[0 1]
MultipleLines_No 		 int64 		 2 	[0 1]
MultipleLines_Yes 		 int64 		 2 	[0 1]

# object_columns = train.select_dtypes(include=['object']).columns
# len(object_columns)

dup_list = train.duplicated()
dup_list.sum()

np.int64(59)

train_y = train.join(y)

total_dup_list = train_y.duplicated()
total_dup_list.sum()

np.int64(45)

conflict_list = dup_list & ~total_dup_list
conflict_list.sum()

np.int64(14)

# train_y.drop(train[conflict_list].index, inplace=True)

y = train_y['Churn']
train = train_y.drop(columns=['Churn'])
train

service_count_list = ['OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes',
                      'StreamingMovies_Yes', 'MultipleLines_Yes', 'InternetService_DSL',
                      'InternetService_Fiber optic']
train['serviceCount'] = train[service_count_list].sum(axis=1)
train

train['Churn'] = y

for f in ['serviceCount']:
    plt.figure()
    fig, ax = plt.subplots(figsize=(20, 5))
    # Calculate the percentage of target=1 per category value

    cat_perc = train[[f, 'Churn']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by='Churn', ascending=False, inplace=True)
    # Bar plot
    # Order the bars descending on target mean
    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% Churn', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

<Figure size 640x480 with 0 Axes>

train['serviceCount'] = train['serviceCount'].astype('category')

print(train.columns)

Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
       'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'InternetService_DSL', 'InternetService_Fiber optic',
       'OnlineSecurity_No', 'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_Yes', 'PaymentMethod_Electronic check',
       'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
       'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
       'MultipleLines_Yes', 'serviceCount', 'Churn'],
      dtype='object')

sns.scatterplot(x='serviceCount', y='MonthlyCharges', hue='Churn', data=train)

<Axes: xlabel='serviceCount', ylabel='MonthlyCharges'>

train

y = train['Churn']
train.drop('Churn', inplace=True, axis=1)
train

for col in ['binned_mc_vs_avg']:
    train[col] = train[col].apply(lambda x: x.mid)  # Convert intervals to their midpoint

train.head()

for col in train.columns:
    if train[col].dtype != 'float64':
        train[col] = train[col].astype('category')

train.columns

Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
       'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'InternetService_DSL', 'InternetService_Fiber optic',
       'OnlineSecurity_No', 'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_Yes', 'PaymentMethod_Electronic check',
       'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
       'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
       'MultipleLines_Yes', 'serviceCount'],
      dtype='object')

train.drop(['binned_MonthlyCharges', 'MonthlyCharges', 'mc_vs_avg', 'serviceCount'], inplace=True, axis=1)

y = y.astype('int')

train_y = train.copy()
train_y.columns

Index(['tenure', 'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_8', 'month_9', 'month_10',
       'month_11', 'month_12', 'InternetService_DSL',
       'InternetService_Fiber optic', 'OnlineSecurity_No',
       'OnlineSecurity_Yes', 'OnlineBackup_No', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_Yes', 'PaymentMethod_Electronic check',
       'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
       'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
       'MultipleLines_Yes'],
      dtype='object')

# train_y.drop(['avg_charge', 'TotalCharges', 'MonthlyCharges', 'mc_vs_avg', 'tenure'], inplace=True, axis=1)
# dup_list = train_y.duplicated()
# print(dup_list.sum())
# train_y = train.join(y).copy()
# total_dup_list = train_y.duplicated()
# print(total_dup_list.sum())
# conflict_list = dup_list & ~total_dup_list
# print(conflict_list.sum())
# train_y.drop(train[conflict_list].index, inplace=True)
# y = train_y['Churn']
# train = train_y.drop(columns=['Churn'])

train.shape
y.value_counts()

Churn
0    5152
1    1859
Name: count, dtype: int64

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42, stratify=y)
feat_labels = X_train.columns

# smote = SMOTE(random_state=42)
smote = SMOTE(random_state=0)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

D:\PythonProjects\class2_venv2\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
[WinError 2] The system cannot find the file specified
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
  warnings.warn(
  File "D:\PythonProjects\class2_venv2\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 550, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1540, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# from imblearn.over_sampling import ADASYN
#
# adasyn = ADASYN(sampling_strategy=0.75, n_neighbors=4)
# adasyn = ADASYN()
# X_train_sm, y_train_sm = adasyn.fit_resample(X_train, y_train)

# X_train_sm = X_train
# y_train_sm = y_train

y_train_sm.value_counts() / len(y_train_sm)

Churn
0    0.5
1    0.5
Name: count, dtype: float64

model = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, min_samples_leaf=10)
model.fit(X_train_sm, y_train_sm)

importances = model.feature_importances_
indices = np.argsort(model.feature_importances_)[::-1]

print(np.cumsum(importances[indices]))
print(importances[indices])
print(feat_labels[indices])

[0.25885911 0.38850738 0.4914927  0.56114405 0.62522077 0.68775492
 0.73601749 0.76147469 0.78623121 0.81033346 0.8334346  0.85397203
 0.86741355 0.88053893 0.89283672 0.90487134 0.91686185 0.92801217
 0.93586798 0.94285803 0.94898147 0.95469178 0.96039404 0.96545328
 0.97017336 0.97436966 0.97761533 0.98046968 0.98319377 0.98582835
 0.98840161 0.99094224 0.99292451 0.99482136 0.99670612 0.99841269
 1.        ]
[0.25885911 0.12964827 0.10298532 0.06965134 0.06407672 0.06253415
 0.04826258 0.0254572  0.02475651 0.02410225 0.02310115 0.02053742
 0.01344152 0.01312538 0.01229779 0.01203462 0.01199051 0.01115032
 0.0078558  0.00699006 0.00612343 0.00571031 0.00570226 0.00505924
 0.00472009 0.0041963  0.00324567 0.00285435 0.00272409 0.00263458
 0.00257325 0.00254064 0.00198227 0.00189684 0.00188477 0.00170657
 0.00158731]
Index(['binned_mc_vs_avg', 'Contract_Month-to-month', 'tenure',
       'PaymentMethod_Electronic check', 'OnlineSecurity_No', 'TechSupport_No',
       'InternetService_Fiber optic', 'month_1', 'PaperlessBilling_Yes',
       'OnlineBackup_No', 'Partner_No', 'Dependents_No', 'DeviceProtection_No',
       'SeniorCitizen_Yes', 'InternetService_DSL', 'OnlineSecurity_Yes',
       'Contract_One year', 'TechSupport_Yes', 'MultipleLines_Yes',
       'MultipleLines_No', 'OnlineBackup_Yes', 'StreamingTV_Yes',
       'StreamingMovies_Yes', 'StreamingTV_No', 'StreamingMovies_No',
       'DeviceProtection_Yes', 'month_6', 'month_3', 'month_2', 'month_10',
       'month_5', 'month_12', 'month_8', 'PhoneService_Yes', 'month_9',
       'month_11', 'month_4'],
      dtype='object')

y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# F1-Score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")

Accuracy: 0.7954383464005702
Precision: 0.6164383561643836
Recall: 0.6048387096774194
F1-Score: 0.6105834464043419

y_train_sm.value_counts()

Churn
0    4121
1    4121
Name: count, dtype: int64

from sklearn.metrics import make_scorer, f1_score

# Define the random forest model
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

# Define the hyperparameter grid
param_distributions = {
    'n_estimators': [i for i in range(300, 1100, 100)],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 120],  # Maximum depth of trees
    'min_samples_split': [i for i in range(2, 11)],  # Minimum samples required to split an internal node
    'min_samples_leaf': [i for i in range(1, 11)],  # Minimum samples required in a leaf node
    'max_features': ['sqrt', 'log2', None]  # Number of features to consider for the best split
}

f1_class_1_scorer = make_scorer(f1_score, pos_label=1)

# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings to sample
    scoring=f1_class_1_scorer,  # Score based on F1-Score
    n_jobs=-1,  # Use all processors
    cv=5,  # 5-fold cross-validation
    verbose=2,  # Print status messages during search
    random_state=0
)

# Fit the randomized search model
random_search.fit(X_train_sm, y_train_sm)

# Output the best parameters and score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best F1-Score: {random_search.best_score_}")

# Evaluate on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))  # Detailed performance metrics

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 600, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 10}
Best F1-Score: 0.8169772588421169
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1031
           1       0.63      0.63      0.63       372

    accuracy                           0.80      1403
   macro avg       0.75      0.75      0.75      1403
weighted avg       0.80      0.80      0.80      1403

import pickle

importances = best_model.feature_importances_
indices = np.argsort(model.feature_importances_)[::-1]
print(feat_labels[indices])

# Save the model to a file
with open(os.path.join(workdir, 'churn_mcvsavg_month.pkl'), 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully!")

Index(['binned_mc_vs_avg', 'Contract_Month-to-month', 'tenure',
       'PaymentMethod_Electronic check', 'OnlineSecurity_No', 'TechSupport_No',
       'InternetService_Fiber optic', 'month_1', 'PaperlessBilling_Yes',
       'OnlineBackup_No', 'Partner_No', 'Dependents_No', 'DeviceProtection_No',
       'SeniorCitizen_Yes', 'InternetService_DSL', 'OnlineSecurity_Yes',
       'Contract_One year', 'TechSupport_Yes', 'MultipleLines_Yes',
       'MultipleLines_No', 'OnlineBackup_Yes', 'StreamingTV_Yes',
       'StreamingMovies_Yes', 'StreamingTV_No', 'StreamingMovies_No',
       'DeviceProtection_Yes', 'month_6', 'month_3', 'month_2', 'month_10',
       'month_5', 'month_12', 'month_8', 'PhoneService_Yes', 'month_9',
       'month_11', 'month_4'],
      dtype='object')
Model saved successfully!

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
smote = SMOTE(random_state=0)
X_train_sm, y_train_sm = smote.fit_resample(train, y)
scores = cross_val_score(best_model, X_train_sm, y_train_sm, cv=kf, scoring='f1')
average_f1 = np.mean(scores)
print(f"F1 Score for each fold: {[round(score, 4) for score in scores]}")
print(f"Average F1 across {k} folds: {average_f1:.2f}")

F1 Score for each fold: [np.float64(0.8468), np.float64(0.8539), np.float64(0.8682), np.float64(0.8573), np.float64(0.8523)]
Average F1 across 5 folds: 0.86

	SeniorCitizen	tenure	MonthlyCharges
count	7025.000000	7025.000000	7025.000000
mean	0.162562	32.451530	64.851082
std	0.368992	24.539465	30.061207
min	0.000000	0.000000	18.250000
25%	0.000000	9.000000	35.750000
50%	0.000000	29.000000	70.400000
75%	0.000000	55.000000	89.900000
max	1.000000	72.000000	118.750000

	tenure	MonthlyCharges	TotalCharges	avg_charge	mc_vs_avg
0	1	29.85	29.85	29.850000	1.000000
1	34	56.95	1889.50	55.573529	1.024768
2	2	53.85	108.15	54.075000	0.995839
3	45	42.30	1840.75	40.905556	1.034089
4	2	70.70	151.65	75.825000	0.932410
...	...	...	...	...	...
7038	24	84.80	1990.50	82.937500	1.022457
7039	72	103.20	7362.90	102.262500	1.009168
7040	11	29.60	346.45	31.495455	0.939818
7041	4	74.40	306.60	76.650000	0.970646
7042	66	105.65	6844.50	103.704545	1.018760

	tenure	MonthlyCharges	TotalCharges	Churn	avg_charge	mc_vs_avg
count	7014.000000	7014.000000	7014.000000	7014.000000	7014.000000	7014.000000
mean	32.502424	64.887831	2289.083397	0.265041	64.889050	1.002317
std	24.524997	30.056965	2266.797933	0.441387	30.157231	0.051354
min	1.000000	18.250000	18.800000	0.000000	13.775000	0.635545
25%	9.000000	35.750000	406.275000	0.000000	36.505357	0.980691
50%	29.000000	70.400000	1401.825000	0.000000	70.453125	1.000000
75%	56.000000	89.900000	3806.612500	1.000000	90.227604	1.020930
max	72.000000	118.750000	8684.800000	1.000000	121.400000	1.450628

	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	Male	0	No	No	2	Yes	No	DSL	Yes	Yes	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	Male	0	No	No	45	No	No phone service	DSL	Yes	No	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	Female	0	No	No	2	Yes	No	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7038	Male	0	Yes	Yes	24	Yes	Yes	DSL	Yes	No	Yes	Yes	Yes	Yes	One year	Yes	Mailed check	84.80	1990.5	No
7039	Female	0	Yes	Yes	72	Yes	Yes	Fiber optic	No	Yes	Yes	No	Yes	Yes	One year	Yes	Credit card (automatic)	103.20	7362.9	No
7040	Female	0	Yes	Yes	11	No	No phone service	DSL	Yes	No	No	No	No	No	Month-to-month	Yes	Electronic check	29.60	346.45	No
7041	Male	1	Yes	No	4	Yes	Yes	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Mailed check	74.40	306.6	Yes
7042	Male	0	No	No	66	Yes	No	Fiber optic	Yes	No	Yes	Yes	Yes	Yes	Two year	Yes	Bank transfer (automatic)	105.65	6844.5	No

	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	...	MonthlyCharges	TotalCharges	Churn	month	avg_charge	mc_vs_avg	binned_MonthlyCharges	binned_TotalCharges	binned_mc_vs_avg	binned_avg_charge
0	Female	No	Yes	No	-1.284594	No	No phone service	DSL	No	Yes	...	-1.165797	-0.996734	0	1	-1.161962	-0.045126	(-1.185, -1.139]	(-1.002, -0.949]	(-0.185, 0.0329]	(-1.206, -1.157]
1	Male	No	No	No	0.061068	Yes	No	DSL	Yes	No	...	-0.264112	-0.176289	0	10	-0.308920	0.437219	(-0.269, -0.223]	(-0.216, -0.164]	(0.25, 0.468]	(-0.326, -0.277]
2	Male	No	No	No	-1.243817	Yes	No	DSL	Yes	Yes	...	-0.367257	-0.962189	1	2	-0.358615	-0.126156	(-0.407, -0.361]	(-1.002, -0.949]	(-0.185, 0.0329]	(-0.375, -0.326]
3	Male	No	No	No	0.509622	No	No phone service	DSL	Yes	No	...	-0.751554	-0.197797	0	9	-0.795338	0.618735	(-0.773, -0.727]	(-0.216, -0.164]	(0.468, 0.685]	(-0.815, -0.766]
4	Female	No	No	No	-1.243817	Yes	No	Fiber optic	No	No	...	0.193386	-0.942998	1	2	0.362657	-1.361380	(0.189, 0.235]	(-0.949, -0.897]	(-1.489, -1.272]	(0.358, 0.407]
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7038	Male	No	Yes	Yes	-0.346709	Yes	Yes	DSL	Yes	No	...	0.662528	-0.131730	0	12	0.598521	0.392199	(0.647, 0.693]	(-0.164, -0.111]	(0.25, 0.468]	(0.554, 0.603]
7039	Female	No	Yes	Yes	1.610618	Yes	Yes	Fiber optic	No	Yes	...	1.274743	2.238478	0	12	1.239375	0.133405	(1.242, 1.288]	(2.193, 2.246]	(0.0329, 0.25]	(1.238, 1.287]
7040	Female	No	Yes	Yes	-0.876818	No	No phone service	DSL	Yes	No	...	-1.174115	-0.857056	0	11	-1.107395	-1.217116	(-1.185, -1.139]	(-0.897, -0.844]	(-1.272, -1.054]	(-1.108, -1.059]
7041	Male	Yes	Yes	No	-1.162261	Yes	Yes	Fiber optic	No	No	...	0.316494	-0.874637	1	4	0.390016	-0.616774	(0.281, 0.326]	(-0.897, -0.844]	(-0.619, -0.402]	(0.358, 0.407]
7042	Male	No	No	No	1.365952	Yes	No	Fiber optic	Yes	No	...	1.356261	2.009770	0	6	1.287196	0.320201	(1.334, 1.38]	(1.984, 2.036]	(0.25, 0.468]	(1.238, 1.287]

Data Preprocessing¶

Purpose: Load and examine the dataset¶

Load data and drop CustomerID¶

If drop these 3 columns ['tenure','MonthlyCharges','TotalCharges'], 2598 lines are conflict¶

Do these 3 columns have meaning?¶

drop conflict rows¶

Examine some 3-4 values variables¶

Purpose- Handle possible missing values and incorrect data types¶

Overview the features value count¶

Check null fields existence¶

The tenure unit is month¶

A telecom customer tenure can be zero, meaning a customer has just signed up and has not yet had any service time with the company; essentially representing a brand new customer with a fresh account.¶

Drop 11 rows with TotalCharges is Na¶

Action: The column TotalCharges is converted to float.¶

Transform categorical variables appropriately¶

Convert SeniorCitizen to object and 1/0¶

Explore relation with tenure¶

TotalCharge and MonthlyCharge¶

Encode Churn into 1/0¶

The monthly charges correlate to total charges devided by tenure but not the same¶

The mc_vs_avg give the ratio that the last month's charge vs total mean month charge¶

Target Encoding¶

Check target imbalance.¶

Action- Drop customerID ;drop Churn to y as target¶

Create normalized versions of relavant variables¶

interval columns analysis¶

Conclusion: Monthly Charges does not change too much. I guess it mainly related to inflation.¶

normalize the numeric_columns¶

Check numeric variables correlation with target¶

Target Encoding for numeric variables¶

If the last month charges is too high or too low compared to average monthly charges, The customer tend to Churn.¶

Conclusion:¶

Purpose-Develop meaningful feature engineering(e.g., tenure groups, service counts)¶

Conclusion:¶

Exploratory Data Analysis¶

Purpose- Perform univariate analysis on key variables¶

Chi-Square test for categorical features correlation¶

tenure¶

month¶

binned_MonthlyCharges¶

binned_mc_vs_avg¶

Categorical Count Bar Plot¶

Analyze relationships between features and churn¶

Create informative visualizations with proper labeling and titles¶

Conduct correlation analysis to identify significant patterns¶

Conclusion-This is done in Data Preprocessing¶

Model and Evaluation¶

Feature Selection¶

One-hot encoding the 3-4 values variables¶

The dictionary for category translation¶

If need to convert two value category to binary 0/1 column¶

reverse dict for translating from int to object¶

Convert group to midpoint¶

I think it's not proper to drop these rows. Because the conflic is caused by data preprocessing.¶

encode the service count to represent number of service¶

service count vs monthly charges¶

prepare the train set and target¶

binned_MonthlyCharges is useless or even worse. Drop it.¶

Implement a Random Forest classification model¶

Evaluate the model metrics¶

random search hyper parameters for the random forest classifier¶

Cross Validation¶

Adding features¶

Adding feature¶