import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import chi2_contingency
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
%matplotlib inline
os.chdir(r'D:\MyDrive2\pythonprojects\class2\DataMining\homework\ChunrAnalysis')
workdir = os.getcwd()
data_dir = os.path.join(workdir, 'data')
Data Preprocessing¶
Purpose: Load and examine the dataset¶
Load data and drop CustomerID¶
train = pd.read_csv(os.path.join(data_dir, 'Telco-Customer-Churn.csv'))
train.drop('customerID', axis=1, inplace=True)
train0 = train.copy()
train
| | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
| 7039 | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
| 7040 | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
| 7042 | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 20 columns
# Exploratory duplicate check; the result is discarded below and train is restored from train0
y = train['Churn']
train.drop(['Churn'], inplace=True, axis=1)
# Duplicates when the target and the charge/tenure columns are ignored
train_y = train.copy()
train_y.drop(['TotalCharges', 'MonthlyCharges', 'tenure'], inplace=True, axis=1)
dup_list = train_y.duplicated()
print(f"{dup_list.sum()} rows are duplicated when the target and the charge/tenure columns are ignored")
# Duplicates when all features and the target are considered
train_y = train.join(y).copy()
total_dup_list = train_y.duplicated()
print(f"{total_dup_list.sum()} rows are duplicated when all features and the target are considered")
conflict_list = dup_list & ~total_dup_list
print(f"{conflict_list.sum()} rows are in conflict")
train_y.drop(train[conflict_list].index, inplace=True)
y = train_y['Churn']
train = train_y.drop(columns=['Churn'])
# Restore the untouched data; the proper duplicate handling is done in the next cell
train = train0.copy()
2620 rows are duplicated when the target and the charge/tenure columns are ignored
22 rows are duplicated when all features and the target are considered
2598 rows are in conflict
Drop the conflicting rows¶
print(len(train[train.duplicated()]))
print(len(train[train.drop(['Churn'], axis=1).duplicated()]))
total_dup_list = train.duplicated()
feat_dup_list = train.drop(['Churn'], axis=1).duplicated()
conflict_list = feat_dup_list & ~total_dup_list
conflict_list.sum()
22
40
np.int64(18)
len(train[conflict_list].index)
18
train.drop(train[conflict_list].index, inplace=True)
train
| | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | No | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
| 7039 | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | Yes | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
| 7040 | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | No | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
| 7041 | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
| 7042 | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | No | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7025 rows × 20 columns
Examine some variables with 3-4 distinct values¶
train['Contract'].value_counts()
Contract
Month-to-month    3857
Two year          1695
One year          1473
Name: count, dtype: int64
train['PaymentMethod'].value_counts()
PaymentMethod
Electronic check             2363
Mailed check                 1596
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64
Purpose: Handle possible missing values and incorrect data types¶
Overview the features value count¶
train.nunique()
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64
train.describe()
| | SeniorCitizen | tenure | MonthlyCharges |
|---|---|---|---|
| count | 7025.000000 | 7025.000000 | 7025.000000 |
| mean | 0.162562 | 32.451530 | 64.851082 |
| std | 0.368992 | 24.539465 | 30.061207 |
| min | 0.000000 | 0.000000 | 18.250000 |
| 25% | 0.000000 | 9.000000 | 35.750000 |
| 50% | 0.000000 | 29.000000 | 70.400000 |
| 75% | 0.000000 | 55.000000 | 89.900000 |
| max | 1.000000 | 72.000000 | 118.750000 |
Check null fields existence¶
train.isnull().sum().sum()
np.int64(0)
plt.figure(figsize=(20, 5))
plt.hist(train['tenure'], bins=100);
train['tenure'].value_counts().sort_index()
tenure
0 11
1 595
2 238
3 200
4 176
...
68 100
69 95
70 119
71 170
72 362
Name: count, Length: 73, dtype: int64
The tenure unit is months¶
A telecom customer's tenure can be zero, meaning the customer has just signed up and has not yet accrued any service time with the company; essentially a brand-new account.¶
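A quick sanity check (a sketch, run at this point in the notebook, i.e. before any rows are dropped for TotalCharges): the tenure == 0 rows should be exactly the ones whose TotalCharges is blank.
# Sketch: confirm that the tenure == 0 rows are the ones with a non-numeric (blank) TotalCharges
zero_tenure = train[train['tenure'] == 0]
print(len(zero_tenure))  # expected: 11
print(pd.to_numeric(zero_tenure['TotalCharges'], errors='coerce').isna().all())  # expected: True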
plt.bar(train['SeniorCitizen'].value_counts().index, train['SeniorCitizen'].value_counts())
<BarContainer object of 2 artists>
plt.hist(train['MonthlyCharges'], bins=50);
Drop the 11 rows where TotalCharges is blank (NaN after conversion)¶
train['TotalCharges_as_float'] = pd.to_numeric(train['TotalCharges'], errors='coerce')
non_float_entries = train[train['TotalCharges_as_float'].isna()]['TotalCharges']
non_float_entries
488     
753     
936     
1082    
1340    
3331    
3826    
4380    
5218    
6670    
6754    
Name: TotalCharges, dtype: object
train.dropna(inplace=True)
train.drop('TotalCharges_as_float', axis=1, inplace=True)
train['TotalCharges'] = train['TotalCharges'].astype(float)
Action: 11 rows are removed because TotalCharges is blank.
Action: The TotalCharges column is converted to float.¶
Transform categorical variables appropriately¶
for f in train.columns:
    print(f, '\t\t', train[f].dtype, '\t\t', train[f].nunique(), '\t', end="")
    if train[f].unique().size <= 4:
        print(f"{train[f].unique()}")
    else:
        print(f"[]")
gender              object     2     ['Female' 'Male']
SeniorCitizen       int64      2     [0 1]
Partner             object     2     ['Yes' 'No']
Dependents          object     2     ['No' 'Yes']
tenure              int64      72    []
PhoneService        object     2     ['No' 'Yes']
MultipleLines       object     3     ['No phone service' 'No' 'Yes']
InternetService     object     3     ['DSL' 'Fiber optic' 'No']
OnlineSecurity      object     3     ['No' 'Yes' 'No internet service']
OnlineBackup        object     3     ['Yes' 'No' 'No internet service']
DeviceProtection    object     3     ['No' 'Yes' 'No internet service']
TechSupport         object     3     ['No' 'Yes' 'No internet service']
StreamingTV         object     3     ['No' 'Yes' 'No internet service']
StreamingMovies     object     3     ['No' 'Yes' 'No internet service']
Contract            object     3     ['Month-to-month' 'One year' 'Two year']
PaperlessBilling    object     2     ['Yes' 'No']
PaymentMethod       object     4     ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
MonthlyCharges      float64    1584  []
TotalCharges        float64    6530  []
Churn               object     2     ['No' 'Yes']
Convert SeniorCitizen from 0/1 to a Yes/No object column¶
# Convert SeniorCitizen from 0/1 integers to Yes/No strings (object dtype)
train['SeniorCitizen'] = train['SeniorCitizen'].astype('O')
train['SeniorCitizen'] = train['SeniorCitizen'].apply(lambda x: 'Yes' if x == 1 else 'No')
Explore the relation with tenure¶
Add a feature: month (the position of tenure within a 12-month cycle; 0 is mapped to 12)
train['month'] = train['tenure'] % 12
train['month'] = train['month'].apply(lambda x: 12 if x == 0 else x)
train['month'] = train['month'].astype('category')
TotalCharges and MonthlyCharges¶
train['avg_charge'] = train['TotalCharges'] / train['tenure']
df = train[['tenure', 'avg_charge', 'MonthlyCharges', 'TotalCharges']].copy()  # .copy() so adding Churn below does not warn
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', cbar=False)
plt.yticks(rotation=0)
(array([0.5, 1.5, 2.5, 3.5]), [Text(0, 0.5, 'tenure'), Text(0, 1.5, 'avg_charge'), Text(0, 2.5, 'MonthlyCharges'), Text(0, 3.5, 'TotalCharges')])
# plt.scatter(x='MonthlyCharges', y='avg_charge', data=df)
df['Churn'] = train['Churn']
sns.scatterplot(x='MonthlyCharges', y='avg_charge', data=df, hue='Churn')
plt.xlabel('Monthly Charges')
plt.ylabel('Average Charge')
plt.title('Monthly Charges vs Average Charge')
Text(0.5, 1.0, 'Monthly Charges vs Average Charge')
Encode Churn into 1/0¶
train['Churn'] = train['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
train['Churn'] = train['Churn'].astype('int')
MonthlyCharges tracks TotalCharges divided by tenure, but they are not identical¶
I read the MonthlyCharges column as the charge for the most recent month, i.e. the customer's last billing state before churning.
train['mc_vs_avg'] = train['MonthlyCharges'] / train['avg_charge']
plt.hist(train['mc_vs_avg'], bins=100);  # trailing ; suppresses the long return value
mc_vs_avg is the ratio of the last month's charge to the customer's average monthly charge¶
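As a worked example using row 1 of the earlier table: TotalCharges 1889.5 over 34 months gives an average of about 55.57, and MonthlyCharges 56.95 divided by that average is about 1.025, which matches the avg_charge and mc_vs_avg values shown later for that row.
# Worked example for row index 1 (raw values taken from the table above)
avg_charge_row1 = 1889.5 / 34              # ≈ 55.57
mc_vs_avg_row1 = 56.95 / avg_charge_row1   # ≈ 1.025
print(avg_charge_row1, mc_vs_avg_row1)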
train.columns
Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
'MonthlyCharges', 'TotalCharges', 'Churn', 'month', 'avg_charge',
'mc_vs_avg'],
dtype='object')
Target Encoding¶
for f in train.columns:
    if train[f].nunique() > 12:
        continue
    if f == 'Churn':
        continue
    fig, ax = plt.subplots(figsize=(20, 5))
    # Calculate the percentage of target=1 per category value
    cat_perc = train[[f, 'Churn']].groupby([f], as_index=False, observed=False).mean()
    # Order the bars descending on the target mean
    cat_perc.sort_values(by='Churn', ascending=False, inplace=True)
    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% Churn', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
Check target imbalance.¶
The imbalance is moderate: churned to existing is about 0.36 (1859 / 5155).
train['Churn'].value_counts()
Churn
0    5155
1    1859
Name: count, dtype: int64
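SMOTE is imported at the top of the notebook but is not applied in this section; with a ratio of roughly 0.36 that seems reasonable. For reference, a minimal sketch of how oversampling could be wired in later, once the features are fully numeric (X_tr/y_tr are hypothetical names for a training split):
# Sketch only, not executed here:
# X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# X_res, y_res = SMOTE(random_state=42).fit_resample(X_tr, y_tr)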
Should You Use One-Hot Encoding for Binary Variables? Whether to use one-hot encoding depends on the machine learning algorithm you're working with:
- When NOT to Use One-Hot Encoding (Binary Encoding is Enough):
- Tree-Based Models: Algorithms like Decision Trees, Random Forests, and Gradient Boosted Trees (e.g., XGBoost, LightGBM) do not require one-hot encoding. These models handle numerical and categorical features well, and binary encoding provides the same information as one-hot encoding.
- Dimensionality Reduction: One-hot encoding increases the dataset's dimensionality. For binary features, one-hot encoding is redundant because it adds an extra column that carries no additional information (the second column is the inverse of the first).
- When to Consider One-Hot Encoding:
- When Using Linear Models: Algorithms like Logistic Regression and Linear Regression might benefit from one-hot encoding, depending on how the data is structured. These models assume linear relationships, and one-hot encoding can prevent issues where a single numeric column (e.g., 0 or 1) might suggest an ordinal relationship.
- For Consistency: If you are consistently one-hot encoding categorical features and want to treat all of them uniformly, you might also encode binary variables using one-hot.
Key Takeaways:
- Binary Encoding (0/1) works well for binary variables, especially with tree-based models, as they can handle categorical features directly.
- One-Hot Encoding is generally unnecessary for binary features but can be used with linear models or when consistency across all categorical features is desired.
- Efficiency Matters: Skipping one-hot encoding for binary variables reduces the dimensionality of your dataset and simplifies data preparation.
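As a small self-contained illustration of the difference (the demo frame below is made up for this example):
# Binary (0/1) encoding: a single column, sufficient for tree-based models
demo = pd.DataFrame({'Partner': ['Yes', 'No', 'No', 'Yes']})
demo['Partner_bin'] = demo['Partner'].map({'Yes': 1, 'No': 0})
# One-hot encoding: drop_first=True avoids the redundant inverse column
demo_ohe = pd.get_dummies(demo[['Partner']], drop_first=True)
print(demo)
print(demo_ohe)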
Action: drop customerID; split Churn off into y as the target¶
Create normalized versions of relevant variables¶
Interval columns analysis¶
numeric_columns = []
for f in train.columns:
    if train[f].nunique() > 12:
        numeric_columns.append(f)
numeric_columns
['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_charge', 'mc_vs_avg']
train[numeric_columns]
| | tenure | MonthlyCharges | TotalCharges | avg_charge | mc_vs_avg |
|---|---|---|---|---|---|
| 0 | 1 | 29.85 | 29.85 | 29.850000 | 1.000000 |
| 1 | 34 | 56.95 | 1889.50 | 55.573529 | 1.024768 |
| 2 | 2 | 53.85 | 108.15 | 54.075000 | 0.995839 |
| 3 | 45 | 42.30 | 1840.75 | 40.905556 | 1.034089 |
| 4 | 2 | 70.70 | 151.65 | 75.825000 | 0.932410 |
| ... | ... | ... | ... | ... | ... |
| 7038 | 24 | 84.80 | 1990.50 | 82.937500 | 1.022457 |
| 7039 | 72 | 103.20 | 7362.90 | 102.262500 | 1.009168 |
| 7040 | 11 | 29.60 | 346.45 | 31.495455 | 0.939818 |
| 7041 | 4 | 74.40 | 306.60 | 76.650000 | 0.970646 |
| 7042 | 66 | 105.65 | 6844.50 | 103.704545 | 1.018760 |
7014 rows × 5 columns
sns.pairplot(train[numeric_columns])
<seaborn.axisgrid.PairGrid at 0x1cd11743200>
numeric_columns
['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_charge', 'mc_vs_avg']
sns.heatmap(train[numeric_columns].corr(), annot=True)
<Axes: >
plt.figure(figsize=(30, 5))
sns.boxplot(x=train['tenure'], y=train['MonthlyCharges'])
<Axes: xlabel='tenure', ylabel='MonthlyCharges'>
Conclusion: MonthlyCharges does not vary much across tenure; I suspect the drift is mainly related to inflation.¶
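One way to put a number on that claim (a sketch using the current train frame, before scaling): look at how the mean MonthlyCharges drifts across tenure values.
# Mean MonthlyCharges for each tenure value; describe() summarizes how much it drifts
mc_by_tenure = train.groupby('tenure')['MonthlyCharges'].mean()
print(mc_by_tenure.describe())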
train.describe()
| | tenure | MonthlyCharges | TotalCharges | Churn | avg_charge | mc_vs_avg |
|---|---|---|---|---|---|---|
| count | 7014.000000 | 7014.000000 | 7014.000000 | 7014.000000 | 7014.000000 | 7014.000000 |
| mean | 32.502424 | 64.887831 | 2289.083397 | 0.265041 | 64.889050 | 1.002317 |
| std | 24.524997 | 30.056965 | 2266.797933 | 0.441387 | 30.157231 | 0.051354 |
| min | 1.000000 | 18.250000 | 18.800000 | 0.000000 | 13.775000 | 0.635545 |
| 25% | 9.000000 | 35.750000 | 406.275000 | 0.000000 | 36.505357 | 0.980691 |
| 50% | 29.000000 | 70.400000 | 1401.825000 | 0.000000 | 70.453125 | 1.000000 |
| 75% | 56.000000 | 89.900000 | 3806.612500 | 1.000000 | 90.227604 | 1.020930 |
| max | 72.000000 | 118.750000 | 8684.800000 | 1.000000 | 121.400000 | 1.450628 |
Normalize the numeric columns¶
scaler = StandardScaler()
df_std = train[numeric_columns].copy()
df_std[numeric_columns] = scaler.fit_transform(df_std)
train['Churn']
0 0
1 0
2 1
3 0
4 1
..
7038 0
7039 0
7040 0
7041 1
7042 0
Name: Churn, Length: 7014, dtype: int64
df_std['target'] = train['Churn'].astype('O')
df = df_std
sns.pairplot(df, hue='target')
<seaborn.axisgrid.PairGrid at 0x1cd0cbf2960>
df = df_std[df_std['target'] == 0].copy()
df.drop('target', axis=1, inplace=True)
pairplot = sns.pairplot(df)
pairplot.fig.suptitle('Existing Customer', y=1.02)
Text(0.5, 1.02, 'Existing Customer')
df = df_std[df_std['target'] == 1].copy()
df.drop('target', axis=1, inplace=True)
pairplot = sns.pairplot(df)
pairplot.fig.suptitle('Churned Customer', y=1.02)
Text(0.5, 1.02, 'Churned Customer')
sns.heatmap(train[numeric_columns].corr(), annot=True)
<Axes: >
scaler = StandardScaler()
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
Check numeric variables correlation with target¶
y = train['Churn']
correlation_with_target = train[numeric_columns].join(y).corr()
correlation_with_target['Churn']
tenure           -0.353405
MonthlyCharges    0.195623
TotalCharges     -0.198494
avg_charge        0.194784
mc_vs_avg         0.011741
Churn             1.000000
Name: Churn, dtype: float64
plt.figure(figsize=(5, 8))
sns.heatmap(correlation_with_target[['Churn']], annot=True, cmap='coolwarm', cbar=True)
<Axes: >
Target Encoding for numeric variables¶
# Bin edges: np.arange from min to max with step (max - min) / 73, as in the original cell;
# values that fall outside the edges (including the maximum itself) become NaN
def bin_edges(s, n=73):
    return np.arange(s.min(), s.max(), (s.max() - s.min()) / n)

train['binned_MonthlyCharges'] = pd.cut(train['MonthlyCharges'], bins=bin_edges(train['MonthlyCharges']))
train['binned_TotalCharges'] = pd.cut(train['TotalCharges'], bins=bin_edges(train['TotalCharges']))
train['binned_mc_vs_avg'] = pd.cut(train['mc_vs_avg'], bins=bin_edges(train['mc_vs_avg']))
train = train.dropna(subset=['binned_mc_vs_avg']).copy()
train['binned_avg_charge'] = pd.cut(train['avg_charge'], bins=bin_edges(train['avg_charge']))
train['binned_MonthlyCharges'].describe()
count                7000
unique                 72
top       (-1.506, -1.46]
freq                  829
Name: binned_MonthlyCharges, dtype: object
from matplotlib.ticker import MaxNLocator
# from pandas.api.types import is_interval_dtype
interval_columns_bin = ['tenure', 'binned_MonthlyCharges', 'binned_TotalCharges', 'binned_mc_vs_avg',
'binned_avg_charge']
for f in interval_columns_bin:
    data = train[[f]].join(y)
    fig, ax = plt.subplots(figsize=(25, 5))
    # Calculate the percentage of target=1 per bin
    cat_perc = data[[f, 'Churn']].groupby([f], as_index=False, observed=False).mean()
    cat_perc.sort_values(by=f, ascending=True, inplace=True)
    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f], color='red')
    plt.ylabel('% target', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.xticks(rotation=90)
    plt.show()
If the last month's charge is much higher or lower than the customer's average monthly charge, the customer tends to churn.¶
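To back that observation with a number (a sketch; mc_vs_avg is already standardized here, so a cut-off of 1 standard deviation is an arbitrary illustration of "too high or too low"):
# Churn rate when the last month's charge deviates strongly from the customer's own average
extreme = train['mc_vs_avg'].abs() > 1
print(train.loc[extreme, 'Churn'].mean(), train.loc[~extreme, 'Churn'].mean())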
# Scratch check on the tenure value range (not used further)
a = [i for i in range(0, train['tenure'].nunique())]
b = a.copy()
b.remove(0)
b.extend([0])
np.subtract(np.array(b), np.array(a))
len(train0['tenure'].unique())
train['tenure'].nunique()
72
interval_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'mc_vs_avg', 'avg_charge']
for f in interval_columns:
    data = train[[f]].join(y)
    fig, ax = plt.subplots(figsize=(25, 5))
    # Density of each feature, split by churn
    sns.kdeplot(data=data, x=f, hue='Churn', ax=ax)
    plt.xticks(rotation=90)
    plt.ylabel('Density', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
y.unique()
array([0, 1])
Conclusion:¶
- Action: the numeric columns (tenure, MonthlyCharges, TotalCharges, avg_charge, mc_vs_avg) are standardized with StandardScaler().
- Information: tenure is informative; as tenure increases, the churn rate decreases.
- Information: the multi-service columns InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV and StreamingMovies provide information (entropy) gain.
- Information: the categorical columns SeniorCitizen, Partner, Dependents, Contract, PaperlessBilling and PaymentMethod provide information (entropy) gain.
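The information gain mentioned above can be made concrete with mutual information between each of those columns and Churn (a sketch; mutual_info_score comes from sklearn.metrics and the column list is the one named in the conclusion):
from sklearn.metrics import mutual_info_score

info_cols = ['InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
             'StreamingTV', 'StreamingMovies', 'SeniorCitizen', 'Partner', 'Dependents',
             'Contract', 'PaperlessBilling', 'PaymentMethod']
mi = {c: mutual_info_score(train[c], train['Churn']) for c in info_cols}
print(pd.Series(mi).sort_values(ascending=False))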
Purpose: Develop meaningful feature engineering (e.g., tenure groups, service counts)¶
train
| | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | ... | MonthlyCharges | TotalCharges | Churn | month | avg_charge | mc_vs_avg | binned_MonthlyCharges | binned_TotalCharges | binned_mc_vs_avg | binned_avg_charge |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | -1.284594 | No | No phone service | DSL | No | Yes | ... | -1.165797 | -0.996734 | 0 | 1 | -1.161962 | -0.045126 | (-1.185, -1.139] | (-1.002, -0.949] | (-0.185, 0.0329] | (-1.206, -1.157] |
| 1 | Male | No | No | No | 0.061068 | Yes | No | DSL | Yes | No | ... | -0.264112 | -0.176289 | 0 | 10 | -0.308920 | 0.437219 | (-0.269, -0.223] | (-0.216, -0.164] | (0.25, 0.468] | (-0.326, -0.277] |
| 2 | Male | No | No | No | -1.243817 | Yes | No | DSL | Yes | Yes | ... | -0.367257 | -0.962189 | 1 | 2 | -0.358615 | -0.126156 | (-0.407, -0.361] | (-1.002, -0.949] | (-0.185, 0.0329] | (-0.375, -0.326] |
| 3 | Male | No | No | No | 0.509622 | No | No phone service | DSL | Yes | No | ... | -0.751554 | -0.197797 | 0 | 9 | -0.795338 | 0.618735 | (-0.773, -0.727] | (-0.216, -0.164] | (0.468, 0.685] | (-0.815, -0.766] |
| 4 | Female | No | No | No | -1.243817 | Yes | No | Fiber optic | No | No | ... | 0.193386 | -0.942998 | 1 | 2 | 0.362657 | -1.361380 | (0.189, 0.235] | (-0.949, -0.897] | (-1.489, -1.272] | (0.358, 0.407] |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | Male | No | Yes | Yes | -0.346709 | Yes | Yes | DSL | Yes | No | ... | 0.662528 | -0.131730 | 0 | 12 | 0.598521 | 0.392199 | (0.647, 0.693] | (-0.164, -0.111] | (0.25, 0.468] | (0.554, 0.603] |
| 7039 | Female | No | Yes | Yes | 1.610618 | Yes | Yes | Fiber optic | No | Yes | ... | 1.274743 | 2.238478 | 0 | 12 | 1.239375 | 0.133405 | (1.242, 1.288] | (2.193, 2.246] | (0.0329, 0.25] | (1.238, 1.287] |
| 7040 | Female | No | Yes | Yes | -0.876818 | No | No phone service | DSL | Yes | No | ... | -1.174115 | -0.857056 | 0 | 11 | -1.107395 | -1.217116 | (-1.185, -1.139] | (-0.897, -0.844] | (-1.272, -1.054] | (-1.108, -1.059] |
| 7041 | Male | Yes | Yes | No | -1.162261 | Yes | Yes | Fiber optic | No | No | ... | 0.316494 | -0.874637 | 1 | 4 | 0.390016 | -0.616774 | (0.281, 0.326] | (-0.897, -0.844] | (-0.619, -0.402] | (0.358, 0.407] |
| 7042 | Male | No | No | No | 1.365952 | Yes | No | Fiber optic | Yes | No | ... | 1.356261 | 2.009770 | 0 | 6 | 1.287196 | 0.320201 | (1.334, 1.38] | (1.984, 2.036] | (0.25, 0.468] | (1.238, 1.287] |
7011 rows × 27 columns
Conclusion:¶
- Information: tenure is informative; as tenure increases, the churn rate decreases.
- Action: MonthlyCharges -> binned_MonthlyCharges
- Action: TotalCharges -> binned_TotalCharges
Exploratory Data Analysis¶
Purpose: Perform univariate analysis on key variables¶
Chi-square test of the categorical features against the target (Churn)¶
print(train['binned_MonthlyCharges'])
0 (-1.185, -1.139]
1 (-0.269, -0.223]
2 (-0.407, -0.361]
3 (-0.773, -0.727]
4 (0.189, 0.235]
...
7038 (0.647, 0.693]
7039 (1.242, 1.288]
7040 (-1.185, -1.139]
7041 (0.281, 0.326]
7042 (1.334, 1.38]
Name: binned_MonthlyCharges, Length: 7011, dtype: category
Categories (72, interval[float64, right]): [(-1.552, -1.506] < (-1.506, -1.46] < (-1.46, -1.414] < (-1.414, -1.369] ... (1.563, 1.609] < (1.609, 1.655] < (1.655, 1.701] < (1.701, 1.746]]
# Identify categorical features (note: Churn itself passes this filter, so it also appears
# below as a trivial self-test against the target)
categorical_features = [col for col in train.columns if train[col].dtype == 'O' or train[col].nunique() <= 10]
# Store results
chi2_results = []
# Perform a Chi-Square test of independence between each categorical feature and Churn
for feature in categorical_features:
    print(f"Testing feature: {feature}")
    # Create a contingency table
    contingency_table = pd.crosstab(train[feature], y)
    # Perform Chi-Square test
    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
    # Collect results
    chi2_results.append({
        'Feature': feature,
        'Chi2_Stat': chi2_stat,
        'p-value': p_value,
        'Degrees_of_Freedom': dof,
        'Significant': p_value < 0.05  # True if p-value < 0.05
    })
    print(f"Chi-Square Statistic: {chi2_stat}, p-value: {p_value}, Significant: {p_value < 0.05}\n")
# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(chi2_results)
# Filter significant results
significant_features = results_df[results_df['Significant']]
print("Significant Features (p < 0.05):")
print(significant_features)
Testing feature: gender
Chi-Square Statistic: 0.5062156219588821, p-value: 0.4767816894213255, Significant: False
Testing feature: SeniorCitizen
Chi-Square Statistic: 160.10513402866923, p-value: 1.073193079735253e-36, Significant: True
Testing feature: Partner
Chi-Square Statistic: 155.28517991505026, p-value: 1.2130356962926863e-35, Significant: True
Testing feature: Dependents
Chi-Square Statistic: 184.33692924368518, p-value: 5.4771349693371126e-42, Significant: True
Testing feature: PhoneService
Chi-Square Statistic: 0.8035620385800925, p-value: 0.37003050788642045, Significant: False
Testing feature: MultipleLines
Chi-Square Statistic: 12.011745358588566, p-value: 0.0024642379205827634, Significant: True
Testing feature: InternetService
Chi-Square Statistic: 738.2754140566219, p-value: 4.847644630207988e-161, Significant: True
Testing feature: OnlineSecurity
Chi-Square Statistic: 857.1546402159107, p-value: 7.434209716515333e-187, Significant: True
Testing feature: OnlineBackup
Chi-Square Statistic: 609.3459819464206, p-value: 4.810622025549891e-133, Significant: True
Testing feature: DeviceProtection
Chi-Square Statistic: 566.0198983673617, p-value: 1.2312344138065643e-123, Significant: True
Testing feature: TechSupport
Chi-Square Statistic: 835.3742851533483, p-value: 3.988217336538509e-182, Significant: True
Testing feature: StreamingTV
Chi-Square Statistic: 382.6326915770964, p-value: 8.172721995373581e-84, Significant: True
Testing feature: StreamingMovies
Chi-Square Statistic: 384.4343348376759, p-value: 3.3200518247196693e-84, Significant: True
Testing feature: Contract
Chi-Square Statistic: 1175.9415045841956, p-value: 4.441674808222638e-256, Significant: True
Testing feature: PaperlessBilling
Chi-Square Statistic: 261.71587157481474, p-value: 7.252734065747982e-59, Significant: True
Testing feature: PaymentMethod
Chi-Square Statistic: 650.3389796080692, p-value: 1.2298753935495455e-140, Significant: True
Testing feature: Churn
Chi-Square Statistic: 7005.868726015489, p-value: 0.0, Significant: True
Significant Features (p < 0.05):
Feature Chi2_Stat p-value Degrees_of_Freedom \
1 SeniorCitizen 160.105134 1.073193e-36 1
2 Partner 155.285180 1.213036e-35 1
3 Dependents 184.336929 5.477135e-42 1
5 MultipleLines 12.011745 2.464238e-03 2
6 InternetService 738.275414 4.847645e-161 2
7 OnlineSecurity 857.154640 7.434210e-187 2
8 OnlineBackup 609.345982 4.810622e-133 2
9 DeviceProtection 566.019898 1.231234e-123 2
10 TechSupport 835.374285 3.988217e-182 2
11 StreamingTV 382.632692 8.172722e-84 2
12 StreamingMovies 384.434335 3.320052e-84 2
13 Contract 1175.941505 4.441675e-256 2
14 PaperlessBilling 261.715872 7.252734e-59 1
15 PaymentMethod 650.338980 1.229875e-140 3
16 Churn 7005.868726 0.000000e+00 1
Significant
1 True
2 True
3 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 True
tenure¶
plt.hist(train.tenure);
month¶
plt.hist(train['month'], bins=12);
binned_MonthlyCharges¶
train['binned_MonthlyCharges'].describe()
count                7000
unique                 72
top       (-1.506, -1.46]
freq                  829
Name: binned_MonthlyCharges, dtype: object
a = pd.DataFrame(train['binned_MonthlyCharges'].value_counts())
a.reset_index(inplace=True)
a['binned_MonthlyCharges'] = a['binned_MonthlyCharges'].astype(str)
plt.figure(figsize=(20, 5))
plt.bar(a['binned_MonthlyCharges'], a['count'])
plt.xticks(rotation=90);
# plt.hist(train['binned_MonthlyCharges'].value_counts().sort_index(), bins=72)
train['binned_MonthlyCharges'].value_counts().sort_index()
binned_MonthlyCharges
(-1.552, -1.506] 313
(-1.506, -1.46] 829
(-1.46, -1.414] 20
(-1.414, -1.369] 15
(-1.369, -1.323] 240
...
(1.517, 1.563] 59
(1.563, 1.609] 25
(1.609, 1.655] 34
(1.655, 1.701] 39
(1.701, 1.746] 25
Name: count, Length: 72, dtype: int64
binned_mc_vs_avg¶
a = pd.DataFrame(train['binned_mc_vs_avg'].value_counts())
a.reset_index(inplace=True)
a['binned_mc_vs_avg'] = a['binned_mc_vs_avg'].astype(str)
plt.figure(figsize=(20, 5))
# Ensure the data is sorted
# sorted_data = a.sort_values(by='binned_mc_vs_avg') # Sort data by 'binned_mc_vs_avg'
# # Plot the data
# plt.bar(sorted_data['binned_mc_vs_avg'], sorted_data['count'])
# # Sort x-ticks
# plt.xticks(sorted_data['binned_mc_vs_avg'], sorted(sorted_data['binned_mc_vs_avg']))
plt.scatter(a['binned_mc_vs_avg'], a['count'])
plt.xticks(rotation=90);
# sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f], color='red')
# plt.plot(train['binned_mc_vs_avg'].value_counts().sort_index())
df = a.copy()
df['binned_mc_vs_avg'] = pd.Categorical(
df['binned_mc_vs_avg'],
categories=sorted(df['binned_mc_vs_avg'].unique(), key=lambda x: float(x.split(',')[0][1:])),
ordered=True
)
df = df.sort_values(by='binned_mc_vs_avg').reset_index(drop=True)
plt.figure(figsize=(20, 5))
plt.scatter(df['binned_mc_vs_avg'], df['count'])
plt.xticks(rotation=90);
Categorical Count Bar Plot¶
cat_meaningful_list = ['month', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
'TechSupport', 'PaymentMethod', 'PaperlessBilling', 'Contract', 'Dependents', 'Partner',
'SeniorCitizen', 'StreamingMovies', 'PhoneService', 'StreamingTV', 'MultipleLines']
def set_spines_visibility(ax, is_visible):
    for s in ['left', 'right', 'top', 'bottom']:
        ax.spines[s].set_visible(is_visible)
fig = plt.figure(figsize=(16, 250))
gs = fig.add_gridspec(55, 2)
for i in range(len(cat_meaningful_list) + 1):
    if i == 0:
        ax = fig.add_subplot(gs[0, 0])
        ax.text(x=0.5, y=0.5, s="Bar Plot for \nCategorical Variables",
                horizontalalignment='center', verticalalignment='center',
                fontweight='bold', fontsize='24')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.tick_params(left=False, bottom=False)
        set_spines_visibility(ax, False)
    else:
        ax = fig.add_subplot(gs[(i) // 2, i % 2])
        sns.countplot(x=train[cat_meaningful_list[i - 1]], ax=ax)
        set_spines_visibility(ax, False)
        plt.xticks(rotation=0)
plt.show()
cat_meaningful_list.extend(['MonthlyCharges', 'mc_vs_avg'])
Model and Evaluation¶
Feature Selection¶
cat_meaningful_list
['month', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'PaymentMethod', 'PaperlessBilling', 'Contract', 'Dependents', 'Partner', 'SeniorCitizen', 'StreamingMovies', 'PhoneService', 'StreamingTV', 'MultipleLines', 'MonthlyCharges', 'mc_vs_avg']
interval_columns_bin
['tenure', 'binned_MonthlyCharges', 'binned_TotalCharges', 'binned_mc_vs_avg', 'binned_avg_charge']
train_list = cat_meaningful_list.copy()
train_list.extend(interval_columns_bin)
train_list.remove('binned_TotalCharges')
# train_list.remove('binned_MonthlyCharges')
train_list.remove('binned_avg_charge')
train_list
['month', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'PaymentMethod', 'PaperlessBilling', 'Contract', 'Dependents', 'Partner', 'SeniorCitizen', 'StreamingMovies', 'PhoneService', 'StreamingTV', 'MultipleLines', 'MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges', 'binned_mc_vs_avg']
train = train[train_list]
One-hot encode the variables with 3-4 values¶
multi_value_cat_columns = []
for f in train.columns:
    if ((train[f].dtype == 'O' or train[f].dtype == 'category') and
            (train[f].nunique() <= 12) and (train[f].nunique() > 0)):
        multi_value_cat_columns.append(f)
# print(multi_value_cat_columns)
train = pd.get_dummies(train, columns=multi_value_cat_columns)
train.columns
Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10',
'month_11', 'month_12', 'InternetService_DSL',
'InternetService_Fiber optic', 'InternetService_No',
'OnlineSecurity_No', 'OnlineSecurity_No internet service',
'OnlineSecurity_Yes', 'OnlineBackup_No',
'OnlineBackup_No internet service', 'OnlineBackup_Yes',
'DeviceProtection_No', 'DeviceProtection_No internet service',
'DeviceProtection_Yes', 'TechSupport_No',
'TechSupport_No internet service', 'TechSupport_Yes',
'PaymentMethod_Bank transfer (automatic)',
'PaymentMethod_Credit card (automatic)',
'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
'PaperlessBilling_No', 'PaperlessBilling_Yes',
'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
'Dependents_No', 'Dependents_Yes', 'Partner_No', 'Partner_Yes',
'SeniorCitizen_No', 'SeniorCitizen_Yes', 'StreamingMovies_No',
'StreamingMovies_No internet service', 'StreamingMovies_Yes',
'PhoneService_No', 'PhoneService_Yes', 'StreamingTV_No',
'StreamingTV_No internet service', 'StreamingTV_Yes',
'MultipleLines_No', 'MultipleLines_No phone service',
'MultipleLines_Yes'],
dtype='object')
drop_list = ['InternetService_No', 'OnlineSecurity_No internet service',
             'OnlineBackup_No internet service', 'DeviceProtection_No internet service',
             'TechSupport_No internet service', 'StreamingMovies_No internet service',
             'MultipleLines_No phone service', 'StreamingTV_No internet service',
             'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Bank transfer (automatic)',
             'PaymentMethod_Mailed check',
             'PaperlessBilling_No', 'Contract_Two year', 'Dependents_Yes', 'Partner_Yes', 'SeniorCitizen_No',
             'PhoneService_No', 'month_7']
train.drop(columns=drop_list, inplace=True)
train
| MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | Dependents_No | Partner_No | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | (-0.185, 0.0329] | True | False | False | False | False | ... | True | False | False | True | False | False | True | False | False | False |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | (0.25, 0.468] | False | False | False | False | False | ... | True | True | False | True | False | True | True | False | True | False |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | (-0.185, 0.0329] | False | True | False | False | False | ... | True | True | False | True | False | True | True | False | True | False |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | (0.468, 0.685] | False | False | False | False | False | ... | True | True | False | True | False | False | True | False | False | False |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | (-1.489, -1.272] | False | True | False | False | False | ... | True | True | False | True | False | True | True | False | True | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0.662528 | 0.392199 | -0.346709 | (0.647, 0.693] | (0.25, 0.468] | False | False | False | False | False | ... | False | False | False | False | True | True | False | True | False | True |
| 7039 | 1.274743 | 0.133405 | 1.610618 | (1.242, 1.288] | (0.0329, 0.25] | False | False | False | False | False | ... | False | False | False | False | True | True | False | True | False | True |
| 7040 | -1.174115 | -1.217116 | -0.876818 | (-1.185, -1.139] | (-1.272, -1.054] | False | False | False | False | False | ... | False | False | False | True | False | False | True | False | False | False |
| 7041 | 0.316494 | -0.616774 | -1.162261 | (0.281, 0.326] | (-0.619, -0.402] | False | False | False | True | False | ... | True | False | True | True | False | True | True | False | False | True |
| 7042 | 1.356261 | 0.320201 | 1.365952 | (1.334, 1.38] | (0.25, 0.468] | False | False | False | False | False | ... | True | True | False | False | True | True | False | True | True | False |
7011 rows × 40 columns
The dictionary for category translation¶
translation_dict = {}
If we need to convert a two-value category into a binary 0/1 column¶
for f in train.columns:
    if (train[f].dtype == 'O' or train[f].dtype == 'bool') and (train[f].nunique() == 2):
        # Note: the 0/1 assignment follows the order in which values first appear in the column,
        # so for some dummy columns True maps to 0 (see the dictionary below)
        translation_dict[f] = {train[f].unique()[0]: 0, train[f].unique()[1]: 1}
        train[f] = train[f].map({train[f].unique()[0]: 0, train[f].unique()[1]: 1})
translation_dict
{'month_1': {np.True_: 0, np.False_: 1},
'month_2': {np.False_: 0, np.True_: 1},
'month_3': {np.False_: 0, np.True_: 1},
'month_4': {np.False_: 0, np.True_: 1},
'month_5': {np.False_: 0, np.True_: 1},
'month_6': {np.False_: 0, np.True_: 1},
'month_8': {np.False_: 0, np.True_: 1},
'month_9': {np.False_: 0, np.True_: 1},
'month_10': {np.False_: 0, np.True_: 1},
'month_11': {np.False_: 0, np.True_: 1},
'month_12': {np.False_: 0, np.True_: 1},
'InternetService_DSL': {np.True_: 0, np.False_: 1},
'InternetService_Fiber optic': {np.False_: 0, np.True_: 1},
'OnlineSecurity_No': {np.True_: 0, np.False_: 1},
'OnlineSecurity_Yes': {np.False_: 0, np.True_: 1},
'OnlineBackup_No': {np.False_: 0, np.True_: 1},
'OnlineBackup_Yes': {np.True_: 0, np.False_: 1},
'DeviceProtection_No': {np.True_: 0, np.False_: 1},
'DeviceProtection_Yes': {np.False_: 0, np.True_: 1},
'TechSupport_No': {np.True_: 0, np.False_: 1},
'TechSupport_Yes': {np.False_: 0, np.True_: 1},
'PaymentMethod_Electronic check': {np.True_: 0, np.False_: 1},
'PaperlessBilling_Yes': {np.True_: 0, np.False_: 1},
'Contract_Month-to-month': {np.True_: 0, np.False_: 1},
'Contract_One year': {np.False_: 0, np.True_: 1},
'Dependents_No': {np.True_: 0, np.False_: 1},
'Partner_No': {np.False_: 0, np.True_: 1},
'SeniorCitizen_Yes': {np.False_: 0, np.True_: 1},
'StreamingMovies_No': {np.True_: 0, np.False_: 1},
'StreamingMovies_Yes': {np.False_: 0, np.True_: 1},
'PhoneService_Yes': {np.False_: 0, np.True_: 1},
'StreamingTV_No': {np.True_: 0, np.False_: 1},
'StreamingTV_Yes': {np.False_: 0, np.True_: 1},
'MultipleLines_No': {np.False_: 0, np.True_: 1},
'MultipleLines_Yes': {np.False_: 0, np.True_: 1}}
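Note that the 0/1 assignment above depends on the order in which values first appear in each column, which is why some dummy columns map True to 0. A deterministic alternative (a sketch, not what was run here) would be to cast the boolean dummy columns directly so that True is always 1:
# Sketch only: deterministic conversion of boolean dummy columns (True -> 1, False -> 0)
# bool_cols = train.select_dtypes(include='bool').columns
# train[bool_cols] = train[bool_cols].astype(int)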
Reverse dictionary for translating ints back to the original values¶
translation_dict_reverse = {}
for feat_key in translation_dict:
    translation_dict_reverse[feat_key] = {value: key for key, value in translation_dict[feat_key].items()}
translation_dict_reverse
{'month_1': {0: np.True_, 1: np.False_},
'month_2': {0: np.False_, 1: np.True_},
'month_3': {0: np.False_, 1: np.True_},
'month_4': {0: np.False_, 1: np.True_},
'month_5': {0: np.False_, 1: np.True_},
'month_6': {0: np.False_, 1: np.True_},
'month_8': {0: np.False_, 1: np.True_},
'month_9': {0: np.False_, 1: np.True_},
'month_10': {0: np.False_, 1: np.True_},
'month_11': {0: np.False_, 1: np.True_},
'month_12': {0: np.False_, 1: np.True_},
'InternetService_DSL': {0: np.True_, 1: np.False_},
'InternetService_Fiber optic': {0: np.False_, 1: np.True_},
'OnlineSecurity_No': {0: np.True_, 1: np.False_},
'OnlineSecurity_Yes': {0: np.False_, 1: np.True_},
'OnlineBackup_No': {0: np.False_, 1: np.True_},
'OnlineBackup_Yes': {0: np.True_, 1: np.False_},
'DeviceProtection_No': {0: np.True_, 1: np.False_},
'DeviceProtection_Yes': {0: np.False_, 1: np.True_},
'TechSupport_No': {0: np.True_, 1: np.False_},
'TechSupport_Yes': {0: np.False_, 1: np.True_},
'PaymentMethod_Electronic check': {0: np.True_, 1: np.False_},
'PaperlessBilling_Yes': {0: np.True_, 1: np.False_},
'Contract_Month-to-month': {0: np.True_, 1: np.False_},
'Contract_One year': {0: np.False_, 1: np.True_},
'Dependents_No': {0: np.True_, 1: np.False_},
'Partner_No': {0: np.False_, 1: np.True_},
'SeniorCitizen_Yes': {0: np.False_, 1: np.True_},
'StreamingMovies_No': {0: np.True_, 1: np.False_},
'StreamingMovies_Yes': {0: np.False_, 1: np.True_},
'PhoneService_Yes': {0: np.False_, 1: np.True_},
'StreamingTV_No': {0: np.True_, 1: np.False_},
'StreamingTV_Yes': {0: np.False_, 1: np.True_},
'MultipleLines_No': {0: np.False_, 1: np.True_},
'MultipleLines_Yes': {0: np.False_, 1: np.True_}}
for f in train.columns:
    # print(train[f].nunique())
    print(f, '\t\t', train[f].dtype, '\t\t', train[f].nunique(), '\t', end="")
    if train[f].unique().size <= 12:
        print(f"{train[f].unique()}")
    else:
        print(f"[]")
MonthlyCharges                    float64     1584    []
mc_vs_avg                         float64     6411    []
tenure                            float64     72      []
binned_MonthlyCharges             category    72      []
binned_mc_vs_avg                  category    62      []
month_1                           int64       2       [0 1]
month_2                           int64       2       [0 1]
month_3                           int64       2       [0 1]
month_4                           int64       2       [0 1]
month_5                           int64       2       [0 1]
month_6                           int64       2       [0 1]
month_8                           int64       2       [0 1]
month_9                           int64       2       [0 1]
month_10                          int64       2       [0 1]
month_11                          int64       2       [0 1]
month_12                          int64       2       [0 1]
InternetService_DSL               int64       2       [0 1]
InternetService_Fiber optic       int64       2       [0 1]
OnlineSecurity_No                 int64       2       [0 1]
OnlineSecurity_Yes                int64       2       [0 1]
OnlineBackup_No                   int64       2       [0 1]
OnlineBackup_Yes                  int64       2       [0 1]
DeviceProtection_No               int64       2       [0 1]
DeviceProtection_Yes              int64       2       [0 1]
TechSupport_No                    int64       2       [0 1]
TechSupport_Yes                   int64       2       [0 1]
PaymentMethod_Electronic check    int64       2       [0 1]
PaperlessBilling_Yes              int64       2       [0 1]
Contract_Month-to-month           int64       2       [0 1]
Contract_One year                 int64       2       [0 1]
Dependents_No                     int64       2       [0 1]
Partner_No                        int64       2       [0 1]
SeniorCitizen_Yes                 int64       2       [0 1]
StreamingMovies_No                int64       2       [0 1]
StreamingMovies_Yes               int64       2       [0 1]
PhoneService_Yes                  int64       2       [0 1]
StreamingTV_No                    int64       2       [0 1]
StreamingTV_Yes                   int64       2       [0 1]
MultipleLines_No                  int64       2       [0 1]
MultipleLines_Yes                 int64       2       [0 1]
# object_columns = train.select_dtypes(include=['object']).columns
# len(object_columns)
Convert group to midpoint¶
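No midpoint conversion is actually applied below; the binned interval columns are kept as categories. A minimal sketch of what the heading describes (replacing each bin by its interval midpoint; binned_cols is a hypothetical list of the binned columns created earlier) could look like:
# Sketch only, not applied in this notebook:
# binned_cols = ['binned_MonthlyCharges', 'binned_mc_vs_avg']
# for c in binned_cols:
#     train[c + '_mid'] = train[c].apply(lambda iv: iv.mid if pd.notna(iv) else np.nan).astype(float)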
dup_list = train.duplicated()
dup_list.sum()
np.int64(59)
train_y = train.join(y)
total_dup_list = train_y.duplicated()
total_dup_list.sum()
np.int64(45)
conflict_list = dup_list & ~total_dup_list
conflict_list.sum()
np.int64(14)
I don't think it is appropriate to drop these rows, because the conflict is introduced by the preprocessing (binning and dropped columns), not by the raw data.¶
# train_y.drop(train[conflict_list].index, inplace=True)
y = train_y['Churn']
train = train_y.drop(columns=['Churn'])
train
| | MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | Dependents_No | Partner_No | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | (-0.185, 0.0329] | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | (-0.185, 0.0329] | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | (0.468, 0.685] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | (-1.489, -1.272] | 1 | 1 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0.662528 | 0.392199 | -0.346709 | (0.647, 0.693] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
| 7039 | 1.274743 | 0.133405 | 1.610618 | (1.242, 1.288] | (0.0329, 0.25] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
| 7040 | -1.174115 | -1.217116 | -0.876818 | (-1.185, -1.139] | (-1.272, -1.054] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7041 | 0.316494 | -0.616774 | -1.162261 | (0.281, 0.326] | (-0.619, -0.402] | 1 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7042 | 1.356261 | 0.320201 | 1.365952 | (1.334, 1.38] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
7011 rows × 40 columns
Encode a serviceCount feature to represent the number of subscribed services¶
service_count_list = ['OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes',
'StreamingMovies_Yes', 'MultipleLines_Yes', 'InternetService_DSL',
'InternetService_Fiber optic']
train['serviceCount'] = train[service_count_list].sum(axis=1)
train
| | MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | Partner_No | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes | serviceCount |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | (-0.185, 0.0329] | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | (-0.185, 0.0329] | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | (0.468, 0.685] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | (-1.489, -1.272] | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0.662528 | 0.392199 | -0.346709 | (0.647, 0.693] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 6 |
| 7039 | 1.274743 | 0.133405 | 1.610618 | (1.242, 1.288] | (0.0329, 0.25] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 5 |
| 7040 | -1.174115 | -1.217116 | -0.876818 | (-1.185, -1.139] | (-1.272, -1.054] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 7041 | 0.316494 | -0.616774 | -1.162261 | (0.281, 0.326] | (-0.619, -0.402] | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 4 |
| 7042 | 1.356261 | 0.320201 | 1.365952 | (1.334, 1.38] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 7 |
7011 rows × 41 columns
train['Churn'] = y
for f in ['serviceCount']:
    fig, ax = plt.subplots(figsize=(20, 5))
    # Calculate the percentage of target=1 per service count
    cat_perc = train[[f, 'Churn']].groupby([f], as_index=False).mean()
    # Order the bars descending on the target mean
    cat_perc.sort_values(by='Churn', ascending=False, inplace=True)
    sns.barplot(ax=ax, x=f, y='Churn', data=cat_perc, order=cat_perc[f])
    plt.ylabel('% Churn', fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
train['serviceCount'] = train['serviceCount'].astype('category')
Service count vs monthly charges¶
print(train.columns)
Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
'month_5', 'month_6', 'month_8', 'month_9', 'month_10', 'month_11',
'month_12', 'InternetService_DSL', 'InternetService_Fiber optic',
'OnlineSecurity_No', 'OnlineSecurity_Yes', 'OnlineBackup_No',
'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_Yes',
'TechSupport_No', 'TechSupport_Yes', 'PaymentMethod_Electronic check',
'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
'MultipleLines_Yes', 'serviceCount', 'Churn'],
dtype='object')
sns.scatterplot(x='serviceCount', y='MonthlyCharges', hue='Churn', data=train)
<Axes: xlabel='serviceCount', ylabel='MonthlyCharges'>
train
| | MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes | serviceCount | Churn |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | (-0.185, 0.0329] | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 | 0 |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | (-0.185, 0.0329] | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | (0.468, 0.685] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | (-1.489, -1.272] | 1 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0.662528 | 0.392199 | -0.346709 | (0.647, 0.693] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 6 | 0 |
| 7039 | 1.274743 | 0.133405 | 1.610618 | (1.242, 1.288] | (0.0329, 0.25] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 5 | 0 |
| 7040 | -1.174115 | -1.217116 | -0.876818 | (-1.185, -1.139] | (-1.272, -1.054] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 |
| 7041 | 0.316494 | -0.616774 | -1.162261 | (0.281, 0.326] | (-0.619, -0.402] | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 4 | 1 |
| 7042 | 1.356261 | 0.320201 | 1.365952 | (1.334, 1.38] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 7 | 0 |
7011 rows × 42 columns
Prepare the train set and target¶
y = train['Churn']
train.drop('Churn', inplace=True, axis=1)
train
| MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | Partner_No | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes | serviceCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | (-0.185, 0.0329] | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | (-0.185, 0.0329] | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | (0.468, 0.685] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | (-1.489, -1.272] | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 0.662528 | 0.392199 | -0.346709 | (0.647, 0.693] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 6 |
| 7039 | 1.274743 | 0.133405 | 1.610618 | (1.242, 1.288] | (0.0329, 0.25] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 5 |
| 7040 | -1.174115 | -1.217116 | -0.876818 | (-1.185, -1.139] | (-1.272, -1.054] | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 7041 | 0.316494 | -0.616774 | -1.162261 | (0.281, 0.326] | (-0.619, -0.402] | 1 | 0 | 0 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 4 |
| 7042 | 1.356261 | 0.320201 | 1.365952 | (1.334, 1.38] | (0.25, 0.468] | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 7 |
7011 rows × 41 columns
for col in ['binned_mc_vs_avg']:
train[col] = train[col].apply(lambda x: x.mid) # Convert intervals to their midpoint
train.head()
| MonthlyCharges | mc_vs_avg | tenure | binned_MonthlyCharges | binned_mc_vs_avg | month_1 | month_2 | month_3 | month_4 | month_5 | ... | Partner_No | SeniorCitizen_Yes | StreamingMovies_No | StreamingMovies_Yes | PhoneService_Yes | StreamingTV_No | StreamingTV_Yes | MultipleLines_No | MultipleLines_Yes | serviceCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.165797 | -0.045126 | -1.284594 | (-1.185, -1.139] | -0.07605 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | -0.264112 | 0.437219 | 0.061068 | (-0.269, -0.223] | 0.35900 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
| 2 | -0.367257 | -0.126156 | -1.243817 | (-0.407, -0.361] | -0.07605 | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3 | -0.751554 | 0.618735 | 0.509622 | (-0.773, -0.727] | 0.57650 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 |
| 4 | 0.193386 | -1.361380 | -1.243817 | (0.189, 0.235] | -1.38050 | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
5 rows × 41 columns
for col in train.columns:
if train[col].dtype != 'float64':
train[col] = train[col].astype('category')
train.columns
Index(['MonthlyCharges', 'mc_vs_avg', 'tenure', 'binned_MonthlyCharges',
'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3', 'month_4',
'month_5', 'month_6', 'month_8', 'month_9', 'month_10', 'month_11',
'month_12', 'InternetService_DSL', 'InternetService_Fiber optic',
'OnlineSecurity_No', 'OnlineSecurity_Yes', 'OnlineBackup_No',
'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_Yes',
'TechSupport_No', 'TechSupport_Yes', 'PaymentMethod_Electronic check',
'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
'MultipleLines_Yes', 'serviceCount'],
dtype='object')
binned_MonthlyCharges adds no value (or even hurts); drop it together with MonthlyCharges, mc_vs_avg, and serviceCount¶
train.drop(['binned_MonthlyCharges', 'MonthlyCharges', 'mc_vs_avg', 'serviceCount'], inplace=True, axis=1)
y = y.astype('int')
train_y = train.copy()
train_y.columns
Index(['tenure', 'binned_mc_vs_avg', 'month_1', 'month_2', 'month_3',
'month_4', 'month_5', 'month_6', 'month_8', 'month_9', 'month_10',
'month_11', 'month_12', 'InternetService_DSL',
'InternetService_Fiber optic', 'OnlineSecurity_No',
'OnlineSecurity_Yes', 'OnlineBackup_No', 'OnlineBackup_Yes',
'DeviceProtection_No', 'DeviceProtection_Yes', 'TechSupport_No',
'TechSupport_Yes', 'PaymentMethod_Electronic check',
'PaperlessBilling_Yes', 'Contract_Month-to-month', 'Contract_One year',
'Dependents_No', 'Partner_No', 'SeniorCitizen_Yes',
'StreamingMovies_No', 'StreamingMovies_Yes', 'PhoneService_Yes',
'StreamingTV_No', 'StreamingTV_Yes', 'MultipleLines_No',
'MultipleLines_Yes'],
dtype='object')
# train_y.drop(['avg_charge', 'TotalCharges', 'MonthlyCharges', 'mc_vs_avg', 'tenure'], inplace=True, axis=1)
# dup_list = train_y.duplicated()
# print(dup_list.sum())
# train_y = train.join(y).copy()
# total_dup_list = train_y.duplicated()
# print(total_dup_list.sum())
# conflict_list = dup_list & ~total_dup_list
# print(conflict_list.sum())
# train_y.drop(train[conflict_list].index, inplace=True)
# y = train_y['Churn']
# train = train_y.drop(columns=['Churn'])
train.shape
y.value_counts()
Churn
0    5152
1    1859
Name: count, dtype: int64
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, random_state=42, stratify=y)
feat_labels = X_train.columns
# smote = SMOTE(random_state=42)
smote = SMOTE(random_state=0)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
# from imblearn.over_sampling import ADASYN
#
# adasyn = ADASYN(sampling_strategy=0.75, n_neighbors=4)
# adasyn = ADASYN()
# X_train_sm, y_train_sm = adasyn.fit_resample(X_train, y_train)
# X_train_sm = X_train
# y_train_sm = y_train
y_train_sm.value_counts() / len(y_train_sm)
Churn
0    0.5
1    0.5
Name: count, dtype: float64
Implement a Random Forest classification model¶
model = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, min_samples_leaf=10)
model.fit(X_train_sm, y_train_sm)
importances = model.feature_importances_
indices = np.argsort(model.feature_importances_)[::-1]
print(np.cumsum(importances[indices]))
print(importances[indices])
print(feat_labels[indices])
[0.25885911 0.38850738 0.4914927 0.56114405 0.62522077 0.68775492
0.73601749 0.76147469 0.78623121 0.81033346 0.8334346 0.85397203
0.86741355 0.88053893 0.89283672 0.90487134 0.91686185 0.92801217
0.93586798 0.94285803 0.94898147 0.95469178 0.96039404 0.96545328
0.97017336 0.97436966 0.97761533 0.98046968 0.98319377 0.98582835
0.98840161 0.99094224 0.99292451 0.99482136 0.99670612 0.99841269
1. ]
[0.25885911 0.12964827 0.10298532 0.06965134 0.06407672 0.06253415
0.04826258 0.0254572 0.02475651 0.02410225 0.02310115 0.02053742
0.01344152 0.01312538 0.01229779 0.01203462 0.01199051 0.01115032
0.0078558 0.00699006 0.00612343 0.00571031 0.00570226 0.00505924
0.00472009 0.0041963 0.00324567 0.00285435 0.00272409 0.00263458
0.00257325 0.00254064 0.00198227 0.00189684 0.00188477 0.00170657
0.00158731]
Index(['binned_mc_vs_avg', 'Contract_Month-to-month', 'tenure',
'PaymentMethod_Electronic check', 'OnlineSecurity_No', 'TechSupport_No',
'InternetService_Fiber optic', 'month_1', 'PaperlessBilling_Yes',
'OnlineBackup_No', 'Partner_No', 'Dependents_No', 'DeviceProtection_No',
'SeniorCitizen_Yes', 'InternetService_DSL', 'OnlineSecurity_Yes',
'Contract_One year', 'TechSupport_Yes', 'MultipleLines_Yes',
'MultipleLines_No', 'OnlineBackup_Yes', 'StreamingTV_Yes',
'StreamingMovies_Yes', 'StreamingTV_No', 'StreamingMovies_No',
'DeviceProtection_Yes', 'month_6', 'month_3', 'month_2', 'month_10',
'month_5', 'month_12', 'month_8', 'PhoneService_Yes', 'month_9',
'month_11', 'month_4'],
dtype='object')
y_pred = model.predict(X_test)
Evaluate the model metrics¶
- Precision: the ratio of correctly predicted positive observations to all predicted positives, TP / (TP + FP).
- Recall (sensitivity): the ratio of correctly predicted positive observations to all actual positives, TP / (TP + FN).
- F1-score: the harmonic mean of precision and recall; it penalizes a large gap between the two, which makes it a better single summary than accuracy on imbalanced classes (a quick cross-check against the confusion matrix follows below).
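As a cross-check on the sklearn helpers used in the next cell, the same numbers can be derived by hand from the confusion matrix. A minimal sketch; the *_manual names are illustrative:
from sklearn.metrics import confusion_matrix
# For the 0/1 labels, ravel() returns: true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
precision_manual = tp / (tp + fp)
recall_manual = tp / (tp + fn)
f1_manual = 2 * precision_manual * recall_manual / (precision_manual + recall_manual)
print(f"Precision: {precision_manual}, Recall: {recall_manual}, F1: {f1_manual}")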
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")
# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")
# F1-Score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")
Accuracy: 0.7954383464005702
Precision: 0.6164383561643836
Recall: 0.6048387096774194
F1-Score: 0.6105834464043419
Random search over hyperparameters for the random forest classifier¶
y_train_sm.value_counts()
Churn
0    4121
1    4121
Name: count, dtype: int64
from sklearn.metrics import make_scorer, f1_score
# Define the random forest model
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
# Define the hyperparameter grid
param_distributions = {
'n_estimators': [i for i in range(300, 1100, 100)], # Number of trees in the forest
'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 120], # Maximum depth of trees
'min_samples_split': [i for i in range(2, 11)], # Minimum samples required to split an internal node
'min_samples_leaf': [i for i in range(1, 11)], # Minimum samples required in a leaf node
'max_features': ['sqrt', 'log2', None] # Number of features to consider for the best split
}
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)
# Set up the RandomizedSearchCV
random_search = RandomizedSearchCV(
estimator=rf,
param_distributions=param_distributions,
n_iter=50, # Number of parameter settings to sample
scoring=f1_class_1_scorer, # Score based on F1-Score
n_jobs=-1, # Use all processors
cv=5, # 5-fold cross-validation
verbose=2, # Print status messages during search
random_state=0
)
# Fit the randomized search model
random_search.fit(X_train_sm, y_train_sm)
# Output the best parameters and score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best F1-Score: {random_search.best_score_}")
# Evaluate on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred)) # Detailed performance metrics
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 600, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 10}
Best F1-Score: 0.8169772588421169
precision recall f1-score support
0 0.87 0.87 0.87 1031
1 0.63 0.63 0.63 372
accuracy 0.80 1403
macro avg 0.75 0.75 0.75 1403
weighted avg 0.80 0.80 0.80 1403
import pickle
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]  # rank the tuned model's features, most important first
print(feat_labels[indices])
# Save the tuned model to a file
with open(os.path.join(workdir, 'churn_mcvsavg_month.pkl'), 'wb') as file:
    pickle.dump(best_model, file)
print("Model saved successfully!")
Index(['binned_mc_vs_avg', 'Contract_Month-to-month', 'tenure',
'PaymentMethod_Electronic check', 'OnlineSecurity_No', 'TechSupport_No',
'InternetService_Fiber optic', 'month_1', 'PaperlessBilling_Yes',
'OnlineBackup_No', 'Partner_No', 'Dependents_No', 'DeviceProtection_No',
'SeniorCitizen_Yes', 'InternetService_DSL', 'OnlineSecurity_Yes',
'Contract_One year', 'TechSupport_Yes', 'MultipleLines_Yes',
'MultipleLines_No', 'OnlineBackup_Yes', 'StreamingTV_Yes',
'StreamingMovies_Yes', 'StreamingTV_No', 'StreamingMovies_No',
'DeviceProtection_Yes', 'month_6', 'month_3', 'month_2', 'month_10',
'month_5', 'month_12', 'month_8', 'PhoneService_Yes', 'month_9',
'month_11', 'month_4'],
dtype='object')
Model saved successfully!
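To reuse the pickled model later, for example in a separate scoring session, something like the following should work. A minimal sketch; loaded_model is an illustrative name, and the input must have the same columns the model was trained on:
with open(os.path.join(workdir, 'churn_mcvsavg_month.pkl'), 'rb') as file:
    loaded_model = pickle.load(file)
print(classification_report(y_test, loaded_model.predict(X_test)))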
SMOTE
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 600, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 10}
Best F1-Score: 0.8118358823942179
              precision    recall  f1-score   support
           0       0.87      0.86      0.86      1031
           1       0.62      0.64      0.63       372
    accuracy                           0.80      1403
   macro avg       0.75      0.75      0.75      1403
weighted avg       0.80      0.80      0.80      1403

ADASYN
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 800, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 'log2', 'max_depth': 40}
Best F1-Score: 0.7764571213311428
              precision    recall  f1-score   support
           0       0.85      0.87      0.86      1031
           1       0.62      0.58      0.60       372
    accuracy                           0.79      1403
   macro avg       0.74      0.73      0.73      1403
weighted avg       0.79      0.79      0.79      1403
Cross Validation¶
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
smote = SMOTE(random_state=0)
X_train_sm, y_train_sm = smote.fit_resample(train, y)
scores = cross_val_score(best_model, X_train_sm, y_train_sm, cv=kf, scoring='f1')
average_f1 = np.mean(scores)
print(f"F1 Score for each fold: {[round(score, 4) for score in scores]}")
print(f"Average F1 across {k} folds: {average_f1:.2f}")
F1 Score for each fold: [0.8468, 0.8539, 0.8682, 0.8573, 0.8523]
Average F1 across 5 folds: 0.86
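One caveat: SMOTE is applied here to the full dataset before the folds are split, so synthetic points derived from a fold's validation samples can leak into its training data, which tends to inflate the cross-validated F1 relative to the held-out test score above. A leak-free variant would resample inside each fold, e.g. with imblearn's Pipeline. A sketch, reusing the best hyperparameters found above; cv_pipeline is an illustrative name:
from imblearn.pipeline import Pipeline

# SMOTE is refit on each fold's training portion only, so validation folds stay untouched
cv_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('rf', RandomForestClassifier(n_estimators=600, min_samples_split=8, min_samples_leaf=5,
                                  max_features='log2', max_depth=10, random_state=0, n_jobs=-1)),
])
leak_free_scores = cross_val_score(cv_pipeline, train, y, cv=kf, scoring='f1')
print(f"Leak-free F1 per fold: {[round(s, 4) for s in leak_free_scores]}")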
- Definition: R² measures how well a regression model explains the variability in the target variable (see the formula below).
- Use Case:
- R² is crucial for regression problems, like predicting continuous variables such as customer spending or tenure.
- An R² value close to 1 indicates the model explains most of the variability in the target variable, while a value close to 0 indicates poor predictive power.
- Limitations:
- R² is not commonly used for classification problems (as it isn't meaningful in that context).
- A high R² doesn't necessarily mean the model is a good fit; it might be overfitting on the dataset.
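For reference, the standard definition, where $\hat{y}_i$ are the model's predictions and $\bar{y}$ is the mean of the observed target:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$$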
Analysis¶
In the first months of tenure, customers churn mainly according to their experience with the service.
After several months, customers tend to be stable; those who leave later probably do so because their circumstances change, e.g. a life change or a switch to a new service provider.
The serviceCount feature did not boost model performance, presumably because the individual service columns already carry that information in combination.
Adding features¶
I created a feature train['avg_charge'] = train['TotalCharges'] / train['tenure']. This revealed that MonthlyCharges is likely the latest subscription fee rather than a long-run average.
I then created train['mc_vs_avg'] = train['MonthlyCharges'] / train['avg_charge'] to capture the change in the most recent month's charge.
Finally I binned this ratio into binned_mc_vs_avg; customers whose ratio changes too much tend to churn. A sketch of the construction follows below.
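A minimal sketch of that construction on the raw columns, working on train0 (the untouched copy kept after loading) before any scaling; the bin count passed to pd.cut is an assumption, since the exact binning isn't restated here:
raw = train0.copy()
raw['TotalCharges'] = pd.to_numeric(raw['TotalCharges'], errors='coerce')  # TotalCharges is read as text in the raw CSV

# Average charge per month of tenure (undefined when tenure == 0)
raw['avg_charge'] = raw['TotalCharges'] / raw['tenure']
# Ratio of the latest charge to the customer's average charge
raw['mc_vs_avg'] = raw['MonthlyCharges'] / raw['avg_charge']
# Equal-width bins of the ratio (20 bins is an assumed choice)
raw['binned_mc_vs_avg'] = pd.cut(raw['mc_vs_avg'], bins=20)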
Adding a month feature¶
I created a feature train['month'] = train['tenure'] % 12 + 1, which gives some clue about the churn rate; a sketch of the encoding follows below.
A high churn rate in January often stems from a combination of New Year factors: tightened budgets, post-holiday spending fatigue, resolutions that lead to cancellations, trial periods from holiday sign-ups expiring, and customers re-evaluating their subscriptions while planning their finances for the year.
More broadly, customer behavior appears to vary with the seasons.
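A sketch of that feature and its one-hot encoding, assuming the month_1 … month_12 indicator columns seen in the column list above come from pd.get_dummies (the exact call isn't shown earlier, so this is an assumption):
months = train0['tenure'] % 12 + 1  # position within a 12-month cycle, values 1..12
month_dummies = pd.get_dummies(months, prefix='month').astype(int)
print(month_dummies.sum())  # how many customers fall in each month bucket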