In [1]:
!python -m pip install --upgrade pip
!pip install numpy
!pip install pandas
# itertools ships with the Python standard library; no pip install needed
!pip install catboost
!pip install scikit-learn
Requirement already satisfied: pip in /usr/local/lib/python3.10/dist-packages (24.2)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.24.1)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.3)
Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.24.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)
Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: catboost in /usr/local/lib/python3.10/dist-packages (1.2.7)
Requirement already satisfied: graphviz in /usr/local/lib/python3.10/dist-packages (from catboost) (0.20.3)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from catboost) (3.9.2)
Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from catboost) (1.24.1)
Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from catboost) (2.2.3)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from catboost) (1.14.1)
Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from catboost) (5.24.1)
Requirement already satisfied: six in /usr/lib/python3/dist-packages (from catboost) (1.16.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (23.2)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (9.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/lib/python3/dist-packages (from matplotlib->catboost) (2.4.7)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->catboost) (9.0.0)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.5.2)
Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.24.1)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.14.1)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)
In [2]:
import numpy as np
import pandas as pd
import joblib
import gc
from itertools import combinations, product
import catboost as cat
from sklearn.base import BaseEstimator, TransformerMixin

Load Data¶

In [3]:
# train_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\train.csv"
# test_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\test.csv"
# submission_sample_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\sample_submission.csv"
# submission_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"

train_file_path = r"train.csv"
test_file_path = r"test.csv"
submission_sample_file_path = r"sample_submission.csv"
submission_file_path = r"submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"
In [4]:
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
y = train['target']
train.drop('target', axis=1, inplace=True)
train.drop('ID', axis=1, inplace=True)
train.drop('v74', axis=1, inplace=True)  # v74 and v3 carry no useful signal, drop them
train.drop('v3', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)  # ID is just a row identifier
test.drop('v74', axis=1, inplace=True)
test.drop('v3', axis=1, inplace=True)

# train = train[:50]
# test = test[:50]
# y = y[:50]

Convert Low Cardinality Numerical to Categorical¶

In [5]:
# convert low-cardinality numerical features v62, v129, v38, v72 to categorical
num_2_cat_list = ['v62', 'v129', 'v38', 'v72']
for df in (train, test):
    for c in num_2_cat_list:
        df[c] = df[c].astype('string').astype('O')
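
It's worth confirming these four columns really are low-cardinality before casting them. A minimal sanity check (the cutoff of 30 distinct values is an assumption for illustration, not from the original analysis):

In [ ]:
# Hedged sanity check: each num->cat candidate should have few distinct values.
# The cutoff of 30 is an assumed threshold, chosen only for illustration.
for c in num_2_cat_list:
    n = train[c].nunique(dropna=True)
    print(f"{c}: {n} distinct values")
    assert n < 30, f"{c} looks too high-cardinality to treat as categorical"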

Calculate Correlation with the Target, Remove Useless Features¶

In [7]:
def calc_rel(df: pd.DataFrame, target_col: pd.Series, list1: list) -> list:
    """Return (column, |correlation|) pairs sorted by descending strength.

    list1: the columns of df to correlate with target_col; object columns
    are label-encoded via category codes first."""
    correlations = dict()
    for col in list1:
        if df[col].dtype == 'O':
            corr = target_col.corr(df[col].astype('category').cat.codes)
        else:
            corr = target_col.corr(df[col])
        if not np.isnan(corr):
            correlations[col] = abs(corr)
    correlations = sorted(correlations.items(), key=lambda item: item[1], reverse=True)
    return correlations
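
To illustrate what calc_rel returns, here is a toy call on made-up data (the tiny frame below is hypothetical, not from the competition):

In [ ]:
# Toy demo: calc_rel returns (column, |corr|) pairs, strongest first.
_demo = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['x', 'x', 'y', 'y']})
_demo_y = pd.Series([0, 0, 1, 1])
print(calc_rel(_demo, _demo_y, ['a', 'b']))
# -> [('b', 1.0), ('a', 0.894...)]: 'b' aligns perfectly with the target here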
In [8]:
# Pearson correlation needs numeric input, so handle the numeric columns first
num_cols = train.select_dtypes(include=['number']).columns
num_corr = calc_rel(train, y, num_cols)
# keep the 30 strongest numeric features (roughly an |corr| >= 0.03 cutoff)
num_corr = pd.DataFrame(num_corr[:30], columns=['feature', 'importance'])
In [9]:
num_cols = num_corr['feature'].to_list()
In [10]:
# round numeric features to 2 decimals
for df in (train, test):
    for col in num_cols:
        df[col] = np.round(df[col], 2)
In [11]:
cat_cols = train.select_dtypes(exclude=['number']).columns
cat_corr = pd.DataFrame(calc_rel(train, y, cat_cols), columns=['feature', 'correlation'])
cat_cols = cat_corr['feature'].to_list()
In [12]:
# combine the categorical and numeric column lists
sorted_num_list = list(num_cols)
sorted_cat_list = list(cat_cols)
keep_cols = sorted_num_list + sorted_cat_list
In [13]:
# keep only the retained features in train and test
train = train[keep_cols]
test = test[keep_cols]
In [14]:
# fill missing categorical values with the sentinel 'NA'
for col in sorted_cat_list:
    train.fillna({col: 'NA'}, inplace=True)
    test.fillna({col: 'NA'}, inplace=True)
In [15]:
# fill missing numeric values with the sentinel -999
for col in sorted_num_list:
    train[col] = train[col].fillna(-999)
    test[col] = test[col].fillna(-999)

assert (train.isnull().sum().sum() == 0)
assert (test.isnull().sum().sum() == 0)
In [16]:
# keep the features that account for 99% of the total correlation mass
def get_n_per_corr(org_df: pd.DataFrame, target_per: float = 0.99) -> pd.DataFrame:
    """Return the top rows whose normalized correlations cumulatively reach target_per."""
    df = org_df.copy()
    df = df.sort_values(by='correlation', ascending=False, inplace=False)
    #normalization
    df['correlation'] = df['correlation'] / np.sum(df['correlation'])
    cum_relation = 0
    for i in range(len(df)):
        cum_relation += df.iloc[i]['correlation']
        if cum_relation >= target_per:
            break
    new_df = df.iloc[0:i + 1]
    print(i, cum_relation, new_df.shape)
    return new_df
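
The same cutoff can be computed without the explicit loop. A sketch of an equivalent vectorized version (assuming, as above, strictly positive correlation values):

In [ ]:
# Vectorized equivalent of get_n_per_corr: cumulative sum + searchsorted
# finds the first row where the normalized correlation mass reaches target_per.
def get_n_per_corr_vec(org_df: pd.DataFrame, target_per: float = 0.99) -> pd.DataFrame:
    df = org_df.sort_values(by='correlation', ascending=False).copy()
    norm = df['correlation'] / df['correlation'].sum()
    cut = int(np.searchsorted(norm.cumsum().to_numpy(), target_per)) + 1
    return df.iloc[:cut]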
In [17]:
# 2-way combinations of categorical features via string concatenation
c2 = list(combinations(sorted_cat_list, 2))
names_cs = [c[1] + c[0] for c in c2]
train_c2 = pd.concat([train[c[1]] + train[c[0]] for c in c2], axis=1, keys=names_cs)
test_c2 = pd.concat([test[c[1]] + test[c[0]] for c in c2], axis=1, keys=names_cs)
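
One caveat with bare string concatenation: different pairs can collide ('AB' + 'C' and 'A' + 'BC' both give 'ABC'). A variant that joins with a separator, assuming '|' never appears in the category values, would keep the combined levels unambiguous:

In [ ]:
# Collision-safe variant (assumption: '|' does not occur in any category value).
# Not used below; shown only to illustrate the safer naming scheme.
# train_c2_safe = pd.concat([train[c[1]] + '|' + train[c[0]] for c in c2],
#                           axis=1, keys=names_cs)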
In [18]:
# v22 + 2-way combinations of the remaining categorical features
catcol_without_v22 = [i for i in sorted_cat_list if i != 'v22']

# first build the 2-way combinations without v22, then pair each with v22
cc2_no_v22 = list(combinations(catcol_without_v22, 2))
v22_cc2 = list(product(['v22'], cc2_no_v22))
column_names = [c[0] + '_' + c[1][0] + '_' + c[1][1] for c in v22_cc2]

train_v22c2 = pd.concat([train[c[0]] + train[c[1][0]] + train[c[1][1]] for c in v22_cc2], axis=1, keys=column_names)
train_v22c2.head()

test_v22c2 = pd.concat([test[c[0]] + test[c[1][0]] + test[c[1][1]] for c in v22_cc2], axis=1, keys=column_names)
test_v22c2.head()
Out[18]:
v22_v110_v47 v22_v110_v31 v22_v110_v129 v22_v110_v113 v22_v110_v62 v22_v110_v72 v22_v110_v38 v22_v110_v66 v22_v110_v79 v22_v110_v56 ... v22_v91_v75 v22_v91_v71 v22_v91_v125 v22_v91_v52 v22_v75_v71 v22_v75_v125 v22_v75_v52 v22_v71_v125 v22_v71_v52 v22_v125_v52
0 AFPBAJ AFPBAA AFPBA0 AFPBANA AFPBA1 AFPBA1 AFPBA0 AFPBAA AFPBAQ AFPBAAF ... AFPBBD AFPBBF AFPBBAF AFPBBC AFPBDF AFPBDAF AFPBDC AFPBFAF AFPBFC AFPBAFC
1 FOGBC FOGBA FOGB0 FOGBNA FOGB1 FOGB5 FOGB4 FOGBC FOGBE FOGBDI ... FOGCD FOGCF FOGCI FOGCA FOGDF FOGDI FOGDA FOGFI FOGFA FOGIA
2 HXZAI HXZAA HXZA0 HXZAAJ HXZA1 HXZA1 HXZA0 HXZAC HXZAC HXZADO ... HXZCD HXZCF HXZCAV HXZCA HXZDF HXZDAV HXZDA HXZFAV HXZFA HXZAVA
3 AGMUAI AGMUAA AGMUA0 AGMUAG AGMUA1 AGMUA1 AGMUA0 AGMUAB AGMUAC AGMUACN ... AGMUBD AGMUBF AGMUBB AGMUBA AGMUDF AGMUDB AGMUDA AGMUFB AGMUFA AGMUBA
4 AWWAI AWWAA AWWA0 AWWAI AWWA2 AWWA2 AWWA0 AWWAB AWWAC AWWANA ... AWWBD AWWBF AWWBA AWWBH AWWDF AWWDA AWWDH AWWFA AWWFH AWWAH

5 rows × 190 columns

In [19]:
train = pd.concat([train, train_c2, train_v22c2], axis=1)
test = pd.concat([test, test_c2, test_v22c2], axis=1)
In [20]:
del train_c2, train_v22c2, test_c2, test_v22c2
gc.collect()
Out[20]:
0

V22+C3¶

In [21]:
cc3_no_v22 = list(combinations(catcol_without_v22, 3))
v22_cc3 = list(product(['v22'], cc3_no_v22))
len(v22_cc3)
Out[21]:
1140
In [22]:
column_names = [col[0] + '_' + col[1][0] + '_' + col[1][1] + '_' + col[1][2] for col in v22_cc3]
train_cc3 = pd.concat([train[col[0]] + train[col[1][0]] + train[col[1][1]] + train[col[1][2]] for col in v22_cc3],
                      axis=1, keys=column_names)
train_cc3.shape
Out[22]:
(114321, 1140)
In [23]:
corr_list = calc_rel(train_cc3, y, train_cc3.columns.to_list())
corr_df = pd.DataFrame(corr_list, columns=['feature', 'correlation'])
In [24]:
p99_corr_train_cc3 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df
1123 0.9905946819599104 (1124, 2)
In [25]:
train_cc3 = train_cc3[p99_corr_train_cc3['feature'].to_list()]
train = pd.concat([train, train_cc3], axis=1)
del train_cc3
gc.collect()
Out[25]:
0
In [26]:
test_cc3 = pd.concat([test[col[0]] + test[col[1][0]] + test[col[1][1]] + test[col[1][2]] for col in v22_cc3],
                      axis=1, keys=column_names)
test_cc3 = test_cc3[p99_corr_train_cc3['feature'].to_list()]
test = pd.concat([test, test_cc3], axis=1)
del test_cc3
gc.collect()
Out[26]:
0
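
The build-then-filter pattern above repeats for the numeric combinations below. A sketch of a reusable helper capturing it (the function name and flat-tuple interface are assumptions, not from the original):

In [ ]:
# Hypothetical helper: build '+'-combined columns on several frames, keep the
# subset covering target_per of total correlation (measured on the first frame).
def build_and_filter(frames, col_tuples, names, target, target_per=0.99):
    def combine(f, cols):
        s = f[cols[0]]
        for c in cols[1:]:
            s = s + f[c]
        return s
    built = [pd.concat([combine(f, t) for t in col_tuples], axis=1, keys=names)
             for f in frames]
    corr_df = pd.DataFrame(calc_rel(built[0], target, names),
                           columns=['feature', 'correlation'])
    keep = get_n_per_corr(corr_df, target_per)['feature'].to_list()
    return [b[keep] for b in built]
# e.g. train_cc3, test_cc3 = build_and_filter(
#          [train, test], [('v22',) + t for t in cc3_no_v22], column_names, y)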

Combinations of Numerical Features¶

In [27]:
# 2-way combinations of numeric features (element-wise sums)
n2 = list(combinations(sorted_num_list, 2))
names_cs = [c[1] + c[0] for c in n2]
train_n2 = pd.concat([train[c[1]] + train[c[0]] for c in n2], axis=1, keys=names_cs)
test_n2 = pd.concat([test[c[1]] + test[c[0]] for c in n2], axis=1, keys=names_cs)
In [28]:
# v50 + 2-way combinations of the remaining numeric features
numcol_without_v50 = [i for i in sorted_num_list if i != 'v50']

# first build the 2-way combinations without v50, then pair each with v50
nc2_no_v50 = list(combinations(numcol_without_v50, 2))
v50_nc2 = list(product(['v50'], nc2_no_v50))
column_names = [c[0] + '_' + c[1][0] + '_' + c[1][1] for c in v50_nc2]

train_v50n2 = pd.concat([train[c[0]] + train[c[1][0]] + train[c[1][1]] for c in v50_nc2], axis=1, keys=column_names)
train_v50n2.head()

test_v50n2 = pd.concat([test[c[0]] + test[c[1][0]] + test[c[1][1]] for c in v50_nc2], axis=1, keys=column_names)
test_v50n2.head()
Out[28]:
v50_v10_v14 v50_v10_v34 v50_v10_v114 v50_v10_v21 v50_v10_v4 v50_v10_v119 v50_v10_v123 v50_v10_v48 v50_v10_v12 v50_v10_v106 ... v50_v121_v88 v50_v121_v36 v50_v121_v111 v50_v121_v65 v50_v88_v36 v50_v88_v111 v50_v88_v65 v50_v36_v111 v50_v36_v65 v50_v111_v65
0 13.66 6.66 13.68 8.87 5.63 1.43 3.25 14.20 8.39 13.12 ... 5.17 15.57 6.96 19.00 14.08 5.47 17.51 15.87 27.91 19.30
1 13.08 10.71 18.60 8.22 -996.68 -996.68 -996.68 -996.68 8.94 -996.68 ... -1996.97 -1996.97 -1996.97 -1996.97 -1996.97 -1996.97 -1996.97 -1996.97 -1996.97 -1996.97
2 15.73 10.38 18.04 10.67 6.96 2.42 4.39 15.19 9.31 14.17 ... 9.05 15.54 6.46 19.12 18.75 9.67 22.33 16.16 28.82 19.74
3 12.93 8.75 16.23 8.07 4.19 20.77 10.69 19.32 9.39 8.49 ... 5.53 18.84 9.03 17.47 15.12 5.31 13.75 18.62 27.06 17.25
4 13.89 8.02 15.94 9.28 7.18 3.20 3.87 13.12 8.81 17.21 ... 5.40 13.98 5.00 20.66 14.53 5.55 21.21 14.13 29.79 20.81

5 rows × 406 columns

In [29]:
corr_list = calc_rel(train_v50n2, y, train_v50n2.columns.to_list())
corr_df = pd.DataFrame(corr_list, columns=['feature', 'correlation'])
In [30]:
p99_corr_v50n2 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df
393 0.9900523716723956 (394, 2)
In [31]:
train_v50n2 = train_v50n2[p99_corr_v50n2['feature'].to_list()]
train = pd.concat([train, train_n2, train_v50n2], axis=1)
del train_n2, train_v50n2
gc.collect()
Out[31]:
0

In [32]:
test_v50n2 = test_v50n2[p99_corr_v50n2['feature'].to_list()]
test = pd.concat([test, test_n2, test_v50n2], axis=1)
# test.shape
In [33]:
del test_n2, test_v50n2
gc.collect()
Out[33]:
0
In [34]:
# train and test must expose exactly the same engineered columns
assert train.shape[1] == test.shape[1]
assert all(c in train.columns for c in test.columns)
In [35]:
cat_col = train.select_dtypes(exclude=['number']).columns.values
num_col = train.select_dtypes(include=['number']).columns.values
cols = train.columns.to_list()

Optional: Drop Features Using a Saved Importance List¶

In [36]:
# featimp_df = pd.read_csv("featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv")
# featimp_df = featimp_df.sort_values(by=['importance'],ascending=False)
# important_features = featimp_df['feature'].to_list()
# cols = [i for i in train.columns if i in important_features]
# cols_cat = [i for i in cat_col if i in important_features]
# cat_col = [i for i in cat_col if i in important_features]
In [37]:
# train = train[cols] #not necessary
# test = test[cols] #not necessary
# train_data = cat.Pool(train, label=y,cat_features=cat_col)
# test_data = cat.Pool(test,cat_features=cat_col)
In [38]:
params = {
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "learning_rate": 0.02,
    "iterations": 2800,
    # "l2_leaf_reg": 3,
    # "random_seed": 432013,
    # "subsample": 0.66,
    # "od_type": "Iter",
    # "rsm": 0.2,
    "depth": 6,
    # "border_count": 128,
    "task_type": "CPU",
    "cat_features":cat_col,
    "verbose":100
}
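
The commented-out od_type hints at overfitting detection; an alternative to the fixed 2800 iterations is early stopping on a held-out split. The 80/20 split and 100-round patience below are assumptions, and the cell is left commented since a full fit takes hours:

In [ ]:
# Optional sketch: early stopping on a validation split instead of a fixed
# iteration count (split fraction and patience are assumed values).
# from sklearn.model_selection import train_test_split
# X_tr, X_val, y_tr, y_val = train_test_split(train, y, test_size=0.2,
#                                             random_state=0, stratify=y)
# es_model = cat.CatBoostClassifier(**params)
# es_model.fit(X_tr, y_tr, eval_set=(X_val, y_val),
#              early_stopping_rounds=100, verbose=100)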
In [39]:
model = cat.CatBoostClassifier(**params)
fit_model = model.fit(train, y, verbose=100)
0:	learn: 0.6839533	total: 10.7s	remaining: 8h 16m 56s
100:	learn: 0.4735283	total: 8m 54s	remaining: 3h 58m 9s
200:	learn: 0.4579872	total: 18m 2s	remaining: 3h 53m 20s
300:	learn: 0.4524678	total: 27m 8s	remaining: 3h 45m 18s
400:	learn: 0.4491700	total: 36m 2s	remaining: 3h 35m 37s
500:	learn: 0.4469595	total: 45m 2s	remaining: 3h 26m 39s
600:	learn: 0.4448003	total: 54m 17s	remaining: 3h 18m 40s
700:	learn: 0.4429411	total: 1h 3m 23s	remaining: 3h 9m 48s
800:	learn: 0.4415269	total: 1h 12m 38s	remaining: 3h 1m 16s
900:	learn: 0.4401731	total: 1h 21m 43s	remaining: 2h 52m 15s
1000:	learn: 0.4389522	total: 1h 30m 50s	remaining: 2h 43m 16s
1100:	learn: 0.4378323	total: 1h 39m 52s	remaining: 2h 34m 7s
1200:	learn: 0.4367671	total: 1h 49m 3s	remaining: 2h 25m 12s
1300:	learn: 0.4356167	total: 1h 57m 57s	remaining: 2h 15m 54s
1400:	learn: 0.4346000	total: 2h 7m	remaining: 2h 6m 49s
1500:	learn: 0.4335259	total: 2h 15m 53s	remaining: 1h 57m 35s
1600:	learn: 0.4325657	total: 2h 24m 59s	remaining: 1h 48m 35s
1700:	learn: 0.4316748	total: 2h 33m 58s	remaining: 1h 39m 29s
1800:	learn: 0.4307473	total: 2h 43m	remaining: 1h 30m 24s
1900:	learn: 0.4298424	total: 2h 52m 4s	remaining: 1h 21m 22s
2000:	learn: 0.4290419	total: 3h 1m 32s	remaining: 1h 12m 29s
2100:	learn: 0.4282659	total: 3h 10m 46s	remaining: 1h 3m 28s
2200:	learn: 0.4274384	total: 3h 19m 45s	remaining: 54m 21s
2300:	learn: 0.4266024	total: 3h 28m 59s	remaining: 45m 19s
2400:	learn: 0.4257348	total: 3h 38m 3s	remaining: 36m 14s
2500:	learn: 0.4248639	total: 3h 47m 6s	remaining: 27m 9s
2600:	learn: 0.4240666	total: 3h 56m 21s	remaining: 18m 5s
2700:	learn: 0.4233074	total: 4h 5m 38s	remaining: 9m
2799:	learn: 0.4224872	total: 4h 14m 48s	remaining: 0us
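
A commented cell further down loads a .cbm file; a one-line sketch of the native CatBoost save that would produce one (the filename is illustrative):

In [ ]:
# Native CatBoost save; produces the .cbm file the load_model cell below expects.
# fit_model.save_model("model.cbm")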
In [ ]:
ite_round = 0  # global counter for how many times transform() runs

class CustomDataPreprocessor(BaseEstimator, TransformerMixin):
    """Pass-through step that counts transform() calls, which makes it easy
    to see how often GridSearchCV re-runs the preprocessing stage."""

    def __init__(self, **kwargs):
        pass

    def fit(self, X, y=None, **kwargs):
        # nothing to fit for a pass-through step
        return self

    def transform(self, X, y=None, **kwargs):
        global ite_round
        ite_round += 1
        print(f"transform() call #{ite_round}")
        return X
In [ ]:
from sklearn.pipeline import Pipeline, make_pipeline

param_grid_1 = {
    # each arange below currently yields a single value, so this grid is a
    # one-point placeholder; widen the ranges for a real search
    "catboostclassifier__iterations": np.arange(2800, 2900, 100),
    # "catboostclassifier__learning_rate": np.arange(0.01, 0.05, 0.01),
    "catboostclassifier__depth": np.arange(6, 7, 1)
    }

def pipeforcat():
    return make_pipeline(CustomDataPreprocessor(),cat.CatBoostClassifier(**params))
In [ ]:
piped = pipeforcat()
piped
In [ ]:
piped.get_params().keys()
In [ ]:
# piped.fit(train,y)
In [ ]:
from sklearn.model_selection import GridSearchCV


# grid = GridSearchCV(cat.CatBoostClassifier(**params), param_grid_1, cv=2)
grid = GridSearchCV(piped, param_grid_1, cv=3, error_score='raise', verbose=3)
In [ ]:
grid_model = grid.fit(train,y)
In [ ]:
grid.best_params_
In [ ]:
grid.cv_results_
In [ ]:
# y_pred = grid.best_estimator_[1].predict_proba(test_data)
In [42]:
y_pred = fit_model.predict_proba(test)
In [43]:
y_pred.shape
Out[43]:
(114393, 2)
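
predict_proba orders its columns by model.classes_, so before writing the submission it's worth confirming that column 1 really is the positive class:

In [ ]:
# Sanity check: column 1 of y_pred must correspond to class 1.
print(fit_model.classes_)  # expected: [0 1]
assert list(fit_model.classes_) == [0, 1]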
In [44]:
submission = pd.read_csv(submission_sample_file_path)
submission['PredictedProb'] = y_pred[:, 1]
submission.to_csv(submission_file_path, index=False)
In [ ]:
fit_model.get_best_iteration()
In [ ]:
pd.DataFrame(grid.cv_results_).to_csv("cv_results_.csv",index=True)
In [ ]:
# Save the grid search model
joblib.dump(grid, 'grid_search_model.pkl')
# Load the model
# loaded_model = joblib.load('grid_search_model.pkl')
In [ ]:
# save the feature importances of the grid search's best estimator
fit_model = grid_model.best_estimator_[1]
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_.csv', index=True)
In [ ]:
# Load the model back
# fit_model = cat.CatBoostClassifier()
# fit_model.load_model("model_20241012_1720_8gpu_num2cat_num999_longparam.cbm")
# best_estimator = fit_model.best_estimator_[1]
# features = best_estimator.feature_names_
# feature_importances = best_estimator.get_feature_importance()
# featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
# featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_2100_6.csv', index=True)
# important_features = featimp_df['feature'].to_list()
In [46]:
joblib.dump(fit_model, 'fit_model.pkl')
Out[46]:
['fit_model.pkl']
In [50]:
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
featimp_df.to_csv('featimp.csv', index=True)