In [1]:
!python -m pip install --upgrade pip
!pip install numpy
!pip install pandas
!pip install catboost
!pip install scikit-learn
# itertools is part of the Python standard library, so it is not pip-installable
In [2]:
import numpy as np
import pandas as pd
import joblib
import os
import operator
from itertools import combinations
from itertools import product
import gc
import catboost as cat
import multiprocessing
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
Load Data¶
In [3]:
# train_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\train.csv"
# test_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\test.csv"
# submission_sample_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\sample_submission.csv"
# submission_file_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\data\submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"
train_file_path = r"train.csv"
test_file_path = r"test.csv"
submission_sample_file_path = r"sample_submission.csv"
submission_file_path = r"submission.csv"
# good_feature_path = r"D:\MyDrive\pythonprojects\class\ML\BNP\assignment\featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv"
In [4]:
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
y = train['target']
train.drop('target', axis=1, inplace=True)
train.drop('ID', axis=1, inplace=True)
train.drop('v74', axis=1, inplace=True)  # v74 and v3 are effectively univariate (near-constant), so drop them
train.drop('v3', axis=1, inplace=True)
test.drop('ID', axis=1, inplace=True)    # ID is just a row identifier, not a feature
test.drop('v74', axis=1, inplace=True)
test.drop('v3', axis=1, inplace=True)
# Uncomment to work on a small subsample for quick debugging:
# train = train[:50]
# test = test[:50]
# y = y[:50]
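Optionally, one can double-check why v74 and v3 were judged uninformative before dropping them. A minimal sketch (commented out since it re-reads the raw file) that inspects their value distributions:

In [ ]:
# Optional sanity check for the dropped columns:
# raw = pd.read_csv(train_file_path)
# print(raw['v74'].value_counts(normalize=True, dropna=False).head())
# print(raw['v3'].value_counts(normalize=True, dropna=False).head())
# del raw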
Convert Low Cardinality Numerical to Categorical¶
In [5]:
# modify the low-cardinality numerical features v62, v129, v38, v72 into categorical features
num_2_cat_list = ['v62', 'v129', 'v38', 'v72']
for c in num_2_cat_list:
    train[c] = train[c].astype('string').astype('O')
    test[c] = test[c].astype('string').astype('O')
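As a quick optional check, the converted columns should now report dtype object, so both the correlation helper below and CatBoost treat them as categorical:

In [ ]:
train[num_2_cat_list].dtypes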
Calculate Each Feature's Correlation with the Target and Remove Weak Features¶
In [7]:
def calc_rel(df: pd.DataFrame, target_col: pd.Series, list1: list) -> list:
    """Return [(column, |correlation|), ...] sorted by absolute correlation, descending.
    @list1: columns of df to correlate with target_col; object columns are
            encoded as category codes before computing the correlation."""
    correlations = dict()
    for col in list1:
        if df[col].dtype == 'O':
            corr = target_col.corr(df[col].astype('category').cat.codes)
        else:
            corr = target_col.corr(df[col])
        if not np.isnan(corr):
            correlations[col] = abs(corr)
    # sorted() over the dict items returns a list of (column, value) tuples
    correlations = sorted(correlations.items(), key=lambda item: item[1], reverse=True)
    return correlations
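A minimal sketch on hypothetical toy data of what calc_rel returns: a list of (column, |correlation|) pairs in descending order, with object columns encoded as category codes first.

In [ ]:
toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['x', 'x', 'y', 'y']})
toy_y = pd.Series([0, 0, 1, 1])
calc_rel(toy, toy_y, ['a', 'b'])
# expected roughly: [('b', 1.0), ('a', 0.89)] -- 'b' becomes codes [0, 0, 1, 1]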
In [8]:
# Pearson correlation applies directly only to numerical columns
num_cols = train.select_dtypes(include=['number']).columns
num_corr = calc_rel(train, y, num_cols)
# keep the top 30 numerical features (roughly an absolute-correlation cutoff of 0.03)
num_corr = pd.DataFrame(num_corr[:30], columns=['feature', 'importance'])
In [9]:
num_cols = num_corr['feature'].to_list()
In [10]:
# round the selected numerical features to 2 decimals
for col in num_cols:
    train[col] = np.round(train[col], 2)
    test[col] = np.round(test[col], 2)
In [11]:
cat_cols = train.select_dtypes(exclude=['number']).columns
cat_corr = pd.DataFrame(calc_rel(train, y, cat_cols), columns=['feature', 'correlation'])
cat_cols = cat_corr['feature'].to_list()
In [12]:
# combine the selected categorical and numerical columns
sorted_num_list = list(num_cols)
sorted_cat_list = list(cat_cols)
keep_cols = sorted_num_list + sorted_cat_list
In [13]:
# keep only the selected (informative) columns in train and test
train = train[keep_cols]
test = test[keep_cols]
In [14]:
# fill NaN in categorical features with the string 'NA'
for col in sorted_cat_list:
    train.fillna({col: 'NA'}, inplace=True)
    test.fillna({col: 'NA'}, inplace=True)
In [15]:
# fill NaN in numerical features with the sentinel value -999
for col in sorted_num_list:
    # mean = train[col].mean(skipna=True)
    train[col] = train[col].fillna(-999)
    test[col] = test[col].fillna(-999)
assert (train.isnull().sum().sum() == 0)
assert (test.isnull().sum().sum() == 0)
In [16]:
# keep the features that account for target_per (e.g. 99%) of the total correlation
def get_n_per_corr(org_df: pd.DataFrame, target_per: float = 0.99) -> pd.DataFrame:
    """Return the top rows whose normalized correlations sum to at least target_per.
    org_df needs 'feature' and 'correlation' columns; it is re-sorted here, so the
    input order does not matter."""
    df = org_df.copy()
    df = df.sort_values(by='correlation', ascending=False, inplace=False)
    # normalize so the correlations sum to 1
    df['correlation'] = df['correlation'] / np.sum(df['correlation'])
    cum_relation = 0
    for i in range(len(df)):
        cum_relation += df.iloc[i]['correlation']
        if cum_relation >= target_per:
            break
    new_df = df.iloc[0:i + 1]
    print(i, cum_relation, new_df.shape)
    return new_df
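A small hypothetical example of how get_n_per_corr trims a correlation table: with normalized weights 0.6 / 0.3 / 0.1 and target_per=0.85, only the first two rows are kept.

In [ ]:
toy_corr = pd.DataFrame({'feature': ['f1', 'f2', 'f3'], 'correlation': [0.6, 0.3, 0.1]})
get_n_per_corr(toy_corr, target_per=0.85)  # prints approximately "1 0.9 (2, 2)" and returns f1, f2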
In [17]:
# 2-way combinations of categorical features: each new column is the string concatenation of two parent columns
c2 = list(combinations(sorted_cat_list, 2))
names_cs = [c[1] + c[0] for c in c2]
train_c2 = pd.concat([train[c[1]] + train[c[0]] for c in c2], axis=1, keys=names_cs)
test_c2 = pd.concat([test[c[1]] + test[c[0]] for c in c2], axis=1, keys=names_cs)
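Optionally, a quick peek shows how each generated name maps back to its two parent columns (the parent order is reversed in the name):

In [ ]:
print(c2[0], '->', names_cs[0])
train_c2[names_cs[0]].head(3)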
In [18]:
# v22 combined with every 2-way combination of the other categorical features
catcol_without_v22 = [i for i in sorted_cat_list if i != 'v22']
# first build the 2-way combinations without v22, then prepend v22 to each pair
cc2_no_v22 = list(combinations(catcol_without_v22, 2))
v22_cc2 = list(product(['v22'], cc2_no_v22))
column_names = [c[0] + '_' + c[1][0] + '_' + c[1][1] for c in v22_cc2]
train_v22c2 = pd.concat([train[c[0]] + train[c[1][0]] + train[c[1][1]] for c in v22_cc2], axis=1, keys=column_names)
train_v22c2.head()
test_v22c2 = pd.concat([test[c[0]] + test[c[1][0]] + test[c[1][1]] for c in v22_cc2], axis=1, keys=column_names)
test_v22c2.head()
Out[18]:
| | v22_v110_v47 | v22_v110_v31 | v22_v110_v129 | v22_v110_v113 | v22_v110_v62 | v22_v110_v72 | v22_v110_v38 | v22_v110_v66 | v22_v110_v79 | v22_v110_v56 | ... | v22_v91_v75 | v22_v91_v71 | v22_v91_v125 | v22_v91_v52 | v22_v75_v71 | v22_v75_v125 | v22_v75_v52 | v22_v71_v125 | v22_v71_v52 | v22_v125_v52 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFPBAJ | AFPBAA | AFPBA0 | AFPBANA | AFPBA1 | AFPBA1 | AFPBA0 | AFPBAA | AFPBAQ | AFPBAAF | ... | AFPBBD | AFPBBF | AFPBBAF | AFPBBC | AFPBDF | AFPBDAF | AFPBDC | AFPBFAF | AFPBFC | AFPBAFC |
| 1 | FOGBC | FOGBA | FOGB0 | FOGBNA | FOGB1 | FOGB5 | FOGB4 | FOGBC | FOGBE | FOGBDI | ... | FOGCD | FOGCF | FOGCI | FOGCA | FOGDF | FOGDI | FOGDA | FOGFI | FOGFA | FOGIA |
| 2 | HXZAI | HXZAA | HXZA0 | HXZAAJ | HXZA1 | HXZA1 | HXZA0 | HXZAC | HXZAC | HXZADO | ... | HXZCD | HXZCF | HXZCAV | HXZCA | HXZDF | HXZDAV | HXZDA | HXZFAV | HXZFA | HXZAVA |
| 3 | AGMUAI | AGMUAA | AGMUA0 | AGMUAG | AGMUA1 | AGMUA1 | AGMUA0 | AGMUAB | AGMUAC | AGMUACN | ... | AGMUBD | AGMUBF | AGMUBB | AGMUBA | AGMUDF | AGMUDB | AGMUDA | AGMUFB | AGMUFA | AGMUBA |
| 4 | AWWAI | AWWAA | AWWA0 | AWWAI | AWWA2 | AWWA2 | AWWA0 | AWWAB | AWWAC | AWWANA | ... | AWWBD | AWWBF | AWWBA | AWWBH | AWWDF | AWWDA | AWWDH | AWWFA | AWWFH | AWWAH |
5 rows × 190 columns
In [19]:
train = pd.concat([train, train_c2, train_v22c2], axis=1)
test = pd.concat([test, test_c2, test_v22c2], axis=1)
In [20]:
del train_c2, train_v22c2, test_c2, test_v22c2
gc.collect()
Out[20]:
0
V22+C3¶
In [21]:
cc3_no_v22 = list(combinations(catcol_without_v22, 3))
v22_cc3 = list(product(['v22'], cc3_no_v22))
len(v22_cc3)
Out[21]:
1140
In [22]:
column_names = [col[0] + '_' + col[1][0] + '_' + col[1][1] + '_' + col[1][2] for col in v22_cc3]
train_cc3 = pd.concat([train[col[0]] + train[col[1][0]] + train[col[1][1]] + train[col[1][2]] for col in v22_cc3],
axis=1, keys=column_names)
train_cc3.shape
Out[22]:
(114321, 1140)
In [23]:
corr_list = calc_rel(train_cc3, y, train_cc3.columns.to_list())
corr_df = pd.DataFrame(corr_list, columns=['feature', 'correlation'])
In [24]:
p99_corr_train_cc3 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df
1123 0.9905946819599104 (1124, 2)
In [25]:
train_cc3 = train_cc3[p99_corr_train_cc3['feature'].to_list()]
train = pd.concat([train, train_cc3], axis=1)
del train_cc3
gc.collect()
Out[25]:
0
In [26]:
test_cc3 = pd.concat([test[col[0]] + test[col[1][0]] + test[col[1][1]] + test[col[1][2]] for col in v22_cc3],
axis=1, keys=column_names)
test_cc3 = test_cc3[p99_corr_train_cc3['feature'].to_list()]
test = pd.concat([test, test_cc3], axis=1)
del test_cc3
gc.collect()
Out[26]:
0
Combinations with Numerical Features¶
In [27]:
# 2-way combinations of numerical features: each new column is the element-wise sum of two parent columns
n2 = list(combinations(sorted_num_list, 2))
names_cs = [c[1] + c[0] for c in n2]
train_n2 = pd.concat([train[c[1]] + train[c[0]] for c in n2], axis=1, keys=names_cs)
test_n2 = pd.concat([test[c[1]] + test[c[0]] for c in n2], axis=1, keys=names_cs)
In [28]:
# v50 combined with every 2-way combination of the other numerical features
numcol_without_v50 = [i for i in sorted_num_list if i != 'v50']
# first build the 2-way combinations without v50, then prepend v50 to each pair
nc2_no_v50 = list(combinations(numcol_without_v50, 2))
v50_nc2 = list(product(['v50'], nc2_no_v50))
column_names = [c[0] + '_' + c[1][0] + '_' + c[1][1] for c in v50_nc2]
train_v50n2 = pd.concat([train[c[0]] + train[c[1][0]] + train[c[1][1]] for c in v50_nc2], axis=1, keys=column_names)
train_v50n2.head()
test_v50n2 = pd.concat([test[c[0]] + test[c[1][0]] + test[c[1][1]] for c in v50_nc2], axis=1, keys=column_names)
test_v50n2.head()
Out[28]:
| | v50_v10_v14 | v50_v10_v34 | v50_v10_v114 | v50_v10_v21 | v50_v10_v4 | v50_v10_v119 | v50_v10_v123 | v50_v10_v48 | v50_v10_v12 | v50_v10_v106 | ... | v50_v121_v88 | v50_v121_v36 | v50_v121_v111 | v50_v121_v65 | v50_v88_v36 | v50_v88_v111 | v50_v88_v65 | v50_v36_v111 | v50_v36_v65 | v50_v111_v65 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13.66 | 6.66 | 13.68 | 8.87 | 5.63 | 1.43 | 3.25 | 14.20 | 8.39 | 13.12 | ... | 5.17 | 15.57 | 6.96 | 19.00 | 14.08 | 5.47 | 17.51 | 15.87 | 27.91 | 19.30 |
| 1 | 13.08 | 10.71 | 18.60 | 8.22 | -996.68 | -996.68 | -996.68 | -996.68 | 8.94 | -996.68 | ... | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 | -1996.97 |
| 2 | 15.73 | 10.38 | 18.04 | 10.67 | 6.96 | 2.42 | 4.39 | 15.19 | 9.31 | 14.17 | ... | 9.05 | 15.54 | 6.46 | 19.12 | 18.75 | 9.67 | 22.33 | 16.16 | 28.82 | 19.74 |
| 3 | 12.93 | 8.75 | 16.23 | 8.07 | 4.19 | 20.77 | 10.69 | 19.32 | 9.39 | 8.49 | ... | 5.53 | 18.84 | 9.03 | 17.47 | 15.12 | 5.31 | 13.75 | 18.62 | 27.06 | 17.25 |
| 4 | 13.89 | 8.02 | 15.94 | 9.28 | 7.18 | 3.20 | 3.87 | 13.12 | 8.81 | 17.21 | ... | 5.40 | 13.98 | 5.00 | 20.66 | 14.53 | 5.55 | 21.21 | 14.13 | 29.79 | 20.81 |
5 rows × 406 columns
In [29]:
corr_list = calc_rel(train_v50n2, y, train_v50n2.columns.to_list())
corr_df = pd.DataFrame(corr_list, columns=['feature', 'correlation'])
In [30]:
p99_corr_v50n2 = get_n_per_corr(corr_df, target_per=0.99)
# p99_corr = corr_df
393 0.9900523716723956 (394, 2)
In [31]:
train_v50n2 = train_v50n2[p99_corr_v50n2['feature'].to_list()]
train = pd.concat([train, train_n2, train_v50n2], axis=1)
del train_n2, train_v50n2
gc.collect()
Out[31]:
0
v50+n2¶
In [32]:
test_v50n2 = test_v50n2[p99_corr_v50n2['feature'].to_list()]
test = pd.concat([test, test_n2, test_v50n2], axis=1)
# test.shape
In [33]:
del test_n2, test_v50n2
gc.collect()
Out[33]:
0
In [34]:
# sanity check: train and test must expose exactly the same engineered columns
assert train.shape[1] == test.shape[1]
for i in test.columns:
    if i not in train.columns:
        raise AssertionError(f"column {i} is in test but missing from train")
In [35]:
cat_col = train.select_dtypes(exclude=['number']).columns.values
num_col = train.select_dtypes(include=['number']).columns.values
cols = train.columns.to_list()
Optionally Remove Unimportant Features Using a Saved Importance List¶
In [36]:
# featimp_df = pd.read_csv("featimp_20241012_1720_8gpu_num2cat_num999_longparam.csv")
# featimp_df = featimp_df.sort_values(by=['importance'],ascending=False)
# important_features = featimp_df['feature'].to_list()
# cols = [i for i in train.columns if i in important_features]
# cols_cat = [i for i in cat_col if i in important_features]
# cat_col = [i for i in cat_col if i in important_features]
In [37]:
# train = train[cols] #not necessary
# test = test[cols] #not necessary
# train_data = cat.Pool(train, label=y,cat_features=cat_col)
# test_data = cat.Pool(test,cat_features=cat_col)
In [38]:
params = {
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "learning_rate": 0.02,
    "iterations": 2800,
    # "l2_leaf_reg": 3,
    # "random_seed": 432013,
    # "subsample": 0.66,
    # "od_type": "Iter",
    # "rsm": 0.2,
    "depth": 6,
    # "border_count": 128,
    "task_type": "CPU",
    "cat_features": cat_col,
    "verbose": 100
}
In [39]:
model = cat.CatBoostClassifier(**params)
fit_model = model.fit(train,y,verbose=100)
0: learn: 0.6839533 total: 10.7s remaining: 8h 16m 56s
100: learn: 0.4735283 total: 8m 54s remaining: 3h 58m 9s
200: learn: 0.4579872 total: 18m 2s remaining: 3h 53m 20s
300: learn: 0.4524678 total: 27m 8s remaining: 3h 45m 18s
400: learn: 0.4491700 total: 36m 2s remaining: 3h 35m 37s
500: learn: 0.4469595 total: 45m 2s remaining: 3h 26m 39s
600: learn: 0.4448003 total: 54m 17s remaining: 3h 18m 40s
700: learn: 0.4429411 total: 1h 3m 23s remaining: 3h 9m 48s
800: learn: 0.4415269 total: 1h 12m 38s remaining: 3h 1m 16s
900: learn: 0.4401731 total: 1h 21m 43s remaining: 2h 52m 15s
1000: learn: 0.4389522 total: 1h 30m 50s remaining: 2h 43m 16s
1100: learn: 0.4378323 total: 1h 39m 52s remaining: 2h 34m 7s
1200: learn: 0.4367671 total: 1h 49m 3s remaining: 2h 25m 12s
1300: learn: 0.4356167 total: 1h 57m 57s remaining: 2h 15m 54s
1400: learn: 0.4346000 total: 2h 7m remaining: 2h 6m 49s
1500: learn: 0.4335259 total: 2h 15m 53s remaining: 1h 57m 35s
1600: learn: 0.4325657 total: 2h 24m 59s remaining: 1h 48m 35s
1700: learn: 0.4316748 total: 2h 33m 58s remaining: 1h 39m 29s
1800: learn: 0.4307473 total: 2h 43m remaining: 1h 30m 24s
1900: learn: 0.4298424 total: 2h 52m 4s remaining: 1h 21m 22s
2000: learn: 0.4290419 total: 3h 1m 32s remaining: 1h 12m 29s
2100: learn: 0.4282659 total: 3h 10m 46s remaining: 1h 3m 28s
2200: learn: 0.4274384 total: 3h 19m 45s remaining: 54m 21s
2300: learn: 0.4266024 total: 3h 28m 59s remaining: 45m 19s
2400: learn: 0.4257348 total: 3h 38m 3s remaining: 36m 14s
2500: learn: 0.4248639 total: 3h 47m 6s remaining: 27m 9s
2600: learn: 0.4240666 total: 3h 56m 21s remaining: 18m 5s
2700: learn: 0.4233074 total: 4h 5m 38s remaining: 9m
2799: learn: 0.4224872 total: 4h 14m 48s remaining: 0us
In [ ]:
ite_round = 0

class CustomDataPreprocessor(BaseEstimator, TransformerMixin):
    """Pass-through transformer that only counts how many times transform() is called."""
    def __init__(self, **kwargs):
        return

    def fit(self, X, y=None, **kwargs):
        # nothing to fit
        return self

    def transform(self, X, y=None, **kwargs):
        # no transformation; just count and report the call
        global ite_round
        ite_round += 1
        print(f"This is the {ite_round=:}")
        return X
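A quick optional way to see the preprocessor in action (commented out because it would also bump the shared ite_round counter before the grid search below):

In [ ]:
# CustomDataPreprocessor is a pass-through: it returns X unchanged and only
# reports how many times transform() has been called so far.
# CustomDataPreprocessor().fit_transform(train.head())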
In [ ]:
from sklearn.pipeline import Pipeline, make_pipeline

param_grid_1 = {
    "catboostclassifier__iterations": np.arange(2800, 2900, 100),
    # "catboostclassifier__learning_rate": np.arange(0.01, 0.05, 0.01),
    "catboostclassifier__depth": np.arange(6, 7, 1)
}

def pipeforcat():
    return make_pipeline(CustomDataPreprocessor(), cat.CatBoostClassifier(**params))
In [ ]:
piped = pipeforcat()
piped
In [ ]:
piped.get_params().keys()
In [ ]:
# piped.fit(train,y)
In [ ]:
from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(cat.CatBoostClassifier(**params), param_grid_1, cv=2)
grid = GridSearchCV(piped, param_grid_1, cv=3, error_score='raise', verbose=3)
In [ ]:
grid_model = grid.fit(train,y)
In [ ]:
grid.best_params_
In [ ]:
grid.cv_results_
In [ ]:
# y_pred = grid.best_estimator_[1].predict_proba(test_data)
In [42]:
y_pred = fit_model.predict_proba(test)
In [43]:
y_pred.shape
Out[43]:
(114393, 2)
In [44]:
submission = pd.read_csv(submission_sample_file_path)
submission['PredictedProb'] = y_pred[:,1]
submission.to_csv('submission_20241013_1900_grid_.csv', index=False)
In [ ]:
fit_model.get_best_iteration()
In [ ]:
pd.DataFrame(grid.cv_results_).to_csv("cv_results_.csv",index=True)
In [ ]:
# Save the grid search model
joblib.dump(grid, 'grid_search_model.pkl')
# Load the model
# loaded_model = joblib.load('grid_search_model.pkl')
In [ ]:
#save feature importance of grid search
fit_model = grid_model.best_estimator_[1]
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_.csv', index=True)
In [ ]:
# Load the model back
# fit_model = cat.CatBoostClassifier()
# fit_model.load_model("model_20241012_1720_8gpu_num2cat_num999_longparam.cbm")
# best_estimator = fit_model.best_estimator_[1]
# features = best_estimator.feature_names_
# feature_importances = best_estimator.get_feature_importance()
# featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
# featimp_df.to_csv('featimp_20241012_2100_8gpu_runpod_grid_2100_6.csv', index=True)
# important_features = featimp_df['feature'].to_list()
In [46]:
joblib.dump(fit_model, 'fit_model.pkl')
Out[46]:
['fit_model.pkl']
In [50]:
features = fit_model.feature_names_
feature_importances = fit_model.get_feature_importance()
featimp_df = pd.DataFrame({'importance':feature_importances, 'feature':features}).sort_values(by=['importance'],ascending=False)
featimp_df.to_csv('featimp.csv', index=True)