Task3 Feature Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

data_train = pd.read_csv('../train.csv')
data_test_a = pd.read_csv('../testA.csv')
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)
data_train.isnull().sum()
# Fill numerical features with the TRAIN medians (also used for the test set, to avoid leakage)
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())
# Fill categorical features with the TRAIN modes; .mode() returns a DataFrame, so take its first row
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode().iloc[0])
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode().iloc[0])

data_train.isnull().sum()
category_fea
for data in [data_train, data_test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    # Days elapsed since the earliest issue date in the data
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x - startdate).dt.days

data_train['employmentLength'].value_counts(dropna=False).sort_index()
def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
for data in [data_train, data_test_a]:
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
    data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

data['employmentLength'].value_counts(dropna=False).sort_index()
data_train['earliesCreditLine'].sample(5)

for data in [data_train, data_test_a]:
    # Keep only the trailing 4-digit year
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, 'unique values:', data_train[f].nunique())

# grade is ordinal, so map it to integers instead of one-hot encoding it
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
# get_dummies returns a new DataFrame, so assigning it back to a loop variable would be lost.
# The encoded copies are kept under new names because subGrade and regionCode are still
# needed in raw form further below.
train_ohe = pd.get_dummies(data_train, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
test_ohe = pd.get_dummies(data_test_a, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
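One-hot encoding train and test separately can produce mismatched column sets when a category occurs in only one of them. A minimal sketch of one remedy, aligning the two frames on the union of their columns (the train_ohe/test_ohe names follow the snippet above and are illustrative):

# Categories missing from one frame become all-zero columns there
train_ohe, test_ohe = train_ohe.align(test_ohe, join='outer', axis=1, fill_value=0)
test_ohe = test_ohe.drop(columns=['isDefault'])  # the label only exists in train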
def find_outliers_by_3sigma(data, fea):
    # Flag values outside mean ± 3 * std as outliers
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: 'outlier' if x > upper_rule or x < lower_rule else 'normal')
    return data

data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3sigma(data_train, fea)
    print(data_train[fea + '_outliers'].value_counts())
    print(data_train.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 10)
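After inspection, the flagged rows are typically removed from the training set only (the test set must stay intact). A minimal sketch under that assumption:

# Keep only rows marked 'normal' for each numerical feature, then drop the helper columns
for fea in numerical_fea:
    data_train = data_train[data_train[fea + '_outliers'] == 'normal']
    data_train = data_train.drop(columns=[fea + '_outliers'])
data_train = data_train.reset_index(drop=True)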
for col in ['grade', 'subGrade']:
    temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(
        columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()

    data_train[col + '_target_mean'] = data_train[col].map(temp_dict)
    data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)
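Mapping a target mean computed on all training rows back onto those same rows leaks label information. A common remedy is out-of-fold target encoding: each row gets a mean computed from the other folds only. A minimal sketch using the KFold imported above (the fold count, seed, and '_oof' suffix are assumptions):

folds = KFold(n_splits=5, shuffle=True, random_state=2020)
for col in ['grade', 'subGrade']:
    data_train[col + '_target_mean_oof'] = np.nan
    for trn_idx, val_idx in folds.split(data_train):
        # Category means computed on the training folds only
        fold_means = data_train.iloc[trn_idx].groupby(col)['isDefault'].mean()
        data_train.loc[data_train.index[val_idx], col + '_target_mean_oof'] = \
            data_train[col].iloc[val_idx].map(fold_means).values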
for df in [data_train, data_test_a]:
    for item in ['n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
                 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    # Fit on the union of train and test values so transform never meets an unseen label
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label encoding done')
from sklearn.feature_selection import VarianceThreshold
# In the reference snippets below, train / target_train stand for a feature matrix and its label.
# Drop features whose variance falls below the threshold (y is accepted but ignored)
VarianceThreshold(threshold=3).fit_transform(train, target_train)

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

# Score each feature by the absolute Pearson correlation with the target
def pearson_score(X, y):
    return np.abs(np.array([pearsonr(X[:, i], y)[0] for i in range(X.shape[1])]))

SelectKBest(pearson_score, k=5).fit_transform(train, target_train)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

SelectKBest(chi2, k=5).fit_transform(train, target_train)

from sklearn.feature_selection import SelectKBest
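chi2 only accepts non-negative feature values, so features are usually rescaled first; a minimal sketch with the MinMaxScaler imported at the top (assuming train is purely numeric):

train_scaled = MinMaxScaler().fit_transform(train)  # map every feature into [0, 1]
SelectKBest(chi2, k=5).fit_transform(train_scaled, target_train)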
from minepy import MINE
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

# array(map(...)) was Python 2 style; build the per-feature score array explicitly instead
def mic_score(X, y):
    return np.array([mic(X[:, i], y)[0] for i in range(X.shape[1])])

SelectKBest(mic_score, k=2).fit_transform(train, target_train)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(train, target_train)
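To see which features RFE kept, fit the selector and read its mask and ranking, e.g.:

rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2).fit(train, target_train)
print(rfe.support_)   # boolean mask of the kept features
print(rfe.ranking_)   # 1 = kept; the larger the number, the earlier the feature was eliminated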
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# penalty="l1" needs a solver that supports it, e.g. liblinear
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train, target_train)

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
SelectFromModel(GradientBoostingClassifier()).fit_transform(train,target_train)
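Each of these selectors can also report which columns survived via get_support(); a minimal sketch (assuming train is a DataFrame with named columns):

selector = SelectFromModel(GradientBoostingClassifier()).fit(train, target_train)
print(train.columns[selector.get_support()])  # names of the retained features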