主要变量为分类变量的建模实例——入金分析



【主要变量为分类变量的建模实例——入金分析】




# ---------------------------------------------------------------------------
# Setup: imports, plotting font, data loading, target construction.
# ---------------------------------------------------------------------------
import datetime

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.font_manager import FontProperties

# scikit-learn components
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NOTE: `%matplotlib inline` is an IPython magic and is a syntax error in a
# plain .py file. Run it manually when executing this code inside a notebook.
# %matplotlib inline

# Use a font with Chinese glyphs so Chinese labels render in the plots.
# This must run after seaborn is imported (the original called sns.set()
# before the import, which raises NameError).
myfont = FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf', size=14)
sns.set(font=myfont.get_name())

# Treat the literal string 'NULL' as missing while reading the file.
data = pd.read_csv('anscontest4deli1.csv', na_values='NULL')

# Fill missing values of the categorical columns with the placeholder
# category '其他'. ANSCON895 is intentionally absent, as in the original list.
fill_cols = ['BRANCH_NO'] + [
    'ANSCON{}'.format(n)
    for n in list(range(886, 895)) + list(range(896, 902))
]
data.fillna({col: '其他' for col in fill_cols}, inplace=True)

# Cast the date columns (and BRANCH_NO) to str, then parse the two date
# columns into datetimes.
data[['OPEN_DATE', 'MINCRRQ', 'BRANCH_NO']] = (
    data[['OPEN_DATE', 'MINCRRQ', 'BRANCH_NO']].astype(str)
)
data[['OPEN_DATE', 'MINCRRQ']] = (
    data[['OPEN_DATE', 'MINCRRQ']].apply(pd.to_datetime)
)

# Target variable: 1 when MINCRRQ is at most 14 days after OPEN_DATE, else 0.
# Rows whose difference is missing (NaT) compare False and therefore get 0,
# which matches the original element-wise `t.days <= 14` check.
data['Y'] = ((data.MINCRRQ - data.OPEN_DATE).dt.days <= 14).astype(int)

# Visualise the class distribution of the target: counts and percentages.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
sns.countplot(x='Y', data=data, ax=axs[0])
axs[0].set_title("Frequency of each Class")
data['Y'].value_counts().plot(
    x=None, y=None, kind='pie', ax=axs[1], autopct='%1.2f%%'
)
axs[1].set_title("Percentage of each Class")
plt.show()
# About 81.85% of customers made a deposit within 14 days of registering.

# `crts` is the only numeric variable, so plot its distribution per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure stays the same. Consider histplot/kdeplot.
v_feat = data.columns
plt.figure(figsize=(16, 4))
gs = gridspec.GridSpec(1, 1)
ax = plt.subplot(gs[0])
sns.distplot(data['crts'][data["Y"] == 1], label='<=14天', bins=50)
sns.distplot(data['crts'][data["Y"] == 0], label='>14天', bins=100)
ax.set_xlabel('')
ax.set_title('histogram of feature crts: ')
plt.legend()
plt.xlim(0, 25)
# Author's reading of the plot: customers with crts between 0 and 10 are
# more likely to deposit within 14 days.

# Split the data into training and test sets.
x_feature = list(data.columns)
x_feature.remove('Y')
x_val = data[x_feature]
y_val = data['Y']
X = data[x_feature]
y = data["Y"]
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
data_train = pd.concat([X_train, y_train], axis=1).reset_index()
data_test = pd.concat([X_test, y_test], axis=1).reset_index()

# Number of depositing (Y == 1) and non-depositing (Y == 0) customers in the
# training set. Kept as module-level names for reference; CalcWOE no longer
# relies on them.
Grp = data_train.groupby('Y')
total_good = Grp.size()[1]
total_bad = Grp.size()[0]


def CalcWOE(data, target, VarName):
    """Compute weight of evidence (WOE) and IV for each category of a column.

    For every distinct non-missing value ``v`` of ``data[VarName]``:

    * ``good_ratio`` = rows with ``target == 1`` and value ``v``, divided by
      all rows with ``target == 1``
    * ``bad_ratio``  = rows with ``target == 0`` and value ``v``, divided by
      all rows with ``target == 0``
    * ``woe`` = ``log(good_ratio / bad_ratio)``
    * ``iv``  = ``(good_ratio - bad_ratio) * woe``

    A ratio that is exactly zero is replaced by 0.001 inside the logarithm so
    the result stays finite. The original only guarded the denominator and
    returned ``-inf`` when a category had no positive rows.

    The class totals are taken from ``data`` itself rather than from
    module-level globals, so the function is self-contained.

    Args:
        data: DataFrame holding both the feature and the target column.
        target: Name of the binary (0/1) target column.
        VarName: Name of the categorical column to encode.

    Returns:
        DataFrame with columns ``['variable', 'class', 'woe', 'iv']`` and a
        default integer index, one row per category.

    Raises:
        ValueError: If ``data`` contains no rows of one of the two classes.
    """
    labels = data[target]
    is_good = labels.eq(1)
    is_bad = labels.eq(0)
    n_good = int(is_good.sum())
    n_bad = int(is_bad.sum())
    if n_good == 0 or n_bad == 0:
        raise ValueError(
            "CalcWOE needs at least one row of each class in '{}'".format(target)
        )

    # Per-category counts of each class. groupby tolerates mixed-type object
    # columns, which np.unique (used originally) may refuse to sort.
    keys = data[VarName]
    good_ratio = is_good.groupby(keys).sum() / float(n_good)
    bad_ratio = is_bad.groupby(keys).sum() / float(n_bad)

    # Replace exact zeros only inside the log; IV uses the raw ratios.
    safe_good = good_ratio.where(good_ratio != 0, 0.001)
    safe_bad = bad_ratio.where(bad_ratio != 0, 0.001)
    woe = np.log(safe_good / safe_bad)
    iv = (good_ratio - bad_ratio) * woe

    # Build the result in one go; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(
        {
            'variable': VarName,
            'class': good_ratio.index,
            'woe': woe.values,
            'iv': iv.values,
        },
        columns=['variable', 'class', 'woe', 'iv'],
    )


def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace the categories of ``SourceDF[VarName]`` by their WOE values.

    Args:
        VarName: Name of the column to encode.
        SourceDF: DataFrame to modify. The column is overwritten in place.
        VarWOE: WOE table as returned by ``CalcWOE`` (needs the ``'class'``
            and ``'woe'`` columns).

    Returns:
        ``SourceDF`` itself, with ``VarName`` replaced by WOE values.
        Categories absent from ``VarWOE`` (for example values seen only in
        the test set) are encoded as 0.0, i.e. neutral evidence, instead of
        NaN, which LogisticRegression cannot handle.
    """
    # Pair classes and WOE values row by row. The original walked the 'woe'
    # column with a positional counter, which breaks for a non-default index
    # or duplicated classes.
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class).fillna(0.0)
    return SourceDF


# Categorical columns to WOE-encode (ANSCON895 is not part of the data set).
# The order matches the feature order passed to the model in the original.
woe_columns = ['BRANCH_NO'] + [
    'ANSCON{}'.format(n)
    for n in list(range(886, 895)) + list(range(896, 902))
]
model_features = woe_columns + ['crts']

# WOE table per categorical column, computed on the training set only.
woe_tables = {col: CalcWOE(data_train, 'Y', col) for col in woe_columns}

# Encode the training set. Work on a copy so data_train keeps the raw
# categories and this section can be re-run safely.
data_train_new = data_train.copy(deep=True)
for col in woe_columns:
    ReplaceWOE(col, data_train_new, woe_tables[col])

# Fit the logistic-regression classifier.
clf = LogisticRegression()
X_train_new = data_train_new.loc[:, model_features]
Y_train_new = data_train_new.Y
clf.fit(X_train_new, Y_train_new)

# Encode the test set with the WOE tables learned on the training set.
data_test_new = data_test.copy(deep=True)
for col in woe_columns:
    ReplaceWOE(col, data_test_new, woe_tables[col])

X_test_new = data_test_new.loc[:, model_features]
Y_test_new = data_test_new.Y

# Class predictions from the fitted classifier.
predicted = clf.predict(X_test_new)
print("预测准确率: {:.5f}".format(accuracy_score(Y_test_new, predicted)))

# Positive-class probabilities, used for the ROC curve.
y_pred1_prob = clf.predict_proba(X_test_new)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test_new, y_pred1_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

print(clf.coef_)
print(clf.intercept_)


    推荐阅读