主要变量为分类变量的建模实例——入金分析建模

【主要变量为分类变量的建模实例——入金分析】

"""Deposit ("入金") analysis: a modelling example whose main predictors are categorical.

Pipeline:
1. Load the CSV, fill missing categorical answers with a placeholder level.
2. Build the binary target Y (1 = first deposit within 14 days of opening).
3. Plot the target distribution and the only numeric feature, ``crts``.
4. Split into train / test, compute WOE (weight of evidence) per category
   on the training set only, and replace each category by its WOE.
5. Fit a logistic regression on the WOE-encoded features and report
   accuracy, ROC/AUC and the fitted coefficients.
"""
import datetime

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.font_manager import FontProperties

# scikit-learn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Enable inline plots when running under IPython/Jupyter; a plain Python
# interpreter has no get_ipython(), so the magic is simply skipped there.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Use a font that can render Chinese labels (must run after seaborn is imported).
myfont = FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf', size=14)
sns.set(font=myfont.get_name())

# Categorical predictors that are WOE-encoded below.
CATEGORICAL_COLS = [
    'BRANCH_NO',
    'ANSCON886', 'ANSCON887', 'ANSCON888', 'ANSCON889', 'ANSCON890',
    'ANSCON891', 'ANSCON892', 'ANSCON893', 'ANSCON894', 'ANSCON896',
    'ANSCON897', 'ANSCON898', 'ANSCON899', 'ANSCON900', 'ANSCON901',
]
# Columns fed to the model: the WOE-encoded categoricals plus the numeric crts.
MODEL_FEATURES = CATEGORICAL_COLS + ['crts']
# Substitute used when a class ratio is exactly zero, so log() stays finite.
WOE_ZERO_RATIO_SUBSTITUTE = 0.001

# Treat the literal string 'NULL' as missing while reading.
data = pd.read_csv('anscontest4deli1.csv', na_values='NULL')

# Fill missing categorical values with a dedicated placeholder level.
data = data.fillna({col: '其他' for col in CATEGORICAL_COLS})

# Cast the date columns (and the branch number) to strings, then parse the dates.
data[['OPEN_DATE', 'MINCRRQ', 'BRANCH_NO']] = (
    data[['OPEN_DATE', 'MINCRRQ', 'BRANCH_NO']].astype(str)
)
data[['OPEN_DATE', 'MINCRRQ']] = data[['OPEN_DATE', 'MINCRRQ']].apply(pd.to_datetime)

# Target: 1 when MINCRRQ is at most 14 days after OPEN_DATE, else 0.
# A missing date yields NaN days, which compares False and therefore maps to 0.
data['Y'] = ((data.MINCRRQ - data.OPEN_DATE).dt.days <= 14).astype(int)

# Visualise the target distribution (counts and percentages).
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
sns.countplot(x='Y', data=data, ax=axs[0])
axs[0].set_title("Frequency of each Class")
data['Y'].value_counts().plot(x=None, y=None, kind='pie', ax=axs[1], autopct='%1.2f%%')
axs[1].set_title("Percentage of each Class")
plt.show()
# Author's observation: about 81.85% of customers deposit within 14 days of registering.

# crts is the only numeric variable, so plot its distribution per class.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here
# because the seaborn version this script targets is not known.
plt.figure(figsize=(16, 4))
gs = gridspec.GridSpec(1, 1)
ax = plt.subplot(gs[0])
sns.distplot(data['crts'][data["Y"] == 1], label='<=14天', bins=50)
sns.distplot(data['crts'][data["Y"] == 0], label='>14天', bins=100)
ax.set_xlabel('')
ax.set_title('histogram of feature crts: ')
plt.legend()
plt.xlim(0, 25)
plt.show()
# Author's observation: customers with crts between 0 and 10 are more likely
# to deposit within 14 days.

# Split the data into training and test sets.
x_feature = list(data.columns)
x_feature.remove('Y')
X = data[x_feature]
y = data["Y"]
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
data_train = pd.concat([X_train, y_train], axis=1).reset_index()
data_test = pd.concat([X_test, y_test], axis=1).reset_index()

# Total number of depositing (good) and non-depositing (bad) customers in the
# training set. Kept for inspection; CalcWOE derives its own totals from the
# frame it is given.
Grp = data_train.groupby('Y')
total_good = Grp.size()[1]
total_bad = Grp.size()[0]


def CalcWOE(data, target, VarName):
    """Compute WOE and IV for every level of one categorical column.

    For each level ``v`` of ``data[VarName]``::

        good_ratio = count(target == 1 and VarName == v) / count(target == 1)
        bad_ratio  = count(target == 0 and VarName == v) / count(target == 0)
        WOE = ln(good_ratio / bad_ratio)
        IV  = (good_ratio - bad_ratio) * WOE

    A ratio that is exactly zero is replaced by ``WOE_ZERO_RATIO_SUBSTITUTE``
    inside the logarithm so that WOE and IV stay finite.

    Args:
        data: DataFrame holding both ``VarName`` and ``target``.
        target: Name of the binary (0/1) target column.
        VarName: Name of the categorical column to analyse.

    Returns:
        DataFrame with columns ``variable``, ``class``, ``woe``, ``iv``;
        one row per level, in order of first appearance.

    Raises:
        ValueError: If ``data[target]`` lacks either class 0 or class 1.
    """
    class_totals = data[target].value_counts()
    n_good = class_totals.get(1, 0)
    n_bad = class_totals.get(0, 0)
    if n_good == 0 or n_bad == 0:
        raise ValueError('target column must contain both classes 0 and 1')

    # One pass over the data: rows are levels, columns are target classes.
    # sort=False avoids ordering comparisons between mixed-type levels.
    level_counts = data.groupby([VarName, target], sort=False).size().unstack(fill_value=0)

    rows = []
    for level, counts in level_counts.iterrows():
        good_ratio = float(counts.get(1, 0)) / n_good
        bad_ratio = float(counts.get(0, 0)) / n_bad
        safe_good = good_ratio if good_ratio != 0 else WOE_ZERO_RATIO_SUBSTITUTE
        safe_bad = bad_ratio if bad_ratio != 0 else WOE_ZERO_RATIO_SUBSTITUTE
        woe = np.log(safe_good / safe_bad)
        iv = (good_ratio - bad_ratio) * woe
        rows.append([VarName, level, woe, iv])
    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])


def ReplaceWOE(VarName, SourceDF, VarWOE, unseen_woe=0.0):
    """Replace the levels of one column by their WOE values, in place.

    Args:
        VarName: Column of ``SourceDF`` to encode.
        SourceDF: DataFrame to modify; the column is overwritten.
        VarWOE: WOE table as returned by ``CalcWOE`` (uses its ``class`` and
            ``woe`` columns).
        unseen_woe: Value assigned to levels that are missing from ``VarWOE``
            (for example categories that occur only in the test set).
            Pass ``None`` to leave them as NaN.

    Returns:
        ``SourceDF`` itself, to allow chaining.
    """
    level_to_woe = dict(zip(VarWOE['class'], VarWOE['woe']))
    encoded = SourceDF[VarName].map(level_to_woe)
    if unseen_woe is not None:
        encoded = encoded.fillna(unseen_woe)
    SourceDF[VarName] = encoded
    return SourceDF


def encode_with_woe(frame, woe_tables):
    """Return a copy of ``frame`` with every categorical column WOE-encoded."""
    encoded = frame.copy()
    for col in CATEGORICAL_COLS:
        ReplaceWOE(col, encoded, woe_tables[col])
    return encoded


# WOE table per categorical column, estimated on the training set only so
# that no information from the test set leaks into the encoding.
woe_maps = {col: CalcWOE(data_train, 'Y', col) for col in CATEGORICAL_COLS}

# Encode the training set and fit the logistic regression classifier.
data_train_new = encode_with_woe(data_train, woe_maps)
X_train_new = data_train_new.loc[:, MODEL_FEATURES]
Y_train_new = data_train_new.Y
clf = LogisticRegression()
clf.fit(X_train_new, Y_train_new)

# Encode the test set with the training-set WOE tables.
data_test_new = encode_with_woe(data_test, woe_maps)
X_test_new = data_test_new.loc[:, MODEL_FEATURES]
Y_test_new = data_test_new.Y

# Class predictions from the fitted classifier.
predicted = clf.predict(X_test_new)
print("预测准确率: {:.5f}".format(accuracy_score(Y_test_new, predicted)))

# Probability of class 1, used to trace the ROC curve across all thresholds.
y_pred1_prob = clf.predict_proba(X_test_new)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test_new, y_pred1_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

print(clf.coef_)
print(clf.intercept_)

主要变量为分类变量的建模实例——入金分析

推荐阅读

小米时间怎么显示桌面新闻小米时间怎么显示桌面

如何评价文件共享服务器？文件共享服务器怎么样

局域网有限的地理范围是多少

米兔是什么意思

中日餐桌礼仪对比中日餐桌礼仪的差异

狐狸学游泳的故事告诉了我们什么道理猴子学游泳的故事告诉了我们什么道理

阳台洗衣柜什么材料好阳台洗衣柜什么材料的好

java8新特性总结

冻疮什么样是快要好了

尺子的种类有几种尺子的种类和用途

分辨率|行业最强2K直屏！realme真我GT2 Pro好评不断

redis 清理 redis清理数据的方法

微信运动刷步数软件介绍

老年人缺铁性贫血的症状

庆东锅炉显示故障10原因？

英文翻译器怎样在线操作英文翻译中文？

省属事业单位笔试考什么

羊驼吃啥我的世界我的世界藏羚羊吃什么

Mac zsh 使用Sublime打开文件

自定义来电秀怎么实现（Android 来电秀源码分析）