# 【主要变量为分类变量的建模实例——入金分析】
# Imports: data handling, plotting, and the scikit-learn pieces used below.
import datetime

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.font_manager import FontProperties
# scikit-learn family
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Equivalent of the `%matplotlib inline` notebook magic. Outside IPython
# there is no get_ipython(), so the call is skipped and the file still runs
# as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Fix Chinese text rendering: point seaborn at the SimHei font.
# This has to run after seaborn is imported (it previously ran before).
myfont = FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf', size=14)
sns.set(font=myfont.get_name())

# Handle missing values right at read time
# Load the raw data; the literal string 'NULL' is read as a missing value.
data = pd.read_csv('anscontest4deli1.csv', na_values='NULL')

# Fill missing values: every categorical column gets the placeholder
# category '其他'.
categorical_cols = [
    'BRANCH_NO',
    'ANSCON886',
    'ANSCON887',
    'ANSCON888',
    'ANSCON889',
    'ANSCON890',
    'ANSCON891',
    'ANSCON892',
    'ANSCON893',
    'ANSCON894',
    'ANSCON896',
    'ANSCON897',
    'ANSCON898',
    'ANSCON899',
    'ANSCON900',
    'ANSCON901',
]
data.fillna(dict.fromkeys(categorical_cols, '其他'), inplace=True)

# Cast the date columns (and the branch code) to strings, then parse the two
# date columns into datetimes.
date_cols = ['OPEN_DATE', 'MINCRRQ']
str_cols = date_cols + ['BRANCH_NO']
data[str_cols] = data[str_cols].astype(str)
data[date_cols] = data[date_cols].apply(pd.to_datetime)

# Build the target: Y = 1 when MINCRRQ is at most 14 days after OPEN_DATE,
# else 0. A missing date gives a NaN day count, which compares False and so
# yields 0, the same result the previous per-row loop produced.
# NOTE(review): MINCRRQ looks like the first-deposit date - confirm.
data['Y'] = ((data.MINCRRQ - data.OPEN_DATE).dt.days <= 14).astype('int64')

# Visualize the distribution of the target variable
# Left panel: count of rows per class of Y. Right panel: share of each class.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
sns.countplot(x='Y', data=data, ax=axs[0])
axs[0].set_title("Frequency of each Class")
data['Y'].value_counts().plot(x=None, y=None, kind='pie', ax=axs[1], autopct='%1.2f%%')
axs[1].set_title("Percentage of each Class")
plt.show()
# About 81.85% of customers made a deposit within 14 days of registering.
# crts is the only numeric variable, so it is plotted on its own next.
# Distribution of the numeric feature crts, drawn separately for each class
# of Y on one shared axis.
v_feat = data.columns
plt.figure(figsize=(16, 4))
gs = gridspec.GridSpec(1, 1)
ax = plt.subplot(gs[0])
# NOTE(review): seaborn deprecated distplot in 0.11; moving to histplot or
# displot would change how the plot is normalised, so it is left as is here.
sns.distplot(data['crts'][data["Y"] == 1], label='<=14天', bins=50)
sns.distplot(data['crts'][data["Y"] == 0], label='>14天', bins=100)
ax.set_xlabel('')
ax.set_title('histogram of feature crts: ')
plt.legend()
plt.xlim(0, 25)
# Render the figure now, as the other plotting sections do, so that later
# pyplot calls do not keep drawing onto this axis when run as a script.
plt.show()
# Observation: customers whose crts lies between 0 and 10 are more likely to
# deposit within 14 days.
# Split the data into a training set and a test set.
# Every column except the target Y is carried along as a feature column.
x_feature = [column for column in data.columns if column != 'Y']
x_val, y_val = data[x_feature], data['Y']
X, y = data[x_feature], data["Y"]
# random_state=0 keeps the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# Re-attach the target to each partition; reset_index() keeps the old row
# labels in an extra 'index' column.
data_train = pd.concat([X_train, y_train], axis=1).reset_index()
data_test = pd.concat([X_test, y_test], axis=1).reset_index()
# Count the depositing (Y == 1) and non-depositing (Y == 0) customers in the
# training set.
Grp = data_train.groupby('Y')
class_sizes = Grp.size()
total_good = class_sizes[1]
total_bad = class_sizes[0]
def CalcWOE(data, target, VarName):
    """Compute the weight of evidence (WOE) and IV term per category.

    For each distinct value of ``data[VarName]``::

        good_ratio = (rows in the category with target == 1) / (all rows with target == 1)
        bad_ratio  = (rows in the category with target == 0) / (all rows with target == 0)
        woe        = log(good_ratio / bad_ratio)
        iv         = (good_ratio - bad_ratio) * woe

    Inside the logarithm only, a zero ratio is replaced by 0.001 so that WOE
    stays finite even when a category has no goods or no bads.

    The class totals are taken from ``data`` itself rather than from
    module-level globals, so the function is self-contained.

    Args:
        data: DataFrame holding both the target and the variable column.
        target: Name of the binary target column (values 0 and 1).
        VarName: Name of the categorical column to encode.

    Returns:
        DataFrame with columns ``variable``, ``class``, ``woe`` and ``iv``,
        one row per distinct category in sorted order. It is empty, but
        keeps those columns, when ``data`` has no rows.
    """
    smoothing = 0.001  # stand-in for a zero ratio inside the logarithm

    is_good = data[target] == 1
    is_bad = data[target] == 0
    total_good = int(is_good.sum())
    total_bad = int(is_bad.sum())

    rows = []
    for v in np.unique(data[VarName]):
        in_class = data[VarName] == v
        good = int((in_class & is_good).sum())
        bad = int((in_class & is_bad).sum())
        # Guard the totals as well: a target with a single class would
        # otherwise divide by zero.
        good_ratio = good / total_good if total_good else 0.0
        bad_ratio = bad / total_bad if total_bad else 0.0
        woe = np.log(
            (good_ratio if good_ratio != 0 else smoothing)
            / (bad_ratio if bad_ratio != 0 else smoothing)
        )
        # IV uses the unsmoothed ratios.
        iv = (good_ratio - bad_ratio) * woe
        rows.append([VarName, v, woe, iv])

    # Build the frame in one go; DataFrame.append no longer exists in
    # pandas 2.x and was quadratic when called in a loop.
    return pd.DataFrame(rows, columns=['variable', 'class', 'woe', 'iv'])
# Compute the WOE table of every categorical variable on the training set.
# The columns are listed once and the results are unpacked into the
# per-variable names that the rest of the script refers to.
woe_source_columns = [
    'BRANCH_NO',
    'ANSCON886',
    'ANSCON887',
    'ANSCON888',
    'ANSCON889',
    'ANSCON890',
    'ANSCON891',
    'ANSCON892',
    'ANSCON893',
    'ANSCON894',
    'ANSCON896',
    'ANSCON897',
    'ANSCON898',
    'ANSCON899',
    'ANSCON900',
    'ANSCON901',
]
(
    BRANCH_NO_woe,
    ANSCON886_woe,
    ANSCON887_woe,
    ANSCON888_woe,
    ANSCON889_woe,
    ANSCON890_woe,
    ANSCON891_woe,
    ANSCON892_woe,
    ANSCON893_woe,
    ANSCON894_woe,
    ANSCON896_woe,
    ANSCON897_woe,
    ANSCON898_woe,
    ANSCON899_woe,
    ANSCON900_woe,
    ANSCON901_woe,
) = [CalcWOE(data_train, 'Y', column) for column in woe_source_columns]
def ReplaceWOE(VarName, SourceDF, VarWOE):
    """Replace a categorical column by the WOE value of each category.

    The class-to-WOE mapping pairs the ``class`` and ``woe`` columns of
    ``VarWOE`` row by row. This avoids indexing ``woe`` by a running
    counter, which raised KeyError on a non-default index and misaligned
    the values when a class was repeated.

    Args:
        VarName: Name of the column in ``SourceDF`` to encode.
        SourceDF: DataFrame to encode; it is modified in place.
        VarWOE: DataFrame with ``class`` and ``woe`` columns, as returned
            by CalcWOE.

    Returns:
        The same ``SourceDF`` object with ``VarName`` replaced by WOE
        values. Categories missing from ``VarWOE`` become NaN.
    """
    woe_by_class = dict(zip(VarWOE['class'], VarWOE['woe']))
    SourceDF[VarName] = SourceDF[VarName].map(woe_by_class)
    return SourceDF
# WOE-encode the training set. data_woe is an alias of data_train, not a
# copy, and ReplaceWOE writes into the frame it is given, so data_train
# itself ends up encoded.
data_woe = data_train
train_woe_steps = [
    ('ANSCON886', ANSCON886_woe),
    ('ANSCON887', ANSCON887_woe),
    ('ANSCON888', ANSCON888_woe),
    ('ANSCON889', ANSCON889_woe),
    ('ANSCON890', ANSCON890_woe),
    ('ANSCON891', ANSCON891_woe),
    ('ANSCON892', ANSCON892_woe),
    ('ANSCON893', ANSCON893_woe),
    ('ANSCON894', ANSCON894_woe),
    ('ANSCON896', ANSCON896_woe),
    ('ANSCON897', ANSCON897_woe),
    ('ANSCON898', ANSCON898_woe),
    ('ANSCON899', ANSCON899_woe),
    ('ANSCON900', ANSCON900_woe),
    ('ANSCON901', ANSCON901_woe),
    ('BRANCH_NO', BRANCH_NO_woe),
]
temp = data_woe
for column, woe_table in train_woe_steps:
    temp = ReplaceWOE(column, temp, woe_table)
temp1 = temp
# Keep an independent snapshot of the encoded training data.
data_train_new = temp1.copy(deep=True)

# Model inputs: the sixteen WOE-encoded categoricals plus the numeric crts.
train_feature_columns = [
    'BRANCH_NO', 'ANSCON886', 'ANSCON887', 'ANSCON888', 'ANSCON889',
    'ANSCON890', 'ANSCON891', 'ANSCON892', 'ANSCON893', 'ANSCON894',
    'ANSCON896', 'ANSCON897', 'ANSCON898', 'ANSCON899', 'ANSCON900',
    'ANSCON901', 'crts',
]
X_train_new = temp1.loc[:, train_feature_columns]
Y_train_new = temp1.Y

# Build the logistic-regression classifier and fit it on the encoded data.
clf = LogisticRegression()
clf.fit(X_train_new, Y_train_new)
# WOE-encode the test set with the tables learned on the training set.
# data_woe is an alias of data_test, so data_test is encoded in place.
data_woe = data_test
test_woe_steps = [
    ('ANSCON886', ANSCON886_woe),
    ('ANSCON887', ANSCON887_woe),
    ('ANSCON888', ANSCON888_woe),
    ('ANSCON889', ANSCON889_woe),
    ('ANSCON890', ANSCON890_woe),
    ('ANSCON891', ANSCON891_woe),
    ('ANSCON892', ANSCON892_woe),
    ('ANSCON893', ANSCON893_woe),
    ('ANSCON894', ANSCON894_woe),
    ('ANSCON896', ANSCON896_woe),
    ('ANSCON897', ANSCON897_woe),
    ('ANSCON898', ANSCON898_woe),
    ('ANSCON899', ANSCON899_woe),
    ('ANSCON900', ANSCON900_woe),
    ('ANSCON901', ANSCON901_woe),
    ('BRANCH_NO', BRANCH_NO_woe),
]
temp = data_woe
for column, woe_table in test_woe_steps:
    temp = ReplaceWOE(column, temp, woe_table)
temp1 = temp

# Same feature order as used for training.
test_woe_columns = [
    'BRANCH_NO', 'ANSCON886', 'ANSCON887', 'ANSCON888', 'ANSCON889',
    'ANSCON890', 'ANSCON891', 'ANSCON892', 'ANSCON893', 'ANSCON894',
    'ANSCON896', 'ANSCON897', 'ANSCON898', 'ANSCON899', 'ANSCON900',
    'ANSCON901',
]
# A category that occurs only in the test split has no entry in the
# training WOE table and maps to NaN, which the classifier cannot take as
# input. Give such rows a neutral WOE of 0 (no evidence either way).
# Only the WOE columns are filled; crts is left untouched.
X_test_new = temp1.loc[:, test_woe_columns + ['crts']].fillna(
    dict.fromkeys(test_woe_columns, 0.0)
)
Y_test_new = temp1.Y
# Hard class predictions from the fitted classifier.
predicted = clf.predict(X_test_new)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the
# printed value is unchanged by putting the arguments in the documented order.
print("预测准确率: {:.5f}".format(accuracy_score(Y_test_new, predicted)))

# Predicted probability of the positive class (Y == 1), used as the score
# for the ROC curve. The 0.5 default threshold mentioned in the original
# comment concerns predict() above, not these probabilities.
y_pred1_prob = clf.predict_proba(X_test_new)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test_new, y_pred1_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve on a fresh figure so it never lands on whatever axis
# an earlier plotting section left current.
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Fitted coefficients (one per feature column) and the intercept.
print(clf.coef_)
print(clf.intercept_)