###################################################################################################
#
# Default of credit card clients Data Set
#
# Data Set Information:
#
# This research aimed at the case of customers' default payments in Taiwan. From the
# perspective of risk management, the result of predictive accuracy of the estimated
# probability of default will be more valuable than the binary result of
# classification - credible or not credible clients.
#
# Attribute Information:
#
# This research employed a binary variable, default payment (Yes = 1, No = 0),
# as the response variable. This study reviewed the literature and used the
# following 23 variables as explanatory variables:
#
# X1: Amount of the given credit (NT dollar): it includes both the individual
#     consumer credit and his/her family (supplementary) credit.
# X2: Gender (1 = male; 2 = female).
# X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
# X4: Marital status (1 = married; 2 = single; 3 = others).
# X5: Age (year).
# X6-X11: History of past payment.
#     We tracked the past monthly payment records as follows:
#     X6 = the repayment status in September, 2005;
#     X7 = the repayment status in August, 2005;
#     X8 = the repayment status in July, 2005;
#     X9 = the repayment status in June, 2005;
#     X10 = the repayment status in May, 2005;
#     X11 = the repayment status in April, 2005.
#     The measurement scale for the repayment status is:
#     -1 = pay duly;
#     1 = payment delay for one month;
#     2 = payment delay for two months;
#     3 = payment delay for three months;
#     4 = payment delay for four months;
#     5 = payment delay for five months;
#     6 = payment delay for six months;
#     7 = payment delay for seven months;
#     8 = payment delay for eight months;
#     9 = payment delay for nine months and above.
# X12-X17: Amount of bill statement (NT dollar).
#     X12 = amount of bill statement in September, 2005;
#     X13 = amount of bill statement in August, 2005;
#     X14 = amount of bill statement in July, 2005;
#     X15 = amount of bill statement in June, 2005;
#     X16 = amount of bill statement in May, 2005;
#     X17 = amount of bill statement in April, 2005.
# X18-X23: Amount of previous payment (NT dollar).
#     X18 = amount paid in September, 2005;
#     X19 = amount paid in August, 2005;
#     X20 = amount paid in July, 2005;
#     X21 = amount paid in June, 2005;
#     X22 = amount paid in May, 2005;
#     X23 = amount paid in April, 2005.
#
# Relevant Papers:
#
# Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive
# accuracy of probability of default of credit card clients. Expert Systems with Applications,
# 36(2), 2473-2480.
#
###################################################################################################

# ---- Load and filter the raw data --------------------------------------------
# Reads the data set as a flat numeric stream, reshapes it into one row per
# client, drops clients in the "others" education/marital categories, and
# splits the result into the response `y` and the covariate matrix `X`.

# Labels of the 24 fields, in file order (23 covariates followed by the response).
names <- c("LIMIT-BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE",
           "PAY-0", "PAY-2", "PAY-3", "PAY-4", "PAY-5", "PAY-6",
           "BILL-AMT1", "BILL-AMT2", "BILL-AMT3", "BILL-AMT4",
           "BILL-AMT5", "BILL-AMT6",
           "PAY-AMT1", "PAY-AMT2", "PAY-AMT3", "PAY-AMT4",
           "PAY-AMT5", "PAY-AMT6",
           "default payment next month")
n_fields <- length(names)

raw <- scan("https://hedibert.org/wp-content/uploads/2025/06/creditcard.txt")

# matrix() would only warn and recycle values if the stream were truncated or
# had a different number of fields per record, so fail loudly instead.
stopifnot(length(raw) > 0, length(raw) %% n_fields == 0)

data <- matrix(raw, ncol = n_fields, byrow = TRUE)
colnames(data) <- names

# Keep education codes 1-3 only (column 3).
# drop = FALSE keeps `data` a matrix even if a single row survives.
ind <- data[, 3] %in% 1:3
data <- data[ind, , drop = FALSE]

# Keep marital status codes 1-2 only (column 4).
ind <- data[, 4] %in% 1:2
data <- data[ind, , drop = FALSE]

y <- data[, 24]                    # response: default payment next month
X <- data[, 1:23, drop = FALSE]    # covariates X1..X23
n <- nrow(X)

# X1: Amount of the given credit (NT dollar): it includes both the individual
#     consumer credit and his/her family (supplementary) credit.
# X2: Gender (1 = male; 2 = female).
# X4: Marital status (1 = married; 2 = single; 3 = others).
# X5: Age (year).
# ---- Covariates ---------------------------------------------------------------
# Builds the four regressors from the covariate matrix `X` (n rows), fits a
# logistic regression of the default indicator `y` on them, and plots the
# fitted default probabilities by outcome, by age and by credit limit for the
# four gender x marital-status groups.
credit  <- X[, 1] / 1000              # credit limit, rescaled by 1/1000
age     <- X[, 5]
gender  <- as.numeric(X[, 2] == 2)    # 1 when column 2 equals 2, else 0
married <- as.numeric(X[, 4] == 1)    # 1 when column 4 equals 1, else 0

# NOTE(review): eps is not used anywhere in the visible script; it is kept so
# that the random-number stream and any later use of `eps` are unaffected.
eps <- rnorm(n, 0, 0.01)

# ---- Model --------------------------------------------------------------------
# family = binomial is required for a logistic regression: without it glm()
# fits a Gaussian linear model, and pushing those fitted values through the
# logistic function does not give probabilities of default.
fit.glm <- glm(y ~ credit + age + gender + married, family = binomial)
print(summary(fit.glm))   # explicit print so the table also shows under source()

# For a binomial fit the fitted values are already on the probability scale.
prob <- fitted(fit.glm)

group_labels <- c("Male & single", "Male & married",
                  "Female & single", "Female & married")
prob_label <- "Probability of default payment"

# ---- Fitted probability by observed outcome -----------------------------------
par(mfrow = c(1, 1))
boxplot(prob[y == 0], prob[y == 1], outline = FALSE,
        names = c("No default payment", "Default payment"),
        ylab = prob_label)

# ---- Fitted probability by age, one panel per group (age <= 60) ---------------
ind1 <- gender == 0 & married == 0 & age <= 60
ind2 <- gender == 0 & married == 1 & age <= 60
ind3 <- gender == 1 & married == 0 & age <= 60
ind4 <- gender == 1 & married == 1 & age <= 60
age_masks <- list(ind1, ind2, ind3, ind4)

# Shared y-range taken from the data so all four panels are comparable and
# nothing depends on a hard-coded window.
ylim_age <- range(prob[ind1 | ind2 | ind3 | ind4])

par(mfrow = c(1, 4))
for (g in seq_along(age_masks)) {
  keep <- age_masks[[g]]
  boxplot(prob[keep] ~ age[keep], outline = FALSE, ylim = ylim_age,
          xlab = "Age", ylab = prob_label, main = group_labels[g])
}

# ---- Quartiles of the fitted probability by age and group ---------------------
# quant[g, i, ] holds the 25%, 50% and 75% quantiles for group g at ages[i].
# Cells with no observations (e.g. ages above 60) are NA.
ages <- sort(unique(age))
nages <- length(ages)
quant <- array(NA_real_, c(4, nages, 3))
for (i in seq_len(nages)) {
  at_age <- age == ages[i]
  for (g in seq_along(age_masks)) {
    quant[g, i, ] <- quantile(prob[at_age & age_masks[[g]]],
                              c(0.25, 0.5, 0.75))
  }
}

# Median curves; the y-range covers the medians inside the plotted age window.
shown <- ages >= 20 & ages <= 60
ylim_med <- range(quant[, shown, 2], na.rm = TRUE)

par(mfrow = c(1, 1))
plot(ages, quant[1, , 2], type = "l", xlim = c(20, 60), ylim = ylim_med,
     ylab = prob_label, xlab = "Age", lwd = 2)
for (g in 2:4) {
  lines(ages, quant[g, , 2], col = g, lwd = 2)
}
legend("topright", legend = group_labels, col = 1:4, lwd = 2, bty = "n")

# ---- Fitted probability by credit limit, groups overlaid (credit <= 500) ------
ind1 <- gender == 0 & married == 0 & credit <= 500
ind2 <- gender == 0 & married == 1 & credit <= 500
ind3 <- gender == 1 & married == 0 & credit <= 500
ind4 <- gender == 1 & married == 1 & credit <= 500
credit_masks <- list(ind1, ind2, ind3, ind4)
in_credit <- ind1 | ind2 | ind3 | ind4

# One factor with a common level set for all groups: boxplot() places boxes at
# positions 1..k of the grouping levels, so overlaying groups with different
# level sets would put different credit limits at the same x position.
# Unused levels are kept by boxplot() (drop = FALSE), leaving an empty slot.
credit_f <- factor(credit, levels = sort(unique(credit[in_credit])))
ylim_credit <- range(prob[in_credit])

par(mfrow = c(1, 1))
boxplot(prob[ind1] ~ credit_f[ind1], outline = FALSE, ylim = ylim_credit,
        xlab = "Amount of the given credit", ylab = prob_label, main = "")
for (g in 2:4) {
  keep <- credit_masks[[g]]
  boxplot(prob[keep] ~ credit_f[keep], outline = FALSE, add = TRUE,
          col = g, axes = FALSE)
}
legend("topright", legend = group_labels, col = 1:4, lwd = 3, bty = "n")