###################################################################################################
#
# Default of credit card clients Data Set
#
# Data Set Information:
#
# This research aimed at the case of customers default payments in Taiwan. From the 
# perspective of risk management, the result of predictive accuracy of the estimated 
# probability of default will be more valuable than the binary result of 
# classification - credible or not credible clients.
#
# Attribute Information:
#
# This research employed a binary variable, default payment (Yes = 1, No = 0), 
# as the response variable. This study reviewed the literature and used the 
# following 23 variables as explanatory variables:
#
# X1: Amount of the given credit (NT dollar): it includes both the individual 
#     consumer credit and his/her family (supplementary) credit.
# X2: Gender (1 = male; 2 = female).
# X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
# X4: Marital status (1 = married; 2 = single; 3 = others).
# X5: Age (year).
# X6-X11: History of past payment. 
#         We tracked the past monthly payment records as follows: 
#         X6 = the repayment status in September, 2005; 
#         X7 = the repayment status in August, 2005; 
#         X8 = the repayment status in July, 2005; 
#         X9 = the repayment status in June, 2005; 
#         X10 = the repayment status in May, 2005; 
#         X11 = the repayment status in April, 2005. 
#         The measurement scale for the repayment status is: 
#         -1 = pay duly; 
#          1 = payment delay for one month; 
#          2 = payment delay for two months;
#          3 = payment delay for three months;
#          4 = payment delay for four months;
#          5 = payment delay for five months;
#          6 = payment delay for six months;
#          7 = payment delay for seven months;
#          8 = payment delay for eight months; 
#          9 = payment delay for nine months and above.
# X12-X17: Amount of bill statement (NT dollar). 
#          X12 = amount of bill statement in September, 2005; 
#          X13 = amount of bill statement in August, 2005;
#          X14 = amount of bill statement in July, 2005;
#          X15 = amount of bill statement in June, 2005;
#          X16 = amount of bill statement in May, 2005;
#          X17 = amount of bill statement in April, 2005.
# X18-X23: Amount of previous payment (NT dollar). 
#          X18 = amount paid in September, 2005; 
#          X19 = amount paid in August, 2005;
#          X20 = amount paid in July, 2005;
#          X21 = amount paid in June, 2005;
#          X22 = amount paid in May, 2005;
#          X23 = amount paid in April, 2005.
#
# Relevant Papers:
#
# Yeh, I. C., & Lien, C. H. (2009). The comparisons of data mining techniques for the predictive
# accuracy of probability of default of credit card clients. Expert Systems with Applications, 
# 36(2), 2473-2480.
#
###################################################################################################
# Column labels of the raw file, in file order: 23 explanatory variables
# (X1-X23 in the header above) followed by the binary response.
names=c("LIMIT-BAL","SEX","EDUCATION","MARRIAGE","AGE","PAY-0","PAY-2","PAY-3",
        "PAY-4","PAY-5","PAY-6","BILL-AMT1","BILL-AMT2","BILL-AMT3","BILL-AMT4",
        "BILL-AMT5","BILL-AMT6","PAY-AMT1","PAY-AMT2","PAY-AMT3","PAY-AMT4",
        "PAY-AMT5","PAY-AMT6","default payment next month")

# Read the file as one long numeric vector, then reshape it row by row.
# matrix() only warns (and recycles) when the length is not a multiple of
# ncol, which would silently misalign every column, so check it explicitly.
data = scan("https://hedibert.org/wp-content/uploads/2025/06/creditcard.txt")
stopifnot(length(data) > 0, length(data) %% length(names) == 0)
data = matrix(data,ncol=length(names),byrow=TRUE)
colnames(data) = names

# Keep clients whose education is graduate school, university or high school
# (codes 1-3). %in% never returns NA, so a missing code drops the row instead
# of inserting an all-NA row the way an NA logical index would.
ind  = data[,3] %in% c(1,2,3)
data = data[ind,,drop=FALSE]

# Keep clients who are married (1) or single (2).
ind  = data[,4] %in% c(1,2)
data = data[ind,,drop=FALSE]

# Response (column 24), explanatory variables (columns 1-23), sample size.
y    = data[,24]
X    = data[,1:23,drop=FALSE]
n    = nrow(X)

# X1: Amount of the given credit (NT dollar): it includes both the individual 
#     consumer credit and his/her family (supplementary) credit.
# X2: Gender (1 = male; 2 = female).
# X4: Marital status (1 = married; 2 = single; 3 = others).
#     Only codes 1 and 2 remain after the filtering above.
# X5: Age (year).

# ---------------------------------------------------------------------------
# Logistic regression of default on credit limit, age, gender and marital
# status, followed by plots of the fitted probabilities of default.
# Uses X (explanatory variables), y (0/1 response) and n (number of rows).
# ---------------------------------------------------------------------------

# Covariates taken from the columns of X.
credit               = X[,1]/1000     # given credit, in thousands of NT dollars
age                  = X[,5]          # age in years
gender               = rep(0,n)       # 1 = female (X2 == 2), 0 = male
gender[X[,2]==2]     = 1 
married              = rep(0,n)       # 1 = married (X4 == 1), 0 = otherwise
married[X[,4]==1]    = 1

# NOTE(review): eps is not used anywhere in the visible part of this script;
# kept only in case code outside this view relies on it.
eps = rnorm(n,0,0.01)

# y is binary, so fit a logistic regression. Without family=binomial, glm()
# fits a Gaussian linear model, and passing its fitted values through the
# logistic function does not give probabilities of default.
fit.glm = glm(y~credit+age+gender+married,family=binomial)
print(summary(fit.glm))               # print() so the table also shows under source()

# For a binomial GLM the fitted values are already on the probability scale.
prob = fitted(fit.glm)

group.labels = c("Male & single","Male & married","Female & single","Female & married")
prob.label   = "Probability of default payment"

# Fitted probabilities split by the observed outcome.
par(mfrow=c(1,1))
boxplot(prob[y==0],prob[y==1],outline=FALSE,names=c("No default payment","Default payment"),
        ylab=prob.label)


# ---- Fitted probability by age, one panel per gender/marital group --------
keep.age = age<=60
ind1 = gender==0 & married==0 & keep.age
ind2 = gender==0 & married==1 & keep.age
ind3 = gender==1 & married==0 & keep.age
ind4 = gender==1 & married==1 & keep.age
groups = list(ind1,ind2,ind3,ind4)

# Common, data-driven vertical range so the four panels are comparable.
ylim.age = range(prob[keep.age])

par(mfrow=c(1,4))
for (g in seq_along(groups)){
   sel = groups[[g]]
   boxplot(prob[sel]~age[sel],outline=FALSE,ylim=ylim.age,xlab="Age",ylab=prob.label,
           main=group.labels[g])
}

# Quartiles of the fitted probability for every age and group.
# quant[g,i,] holds the 25%, 50% and 75% quantiles for group g at ages[i];
# ages with no observation in a group (e.g. above 60) give NA.
ages = sort(unique(age))  
nages = length(ages)      
quant = array(0,c(4,nages,3))
for (i in seq_len(nages)){
   for (g in seq_along(groups)){
      quant[g,i,] = quantile(prob[age==ages[i] & groups[[g]]],c(0.25,0.5,0.75),names=FALSE)
   }
}

# Median fitted probability against age, one line per group.
shown    = ages>=20 & ages<=60
ylim.med = range(quant[,shown,2],na.rm=TRUE)

par(mfrow=c(1,1))
plot(ages,quant[1,,2],type="l",xlim=c(20,60),ylim=ylim.med,ylab=prob.label,xlab="Age",lwd=2)
for (g in 2:4){
   lines(ages,quant[g,,2],col=g,lwd=2)
}
legend("topright",legend=group.labels,col=1:4,lwd=2,bty="n")
        
# ---- Fitted probability by credit limit, four groups overlaid -------------
keep.credit = credit<=500
ind1 = gender==0 & married==0 & keep.credit
ind2 = gender==0 & married==1 & keep.credit
ind3 = gender==1 & married==0 & keep.credit
ind4 = gender==1 & married==1 & keep.credit
groups = list(ind1,ind2,ind3,ind4)

# All groups share one set of factor levels so that position j on the x axis
# is the same credit amount in every overlaid boxplot. Grouping on the raw
# credit values lets each group use its own distinct values, which misaligns
# the boxes whenever a group lacks an amount. Levels with no observation in
# a group are simply left empty.
credit.levels = sort(unique(credit[keep.credit]))
ylim.credit   = range(prob[keep.credit])

par(mfrow=c(1,1))
boxplot(prob[ind1]~factor(credit[ind1],levels=credit.levels),outline=FALSE,ylim=ylim.credit,
        xlab="Amount of the given credit",ylab=prob.label,main="")
for (g in 2:4){
   sel = groups[[g]]
   boxplot(prob[sel]~factor(credit[sel],levels=credit.levels),outline=FALSE,add=TRUE,col=g,axes=FALSE)
}
legend("topright",legend=group.labels,col=1:4,lwd=3,bty="n")