# Problem 8, page 480-481 of Jim Albert and Jingchen Hu's (2020)
# Probability and Bayesian Modeling, CRC Press, Taylor & Francis Group.
#
# What factors determine admission to graduate school? In a study, data on
# 400 graduate school admission cases was collected. Admission is a binary
# response, with 0 indicating not admitted, and 1 indicating admitted.
# Moreover, the applicant's GRE (Graduate Record Examination) score and
# undergraduate grade point average (GPA) are available.

# Location of the dataset
filename <- "https://raw.githubusercontent.com/monika76five/ProbBayes/master/R%20Code/chapter%2012/data/GradSchoolAdmission.csv"

# Reading the data (columns: Admission, GRE, GPA)
data <- read.csv(file = filename)

# Sample size
n <- nrow(data)

# Extract the columns as plain vectors instead of attach()-ing the data
# frame (attach masks names in the search path and is error-prone).
# Later sections of this script use these objects directly.
Admission <- data$Admission
GRE <- data$GRE
GPA <- data$GPA

# Tabulate the admission outcomes (0 = not admitted, 1 = admitted)
table(Admission)

# All figures are written to a single PDF file
pdf(file = "graduate-school-admission.pdf", width = 18, height = 8)

# Marginal distributions of GPA (rounded to one decimal) and GRE;
# the red vertical line marks the GPA = 3.5 threshold used below.
par(mfrow = c(1, 2))
plot(table(round(GPA, 1)), xlab = "GPA (rounded)", ylab = "")
abline(v = 3.45, col = 2, lwd = 3)
plot(table(GRE), xlab = "GRE")

# Dichotomize GPA at 3.5 (1 if GPA >= 3.5, 0 otherwise)
GPA35 <- as.numeric(GPA >= 3.5)
table(GPA35, Admission)

# GRE distribution within each GPA group
par(mfrow = c(1, 1))
boxplot(GRE ~ GPA35, names = c("GPA<3.5", "GPA>=3.5"), xlab = "GPA")

# Linear model with Gaussian errors (for comparison; the response is
# binary, so the Gaussian model is not really appropriate)
fit.lm <- lm(Admission ~ GRE + GPA35)

# Logistic regression: binomial family with logit link
fit.glm <- glm(Admission ~ GRE + GPA35, family = binomial(link = "logit"))

# Compare the fitted probabilities from the two models against GRE
par(mfrow = c(1, 1))
plot(GRE, fit.glm$fitted, xlab = "GRE", ylab = "Fitted",
     ylim = c(0, 0.6), col = GPA35 + 1, pch = 16)
abline(h = mean(Admission), col = 4)  # overall admission rate
points(GRE, fit.lm$fitted, col = GPA35 + 1)
legend("topleft",
       legend = c("Binomial", "LM (GPA<3.5)", "LM (GPA>=3.5)",
                  "GLM (GPA<3.5)", "GLM (GPA>=3.5)"),
       col = c(4, 1, 2, 1, 2), pch = c(16, 1, 1, 16, 16))

# Monte Carlo draws from the prior:
#   beta0 ~ N(-2, 1), beta1 ~ N(0, 1), beta2 ~ N(0, 1)
M <- 1000000
beta0 <- rnorm(M, -2, 1)
beta1 <- rnorm(M, 0, 1)
beta2 <- rnorm(M, 0, 1)

# Prior draws of the linear predictor for every observed case:
# eta[m, i] corresponds to prior draw m evaluated at case i.
eta <- matrix(0, M, n)
for (i in seq_len(n)) {
  eta[, i] <- beta0 + beta1 * GRE[i] + beta2 * GPA35[i]
}
# Inverse-logit transform to prior success probabilities
fit.prior <- 1 / (1 + exp(-eta))

# 5%, 50% and 95% prior quantiles of the fitted probability per case
qfit.prior <- t(apply(fit.prior, 2, quantile, c(0.05, 0.5, 0.95)))
par(mfrow = c(1, 1))
# Prior predictive fits: median with 5%/95% bands against GRE.
# BUG FIX: the original used ylim = range(qfit), but no object named
# `qfit` exists anywhere in the script; the intended object is qfit.prior.
plot(GRE, qfit.prior[, 2], ylim = range(qfit.prior), main = "Prior", ylab = "Fit")
points(GRE, qfit.prior[, 1], col = 2)
points(GRE, qfit.prior[, 3], col = 2)

# Log-likelihood of each prior draw, plus its fitted probabilities.
# (The original also allocated an unused vector `weight`; removed.)
like <- rep(0, M)
fit.post <- matrix(0, M, n)
for (i in seq_len(M)) {
  prob <- beta0[i] + beta1[i] * GRE + beta2[i] * GPA35
  prob <- 1 / (1 + exp(-prob))  # inverse logit
  like[i] <- sum(dbinom(Admission, 1, prob, log = TRUE))
  fit.post[i, ] <- prob
}
# Stabilize numerically: subtract the maximum log-likelihood before
# exponentiating, so the largest weight is exactly 1.
like <- exp(like - max(like))

# Sampling importance resampling (SIR): resample M/100 prior draws with
# probability proportional to the likelihood to approximate the posterior.
M1 <- M / 100
ind <- sample(seq_len(M), size = M1, replace = TRUE, prob = like)
qfit.post <- t(apply(fit.post[ind, ], 2, quantile, c(0.05, 0.5, 0.95)))
beta0p <- beta0[ind]
beta1p <- beta1[ind]
beta2p <- beta2[ind]

# Histograms of the coefficients: priors (top row) and posteriors
# (bottom row); red line at 0, blue dot marks the glm point estimate.
par(mfrow = c(2, 3))
hist(beta0, prob = TRUE, xlab = "", main = expression(beta[0]), ylab = "Prior density")
hist(beta1, prob = TRUE, xlab = "", main = expression(beta[1]), ylab = "Prior density")
hist(beta2, prob = TRUE, xlab = "", main = expression(beta[2]), ylab = "Prior density")
hist(beta0p, xlim = range(beta0), prob = TRUE, xlab = "",
     main = expression(beta[0]), ylab = "Posterior density")
abline(v = 0, col = 2)
points(fit.glm$coef[1], 0, pch = 16, cex = 2, col = 4)
hist(beta1p, prob = TRUE, xlab = "", main = expression(beta[1]),
     ylab = "Posterior density")
abline(v = 0, col = 2)
points(fit.glm$coef[2], 0, pch = 16, cex = 2, col = 4)
hist(beta2p, xlim = range(beta2), prob = TRUE, xlab = "",
     main = expression(beta[2]), ylab = "Posterior density")
abline(v = 0, col = 2)
points(fit.glm$coef[3], 0, pch = 16, cex = 2, col = 4)

# Pairwise scatterplots: top row overlays posterior (red) on a prior
# subsample (black); bottom row shows the posterior alone with the glm
# estimate as a big blue dot.
ind1 <- sample(seq_len(M), size = M1, replace = TRUE)
par(mfrow = c(2, 3))
plot(beta0[ind1], beta1[ind1])
points(beta0p, beta1p, col = 2)
plot(beta0[ind1], beta2[ind1])
points(beta0p, beta2p, col = 2)
plot(beta1[ind1], beta2[ind1])
points(beta1p, beta2p, col = 2)
plot(beta0p, beta1p, col = 2)
points(fit.glm$coef[1], fit.glm$coef[2], col = 4, pch = 16, cex = 2)
plot(beta0p, beta2p, col = 2)
points(fit.glm$coef[1], fit.glm$coef[3], col = 4, pch = 16, cex = 2)
plot(beta1p, beta2p, col = 2)
points(fit.glm$coef[2], fit.glm$coef[3], col = 4, pch = 16, cex = 2)

# Prior vs posterior fitted probabilities as functions of GRE
par(mfrow = c(1, 2))
plot(GRE, qfit.prior[, 2], ylim = c(0, 1), main = "Prior", ylab = "Fit")
points(GRE, qfit.prior[, 1], col = 2)
points(GRE, qfit.prior[, 3], col = 2)
plot(GRE, qfit.post[, 2], ylim = c(0, 1), main = "Posterior", ylab = "Fit",
     col = GPA35 + 1, pch = 16)
# Overlay the 5% and 95% posterior quantile bands on the posterior fit
# panel, colored by GPA group, add the legend, and close the PDF device.
band.cols <- GPA35 + 1
for (q.col in c(1, 3)) {
  points(GRE, qfit.post[, q.col], col = band.cols)
}
legend("topleft",
       legend = c("Bayes GLM (GPA<3.5)", "Bayes GLM (GPA>=3.5)"),
       col = 1:2, pch = 16)
dev.off()