Data: WNBA/NBA 2013-2014
# Read 2013-2014 height-weight data for NBA and WNBA players
nba = read.csv("http://hedibert.org/wp-content/uploads/2016/03/height-weight-nba.csv")
wnba = read.csv("http://hedibert.org/wp-content/uploads/2016/03/height-weight-wnba.csv")
n1 = nrow(wnba)
n2 = nrow(nba)
n = n1+n2
# Binary response: 0 = WNBA, 1 = NBA
y = rep(0,n)
y[(n1+1):n] = 1
# Covariate: height, converted from inches to centimeters
x = rep(0,n)
x[1:n1]=wnba[,4]*2.54
x[(n1+1):n]=nba[,3]*2.54
# Center heights at loc = 190 cm for interpretability and numerical stability
loc = 190
x = x-loc
# Boxplots and jittered 0/1 scatter of height by league
par(mfrow=c(1,2))
boxplot((x+loc)~y,horizontal=TRUE,col=2:3,names=c("WNBA","NBA"),ylab="League",xlab="Height (in cm)")
plot(x,y+rnorm(n,0,0.05),xlab="Height (in cm)",ylab="League",col=y+2,axes=FALSE)
axis(2,at=c(0,1),lab=c("WNBA (0)","NBA (1)"));box()
axis(1,at=seq(-30,30,by=10),lab=loc+seq(-30,30,by=10))
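A numeric counterpart to the plots, as a quick sanity check (not in the original script):
# Height summaries on the original cm scale, by league (0 = WNBA, 1 = NBA)
tapply(x+loc,y,summary)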
GLM fits - logit and probit links
summary(glm(y~x,family=binomial(link="logit")))
##
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"))
##
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.66884   0.08479   0.18531   0.39997   2.15786  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.81216    0.12932    6.28 3.39e-10 ***
## x            0.20609    0.01817   11.34  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 671.81 on 643 degrees of freedom
## Residual deviance: 385.20 on 642 degrees of freedom
## AIC: 389.2
##
## Number of Fisher Scoring iterations: 6
summary(glm(y~x,family=binomial(link="probit")))
##
## Call:
## glm(formula = y ~ x, family = binomial(link = "probit"))
##
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.76266   0.03649   0.14439   0.40400   2.13939  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 0.460769   0.073206   6.294 3.09e-10 ***
## x           0.117658   0.009455  12.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 671.81 on 643 degrees of freedom
## Residual deviance: 381.49 on 642 degrees of freedom
## AIC: 385.49
##
## Number of Fisher Scoring iterations: 7
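Under the logit fit, each extra centimeter of height multiplies the odds of being an NBA player by roughly \(e^{0.206} \approx 1.23\), and both fitted curves cross probability 0.5 near the same height. A quick check from the coefficients above:
# Odds ratio per cm (logit) and the heights where P(NBA) = 0.5
exp(0.20609)
loc - 0.81216/0.20609     # logit:  about 186.1 cm
loc - 0.460769/0.117658   # probit: about 186.1 cm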
# MLE coefficients and fitted probability curves under each link
gamma1 = glm(y~x,family=binomial(link="logit"))$coef
gamma2 = glm(y~x,family=binomial(link="probit"))$coef
h = seq(min(x),max(x),length=100)
fit1 = 1/(1+exp(-gamma1[1]-gamma1[2]*h))   # logistic cdf
fit2 = pnorm(gamma2[1]+gamma2[2]*h)        # standard normal cdf
par(mfrow=c(1,1))
plot(x,y+rnorm(n,0,0.05),xlab="Height (in cm)",ylab="Probability of NBA",col=y+2,cex=0.75,axes=FALSE)
axis(2,at=c(0,1),lab=c("WNBA (0)","NBA (1)"));box()
axis(1,at=seq(-30,30,by=10),lab=loc+seq(-30,30,by=10))
abline(lm(y~x),col=4,lwd=2)   # linear probability model, for comparison
abline(h=0.5,lty=2)
lines(h,fit1,col=5,lwd=2)
lines(h,fit2,col=6,lwd=2)
legend("topleft",legend=c("LM fit","GLM (logit)","GLM (probit)"),col=4:6,lty=1,lwd=2,bty="n")
Bayesian inference
# Bernoulli log-likelihood under the logit link
loglike1 = function(gamma){
  theta = 1/(1+exp(-gamma[1]-gamma[2]*x))
  return(sum(y*log(theta))+sum((1-y)*log(1-theta)))
}
# Bernoulli log-likelihood under the probit link
loglike2 = function(gamma){
  theta = pnorm(gamma[1]+gamma[2]*x)
  return(sum(y*log(theta))+sum((1-y)*log(1-theta)))
}
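Both functions evaluate the Bernoulli log-likelihood
\[
\log p(y \mid \gamma) = \sum_{i=1}^{n}\left\{ y_i \log \theta_i + (1-y_i)\log(1-\theta_i)\right\},
\]
with \(\theta_i = 1/(1+e^{-\gamma_1-\gamma_2 x_i})\) under the logit link and \(\theta_i = \Phi(\gamma_1+\gamma_2 x_i)\) under the probit link.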
# Monte Carlo setup: M prior draws, of which N are resampled for the posterior
M = 100000
N = 0.2*M
# Independent uniform priors, with ranges that bracket the MLEs above
alpha.draw1 = runif(M,0.35,1.3)
beta.draw1 = runif(M,0.07,0.34)
alpha.draw2 = runif(M,0.1,0.8)
beta.draw2 = runif(M,0,0.2)
# Log-likelihood of each prior draw (unnormalized log importance weights)
w1 = rep(0,M)
w2 = rep(0,M)
for (i in 1:M){
  w1[i] = loglike1(c(alpha.draw1[i],beta.draw1[i]))
  w2[i] = loglike2(c(alpha.draw2[i],beta.draw2[i]))
}
summary(w1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -259.1  -209.8  -201.7  -205.5  -196.9  -192.6 
summary(w2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -417.6  -234.8  -208.1  -226.0  -198.6  -190.7 
# Log predictives (log marginal likelihoods up to the common constant -log(M))
logpred1 = log(sum(exp(w1)))
logpred2 = log(sum(exp(w2)))
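Both quantities omit the common constant \(-\log M\) from the Monte Carlo estimate \(p(y) \approx M^{-1}\sum_{i=1}^{M} e^{w_i}\), which cancels in the Bayes factor computed later. Direct exponentiation is safe here since the log-likelihoods sit near \(-200\), but a log-sum-exp version is more robust in general; a minimal sketch, with logsumexp an illustrative helper:
logsumexp = function(w) max(w) + log(sum(exp(w-max(w))))
# logpred1 = logsumexp(w1); logpred2 = logsumexp(w2)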
# Sampling-importance-resampling: exponentiate the stabilized log weights and
# resample N draws with probability proportional to the weights
w1 = exp(w1-max(w1))
w2 = exp(w2-max(w2))
k1 = sample(1:M,size=N,replace=TRUE,prob=w1)
k2 = sample(1:M,size=N,replace=TRUE,prob=w2)
alpha.post1 = alpha.draw1[k1]
beta.post1 = beta.draw1[k1]
alpha.post2 = alpha.draw2[k2]
beta.post2 = beta.draw2[k2]
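A diagnostic worth adding (not in the original script) is the effective sample size of the normalized importance weights; values far below M indicate that the uniform proposal covers the posterior poorly. The helper ess below is illustrative:
# Effective sample size of the normalized importance weights
ess = function(w){ w = w/sum(w); 1/sum(w^2) }
c(ess(w1),ess(w2))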
Posterior summaries
# Resampled posterior draws; the red dot marks the MLE from the GLM fit
par(mfrow=c(1,2))
plot(alpha.post1,beta.post1,xlab="",ylab="")
points(gamma1[1],gamma1[2],pch=16,col=2,cex=2)
title("Joint posterior\nLogit link")
mtext(side=1,expression(alpha),line=3,cex=2)
mtext(side=2,expression(beta),line=2,cex=2)
plot(alpha.post2,beta.post2,xlab="",ylab="")
points(gamma2[1],gamma2[2],pch=16,col=2,cex=2)
title("Joint posterior\nProbit link")
mtext(side=1,expression(alpha),line=3,cex=2)
mtext(side=2,expression(beta),line=2,cex=2)
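Numeric marginal summaries complement the scatter plots; a small sketch (the object name post.sum and its column labels are mine):
# Posterior means and standard deviations of (alpha,beta) under each link
post.sum = rbind(logit  = c(mean(alpha.post1),sd(alpha.post1),mean(beta.post1),sd(beta.post1)),
                 probit = c(mean(alpha.post2),sd(alpha.post2),mean(beta.post2),sd(beta.post2)))
colnames(post.sum) = c("E(alpha)","SD(alpha)","E(beta)","SD(beta)")
round(post.sum,4)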
Posterior predictive
# Posterior predictive curves over a height grid: pointwise 2.5%, 50% and
# 97.5% quantiles (note that N is reused here as the grid size)
N = 1000
h = seq(min(x),max(x),length=N)
bayesfit1 = matrix(0,N,3)
bayesfit2 = matrix(0,N,3)
for (j in 1:N){
  bayesfit1[j,] = quantile(1/(1+exp(-alpha.post1-beta.post1*h[j])),c(0.025,0.5,0.975))
  bayesfit2[j,] = quantile(pnorm(alpha.post2+beta.post2*h[j]),c(0.025,0.5,0.975))
}
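The same posterior draws also give the predictive probability at any single height; for instance, at 200 cm (an arbitrary illustrative value):
# Posterior quantiles of P(NBA | height = 200 cm) under each link
h0 = 200 - loc
quantile(1/(1+exp(-alpha.post1-beta.post1*h0)),c(0.025,0.5,0.975))
quantile(pnorm(alpha.post2+beta.post2*h0),c(0.025,0.5,0.975))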
par(mfrow=c(1,1))
plot(x,y+rnorm(n,0,0.05),xlab="Height (in cm)",ylab="Probability of NBA",col=y+2,cex=0.75,axes=FALSE)
axis(2,at=c(0,1),lab=c("WNBA (0)","NBA (1)"));box()
axis(1,at=seq(-30,30,by=10),lab=loc+seq(-30,30,by=10))
abline(h=0.5,lty=2)
lines(h,bayesfit1[,2],col=5,lwd=2)
lines(h,bayesfit1[,1],col=5,lwd=2,lty=2)
lines(h,bayesfit1[,3],col=5,lwd=2,lty=2)
lines(h,bayesfit2[,2],col=6,lwd=2)
lines(h,bayesfit2[,1],col=6,lwd=2,lty=2)
lines(h,bayesfit2[,3],col=6,lwd=2,lty=2)
legend("topleft",legend=c("Bayes GLM (logit)","Bayes GLM (probit)"),col=5:6,lty=1,lwd=2,bty="n")
title(paste("Bayes factor probit/logit = ",round(exp(logpred2-logpred1),3),sep=""))
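Since both marginal likelihood estimates drop the same \(-\log M\) constant, their ratio is unaffected; values above 1 favor the probit link. The number in the title can be checked directly:
round(exp(logpred2-logpred1),3)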
Misclassification errors
# For each cutoff probability, take the smallest grid height whose posterior
# median fitted probability exceeds the cutoff, then compute both
# misclassification rates at that height threshold
probs = seq(0.5,0.9,by=0.01)
L = length(probs)
cutoff1 = rep(0,L)
error11 = rep(0,L)
error21 = rep(0,L)
cutoff2 = rep(0,L)
error12 = rep(0,L)
error22 = rep(0,L)
for (i in 1:L){
  prob = probs[i]
  cutoff1[i] = h[bayesfit1[,2]>prob][1]
  error11[i] = mean((x>cutoff1[i])&(y==0))   # WNBA player classified as NBA
  error21[i] = mean((x<cutoff1[i])&(y==1))   # NBA player classified as WNBA
  cutoff2[i] = h[bayesfit2[,2]>prob][1]
  error12[i] = mean((x>cutoff2[i])&(y==0))
  error22[i] = mean((x<cutoff2[i])&(y==1))
}
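Because x is centered, adding loc converts the cutoffs back to heights in cm; a quick look at three representative cutoff probabilities (the row selection is illustrative):
# Implied height thresholds at cutoff probabilities 0.5, 0.8 and 0.9
cbind(prob=probs,logit=cutoff1+loc,probit=cutoff2+loc)[c(1,31,41),]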
err.range = range(error11,error21,error11+error21,error12,error22,error12+error22)
plot(probs,error11,ylim=err.range,pch=16,cex=2,ylab="Classification error",
     xlab="Cutoff probability")
points(probs,error21,col=grey(0.5),pch=16,cex=2)
points(probs,error11+error21,col=grey(0.85),pch=16,cex=2)
points(probs,error12,col=2,pch=16,cex=1)
points(probs,error22,col=3,pch=16,cex=1)
points(probs,error12+error22,col=6,pch=16,cex=1)
title("If individual probability > cutoff probability => classify as NBA")
legend("topleft",legend=c("Misclassifying WNBA player (logit)","Misclassifying NBA player (logit)","Misclassification error (logit)","Misclassifying WNBA player (probit)","Misclassifying NBA player (probit)","Misclassification error (probit)"),col=c(1,grey(0.5),grey(0.85),2,3,6),pch=16,bty="n")
Cut-off probability equal to \(0.8\)
# Posterior draws of the fitted curve at every grid height. The matrices get
# one column per posterior draw (M1 = 0.2*M resampled draws), not per prior
# draw; the original M columns only worked through R's silent recycling.
M1 = length(alpha.post1)
N = 1000
h = seq(min(x),max(x),length=N)
bayesfit1 = matrix(0,N,M1)
bayesfit2 = matrix(0,N,M1)
for (j in 1:N){
  bayesfit1[j,] = 1/(1+exp(-alpha.post1-beta.post1*h[j]))
  bayesfit2[j,] = pnorm(alpha.post2+beta.post2*h[j])
}
prob = 0.8
# Logit model: cutoff height and misclassification rates for each posterior draw
error1 = rep(0,M1)
error2 = rep(0,M1)
for (i in 1:M1){
  cutoff = h[bayesfit1[,i]>prob][1]
  error1[i] = mean((x>cutoff)&(y==0))   # WNBA player classified as NBA
  error2[i] = mean((x<cutoff)&(y==1))   # NBA player classified as WNBA
}
tab1 = rbind(summary(error1),summary(error2),summary(error1+error2))
errors1 = cbind(error1,error2)
# Probit model: cutoff height and misclassification rates for each posterior draw
error1 = rep(0,M1)
error2 = rep(0,M1)
for (i in 1:M1){
  cutoff = h[bayesfit2[,i]>prob][1]
  error1[i] = mean((x>cutoff)&(y==0))
  error2[i] = mean((x<cutoff)&(y==1))
}
tab2 = rbind(summary(error1),summary(error2),summary(error1+error2))
errors2 = cbind(error1,error2)
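The objects tab1 and tab2 hold posterior summaries of the three error rates but are never printed above; to display them (the row labels are mine):
rownames(tab1) = rownames(tab2) = c("WNBA error","NBA error","overall error")
tab1
tab2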
Misclassifying WNBA players
# Number of WNBA players; posterior mean error rate under logit and probit
c(n1,mean(errors1[,1]),mean(errors2[,1]))
## [1] 139.00000000 0.03822003 0.03165854
Misclassifying NBA players
# Number of NBA players; posterior mean error rate under logit and probit
c(n2,mean(errors1[,2]),mean(errors2[,2]))
## [1] 505.0000000 0.1673506 0.1804806
Overall misclassification
# Total number of players; posterior mean overall error under logit and probit
c(n,mean(errors1[,1]+errors1[,2]),mean(errors2[,1]+errors2[,2]))
## [1] 644.0000000 0.2055707 0.2121391
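Treating the two posterior samples as independent, the elementwise difference of overall errors gives a rough posterior comparison of the two links; a sketch (the name d.err is mine):
# Posterior of the difference in overall misclassification (logit minus probit)
d.err = (errors1[,1]+errors1[,2])-(errors2[,1]+errors2[,2])
quantile(d.err,c(0.025,0.5,0.975))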