Data: WNBA/NBA player heights, 2013-2014 season

# Read the 2013-2014 NBA and WNBA rosters (heights recorded in inches) and
# build the combined data set used below:
#   y = league indicator (0 = WNBA, 1 = NBA)
#   x = height in centimeters, centered at loc = 190 cm
nba  = read.csv("http://hedibert.org/wp-content/uploads/2016/03/height-weight-nba.csv")
wnba = read.csv("http://hedibert.org/wp-content/uploads/2016/03/height-weight-wnba.csv")
n1 = nrow(wnba)
n2 = nrow(nba)
n  = n1 + n2
y  = rep(c(0, 1), c(n1, n2))
# Height columns differ between the two files (4th for WNBA, 3rd for NBA).
x   = c(wnba[, 4], nba[, 3])*2.54   # inches -> centimeters
loc = 190
x   = x - loc

# Left: boxplots of raw heights by league.  Right: jittered scatter of the
# centered heights against the league indicator.
par(mfrow=c(1,2))
boxplot((x+loc)~y,horizontal=TRUE,col=2:3,names=c("WNBA","NBA"),ylab="League",xlab="Height (in cm)")
plot(x,y+rnorm(n,0,0.05),xlab="Height (in cm)",ylab="League",col=y+2,axes=FALSE)
axis(2,at=c(0,1),lab=c("WNBA (0)","NBA (1)"));box()
axis(1,at=seq(-30,30,by=10),lab=loc+seq(-30,30,by=10))

Bayesian inference

# Log-likelihood of the Bernoulli regression with logit link.
#   gamma: c(intercept, slope)
#   xdata, ydata: covariate and 0/1 response; default to the globals x and y
#   so existing calls loglike1(gamma) keep working unchanged.
# Uses plogis(eta, log.p = TRUE) = log(1/(1+exp(-eta))) and
# plogis(-eta, log.p = TRUE) = log(1 - theta), computed on the log scale, so
# extreme |eta| cannot round theta to 0 or 1 and yield log(0) = -Inf.
loglike1 = function(gamma, xdata = x, ydata = y){
  eta = gamma[1] + gamma[2]*xdata
  return(sum(ydata*plogis(eta, log.p = TRUE)) +
         sum((1 - ydata)*plogis(-eta, log.p = TRUE)))
}
# Log-likelihood of the Bernoulli regression with probit link.
#   gamma: c(intercept, slope)
#   xdata, ydata: covariate and 0/1 response; default to the globals x and y
#   so existing calls loglike2(gamma) keep working unchanged.
# pnorm(eta, log.p = TRUE) gives log(Phi(eta)) and lower.tail = FALSE gives
# log(1 - Phi(eta)) directly on the log scale, avoiding log(0) = -Inf for
# extreme |eta| (where pnorm rounds to 0 or 1 in double precision).
loglike2 = function(gamma, xdata = x, ydata = y){
  eta = gamma[1] + gamma[2]*xdata
  return(sum(ydata*pnorm(eta, log.p = TRUE)) +
         sum((1 - ydata)*pnorm(eta, lower.tail = FALSE, log.p = TRUE)))
}


# Sampling-importance-resampling (SIR) setup: M draws from independent
# uniform priors on (alpha, beta) for each link; N is the resample size.
M = 100000
N = 0.2*M
# Uniform prior supports — presumably chosen around point estimates computed
# in an earlier chunk (TODO confirm against the full document).
alpha.draw1 = runif(M, 0.35, 1.3)
beta.draw1  = runif(M, 0.07, 0.34)
alpha.draw2 = runif(M, 0.1, 0.8)
beta.draw2  = runif(M, 0, 0.2)
# Log importance weights = log-likelihood of each prior draw.
# vapply(seq_len(M), ...) replaces the preallocate-and-index loop and is
# type-safe (always returns a numeric vector of length M).
w1 = vapply(seq_len(M),
            function(i) loglike1(c(alpha.draw1[i], beta.draw1[i])),
            numeric(1))
w2 = vapply(seq_len(M),
            function(i) loglike2(c(alpha.draw2[i], beta.draw2[i])),
            numeric(1))
summary(w1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -259.1  -209.8  -201.7  -205.5  -196.9  -192.6
summary(w2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -417.6  -234.8  -208.1  -226.0  -198.6  -190.7
# Log predictive densities via the log-sum-exp trick.  The log-weights sit
# near -200, so exp(w) is ~1e-90; a direct log(sum(exp(w))) is numerically
# fragile and underflows to log(0) = -Inf once weights drop below ~ -745.
# Subtracting the max first keeps everything in range.  (The common -log(M)
# normalizer is omitted, as before; it cancels in the Bayes factor.)
mx1 = max(w1)
mx2 = max(w2)
logpred1 = mx1 + log(sum(exp(w1 - mx1)))
logpred2 = mx2 + log(sum(exp(w2 - mx2)))
# Unnormalized importance weights (sample() normalizes prob internally),
# then SIR: resample N of the M prior draws with weight-proportional
# probability to obtain approximate posterior draws.
w1 = exp(w1 - mx1)
w2 = exp(w2 - mx2)
k1 = sample(seq_len(M), size = N, replace = TRUE, prob = w1)
k2 = sample(seq_len(M), size = N, replace = TRUE, prob = w2)
alpha.post1 = alpha.draw1[k1]
beta.post1  = beta.draw1[k1]
alpha.post2 = alpha.draw2[k2]
beta.post2  = beta.draw2[k2]

Posterior summaries

# Scatterplots of the SIR posterior draws (alpha, beta), one panel per link.
par(mfrow=c(1,2))
plot(alpha.post1,beta.post1,xlab="",ylab="")
# NOTE(review): gamma1 and gamma2 are not defined in this chunk — presumably
# the point estimates (MLEs) of (alpha, beta) computed in an earlier chunk of
# the document; verify before running this chunk standalone.
points(gamma1[1],gamma1[2],pch=16,col=2,cex=2)
title("Joint posterior\nLogit link")
mtext(side=1,expression(alpha),line=3,cex=2)
mtext(side=2,expression(beta),line=2,cex=2)

plot(alpha.post2,beta.post2,xlab="",ylab="")
points(gamma2[1],gamma2[2],pch=16,col=2,cex=2)
title("Joint posterior\nProbit link")
mtext(side=1,expression(alpha),line=3,cex=2)
mtext(side=2,expression(beta),line=2,cex=2)

Posterior predictive

# Pointwise posterior predictive bands on a height grid: for each grid point,
# the 2.5%, 50% and 97.5% quantiles of P(NBA | height) across posterior draws.
N = 1000
h = seq(min(x), max(x), length = N)
qs = c(0.025, 0.5, 0.975)
bayesfit1 = matrix(0, N, 3)
bayesfit2 = matrix(0, N, 3)
for (j in seq_len(N)) {
  p.logit  = 1/(1 + exp(-alpha.post1 - beta.post1*h[j]))
  p.probit = pnorm(alpha.post2 + beta.post2*h[j])
  bayesfit1[j, ] = quantile(p.logit, qs)
  bayesfit2[j, ] = quantile(p.probit, qs)
}

# Jittered data plus posterior predictive median (solid) and 95% band
# (dashed) for each link; the title reports the Bayes factor probit/logit
# computed from the log predictive densities.
par(mfrow=c(1,1))
plot(x,y+rnorm(n,0,0.05),xlab="Height (in cm)",ylab="Probability of NBA",col=y+2,cex=0.75,axes=FALSE)
axis(2,at=c(0,1),lab=c("WNBA (0)","NBA (1)"));box()
# x-axis labels translate back to raw heights (x is centered at loc).
axis(1,at=seq(-30,30,by=10),lab=loc+seq(-30,30,by=10))
abline(h=0.5,lty=2)
lines(h,bayesfit1[,2],col=5,lwd=2)
lines(h,bayesfit1[,1],col=5,lwd=2,lty=2)
lines(h,bayesfit1[,3],col=5,lwd=2,lty=2)
lines(h,bayesfit2[,2],col=6,lwd=2)
lines(h,bayesfit2[,1],col=6,lwd=2,lty=2)
lines(h,bayesfit2[,3],col=6,lwd=2,lty=2)
legend("topleft",legend=c("Bayes GLM (logit)","Bayes GLM (probit)"),col=5:6,lty=1,lwd=2,bty="n")
title(paste("Bayes factor probit/logit = ",round(exp(logpred2-logpred1),3),sep=""))

Misclassification errors

# For a grid of cutoff probabilities, find the smallest grid height whose
# median fitted P(NBA) exceeds the cutoff, classify everyone above it as NBA,
# and record the two per-league misclassification rates for each link.
probs = seq(0.5, 0.9, by = 0.01)
L = length(probs)
cutoff1 = rep(0, L); error11 = rep(0, L); error21 = rep(0, L)
cutoff2 = rep(0, L); error12 = rep(0, L); error22 = rep(0, L)
for (i in seq_len(L)) {
  cutoff1[i] = h[bayesfit1[, 2] > probs[i]][1]
  cutoff2[i] = h[bayesfit2[, 2] > probs[i]][1]
  error11[i] = mean(x > cutoff1[i] & y == 0)   # WNBA player classified NBA
  error21[i] = mean(x < cutoff1[i] & y == 1)   # NBA player classified WNBA
  error12[i] = mean(x > cutoff2[i] & y == 0)
  error22[i] = mean(x < cutoff2[i] & y == 1)
}

# Common y-range across all six error curves so both links share one scale.
# Renamed from `range` to err.range: the original name shadowed base::range
# (the very function used to compute it) for the rest of the session.
err.range = range(error11, error21, error11+error21,
                  error12, error22, error12+error22)

plot(probs, error11, ylim = err.range, pch = 16, cex = 2,
     ylab = "Classification error", xlab = "Cutoff probability")
points(probs,error21,col=grey(0.5),pch=16,cex=2)
points(probs,error11+error21,col=grey(0.85),pch=16,cex=2)
points(probs,error12,col=2,pch=16,cex=1)
points(probs,error22,col=3,pch=16,cex=1)
points(probs,error12+error22,col=6,pch=16,cex=1)

title("If individual probability > cutoff probability => classify as NBA")
legend("topleft",legend=c("Misclassifying WNBA player (logit)","Misclassifying NBA player (logit)","Misclassification error (logit)","Misclassifying WNBA player (probit)","Misclassifying NBA player (probit)","Misclassification error (probit)"),col=c(1,grey(0.5),grey(0.85),2,3,6),pch=16,bty="n")

Cut-off probability equal to \(0.8\)

# One fitted-probability curve per POSTERIOR draw (not per prior draw).
N = 1000
h = seq(min(x), max(x), length = N)
# BUG FIX: the matrices were sized N x M with M = 100000, but the posterior
# samples have length 0.2*M = 20000 (the SIR resample size).  Because 20000
# divides 100000 exactly, row assignment silently recycled every draw five
# times — 5x the memory and compute for identical summaries.  Size by the
# actual number of posterior draws instead.
ndraw = length(alpha.post1)
bayesfit1 = matrix(0, N, ndraw)
bayesfit2 = matrix(0, N, ndraw)
for (j in seq_len(N)) {
  bayesfit1[j, ] = 1/(1 + exp(-alpha.post1 - beta.post1*h[j]))
  bayesfit2[j, ] = pnorm(alpha.post2 + beta.post2*h[j])
}

prob = 0.8

# Logit model: per-draw cutoff height and the two misclassification rates.
error1 = rep(0, ndraw)
error2 = rep(0, ndraw)
for (i in seq_len(ndraw)) {
  cutoff = h[bayesfit1[, i] > prob][1]
  error1[i] = mean((x > cutoff) & (y == 0))
  error2[i] = mean((x < cutoff) & (y == 1))
}
tab1    = rbind(summary(error1), summary(error2), summary(error1 + error2))
errors1 = cbind(error1, error2)

# Probit model: same computation with the probit fitted curves.
error1 = rep(0, ndraw)
error2 = rep(0, ndraw)
for (i in seq_len(ndraw)) {
  cutoff = h[bayesfit2[, i] > prob][1]
  error1[i] = mean((x > cutoff) & (y == 0))
  error2[i] = mean((x < cutoff) & (y == 1))
}
tab2    = rbind(summary(error1), summary(error2), summary(error1 + error2))
errors2 = cbind(error1, error2)

Misclassifying WNBA players

# Logit & probit: WNBA sample size, then each model's posterior mean
# probability of misclassifying a WNBA player as NBA (column "error1").
c(n1, mean(errors1[, "error1"]), mean(errors2[, "error1"]))
## [1] 139.00000000   0.03822003   0.03165854

Misclassifying NBA players

# Logit & probit: NBA sample size, then each model's posterior mean
# probability of misclassifying an NBA player as WNBA (column "error2").
c(n2, mean(errors1[, "error2"]), mean(errors2[, "error2"]))
## [1] 505.0000000   0.1673506   0.1804806

Overall misclassification

# Logit & probit: total sample size, then each model's posterior mean overall
# misclassification rate (sum of the two per-league rates, per draw).
c(n, mean(rowSums(errors1)), mean(rowSums(errors2)))
## [1] 644.0000000   0.2055707   0.2121391