############################################################################################# # # Cribari-Neto and Zeiles (2010) Beta Regression in R # Journal of Statistical Software, Volume 34, Issue 2. # https://www.jstatsoft.org/article/view/v034i02 # # Section 5.1. Dyslexia and IQ predicting reading accuracy # We consider an application that analyzes reading accuracy data for # nondyslexic and dyslexic Australian children (Smithson and Verkuilen 2006). # The variable of interest is accuracy providing the scores on a test of reading # accuracy taken by 44 children, which is predicted by the two regressors dyslexia # (a factor with sum contrasts separating a dyslexic and a control group) and # intelligent quotient (iq, converted to z scores), see Figure 6 for a visualization. # The sample includes 19 dyslexics and 25 controls who were recruited from # primary schools in the Australian Capital Territory. The children’s ages ranged # from eight years five months to twelve years three months; mean reading # accuracy was 0.606 for dyslexic readers and 0.900 for controls. 
#############################################################################################
#
# Additional reference:
# Ferrari and Cribari-Neto (2004) Beta Regression for Modelling Rates and Proportions,
# Journal of Applied Statistics, 31:7, 799-815, DOI: 10.1080/0266476042000214501
#
#############################################################################################

# Install betareg only when missing: avoids a network call on every run
if (!requireNamespace("betareg", quietly = TRUE)) {
  install.packages("betareg")
}
library("betareg")
library("lmtest")

pdf("betaregression.pdf", width = 15, height = 10)

############################################################
# Reading the data and creating dependent and independent variables
############################################################
data("ReadingSkills", package = "betareg")
y <- ReadingSkills$accuracy      # reading accuracy score, in (0, 1)
x <- ReadingSkills$iq            # IQ converted to z-scores
dys <- ReadingSkills$dyslexia    # factor with levels "no"/"yes"
d <- as.numeric(dys == "yes")    # 0/1 dummy: 1 = dyslexic

par(mfrow = c(1, 1))
plot(x, y, col = d + 1, xlab = "IQ score", ylab = "Reading accuracy", pch = 16)
legend("topleft", legend = c("control", "dyslexic"), col = 1:2, pch = 16)

#################################################################
# Maximum likelihood estimation (approximation based on logit transformation)
#################################################################
# OLS on the logit of y approximates the beta-regression mean equation
y1 <- log(y / (1 - y))
ols <- lm(y1 ~ x + d + x * d)
summary(ols)

# Fitted mean curves on a 100-point IQ grid, one per group
xxx <- seq(min(x), max(x), length = 100)
fit1 <- ols$coef[1] + ols$coef[2] * xxx                                # control
fit2 <- ols$coef[1] + ols$coef[3] + (ols$coef[2] + ols$coef[4]) * xxx  # dyslexic
# Back-transform from the logit scale to (0, 1)
fit1 <- exp(fit1) / (1 + exp(fit1))
fit2 <- exp(fit2) / (1 + exp(fit2))

par(mfrow = c(1, 1))
plot(x, y, col = d + 1, xlab = "IQ score", ylab = "Reading accuracy", pch = 16)
lines(xxx, fit1, col = 1)
lines(xxx, fit2, col = 2)
legend("topleft", legend = c("control", "dyslexic"), col = 1:2, pch = 16)

#################################################################
# Beta regression: maximum likelihood estimation
#################################################################
# Mean submodel: logit(mu) ~ x * d; precision submodel: log(phi) ~ x + d
beta.fit <- betareg(y ~ x * d | x + d, hessian = TRUE)
coeftest(beta.fit)

beta1 <- beta.fit$coef$mean[1] + beta.fit$coef$mean[2] * xxx
beta2 <- beta.fit$coef$mean[1] + beta.fit$coef$mean[3] +
  (beta.fit$coef$mean[2] + beta.fit$coef$mean[4]) * xxx
beta1 <- exp(beta1) / (1 + exp(beta1))
beta2 <- exp(beta2) / (1 + exp(beta2))

par(mfrow = c(1, 1))
plot(x, y, col = d + 1, xlab = "IQ score", ylab = "Reading accuracy", pch = 1 + d)
lines(xxx, fit1, col = 1, lty = 2)
lines(xxx, fit2, col = 2, lty = 2)
lines(xxx, beta1, col = 1)
lines(xxx, beta2, col = 2)
# FIX: removed a stray empty argument ("col = 1:2,, pch = ...") that silently
# matched legend()'s unused `y` slot in the original call
legend("bottomright", legend = c("control", "dyslexic"), col = 1:2, pch = 1:2)
legend("topleft", legend = c("betareg", "lm"), lty = 1:2)

#################################################################
# Beta regression: Bayesian inference
#################################################################

# Log-likelihood of the beta regression at parameter vector `beta`:
# beta[1:4] parameterize logit(mu) (intercept, x, d, x:d),
# beta[5:7] parameterize log(phi) (intercept, x, d).
# Uses the (mu, phi) parameterization: shape1 = mu*phi, shape2 = (1-mu)*phi.
like <- function(y, x, d, beta) {
  mu <- 1 / (1 + exp(-beta[1] - beta[2] * x - beta[3] * d - beta[4] * x * d))
  phi <- exp(beta[5] + beta[6] * x + beta[7] * d)
  sum(dbeta(y, phi * mu, phi * (1 - mu), log = TRUE))
}

# Draws from the prior: independent N(beta0[i], sd = 0.5) components
beta0 <- c(2, 1, -1.5, -1, 1.5, 1.5, 3.5)
M <- 1000000
M1 <- M / 100
betas <- matrix(0, M, 7)
for (i in seq_len(7)) betas[, i] <- rnorm(M, beta0[i], 0.5)

# Draws from the posterior via SIR (sampling importance resampling);
# row-wise apply replaces the original scalar loop over 1:M
w <- apply(betas, 1, function(b) like(y, x, d, b))
w <- exp(w - max(w))   # subtract max before exp() for numerical stability
ind <- sample(seq_len(M), size = M1, prob = w, replace = TRUE)
betas1 <- betas[ind, ]

# Posterior histograms with the MLE overlaid in red
par(mfrow = c(2, 4))
for (i in 1:4) {
  hist(betas1[, i], main = paste("beta", i, sep = ""), xlab = "", prob = TRUE)
  abline(v = beta.fit$coef$mean[i], col = 2)
}
for (i in 1:3) {
  hist(betas1[, 4 + i], main = paste("beta", 4 + i, sep = ""), xlab = "", prob = TRUE)
  abline(v = beta.fit$coef$precision[i], col = 2)
}
# FIX: spelled out `labels` (original relied on partial matching of `label`)
pairs(betas1[, 1:4],
      labels = c(expression(beta[1]), expression(beta[2]),
                 expression(beta[3]), expression(beta[4])))
pairs(betas1[, 5:7],
      labels = c(expression(beta[5]), expression(beta[6]), expression(beta[7])))

# Posterior predictive analysis along the IQ grid `xxx`
y.draw0 <- matrix(0, M1, 100)  # control group draws
y.draw1 <- matrix(0, M1, 100)  # dyslexic group draws
for (i in seq_len(100)) {
  xx <- xxx[i]
  mu0 <- 1 / (1 + exp(-betas1[, 1] - betas1[, 2] * xx))
  phi0 <- exp(betas1[, 5] + betas1[, 6] * xx)
  mu1 <- 1 / (1 + exp(-(betas1[, 1] + betas1[, 3]) - (betas1[, 2] + betas1[, 4]) * xx))
  phi1 <- exp(betas1[, 5] + betas1[, 7] + betas1[, 6] * xx)
  y.draw0[, i] <- rbeta(M1, mu0 * phi0, (1 - mu0) * phi0)
  y.draw1[, i] <- rbeta(M1, mu1 * phi1, (1 - mu1) * phi1)
}
# Pointwise posterior predictive quartiles for each grid point
qy0 <- t(apply(y.draw0, 2, quantile, c(0.25, 0.5, 0.75)))
qy1 <- t(apply(y.draw1, 2, quantile, c(0.25, 0.5, 0.75)))
# Final comparison plot: data, MLE fits, and posterior predictive summaries
par(mfrow = c(1, 1))
plot(x, y, col = d + 1, xlab = "IQ score", ylab = "Reading accuracy", pch = 1 + d)
# Posterior predictive quartiles (dashed): red = dyslexic, black = control
lines(xxx, qy1[, 1], col = 2, lty = 2)
lines(xxx, qy1[, 2], col = 2, lty = 2)
lines(xxx, qy1[, 3], col = 2, lty = 2)
lines(xxx, qy0[, 1], lty = 2)
lines(xxx, qy0[, 2], lty = 2)
lines(xxx, qy0[, 3], lty = 2)
# Maximum likelihood mean curves (solid, thick)
lines(xxx, beta1, col = 1, lwd = 2)
lines(xxx, beta2, col = 2, lwd = 2)
# Posterior predictive means (green, dashed)
lines(xxx, apply(y.draw0, 2, mean), col = 3, lty = 2)
lines(xxx, apply(y.draw1, 2, mean), col = 3, lty = 2)
# FIX: removed a stray empty argument ("col = 1:2,, pch = ...") that silently
# matched legend()'s unused `y` slot in the original call
legend("bottomright", legend = c("control", "dyslexic"), col = 1:2, pch = 1:2)
legend("right", legend = c("MLE", "Bayes (quartiles)", "Bayes (mean)"),
       lty = c(1, 2, 2), col = c(1, 1, 3))
dev.off()