# Section 8.8 (Modeling Count Data) from Jim Albert and Jingchen Hu's 2020 book # Probability and Bayexsian Modeling, CRC Press, Taylor & Francis Group # Suppose one is interestedin monitoring the popularityof a particular blog # cofusing on baseball analytics. The data in y (below) displays the number of # visitors viewing this blog for 28 days during June of 2019. y = c(95,81,85,100,111,130,113, 92,65,78,96,118,120,104, 91,91,79,106,91,114,110, 98,61,84,96,126,119,90) n = length(y) days = c("fri","sat","sun","mon","tue","wed","thu") t = rep(1:7,4) # Exploratory data analysis par(mfrow=c(2,2)) hist(y,xlab="",main="Number of visitors") boxplot(y,ylab="Number of visitors") ts.plot(y,type="b",xlab="Days",ylab="Number of visitors") abline(v=7.5,lty=2) abline(v=14.5,lty=2) abline(v=21.5,lty=2) plot(t,y,xlab="Day of the week",ylab="Number of visitors",axes=FALSE) axis(2);axis(1,at=1:7,lab=days) # Linear regression vs Poisson regression weekend=rep(0,n) weekend[t<=3]=1 c(mean(y[weekend==0]),sqrt(var((y[weekend==0])))) c(mean(y[weekend==1]),sqrt(var((y[weekend==1])))) par(mfrow=c(1,2)) boxplot(y,ylab="Number of visitors") boxplot(y~weekend,xlab="Type of day",names=c("Weekday","Weekend"), ylab="Number of visitors") summary(lm(y~weekend)) summary(glm(y~weekend,family="poisson")) tab = rbind(c(109.000,109.000-25.667), c(exp(4.69135),exp(4.69135-0.26850))) rownames(tab) = c("Gaussian regression","Poisson regression") colnames(tab) = c("Intercept","Slope") tab ma = 4.7;sda=0.1 mb = -0.3;sdb=0.1 N = 100 alpha = seq(4.5,4.9,length=N) beta = seq(-0.5,0,length=N) loglike = matrix(0,N,N) prior = matrix(0,N,N) posterior = matrix(0,N,N) for (i in 1:N) for (j in 1:N){ lambda = exp(alpha[i]+beta[j]*weekend) loglike[i,j] = sum(dpois(y,lambda,log=TRUE)) prior[i,j] = dnorm(alpha[i],ma,sda)*dnorm(beta[j],mb,sdb) } like = exp(loglike) post = prior*like par(mfrow=c(1,1)) image(alpha,beta,prior,xlab=expression(alpha),ylab=expression(beta)) contour(alpha,beta,like,col=4,drawlabels=FALSE,add=TRUE,xlab=expression(alpha),ylab=expression(beta),lwd=2) contour(alpha,beta,post,col=3,drawlabels=FALSE,add=TRUE,xlab=expression(alpha),ylab=expression(beta),lwd=2) legend("topright",legend=c("Prior","Likelihood","Posterior"),col=c(2,4,3),lty=1,lwd=3) # SIR-based posterior inference M = 10000 alphas = rnorm(M,ma,sda) betas = rnorm(M,mb,sdb) plot(alphas,betas,xlab=expression(alpha),ylab=expression(beta)) contour(alpha,beta,prior,add=TRUE,drawlabels=FALSE,lwd=2,col=2) w = rep(0,M) for (i in 1:M) w[i] = sum(dpois(y,exp(alphas[i]+betas[i]*weekend),log=TRUE)) w = exp(w-max(w)) ind = sample(1:M,size=M,replace=TRUE,prob=w) alphas1 = alphas[ind] betas1 = betas[ind] plot(alphas1,betas1) image(alpha,beta,post,xlab=expression(alpha),ylab=expression(beta)) contour(alpha,beta,post,add=TRUE,drawlabels=FALSE,lwd=2,col=2) par(mfrow=c(1,1)) plot(density(exp(alphas1)),xlim=c(70,120),ylim=c(0,0.2),xlab="Rate of visitors",ylab="Density",main="",lwd=2) lines(density(exp(alphas1+betas1)),col=2,lwd=2) legend("topright",legend=c("Weekdays","Weekends"),col=1:2,lty=1,lwd=2)