####################################################################### # # Dataset: houseprice.txt # # Description: Prices of 128 houses and house characteristics # ####################################################################### # # Nbhd : Neighborhood (1,2,3) # Offers: Yes or No # SqFt: Size of the house (in square feet) # Brick: Yes No # Bedrooms: Number of bedrooms # Bathrooms: Number of bathrooms # Price: Price of the house (in dollars) # ####################################################################### # # Copyright of R code by: # # Hedibert Freitas Lopes # Professor of Statistics and Econometrics # Insper - Institute for Education and Research # ####################################################################### rm(list=ls()) pdf(file="houseprices-dummies-interactions.pdf",width=12,height=10) data = read.table("houseprices.txt",header=TRUE) attach(data) n = nrow(data) # Data transformation price = Price/1000 size = SqFt/1000 brick = rep(0,n) N1 = rep(0,n) N2 = rep(0,n) brick[Brick=="Yes"]=1 N1[Nbhd==1]=1 N2[Nbhd==2]=1 # Exploratory analysis of the data par(mfrow=c(2,3)) plot(price,xlab="Observation index",main="",ylab="Price (1000 $)") plot(size,xlab="Observation index",main="",ylab="Size (1000 sqft)") plot(brick,xlab="Observation index",ylab="Type of house",axes=FALSE) axis(1);box();axis(2,at=0:1,lab=c("Not brick","Brick")) plot(Nbhd,xlab="Observation index",ylab="Neighborhood",axes=FALSE) axis(1);box();axis(2,at=1:3,lab=c("Nbhd=1","Nbhd=2","Nbhd=2")) boxplot(price[brick==0],price[brick==1],names=c("Brick=No","Brick=Yes"), ylab="Price (1000 $)") boxplot(price[Nbhd==1],price[Nbhd==2],price[Nbhd==3],names=c("Nbhd=1","Nbhd=2","Nbhd=3"),ylab="Price (1000 $)") # Regression models reg1 = lm(price ~ size) reg2 = lm(price ~ size + brick) reg3 = lm(price ~ size + brick + size*brick) reg4 = lm(price ~ size + N1 + N2) reg5 = lm(price ~ size + N1 + N2 + size*N1 + size*N2) reg6 = lm(price ~ size + brick + N1 + N2 + size*brick + size*N1 + size*N2) # Standard error se = rep(0,7) se[1] = sqrt(var(price)) se[2] = sqrt(sum(reg1$res^2/(n-length(reg1$coef)))) se[3] = sqrt(sum(reg2$res^2/(n-length(reg2$coef)))) se[4] = sqrt(sum(reg3$res^2/(n-length(reg3$coef)))) se[5] = sqrt(sum(reg4$res^2/(n-length(reg4$coef)))) se[6] = sqrt(sum(reg5$res^2/(n-length(reg5$coef)))) se[7] = sqrt(sum(reg6$res^2/(n-length(reg6$coef)))) # R-squared res0 = price-mean(price) R2 = rep(0,7) R2[1] = 0 R2[2] = 1-sum(reg1$res^2)/sum(res0^2) R2[3] = 1-sum(reg2$res^2)/sum(res0^2) R2[4] = 1-sum(reg3$res^2)/sum(res0^2) R2[5] = 1-sum(reg4$res^2)/sum(res0^2) R2[6] = 1-sum(reg5$res^2)/sum(res0^2) R2[7] = 1-sum(reg6$res^2)/sum(res0^2) par(mfrow=c(1,1)) plot(price,xlab="Size (1000 sqft)",ylab="Observation index",pch=16) abline(h=mean(price),col=2,lwd=2) title("Model 0: constant") par(mfrow=c(1,1)) plot(size,price,xlab="Size (1000 sqft)",ylab="Price (1000 $)",pch=16) abline(reg1$coef,lwd=2) title("Model 1: constant + size") par(mfrow=c(1,1)) plot(size,price,col=brick+2,xlab="Size (1000 sqft)",ylab="Price (1000 $)",pch=16) abline(reg2$coef[1],reg2$coef[2],lwd=2,col=2) abline(reg2$coef[1]+reg2$coef[3],reg2$coef[2],lwd=2,col=3) legend(1.5,200,legend=c("brick=0","brick=1"),col=2:3,lwd=2,bty="n",cex=1.25) title("Model 2: constant + size + brick") par(mfrow=c(1,1)) plot(size,price,col=brick+2,xlab="Size (1000 sqft)",ylab="Price (1000 $)",pch=16) abline(reg3$coef[1],reg3$coef[2],lwd=2,col=2) abline(reg3$coef[1]+reg3$coef[3],reg3$coef[2]+reg3$coef[4],lwd=2,col=3) legend(1.5,200,legend=c("brick=0","brick=1"),col=2:3,lwd=2,bty="n",cex=1.25) title("Model 3: constant + size + brick + size*brick") par(mfrow=c(1,1)) plot(size,price,col=Nbhd,xlab="Size (1000 sqft)",ylab="Price (1000 $)",pch=16) abline(reg4$coef[1]+reg4$coef[3],reg4$coef[2],col=1,lwd=2) abline(reg4$coef[1]+reg4$coef[4],reg4$coef[2],col=2,lwd=2) abline(reg4$coef[1],reg4$coef[2],col=3,lwd=2) legend(1.5,200,legend=c("Nbhd=1","Nbhd=2","Nbhd=3"),col=1:3,lwd=2,bty="n",cex=1.25) title("Model 4: constant + size + Nbhd") par(mfrow=c(1,1)) plot(size,price,col=Nbhd,xlab="Size (1000 sqft)",ylab="Price (1000 $)",pch=16) abline(reg5$coef[1]+reg5$coef[3],reg5$coef[2]+reg5$coef[5],col=1,lwd=2) abline(reg5$coef[1]+reg5$coef[4],reg5$coef[2]+reg5$coef[5],col=2,lwd=2) abline(reg5$coef[1],reg5$coef[2],col=3,lwd=2) legend(1.5,200,legend=c("Nbhd=1","Nbhd=2","Nbhd=3"),col=1:3,lwd=2,bty="n",cex=1.25) title("Model 5: constant + size + Nbhd + size*Nbhd") par(mfrow=c(1,1)) plot(size,price,col=Nbhd,pch=16+brick,xlab="Size (1000 sqft)",ylab="Price (1000 $)",cex=1.25) legend(1.45,210,legend=c("Nbhd=1,brick=0","Nbhd=2,brick=0","Nbhd=3,brick=0", "Nbhd=1,brick=1","Nbhd=2,brick=1","Nbhd=3,brick=1"),col=c(1:3,1:3),bty="n",cex=1.25,pch=c(16,16,16,17,17,17)) abline(reg6$coef[1]+reg6$coef[4],reg6$coef[2]+reg6$coef[7],col=1,lwd=2) abline(reg6$coef[1]+reg6$coef[3]+reg6$coef[4], reg6$coef[2]+reg6$coef[6]+reg6$coef[7],col=1,lwd=2) abline(reg6$coef[1]+reg6$coef[5],reg6$coef[2]+reg6$coef[8],col=2,lwd=2) abline(reg6$coef[1]+reg6$coef[3]+reg6$coef[5], reg6$coef[2]+reg6$coef[6]+reg6$coef[8],col=2,lwd=2) abline(reg6$coef[1],reg6$coef[2],col=3,lwd=2) abline(reg6$coef[1]+reg6$coef[3],reg6$coef[2]+reg6$coef[6],col=3,lwd=2) title("Model 6: constant + size + brick + Nbhd + size*brick + size*Nbhd") # Residual analysis boxplot(res0,reg1$res,reg2$res,reg3$res,reg4$res,reg5$res,reg6$res, xlab="",ylab="Residuals",ylim=c(-81,81), names=c("Model 0","Model 1","Model 2","Model 3","Model 4","Model 5","Model 6")) abline(h=0,col=2) for (i in 1:7){ text(i,-70,round(se[i],3)) text(i,-80,round(R2[i],3)) } text(0.5,-70,"s.e.") text(0.5,-80,"R2") dev.off()