############################################################################################ # # Linear regression # ############################################################################################ # # HEDIBERT FREITAS LOPES # Associate Professor of Econometrics and Statistics # The University of Chicago Booth School of Business # 5807 South Woodlawn Avenue # Chicago, Illinois, 60637 # Email : hlopes@ChicagoBooth.edu # URL: http://faculty.chicagobooth.edu/hedibert.lopes/research/ # ############################################################################################ data = read.table("houseprice.txt",header=TRUE) n = nrow(data) nbhd = data[,2] offers = data[,3] size = data[,4]/1000 brick = data[,5] bedrooms = data[,6] bathrooms = data[,7] price = data[,8]/1000 # Categorical variable: type of house # Brick = Yes => brickdum=1 # Brick = No => brickdum=0 brickdum = rep(0,n) brickdum[brick=="Yes"]=1 # Categorical variable: neighborhood # Nbhd = 1 => N1=1 and N2=0 # Nbhd = 2 => N1=0 and N2=1 # Nbhd = 3 => N1=0 and N2=0 N1 = rep(0,n) N2 = rep(0,n) N1[nbhd==1]=1 N2[nbhd==2]=1 pdf(file="houseprice.pdf",width=20,height=10) par(mfrow=c(1,2)) reg = lm(price~size) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(size,price,pch=16,xlab="SIZE",ylab="PRICE") points(size,fit,col=4,pch=15) title(paste("PRICE=",coef[1],"+",coef[2]," SIZE",sep="")) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) title(paste("R2=",R2," - Se=",Se,sep="")) reg = lm(price~size+brickdum) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(size,price,pch=16+brickdum,xlab="SIZE",ylab="PRICE",col=0) text(size,price,brickdum,col=1+brickdum,cex=1.25) points(size,fit,col=4,pch=15) title(paste("PRICE=",coef[1],"+",coef[2]," SIZE + ",coef[3],"BRICKDUM",sep="")) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) title(paste("R2=",R2," - Se=",Se,sep="")) reg = lm(price~size+N1+N2) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(size,price,pch=15+nbhd,xlab="SIZE",ylab="PRICE",col=0) text(size,price,nbhd,col=nbhd,cex=1.25) points(size,fit,pch=15,col=4) title(paste("PRICE=",coef[1],"+",coef[2]," SIZE ",coef[3],"N1 ",coef[4],"N2",sep="")) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) title(paste("R2=",R2," - Se=",Se,sep="")) bricknbhd = rep(0,n) bricknbhd[((N1==1)&(N2==0)&(brickdum==0))]=1 bricknbhd[((N1==1)&(N2==0)&(brickdum==1))]=2 bricknbhd[((N1==0)&(N2==1)&(brickdum==0))]=3 bricknbhd[((N1==0)&(N2==1)&(brickdum==1))]=4 bricknbhd[((N1==0)&(N2==0)&(brickdum==0))]=5 bricknbhd[((N1==0)&(N2==0)&(brickdum==1))]=6 reg = lm(price~size+brickdum+N1+N2) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(size,price,pch=15+bricknbhd,xlab="SIZE",ylab="PRICE",col=0) text(size,price,bricknbhd,col=bricknbhd,cex=1.25) points(size,fit,pch=15,col=4) title(paste("PRICE=",coef[1],"+",coef[2]," SIZE + ",coef[3],"BRICKDUM ",coef[4],"N1",coef[5]," N2",sep="")) text(1.6,200,"1: BRICK=NO, NBHD=1",col=1) text(1.6,197,"2: BRICK=YES,NBHD=1",col=2) text(1.6,194,"3: BRICK=NO, NBHD=2",col=3) text(1.6,191,"4: BRICK=YES,NBHD=2",col=4) text(1.6,188,"5: BRICK=NO, NBHD=3",col=5) text(1.6,185,"6: BRICK=YES,NBHD=3",col=6) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) title(paste("R2=",R2," - Se=",Se,sep="")) par(mfrow=c(1,3)) reg = lm(price~size+brickdum+N1+N2+bedrooms) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) text(100,200,paste("R2=",R2,sep=""),cex=2) text(100,195,paste("Se=",Se,sep=""),cex=2) title("price = 53 + 42.7size + 19.4brickdum - 34.9N1 - 29.8N2 + 2.7bedrooms") reg = lm(price~size+brickdum+N1+N2+bedrooms+bathrooms) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) text(100,200,paste("R2=",R2,sep=""),cex=2) text(100,195,paste("Se=",Se,sep=""),cex=2) title("price = 52 + 35.9size + 18.5brickdum - 34.1N1 - 29.2N2 + 1.9bedrooms + 6.8bathrooms") reg = lm(price~size+brickdum+N1+N2+bedrooms+bathrooms+offers) coef = round(reg$coef,4) fit = reg$fit res = reg$res Se = round(sqrt(mean(res^2)),4) R2 = round(1-sum(res^2)/sum((price-mean(price))^2),4) plot(fit,price,xlim=range(price),ylim=range(price),pch=16,xlab="FITTED PRICE",ylab="PRICE") abline(0,1,col=grey(0.75),lty=2,lwd=3) text(100,200,paste("R2=",R2,sep=""),cex=2) text(100,195,paste("Se=",Se,sep=""),cex=2) title("price = 22.8 + 53size + 17.3brickdum - 20.7N1 - 22.2N2 + 4.3bedrooms + 7.9bathrooms - 8.3offers") reg0 = lm(price~1) reg1.1 = lm(price~size) reg1.2 = lm(price~brickdum) reg1.3 = lm(price~N1+N2) reg1.4 = lm(price~bedrooms) reg1.5 = lm(price~bathrooms) reg1.6 = lm(price~offers) reg2.1 = lm(price~size+brickdum) reg2.2 = lm(price~size+N1+N2) reg2.3 = lm(price~size+bedrooms) reg2.4 = lm(price~size+bathrooms) reg2.5 = lm(price~size+offers) reg2.6 = lm(price~brickdum+N1+N2) reg2.7 = lm(price~brickdum+bedrooms) reg2.8 = lm(price~brickdum+bathrooms) reg2.9 = lm(price~brickdum+offers) reg2.10 = lm(price~N1+N2+bedrooms) reg2.11 = lm(price~N1+N2+bathrooms) reg2.12 = lm(price~N1+N2+offers) reg2.13 = lm(price~bedrooms+bathrooms) reg2.14 = lm(price~bedrooms+offers) reg2.15 = lm(price~bathrooms+offers) reg3.1 = lm(price~size+brickdum+N1+N2) reg3.2 = lm(price~size+brickdum+bedrooms) reg3.3 = lm(price~size+brickdum+bathrooms) reg3.4 = lm(price~size+brickdum+offers) reg3.5 = lm(price~size+N1+N2+bedrooms) reg3.6 = lm(price~size+N1+N2+bathrooms) reg3.7 = lm(price~size+N1+N2+offers) reg3.8 = lm(price~size+bedrooms+bathrooms) reg3.9 = lm(price~size+bedrooms+offers) reg3.10 = lm(price~size+bathrooms+offers) reg3.11 = lm(price~brickdum+N1+N2+bedrooms) reg3.12 = lm(price~brickdum+N1+N2+bathrooms) reg3.13 = lm(price~brickdum+N1+N2+offers) reg3.14 = lm(price~brickdum+bedrooms+bathrooms) reg3.15 = lm(price~brickdum+bedrooms+offers) reg3.16 = lm(price~brickdum+bathrooms+offers) reg3.17 = lm(price~N1+N2+bedrooms+bathrooms) reg3.18 = lm(price~N1+N2+bathrooms+offers) reg3.19 = lm(price~N1+N2+bedrooms+offers) reg3.20 = lm(price~bedrooms+bathrooms+offers) reg4.1 = lm(price~size+brickdum+N1+N2+bedrooms) reg4.2 = lm(price~size+brickdum+N1+N2+bathrooms) reg4.3 = lm(price~size+brickdum+N1+N2+offers) reg4.4 = lm(price~size+N1+N2+bedrooms+bathrooms) reg4.5 = lm(price~size+N1+N2+bedrooms+offers) reg4.6 = lm(price~size+N1+N2+bathrooms+offers) reg4.7 = lm(price~size+N1+N2+bedrooms+bathrooms) reg4.8 = lm(price~size+N1+N2+bedrooms+offers) reg4.9 = lm(price~size+N1+N2+bathrooms+offers) reg4.10 = lm(price~size+bedrooms+bathrooms+offers) reg4.11 = lm(price~brickdum+N1+N2+bedrooms+bathrooms) reg4.12 = lm(price~brickdum+N1+N2+bedrooms+offers) reg4.13 = lm(price~brickdum+N1+N2+bathrooms+offers) reg4.14 = lm(price~brickdum+bedrooms+bathrooms+offers) reg4.15 = lm(price~N1+N2+bedrooms+bathrooms+offers) reg5.1 = lm(price~size+brickdum+N1+N2+bedrooms+bathrooms) reg5.2 = lm(price~size+brickdum+N1+N2+bedrooms+offers) reg5.3 = lm(price~size+brickdum+N1+N2+bathrooms+offers) reg5.4 = lm(price~size+brickdum+bedrooms+bathrooms+offers) reg5.5 = lm(price~size+N1+N2+bedrooms+bathrooms+offers) reg5.6 = lm(price~brickdum+N1+N2+bedrooms+bathrooms+offers) reg6 = lm(price~size+brickdum+N1+N2+bedrooms+bathrooms+offers) Se=sqrt(c( mean(reg0$res^2), mean(reg1.1$res^2), mean(reg1.2$res^2), mean(reg1.3$res^2), mean(reg1.4$res^2), mean(reg1.5$res^2), mean(reg1.6$res^2), mean(reg2.1$res^2), mean(reg2.2$res^2), mean(reg2.3$res^2), mean(reg2.4$res^2), mean(reg2.5$res^2), mean(reg2.6$res^2), mean(reg2.7$res^2), mean(reg2.8$res^2), mean(reg2.9$res^2), mean(reg2.10$res^2), mean(reg2.11$res^2), mean(reg2.12$res^2), mean(reg2.13$res^2), mean(reg2.14$res^2), mean(reg2.15$res^2), mean(reg3.1$res^2), mean(reg3.2$res^2), mean(reg3.3$res^2), mean(reg3.4$res^2), mean(reg3.5$res^2), mean(reg3.6$res^2), mean(reg3.7$res^2), mean(reg3.8$res^2), mean(reg3.9$res^2), mean(reg3.10$res^2), mean(reg3.11$res^2), mean(reg3.12$res^2), mean(reg3.13$res^2), mean(reg3.14$res^2), mean(reg3.15$res^2), mean(reg3.16$res^2), mean(reg3.17$res^2), mean(reg3.18$res^2), mean(reg3.19$res^2), mean(reg3.20$res^2), mean(reg4.1$res^2), mean(reg4.2$res^2), mean(reg4.3$res^2), mean(reg4.4$res^2), mean(reg4.5$res^2), mean(reg4.6$res^2), mean(reg4.7$res^2), mean(reg4.8$res^2), mean(reg4.9$res^2), mean(reg4.10$res^2), mean(reg4.11$res^2), mean(reg4.12$res^2), mean(reg4.13$res^2), mean(reg4.14$res^2), mean(reg4.15$res^2), mean(reg5.1$res^2), mean(reg5.2$res^2), mean(reg5.3$res^2), mean(reg5.4$res^2), mean(reg5.5$res^2), mean(reg5.6$res^2), mean(reg6$res^2))) vp = sum((price-mp)^2) R2 = c( 1-sum(reg0$res^2)/vp, 1-sum(reg1.1$res^2)/vp, 1-sum(reg1.2$res^2)/vp, 1-sum(reg1.3$res^2)/vp, 1-sum(reg1.4$res^2)/vp, 1-sum(reg1.5$res^2)/vp, 1-sum(reg1.6$res^2)/vp, 1-sum(reg2.1$res^2)/vp, 1-sum(reg2.2$res^2)/vp, 1-sum(reg2.3$res^2)/vp, 1-sum(reg2.4$res^2)/vp, 1-sum(reg2.5$res^2)/vp, 1-sum(reg2.6$res^2)/vp, 1-sum(reg2.7$res^2)/vp, 1-sum(reg2.8$res^2)/vp, 1-sum(reg2.9$res^2)/vp, 1-sum(reg2.10$res^2)/vp, 1-sum(reg2.11$res^2)/vp, 1-sum(reg2.12$res^2)/vp, 1-sum(reg2.13$res^2)/vp, 1-sum(reg2.14$res^2)/vp, 1-sum(reg2.15$res^2)/vp, 1-sum(reg3.1$res^2)/vp, 1-sum(reg3.2$res^2)/vp, 1-sum(reg3.3$res^2)/vp, 1-sum(reg3.4$res^2)/vp, 1-sum(reg3.5$res^2)/vp, 1-sum(reg3.6$res^2)/vp, 1-sum(reg3.7$res^2)/vp, 1-sum(reg3.8$res^2)/vp, 1-sum(reg3.9$res^2)/vp, 1-sum(reg3.10$res^2)/vp, 1-sum(reg3.11$res^2)/vp, 1-sum(reg3.12$res^2)/vp, 1-sum(reg3.13$res^2)/vp, 1-sum(reg3.14$res^2)/vp, 1-sum(reg3.15$res^2)/vp, 1-sum(reg3.16$res^2)/vp, 1-sum(reg3.17$res^2)/vp, 1-sum(reg3.18$res^2)/vp, 1-sum(reg3.19$res^2)/vp, 1-sum(reg3.20$res^2)/vp, 1-sum(reg4.1$res^2)/vp, 1-sum(reg4.2$res^2)/vp, 1-sum(reg4.3$res^2)/vp, 1-sum(reg4.4$res^2)/vp, 1-sum(reg4.5$res^2)/vp, 1-sum(reg4.6$res^2)/vp, 1-sum(reg4.7$res^2)/vp, 1-sum(reg4.8$res^2)/vp, 1-sum(reg4.9$res^2)/vp, 1-sum(reg4.10$res^2)/vp, 1-sum(reg4.11$res^2)/vp, 1-sum(reg4.12$res^2)/vp, 1-sum(reg4.13$res^2)/vp, 1-sum(reg4.14$res^2)/vp, 1-sum(reg4.15$res^2)/vp, 1-sum(reg5.1$res^2)/vp, 1-sum(reg5.2$res^2)/vp, 1-sum(reg5.3$res^2)/vp, 1-sum(reg5.4$res^2)/vp, 1-sum(reg5.5$res^2)/vp, 1-sum(reg5.6$res^2)/vp, 1-sum(reg6$res^2)/vp) par(mfrow=c(1,2)) plot(Se,xlab="Model index",ylab="Se",pch=16,col=c(1,rep(2,6),rep(3,15),rep(4,20),rep(5,15),rep(6,6),8)) text(40,25,"Model with 0 covariates",col=1) text(40,24.5,"Models with 1 covariate",col=2) text(40,24,"Models with 2 covariates",col=3) text(40,23.5,"Models with 3 covariates",col=4) text(40,23,"Models with 4 covariates",col=5) text(40,22.5,"Models with 5 covariates",col=6) text(40,22,"Models with 6 covariates",col=8) plot(R2,xlab="Model index",ylab="R2",pch=16,col=c(1,rep(2,6),rep(3,15),rep(4,20),rep(5,15),rep(6,6),8)) dev.off()