######################################################################## # # TUTORIAL DE PROGRAMACAO EM R - PARTE 2 # ######################################################################## # # Dataset: houseprice.txt # Description: Prices of 128 houses and house characteristics # # Nbhd : Neighborhood (1,2,3) # Offers: Yes or No # SqFt: Size of the house (in square feet) # Brick: Yes No # Bedrooms: Number of bedrooms # Bathrooms: Number of bathrooms # Price: Price of the house (in dollars) # ######################################################################## # # Copyright of R code by: # # Hedibert Freitas Lopes # Professor of Statistics and Econometrics # Insper - Institute for Education and Research # ######################################################################## # # Copyright of R code by: # # Hedibert Freitas Lopes # Professor of Statistics and Econometrics # Insper - Institute for Education and Research # ####################################################################### rm(list=ls()) pdf(file="R-tutorial-part-II.pdf",width=10,height=8) # Reading the data # ---------------- data = read.table("houseprices.txt",header=TRUE) n = nrow(data) attach(data) # Data transformation # ------------------- price = Price/1000 size = SqFt/1000 par(mfrow=c(1,1)) plot(size,price,pch=16,xlab="Size of the house (in 000's sqft)", ylab="Price of the house (in 000's dollars)") par(mfrow=c(1,1)) boxplot(price[Nbhd==1],price[Nbhd==2],price[Nbhd==3], xlab="Neighborhood of the house", ylab="Price of the house (in 000's dollars)", names=c("Nbhd=1","Nbhd=2","Nbhd=3")) par(mfrow=c(1,1)) boxplot(price[Brick=="No"],price[Brick=="Yes"], xlab="Type of the house", ylab="Price of the house (in 000's dollars)", names=c("Brick=No","Brick=Yes")) # Criando as variaveis dummies para type of house e neighborhood # -------------------------------------------------------------- brick = rep(0,n) brick[Brick=="Yes"]=1 N1 = rep(0,n) N1[Nbhd==1]=1 N2 = rep(0,n) N2[Nbhd==2]=1 N3 = rep(0,n) N3[Nbhd==3]=1 # "Olhando para os dados" # ----------------------- cbind(price,size,brick,N1,N2) # regressoes lineares (efeitos aditivos das dummies) # -------------------------------------------------- reg = lm(price~size+brick+N1+N2) coef = reg$coef par(mfrow=c(1,1)) plot(size,price,col=Nbhd,pch=16+brick, xlab="Size of the house (in 000's sqft)", ylab="Price of the house (in 000's dollars)") abline(coef[1],coef[2],col=3) abline(coef[1]+coef[4],coef[2],col=1) abline(coef[1]+coef[5],coef[2],col=2) abline(coef[1]+coef[3],coef[2],col=3,lty=2) abline(coef[1]+coef[3]+coef[4],coef[2],col=1,lty=2) abline(coef[1]+coef[3]+coef[5],coef[2],col=2,lty=2) legend(1.5,210,legend=c("Brick=Yes,Nbhd=1","Brick=No,Nbhd=1","Brick=Yes,Nbhd=2", "Brick=No,Nbhd=2","Brick=Yes,Nbhd=3","Brick=No,Nbhd=3"), col=c(1,1,2,2,3,3), pch=c(16:17,16:17,16:17),bty="n") title("price = 56.0 + 45.9*size + 19.1*brick - 36.8*N1 - 31.3*N2\n All fitted lines have the same SLOPE") # regressoes lineares (efeitos multiplicativos das dummies) - parte 1 # ------------------------------------------------------------------- reg = lm(price~size+brick+N1+N2+brick*N1+brick*N2) coef = reg$coef plot(size,price,col=Nbhd,pch=16+brick, xlab="Size of the house (in 000's sqft)", ylab="Price of the house (in 000's dollars)") abline(coef[1],coef[2],col=3) abline(coef[1]+coef[4],coef[2],col=1) abline(coef[1]+coef[5],coef[2],col=2) abline(coef[1]+coef[3],coef[2],col=3,lty=2) abline(coef[1]+coef[3]+coef[4]+coef[6],coef[2],col=1,lty=2) abline(coef[1]+coef[3]+coef[5]+coef[7],coef[2],col=2,lty=2) legend(1.5,210,legend=c("Brick=Yes,Nbhd=1","Brick=No,Nbhd=1","Brick=Yes,Nbhd=2", "Brick=No,Nbhd=2","Brick=Yes,Nbhd=3","Brick=No,Nbhd=3"), col=c(1,1,2,2,3,3), pch=c(16:17,16:17,16:17),bty="n") title("price = 53.9 + 45.4*size + 26.3*brick - 32.9*N1 - 27.0*N2 - 13.3*brick*N1 - 10.3*brick*N2\n All fitted lines have the same SLOPE") # regressoes lineares (efeitos multiplicativos das dummies) - parte 2 # ------------------------------------------------------------------- reg = lm(price~size+brick+N1+N2+brick*N1+brick*N2+size*brick+size*N1+size*N2) coef = reg$coef plot(size,price,col=Nbhd,pch=16+brick, xlab="Size of the house (in 000's sqft)", ylab="Price of the house (in 000's dollars)") abline(coef[1],coef[2],col=3) abline(coef[1]+coef[4],coef[2]+coef[9],col=1) abline(coef[1]+coef[5],coef[2]+coef[10],col=2) abline(coef[1]+coef[3],coef[2]+coef[8],col=3,lty=2) abline(coef[1]+coef[3]+coef[4]+coef[6],coef[2]+coef[8]+coef[9],col=1,lty=2) abline(coef[1]+coef[3]+coef[5]+coef[7],coef[2]+coef[8]+coef[10],col=2,lty=2) legend(1.5,210,legend=c("Brick=Yes,Nbhd=1","Brick=No,Nbhd=1","Brick=Yes,Nbhd=2", "Brick=No,Nbhd=2","Brick=Yes,Nbhd=3","Brick=No,Nbhd=3"), col=c(1,1,2,2,3,3), pch=c(16:17,16:17,16:17),bty="n") title("price = 57 + 43.9*size + 4.5*brick - 27.7*N1 - 25.1*N2 - 11.1*brick*N1-\n9.6*brick*N2 - 2.8*size*N1 - 1.0*size*N2") # regressoes lineares (efeitos multiplicativos das dummies) - parte 3 # ------------------------------------------------------------------- reg = lm(price~size+brick+N1+N2+brick*N1+brick*N2+size*brick+ size*N1+size*N2+size*brick*N1+size*brick*N2) coef = reg$coef plot(size,price,col=Nbhd,pch=16+brick, xlab="Size of the house (in 000's sqft)", ylab="Price of the house (in 000's dollars)") abline(coef[1],coef[2],col=3) abline(coef[1]+coef[4],coef[2]+coef[9],col=1) abline(coef[1]+coef[5],coef[2]+coef[10],col=2) abline(coef[1]+coef[3],coef[2]+coef[8],col=3,lty=2) abline(coef[1]+coef[3]+coef[4]+coef[6],coef[2]+coef[8]+coef[9]+coef[11],col=1,lty=2) abline(coef[1]+coef[3]+coef[5]+coef[7],coef[2]+coef[8]+coef[10]+coef[12],col=2,lty=2) legend(1.5,210,legend=c("Brick=Yes,Nbhd=1","Brick=No,Nbhd=1","Brick=Yes,Nbhd=2", "Brick=No,Nbhd=2","Brick=Yes,Nbhd=3","Brick=No,Nbhd=3"), col=c(1,1,2,2,3,3), pch=c(16:17,16:17,16:17),bty="n") title("price = 72.4 + 36.5*size -50.7*brick - 53.2*N1 - 44.1*N2 + 93*brick*N1 + 58*brick*N2 + \n9.9*size*N1 + 8.2*size*N2 - 52.6*size*brick*N1 - 32.6*size*brick*N2") # Comparando varias regressoes (sem interacoes) # --------------------------------------------- X = cbind(size,brick,N1,N2) p = ncol(X) nm = 2^p-1 mod = matrix(0,nm+1,p) m = 0 lab = rep(0,nm+1) for (i in 0:1) for (j in 0:1) for (k in 0:1) for (l in 0:1){ m = m+1 mod[m,] = c(i,j,k,l) lab[m] = paste(i,j,k,l,sep="") } mod = mod[-1,] lab = lab[-1] SST = sum((price-mean(price))^2) n = nrow(data) se = rep(0,15) R2 = rep(0,15) res = matrix(0,n,15) for (m in 1:15){ k = sum(mod[m,]) X1 = X[,mod[m,]==TRUE] reg = lm(price~X1) SSR = sum(reg$res^2) R2[m] = 1-SSR/SST se[m] = sqrt(SSR/(n-k+1)) res[,m] = reg$res } par(mfrow=c(1,1)) boxplot.matrix(res,ylim=c(-70,100),axes=FALSE,xlab="Models",ylab="Residuals") axis(2);box();axis(1,at=1:nm,lab=lab) for (m in 1:nm){ text(m,100,round(R2[m],3),cex=0.85,col=2) text(m,90,round(se[m],3),cex=0.85,col=4) } text(0.2,100,"R2",col=2,cex=0.85) text(0.2,90,"se",col=4,cex=0.85) par(mfrow=c(3,5)) for (m in 1:15){ qqnorm(res[,m],main=paste("Model: ",lab[m],sep="")) qqline(res[,m],col=2) } # Price = alpha + beta*Offers ou Price = alpha + beta1*O1+beta1*O2+beta1*O3 # ------------------------------------------------------------------------- y = price[Offers<=4] x = Offers[Offers<=4] n = length(y) O2 = rep(0,n) O3 = rep(0,n) O4 = rep(0,n) O2[x==2] = 1 O3[x==3] = 1 O4[x==4] = 1 reg1 = lm(y~x) reg2 = lm(y~O2+O3+O4) par(mfrow=c(1,1)) plot(x,y,xlab="Offers",ylab="Price",pch=16,axes=FALSE,ylim=c(50,max(y))) axis(2);box();axis(1,at=1:4) abline(reg1$coef,lwd=2) points(1,reg2$coef[1],pch=17,col=2,cex=2) points(2,reg2$coef[1]+reg2$coef[2],pch=17,col=2,cex=2) points(3,reg2$coef[1]+reg2$coef[3],pch=17,col=2,cex=2) points(4,reg2$coef[1]+reg2$coef[4],pch=17,col=2,cex=2) title("price = 150.7 - 7.88*Offer (se=25.9)\n price=145.2 - 12.328*O2 - 18.55*O3 - 24.17*O4 (se=26.1)") for (i in 1:4){ points(i,mean(y[x==i]),pch=18,col=3,cex=1.25) text(i,50,round(sqrt(var(y[x==i])),1)) text(i,55,"se") } dev.off()