######################################################################## # # TUTORIAL DE PROGRAMACAO EM R - PARTE I # ######################################################################## # # Esse "script" tem o intuito de introduzir conceitos basicos de R # utilizando-se o modelo de regressao linear simples e multipla # como motivacao. # # Topicos abordados: # - Construcao de vetores e matrizes # - Operacoes com vetores e matrizes # - Leitura de dados TXT e CSV # - Salvando gráficos em PDF (mas pode ser tambem em PNG, JPEG, TIFF, etc) # - Usando a funcao lm (linear models) para regressao linear # - Usando as funcoes plot e summary para os objetos tipo "lm" # - Incluindo cores/legendas/titulos/limites em graficos # - Criacao de variavel dummy (binaria 0/1) # - Boxplot # ######################################################################## # # Copyright of R code by: # # Hedibert Freitas Lopes # Professor of Statistics and Econometrics # Insper - Institute for Education and Research # ######################################################################## # limpando toda a area de trabalho para evitar ambiguidade nas variaveis rm(list=ls()) # nesse arquivo serao salvas todas as figuras pdf("regressaolinear.pdf",width=10,height=8) # x: altura (em cm) # y: peso (em kg) n = 7 k = 1 x = c(185,178,175,175,180,180,174) y = c(80,62,71,70,80,82,70) # uma forma de montar X X = matrix(c(1,1,1,1,1,1,1,185,178,175,175,180,180,174),n,k+1) # outra forma de montar X X = cbind(1,x) # Regressao "na mao" # %*% : produto matricial # t(X) : transposta de X # solve(A) : inversa de A betahat = solve(t(X)%*%X)%*%t(X)%*%y yfit = X%*%betahat e = y-yfit SSR = t(e)%*%e sig2hat = SSR/(n-(k+1)) SST = sum((y-mean(y))^2) R2 = 1-SSR/SST plot(x,y,xlab="Altura (cm)",ylab="Peso (kg)",pch=16,ylim=c(50,100)) abline(betahat[1],betahat[2],col=2,lwd=2) text(178,63,"Luiz") # Excluindo o Luiz (muito alto e muito magro!) x1 = x[-2] X1 = cbind(1,x1) y1 = y[-2] n1 = n-1 betahat1 = solve(t(X1)%*%X1)%*%t(X1)%*%y1 yfit1 = X1%*%betahat1 e1 = y1-yfit1 SSR1 = t(e1)%*%e1 sig2hat1 = SSR1/(n1-(k+1)) SST1 = sum((y1-mean(y1))^2) R21 = 1-SSR1/SST1 plot(x,y,xlab="Altura (cm)",ylab="Peso (kg)",pch=16,ylim=c(50,100)) abline(betahat[1],betahat[2],col=2,lwd=2) abline(betahat1[1],betahat1[2],col=4,lwd=2) legend(180,70,legend=c("Com Luiz","Sem Luiz"),col=c(2,4),lwd=2,bty="n",cex=1.5) text(178,63,"Luiz") # Comparando as regressoes com/sem o Luiz par(mfrow=c(1,1)) plot(x,y,xlab="Altura (cm)",ylab="Peso (kg)",pch=16,ylim=c(50,100)) abline(betahat[1],betahat[2],col=2,lwd=2) abline(betahat[1]+2*sqrt(sig2hat),betahat[2],lty=2,col=2,lwd=2) abline(betahat[1]-2*sqrt(sig2hat),betahat[2],lty=2,col=2,lwd=2) abline(betahat1[1],betahat1[2],col=4,lwd=2) abline(betahat1[1]+2*sqrt(sig2hat1),betahat1[2],lty=2,col=4,lwd=2) abline(betahat1[1]-2*sqrt(sig2hat1),betahat1[2],lty=2,col=4,lwd=2) legend(180,60,legend=c(paste("Com Luiz (R2=",round(R2,3),")",sep=""),paste("Sem Luiz (R2=",round(R21,3),")",sep="")),col=c(2,4),lwd=2,bty="n") text(178,63,"Luiz") title(paste("Com Luiz: peso = ",round(betahat[1],1)," + ",round(betahat[2],2),"peso\nSem Luiz: peso = ",round(betahat1[1],1)," + ",round(betahat1[2],2),"peso",sep="")) # Usando a funcao lm (linear models) do R reg = lm(y~x) reg1 = lm(y1~x1) summary(reg) par(mfrow=c(2,4)) plot(reg) plot(reg1) # Rodando uma regressao com muito mais dados # sendo que os dados estao armazenados externamente # # salario posicao anosexperiencia sexo #148 7 16.7 1 #165 7 6.7 1 #145 5 14.8 1 # . . . . # . . . . #156 7 15.1 1 #132 4 4.7 1 #161 7 16.5 1 # Se o arquivo esta' no formato TXT data = read.table("salario.txt",header=TRUE) # Se o arquivo esta' no formato CSV #data = read.csv("salario.csv",header=TRUE) n = nrow(data) attach(data) reg2 = lm(salario~anosexperiencia) summary(reg2) par(mfrow=c(1,1)) plot(anosexperiencia,salario,xlab="Anos de experiencia",ylab="Salario",pch=16) abline(reg2$coef,col=2,lwd=2) title("Salario = 135+0.75(Anos de experiencia)\n R2=0.09") # Criando uma variavel dummy para posicao dentro da empresa posi1 = rep(0,n) posi1[posicao>=6]=1 reg3 = lm(salario~posi1) summary(reg3) plot(posi1,salario,xlab="Posicao",ylab="Salario",pch=16,axes=FALSE,xlim=c(-1,2)) axis(2);box() axis(1,at=0:1,lab=c("Posicao <=5","Posicao >5")) abline(reg3$coef,col=2,lwd=2) title("Salario = 135.5+17.4Posicao\n R2=0.47") boxplot(salario,salario[posi1==0],salario[posi1==1], names = c("Todos","Posicao <=5","Posicao >5"),ylab="Salario") dev.off()