# Contains data from http://fmwww.bc.edu/ec-p/data/stockwatson/mcas.dta
# obs:           220                          
# vars:            17                          30 Mar 2002 16:48
# size:        25,080 (99.9% of memory free)
# -------------------------------------------------------------------------------
#               storage  display     value
# variable name   type   format      label      variable label
# -------------------------------------------------------------------------------
# code            int    %8.0g                  District Code (numerical)
# municipa        str29  %29s                   Municipality (name)
# district        str25  %25s                   District Name
# regday          int    %8.0g                  Spending per pupil, regular
# specneed        float  %9.0g                  Spending per pupil, special needs
# bilingua        long   %12.0g                 Spending per pupil, bilingual
# occupday        int    %8.0g                  Spending per pupil, occupational
# tot_day         int    %8.0g                  Spending per pupil, Total
# s_p_c           float  %9.0g                  Students per Computer
# spec_ed         float  %9.0g                  % Special Education students
# lnch_pct        float  %9.0g                  % Eligible for free/reduced price lunch
# tchratio        float  %9.0g                  Students per Teacher
# percap          double %12.0g                 Per Capita Income
# totsc4          float  %9.0g                  
# totsc8          float  %9.0g                  
# avgsalry        float  %9.0g                  
# pctel           float  %9.0g                  
# -------------------------------------------------------------------------------
# Sorted by:  municipa  

# THE MASSACHUSETTS TEST SCORE DATA SET
# The Massachusetts data are district-wide averages for public elementary school districts in 1998.  
# The test score is taken from the Massachusetts Comprehensive Assessment System (MCAS) test, 
# administered to all fourth graders in Massachusetts public schools in the spring of 1998.  
# The test is sponsored by the Massachusetts Department of Education and is mandatory for all public 
# schools.  The data analyzed here are the overall total score, which is the sum of the scores on the 
# English, Math, and Science portions of the test. Data on the student-teacher ratio, 
# the percent of students receiving a subsidized lunch and on the percent of students still 
# learning English are averages for each elementary school district for the 1997-1998 school 
# year and were obtained from the Massachusetts Department of Education.  Data on average district 
# income were obtained from the 1990 U.S. Census.

library(foreign)
# Download the MCAS district data set (Stata .dta format). With
# convert.factors = FALSE, Stata value labels are not turned into factors.
data <- read.dta(
  file = "http://fmwww.bc.edu/ec-p/data/stockwatson/mcas.dta",
  convert.factors = FALSE
)
# Put the data frame's columns on the search path so they can be referenced
# by bare name (e.g. percap, totsc8, lnch_pct).
attach(data)

######################################################################
# PART 1: Explaining the test score by per capita income
######################################################################

# Explain the test score (totsc8) by per capita income (percap) using four
# functional forms, overlay each fitted curve on the scatter plot, and compare
# the fits by in-sample RMSE measured on the level scale of y.
# NOTE(review): the file header describes the fourth-grade score, but this
# section models totsc8 -- confirm that is the intended response.

par(mfrow = c(1, 1))

# Read the columns straight from the data frame instead of relying on the
# attach()ed copies.
x <- data$percap
y <- data$totsc8

plot(x, y, ylim = c(640, 800), xlab = "Per Capita Income", ylab = "Test score")

# Common grid on which every fitted curve is drawn.
xxx <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = 1000)

# Model 1: level-level, y = b0 + b1 * x
reg1 <- lm(y ~ x)
bhat1 <- coef(reg1)
yhat1 <- bhat1[1] + bhat1[2] * x
lines(xxx, bhat1[1] + bhat1[2] * xxx, col = 2, lwd = 2)

# Model 2: log-level, log(y) = b0 + b1 * x (predictions mapped back with exp)
reg2 <- lm(log(y) ~ x)
bhat2 <- coef(reg2)
yhat2 <- exp(bhat2[1] + bhat2[2] * x)
lines(xxx, exp(bhat2[1] + bhat2[2] * xxx), col = 3, lwd = 2)

# Model 3: log-log, log(y) = b0 + b1 * log(x)
reg3 <- lm(log(y) ~ log(x))
bhat3 <- coef(reg3)
yhat3 <- exp(bhat3[1] + bhat3[2] * log(x))
lines(xxx, exp(bhat3[1] + bhat3[2] * log(xxx)), col = 4, lwd = 2)

# Model 4: level-log, y = b0 + b1 * log(x)
reg4 <- lm(y ~ log(x))
bhat4 <- coef(reg4)
yhat4 <- bhat4[1] + bhat4[2] * log(x)
lines(xxx, bhat4[1] + bhat4[2] * log(xxx), col = 5, lwd = 2)

# Root mean squared error over the observations lm() actually used, i.e.
# rows where both the response and the regressor are observed.
ok <- !is.na(y) & !is.na(x)
rmse1 <- sqrt(mean((y[ok] - yhat1[ok])^2))
rmse2 <- sqrt(mean((y[ok] - yhat2[ok])^2))
rmse3 <- sqrt(mean((y[ok] - yhat3[ok])^2))
rmse4 <- sqrt(mean((y[ok] - yhat4[ok])^2))

c(rmse1, rmse2, rmse3, rmse4)

# Build the legend from the computed values so it cannot drift out of sync
# with the fits; the statistic is a root mean squared error, so label it RMSE.
legend(
  "topleft",
  legend = sprintf(
    "%s, RMSE=%.5f",
    c("nivel-nivel", "log-nivel", "log-log", "nivel-log"),
    c(rmse1, rmse2, rmse3, rmse4)
  ),
  col = 2:5, lty = 1, lwd = 2
)


###############################################################################################
# PART 2: Explaining the test score by the % of students eligible for free/reduced price lunch
###############################################################################################

# Explain the test score (totsc8) by the percentage of students eligible for
# free/reduced price lunch (lnch_pct) using four functional forms, overlay
# each fitted curve on the scatter plot, and compare the fits by in-sample
# RMSE measured on the level scale of y.
# NOTE(review): the file header describes the fourth-grade score, but this
# section models totsc8 -- confirm that is the intended response.

par(mfrow = c(1, 1))

# Read the columns straight from the data frame instead of relying on the
# attach()ed copies.
x <- data$lnch_pct
y <- data$totsc8

plot(x, y, ylim = c(640, 800),
     xlab = "% Eligible for free/reduced price lunch", ylab = "Test score")

# Common grid on which every fitted curve is drawn.
xxx <- seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = 1000)

# Model 1: level-level, y = b0 + b1 * x
reg1 <- lm(y ~ x)
bhat1 <- coef(reg1)
yhat1 <- bhat1[1] + bhat1[2] * x
lines(xxx, bhat1[1] + bhat1[2] * xxx, col = 2, lwd = 2)

# Model 2: log-level, log(y) = b0 + b1 * x (predictions mapped back with exp)
reg2 <- lm(log(y) ~ x)
bhat2 <- coef(reg2)
yhat2 <- exp(bhat2[1] + bhat2[2] * x)
lines(xxx, exp(bhat2[1] + bhat2[2] * xxx), col = 3, lwd = 2)

# Model 3: log-log, log(y) = b0 + b1 * log(x)
# NOTE(review): the log(x) models assume every lnch_pct value is strictly
# positive -- confirm there are no districts with 0%.
reg3 <- lm(log(y) ~ log(x))
bhat3 <- coef(reg3)
yhat3 <- exp(bhat3[1] + bhat3[2] * log(x))
lines(xxx, exp(bhat3[1] + bhat3[2] * log(xxx)), col = 4, lwd = 2)

# Model 4: level-log, y = b0 + b1 * log(x)
reg4 <- lm(y ~ log(x))
bhat4 <- coef(reg4)
yhat4 <- bhat4[1] + bhat4[2] * log(x)
lines(xxx, bhat4[1] + bhat4[2] * log(xxx), col = 5, lwd = 2)

# Root mean squared error over the observations lm() actually used, i.e.
# rows where both the response and the regressor are observed.
ok <- !is.na(y) & !is.na(x)
rmse1 <- sqrt(mean((y[ok] - yhat1[ok])^2))
rmse2 <- sqrt(mean((y[ok] - yhat2[ok])^2))
rmse3 <- sqrt(mean((y[ok] - yhat3[ok])^2))
rmse4 <- sqrt(mean((y[ok] - yhat4[ok])^2))

c(rmse1, rmse2, rmse3, rmse4)

# Build the legend from the computed values so it cannot drift out of sync
# with the fits; the statistic is a root mean squared error, so label it RMSE.
legend(
  "topright",
  legend = sprintf(
    "%s, RMSE=%.5f",
    c("nivel-nivel", "log-nivel", "log-log", "nivel-log"),
    c(rmse1, rmse2, rmse3, rmse4)
  ),
  col = 2:5, lty = 1, lwd = 2
)