########################################################################
# DATASET: Boston housing
#
# Housing Values in Suburbs of Boston
#
# The Boston data frame has 506 rows and 14 columns.
#
# This data frame contains the following columns:
#
# crim:    per capita crime rate by town.
# zn:      proportion of residential land zoned for lots over 25,000 sq.ft.
# indus:   proportion of non-retail business acres per town.
# chas:    Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox:     nitrogen oxides concentration (parts per 10 million).
# rm:      average number of rooms per dwelling.
# age:     proportion of owner-occupied units built prior to 1940.
# dis:     weighted mean of distances to five Boston employment centres.
# rad:     index of accessibility to radial highways.
# tax:     full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black:   1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# lstat:   lower status of the population (percent).
# medv:    median value of owner-occupied homes in $1000s.
#
# Source:
# Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand
# for clean air. J. Environ. Economics and Management 5, 81–102.
# Belsley, D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics:
# Identifying Influential Data and Sources of Collinearity.
# New York: Wiley.
########################################################################

install.packages("tree")          # only needed once
install.packages("randomForest")  # only needed once
library(randomForest)
library(MASS)
library(tree)

data(Boston)
?Boston
str(Boston)

# Linear regression (full sample)
ols = lm(medv ~ ., data = Boston)
summary(ols)
fit.ols = predict(ols, newdata = Boston)
RMSE.ols = sqrt(mean((fit.ols - Boston$medv)^2))

# Tree regression
tree = tree(medv ~ ., data = Boston)
summary(tree)
par(mfrow = c(1, 2))
plot(tree, type = "uniform")  # interpret the tree
text(tree, cex = 0.75)
fit.tree = predict(tree, newdata = Boston)
RMSE.tree = sqrt(mean((fit.tree - Boston$medv)^2))
c(RMSE.ols, RMSE.tree)

# Pruned tree (7 terminal nodes)
pruned = prune.tree(tree, best = 7)
summary(pruned)
plot(pruned, type = "uniform")  # interpret the pruned tree
text(pruned, cex = 0.75)
fit.pruned = predict(pruned, newdata = Boston)
RMSE.pruned = sqrt(mean((fit.pruned - Boston$medv)^2))
c(RMSE.ols, RMSE.tree, RMSE.pruned)

# Bagging (mtry = 13 uses all predictors at each split)
bag = randomForest(medv ~ ., data = Boston, mtry = 13)
fit.bag = predict(bag, newdata = Boston)
RMSE.bag = sqrt(mean((fit.bag - Boston$medv)^2))

# Random forest (default mtry)
rf = randomForest(medv ~ ., data = Boston)
fit.rf = predict(rf, newdata = Boston)
RMSE.rf = sqrt(mean((fit.rf - Boston$medv)^2))
c(RMSE.ols, RMSE.tree, RMSE.pruned, RMSE.bag, RMSE.rf)
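# Added sketch (not part of the original script): the pruning size above was fixed
# at best = 7; one reasonable alternative, shown here as an assumption rather than
# the author's method, is to pick the size by 10-fold cross-validation with
# cv.tree() from the tree package.
cv = cv.tree(tree, FUN = prune.tree, K = 10)   # CV deviance for each subtree size
plot(cv$size, cv$dev, type = "b",
     xlab = "Terminal nodes", ylab = "CV deviance")
best.size = cv$size[which.min(cv$dev)]         # size with the smallest CV deviance (assumed rule)
pruned.cv = prune.tree(tree, best = best.size)
fit.pruned.cv = predict(pruned.cv, newdata = Boston)
sqrt(mean((fit.pruned.cv - Boston$medv)^2))    # in-sample RMSE for comparison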
# Training and testing samples
set.seed(54321)
perc = 0.5   # proportion of observations in the training sample
B = 100      # number of replications
RMSE = matrix(0, B, 5)
par(mfrow = c(1, 1))

for (i in 1:B) {
  idx = sample(1:nrow(Boston), round(perc * nrow(Boston)))
  training = Boston[idx, ]
  test = Boston[-idx, ]

  # OLS
  ols = lm(medv ~ ., data = training)
  fit.ols = predict(ols, newdata = test)
  RMSE.ols = sqrt(mean((fit.ols - test$medv)^2))

  # Tree
  tree = tree(medv ~ ., data = training)
  fit.tree = predict(tree, newdata = test)
  RMSE.tree = sqrt(mean((fit.tree - test$medv)^2))

  # Pruned tree (keep roughly 80% of the terminal nodes)
  size = round(0.8 * summary(tree)$size)
  pruned = prune.tree(tree, best = size)
  fit.pruned = predict(pruned, newdata = test)
  RMSE.pruned = sqrt(mean((fit.pruned - test$medv)^2))

  # Bagging
  bag = randomForest(medv ~ ., data = training, mtry = 13)
  fit.bag = predict(bag, newdata = test)
  RMSE.bag = sqrt(mean((fit.bag - test$medv)^2))

  # Random forest
  rf = randomForest(medv ~ ., data = training)
  fit.rf = predict(rf, newdata = test)
  RMSE.rf = sqrt(mean((fit.rf - test$medv)^2))

  RMSE[i, ] = c(RMSE.ols, RMSE.tree, RMSE.pruned, RMSE.bag, RMSE.rf)
}

# Fraction of replications in which each method attains the smallest test RMSE
perc1 = apply(RMSE == apply(RMSE, 1, min), 2, mean)

boxplot(RMSE,
        names = c(paste("OLS=", perc1[1], sep = ""),
                  paste("TREE=", perc1[2], sep = ""),
                  paste("PRUNED=", perc1[3], sep = ""),
                  paste("BAG=", perc1[4], sep = ""),
                  paste("RF=", perc1[5], sep = "")),
        ylab = "Root MSE",
        main = paste("Train=", perc, " - ", B, " replications", sep = ""),
        ylim = range(RMSE))
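# Added sketch (not part of the original script): summarising the simulation and
# inspecting variable importance. colMeans()/sd() are base R; importance() and
# varImpPlot() come from randomForest; "rf" here is the forest from the last replication.
colnames(RMSE) = c("OLS", "TREE", "PRUNED", "BAG", "RF")
round(colMeans(RMSE), 3)       # average out-of-sample RMSE per method
round(apply(RMSE, 2, sd), 3)   # spread of the RMSE across the B replications
importance(rf)                 # increase in node purity per predictor
varImpPlot(rf, main = "Variable importance (last replication)")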