########################################################################
# DATASET: Boston housing
#
# Housing Values in Suburbs of Boston
#
# The Boston data frame has 506 rows and 14 columns.
#
# This data frame contains the following columns:
#
# crim:    per capita crime rate by town.
# zn:      proportion of residential land zoned for lots over 25,000 sq.ft.
# indus:   proportion of non-retail business acres per town.
# chas:    Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# nox:     nitrogen oxides concentration (parts per 10 million).
# rm:      average number of rooms per dwelling.
# age:     proportion of owner-occupied units built prior to 1940.
# dis:     weighted mean of distances to five Boston employment centres.
# rad:     index of accessibility to radial highways.
# tax:     full-value property-tax rate per $10,000.
# ptratio: pupil-teacher ratio by town.
# black:   1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# lstat:   lower status of the population (percent).
# medv:    median value of owner-occupied homes in $1000s.
#
# Source:
# Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the demand
# for clean air. J. Environ. Economics and Management 5, 81–102.
# Belsley, D.A., Kuh, E. and Welsch, R.E. (1980) Regression Diagnostics:
# Identifying Influential Data and Sources of Collinearity.
# New York: Wiley.
########################################################################

install.packages("tree")          # only needed once
install.packages("randomForest")  # only needed once
library(randomForest)
library(MASS)
library(tree)

data(Boston)
?Boston
str(Boston)

# Linear regression (full sample)
ols = lm(medv ~ ., data = Boston)
summary(ols)
fit.ols = predict(ols, newdata = Boston)
RMSE.ols = sqrt(mean((fit.ols - Boston$medv)^2))

# Tree regression
tree = tree(medv ~ ., data = Boston)
summary(tree)
par(mfrow = c(1, 2))
plot(tree, type = "uniform")  # interpret the tree
text(tree, cex = 0.75)
fit.tree = predict(tree, newdata = Boston)
RMSE.tree = sqrt(mean((fit.tree - Boston$medv)^2))
c(RMSE.ols, RMSE.tree)

# Pruned tree (7 terminal nodes)
pruned = prune.tree(tree, best = 7)
summary(pruned)
plot(pruned, type = "uniform")  # interpret the pruned tree
text(pruned, cex = 0.75)
fit.pruned = predict(pruned, newdata = Boston)
RMSE.pruned = sqrt(mean((fit.pruned - Boston$medv)^2))
c(RMSE.ols, RMSE.tree, RMSE.pruned)

# Bagging (mtry = 13 uses all predictors at each split)
bag = randomForest(medv ~ ., data = Boston, mtry = 13)
fit.bag = predict(bag, newdata = Boston)
RMSE.bag = sqrt(mean((fit.bag - Boston$medv)^2))

# Random forest (default mtry)
rf = randomForest(medv ~ ., data = Boston)
fit.rf = predict(rf, newdata = Boston)
RMSE.rf = sqrt(mean((fit.rf - Boston$medv)^2))
c(RMSE.ols, RMSE.tree, RMSE.pruned, RMSE.bag, RMSE.rf)
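# Added sketch (not part of the original script): the pruning size above was fixed
# at best = 7; one reasonable alternative, shown here as an assumption rather than
# the author's method, is to pick the size by 10-fold cross-validation with
# cv.tree() from the tree package.
cv = cv.tree(tree, FUN = prune.tree, K = 10)   # CV deviance for each subtree size
plot(cv$size, cv$dev, type = "b",
     xlab = "Terminal nodes", ylab = "CV deviance")
best.size = cv$size[which.min(cv$dev)]         # size with the smallest CV deviance (assumed rule)
pruned.cv = prune.tree(tree, best = best.size)
fit.pruned.cv = predict(pruned.cv, newdata = Boston)
sqrt(mean((fit.pruned.cv - Boston$medv)^2))    # in-sample RMSE for comparison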
# Training and testing samples
set.seed(54321)
perc = 0.5   # proportion of observations in the training sample
B = 100      # number of replications
RMSE = matrix(0, B, 5)
par(mfrow = c(1, 1))

for (i in 1:B) {
  idx = sample(1:nrow(Boston), round(perc * nrow(Boston)))
  training = Boston[idx, ]
  test = Boston[-idx, ]

  # OLS
  ols = lm(medv ~ ., data = training)
  fit.ols = predict(ols, newdata = test)
  RMSE.ols = sqrt(mean((fit.ols - test$medv)^2))

  # Tree
  tree = tree(medv ~ ., data = training)
  fit.tree = predict(tree, newdata = test)
  RMSE.tree = sqrt(mean((fit.tree - test$medv)^2))

  # Pruned tree (keep roughly 80% of the terminal nodes)
  size = round(0.8 * summary(tree)$size)
  pruned = prune.tree(tree, best = size)
  fit.pruned = predict(pruned, newdata = test)
  RMSE.pruned = sqrt(mean((fit.pruned - test$medv)^2))

  # Bagging
  bag = randomForest(medv ~ ., data = training, mtry = 13)
  fit.bag = predict(bag, newdata = test)
  RMSE.bag = sqrt(mean((fit.bag - test$medv)^2))

  # Random forest
  rf = randomForest(medv ~ ., data = training)
  fit.rf = predict(rf, newdata = test)
  RMSE.rf = sqrt(mean((fit.rf - test$medv)^2))

  RMSE[i, ] = c(RMSE.ols, RMSE.tree, RMSE.pruned, RMSE.bag, RMSE.rf)
}

# Fraction of replications in which each method attains the smallest test RMSE
perc1 = apply(RMSE == apply(RMSE, 1, min), 2, mean)

boxplot(RMSE,
        names = c(paste("OLS=", perc1[1], sep = ""),
                  paste("TREE=", perc1[2], sep = ""),
                  paste("PRUNED=", perc1[3], sep = ""),
                  paste("BAG=", perc1[4], sep = ""),
                  paste("RF=", perc1[5], sep = "")),
        ylab = "Root MSE",
        main = paste("Train=", perc, " - ", B, " replications", sep = ""),
        ylim = range(RMSE))
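# Added sketch (not part of the original script): summarising the simulation and
# inspecting variable importance. colMeans()/sd() are base R; importance() and
# varImpPlot() come from randomForest; "rf" here is the forest from the last replication.
colnames(RMSE) = c("OLS", "TREE", "PRUNED", "BAG", "RF")
round(colMeans(RMSE), 3)       # average out-of-sample RMSE per method
round(apply(RMSE, 2, sd), 3)   # spread of the RMSE across the B replications
importance(rf)                 # increase in node purity per predictor
varImpPlot(rf, main = "Variable importance (last replication)")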