##########################################################################################
#
# ANOTHER HOUSE PRICE DATASET
#
##########################################################################################
#
# The data set has 546 observations and were gathered by the authors from the
# Multiple Listing Service for houses sold in Windsor, Ontario, Canada. The
# variables are defined as follows:
#
# sell = sale price of a house
# lot = the lot size of a property in square feet
# bdms = the number of bedrooms
# fb = the number of full bathrooms
# sty = the number of stories excluding basement
# drv = 1 if the house has a driveway
# rec = 1 if the house has a recreational room
# ffin = 1 if the house has a full finished basement
# ghw = 1 if the house uses gas for hot water heating
# ca = 1 if there is central air conditioning
# gar = the number of garage places
# reg = 1 if the house is located in the preferred neighbourhood of the city
#
# Data: http://qed.econ.queensu.ca/jae/1996-v11.6/anglin-gencay/readme.ag.txt
#
# Source: Paul Anglin and Ramazan Gencay, "Semiparametric Estimation of a Hedonic
# Price Function", Journal of Applied Econometrics, Vol. 11, No. 6, 1996,
# pp. 633-648.
#
##########################################################################################
#
# Breusch-Pagan test = 112.93 and White test = 44.97. chi-square(9) has a 5% critical
# value is 16.92. Since both test statistics are greater than critical value, both of
# these tests indicate that heteroskedasticity is present. Take logs of all variables:
# BP=12.96 and W=11.45 which are both less than the 5% critical value.
#
##########################################################################################
rm(list=ls())
names = c("Sale price (in 1000 reals)","Lot size (in square meters)","Number of bedrooms",
"Number of full bathrooms","Number of stories excluding basement","1=house has a driveway",
"1=house has a recreational room","1=house has a full finished basement",
"1=house uses gas for hot water heating","1=central air conditioning",
"the number of garage places","1=located in the preferred neighbourhood of the city")
lnames = c("Log sale price (in 1000 reals)","Log lot size (in square meters)",
"Log number of bedrooms","Log number of full bathrooms",
"Log number of stories excluding basement","1=house has a driveway",
"1=house has a recreational room","1=house has a full finished basement",
"1=house uses gas for hot water heating","1=central air conditioning",
"the number of garage places","1=located in the preferred neighbourhood of the city")
names1 = c("sell","lot","bdms","fb","sty","drv","rec","ffin","ghw","ca","gar","reg")
lnames1 = c("logsell","loglot","logbdms","logfb","logsty","drv","rec","ffin","ghw","ca","gar","reg")
data = read.table("houseprice-canada.txt",header=TRUE)
n = nrow(data)
# Transformations: from sq-feet to sq-meters and dollars to reals
data[,1] = data[,1]*2.21500132/1000
data[,2] = data[,2]*0.09290304
attach(data)
pdf(file="houseprice-canada.pdf",width=10,height=8)
par(mfrow=c(1,1))
for (i in 2:12){
reg1 = lm(data[,1]~data[,i])
X = cbind(1,data[,i])
y = data[,1]
coef = round(reg1$coef,3)
se = round(sqrt(diag(sum(reg1$res^2)/(n-2)*solve(t(X)%*%X))),2)
plot(data[,i],data[,1],xlab=names[i],ylab=names[1])
abline(coef,col=2,lwd=3)
title(paste("sell = ",coef[1],"+",coef[2],"*",names1[i],"\n (",se[1],") (",se[2],")",sep=""))
}
# Full model
reg2 = lm(sell ~ lot+bdms+fb+sty+drv+rec+ffin+ghw+ca+gar+reg)
yhat = reg2$fit
yhat2 = yhat^2
ehat = reg2$res
ehat2 = ehat^2
par(mfrow=c(1,2))
plot(yhat,ehat,xlab="Fitted",ylab="Residuals",main="Full model")
abline(h=0,lty=2)
plot(yhat,ehat2,xlab="Fitted",ylab="Square residuals")
abline(lm(ehat2~yhat),col=2,lwd=2)
# Pagan homocedasticity test
summary(lm(ehat2 ~ lot+bdms+fb+sty+drv+rec+ffin+ghw+ca+gar+reg))
# White (simplified) homocedasticity test
summary(lm(ehat2 ~ yhat+yhat2))
################################
# Log model
################################
for (i in 1:5)
data[,i] = log(data[,i])
lsell = log(sell)
llot = log(lot)
lbdms = log(bdms)
lfb = log(fb)
lsty = log(sty)
par(mfrow=c(1,1))
for (i in 2:12){
reg1 = lm(data[,1]~data[,i])
X = cbind(1,data[,i])
y = data[,1]
coef = round(reg1$coef,3)
se = round(sqrt(diag(sum(reg1$res^2)/(n-2)*solve(t(X)%*%X))),2)
plot(data[,i],data[,1],xlab=names[i],ylab=lnames[1])
abline(coef,col=2,lwd=3)
title(paste("logsell = ",coef[1],"+",coef[2],"*",lnames1[i],"\n (",se[1],") (",se[2],")",sep=""))
}
reg3 = lm(lsell ~ llot+lbdms+lfb+lsty+drv+rec+ffin+ghw+ca+gar+reg)
yhat = reg3$fit
yhat2 = yhat^2
ehat = reg3$res
ehat2 = ehat^2
par(mfrow=c(1,2))
plot(yhat,ehat,xlab="Fitted",ylab="Residuals",main="Full model (log)")
abline(h=0,lty=2)
plot(yhat,ehat2,xlab="Fitted",ylab="Square residuals")
abline(lm(ehat2~yhat),col=2,lwd=2)
# Pagan homocedasticity test
summary(lm(ehat2 ~ llot+lbdms+lfb+lsty+drv+rec+ffin+ghw+ca+gar+reg))
# White (simplified) homocedasticity test
summary(lm(ehat2 ~ yhat+yhat2))
dev.off()