########################################################################
#
# ORANGE JUICE 
#
# Source: Ledolter (2013) Data Mining and Business Analytics with R
#
# Description: Weekly sales data of refrigerated 64-ounce orange juice containers 
# from 83 stores in the Chicago area. There are many stores throughout the city, 
# many time periods, and also three different brands (Dominicks, MinuteMaid,
# and Tropicana). The data are arranged in rows, with each row giving the recorded
# store sales (in logarithms; logmove), as well as brand, price, presence/absence of
# feature advertisement, and the demographic characteristics of the store. 
# There are 28,947 rows in this data set. 
# The data is taken from P. Rossi’s bayesm package for R, and it has been used
# earlier in Montgomery (1997).

# store    : store number
# brand    : brand indicator
# week     : week number
# logmove  : log of the number of units sold
# feat     : feature advertisement
# price    : price
# AGE60    : percentage of the population that is aged 60 or older
# EDUC     : percentage of the population that has a college degree
# ETHNIC   : percent of the population that is black or Hispanic
# INCOME   : median income
# HHLARGE  : percentage of households with 5 or more persons
# WORKWOM  : percentage of women with full-time jobs
# HVAL150  : percentage of households worth more than $150,000
# SSTRDIST : distance to the nearest warehouse store
# SSTRVOL  : ratio of sales of this store to the nearest warehouse store
# CPDIST5  : average distance in miles to the nearest 5 supermarkets
# CPWVOL5  : ratio of sales of this store to the average of the nearest five stores
#
# Additional references:
#
# Alan L. Montgomery (1997) Creating Micro-Marketing Pricing Strategies Using 
# Supermarket Scanner Data. Marketing Science, 16, 315-337.
#
# Rossi, Allenby and McCulloch (2005) Bayesian Statistics and Marketing.
#
########################################################################
# ----------------------------------------------------------------------
# Script outline
#   1. read the orange juice data and build brand codes / dummy variables
#   2. regress log sales on price, with and without brand interactions
#   3. write five pages of graphical summaries to a PDF file
#
# The script neither clears the workspace nor attach()es the data frame:
# every column is reached through `oj`, so running it does not delete the
# caller's objects or leave extra entries on the search path.
# ----------------------------------------------------------------------

oj <- read.csv("orangejuice-chicagoarea.csv", header = TRUE)

# Fail early, with a readable message, if a column used below is absent.
missing.cols <- setdiff(c("brand", "logmove", "price"), names(oj))
if (length(missing.cols) > 0) {
  stop("orangejuice-chicagoarea.csv lacks column(s): ",
       paste(missing.cols, collapse = ", "), call. = FALSE)
}

n <- nrow(oj)

# Creating dummy variables
brand.levels <- c("dominicks", "minute.maid", "tropicana")
brand.name   <- c("Dominicks", "Minute Maid", "Tropicana")

# Brand: numeric code 1/2/3 in the order of `brand.levels`, 0 for any other
# value. It doubles as the plotting colour, and colour 0 is not drawn.
Brand <- as.numeric(match(oj$brand, brand.levels, nomatch = 0L))
if (any(Brand == 0)) {
  warning(sum(Brand == 0), " row(s) have an unrecognised brand and will ",
          "not be visible in the coloured plots", call. = FALSE)
}

# 0/1 indicators, one per brand
Dom <- as.numeric(Brand == 1)
Min <- as.numeric(Brand == 2)
Tro <- as.numeric(Brand == 3)

# Regression analysis
# reg  : separate intercept and price slope for every brand
# reg1 : one common line, brand ignored
# R2 and R21 are the corresponding R-squared values in percent.
SST  <- sum((oj$logmove - mean(oj$logmove))^2)
reg  <- lm(logmove ~ price * brand, data = oj)
reg1 <- lm(logmove ~ price, data = oj)
R2   <- 100 * (1 - sum(residuals(reg)^2) / SST)
R21  <- 100 * (1 - sum(residuals(reg1)^2) / SST)

# Intercept and slope of each brand's fitted line. Coefficients are looked
# up by name, so a missing or reordered term raises an error instead of
# silently drawing the wrong line. Dominicks is the baseline level.
b <- coef(reg)
brand.lines <- rbind(
  c(b[["(Intercept)"]],
    b[["price"]]),
  c(b[["(Intercept)"]] + b[["brandminute.maid"]],
    b[["price"]] + b[["price:brandminute.maid"]]),
  c(b[["(Intercept)"]] + b[["brandtropicana"]],
    b[["price"]] + b[["price:brandtropicana"]])
)

# Graphical summaries
main.title <- paste0(
  "Weekly sales data of refrigerated 64-ounce orange juice containers ",
  "from 83 stores in the Chicago area.\n",
  "There are many stores throughout the city, many time periods, ",
  "and also three different brands"
)

# Histogram bin edges (xx) and the finer grid for the normal curve (xxx)
xx  <- seq(-5.1, 5.1, length.out = 100)
xxx <- seq(-5.1, 5.1, length.out = 1000)

# Scatter plot of log sales against price; `...` is forwarded to plot().
sales_scatter <- function(data, ...) {
  plot(data$price, data$logmove,
       ylab = "Log of the number of units sold", xlab = "Price", ...)
}

# Brand colour key at the fixed position used on every coloured page.
add_brand_legend <- function() {
  legend(3.25, 14, legend = brand.name, col = 1:3, pch = 16, bty = "n",
         cex = 1.25)
}

# Residuals against price with a horizontal reference line at zero.
residual_scatter <- function(price, res, main) {
  plot(price, res, xlab = "Price", ylab = "Residuals", ylim = c(-5, 5))
  title(main)
  abline(h = 0, lwd = 2, col = 2)
}

# Density-scaled histogram of residuals with a normal curve (red) and a
# kernel density estimate (black) on top.
residual_hist <- function(res, main, breaks, grid) {
  shown <- range(breaks)
  # hist() stops when a value lies outside the breaks; widen only the two
  # outermost bins if needed so the regular bins are left untouched.
  if (min(res) < shown[1]) {
    breaks <- c(min(res), breaks)
  }
  if (max(res) > shown[2]) {
    breaks <- c(breaks, max(res))
  }
  hist(res, freq = FALSE, xlab = "Residuals", main = main,
       xlim = shown, breaks = breaks)
  box()
  lines(grid, dnorm(grid, 0, sd(res)), col = 2, lwd = 2)
  lines(density(res), lwd = 2)
}

pdf(file = "orangejuice-chicagoarea.pdf", width = 15, height = 10)

# `finally` closes the PDF device even when a plotting call fails, so no
# graphics device is left open and the file is always finalised.
invisible(tryCatch({
  # Page 1: all observations, no brand distinction
  par(mfrow = c(1, 1))
  sales_scatter(oj)
  title(main.title)

  # Page 2: observations coloured by brand
  par(mfrow = c(1, 1))
  sales_scatter(oj, col = Brand, pch = 16)
  title(main.title)
  add_brand_legend()

  # Page 3: brand-specific fitted lines (colours 1-3) and the pooled
  # price-only fit (colour 4)
  par(mfrow = c(1, 1))
  sales_scatter(oj, col = Brand, pch = 16)
  for (k in seq_len(nrow(brand.lines))) {
    abline(a = brand.lines[k, 1], b = brand.lines[k, 2], lwd = 4, col = k)
  }
  abline(reg1, col = 4, lwd = 4)
  add_brand_legend()
  title(main.title)
  text(0.875, 4.5, paste0("R2 = ", round(R21, 1), "% (log units on price)"))
  text(1, 4, paste0("R2 = ", round(R2, 1),
                    "% (log units on price and brand)"))

  # Page 4: residuals against price for both models
  par(mfrow = c(1, 2))
  residual_scatter(oj$price, residuals(reg1), "Log units on price")
  residual_scatter(oj$price, residuals(reg), "Log units on price and brand")

  # Page 5: residual distributions for both models
  par(mfrow = c(1, 2))
  residual_hist(residuals(reg1), "Log units on price", xx, xxx)
  residual_hist(residuals(reg), "Log units on price and brand", xx, xxx)
}, finally = dev.off()))