########################################################################
#
# ORANGE JUICE
#
# Source: Ledolter (2013) Data Mining and Business Analytics with R
#
# Description: Weekly sales data of refrigerated 64-ounce orange juice containers
# from 83 stores in the Chicago area. There are many stores throughout the city,
# many time periods, and also three different brands (Dominicks, MinuteMaid,
# and Tropicana). The data are arranged in rows, with each row giving the recorded
# store sales (in logarithms; logmove), as well as brand, price, presence/absence of
# feature advertisement, and the demographic characteristics of the store.
# There are 28,947 rows in this data set.
# The data is taken from P. Rossi’s bayesm package for R, and it has been used
# earlier in Montgomery (1997).
# store : store number
# brand : brand indicator
# week : week number
# logmove : log of the number of units sold
# feat : feature advertisement
# price : price
# AGE60 : percentage of the population that is aged 60 or older
# EDUC : percentage of the population that has a college degree
# ETHNIC : percent of the population that is black or Hispanic
# INCOME : median income
# HHLARGE : percentage of households with 5 or more persons
# WORKWOM : percentage of women with full-time jobs
# HVAL150 : percentage of households worth more than $150,000
# SSTRDIST : distance to the nearest warehouse store
# SSTRVOL : ratio of sales of this store to the nearest warehouse store
# CPDIST5 : average distance in miles to the nearest 5 supermarkets
# CPWVOL5 : ratio of sales of this store to the average of the nearest five stores
#
# Additional references:
#
# Alan L. Montgomery (1997) Creating Micro-Marketing Pricing Strategies Using
# Supermarket Scanner Data. Marketing Science, 16, 315-337.
#
# Rossi, Allenby and McCulloch (2005) Bayesian Statistics and Marketing.
#
########################################################################

# Regresses log weekly orange juice sales on price (with and without a
# brand interaction) and writes scatter, residual and histogram pages to
# "orangejuice-chicagoarea.pdf".
#
# NOTE(review): the original script began with rm(list = ls()) and
# attach(oj). Both were dropped on purpose: rm() destroys whatever the
# caller had in the workspace, and attached columns are silently masked by
# any global of the same name. Columns are now read explicitly from `oj`.

# ---- Helpers ----------------------------------------------------------

# Scatter of log units sold against price; `...` is forwarded to plot().
plot_sales <- function(data, ...) {
  plot(data$price, data$logmove,
       ylab = "Log of the number of units sold", xlab = "Price", ...)
}

# Legend mapping palette colours 1:3 to the brand labels.
add_brand_legend <- function(labels) {
  legend(3.25, 14, legend = labels, col = 1:3, pch = 16, bty = "n",
         cex = 1.25)
}

# Residuals against price on a fixed [-5, 5] axis with a zero line.
plot_residuals <- function(price, res, main) {
  plot(price, res, xlab = "Price", ylab = "Residuals", ylim = c(-5, 5))
  title(main)
  abline(h = 0, lwd = 2, col = 2)
}

# Density-scaled residual histogram, overlaid with a zero-mean normal curve
# using the residual standard deviation (colour 2) and a kernel density
# estimate (default colour).
plot_residual_hist <- function(res, main, breaks, grid) {
  hist(res, probability = TRUE, xlab = "Residuals", main = main,
       xlim = range(breaks), breaks = breaks)
  box()
  lines(grid, dnorm(grid, 0, sd(res)), col = 2, lwd = 2)
  lines(density(res), lwd = 2)
}

# ---- Data -------------------------------------------------------------

oj <- read.csv("orangejuice-chicagoarea.csv", header = TRUE)
n <- nrow(oj)

# Fail early with a readable message instead of deep inside lm()/plot().
missing.cols <- setdiff(c("logmove", "price", "brand"), names(oj))
if (length(missing.cols) > 0) {
  stop("orangejuice-chicagoarea.csv lacks column(s): ",
       paste(missing.cols, collapse = ", "), call. = FALSE)
}
stopifnot(n > 0)

# Creating dummy variables
# Brand is 1 = dominicks, 2 = minute.maid, 3 = tropicana and 0 for any
# other label; it is also used as the point colour in the scatter plots.
brand.levels <- c("dominicks", "minute.maid", "tropicana")
brand.name <- c("Dominicks", "Minute Maid", "Tropicana")
Brand <- match(oj$brand, brand.levels, nomatch = 0)

# 0/1 indicators per brand (not used further in this script; kept so the
# same objects exist after the script has run).
Dom <- as.numeric(Brand == 1)
Min <- as.numeric(Brand == 2)
Tro <- as.numeric(Brand == 3)

# ---- Regression analysis ----------------------------------------------

reg <- lm(logmove ~ price * brand, data = oj)  # brand-specific lines
reg1 <- lm(logmove ~ price, data = oj)         # one pooled line

# R-squared in percent, computed from the residual sum of squares.
SST <- sum((oj$logmove - mean(oj$logmove))^2)
R2 <- 100 * (1 - sum(residuals(reg)^2) / SST)
R21 <- 100 * (1 - sum(residuals(reg1)^2) / SST)

# Coefficients are looked up by name rather than by position so each line
# is paired with the right brand colour. This assumes lm() takes dominicks
# as the reference level (the alphabetical default), exactly as the
# original positional indexing [1..6] did; a missing name fails loudly.
b <- coef(reg)
brand.intercept <- c(
  b[["(Intercept)"]],
  b[["(Intercept)"]] + b[["brandminute.maid"]],
  b[["(Intercept)"]] + b[["brandtropicana"]]
)
brand.slope <- c(
  b[["price"]],
  b[["price"]] + b[["price:brandminute.maid"]],
  b[["price"]] + b[["price:brandtropicana"]]
)

# Histogram breaks: +/-5.1 as before, widened only when a residual lies
# outside that range (hist() stops if the breaks do not span the data).
res.limit <- max(5.1, abs(residuals(reg1)), abs(residuals(reg)))
xx <- seq(-res.limit, res.limit, length.out = 100)    # bin edges
xxx <- seq(-res.limit, res.limit, length.out = 1000)  # normal-curve grid

main.title <- paste0(
  "Weekly sales data of refrigerated 64-ounce orange juice containers ",
  "from 83 stores in the Chicago area.\n",
  "There are many stores throughout the city, many time periods, ",
  "and also three different brands"
)

# ---- Graphical summaries ----------------------------------------------

pdf(file = "orangejuice-chicagoarea.pdf", width = 15, height = 10)
pdf.dev <- dev.cur()

# The device is closed in `finally` so a failing plot call does not leave
# a half-written PDF device open.
tryCatch({
  # Page 1: raw scatter.
  par(mfrow = c(1, 1))
  plot_sales(oj)
  title(main.title)

  # Page 2: scatter coloured by brand.
  plot_sales(oj, col = Brand, pch = 16)
  title(main.title)
  add_brand_legend(brand.name)

  # Page 3: coloured scatter with the three brand lines (colours 1:3) and
  # the pooled price-only line (colour 4), plus both R-squared values.
  plot_sales(oj, col = Brand, pch = 16)
  for (k in 1:3) {
    abline(brand.intercept[k], brand.slope[k], lwd = 4, col = k)
  }
  abline(coef(reg1), col = 4, lwd = 4)
  add_brand_legend(brand.name)
  title(main.title)
  text(0.875, 4.5,
       paste0("R2 = ", round(R21, 1), "% (log units on price)"))
  text(1, 4,
       paste0("R2 = ", round(R2, 1), "% (log units on price and brand)"))

  # Page 4: residuals against price, one panel per model.
  par(mfrow = c(1, 2))
  plot_residuals(oj$price, residuals(reg1), "Log units on price")
  plot_residuals(oj$price, residuals(reg), "Log units on price and brand")

  # Page 5: residual histograms, one panel per model.
  par(mfrow = c(1, 2))
  plot_residual_hist(residuals(reg1), "Log units on price", xx, xxx)
  plot_residual_hist(residuals(reg), "Log units on price and brand",
                     xx, xxx)
}, finally = invisible(dev.off(pdf.dev)))