## ======================================
## Figure 7: Bias from omitting two-way interaction terms with Southern CA as a control group 
## ======================================

  rm(list=ls())
  
  #Assume base R is running
  my.wd <- getSrcDirectory(function(x){x})
  
  #Check for errors
  if(grepl("error", tolower(class(my.wd)[1])) | my.wd==""){
    #Try to access working directory through R Studio API
    my.wd <- tryCatch(dirname(rstudioapi::getActiveDocumentContext()$path),
                      error = function(e) e)
    
  }
  
  #Set working directory
  setwd(my.wd)

## ======================================
## Load packages and functions
## ======================================

  #Read in function to reshape data for modeling
  source("Functions/prepModelData.R")
  source("Functions/installPackageNotFound.R")
  source("Functions/formatSig.R")

  #Function to proportionally allocate counts
  source("Functions/allocateProportionalCounts.R")
  
  #Libraries for modeling
  installPackageNotFound("data.table")
  installPackageNotFound("plm")

## ======================================
## Read in data
## ======================================

  #Hospitalizations
  hosp.zip3.qtr.1983.2009 <- read.csv("../Data/hosp_zip3_quarter_1983_2009.csv", stringsAsFactors = FALSE)

  ## Drop everything but LA and SoCal
  hosp.data <- subset(hosp.zip3.qtr.1983.2009, (prop.la>0 | prop.socal > 0) & (year%in%1995:1999))
  
## ==========================
## Reshape data for modeling
## ==========================  

  ## Original JL disease selection
  dta.agg.jl.alt <- prepModelData(dta = hosp.data, 
                                  fbi.name = "jl.codes.mdc6", 
                                  control.name = "control.jl.codes", 
                                  years = 1993:2009,
                                  control.grp = TRUE,
                                  filter = FALSE)

  ## ==========================
  ## Run jl spec and triple diff models
  ## ==========================  
  
  ## J&L specification with logCount
  jl.lm.log <- lm(logCount ~ -1 + I(prop.la*as.numeric(year>=1998)) + 
                    I(prop.la*as.numeric(year>=1998)*disease.type) + 
                    as.factor(year.qtr) +
                    as.factor(geo.type.pk),
                  data=dta.agg.jl.alt)
  
  ##Save predictions
  dta.agg.jl.alt$p.jl.log <-predict(jl.lm.log, type = "response")
  dta.agg.jl.alt$p.jl.elog <- exp(predict(jl.lm.log, type = "response")) - 1
  
  ## Triple diff specification with logCount
  ddd.lm.log <- lm(logCount ~ -1 + I(prop.la*as.numeric(year>=1998)) + 
                     I(prop.la*as.numeric(year>=1998)*disease.type) + 
                     as.factor(year.qtr.type) +
                     as.factor(geo.type.pk),
                   data=dta.agg.jl.alt)
  
  ##Save predictions
  dta.agg.jl.alt$p.ddd.log <- predict(ddd.lm.log, type = "response")
  dta.agg.jl.alt$p.ddd.elog <- exp(predict(ddd.lm.log, type = "response")) - 1
  
  ## ==========================
  ## Create predictions for 0/1 binary case using posited averages 
  ## ========================== 
  #Proportion LA matrix
  la.mat <- aggregate(prop.la~geo, data = dta.agg.jl.alt, FUN = max)
  
  #Dataframe to hold predictions
  pred.data <- expand.grid(la = c(0,1),
                           disease.type = c(0,1),
                           year = sort(unique(hosp.data$year)))
  
  #Dataframe of model coefficients (DDD)
  ddd.model.data <- data.frame(coeff.names = names(ddd.lm.log$coefficients),
                               coefficients = ddd.lm.log$coefficients,
                               stringsAsFactors = FALSE)
  
  #Dataframe of model coefficients (JL)
  jl.model.data <- data.frame(coeff.names = names(jl.lm.log$coefficients),
                              coefficients = jl.lm.log$coefficients,
                              stringsAsFactors = FALSE)
  
  #Merge in proportion LA DDD
  ddd.model.data$zip3 <- as.numeric(ifelse(grepl("geo.type.pk", ddd.model.data$coeff.names), substring(ddd.model.data$coeff.names, 23, nchar(ddd.model.data$coeff.names)-2), ""))
  ddd.model.data$prop.la <- la.mat[match(ddd.model.data$zip3, la.mat$geo), "prop.la"]
  ddd.model.data$prop.la[is.na(ddd.model.data$prop.la)] <- 1
  
  #Merge in proportion LA JL
  jl.model.data$zip3 <- as.numeric(ifelse(grepl("geo.type.pk", jl.model.data$coeff.names), substring(jl.model.data$coeff.names, 23, nchar(jl.model.data$coeff.names)-2), ""))
  jl.model.data$prop.la <- la.mat[match(jl.model.data$zip3, la.mat$geo), "prop.la"]
  jl.model.data$prop.la[is.na(jl.model.data$prop.la)] <- 1
  
  #Names of year and zip coefficients
  ddd.year.coeff <- ddd.model.data$coeff.names[grepl("year.qtr.type", ddd.model.data$coeff.names)]
  jl.year.coeff <- jl.model.data$coeff.names[grepl("year.qtr", jl.model.data$coeff.names)]
  zip.coeff <- ddd.model.data$coeff.names[grepl("geo.type.pk", ddd.model.data$coeff.names)]
  
  #Set values for each predictor
  for(i in 1:nrow(pred.data)){
    
    #Proportional factor
    if(pred.data$la[i]==1){
      prop.factor <- ddd.model.data$prop.la[ddd.model.data$coeff.names%in%zip.coeff]
    
    }else{
      prop.factor <- 1-ddd.model.data$prop.la[ddd.model.data$coeff.names%in%zip.coeff]
    }
    
    #Interactions
    b1 <- pred.data$la[i] * as.numeric(pred.data$year[i] >= 1998)
    b2 <- pred.data$la[i] * as.numeric(pred.data$year[i] >= 1998) * pred.data$disease.type[i]
    
    #Years 
    year.ddd.filter <- grepl(pred.data$year[i], ddd.year.coeff)
    year.ddd.vec <- as.numeric(year.ddd.filter & substring(ddd.year.coeff, nchar(ddd.year.coeff), nchar(ddd.year.coeff))==pred.data$disease.type[i] )*0.25
    year.jl.filter <- grepl(pred.data$year[i], jl.year.coeff)
    year.jl.vec <- as.numeric(year.jl.filter)*0.25
    
    #Zips
    zip.vec <- as.numeric(substring(zip.coeff, nchar(zip.coeff), nchar(zip.coeff))==pred.data$disease.type[i])*prop.factor
    
    #Subtract reference category for 900-0
    if(pred.data$disease.type[i] == 0 & pred.data$la[i]){
      zip.vec <- zip.vec/(sum(zip.vec)+1)
    }else{
      zip.vec <- zip.vec/sum(zip.vec)
    }
    
    #Predictor vector
    pred.ddd.vec <- c(b1,b2,year.ddd.vec,zip.vec)
    pred.jl.vec <- c(b1,b2,year.jl.vec,zip.vec)
    
    #Get prediction
    this.ddd.pred <- sum(pred.ddd.vec*ddd.model.data$coefficients, na.rm = TRUE)
    dta.agg.jl.alt$p.ddd.log[dta.agg.jl.alt$geo.type.pk=="902-1" &
                               #dta.agg.jl.alt$year.qtr.type=="1992-1-1"]
                               dta.agg.jl.alt$year.qtr.type=="1993-1-1"]
    this.jl.pred <- sum(pred.jl.vec*jl.model.data$coefficients, na.rm = TRUE)
    
    #Add to dataframe
    pred.data$ddd.pred[i] <- this.ddd.pred
    pred.data$jl.pred[i] <- this.jl.pred
    
  }
  
  #Create 8 cell ddd table
  ddd.table <- aggregate(cbind(ddd.pred,jl.pred) ~la+disease.type+as.numeric(year>=1998), data = pred.data, FUN = mean)
  
  #Wide table for DDD
  ddd.table.wide <- cbind(ddd.table[ddd.table$`as.numeric(year >= 1998)`==0,c(names(ddd.table)[1:3], "ddd.pred")], 
                          ddd.table[ddd.table$`as.numeric(year >= 1998)`==1, c("ddd.pred")])
  names(ddd.table.wide) <- c("la", "disease.type", "post", "p.before", "p.after")
  ddd.table.wide$diff <- ddd.table.wide$p.after-ddd.table.wide$p.before
  
  #Calculate treatment effect
  ddd.ctrl.diff <- ddd.table.wide$diff[ddd.table.wide$la==1 & ddd.table.wide$disease.type==0] - ddd.table.wide$diff[ddd.table.wide$la==0 & ddd.table.wide$disease.type==0]
  ddd.fbi.diff <- ddd.table.wide$diff[ddd.table.wide$la==1 & ddd.table.wide$disease.type==1] - ddd.table.wide$diff[ddd.table.wide$la==0 & ddd.table.wide$disease.type==1]
  
  #Wide table for JL
  jl.table.wide <- cbind(ddd.table[ddd.table$`as.numeric(year >= 1998)`==0,c(names(ddd.table)[1:3], "jl.pred")], 
                         ddd.table[ddd.table$`as.numeric(year >= 1998)`==1, c("jl.pred")])
  names(jl.table.wide) <- c("la", "disease.type", "post", "p.before", "p.after")
  jl.table.wide$diff <- jl.table.wide$p.after-jl.table.wide$p.before
  
  #Calculate treatment effect
  jl.ctrl.diff <- jl.table.wide$diff[jl.table.wide$la==1 & jl.table.wide$disease.type==0] - jl.table.wide$diff[jl.table.wide$la==0 & jl.table.wide$disease.type==0]
  jl.fbi.diff <- jl.table.wide$diff[jl.table.wide$la==1 & jl.table.wide$disease.type==1] - jl.table.wide$diff[jl.table.wide$la==0 & jl.table.wide$disease.type==1]
  
  ## ==========================
  ## Calculate the proportional allocation for observed counts
  ## ========================== 
  #LA
  these.vars <- c("logCount")
  log.fbi.yearly <- allocateProportionalCounts(this.data = dta.agg.jl.alt, prop.var = "prop.la", vars = these.vars)
  
  #SOCAL 
  these.vars <- c("logCount")
  counts.soc <- allocateProportionalCounts(this.data = dta.agg.jl.alt, prop.var = "prop.socal", vars = these.vars)

  ## ==========================
  ## Plots (with log counts)
  ## ========================== 
  
  #Set graphing parameters
  par(mar=c(3,3,2,1), mgp=c(1.5,0.5,0), tcl=-0.3)
  m <- rbind(c(3,1,2), c(6,4,5))
  layout(m, widths = c(0.2, 0.4, 0.4), heights = c(1,1,1))
  
  #Colors for plot
  ca.color <- rgb(1,0,1,0.7)
  ca.color2 <- rgb(1,0,1,0.9)
  la.color <- rgb(1,0,0,0.6)
  la.color2 <- rgb(1,0,0,0.8)
  la.lwd <- 0.5
  ca.lwd <- 2.5
 
  #Observation window
  all.years <- 1995:1999
  pre.years <- 1995:1997
  post.years <- 1998:1999
  
  #Plot foodborne
  plot(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], 
       log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], 
       type = "n",
       lty = 5,
       lwd = 2,
       ylim = c(1.2, 1.82),
       col = ca.color,
       ylab = "log(Hospitalizations)", xlab = "Year",
       main = "Foodborne Disorders")
  
  ## Visualize difference
  polygon(c(all.years, rev(all.years)),
          c(pred.data$jl.pred[pred.data$la==0 & pred.data$disease.type==1],
            rev(log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1])),
          col=rgb(0,0,0,0.2),border=NA)
  
  ## Plot actual values
  lines(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], 
        log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1],lty=5,lwd=2,col=ca.color2)
  lines(log.fbi.yearly$year[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==1], 
        log.fbi.yearly$logCount[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==1], 
        type = "l", lty = 5, col = la.color, lwd = 1)
  abline(v=1997.5,col=rgb(0,0,0,0.6),lwd=1)
  text(1998, 1.75, "LA enacts\ngrading",cex=0.8)
  text(x = 1999, y = 1.30, label = "LA", cex = 0.9, col = "red")
  text(x = 1998.5, y = 1.5, label = "Southern CA", cex = 0.9, col = "purple")
  text(1995.5, 1.65, "Predicted")
  text(1995.4, 1.78, "Actual")
  
  #Plot JL predicted values
  lines(pred.data$year[pred.data$la==1 & pred.data$disease.type==1], 
        pred.data$jl.pred[pred.data$la==1 & pred.data$disease.type==1], type = "l", lty = 1, col = la.color,lwd=0.5)
  lines(pred.data$year[pred.data$la==0 & pred.data$disease.type==1], 
        pred.data$jl.pred[pred.data$la==0 & pred.data$disease.type==1], type = "l", lty = 1, col = ca.color2,lwd=ca.lwd)
  
  
  #Plot nonfoodborne
  plot(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], 
       type = "n",
       lty = 5,
       lwd = 2,
       ylim = c(6.4, 7.08),
       col = ca.color,
       ylab = "log(Hospitalizations)", xlab = "Year",
       main = "Digestive Disorders")
  
  ## Visualize difference
  polygon(c(all.years, rev(all.years)),
          c(pred.data$jl.pred[pred.data$la==0 & pred.data$disease.type==0],
            rev(log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0])),
          col=rgb(0,0,0,0.2),border=NA)
  
  ## Plot actual values 
  lines(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0],lty = 5, lwd = 2, col=ca.color2)
  lines(log.fbi.yearly$year[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==0], 
        log.fbi.yearly$logCount[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==0], 
        type = "l", lty = 5, col = la.color, lwd = 1)
  abline(v=1997.5,col=rgb(0,0,0,0.6),lwd=1)
  
  #Plot JL predicted values
  lines(pred.data$year[pred.data$la==1 & pred.data$disease.type==0], 
        pred.data$jl.pred[pred.data$la==1 & pred.data$disease.type==0], type = "l", lty = 1, col = la.color,lwd=0.5)
  lines(pred.data$year[pred.data$la==0 & pred.data$disease.type==0], 
        pred.data$jl.pred[pred.data$la==0 & pred.data$disease.type==0], type = "l", lty = 1, col = ca.color2,lwd=ca.lwd)
  
  ##DiD effects
  par(mar=c(1,0,0.5,0))
  plot.new()
  text(0.45, 1, label = bquote(bold(underline("J&L Specification"))), font = 2,cex=1.2,xpd=T)
  text(0, 0.9, label = "Foodborne DID", font = 2,xpd=T,pos=4)
  text(1,0.8, bquote(bar("F")["After"]^"LA"-bar("F")["Before"]^"LA" == .(formatSig(jl.table.wide$diff[jl.table.wide$disease.type==1 & jl.table.wide$la==1],2))),xpd=T,pos=2)
  text(1, 0.7, bquote(bar("FD")["After"]^"S.CA"-bar("FD")["Before"]^"S.CA" == .(formatSig(jl.table.wide$diff[jl.table.wide$disease.type==1 & jl.table.wide$la==0],2))),xpd=T,pos=2)
  text(1,0.6, bquote(delta["Foodborne"] == .(formatSig(jl.fbi.diff, 2))),xpd=T,pos=2)
  
  text(0, 0.5, label = "Digestive DID", font = 2,xpd=T,pos=4)
  text(1,0.4, bquote(bar("D")["After"]^"LA"-bar("D")["Before"]^"LA" == .(formatSig(jl.table.wide$diff[jl.table.wide$disease.type==0 & jl.table.wide$la==1],2))),xpd=T,pos=2)
  text(1,0.3, bquote(bar("FD")["After"]^"S.CA"-bar("FD")["Before"]^"S.CA" == .(formatSig(jl.table.wide$diff[jl.table.wide$disease.type==0 & jl.table.wide$la==0],2))),xpd=T,pos=2)
  text(1,0.2, bquote(delta["Digestive"] == .(formatSig(jl.ctrl.diff, 2))),xpd=T,pos=2)
  
  text(0, 0.1, label = "Treatment Effect", font = 2,xpd=T,pos=4)
  text(1, 0, label = bquote(delta["Grading"] == .(formatSig(jl.fbi.diff-jl.ctrl.diff, 2))),xpd=T,pos=2)
  
  par(mar=c(3,3,2,1), mgp=c(1.5,0.5,0), tcl=-0.3)
  
  #Plot foodborne
  plot(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], 
       type = "n",
       lty = 5,
       lwd = 2,
       ylim = c(1.2, 1.82),
       col = ca.color,
       ylab = "log(Hospitalizations)", xlab = "Year",
       main = "Foodborne Disorders")
  abline(v=1997.5,col=rgb(0,0,0,0.6),lwd=1)
  
  ## Prediction errors
  polygon(c(all.years, rev(all.years)),
          c(pred.data$ddd[pred.data$la==0 & pred.data$disease.type==1],
            rev(log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1])),
          col=rgb(0,0,0,0.2),border=NA)
  
  ## Lines
  lines(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==1], 
        lty = 5,
        lwd = 1,
        col = ca.color)
  lines(log.fbi.yearly$year[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==1], 
        log.fbi.yearly$logCount[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==1], 
        type = "l", lty = 5, col = la.color, lwd = 1)
  
  #Plot DDD predicted values
  lines(pred.data$year[pred.data$la==1 & pred.data$disease.type==1], 
        pred.data$ddd.pred[pred.data$la==1 & pred.data$disease.type==1], type = "l", lty = 1, col = la.color,lwd=0.5)
  lines(pred.data$year[pred.data$la==0 & pred.data$disease.type==1], 
        pred.data$ddd.pred[pred.data$la==0 & pred.data$disease.type==1], type = "l", lty = 1, col = ca.color,lwd=1)
  
  #Plot nonfoodborne
  plot(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], 
       type = "n",
       lty = 5,
       lwd = 2,
       ylim = c(6.4, 7.08),
       col = ca.color,
       ylab = "log(Hospitalizations)", xlab = "Year",
       main = "Digestive Disorders")
  abline(v=1997.5,col=rgb(0,0,0,0.6),lwd=1)
  polygon(c(all.years, rev(all.years)),
          c(pred.data$ddd[pred.data$la==0 & pred.data$disease.type==0],
            rev(log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0])),
          col=rgb(0,0,0,0.2),border=NA)
  lines(log.fbi.yearly$year[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], log.fbi.yearly$logCount[log.fbi.yearly$la==0 & log.fbi.yearly$disease.type==0], 
        lty = 5,
        lwd = 1,
        col = ca.color)
  lines(log.fbi.yearly$year[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==0], 
        log.fbi.yearly$logCount[log.fbi.yearly$la==1 & log.fbi.yearly$disease.type==0], 
        type = "l", lty = 5, col = la.color, lwd = 1)
  
  #Plot DDD predicted values
  lines(pred.data$year[pred.data$la==1 & pred.data$disease.type==0], 
        pred.data$ddd.pred[pred.data$la==1 & pred.data$disease.type==0], type = "l", lty = 1, col = la.color,lwd=1)
  lines(pred.data$year[pred.data$la==0 & pred.data$disease.type==0], 
        pred.data$ddd.pred[pred.data$la==0 & pred.data$disease.type==0], type = "l", lty = 1, col = ca.color,lwd=1)
  
  # #DiD effects
  par(mar=c(1,0,0.5,0))
  plot.new()
  text(0.45, 1, label = bquote(bold(underline("Triple Differences"))), font = 2,cex=1.2,xpd=T)
  text(0, 0.9, label = "Foodborne DID", font = 2,xpd=T,pos=4)
  text(1,0.8, bquote(bar("F")["After"]^"LA"-bar("F")["Before"]^"LA" == .(formatSig(ddd.table.wide$diff[ddd.table.wide$disease.type==1 & ddd.table.wide$la==1],2))),xpd=T,pos=2)
  text(1,0.7, bquote(bar("F")["After"]^"S.CA"-bar("F")["Before"]^"S.CA" == .(formatSig(ddd.table.wide$diff[ddd.table.wide$disease.type==1 & ddd.table.wide$la==0],2))),xpd=T,pos=2)
  text(01,0.6, bquote(delta["Foodborne"] == .(formatSig(ddd.fbi.diff, 2))),xpd=T,pos=2)
  
  text(0, 0.5, label = "Digestive DID", font = 2,xpd=T,pos=4)
  text(1,0.4, bquote(bar("D")["After"]^"LA"-bar("D")["Before"]^"LA" == .(formatSig(ddd.table.wide$diff[ddd.table.wide$disease.type==0 & ddd.table.wide$la==1],2))),xpd=T,pos=2)
  text(1,0.3, bquote(bar("D")["After"]^"S.CA"-bar("D")["Before"]^"S.CA" == .(formatSig(ddd.table.wide$diff[ddd.table.wide$disease.type==0 & ddd.table.wide$la==0],2))),xpd=T,pos=2)
  text(1,0.2, bquote(delta["Digestive"] == .(formatSig(ddd.ctrl.diff, 2))),xpd=T,pos=2)
  
  text(0, 0.1, label = "Treatment Effect", font = 2,xpd=T,pos=4)
  text(1, 0, label = bquote(delta["Grading"] == .(formatSig(ddd.fbi.diff-ddd.ctrl.diff, 2))),xpd=T,pos=2)
  