Source can be viewed on GitHub

Loading and formatting the data

Begin by loading the models to be compared.

#### Run Rscript0 to load useful packages and functions ----
source("Rscript0 - libraries.R")
# Project helper scripts: data wrangling utilities, model diagnostics, and
# the judgment-based model (JMod) implementation used below
source("f.Wrangle.R")
source("f.Diagnostics.R")
source("f.JMod.R")
#### Model data cleanup ----
# Rebuild the modeling dataset; presumably these define `mdat`, which is used
# for imputation later in this document -- TODO confirm
source("Rscript1a - modeling data appends.R")
source("Rscript1b - derived variables.R")
source("Rscript1c - additional predictors.R")
#### Models from last time ----
# Presumably provides the fitted per-stage logistic models referenced below
# (glm.pln.t, glm.clr.t, glm.ask.t, glm.ora.t) -- verify contents of the .Rdata
load("logistic.models.Rdata")
### Point-in-time data file ----
# Monthly deduplicated solicitation snapshots, tab-delimited
sols <- read.csv("sols.dedupe.by.mo.txt", sep="\t", stringsAsFactors=F)

Data transformations and setup for the model

The sols data frame was created for a different project and doesn’t have all the columns needed to make predictions in the new model. The following fields are required:

JMod

Curr.Stage, Curr.Dt, Expected.Dt, Expected.Amt, Actual.Amt, Closed.In.FY

GLMs

FY.Plan.Book, FY.Clear.Book, FY.Ask.Book, FY.Oral.Book, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt, Plan.Future, Clear.Future, Ask.Future, Oral.Future

Let’s go about reshaping sols:

## Grab existing fields
sols2 <- sols %>% select(Solicitation.ID, Final.Sol.Stage, Solicitation.Stage.Desc, filename, rpt.date, Expected.Dt, Expected.Amt, Actual.Amt, Resolved.In.FY, Closed.In.FY, Solicitation.Type.Desc, Planned.Ask.Amt, Ask.Amt)
## Convert date fields
sols2 <- ToDate(sols2, c("rpt.date", "Expected.Dt"))
## Impute any missing values from mdat
# Pattern: join only the NA rows to mdat by Solicitation.ID, then take the
# third column of the join result (mdat's value; when both sides share a
# column name, dplyr suffixes them .x/.y and the .y column lands in position 3).
# NOTE(review): positional [, 3] indexing is fragile -- it depends on both
# inputs having exactly two columns. It also assumes Solicitation.ID is unique
# in mdat; duplicates would fan out the join and break the subassignment
# length match -- confirm against how mdat is built.
# Expected
sols2$Expected.Amt[is.na(sols2$Expected.Amt)] <- left_join(sols2[is.na(sols2$Expected.Amt), c("Solicitation.ID", "Expected.Amt")], mdat[, c("Solicitation.ID", "Expected.Amt")], by="Solicitation.ID")[, 3]
# Actual
sols2$Actual.Amt[is.na(sols2$Actual.Amt)] <- left_join(sols2[is.na(sols2$Actual.Amt), c("Solicitation.ID", "Actual.Amt")], mdat[, c("Solicitation.ID", "Actual.Amt")], by="Solicitation.ID")[, 3]
# Planned (note the mdat column is Planned.Amt, not Planned.Ask.Amt)
sols2$Planned.Ask.Amt[is.na(sols2$Planned.Ask.Amt)] <- left_join(sols2[is.na(sols2$Planned.Ask.Amt), c("Solicitation.ID", "Planned.Ask.Amt")], mdat[, c("Solicitation.ID", "Planned.Amt")], by="Solicitation.ID")[, 3]
# Ask
sols2$Ask.Amt[is.na(sols2$Ask.Amt)] <- left_join(sols2[is.na(sols2$Ask.Amt), c("Solicitation.ID", "Ask.Amt")], mdat[, c("Solicitation.ID", "Ask.Amt")], by="Solicitation.ID")[, 3]
## Create derived fields
# Stage code = first two characters of the stage description
sol.stage <- substr(sols2$Solicitation.Stage.Desc, 1, 2)
# Ordinal stage used by the models: 1 = Plan, 2 = Clear, 3 = Ask, 4 = Oral,
# 5 = booked; any unmatched (or missing) code maps to NA
stage.lookup <- c(
  "1." = 1, "2." = 1, "3." = 1, "3A" = 1, "3B" = 1, "3C" = 1, "4." = 1,
  "5." = 2, "6." = 2,
  "7." = 3,
  "8." = 4,
  "9." = 5
)
sols2 <- sols2 %>% mutate(
  Curr.Stage = unname(stage.lookup[sol.stage]),
  # Booked-this-FY indicators, one per stage the solicitation was in
  FY.Plan.Book = Curr.Stage == 1 & Closed.In.FY,
  FY.Clear.Book = Curr.Stage == 2 & Closed.In.FY,
  FY.Ask.Book = Curr.Stage == 3 & Closed.In.FY,
  FY.Oral.Book = Curr.Stage == 4 & Closed.In.FY,
  # Collapse solicitation type to the three levels the GLMs were fit on
  Sol.Type.Agg = factor(
    ifelse(Solicitation.Type.Desc == "Standard Pledge", "Standard Pledge",
    ifelse(Solicitation.Type.Desc == "Outright Gift", "Outright Gift",
    "Other"))
  ),
  # Log-transformed dollar amounts for the GLM predictors
  lt.Planned.Amt = LogTrans(Planned.Ask.Amt),
  lt.Expected.Amt = LogTrans(Expected.Amt),
  lt.Ask.Amt = LogTrans(Ask.Amt),
  # Dollars realized in the FY (0 unless the solicitation closed in the FY)
  Actual.Amt.In.FY = Actual.Amt * Closed.In.FY,
  # TRUE when the expected close date falls after this report date's FY end
  Expect.In.Future.FY = Expected.Dt > FYE(rpt.date)
)
## Select needed fields
# (The original select() listed Expected.Dt twice; it is listed once here.)
sols2 <- sols2 %>% select(Solicitation.ID, Final.Sol.Stage, Solicitation.Stage.Desc, Curr.Stage, filename, rpt.date, Expected.Dt, Planned.Ask.Amt, Ask.Amt, Expected.Amt, Actual.Amt, Actual.Amt.In.FY, Closed.In.FY, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt, Expect.In.Future.FY)
## Persist the point-in-time dataset as tab-delimited text
write.table(sols2, file="pit-dat.txt", sep="\t", row.names=F)

Comparisons

Now that I have a point-in-time data file, I can make predictions for each discrete time point.

## Complete-case modeling dataset: keep only rows with every field either
## model needs.
# NOTE(review): na.omit() also drops rows where Actual.Amt is NA, even though
# downstream code treats NA actuals as 0 -- confirm that is intended.
sols3 <- na.omit(sols2 %>% select(Solicitation.ID, Curr.Stage, rpt.date, Expected.Dt, Expected.Amt, Actual.Amt, Closed.In.FY, Expect.In.Future.FY, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt))
## JMod predictions
jmod.preds <- JMod(curr.stage=sols3$Curr.Stage, curr.dt=sols3$rpt.date, expected.dt=sols3$Expected.Dt, expect.amt=sols3$Expected.Amt, act.amt=sols3$Actual.Amt, closed.in.fy=sols3$Closed.In.FY)
# Add the report date
jmod.preds$dt <- sols3$rpt.date
# Unique row key: solicitation ID plus report date. paste() already returns
# character, so the original as.character() wrapper was redundant.
jmod.preds$id <- paste(sols3$Solicitation.ID, sols3$rpt.date, sep=".")
jmod.preds$expect <- sols3$Expected.Amt
jmod.preds <- data.frame(jmod.preds)
## GLM predictions
## Predict each solicitation's probability of closing this FY under `model`
## and convert it to an expected-dollar prediction.
##
## Args:
##   model: a fitted binomial glm whose predictors are columns of `data`
##   data:  data frame with Expected.Amt, Actual.Amt, Closed.In.FY, rpt.date,
##          plus whatever predictor columns `model` requires
## Returns a data frame with columns:
##   probability - modeled P(close in FY)
##   prediction  - probability * Expected.Amt (expected dollars booked)
##   error       - prediction minus realized dollars, where realized dollars
##                 count Actual.Amt only when Closed.In.FY (NA actuals = 0)
##   dt, expect  - report date and expected amount, carried through for plots
##
## Implemented in base R so the function does not require dplyr to be
## attached; intermediates make the `-` vs `*` precedence in `error` explicit.
PredictGLM <- function(model, data){
  probability <- predict(model, newdata=data, type="response")
  prediction <- probability * data$Expected.Amt
  realized <- ifelse(is.na(data$Actual.Amt), 0, data$Actual.Amt) * data$Closed.In.FY
  data.frame(
    probability = probability,
    prediction = prediction,
    error = prediction - realized,
    dt = data$rpt.date,
    expect = data$Expected.Amt
  )
}
## GLM predictions, one model per pipeline stage. Each stage's model was fit
## against a stage-specific booking flag and future-FY indicator, so the
## generic point-in-time columns are aliased to the names that model expects
## before predicting.
# Plan
glmdat <- mutate(filter(sols3, Curr.Stage == 1), FY.Plan.Book = Closed.In.FY, Plan.Future = Expect.In.Future.FY)
plan.preds <- PredictGLM(glm.pln.t, glmdat)
# Clear
glmdat <- mutate(filter(sols3, Curr.Stage == 2), FY.Clear.Book = Closed.In.FY, Clear.Future = Expect.In.Future.FY)
clear.preds <- PredictGLM(glm.clr.t, glmdat)
# Ask
glmdat <- mutate(filter(sols3, Curr.Stage == 3), FY.Ask.Book = Closed.In.FY, Ask.Future = Expect.In.Future.FY)
ask.preds <- PredictGLM(glm.ask.t, glmdat)
# Oral
glmdat <- mutate(filter(sols3, Curr.Stage == 4), FY.Oral.Book = Closed.In.FY, Oral.Future = Expect.In.Future.FY)
oral.preds <- PredictGLM(glm.ora.t, glmdat)
# Stack the per-stage prediction frames into one
glm.preds <- do.call(rbind, list(plan.preds, clear.preds, ask.preds, oral.preds))

Take a look at the comparison.

This includes PG solicitations; the GLM way overshoots in FY15 on.

The GLM is clearly better pre-FY15 and after that it seems like a toss-up. That makes sense considering the JMod coefficients are revised each fiscal year while the GLM was trained on the whole dataset.

Of course, the as-yet-unmentioned advantage of using a probabilistic approach is that it allows for the easy estimation of confidence intervals.

## Bootstrapped GLM dataset
set.seed(2358)
# Number of bootstrap replicates.
# NOTE(review): this name masks base::rep() in the global environment; kept
# as-is in case unshown chunks reference it, but n.boot would be safer.
rep <- 1000
# Preallocate: one row per prediction, one column per replicate
boot.result <- matrix(0, nrow = nrow(glm.preds), ncol = rep)
for (i in seq_len(rep)) {
  # Simulate each solicitation as a Bernoulli(probability) trial: if it
  # "closes" (uniform draw <= predicted probability), it contributes its
  # expected amount; otherwise 0
  boot.result[, i] <- (glm.preds$probability >= runif(n=nrow(glm.preds), min=0, max=1)) * glm.preds$expect
}
glm.boot <- cbind(glm.preds, data.frame(boot.result))
# Summarise by date across every bootstrapped dataset
# NOTE(review): `x` is not defined in this chunk -- presumably a dollar cutoff
# (e.g. excluding PG solicitations) set in an unshown chunk; confirm.
glm.boot <- filter(glm.boot, expect < x) %>% mutate(dt = DateToNthOfMonth(dt, n=1)) %>% group_by(dt) %>% summarise_each(funs(sum))
# Find empirical 2.5th, 50th, and 97.5th percentiles (95% CI).
# data.frame() named the replicate columns X1..X1000; rebuild those names.
boot.names <- paste0("X", seq_len(rep))
# NOTE(review): `ggdat` comes from an unshown plotting chunk; the quantile
# columns are appended to it for the CI ribbon -- confirm row alignment.
ggdat <- cbind(ggdat, t(apply(glm.boot[, boot.names], MARGIN=1, FUN=quantile, probs=c(.025, .5, .975))))

Basically, for each solicitation, draw rep samples from the \(\text{Unif}(0,1)\) distribution, and if the randomly generated number is less than or equal to the predicted probability the solicitation will close, count it as closed and take the expected value; otherwise assume it did not close. The empirical 95% CI is bounded by the .025 and .975 quantiles of the data, summed by month.

Plot the results:

That’s actually not bad. Of course, it would be more fair to re-fit the GLM just on FY2011-FY2015 data and use FY2016 as an out-of-sample test set, but since FY16 is still in progress it’s probably not worth doing just yet.

I suspect that to make big gains in predictive accuracy I'd need to use a different model structure, e.g. regressing directly on dollar amounts, which is probably worth trying later on.

Packages used

session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.3.0 (2016-05-03)
##  system   x86_64, mingw32             
##  ui       RTerm                       
##  language (EN)                        
##  collate  English_United States.1252  
##  tz       America/Chicago             
##  date     2016-06-09
## Packages ------------------------------------------------------------------
##  package    * version date       source        
##  assertthat   0.1     2013-12-06 CRAN (R 3.3.0)
##  colorspace   1.2-6   2015-03-11 CRAN (R 3.3.0)
##  DBI          0.4-1   2016-05-08 CRAN (R 3.3.0)
##  devtools   * 1.11.1  2016-04-21 CRAN (R 3.3.0)
##  digest       0.6.9   2016-01-08 CRAN (R 3.3.0)
##  dplyr      * 0.4.3   2015-09-01 CRAN (R 3.3.0)
##  evaluate     0.9     2016-04-29 CRAN (R 3.3.0)
##  formatR      1.4     2016-05-09 CRAN (R 3.3.0)
##  ggplot2    * 2.1.0   2016-03-01 CRAN (R 3.3.0)
##  gridExtra  * 2.2.1   2016-02-29 CRAN (R 3.3.0)
##  gtable       0.2.0   2016-02-26 CRAN (R 3.3.0)
##  htmltools    0.3.5   2016-03-21 CRAN (R 3.3.0)
##  knitr      * 1.13    2016-05-09 CRAN (R 3.3.0)
##  labeling     0.3     2014-08-23 CRAN (R 3.3.0)
##  lazyeval     0.1.10  2015-01-02 CRAN (R 3.3.0)
##  lubridate  * 1.5.6   2016-04-06 CRAN (R 3.3.0)
##  magrittr     1.5     2014-11-22 CRAN (R 3.3.0)
##  memoise      1.0.0   2016-01-29 CRAN (R 3.3.0)
##  munsell      0.4.3   2016-02-13 CRAN (R 3.3.0)
##  plyr         1.8.3   2015-06-12 CRAN (R 3.3.0)
##  R6           2.1.2   2016-01-26 CRAN (R 3.3.0)
##  Rcpp         0.12.4  2016-03-26 CRAN (R 3.3.0)
##  rmarkdown    0.9.6   2016-05-01 CRAN (R 3.3.0)
##  scales     * 0.4.0   2016-02-26 CRAN (R 3.3.0)
##  stringi      1.0-1   2015-10-22 CRAN (R 3.3.0)
##  stringr      1.0.0   2015-04-30 CRAN (R 3.3.0)
##  tidyr      * 0.4.1   2016-02-05 CRAN (R 3.3.0)
##  withr        1.0.1   2016-02-04 CRAN (R 3.3.0)
##  yaml         2.1.13  2014-06-12 CRAN (R 3.3.0)