Source can be viewed on GitHub
Begin by loading the packages, helper functions, modeling data, and the fitted models to be compared.
#### Run Rscript0 to load useful packages and functions ----
source("Rscript0 - libraries.R")
source("f.Wrangle.R")
source("f.Diagnostics.R")
source("f.JMod.R")
#### Model data cleanup ----
source("Rscript1a - modeling data appends.R")
source("Rscript1b - derived variables.R")
source("Rscript1c - additional predictors.R")
#### Models from last time ----
load("logistic.models.Rdata")
### Point-in-time data file ----
sols <- read.csv("sols.dedupe.by.mo.txt", sep="\t", stringsAsFactors=F)
The sols data frame was created for a different project and doesn’t have all the columns needed to make predictions in the new model. The following fields are required (a quick way to check which are missing is sketched after the list):
Curr.Stage, Curr.Dt, Expected.Dt, Expected.Amt, Actual.Amt, Closed.In.FY, FY.Plan.Book, FY.Clear.Book, FY.Ask.Book, FY.Oral.Book, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt, Plan.Future, Clear.Future, Ask.Future, and Oral.Future.
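A quick way to check which of these columns sols is missing; the required vector below is just a convenience for this check, not part of the sourced scripts.
## Which required columns are not already in sols?
required <- c("Curr.Stage", "Curr.Dt", "Expected.Dt", "Expected.Amt", "Actual.Amt",
              "Closed.In.FY", "FY.Plan.Book", "FY.Clear.Book", "FY.Ask.Book", "FY.Oral.Book",
              "Sol.Type.Agg", "lt.Planned.Amt", "lt.Expected.Amt", "lt.Ask.Amt",
              "Plan.Future", "Clear.Future", "Ask.Future", "Oral.Future")
setdiff(required, names(sols))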
Let’s go about reshaping sols:
## Grab existing fields
sols2 <- sols %>% select(Solicitation.ID, Final.Sol.Stage, Solicitation.Stage.Desc, filename, rpt.date, Expected.Dt, Expected.Amt, Actual.Amt, Resolved.In.FY, Closed.In.FY, Solicitation.Type.Desc, Planned.Ask.Amt, Ask.Amt)
## Convert date fields
sols2 <- ToDate(sols2, c("rpt.date", "Expected.Dt"))
## Impute any missing values from mdat
# Expected
sols2$Expected.Amt[is.na(sols2$Expected.Amt)] <- left_join(sols2[is.na(sols2$Expected.Amt), c("Solicitation.ID", "Expected.Amt")], mdat[, c("Solicitation.ID", "Expected.Amt")], by="Solicitation.ID")[, 3]
# Actual
sols2$Actual.Amt[is.na(sols2$Actual.Amt)] <- left_join(sols2[is.na(sols2$Actual.Amt), c("Solicitation.ID", "Actual.Amt")], mdat[, c("Solicitation.ID", "Actual.Amt")], by="Solicitation.ID")[, 3]
# Planned
sols2$Planned.Ask.Amt[is.na(sols2$Planned.Ask.Amt)] <- left_join(sols2[is.na(sols2$Planned.Ask.Amt), c("Solicitation.ID", "Planned.Ask.Amt")], mdat[, c("Solicitation.ID", "Planned.Amt")], by="Solicitation.ID")[, 3]
# Ask
sols2$Ask.Amt[is.na(sols2$Ask.Amt)] <- left_join(sols2[is.na(sols2$Ask.Amt), c("Solicitation.ID", "Ask.Amt")], mdat[, c("Solicitation.ID", "Ask.Amt")], by="Solicitation.ID")[, 3]
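The four imputation blocks above repeat the same left_join pattern; a small helper along these lines (hypothetical, not one of the sourced functions) would cut the repetition. Here from names the mdat column to pull values from.
## Hypothetical helper: fill NAs in dat[[col]] using the matching column in mdat
ImputeFromMdat <- function(dat, col, from = col) {
  missing <- is.na(dat[[col]])
  dat[[col]][missing] <- left_join(dat[missing, c("Solicitation.ID", col)],
                                   mdat[, c("Solicitation.ID", from)],
                                   by = "Solicitation.ID")[[3]]
  dat
}
## e.g. sols2 <- ImputeFromMdat(sols2, "Planned.Ask.Amt", from = "Planned.Amt")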
## Create derived fields
sol.stage <- substr(sols2$Solicitation.Stage.Desc, 1, 2)
sols2 <- sols2 %>% mutate(
Curr.Stage = ifelse(sol.stage %in% c("1.", "2.", "3.", "3A", "3B", "3C", "4."), 1,
             ifelse(sol.stage %in% c("5.", "6."), 2,
             ifelse(sol.stage %in% c("7."), 3,
             ifelse(sol.stage %in% c("8."), 4,
             ifelse(sol.stage %in% c("9."), 5, NA))))),
FY.Plan.Book = Curr.Stage == 1 & Closed.In.FY,
FY.Clear.Book = Curr.Stage == 2 & Closed.In.FY,
FY.Ask.Book = Curr.Stage == 3 & Closed.In.FY,
FY.Oral.Book = Curr.Stage == 4 & Closed.In.FY,
Sol.Type.Agg = factor(
ifelse(Solicitation.Type.Desc == "Standard Pledge", "Standard Pledge",
ifelse(Solicitation.Type.Desc == "Outright Gift", "Outright Gift",
"Other")
)),
lt.Planned.Amt = LogTrans(Planned.Ask.Amt),
lt.Expected.Amt = LogTrans(Expected.Amt),
lt.Ask.Amt = LogTrans(Ask.Amt),
Actual.Amt.In.FY = Actual.Amt * Closed.In.FY,
Expect.In.Future.FY = Expected.Dt > FYE(rpt.date)
)
## Select needed fields
sols2 <- sols2 %>% select(Solicitation.ID, Final.Sol.Stage, Solicitation.Stage.Desc, Curr.Stage, filename, rpt.date, Expected.Dt, Planned.Ask.Amt, Ask.Amt, Expected.Amt, Actual.Amt, Actual.Amt.In.FY, Closed.In.FY, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt, Expect.In.Future.FY)
write.table(sols2, file="pit-dat.txt", sep="\t", row.names=F)
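As a sanity check on the stage mapping (my addition, not in the original scripts), the two-character stage codes can be cross-tabulated against the derived Curr.Stage:
## Cross-tabulate the raw stage codes against the derived Curr.Stage
table(substr(sols2$Solicitation.Stage.Desc, 1, 2), sols2$Curr.Stage, useNA = "ifany")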
Now that I have a point-in-time data file, I can make predictions for each discrete time point.
sols3 <- na.omit(sols2 %>% select(Solicitation.ID, Curr.Stage, rpt.date, Expected.Dt, Expected.Amt, Actual.Amt, Closed.In.FY, Expect.In.Future.FY, Sol.Type.Agg, lt.Planned.Amt, lt.Expected.Amt, lt.Ask.Amt))
## JMod predictions
jmod.preds <- JMod(curr.stage=sols3$Curr.Stage, curr.dt=sols3$rpt.date, expected.dt=sols3$Expected.Dt, expect.amt=sols3$Expected.Amt, act.amt=sols3$Actual.Amt, closed.in.fy=sols3$Closed.In.FY)
# Add the report date
jmod.preds$dt <- sols3$rpt.date
jmod.preds$id <- as.character(paste(sols3$Solicitation.ID, sols3$rpt.date, sep="."))
jmod.preds$expect <- sols3$Expected.Amt
jmod.preds <- data.frame(jmod.preds)
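Since the input file was deduped by month, each solicitation should appear at most once per report date; a quick check (assuming that holds) is that the constructed id is unique.
## TRUE if there is exactly one row per solicitation per report date
!any(duplicated(jmod.preds$id))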
## GLM predictions
PredictGLM <- function(model, data){
  # Predicted probability of closing from the fitted GLM, converted to an
  # expected dollar value by scaling the solicitation's expected amount
  data.frame(probability = predict(model, newdata=data, type="response")) %>%
    mutate(prediction = probability * data$Expected.Amt,
           # Error relative to the dollars actually booked in the fiscal year
           error = prediction - ifelse(is.na(data$Actual.Amt), 0, data$Actual.Amt) * data$Closed.In.FY,
           dt = data$rpt.date,
           expect = data$Expected.Amt)
}
# Plan
glmdat <- sols3 %>% filter(Curr.Stage == 1) %>% mutate(FY.Plan.Book = Closed.In.FY, Plan.Future = Expect.In.Future.FY)
plan.preds <- PredictGLM(glm.pln.t, glmdat)
# Clear
glmdat <- sols3 %>% filter(Curr.Stage == 2) %>% mutate(FY.Clear.Book = Closed.In.FY, Clear.Future = Expect.In.Future.FY)
clear.preds <- PredictGLM(glm.clr.t, glmdat)
# Ask
glmdat <- sols3 %>% filter(Curr.Stage == 3) %>% mutate(FY.Ask.Book = Closed.In.FY, Ask.Future = Expect.In.Future.FY)
ask.preds <- PredictGLM(glm.ask.t, glmdat)
# Oral
glmdat <- sols3 %>% filter(Curr.Stage == 4) %>% mutate(FY.Oral.Book = Closed.In.FY, Oral.Future = Expect.In.Future.FY)
oral.preds <- PredictGLM(glm.ora.t, glmdat)
# Combine
glm.preds <- rbind(plan.preds, clear.preds, ask.preds, oral.preds)
Take a look at the comparison.
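One way to build that comparison, sketched here under the assumption that both prediction objects carry a dollar-valued prediction column (the JMod output may name its field differently), is to sum predictions by model and month:
## Sketch of a JMod vs. GLM comparison, summed by month
comparison <- bind_rows(
  jmod.preds %>% transmute(model = "JMod", dt, prediction),
  glm.preds %>% transmute(model = "GLM", dt, prediction)
) %>%
  mutate(dt = DateToNthOfMonth(dt, n = 1)) %>%
  group_by(model, dt) %>%
  summarise(predicted = sum(prediction))
ggplot(comparison, aes(x = dt, y = predicted, color = model)) +
  geom_line() +
  scale_y_continuous(labels = scales::dollar)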
This includes PG solicitations; the GLM overshoots badly from FY15 on.
The GLM is clearly better pre-FY15; after that it seems like a toss-up. That makes sense, considering the JMod coefficients are revised each fiscal year while the GLM was trained on the whole dataset.
Of course, the as-yet-unmentioned advantage of a probabilistic approach is that it makes it easy to estimate confidence intervals.
## Bootstrapped GLM dataset
set.seed(2358)
rep <- 1000
boot.result <- matrix(0, nrow = nrow(glm.preds), ncol = rep)
for (i in 1:rep) {
boot.result[, i] <- (glm.preds$probability >= runif(n=nrow(glm.preds), min=0, max=1)) * glm.preds$expect
}
glm.boot <- cbind(glm.preds, data.frame(boot.result))
# Summarise by date across every bootstrapped dataset, keeping only solicitations
# with expected amounts below the threshold x (defined elsewhere in the analysis)
glm.boot <- filter(glm.boot, expect < x) %>% mutate(dt = DateToNthOfMonth(dt, n=1)) %>% group_by(dt) %>% summarise_each(funs(sum))
# Find empirical 2.5th, 50th, and 97.5th percentiles (95% CI)
boot.names <- paste("X", 1:rep, sep="")
ggdat <- cbind(ggdat, t(apply(glm.boot[, boot.names], MARGIN=1, FUN=quantile, probs=c(.025, .5, .975))))
Basically, for each solicitation, draw rep samples from the \(\text{Unif}(0,1)\) distribution; if the randomly generated number is less than or equal to the predicted probability that the solicitation will close, count it as closed and take the expected value, otherwise assume it did not close. The empirical 95% CI is bounded by the .025 and .975 quantiles of the data, summed by month.
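As a toy illustration of that logic (numbers invented for the example): a solicitation with a 70% predicted close probability and a $100,000 expected amount contributes the full amount in roughly 70% of replicates and nothing otherwise.
## Toy example: bootstrap draws for a single hypothetical solicitation
set.seed(1)
draws <- (0.7 >= runif(1000)) * 100000
mean(draws)                                 # averages out near 70,000
quantile(draws, probs = c(.025, .5, .975))  # individual draws are all-or-nothing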
Plot the results:
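A minimal sketch of the plot, assuming ggdat carries the month in dt alongside the quantile columns bound on above (named 2.5%, 50%, and 97.5%):
## Sketch: bootstrapped GLM predictions with an empirical 95% CI ribbon
ggplot(ggdat, aes(x = dt)) +
  geom_ribbon(aes(ymin = `2.5%`, ymax = `97.5%`), alpha = .25) +
  geom_line(aes(y = `50%`)) +
  scale_y_continuous(labels = scales::dollar) +
  labs(x = "Report date", y = "Predicted dollars closed")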
That’s actually not bad. Of course, it would be fairer to re-fit the GLM on FY2011-FY2015 data only and use FY2016 as an out-of-sample test set, but since FY16 is still in progress it’s probably not worth doing just yet.
I suspect that making big gains in predictive accuracy would require a different model structure, e.g. regressing directly on dollar amounts, which is probably worth trying later on.
session_info()
## Session info --------------------------------------------------------------
## setting value
## version R version 3.3.0 (2016-05-03)
## system x86_64, mingw32
## ui RTerm
## language (EN)
## collate English_United States.1252
## tz America/Chicago
## date 2016-06-09
## Packages ------------------------------------------------------------------
## package * version date source
## assertthat 0.1 2013-12-06 CRAN (R 3.3.0)
## colorspace 1.2-6 2015-03-11 CRAN (R 3.3.0)
## DBI 0.4-1 2016-05-08 CRAN (R 3.3.0)
## devtools * 1.11.1 2016-04-21 CRAN (R 3.3.0)
## digest 0.6.9 2016-01-08 CRAN (R 3.3.0)
## dplyr * 0.4.3 2015-09-01 CRAN (R 3.3.0)
## evaluate 0.9 2016-04-29 CRAN (R 3.3.0)
## formatR 1.4 2016-05-09 CRAN (R 3.3.0)
## ggplot2 * 2.1.0 2016-03-01 CRAN (R 3.3.0)
## gridExtra * 2.2.1 2016-02-29 CRAN (R 3.3.0)
## gtable 0.2.0 2016-02-26 CRAN (R 3.3.0)
## htmltools 0.3.5 2016-03-21 CRAN (R 3.3.0)
## knitr * 1.13 2016-05-09 CRAN (R 3.3.0)
## labeling 0.3 2014-08-23 CRAN (R 3.3.0)
## lazyeval 0.1.10 2015-01-02 CRAN (R 3.3.0)
## lubridate * 1.5.6 2016-04-06 CRAN (R 3.3.0)
## magrittr 1.5 2014-11-22 CRAN (R 3.3.0)
## memoise 1.0.0 2016-01-29 CRAN (R 3.3.0)
## munsell 0.4.3 2016-02-13 CRAN (R 3.3.0)
## plyr 1.8.3 2015-06-12 CRAN (R 3.3.0)
## R6 2.1.2 2016-01-26 CRAN (R 3.3.0)
## Rcpp 0.12.4 2016-03-26 CRAN (R 3.3.0)
## rmarkdown 0.9.6 2016-05-01 CRAN (R 3.3.0)
## scales * 0.4.0 2016-02-26 CRAN (R 3.3.0)
## stringi 1.0-1 2015-10-22 CRAN (R 3.3.0)
## stringr 1.0.0 2015-04-30 CRAN (R 3.3.0)
## tidyr * 0.4.1 2016-02-05 CRAN (R 3.3.0)
## withr 1.0.1 2016-02-04 CRAN (R 3.3.0)
## yaml 2.1.13 2014-06-12 CRAN (R 3.3.0)