## ----include = FALSE----------------------------------------------------------
library(knitr)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE) # to supress R-CMD check

## to fold/hook the code
hook_output <- knit_hooks$get("output")
knit_hooks$set(output = function(x, options) {
  lines <- options$output.lines
  if (is.null(lines)) {
    return(hook_output(x, options))  # pass to default hook
  }
  x <- unlist(strsplit(x, "\n"))
  more <- "..."
  if (length(lines) == 1) {
    if (length(x) > lines) {
      # truncate the output, but add ....
      x <- c(head(x, lines), more)
    }
  } else {
    x <- c(if (abs(lines[1]) > 1) more else NULL,
           x[lines],
           if (length(x) > lines[abs(length(lines))]) more else NULL
    )
  }
  # paste these lines together
  x <- paste(c(x, ""), collapse = "\n")
  hook_output(x, options)
})

modern_r <- getRversion() >= "4.1.0"

## ----setup, message = F, warning = F------------------------------------------
library(injurytools)
library(dplyr)
library(stringr)
library(tidyr)
library(lme4)
library(pscl)
# library(glmmTMB)
library(MASS)
library(ggplot2)
library(gridExtra)
library(knitr)

## ----setup2, echo = F---------------------------------------------------------
# set the global theme of all plots
theme_set(theme_bw())

## ----datasetup1, warning = F--------------------------------------------------
## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "17/18"),
                                person_id     = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))
df_injuries1718 <- prepare_inj(df_injuries0   =
                                 raw_df_injuries |> filter(season == "17/18"),
                               person_id      = "player_name",
                               date_injured   = "from",
                               date_recovered = "until")
injd1718 <- prepare_all(data_exposures = df_exposures1718,
                        data_injuries  = df_injuries1718,
                        exp_unit = "matches_minutes")

## ----datasetup2, warning = F--------------------------------------------------
## 18/19
df_exposures1819 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "18/19"),
                                person_id     = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))
df_injuries1819 <- prepare_inj(df_injuries0   = 
                                 raw_df_injuries |> filter(season == "18/19"),
                               person_id      = "player_name",
                               date_injured   = "from",
                               date_recovered = "until")
injd1819 <- prepare_all(data_exposures = df_exposures1819,
                        data_injuries  = df_injuries1819,
                        exp_unit = "matches_minutes")

## ----warning = F--------------------------------------------------------------
## calculate injury summary statistics
dfs1718 <- calc_summary(injd1718, quiet = T)
dfs1819 <- calc_summary(injd1819, quiet = T)

dfs1718p <- calc_summary(injd1718, overall = FALSE, quiet = T)
dfs1819p <- calc_summary(injd1819, overall = FALSE, quiet = T)

dfsp <- bind_rows("Season 17-18" = dfs1718p,
                    "Season 18-19" = dfs1819p,
                    .id = "season")

## ----eval = F-----------------------------------------------------------------
# ## plot
# p1 <- ggplot(data = dfsp) +
#   geom_histogram(aes(x = incidence, fill = season)) +
#   facet_wrap(~season) +
#   scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
#   ylab("Number of players") +
#   xlab("Incidence (number of injuries per 100 player-match)") +
#   ggtitle("Histogram of injury incidence in each season") +
#   theme(legend.position = "none")
# 
# p2 <- ggplot(data = dfsp) +
#   geom_histogram(aes(x = burden, fill = season)) +
#   facet_wrap(~season) +
#   scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
#   ylab("Number of players") +
#   xlab("Burden (number of days lost due to injury per 100 player-match)") +
#   ggtitle("Histogram of injury burden in each season") +
#   theme(legend.position = "none")
# 
# grid.arrange(p1, p2, ncol = 1)

## ----echo = F, warning = F----------------------------------------------------
p1 <- ggplot(data = dfsp) + 
  geom_histogram(aes(x = incidence, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Incidence (number of injuries per 100 player-match)") +
  ggtitle(bquote("Histogram of injury" ~ bold("incidence") ~ "in each season")) + 
  theme(legend.position = "none")

p2 <- ggplot(data = dfsp) + 
  geom_histogram(aes(x = burden, fill = season)) + 
  facet_wrap(~season) +
  scale_fill_manual(name = "", values = c("#E7B800", "#2E9FDF")) +
  ylab("Number of players") + 
  xlab("Burden (number of days lost due to injury per 100 player-match)") +
  ggtitle(bquote("Histogram of injury" ~ bold("burden") ~ "in each season")) + 
  theme(legend.position = "none")

## ----eval = F-----------------------------------------------------------------
# theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
#                       axis.title = element_text(size = rel(1.3)),
#                       strip.text = element_text(size = rel(1.4)),
#                       plot.title = element_text(size = rel(1.4)),
#                       legend.text = element_text(size =  rel(1.3)),
#                       legend.title = element_text(size = rel(1.3)))
# p1 <- p1 + theme_counts
# p2 <- p2 + theme_counts

## ----warning = F, message = F, echo = F, fig.width = 10, fig.height = 6.8-----
theme_counts <- theme(axis.text = element_text(size = rel(1.2)),
                      axis.title = element_text(size = rel(1.3)),
                      strip.text = element_text(size = rel(1.4)),
                      plot.title = element_text(size = rel(1.4)),
                      legend.text = element_text(size =  rel(1.3)),
                      legend.title = element_text(size = rel(1.3)))
p1 <- p1 + theme_counts
p2 <- p2 + theme_counts

grid.arrange(p1, p2, ncol = 1)

## -----------------------------------------------------------------------------
## 17/18
df_exposures1718 <- prepare_exp(df_exposures0 = 
                                  raw_df_exposures |> filter(season == "17/18"),
                                person_id        = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))

dfs1718p <- dfs1718p |> 
  mutate(seasonb = "2017/2018") |> 
  ## join to have info, such as position, age, citizenship etc.
  left_join(df_exposures1718, by = c("person_id" = "person_id", 
                                     "seasonb"   = "seasonb")) |> 
  ## create positionb column 
  ## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
  mutate(positionb = factor(str_split_i(position, "_", 1)))

## -----------------------------------------------------------------------------
## quit Goalkeepers
dfs1718p <- dplyr::filter(dfs1718p, positionb != "Goalkeeper") |> 
  droplevels()

## -----------------------------------------------------------------------------
incidence_glm_pois <- glm(ncases ~ positionb, # + offset(log(totalexpo))
                          offset = log(totalexpo),
                          data = dfs1718p,
                          family = poisson)

## ----eval = F-----------------------------------------------------------------
# # incidence_glm_pois2 <- glmmTMB(formula = ncases ~ foot,
# #                                offset = log(totalexpo),
# #                                family = poisson(), data = dfs1718p)
# # summary(incidence_glm_pois2)

## -----------------------------------------------------------------------------
df_exposures <- prepare_exp(df_exposures0 = raw_df_exposures,
                                person_id     = "player_name",
                                date          = "year",
                                time_expo     = "minutes_played") |> 
  mutate(seasonb = date2season(date))

dfsp <- dfsp |> 
  mutate(seasonb = if_else(season == "Season 17-18", "2017/2018", "2018/2019")) |> 
  ## join to have info, such as position, age, citizenship etc.
  left_join(df_exposures, by = c("person_id" = "person_id",
                                 "seasonb" = "seasonb")) |> 
  ## create positionb column 
  ## (so that the categories are: Attack, Defender, Goalkeeper and Midfield)
  mutate(positionb = factor(str_split_i(position, "_", 1))) |> 
  droplevels()

## ----eval = F-----------------------------------------------------------------
# incidence_glmm_pois <- glmer(formula = ncases ~ positionb + (1 | person_id),
#                              offset = log(totalexpo),
#                              data = dfsp,
#                              family = poisson)
# # incidence_glmm_pois2 <- glmmTMB::glmmTMB(formula = ncases ~ positionb + (1 | person_id),
# #                                          offset = log(totalexpo),
# #                                          data = dfsp,
# #                                          family = poisson)

## -----------------------------------------------------------------------------
burden_glm_pois <- glm(ndayslost ~ positionb, offset = log(totalexpo), ## or ~ foot
                       data = dfs1718p,
                       family = poisson)

## -----------------------------------------------------------------------------
summary(burden_glm_pois)

## -----------------------------------------------------------------------------
cbind(estimate = exp(coef(burden_glm_pois)) * c(90*100, 1, 1), 
      exp(confint(burden_glm_pois)) * c(90*100, 1, 1)) |> # (to report per 100 player-matches)
  kable()

## ----warning = F--------------------------------------------------------------
burden_glm_nb <- glm.nb(ndayslost ~ positionb + offset(log(totalexpo)),
                           data = dfs1718p)

## -----------------------------------------------------------------------------
summary(burden_glm_nb)

## -----------------------------------------------------------------------------
burden_zinfpois <- zeroinfl(ndayslost ~ positionb | positionb,
                             offset = log(totalexpo),
                             data = dfs1718p,
                             link = "logit",
                             dist = "poisson",
                             trace = FALSE, EM = FALSE)
## Or:
# burden_zinfpois <- glmmTMB::glmmTMB(formula = ndayslost ~ 1 +  positionb,
#                                     offset = log(totalexpo),
#                                     ziformula = ~ 1 + positionb,
#                                     data = dfs1718p,
#                                     family = poisson)

## -----------------------------------------------------------------------------
summary(burden_zinfpois)

## -----------------------------------------------------------------------------
burden_zinfnb <- zeroinfl(ndayslost ~ positionb | positionb,
                             offset = log(totalexpo),
                             data = dfs1718p,
                             link = "logit",
                             dist = "negbin",
                             trace = FALSE, EM = FALSE)
## Or:
# burden_zinfnb <- glmmTMB::glmmTMB(ndayslost ~ 1 + positionb, offset = log(totalexpo),
#                                      ziformula = ~ 1 + positionb,
#                                      data = dfs1718p,
#                                      family = nbinom2)

## -----------------------------------------------------------------------------
summary(burden_zinfnb)

## -----------------------------------------------------------------------------
## pois
## predprob: for each subj, prob of observing each value
phat_pois <- predprob(burden_glm_pois) 
phat_pois_mn <- apply(phat_pois, 2, mean) 
## nb
phat_nb <- predprob(burden_glm_nb)           
phat_nb_mn <- apply(phat_nb, 2, mean) 
## zinfpois
phat_zinfpois <- predprob(burden_zinfpois)            
phat_zinfpois_mn <- apply(phat_zinfpois, 2, mean) 
## zinfnb
phat_zinfnb <- predprob(burden_zinfnb)           
phat_zinfnb_mn <- apply(phat_zinfnb, 2, mean) 

## put in a data frame
idx <- seq(1, 62, length.out = 30)
df_probs <- data.frame(phat_pois_mn     = phat_pois_mn[idx],
                       phat_nb_mn       = phat_nb_mn[idx],
                       phat_zinfpois_mn = phat_zinfpois_mn[idx],
                       phat_zinfnb_mn   = phat_zinfnb_mn[idx], 
                       x= idx) |> 
  tidyr::gather(key = "prob_type", value = "value", -x) |> 
  mutate(prob_type = factor(prob_type))

## ----fig.width = 9, fig.height = 4.8------------------------------------------
ggplot(data = dfs1718p) + 
  geom_histogram(aes(x = burden/100, after_stat(density)), 
                 breaks = seq(-0.5, 62, length.out = 30),
                 col = "black", alpha = 0.5) +
  geom_point(data = df_probs, aes(x = x, y = value, 
                                  group = prob_type, col = prob_type)) + 
  geom_line(data = df_probs, aes(x = x, y = value, 
                                 group = prob_type, col = prob_type)) + 
  ylim(c(0, 0.3)) + xlab("Injury burden") + ylab("Density") +
  scale_color_manual(name = "Model:",
                     labels = c("Negative Binomial", "Poisson",
                                "Zero-Inflated Negative Binomial",
                                "Zero-Inflated Poisson"),
                     values = c("darkblue", "chocolate", "purple", "red")) +
  ggtitle("Histogram of injury burden in 2017/2018\nwith conditional Poisson, NB, ZIP and ZINB Densities") +
  theme_counts +
  theme(legend.position = c(0.7, 0.7))

## ----echo = F, eval = F-------------------------------------------------------
# ## using base R
# with(dfs1718p, {
#   hist(ndayslost, prob = TRUE, breaks = seq(-0.5, 316.5, length.out = 30),
#        xlab = "Injury burden (number of injuries per player-season)",
#        main = "Histogram of overall injury burden\nwith conditional Poisson, NB, ZIP and ZINB Densities")
#   lines(x = idx, y = phat_pois_mn[idx], type = "b", lwd = 2, col = "black")
#   lines(x = idx, y = phat_nb_mn[idx], type = "b", lwd = 2, col = "purple")
#   lines(x = idx, y = phat_zinfpois_mn[idx], type = "b", lwd = 2, col = "red")
#   lines(x = idx, y = phat_zinfnb_mn[idx], type = "b", lwd = 2, col = "blue")
# })
# legend(250, 0.026, c("Poisson", "NB", "ZIP","ZINB"), lty = 1,
#        col = c("black", "purple", "red","blue"), lwd = 2)
# 

## -----------------------------------------------------------------------------
models <- list("Poisson model" = burden_glm_pois,
               "Negative binomial model" = burden_glm_nb,
               "Zero-inflated Poisson model" = burden_zinfpois,
               "Zero-inflated Negative Binomial model" = burden_zinfnb)

res_gof <- lapply(models, function(model) {
  aic      <- AIC(model)
  bic      <- BIC(model)
  if (class(model)[[1]] == "zeroinfl") {
    deviance <- -2*logLik(model)[[1]]
    null_model <- update(model, .~ -positionb)
    null_deviance <- -2*logLik(null_model)[[1]]
  } else {
    deviance <- model$deviance
    null_deviance <- model$null.deviance
  }
  dev_expl <- (null_deviance - deviance)/null_deviance * 100
  return(res = data.frame(aic = aic, bic = bic, dev_expl = dev_expl))
})

## -----------------------------------------------------------------------------
res_gof |>   
  bind_rows(.id = "model") |>  
  ## arrange them according to dev_expl.
  arrange(desc(dev_expl)) |> 
  knitr::kable(digits = 2,
               col.names = c("Model", "AIC", "BIC", "Deviance Explained"))

