---
title: "In-vivo study design"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{In-vivo study design}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
---
  
```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  error = TRUE,
  fig.width = 6,
  fig.height = 6
)
```
  
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
library(designit)
library(dplyr)
library(assertthat)
library(stringr)
library(tidyr)
library(purrr)
library(ggplot2)
library(cowplot)
```

```{r helper_functions, echo=FALSE}
# -----------------------------------------------------------------------------
# Helper functions used by the in vivo wrappers

DoE_multipleLevelCols <- function(df, na.rm = FALSE) {
  tmp <- dplyr::summarise(df, across(everything(), ~ dplyr::n_distinct(.x, na.rm = na.rm)))

  colnames(tmp)[tmp > 1]
}

DoE_numericalCols <- function(df, na.rm = FALSE) {
  dplyr::select(df, where(is.numeric)) |>
    colnames()
}
```

```{r scoring_functions, echo=FALSE}
# -----------------------------------------------------------------------------
# Dedicated scoring functions used by the in vivo wrappers

# Scoring function to assess more specific constraints of the type Column A == Column B
bc_misaligned <- function(batch_vars, feature_vars, weight = 1, test_data = NULL) {
  force(batch_vars)
  force(feature_vars)
  force(weight)

  assertthat::assert_that(length(batch_vars) > 0, length(batch_vars) == length(feature_vars),
    msg = "Batch and feature variables must be of equal length (>0)"
  )
  if (!is.null(test_data)) {
    batch_match <- match(batch_vars, colnames(test_data))
    feature_match <- match(feature_vars, colnames(test_data))
    assertthat::assert_that(!any(is.na(batch_match)), !any(is.na(feature_match)),
      msg = "All columns must exist in passed sample data"
    )
    assertthat::assert_that(length(intersect(batch_vars, feature_vars)) == 0,
      msg = "There should be no overlap between batch and feature variables"
    )
    # Could do more checks, like matching number of levels
  }


  function(bc) {
    samples <- bc$get_samples(include_id = TRUE, as_tibble = FALSE)
    misaligns <- 0
    for (i in seq_along(batch_vars)) {
      misaligns <- misaligns + sum(samples[[batch_vars[i]]] != samples[[feature_vars[i]]])
    }
    weight * misaligns
  }
}

# Scoring function to assess more specific constraints of the type 'Feature levels unique in Container'
bc_homogenicity_violations <- function(container_var, feature_var, weight = 1, test_data = NULL) {
  force(container_var)
  force(feature_var)
  force(weight)

  assertthat::assert_that(length(container_var) == 1, length(feature_var) == 1,
    msg = "Container and feature variables must be of length 1"
  )
  if (!is.null(test_data)) {
    container_match <- match(container_var, colnames(test_data))
    feature_match <- match(feature_var, colnames(test_data))
    assertthat::assert_that(!any(is.na(container_match)), !any(is.na(feature_match)),
      msg = "All columns must exist in passed sample data"
    )
    assertthat::assert_that(container_var != feature_var,
      msg = "Container and feature variable must be different"
    )
  }

  function(bc) {
    samples <- bc$get_samples(include_id = TRUE, as_tibble = FALSE)
    viol <- table(samples[[container_var]], samples[[feature_var]]) |>
      apply(1, function(a) {
        sum(a > 0) - 1
      }) |>
      sum()
    weight * viol
  }
}

# Scoring function with one-way-ANOVA; Only model with one descriptor variable possible yet
anova_logP <- function(batch_var, feature_var, weight = 1, test_data = NULL) {
  force(batch_var)
  force(feature_var)
  force(weight)

  assertthat::assert_that(length(batch_var) == 1, length(feature_var) == 1,
    msg = "Batch and feature variable must be of length 1"
  )
  if (!is.null(test_data)) {
    assertthat::assert_that(batch_var != feature_var, !any(is.na(match(c(batch_var, feature_var), colnames(test_data)))),
      msg = "All columns must exist in passed sample data and be different from each other"
    )
    assertthat::assert_that(is.numeric(test_data[[feature_var]]), msg = "Feature variable must be numeric")
    # assertthat::assert_that(is.factor(test_data[[batch_var]]) || is.character(test_data[[batch_var]]), msg="Batch variable must be categorical")
  }

  function(bc) {
    samples <- bc$get_samples(include_id = TRUE, as_tibble = TRUE)
    mod <- lm(samples[[feature_var]] ~ factor(samples[[batch_var]]), na.action = na.omit)

    f <- summary(mod)$fstatistic
    p <- pf(f[1], f[2], f[3], lower.tail = F)
    weight * unname(-log10(p))
  }
}
```

```{r invivo_functions, echo=FALSE}
# -----------------------------------------------------------------------------
# User callable functions for helping with the design of in vivo studies

# Assignment of treatments to an animal list
InVivo_assignTreatments <- function(animal_list, treatments,
                                    balance_treatment_vars = c(),
                                    form_cages_by = c(),
                                    n_shuffle = c(rep(5, 100), rep(3, 200), rep(2, 500), rep(1, 20000)),
                                    quiet_process = FALSE, quiet_optimize = TRUE) {
  if (is.vector(treatments)) treatments <- data.frame(Treatment = treatments)

  if (!quiet_process) message("Performing treatment assignment with constrained animal selection.")

  assertthat::assert_that("Treatment" %in% colnames(treatments), msg = "Column 'Treatment' missing from treatment object")

  # Make sure we have defined factor levels for categorical variables
  treatments_sortedfac <- dplyr::mutate(treatments, across(where(is_character), as.factor)) # here we want alphabetic sort order!
  treatments <- dplyr::mutate(treatments, across(where(is_character), function(v) factor(v, levels = unique(v)))) # Keep orig sort order in treatment object!

  # Create a CageGroup variable if not yet provided in animal list or overridden by specific argument
  if (!is.null(form_cages_by) && length(form_cages_by) > 0) {
    animal_list <- tidyr::unite(animal_list, "CageGroup", any_of(form_cages_by), sep = "_", remove = F)
  }

  # Do we have batch container columns that are multi-value and have to align with identically named sample columns?
  # In this case, make those columns factors, check that numbers match and thus a solution is possible at all
  # Append '_bc' to the common variables in the batch container to allow both sets of variables in the same container
  bc_columns <- intersect(colnames(treatments), colnames(animal_list)) |> intersect(DoE_multipleLevelCols(treatments))

  ani_fac <- dplyr::mutate(animal_list, across(all_of(bc_columns), as.factor)) # here we want alphabetic sort order!

  if (length(bc_columns) > 0) {
    if (!quiet_process) {
      message(
        "Using constraints in variables: ", stringr::str_c(bc_columns, collapse = ", "),
        "\nChecking if solution is possible:"
      )
    }

    ani_counts <- dplyr::count(as_tibble(ani_fac), across(all_of(bc_columns))) |> dplyr::arrange(across(all_of(bc_columns)))
    treat_counts <- dplyr::count(as_tibble(treatments_sortedfac), across(all_of(bc_columns))) |> dplyr::arrange(across(all_of(bc_columns)))
    assertthat::assert_that(identical(ani_counts, treat_counts), msg = "Levels of common variables don't match between animal list and treatment list")
    if (!quiet_process) message("   ... Yes!")
    for (i in seq_along(bc_columns)) { # add bc_ prefix to constraint variables to allow both variable sets in batch container
      treatments_sortedfac <- dplyr::rename(treatments_sortedfac, !!stringr::str_c(bc_columns[i], "_bc") := bc_columns[i])
    }
  }

  if (!quiet_process) message("Setting up batch container.")

  # Set up batch container. Now it would be VERY handy to be able to just pass the treatment object directly. ;)
  # Instead, have to set up dimension vector and exclude table to reflect the valid container positions.
  n_lev <- dplyr::summarize(treatments_sortedfac, across(everything(), nlevels))
  all_fac_combs <- dplyr::count(treatments_sortedfac, across(everything()), .drop = F)
  positions <- max(all_fac_combs$n)

  dimension_vector <- c(setNames(as.integer(n_lev), nm = colnames(n_lev)), Position = positions)

  exclude_table <- tidyr::expand_grid(dplyr::select(all_fac_combs, -c("n")), Position = 1:positions) |>
    dplyr::left_join(all_fac_combs, by = setdiff(colnames(all_fac_combs), "n")) |>
    dplyr::filter(Position > n) |>
    dplyr::select(-c("n")) |>
    dplyr::mutate(across(everything(), as.integer))


  # Prepare all constraint columns to hold integer level numbers --> comparable to batch container
  ani_bclevels <- dplyr::mutate(ani_fac, across(all_of(bc_columns), as.integer))

  # Initial batch container, treatments will be assigned in first step
  bc_treatment <- BatchContainer$new(
    dimensions = dimension_vector,
    exclude = exclude_table
  )

  bc_treatment <- assign_in_order(bc_treatment, ani_bclevels)

  if (!quiet_process) message("Constructing scoring functions:")

  # Set up required scoring functions; most relevant ones come first to allow later application of relevance ranking
  scoring_functions <- list()
  bc_data <- bc_treatment$get_samples()

  if (length(bc_columns) > 0) {
    if (!quiet_process) {
      message(
        "     ... user specified treatment allocation constraint (Treatment-",
        stringr::str_c(bc_columns, collapse = "-"), ")"
      )
    }
    scoring_functions <- c(scoring_functions, trt_constraints = bc_misaligned(
      batch_vars = str_c(bc_columns, "_bc"),
      feature_vars = bc_columns,
      weight = 1,
      test_data = bc_data
    ))
  }

  # This constraint is there to restrict number of distinct treatments within the cage groups, in order to find
  # viable solutions if treatment should be homogeneous in every cage
  if ("Treatment" %in% colnames(bc_data) && "CageGroup" %in% colnames(bc_data) &&
    "CageGroup" %in% DoE_multipleLevelCols(bc_data)) {
    if (!quiet_process) message("     ... facilitating homogeneity of treatment in cages (CageGroup)")
    scoring_functions <- c(scoring_functions, cage_trt_homogeneity = bc_homogenicity_violations(
      container_var = "CageGroup",
      feature_var = "Treatment",
      weight = 1,
      test_data = bc_data
    ))
  }

  trt_vars_cat <- intersect(balance_treatment_vars, colnames(bc_data)) |>
    setdiff(bc_columns) |>
    intersect(DoE_multipleLevelCols(bc_data)) |>
    setdiff(DoE_numericalCols(bc_data))
  if (length(trt_vars_cat) > 0) {
    if (!quiet_process) message("     ... OSAT for categorical variables balanced across treatment (", stringr::str_c(trt_vars_cat, collapse = ", "), ")")
    scoring_functions <- c(scoring_functions, balanced_categorical = osat_score_generator(batch_vars = "Treatment", feature_vars = trt_vars_cat))
  }

  trt_vars_num <- intersect(balance_treatment_vars, colnames(bc_data)) |>
    setdiff(bc_columns) |>
    intersect(DoE_multipleLevelCols(bc_data)) |>
    intersect(DoE_numericalCols(bc_data))
  if (length(trt_vars_num) > 0) {
    if (!quiet_process) message("     ... ANOVA -logP for numerical variables balanced across treatment (", stringr::str_c(trt_vars_num, collapse = ", "), ")")
    sf <- list()
    for (i in seq_along(trt_vars_num)) {
      sf <- c(sf, anova_logP(batch_var = "Treatment", feature_var = trt_vars_num[i], test_data = bc_data))
    }
    names(sf) <- stringr::str_c("balanced_", trt_vars_num)
    scoring_functions <- c(scoring_functions, sf)
  }

  assertthat::assert_that(length(scoring_functions) > 0, msg = "No variables for scoring found or all have only one level. Nothing to do.")
  bc_treatment$score(scoring_functions)

  bc_treatment <- optimize_design(
    bc_treatment,
    scoring = scoring_functions,
    n_shuffle = n_shuffle,
    acceptance_func = ~ accept_leftmost_improvement(..., tolerance = 0.1),
    quiet = quiet_optimize
  )

  # Check if user given constraints (if provided) could be satisfied
  if ("trt_constraints" %in% names(bc_treatment$score(scoring_functions))) {
    if (bc_treatment$score(scoring_functions)[["trt_constraints"]] > 0) {
      message("CAUTION: User defined constraints could not be fully met (remaining score ", bc_treatment$score(scoring_functions)[["trt_constraints"]], ")")
    } else {
      if (!quiet_process) message("Success. User provided constraints could be fully met.")
    }
  }

  design_trt <- bc_treatment$get_samples()

  # Translate back integer codes for 'used' factors into meaningful labels and remove intermediate helper columns
  design_trt[["Treatment"]] <- levels(treatments_sortedfac[["Treatment"]])[design_trt[["Treatment"]]]
  for (i in seq_along(bc_columns)) {
    design_trt[[bc_columns[i]]] <- levels(treatments_sortedfac[[stringr::str_c(bc_columns[i], "_bc")]])[design_trt[[bc_columns[i]]]]
  }

  dplyr::select(design_trt, -all_of(ends_with("_bc"))) |>
    dplyr::select(-any_of(c("Position")))
}

# Form cages from animal list with treatment groups assigned

Invivo_assignCages <- function(design_trt,
                               cagegroup_vars,
                               unique_vars = c(),
                               balance_cage_vars = c(),
                               n_min = 2, n_max = 5, n_ideal = 2, prefer_big_groups = TRUE, strict = TRUE,
                               maxiter = 5e3,
                               quiet_process = FALSE, quiet_optimize = TRUE) {

  # Set up batch container
  # We just need the subgrouping functionality of shuffle_grouped_data(), so the obligatory assignment variable
  # is just a dummy with one factor level. (Initially thought that Treatment would be assigned at this step, too.)

  if (!quiet_process) message("Setting up batch container.")

  bc_cage <- BatchContainer$new(
    dimensions = c("Dummy" = 1, ID = nrow(design_trt))
  )
  bc_cage <- assign_in_order(bc_cage, design_trt)

  shuffle_proposal <- shuffle_grouped_data(bc_cage,
    allocate_var = "Dummy",
    keep_together_vars = cagegroup_vars,
    keep_separate_vars = unique_vars,
    subgroup_var_name = "Cage",
    report_grouping_as_attribute = TRUE,
    n_min = n_min, n_max = n_max, n_ideal = n_ideal,
    prefer_big_groups = prefer_big_groups, strict = strict
  )

  if (!quiet_process) {
    message(
      "\nExpecting ", dplyr::n_distinct(shuffle_proposal()$samples_attr$Cage),
      " cages to be created and ", sum(table(shuffle_proposal()$samples_attr$Cage) == 1),
      " single-housed animals.\n"
    )
  }

  if (!quiet_process) message("Constructing scoring functions:")

  # Set up required scoring functions; most relevant ones come first to allow later application of relevance ranking
  scoring_functions <- list()
  bc_data <- bc_cage$get_samples()

  trt_vars_cat <- intersect(balance_cage_vars, colnames(bc_data)) |>
    intersect(DoE_multipleLevelCols(bc_data)) |>
    setdiff(DoE_numericalCols(bc_data))
  if (length(trt_vars_cat) > 0) {
    if (!quiet_process) message("     ... OSAT for categorical variables balanced across cages (", stringr::str_c(trt_vars_cat, collapse = ", "), ")")
    scoring_functions <- c(scoring_functions, balanced_categorical = osat_score_generator(batch_vars = "Cage", feature_vars = trt_vars_cat))
  }

  trt_vars_num <- intersect(balance_cage_vars, colnames(bc_data)) |>
    intersect(DoE_multipleLevelCols(bc_data)) |>
    intersect(DoE_numericalCols(bc_data))
  if (length(trt_vars_num) > 0) {
    if (!quiet_process) message("     ... ANOVA -logP for numerical variables balanced across cages (", stringr::str_c(trt_vars_num, collapse = ", "), ")")
    sf <- list()
    for (i in seq_along(trt_vars_num)) {
      sf <- c(sf, anova_logP(batch_var = "Cage", feature_var = trt_vars_num[i]))
    }
    names(sf) <- stringr::str_c("balanced_", trt_vars_num)
    scoring_functions <- c(scoring_functions, sf)
  }

  if (length(scoring_functions) == 0) {
    if (!quiet_process) message("     ... just a dummy score as there are no user provided balancing variables")
    scoring_functions <- osat_score_generator(batch_vars = "Dummy", feature_vars = c("Treatment"))
  }

  bc_cage <- optimize_design(
    bc_cage,
    scoring = scoring_functions,
    shuffle_proposal_func = shuffle_proposal,
    acceptance_func = accept_leftmost_improvement,
    max_iter = maxiter,
    sample_attributes_fixed = F,
    quiet = quiet_optimize
  )

  bc_cage$get_samples() |>
    dplyr::select(-any_of(c("Dummy", "ID", "alloc_var_level", "group", "subgroup")))
}

# Arrange cages in a rack

Invivo_arrangeCages <- function(design_cage,
                                distribute_cagerack_vars = "Treatment",
                                rack_size_x = 4,
                                rack_size_y = 4,
                                n_shuffle = c(rep(5, 100), rep(3, 400), rep(2, 500), rep(1, 4000)),
                                quiet_process = FALSE, quiet_optimize = TRUE) {

  # How many racks do we need?
  nr_cages <- dplyr::n_distinct(design_cage[["Cage"]])
  nr_racks <- ceiling(nr_cages / (rack_size_x * rack_size_y))

  # Identify a minimal n x m rectangle to use from every rack
  approx_cages_per_rack <- ceiling(nr_cages / nr_racks)
  n_cage_x <- rack_size_x
  n_cage_y <- rack_size_y
  may_shrink <- TRUE
  while (may_shrink) {
    may_shrink <- FALSE
    if ((n_cage_y - 1) * n_cage_x >= approx_cages_per_rack) {
      n_cage_y <- n_cage_y - 1
      may_shrink <- TRUE
    }
    if ((n_cage_x - 1) * n_cage_y >= approx_cages_per_rack) {
      n_cage_x <- n_cage_x - 1
      may_shrink <- TRUE
    }
  }

  if (!quiet_process) message("Needing ", nr_racks, " rack", ifelse(nr_racks == 1, "", "s"), " with a grid of ", n_cage_x, " x ", n_cage_y, " cages.")

  empty <- nr_racks * (n_cage_x * n_cage_y) - nr_cages
  if (!quiet_process && empty > 0) message("There will be ", empty, " empty position", ifelse(empty == 1, "", "s"), " overall.")

  if (!quiet_process) message("Setting up batch container.")

  design_rack <- dplyr::select(design_cage, all_of(c("Cage", distribute_cagerack_vars))) |> dplyr::distinct()
  assertthat::assert_that(!any(duplicated(design_rack[["Cage"]])), msg = "'Distribute rack' variables are not homogeneous within cages")

  bc_rack <- BatchContainer$new(
    dimensions = c(Rack = nr_racks, CageRow = n_cage_x, CageCol = n_cage_y)
  )

  bc_rack <- assign_random(bc_rack, design_rack)

  # Firstly, distribute variables across racks if necessary
  if (nr_racks > 1) {
    if (!quiet_process) {
      message(
        " \nDistributing target variables evenly across racks (OSAT score for ",
        stringr::str_c(distribute_cagerack_vars, collapse = ", "), ")"
      )
    }

    scoring_functions <- list(across_rack = osat_score_generator(batch_vars = c("Rack"), feature_vars = distribute_cagerack_vars))

    bc_rack <- optimize_design(
      bc_rack,
      scoring = scoring_functions,
      quiet = quiet_optimize,
      min_score = 0, max_iter = 1e3,
      n_shuffle = 2,
      acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 0.5))
    )

    if (!quiet_process) message("   ... final score: ", bc_rack$score(scoring_f))
  }

  if (!quiet_process) {
    message(
      "\nDistributing target variables (", stringr::str_c(distribute_cagerack_vars, collapse = ", "),
      ") within rack", ifelse(nr_racks == 1, "", "s")
    )
  }

  scoring_functions <- list()
  for (tv in distribute_cagerack_vars) {
    scoring_functions <- c(scoring_functions, mk_plate_scoring_functions(bc_rack,
      plate = "Rack", row = "CageRow", column = "CageCol",
      group = tv, penalize_lines = "none"
    ))
  }
  names(scoring_functions) <- stringr::str_c(names(scoring_functions), rep(distribute_cagerack_vars, each = nr_racks), sep = "_")

  for (i in 1:nr_racks) {
    if (!quiet_process) message("   ... Rack ", i)

    bc_rack <- optimize_design(
      bc_rack,
      scoring = scoring_functions,
      shuffle_proposal_func = mk_subgroup_shuffling_function(
        subgroup_vars = "Rack",
        restrain_on_subgroup_levels = c(i),
        n_swaps = n_shuffle
      ),
      # aggregate_scores_func = sum_scores,  # L2s aggregation doesn't make sense for autoscaled scores!
      # acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 0.5)),
      autoscale_scores = TRUE,
      acceptance_func = ~ accept_leftmost_improvement(..., tolerance = 0.1),
      quiet = quiet_optimize
    )
  }

  if (!quiet_process) message("   ... final scores: ", paste(names(bc_rack$score(scoring_functions)), round(bc_rack$score(scoring_functions), 2), sep = ": ", collapse = ", "))

  # Translate Rack numbers to some text output and assign CageNr
  design_rack <- bc_rack$get_samples() |>
    dplyr::filter(!is.na(Cage)) |> # strip empty locations in rack
    dplyr::arrange(Rack, CageRow, CageCol) |>
    dplyr::mutate(
      CageNr = 1:nr_cages,
      Rack = stringr::str_c("Rack ", Rack)
    )

  # Join animal list with cage information and add Group variable needed for the ScoreSheet generator
  dplyr::inner_join(design_cage, design_rack, by = intersect(colnames(design_cage), colnames(design_rack)))
}
```

```{r plotting_functions, echo=FALSE}
Invivo_plotByRack <- function(design, colorBy = "Treatment", showAnimals = FALSE, animalLabel = "AnimalID", showLegend = FALSE) {
  ps <- list()

  paste_animals <- function(animalids, earmarks) {
    if (any(earmarks != "")) {
      return(stringr::str_c(animalids, " (", earmarks, ")", collapse = "\n"))
    }
    stringr::str_c(animalids, collapse = "\n")
  }

  if (!"Earmark" %in% colnames(design)) {
    design$Earmark <- "-"
  }

  if (!"Rack" %in% colnames(design)) {
    design$Rack <- "Rack 1"
  }

  for (rack in unique(design$Rack)) {
    design_f <- dplyr::filter(design, Rack == rack)
    design_u <- dplyr::select(design_f, all_of(c("CageNr", "CageRow", "CageCol", colorBy))) |> dplyr::distinct()

    p <- ggplot2::ggplot(design_f, aes(x = NA, y = NA)) +
      facet_grid(CageRow ~ CageCol) +
      geom_raster(data = design_u, aes(fill = !!as.symbol(colorBy)), alpha = 0.9) +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()
      )

    if (showAnimals) {
      cage_list <- dplyr::group_by(design_f, across(all_of(c("CageNr", "CageRow", "CageCol")))) |>
        dplyr::summarize(Animals = paste_animals(!!as.symbol(animalLabel), Earmark), .groups = "drop")
      p <- p + geom_text(data = cage_list, aes(label = Animals), size = 3) +
        theme(legend.position = "none") +
        labs(title = rack, subtitle = paste("Animals labeled by", animalLabel, "(Earmark)"))
    } else {
      p <- p + geom_text(data = design_u, aes(label = CageNr), size = 3) +
        labs(title = rack, fill = "", subtitle = paste("Cage by", colorBy))
      if (showLegend) {
        p <- p + theme(legend.position = "bottom")
      } else {
        p <- p + theme(legend.position = "none")
      }
    }

    ps[[rack]] <- p
  }

  ps
}
```

# Purpose of vignette

This example demonstrates how recurring complex design problems of similar
structure may be handled by writing dedicated wrappers that use designIt
functionality in the background while presenting a simplified interface to the
user. These wrapper functions may completely hide the construction of batch
containers, scoring functions and other fundamental package concepts from the
user, allowing to focus on the correct specification of the concrete design task
at hand.

We are using the very specific design constraints of certain *in vivo* studies
as an example. The implementation of the respective wrapper functions won't be
discussed here, but code may be inspected in the .Rmd file of the vignette if
desired.

# Dataset and design task

We would like to assign 3 treatment conditions to a cohort of 59 animals,
representing 2 relevant strains. There are a few concrete user specified
constraints for the study, on top we have to avoid confounding by common
variables such as animal sex, body weight and age.

The animal information is provided in a sample sheet, the treatment list has to
be stored separately. The example data we are looking at is included in the
package.

```{r}
data("invivo_study_samples")
data("invivo_study_treatments")
```

## The animal (sample) sheet

```{r echo=TRUE}
str(invivo_study_samples)

invivo_study_samples |>
  dplyr::count(Strain, Sex, BirthDate) |>
  gt::gt()
```

A simple data summary reveals that the cohort is almost equally composed of
Strains A and B. There are male and female animals in quite different
proportions, with a noticeable excess of the males. Birth dates are available
for Strain A, but missing completely for Strain B.

Initial body weights (arrival weights), identifying ear marks and litter
information are available for all animals. The litter is nested within the
strain and all individuals within one litter naturally share one birth date.

```{r echo=TRUE}
invivo_study_samples |>
  dplyr::count(Strain, Litter, BirthDate) |>
  gt::gt()
```

## Treatment list

```{r echo=TRUE}
str(invivo_study_treatments)

invivo_study_treatments |>
  dplyr::count(Treatment, Strain, Sex) |>
  gt::gt()
```

We have 3 treatments that should each be administered to a defined number of
animals. In addition, some satellite animals of either strain will not receive
any treatment at all, which is specified by a fourth ('untreated') condition.

In most cases the treatment list could be reduced to the first column, i.e.
repeating each label for the right number of times so that the total length
matches the sample sheet.

However, additional study specific constraints may be specified by adding
columns that also appear in the animal list and indicate how the treatments
should be assigned to subgroups of the cohort. In this example, a different
number of animals is used for each of the treatments, balanced across strains.
However, female animals are only to be used for treatment 2.

## Design constraints and data preparation

The specific constraints for our type of *in vivo* study may be summarized as
follows:

* We want to form cages, each hosting ideally 3 animals (preferred range from 2-5)
* Strain, Sex and Treatment must be homogeneous within a cage
* Males from different litters must not be put into the same cage; litter mixing
  is possible however for female animals!
* Average body weight and age composition should be comparable between treatment
  groups and cages
* If at all possible, we avoid putting animals with identical ear markings into
  the same cage
* The distribution of treatments across animal subgroups (if specified by the
  treatment list!) has to be respected

The very special and intricate nature of these requirements motivate th
creation of dedicated functionality on top of this package, as demonstrated by
this vignette.

Before using these functions, we add two auxiliary columns to the sample sheet:

* **AgeGroup** represents the different birth dates as an integer variable, where
  unknown (NA) values get their own code.
* **Litter_combine_females** groups all female animals in a pseudo litter,
  facilitating the assignment of animals to cages at which point only the
  females can be freely combined (co-housed). 

```{r echo=TRUE}
invivo_study_samples <- dplyr::mutate(invivo_study_samples,
  AgeGroup = as.integer(factor(BirthDate, exclude = NULL)),
  Litter_combine_females = ifelse(Sex == "F", "female_all", Litter)
)

invivo_study_samples |>
  dplyr::count(Strain, Litter_combine_females, BirthDate, AgeGroup) |>
  gt::gt()
```

# Design steps

The process of solving the design problem can be divided into 3 successive steps,
each of which is addressed by a specific *in vivo*-specific wrapper function.

1. Assign treatments to individuals animals (function **InVivo_assignTreatments()**)

2. Allocate animals to cages (function **Invivo_assignCages()**)

3. Arrange cages in one or more racks of given dimension (function **Invivo_arrangeCages()**)


Dedicated constraints have to be handled at each step, as is reflected in the
interface of those wrappers.

As stated above, implementation details are beyond the scope of this example.
We will instead just show the interfaces of the three wrappers,
run the example case and visualize the resulting design.

## Assign treatments to animal list

```{r echo=TRUE, eval=FALSE}
InVivo_assignTreatments <- function(animal_list, treatments,
                                    balance_treatment_vars = c(),
                                    form_cages_by = c(),
                                    n_shuffle = c(rep(5, 100), rep(3, 200), rep(2, 500), rep(1, 20000)),
                                    quiet_process = FALSE, quiet_optimize = TRUE) {
  (...)
}
```

The function works with the initial animal and treatment lists.

Most importantly, **balance_treatment_vars** lists the variables that should be
balanced across treatments (e.g. strain, sex, body weight, age, litter).
Different scoring functions will be created for categorical and numerical
covariates.

**form_cages_by** is not mandatory, but gives important clues regarding the
variables that will later be homogeneous within each cage (e.g. strain, sex,
litter). Providing this may be crucial for finding good solutions with a low
number of single-housed animals that don't fit into any other cage.

It is also possible to modify the shuffling protocol and toggle messaging on the
level of processing steps as well as optimization iterations.

## Populate cages

```{r echo=TRUE, eval=FALSE}
Invivo_assignCages <- function(design_trt,
                               cagegroup_vars,
                               unique_vars = c(),
                               balance_cage_vars = c(),
                               n_min = 2, n_max = 5, n_ideal = 2, prefer_big_groups = TRUE, strict = TRUE,
                               maxiter = 5e3,
                               quiet_process = FALSE, quiet_optimize = TRUE) {
  (...)
}
```

This wrapper takes the output of the previous step ('design_trt') as input. 

* **cagegroup_vars** is a list of variables that must be uniform within each cage
  (e.g. treatment", strain, sex, litter).
* **unique_vars** is a list of variables whose values should be unique per cage
  (e.g. ear marking). This constraint will be relaxed in a stepwise way if no
  solution can be found under strict adherence.
* **balance_cage_vars** lists variables which should be evenly distributed
  across cages, as far as possible (e.g. age,
  body weight).
* **n_min**, **n_max** and **n_ideal** specify the minimal, maximal and ideal
  cage sizes, respectively. It is often necessary
  to release the **strict** criterion to find any solution at all or reduce the
  number of remaining single-housed animals.

## Arrange cages in rack(s)

```{r echo=TRUE, eval=FALSE}
Invivo_arrangeCages <- function(design_cage,
                                distribute_cagerack_vars = "Treatment",
                                rack_size_x = 4,
                                rack_size_y = 4,
                                n_shuffle = c(rep(5, 100), rep(3, 400), rep(2, 500), rep(1, 4000)),
                                quiet_process = FALSE, quiet_optimize = TRUE) {
  (...)
}
```

This wrapper takes the output of the previous step ('design_cage') as input. 

**distribute_cagerack_vars** is a list of variables that should be evenly spaced
out across the rows and columns of a rack (or several racks, if needed). Typical
cases may include treatment, strain and sex of the animals.

**rack_size_x** and **rack_size_y** specify the number of cages that fit into
the rows and columns of a grid like rack, respectively. Depending on the actual
number of cages, one or more racks are automatically assigned. Only rectangular
sub-grids may be used of any rack to accommodate the cages.

```{r include=FALSE}
# stop if called from precompiling the child
if (exists(".precompile")) knitr::knit_exit()
```

```{r, child="cached/_invivo_computed.html"}
```
