## ----setup, include = FALSE---------------------------------------------------
library(eVCGsampler)
library(knitr)



## ----echo=TRUE, warning=FALSE-------------------------------------------------

# Start the shiny app for VCG generation for data that can be uploaded in csv or xlsx format.
# eVCGsampler::launchApp()



## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

# Generate data POOL (n=100) to sample from
# Generate baseline data (n=30), for three treatment groups n=10 (all treated groups taken together)
set.seed(3453)
dat <- data.frame(
  treat  = rep(0:1, c(100, 30)),
  weight = c(rlnorm(100, 2, 1), rnorm(30, 8.5, 1.7))
)

# Check whether the POOL has a different covariate distribution than the TG
energy_test(treat ~ weight, data=dat)
  

# Sample a smaller subset that is balanced to TG (it balance group=0 to group=1)
out  <- VCG_sampler(treat ~ weight, data=dat, n=10)
dat2 <- out[[1]] #  data frame with balance information, with new variable VCG, if 1, means the observation was selected to be in VCG.
out[[2]] # Balance target plot (optional)

# Permutation ellipses are generated by randomly permuting the pool and treat groups to estimate usual (random) variability.
# Only the X and Y axes are computed directly; the ellipse is interpolated between the axes.
# This method is intended as a visual approximation rather than a precise statistical test.

vcg <- dat2[which(dat2$VCG==1), ] # select the VCG that was sampled by VCG_sampler

# Test if the balancing was successful (variable "treat_balanced", was created by the VCG_sampler function)
energy_test(treat_balanced ~ weight, data=dat2)


# Check covariate balance visual for the 'weight' covariate.
plot_var(dat2, what='weight')




## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

set.seed(2244)
# Generate baseline data for treatment groups (all treated groups taken together)
d_tg <- data.frame(
  age   = rnorm(20, 10, 1),
  weight= rnorm(20, 7,  1),
  class = rbinom(20, 1, 0.3)
  )

# Generate data lake (pool) data (to sample from)
d_pool <- data.frame(
  age   = rnorm(50, 12, 2),
  weight= rnorm(50, 6,  2),
  class = rbinom(50, 1, 0.6)
  )

# Combine the data to balance the covariates between the two data sets (creates the variable treat POOL vs TG)
dat <- combine_data(d_pool, d_tg, indicator_name='treat')

# Calculate current energy distance (optional)
energy_distance(treat ~ age + weight + class, data=dat)

# Check whether the POOL has a different covariate distribution than the TG
energy_test(treat ~ age + weight + class, data=dat)
  
# If you are unsure about the size of VCG, try the BestVCGsize function to determine the optimal size.
# This is a purely exploratory approach!
BestVCGsize(treat ~ age + weight + class, data=dat)
# It is only intended for exploratory purposes, as the VCG size is normally given.


# Sample a smaller subset that is balanced to TG (it balance treat=0 to treat=1)
out <- VCG_sampler(treat ~ age + weight + class, data=dat, n=10)
dat2 <- out[[1]] 
out[[2]] # Balance target plot (optional)

# Test if the balancing was successful
energy_test(treat_balanced ~ age + weight + class, data=dat2)
  
# Check covariate balance one by one
plot_var(dat2, what='age')
plot_var(dat2, what='weight')
plot_var(dat2, what='class')



## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

# Generate data POOL (n=100) to sample from
# Generate baseline data (n=30), for all treatment groups together.
# And a stratum factor variable sex
set.seed(14495)
dat <- data.frame(
  treat = rep(0:1, c(100, 30)),
  cov1   = c(rnorm(100, 11, 2),  rnorm(30, 10, 1)),
  cov2   = c(rnorm(100, 11, 2),  rnorm(30, 10, 1)), 
  cov3   = c(rnorm(100, 9, 2),   rnorm(30, 10, 1)),
  sex    = c(rbinom(100, 1, 0.5),  rbinom(30, 1, 0.5)))
dat$sex <- factor(dat$sex, labels=c('M', 'F'))


# Sample a VCG of size 7 for male and 5 for female animals. 
out <- VCG_sampler(treat ~ cov1 + cov2 + cov3 | sex, data=dat, n=c(7, 5), plot=TRUE)
dat2 <- out[[1]] 
out[[2]] 

# Test if the balancing was successful
energy_test(treat_balanced ~ cov1 + cov2 + cov3 | sex, data=dat2, plot=F)
  
# plot cov1
plot_var(dat2, what='cov1')


## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

# Create some bimodal sample data for the TG, the POOL data is normal distributed
set.seed(5435)
dat <- data.frame(
treat = rep(0:1, c(100, 50)),
cov_1 = c(rnorm(100, 7, 2),  c(rnorm(25, 5, 1), rnorm(25, 9, 1))))

# Balance cov_1 covariate
dat2 <- VCG_sampler(treat ~ cov_1, data=dat, n=25, plot=F)


# Create histogram plot to show the results
data <- data.frame(
  value = c(dat$cov_1[which(dat$treat==0)],
            dat$cov_1[which(dat$treat==1)],
            dat2$cov_1[which(dat2$VCG==1)]),
  group = factor(rep(c("POOL", "TG", "VCG"), c(100, 50, 25)))
)
custom_colors <- c("POOL" = "#00A091", "TG" = "#7A00E6",  "VCG" = "#FE7500")  
sample_sizes  <- data.frame(label=c('n=100', 'n=50', 'n=25'),group=c("POOL", "TG", "VCG"))
ggplot(data, aes(x = value, fill = group)) +
  geom_density(alpha = 0.6, color = NA) +
  scale_fill_manual(values = custom_colors) +
  facet_wrap(~ group, ncol = 1) +
  labs(title = "", x = "", y = "Density") +
  geom_text(data = sample_sizes, aes(x = Inf, y = Inf, label = label),
            hjust = 1.1, vjust = 1.5, inherit.aes = F) +
  theme_minimal() + theme(legend.position = "none",strip.text = element_text(size = 14, face = "bold"),
      plot.title = element_text(size = 16, face = "bold" ) )



## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

   # Create data
   dat <- data.frame(
   treat = rep(0:1, c(50, 30)),
   cov_1 = c(rnorm(50, 5, 2),   rnorm(30, 5, 1)),
   cov_2 = c(rnorm(50, 11, 2),  rnorm(30, 10, 1))
   )

   # Generate 10 VCGs of size 5
   out <- multiSampler(treat ~ cov_1 + cov_2, data = dat, n = 5, Nsamples = 10)
   out[[2]]

   str(out[[1]])


## ----echo=TRUE, warning=FALSE, fig.dim = c(8, 6)------------------------------

# Example: 10 studies where Study-8, Study-9, and Study-10 are outliers
set.seed(1234)
dat <- data.frame(
  study = factor(rep(paste0("Study-", 1:10), each = 20)),
  var1 = c(rnorm(20, 10, 1), rnorm(20, 10, 1), rnorm(20, 10, 1), rnorm(20, 10, 1),
           rnorm(20, 10, 1), rnorm(20, 10, 1), rnorm(20, 10, 1), rnorm(20, 15, 1),  
           rnorm(20, 10, 1), rnorm(20, 16, 1)),                                      
  var2 = c(rnorm(20, 5, 1), rnorm(20, 5, 1), rnorm(20, 5, 1), rnorm(20, 5, 1),
           rnorm(20, 5, 1), rnorm(20, 5, 1), rnorm(20, 5, 1), rnorm(20, 5, 1),
           rnorm(20, 10, 1), rnorm(20, 5, 1))                                         
)

# Find outlier studies based on energy distance
out <- find_outliers(study ~ var1 + var2, data = dat, cutoff = 0.99, R = 200)

# View summary - Study-8, Study-9, Study-10 should be flagged as outliers
out$summary

# Heatmap of pairwise energy distances between studies
out$heatmap

# Bar plot showing median distance to other studies
# Dashed line indicates the 95th percentile cutoff
out$barplot


## ----echo=TRUE, warning=FALSE, fig.dim = c(6, 6)------------------------------

# Create sample data
set.seed(5678)
dat <- data.frame(
  age    = rnorm(100, 50, 10),
  weight = rnorm(100, 70, 15),
  score  = rnorm(100, 100, 20)
)

# Create combined score with custom weights
# age is 3x more important, weight is 2x, score is 1x
dat$combined <- combine_variables(1 ~ age + weight + score, data = dat,
                                   weights = c(3, 2, 1))

# Check correlations - they should be proportional to the weights
# age should have highest correlation, then weight, then score
round(cor(dat, method = 'spearman'), 2)


