## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----echo=FALSE, results='asis'-----------------------------------------------
cat('
<style>
.figure .caption {
  text-align: left !important;
}
</style>
')

## ----fig.cap="This figure shows the mapping from a parameter to a collection of repro samples via the generating function. In this case, the blue and red parameters generate repro samples that are 'close' (lies in the prediction set $B_\\alpha$) to the observed statistic, so they are included in the confidence set. On the other hand, the green parameter generates repro samples that are not 'close enough' to the observed statistic, hence it's not included in the confidence set $\\Gamma_\\alpha$. For additional details on the prediction set $B_\\alpha$ and confidence set $\\Gamma_\\alpha$, please refer to the paper. (Taken from Figure 1 in @Awan25.)", out.width="100%", echo=FALSE----

knitr::include_graphics("figs/figure1.png")

## ----setup--------------------------------------------------------------------
library(SimBaRepro)

## -----------------------------------------------------------------------------
n <- 50 # sample size
repro_size <- 200 # number of repro samples

## -----------------------------------------------------------------------------
seeds <- matrix(rnorm(repro_size * n), nrow = repro_size, ncol = n)

## -----------------------------------------------------------------------------
s_sample <- function(seeds, theta) {
  # generate the raw data points
  data <- theta[1] + sqrt(theta[2]) * seeds[, 1:n]
  
  # compute the regular statistics
  s_mean <- apply(data, 1, mean)
  s_var <- apply(data, 1, var)
  
  return(cbind(s_mean, s_var))
}

## -----------------------------------------------------------------------------
s_obs <- c(1.12, 0.67)

## -----------------------------------------------------------------------------
lower_bds = c(0, 1e-6)
upper_bds = c(0, 10)
result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_obs)
print(result$p_val) # p_value

## -----------------------------------------------------------------------------
lower_bds = c(-1, 1) # lower bounds for mean and variance
upper_bds = c(2, 5) # upper bounds for mean and variance
result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_obs)
print(result$p_val) # the largest p_value found
print(result$theta_hat) # the parameter corresponding to the largest p value

## -----------------------------------------------------------------------------
lower_bds <- c(-5, 1e-6)
upper_bds <- c(5, 10)
parameter_index <- 1 # an integer specifying the parameter of interest
alpha <- .05 # significance level
tol <- 1e-3 # tolerance
# choosing j=1 indicates that we're obtaining a confidence interval for the first argument in s_obs. In this case, it's the mean.
mean_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_obs, tol)
print(mean_CI)

## -----------------------------------------------------------------------------
parameter_index <- 2
var_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_obs, tol)
print(var_CI)

## -----------------------------------------------------------------------------
lower_bds <- c(0.5, 1e-6)
upper_bds <- c(1.5, 1.5)
resolution <- 13
result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_obs, tol, resolution)
indicator_array <- result$ind_array
print(result)

## -----------------------------------------------------------------------------
print(c(result$search_lower_bds[1], result$search_upper_bds[1]))
print(c(result$search_lower_bds[2], result$search_upper_bds[2]))

## -----------------------------------------------------------------------------
parameter_names <- c("mean", "variance")
plot_grid(indicator_array, lower_bds, upper_bds, parameter_names)

## -----------------------------------------------------------------------------
n <- 100 # sample size
repro_size <- 200 # number of repro samples
seeds <- matrix(rnorm(repro_size * (2 * n)), nrow = repro_size, ncol = 2 * n)

s_sample <- function(seeds, theta) {
  n_obs <- ncol(seeds) / 2   
  mu1 <- theta[1]           
  mu2 <- theta[2]          
  sigma <- sqrt(theta[3])
  
  seeds1 <- seeds[, 1:n_obs, drop = FALSE] # first n columns for X
  seeds2 <- seeds[, (n_obs + 1):(2 * n_obs), drop = FALSE] # next n columns for Y
  
  # generate bivariate data
  data1 <- mu1 + sigma * seeds1 # X ~ N(mu_1, sigma^2)
  data2 <- mu2 + sigma * seeds2 # Y ~ N(mu_2, sigma^2)
  
  # compute statistics
  s_mean1 <- rowMeans(data1) # sample mean of X
  s_mean2 <- rowMeans(data2) # sample mean of Y
  s_var1 <- apply(data1, 1, var) # sample variance of X
  s_var2 <- apply(data2, 1, var) # sample variance of Y
  
  return(cbind(s_mean1, s_mean2, s_var1, s_var2))
}

s_obs <- c(-0.1, 0.1, 1.05, 0.9)
alpha <- 0.05
tol <- 1e-3

lower_bds <- c(-0.5, -0.5, 1e-6)
upper_bds <- c(0.5, 0.5, 2)
resolution <- 10
result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_obs, tol, resolution)
indicator_array <- result$ind_array

## -----------------------------------------------------------------------------
index_set <- c(1,2)
means_array <- grid_projection(indicator_array, index_set)

## -----------------------------------------------------------------------------
means_lower_bds <- lower_bds[index_set]
means_upper_bds <- upper_bds[index_set]
parameter_names <- c("mu1", "mu2")
plot_grid(means_array, means_lower_bds, means_upper_bds, parameter_names)

## -----------------------------------------------------------------------------
index_set2 <- c(1,3)
mu1_var_array <- grid_projection(indicator_array, index_set2)

index_set3 <- c(2,3)
mu2_var_array <- grid_projection(indicator_array, index_set3)

## ----fig.show='hold'----------------------------------------------------------
mu1_var_lower_bds <- lower_bds[index_set2]
mu1_var_upper_bds <- upper_bds[index_set2]
parameter_names <- c("mu1", "variance")
plot_grid(mu1_var_array, mu1_var_lower_bds, mu1_var_upper_bds, parameter_names)

mu2_var_lower_bds <- lower_bds[index_set3]
mu2_var_upper_bds <- upper_bds[index_set3]
parameter_names <- c("mu2", "variance")
plot_grid(mu2_var_array, mu2_var_lower_bds, mu2_var_upper_bds, parameter_names)

## -----------------------------------------------------------------------------
n <- 50 # sample size
repro_size <- 200 # number of repro samples

## -----------------------------------------------------------------------------
upper_clamp <- 3
lower_clamp <- -3
eps <- 4 # privacy guarantee parameter

## -----------------------------------------------------------------------------
regular_seeds <- matrix(rnorm(repro_size * n), nrow = repro_size, ncol = n)
dp_seeds <- matrix((2 * rbinom(n = repro_size * 2, size = 1, prob = 1/2) - 1) * rexp(repro_size * 2), nrow = repro_size, ncol = 2)
seeds <- cbind(regular_seeds, dp_seeds)

## -----------------------------------------------------------------------------
s_sample <- function(seeds, theta) {
  # generate the i.i.d. normal r.v.s using the first n columns of the seeds
  raw_data <- theta[1] + sqrt(theta[2]) * seeds[, 1:n]

  # clamp the raw data using the clamps defined above
  clamped <- pmin(pmax(raw_data, lower_clamp), upper_clamp)

  # compute the private statistics (adding a properly scaled Laplace noise to the sample mean and sample variance)
  s_mean <- apply(clamped, 1, mean) + (upper_clamp - lower_clamp) / (n * eps / 2) * seeds[, n+1]
  s_var <- apply(clamped, 1, var) + (upper_clamp - lower_clamp)^2 / (n * eps / 2) * seeds[, n+2]

  return(cbind(s_mean, s_var))
}

## -----------------------------------------------------------------------------
s_DP <- c(1.12, 1.07) # define the observed statistic as a vector

## -----------------------------------------------------------------------------
lower_bds <- c(0, 1e-6)
upper_bds <- c(1, 10)

## -----------------------------------------------------------------------------
result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_DP)
print(result$p_val) # the largest p_value found
print(result$theta_hat) # the parameter corresponding to the largest p value

## -----------------------------------------------------------------------------
# search lower bounds and upper bounds for mean and variance
alpha <- .05 # significance level
tol <- 1e-3 # tolerance
lower_bds <- c(-5, 1e-6)
upper_bds <- c(5, 10)

## -----------------------------------------------------------------------------
# choose parameter_index=1 indicates that we're obtaining a confidence interval for the first argument in s_DP. In this case, it's the mean.
parameter_index <- 1
mean_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_DP, tol)
print(mean_CI)

## -----------------------------------------------------------------------------
parameter_index <- 2
var_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_DP, tol)
print(var_CI)

## -----------------------------------------------------------------------------
lower_bds <- c(0, 1e-6)
upper_bds <- c(2, 5)

resolution <- 20
result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_DP, tol, resolution)
indicator_array <- result$ind_array
print(result)

## -----------------------------------------------------------------------------
parameter_names <- c("mean", "variance") # optional input
plot_grid(indicator_array, lower_bds, upper_bds, parameter_names)

