## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  dev = "svglite",
  fig.ext = "svg",
  cache = TRUE  # Cache expensive computations to speed up builds
)
library(corrselect)

## -----------------------------------------------------------------------------
data(bioclim_example)

# Data structure
dim(bioclim_example)
head(names(bioclim_example))

# Response variable
summary(bioclim_example$species_richness)

## -----------------------------------------------------------------------------
# Remove highly correlated predictors
bio_clean <- corrPrune(
  data = bioclim_example[, -1],  # Exclude response
  threshold = 0.7,
  mode = "auto"
)

# How much did we reduce?
cat(sprintf("Reduced from %d → %d variables\n",
            ncol(bioclim_example) - 1,
            ncol(bio_clean)))

# Which variables were kept?
head(attr(bio_clean, "selected_vars"), 10)

## ----fig.width=7, fig.height=5, fig.alt="Histogram showing distribution of absolute correlations before and after pruning. Red bars show many correlations exceed 0.7 threshold before pruning. Blue bars show all correlations below 0.7 after pruning."----
cor_before <- cor(bioclim_example[, -1])
cor_after  <- cor(bio_clean)

vals_before <- abs(cor_before[upper.tri(cor_before)])
vals_after  <- abs(cor_after[upper.tri(cor_after)])

# Common x limits
xlim <- c(0, 1)

# Shared breaks for fair comparison
breaks <- seq(0, 1, length.out = 30)

# Before histogram
hist(vals_before,
     breaks = breaks,
     freq = FALSE,
     main = "Distribution of Absolute Correlations",
     xlab = "Absolute Correlation",
     col = rgb(0.8, 0.2, 0.2, 0.4),
     border = "white",
     xlim = xlim)

# After histogram
hist(vals_after,
     breaks = breaks,
     freq = FALSE,
     col = rgb(0.2, 0.5, 0.8, 0.4),
     border = "white",
     add = TRUE)

# Threshold line
abline(v = 0.7, col = "black", lty = 2, lwd = 2)

legend("topright",
       legend = c("Before", "After", "Threshold"),
       fill   = c(rgb(0.8, 0.2, 0.2, 0.4),
                  rgb(0.2, 0.5, 0.8, 0.4),
                  NA),
       border = c("white", "white", NA),
       lty    = c(NA, NA, 2),
       lwd    = c(NA, NA, 2),
       col    = c(NA, NA, "black"),
       bty    = "o",
       bg = "white")

## -----------------------------------------------------------------------------
# Model 1: Full model (19 variables)
model_full <- lm(species_richness ~ ., data = bioclim_example)

# Model 2: After corrPrune (correlation-based pruning)
bio_clean_full <- data.frame(
  species_richness = bioclim_example$species_richness,
  bio_clean
)
model_corrprune <- lm(species_richness ~ ., data = bio_clean_full)

# Model 3: Sequential VIF-based refinement
bio_final <- modelPrune(
  formula = species_richness ~ .,
  data = bio_clean_full,
  limit = 5
)
model_final <- attr(bio_final, "final_model")

## -----------------------------------------------------------------------------
# Variable counts
n_full      <- 19
n_corrprune <- length(attr(bio_clean, "selected_vars"))
n_final     <- length(attr(bio_final, "selected_vars"))

# Compute condition numbers (measure of collinearity)
X_full      <- model.matrix(model_full)[, -1]
X_corrprune <- model.matrix(model_corrprune)[, -1]
X_final     <- model.matrix(model_final)[, -1]

kappa_full      <- kappa(X_full, exact = TRUE)
kappa_corrprune <- kappa(X_corrprune, exact = TRUE)
kappa_final     <- kappa(X_final, exact = TRUE)

# Summary table
comparison <- data.frame(
  Step = c("Full", "corrPrune", "+ modelPrune"),
  Predictors = c(n_full, n_corrprune, n_final),
  Adj_R2 = c(
    summary(model_full)$adj.r.squared,
    summary(model_corrprune)$adj.r.squared,
    summary(model_final)$adj.r.squared
  ),
  Kappa = c(kappa_full, kappa_corrprune, kappa_final)
)

print(comparison)

## ----fig.width=8, fig.height=5, fig.alt="Dual-axis line plot showing number of predictors (x-axis) versus adjusted R² (blue, left y-axis) and condition number κ (red, right y-axis, log scale). As predictors decrease from 19 to 8, adjusted R² remains high (above 0.7) while κ drops from over 10000 to below 100, demonstrating that pruning dramatically improves numerical stability with minimal loss of explanatory power."----
# Extract data
n_vars <- comparison$Predictors
adj_r2 <- comparison$Adj_R2
kappa  <- comparison$Kappa

# Left y-axis: Adjusted R²
par(mar = c(5, 4, 4, 4))  # extra space on the right for second axis

plot(
  n_vars, adj_r2,
  type = "b",
  pch  = 19,
  cex  = 1.5,
  col  = rgb(0.2, 0.5, 0.8, 1),
  lwd  = 2,
  xlab = "Number of Predictors",
  ylab = "Adjusted R²",
  ylim = c(0, 1),
  main = "Pruning Reduces Collinearity While Preserving Fit"
)

# Right y-axis: log10(κ)
log_kappa <- log10(kappa)

# Nice ylim for log10(κ)
ylim_right <- range(log_kappa, finite = TRUE)
ylim_right <- ylim_right * c(0.9, 1.1)

par(new = TRUE)
plot(
  n_vars, log_kappa,
  type = "b",
  pch  = 17,
  cex  = 1.5,
  col  = rgb(0.8, 0.2, 0.2, 1),
  lwd  = 2,
  xaxt = "n",
  yaxt = "n",
  xlab = "",
  ylab = "",
  ylim = ylim_right
)

# Right-hand axis ticks using pretty() on log scale
log_ticks <- pretty(log_kappa)
kappa_labels <- round(10^log_ticks)
axis(4, at = log_ticks, labels = kappa_labels)
mtext("Condition Number (κ)", side = 4, line = 3)

# Legend centered at top
legend(
  "top",
  inset = 0.02,
  legend = c("Adjusted R² (higher better)", "κ (lower better)"),
  col    = c(
    rgb(0.2, 0.5, 0.8, 1),
    rgb(0.8, 0.2, 0.2, 1)
  ),
  pch    = c(19, 17),
  lwd    = 2,
  horiz  = TRUE,
  bty    = "o",
  bg = "white",
  x.intersp = 0.8
)


## ----fig.width=7, fig.height=5, fig.alt="Bar chart comparing regression coefficients between full model (19 variables, red bars) and pruned model (8 variables, blue bars). Variables dropped in pruning show only red bars with large magnitudes. Variables retained in both models show overlapping red-blue bars with more consistent effect sizes, demonstrating improved coefficient stability after pruning."----
# Extract coefficients (excluding intercept)
coef_full  <- coef(model_full)[-1]
coef_final <- coef(model_final)[-1]

# Use *all* variables (not just common ones)
all_vars <- union(names(coef_full), names(coef_final))

# Align coefficients to the same full variable list
vals_full   <- coef_full[all_vars]
vals_pruned <- coef_final[all_vars]

# Replace missing values (variables dropped in a model) with 0
vals_full[is.na(vals_full)]     <- 0
vals_pruned[is.na(vals_pruned)] <- 0

# Colours
col_full   <- rgb(0.8, 0.2, 0.2, 0.5)
col_pruned <- rgb(0.2, 0.5, 0.8, 0.5)

x <- seq_along(all_vars)

# Symmetric Y range
ylim <- range(c(vals_full, vals_pruned)) * 1.15

# Empty plot first
plot(
  x, vals_full,
  type = "n",
  xaxt = "n",
  xlab = "",
  ylab = "Coefficient",
  main = "Coefficient Comparison (Full vs modelPrune)",
  ylim = ylim
)

axis(1, at = x, labels = all_vars, las = 2, cex.axis = 0.7)

# Full model bars
rect(
  x - 0.4, 0,
  x + 0.4, vals_full,
  col = col_full, border = NA
)

# Pruned model bars
rect(
  x - 0.4, 0,
  x + 0.4, vals_pruned,
  col = col_pruned, border = NA
)

legend(
  "topright",
  legend = c(
    sprintf("Full (%d vars)", n_full),
    sprintf("Final (%d vars)", n_final)
  ),
  fill = c(col_full, col_pruned),
  border = "white",
  bty = "o",
  bg = "white"
)


## -----------------------------------------------------------------------------
data(survey_example)

# Data structure
dim(survey_example)
str(survey_example[, 1:10])  # First 10 columns

## -----------------------------------------------------------------------------
# Exclude respondent_id, overall_satisfaction, and factor variables
survey_predictors <- survey_example[, !(names(survey_example) %in%
                                         c("respondent_id", "overall_satisfaction",
                                           "gender", "education"))]

# Convert ordered factors (Likert items 1-7) to numeric for correlation analysis
survey_numeric <- as.data.frame(lapply(survey_predictors, function(x) {
  if (is.ordered(x)) as.numeric(as.character(x)) else as.numeric(x)
}))

# Prune with protected variables
survey_clean <- corrPrune(
  data = survey_numeric,
  threshold = 0.6,
  force_in = "age"
)

# How many items remain?
cat(sprintf("Reduced from %d → %d variables\n",
            ncol(survey_numeric),
            ncol(survey_clean)))

# Which items were kept?
selected <- attr(survey_clean, "selected_vars")
print(selected)

## -----------------------------------------------------------------------------
# Count items per construct
satisfaction_kept <- sum(grepl("satisfaction_", selected))
engagement_kept <- sum(grepl("engagement_", selected))
loyalty_kept <- sum(grepl("loyalty_", selected))

cat(sprintf("Satisfaction: %d/10 items kept\n", satisfaction_kept))
cat(sprintf("Engagement: %d/10 items kept\n", engagement_kept))
cat(sprintf("Loyalty: %d/10 items kept\n", loyalty_kept))

## ----fig.width=8, fig.height=5, fig.alt="Two side-by-side barplots. Left panel shows items kept per construct (Satisfaction, Engagement, Loyalty) with gray bars for original 10 items and blue bars showing reduced counts after pruning, demonstrating balanced retention across all three constructs. Right panel shows total variable reduction from 31 to approximately 10 variables using salmon and light green bars."----
par(mfrow = c(1, 2))

# Items kept per construct
construct_data <- rbind(
  c(10, 10, 10),
  c(satisfaction_kept, engagement_kept, loyalty_kept)
)

barplot(construct_data,
        beside = TRUE,
        names.arg = c("Satisfaction", "Engagement", "Loyalty"),
        col = c("lightgray", "lightblue"),
        legend.text = c("Original (10)", "After pruning"),
        args.legend = list(x = "topright", bty = "n"),
        main = "Items per Construct",
        ylab = "Number of Items",
        ylim = c(0, 12))

# Percentage reduction
barplot(c(ncol(survey_numeric), ncol(survey_clean)),
        names.arg = c("Before", "After"),
        col = c("salmon", "lightgreen"),
        main = "Total Variables",
        ylab = "Count",
        ylim = c(0, max(ncol(survey_numeric)) * 1.2))
text(0.7, ncol(survey_numeric) + 1, ncol(survey_numeric), pos = 3)
text(1.9, ncol(survey_clean) + 1, ncol(survey_clean), pos = 3)

## -----------------------------------------------------------------------------
# Add response back
survey_model_data <- data.frame(
  overall_satisfaction = survey_example$overall_satisfaction,
  survey_clean
)

# Fit regression model
model_survey <- lm(overall_satisfaction ~ ., data = survey_model_data)

# Summary
summary(model_survey)

## -----------------------------------------------------------------------------
# Full model (all 30 items + demographics)
full_survey_data <- data.frame(
  overall_satisfaction = survey_example$overall_satisfaction,
  survey_predictors
)

model_full_survey <- lm(overall_satisfaction ~ ., data = full_survey_data)

# Compare
data.frame(
  Model = c("Full (33 vars)", "Pruned (10 vars)"),
  R2 = c(summary(model_full_survey)$r.squared,
         summary(model_survey)$r.squared),
  Adj_R2 = c(summary(model_full_survey)$adj.r.squared,
             summary(model_survey)$adj.r.squared),
  Num_Predictors = c(33, 10)
)

## -----------------------------------------------------------------------------
data(genes_example)

# Data structure
dim(genes_example)

# Disease prevalence
table(genes_example$disease_status)

## -----------------------------------------------------------------------------
library(microbenchmark)

# Extract gene expression data (exclude ID and outcome)
gene_expr <- genes_example[, -(1:2)]

# Greedy pruning with timing
greedy_timing <- microbenchmark(
  genes_pruned <- corrPrune(
    data = gene_expr,
    threshold = 0.8,
    mode = "greedy"  # Fast for large p
  ),
  times = 3
)
greedy_ms <- median(greedy_timing$time) / 1e6

# Reduction
cat(sprintf("Reduced from %d → %d genes (%.1f ms)\n",
            ncol(gene_expr),
            ncol(genes_pruned),
            greedy_ms))

## ----fig.width=8, fig.height=5, fig.alt="Barplot comparing gene counts before and after pruning. Salmon bar shows 200 original genes, light blue bar shows approximately 100 genes after pruning (50% retained). Text labels display exact counts and retention percentage, demonstrating substantial dimensionality reduction while maintaining low pairwise correlations."----
# Barplot showing reduction
reduction_data <- c(ncol(gene_expr), ncol(genes_pruned))
barplot(reduction_data,
        names.arg = c("Original", "After Pruning"),
        main = "Gene Dimensionality Reduction",
        ylab = "Number of Genes",
        col = c("salmon", "lightblue"),
        ylim = c(0, max(reduction_data) * 1.2))
text(0.7, reduction_data[1] + 10, paste(reduction_data[1], "genes"), pos = 3)
text(1.9, reduction_data[2] + 10, paste(reduction_data[2], "genes\n(",
     round(100 * reduction_data[2] / reduction_data[1], 1), "% retained)"), pos = 3)

## -----------------------------------------------------------------------------
# Subset for comparison (use smaller subset for vignette build speed)
gene_subset <- gene_expr[, 1:20]  # Reduced from 50 to 20 for faster builds

# Benchmark exact mode
exact_time <- median(microbenchmark(
  exact_result <- corrPrune(gene_subset, threshold = 0.8, mode = "exact"),
  times = 3,
  unit = "ms"
)$time) / 1e6  # Convert nanoseconds to milliseconds

# Benchmark greedy mode
greedy_time <- median(microbenchmark(
  greedy_result <- corrPrune(gene_subset, threshold = 0.8, mode = "greedy"),
  times = 3,
  unit = "ms"
)$time) / 1e6  # Convert nanoseconds to milliseconds

# Run once more to get actual results for comparison
exact_result <- corrPrune(gene_subset, threshold = 0.8, mode = "exact")
greedy_result <- corrPrune(gene_subset, threshold = 0.8, mode = "greedy")

# Compare
cat(sprintf("Exact mode: %d genes kept (%.1f ms)\n", ncol(exact_result), exact_time))
cat(sprintf("Greedy mode: %d genes kept (%.1f ms)\n", ncol(greedy_result), greedy_time))
cat(sprintf("Speedup: %.1fx faster\n", exact_time / greedy_time))

## -----------------------------------------------------------------------------
# Prepare classification data
classification_data <- data.frame(
  disease_status = genes_example$disease_status,
  genes_pruned
)

# Logistic regression
model_genes <- glm(disease_status ~ .,
                   data = classification_data,
                   family = binomial())

# Prediction accuracy
predictions <- ifelse(predict(model_genes, type = "response") > 0.5,
                     "Disease", "Healthy")
accuracy <- mean(predictions == genes_example$disease_status)

cat(sprintf("Classification accuracy: %.1f%%\n", accuracy * 100))

## -----------------------------------------------------------------------------
data(longitudinal_example)

# Data structure
dim(longitudinal_example)
head(longitudinal_example)

# Study design
cat(sprintf("Subjects: %d\n", length(unique(longitudinal_example$subject))))
cat(sprintf("Sites: %d\n", length(unique(longitudinal_example$site))))
cat(sprintf("Observations per subject: %d\n",
            nrow(longitudinal_example) / length(unique(longitudinal_example$subject))))

## ----eval=FALSE---------------------------------------------------------------
# # Note: This example requires lme4 package
# library(lme4)
# 
# # Define formula with random effects
# # Note: Only fixed effects (x1-x5) will be pruned
# #       Random effects (1|subject), (1|site) are preserved
# 
# pruned_mixed <- modelPrune(
#   formula = outcome ~ x1 + x2 + x3 + x4 + x5 + (1|subject) + (1|site),
#   data = longitudinal_example,
#   engine = "lme4",
#   limit = 5
# )
# 
# # Which fixed effects were kept?
# selected_fixed <- attr(pruned_mixed, "selected_vars")
# cat("Fixed effects kept:\n")
# print(selected_fixed)
# 
# # Which were removed?
# removed_fixed <- attr(pruned_mixed, "removed_vars")
# cat("\nFixed effects removed:\n")
# print(removed_fixed)

## ----eval=FALSE---------------------------------------------------------------
# final_mixed <- attr(pruned_mixed, "final_model")
# summary(final_mixed)

## ----eval=FALSE---------------------------------------------------------------
# # Note: This example requires lme4 package
# library(lme4)
# 
# # Fit full model
# full_formula <- as.formula(paste("outcome ~",
#                                  paste(paste0("x", 1:5), collapse = " + "),
#                                  "+ (1|subject) + (1|site)"))
# 
# model_full_mixed <- lmer(full_formula, data = longitudinal_example)
# 
# # Extract fixed effects design matrices
# X_full <- getME(model_full_mixed, "X")
# X_pruned <- getME(final_mixed, "X")
# 
# # Compute VIF
# compute_vif <- function(X) {
#   X_scaled <- scale(X[, -1])  # Remove intercept
#   sapply(seq_len(ncol(X_scaled)), function(i) {
#     r2 <- summary(lm(X_scaled[, i] ~ X_scaled[, -i]))$r.squared
#     1 / (1 - r2)
#   })
# }
# 
# vif_full <- compute_vif(X_full)
# vif_pruned <- compute_vif(X_pruned)
# 
# # Compare
# comparison_vif <- data.frame(
#   Predictor = colnames(X_pruned)[-1],
#   VIF_Before = vif_full,
#   VIF_After = vif_pruned
# )
# print(comparison_vif)

## ----eval=FALSE, fig.width=8, fig.height=5, fig.alt="Side-by-side barplot showing VIF values for fixed effects before (red bars) and after (blue bars) pruning in a mixed-effects model. Red dashed horizontal line marks VIF threshold of 5. Before pruning, several predictors exceed the threshold with VIF values above 5. After pruning, all retained predictors have VIF below 5, demonstrating successful multicollinearity reduction while preserving random effects structure."----
# # Extract VIF values
# predictors <- comparison_vif$Predictor
# vif_before <- comparison_vif$VIF_Before
# vif_after <- comparison_vif$VIF_After
# 
# # Set up bar positions
# x <- seq_along(predictors)
# width <- 0.35
# 
# # Create plot
# par(mar = c(5, 4, 4, 2))
# plot(
#   x, vif_before,
#   type = "n",
#   xaxt = "n",
#   xlab = "Fixed Effects",
#   ylab = "VIF",
#   main = "VIF Reduction After Pruning (Mixed Model)",
#   ylim = c(0, max(vif_before) * 1.15)
# )
# 
# # Add horizontal line at VIF = 5 threshold
# abline(h = 5, col = "red", lty = 2, lwd = 2)
# 
# # Before bars (darker)
# rect(
#   x - width, 0,
#   x, vif_before,
#   col = rgb(0.8, 0.2, 0.2, 0.6),
#   border = "white"
# )
# 
# # After bars (lighter)
# rect(
#   x, 0,
#   x + width, vif_after,
#   col = rgb(0.2, 0.5, 0.8, 0.6),
#   border = "white"
# )
# 
# # Add x-axis labels
# axis(1, at = x, labels = predictors, las = 2)
# 
# # Add legend
# legend(
#   "topright",
#   legend = c("Before Pruning", "After Pruning", "VIF = 5 Threshold"),
#   fill = c(rgb(0.8, 0.2, 0.2, 0.6), rgb(0.2, 0.5, 0.8, 0.6), NA),
#   border = c("white", "white", NA),
#   lty = c(NA, NA, 2),
#   lwd = c(NA, NA, 2),
#   col = c(NA, NA, "red"),
#   bty = "o",
#   bg = "white"
# )

## -----------------------------------------------------------------------------
sessionInfo()

