## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(eval = FALSE, echo = TRUE)

## ----load-packages------------------------------------------------------------
# # Load SurveyStat package
# library(SurveyStat)
# 
# # Load required packages
# library(dplyr)
# library(ggplot2)
# library(knitr)
# 
# # Set seed for reproducibility
# set.seed(12345)

## ----load-data, eval = FALSE--------------------------------------------------
# # Create example survey dataset (embedded for vignette building)
# survey_data <- data.frame(
#   Age = c(25, 34, 45, 28, 52, 31, 38, 29, 47, 33, 41, 26, 36, 43, 30),
#   Gender = c("Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male"),
#   Education = c("High School", "Bachelor", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Graduate", "Bachelor", "High School", "Bachelor", "Graduate"),
#   Income = c(35000, 52000, 68000, 31000, 85000, 48000, 42000, 62000, 71000, 38000, 78000, 45000, 55000, 65000, 72000),
#   Weight = c(0.85, 1.12, 0.95, 1.08, 0.92, 1.05, 0.98, 1.15, 0.88, 1.02, 0.91, 1.08, 0.96, 1.03, 0.89)
# )
# 
# # Display basic information
# cat("Dataset dimensions:", nrow(survey_data), "rows,", ncol(survey_data), "columns\n\n")
# cat("Variable names:", paste(names(survey_data), collapse = ", "), "\n\n")
# 
# # Display first few observations
# # Note: Using knitr::kable for CRAN compatibility
# knitr::kable(head(survey_data, 10), caption = "First 10 observations of the example survey dataset")

## ----remove-duplicates--------------------------------------------------------
# # Check for duplicates
# n_duplicates <- nrow(survey_data) - nrow(unique(survey_data))
# cat("Number of duplicate rows:", n_duplicates, "\n")
# 
# # Remove duplicates (if any exist)
# clean_data <- remove_duplicates(survey_data)
# cat("Dataset size after removing duplicates:", nrow(clean_data), "rows\n")

## ----missing-values-----------------------------------------------------------
# # Check for missing values
# missing_summary <- sapply(clean_data, function(x) sum(is.na(x)))
# missing_table <- data.frame(
#   Variable = names(missing_summary),
#   Missing_Count = missing_summary,
#   Missing_Percentage = round(missing_summary / nrow(clean_data) * 100, 2)
# )
# knitr::kable(missing_table, caption = "Missing value summary")
# 
# # For demonstration, let's introduce some missing values
# clean_data$Income[sample(1:nrow(clean_data), min(5, nrow(clean_data)))] <- NA
# clean_data$Education[sample(1:nrow(clean_data), min(3, nrow(clean_data)))] <- NA
# 
# # Clean missing values using mean imputation for Income
# clean_data_income <- clean_missing(clean_data, "Income", method = "mean")
# 
# # Clean missing values using mode imputation for Education
# clean_data_final <- clean_missing(clean_data_income, "Education", method = "mode")
# 
# # Verify missing values are handled
# cat("Missing values after cleaning:\n")
# print(sapply(clean_data_final, function(x) sum(is.na(x))))

## ----standardize-categories---------------------------------------------------
# # Check current gender categories
# cat("Current gender categories:\n")
# print(table(clean_data_final$Gender))
# 
# # For demonstration, let's create some inconsistent categories
# clean_data_final$Gender[sample(1:nrow(clean_data_final), min(8, nrow(clean_data_final)))] <- "M"
# clean_data_final$Gender[sample(1:nrow(clean_data_final), min(7, nrow(clean_data_final)))] <- "F"
# 
# # Standardize gender categories
# gender_mapping <- list(
#   "M" = "Male", "Male" = "Male",
#   "F" = "Female", "Female" = "Female"
# )
# 
# clean_data_standardized <- standardize_categories(clean_data_final, "Gender", gender_mapping)
# 
# # Verify standardization
# cat("Gender categories after standardization:\n")
# print(table(clean_data_standardized$Gender))

## ----apply-weights------------------------------------------------------------
# # Check weight distribution
# cat("Weight summary:\n")
# print(summary(clean_data_standardized$Weight))
# 
# # Apply normalized weights
# weighted_data <- apply_weights(clean_data_standardized, "Weight")
# 
# # Compare weight distributions
# cat("Original weight sum:", sum(clean_data_standardized$Weight), "\n")
# cat("Normalized weight sum:", sum(weighted_data$Weight), "\n")
# cat("Sample size:", nrow(weighted_data), "\n")

## ----weighted-stats-----------------------------------------------------------
# # Calculate unweighted and weighted means for Income
# unweighted_mean_income <- mean(weighted_data$Income, na.rm = TRUE)
# weighted_mean_income <- weighted_mean(weighted_data, "Income", "Weight")
# 
# cat("Unweighted mean income:", round(unweighted_mean_income, 2), "\n")
# cat("Weighted mean income:", round(weighted_mean_income, 2), "\n")
# 
# # Calculate weighted total income
# total_income <- weighted_total(weighted_data, "Income", "Weight")
# cat("Weighted total income:", round(total_income, 0), "\n")

## ----raking-weights, eval = FALSE---------------------------------------------
# # Create population targets for raking
# population_targets <- list(
#   Gender = c(Male = 1000000, Female = 1050000),
#   Education = c(`High School` = 800000, Bachelor = 900000, Graduate = 350000)
# )
# 
# # Apply raking (note: this is a simplified demonstration)
# weighted_data <- rake_weights(weighted_data, population_targets, "Weight")
# 
# # Compare weight distributions before and after raking
# cat("Weight statistics before raking:\n")
# print(summary(weighted_data$Weight))
# 
# cat("Weight statistics after raking:\n")
# print(summary(weighted_data$Weight))

## ----descriptive-stats, eval = FALSE------------------------------------------
# # Generate comprehensive survey description
# survey_description <- describe_survey(weighted_data, weight_col = "Weight")
# 
# # Display sample information
# cat("Sample Size:", survey_description$Sample_Size, "\n")
# cat("Number of Variables:", survey_description$Number_Variables, "\n")
# 
# # Display numeric variable statistics
# if (!is.null(survey_description$Numeric_Statistics)) {
#   knitr::kable(survey_description$Numeric_Statistics,
#         caption = "Numeric variable statistics (including weighted means)")
# }
# 
# # Display categorical variable statistics for Gender
# if (!is.null(survey_description$Categorical_Statistics$Gender)) {
#   knitr::kable(survey_description$Categorical_Statistics$Gender,
#         caption = "Gender distribution (unweighted and weighted)")
# }

## ----frequency-tables---------------------------------------------------------
# # Generate frequency table for Education
# education_freq <- frequency_table(weighted_data, "Education", "Weight")
# knitr::kable(education_freq, caption = "Education frequency distribution (weighted)")
# 
# # Generate frequency table for Gender (unweighted)
# gender_freq_unweighted <- frequency_table(weighted_data, "Gender")
# knitr::kable(gender_freq_unweighted, caption = "Gender frequency distribution (unweighted)")

## ----cross-tabulation---------------------------------------------------------
# # Create cross-tabulation between Gender and Education
# cross_tab <- cross_tabulation(weighted_data, "Gender", "Education", "Weight")
# 
# # Display cross-tabulation table
# cat("Cross-tabulation table (unweighted):\n")
# print(cross_tab$Cross_Table)
# 
# # Display chi-square test results
# cat("\nChi-square test results:\n")
# cat("Statistic:", cross_tab$Chi_Square_Test$Statistic, "\n")
# cat("Degrees of freedom:", cross_tab$Chi_Square_Test$DF, "\n")
# cat("P-value:", cross_tab$Chi_Square_Test$P_Value, "\n")
# cat("Method:", cross_tab$Chi_Square_Test$Method, "\n")
# 
# # Display weighted cross-tabulation
# if (cross_tab$Weighted) {
#   cat("\nWeighted cross-tabulation table:\n")
#   print(cross_tab$Weighted_Cross_Table)
# }

## ----histogram-age------------------------------------------------------------
# # Create histogram for Age
# age_histogram <- plot_histogram(weighted_data, "Age", bins = 25, add_density = TRUE)
# print(age_histogram)

## ----bar-education------------------------------------------------------------
# # Create weighted bar plot for Education
# education_bar <- plot_weighted_bar(weighted_data, "Education", "Weight", show_percentages = TRUE)
# print(education_bar)

## ----boxplot-income-----------------------------------------------------------
# # Create box plot of Income by Gender
# income_boxplot <- plot_boxplot(weighted_data, "Income", "Gender", add_points = TRUE)
# print(income_boxplot)

## ----boxplot-age--------------------------------------------------------------
# # Create box plot for Age only
# age_boxplot <- plot_boxplot(weighted_data, "Age", add_points = TRUE)
# print(age_boxplot)

## ----table1-------------------------------------------------------------------
# # Create publication-ready table
# table1_data <- data.frame(
#   Variable = c("Age", "Income", "Gender (Male)", "Education (College+)"),
#   Unweighted = c(
#     paste0(round(mean(weighted_data$Age, na.rm = TRUE), 1), " (",
#            round(sd(weighted_data$Age, na.rm = TRUE), 1), ")"),
#     paste0("$", round(mean(weighted_data$Income, na.rm = TRUE), 0)),
#     paste0(sum(weighted_data$Gender == "Male"), " (",
#            round(mean(weighted_data$Gender == "Male") * 100, 1), "%)"),
#     paste0(sum(weighted_data$Education %in% c("Bachelor", "Graduate")), " (",
#            round(mean(weighted_data$Education %in% c("Bachelor", "Graduate")) * 100, 1), "%)")
#   ),
#   Weighted = c(
#     paste0(round(weighted_mean(weighted_data, "Age", "Weight"), 1), " (",
#            "SD not calculated)"),
#     paste0("$", round(weighted_mean(weighted_data, "Income", "Weight"), 0)),
#     paste0(round(survey_description$Categorical_Statistics$Gender$Weighted_Percentage[
#       survey_description$Categorical_Statistics$Gender$Category == "Male"], 1), "%"),
#     paste0(round(
#       sum(survey_description$Categorical_Statistics$Education$Weighted_Percentage[
#         survey_description$Categorical_Statistics$Education$Category %in% c("Bachelor", "Graduate")]), 1), "%")
#   )
# )
# 
# knitr::kable(table1_data, caption = "Table 1: Sample Characteristics (Unweighted vs Weighted)")

## ----table2-------------------------------------------------------------------
# # Create cross-tabulation results table
# table2_data <- data.frame(
#   Category = c("Chi-square statistic", "Degrees of freedom", "P-value"),
#   Value = c(
#     round(cross_tab$Chi_Square_Test$Statistic, 3),
#     cross_tab$Chi_Square_Test$DF,
#     format.pval(cross_tab$Chi_Square_Test$P_Value, digits = 3)
#   )
# )
# 
# knitr::kable(table2_data, caption = "Table 2: Chi-square Test of Independence (Gender vs Education)")

## ----reproducibility----------------------------------------------------------
# # Display session information for reproducibility
# session_info <- sessionInfo()
# cat("R version:", session_info$R.version$version, "\n")
# cat("Platform:", session_info$R.version$platform, "\n")
# cat("Packages loaded:\n")
# print(names(session_info$loadedOnly))

## ----final-check--------------------------------------------------------------
# # Final data quality check
# cat("Final dataset quality:\n")
# cat("Rows:", nrow(weighted_data), "\n")
# cat("Columns:", ncol(weighted_data), "\n")
# cat("Missing values:", sum(is.na(weighted_data)), "\n")
# cat("Weight sum:", sum(weighted_data$Weight, na.rm = TRUE), "\n")
# cat("Analysis completed successfully!\n")

