## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6, 
  fig.height = 4
)

## -----------------------------------------------------------------------------
library(TopicTestlet)
set.seed(1234)

## -----------------------------------------------------------------------------
# Simulation parameters
N_students <- 100
J_items <- 4
K_topics_true <- 2

# A. Simulate Numeric Scores (0-2)
score_matrix <- matrix(
  sample(0:2, N_students * J_items, replace = TRUE), 
  nrow = N_students, 
  ncol = J_items
)

# B. Simulate Textual Essays
# Define vocabularies for two topics
vocab_topic1 <- c("logic", "reasoning", "evidence", "fact", "analysis", "data")
vocab_topic2 <- c("feeling", "story", "character", "plot", "narrative", "mood")

# Helper function to generate a random essay
generate_essay <- function() {
  words <- sample(c(vocab_topic1, vocab_topic2), size = 20, replace = TRUE)
  paste(words, collapse = " ")
}

# Create matrix of essays
essay_matrix <- matrix(
  replicate(N_students * J_items, generate_essay()),
  nrow = N_students,
  ncol = J_items
)

# Preview the data
head(score_matrix, 3)
substr(essay_matrix[1,1], 1, 50) # First 50 chars of first essay

## -----------------------------------------------------------------------------
text_vector <- aggregate_responses(essay_matrix)

# Check the first student's aggregated text
substr(text_vector[1], 1, 60)

## -----------------------------------------------------------------------------
# In a real analysis, you might check a wider range (e.g., 2:10)
perp_results <- ttm_perplexity(text_vector, k_range = 2:3)

print(perp_results)

# Select the K with the lowest perplexity
best_k <- perp_results$k[which.min(perp_results$perplexity)]
cat("Optimal number of topics:", best_k)

## -----------------------------------------------------------------------------
delta_matrix <- ttm_lda(text_vector, k = best_k)

# The result is an N x K matrix
head(delta_matrix)

## -----------------------------------------------------------------------------
# We use max_iter = 50 for speed in this vignette. 
# For operational use, allow more iterations for convergence.
ttm_results <- ttm_est(
  scores = score_matrix, 
  delta = delta_matrix, 
  max_iter = 50
)

# Model Fit Statistics
print(paste("AIC:", round(ttm_results$AIC, 2)))
print(paste("BIC:", round(ttm_results$BIC, 2)))

## -----------------------------------------------------------------------------
plot(ttm_results$theta, ttm_results$gamma,
     xlab = "Student Ability (Theta)",
     ylab = "Testlet Effect (Gamma)",
     main = "Relationship between Ability and Testlet Effect",
     pch = 19, col = rgb(0, 0, 1, 0.6))
grid()
abline(lm(ttm_results$gamma ~ ttm_results$theta), col = "red", lwd = 2)

## -----------------------------------------------------------------------------
hist(ttm_results$theta, 
     main = "Distribution of Estimated Abilities",
     xlab = "Theta", 
     col = "lightblue", 
     border = "white")

