## ----setup, include = FALSE---------------------------------------------------
# Default knitr chunk options for every chunk in this document:
#   collapse = TRUE  - source and its printed output share one block
#   comment  = "#>"  - prefix placed in front of each output line
#   eval     = FALSE - chunks are shown but not executed
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", eval = FALSE)

## ----install, eval=FALSE------------------------------------------------------
# install.packages("SportMiner")

## ----api-key------------------------------------------------------------------
# library(SportMiner)
# 
# # Option 1: Set directly
# sm_set_api_key("your_api_key_here")
# 
# # Option 2: Set via environment variable (recommended)
# # Add to your .Renviron file:
# # SCOPUS_API_KEY=your_api_key_here
# # Then restart R and run:
# sm_set_api_key()

## ----search-------------------------------------------------------------------
# # Define the search query
# query <- paste0(
#   'TITLE-ABS-KEY(',
#   '("talent identification" OR "sport science" OR "athlete") ',
#   'AND ',
#   '("principal component analysis" OR "PCA" OR "cluster analysis") ',
#   ') AND DOCTYPE(ar) AND PUBYEAR > 2010'
# )
# 
# # Retrieve papers
# papers <- sm_search_scopus(
#   query = query,
#   max_count = 100,
#   verbose = TRUE
# )
# 
# # View the data structure
# head(papers[, c("title", "year", "author_keywords")])

## ----preprocess---------------------------------------------------------------
# # Preprocess abstracts
# processed_data <- sm_preprocess_text(
#   data = papers,
#   text_col = "abstract",
#   min_word_length = 3
# )
# 
# # View the processed data
# head(processed_data)

## ----dtm----------------------------------------------------------------------
# # Create DTM
# dtm <- sm_create_dtm(
#   word_counts = processed_data,
#   min_term_freq = 3,
#   max_term_freq = 0.5
# )
# 
# # Check dimensions
# print(paste("Documents:", dtm$nrow, "| Terms:", dtm$ncol))

## ----optimal-k----------------------------------------------------------------
# # Test different values of k
# k_selection <- sm_select_optimal_k(
#   dtm = dtm,
#   k_range = seq(4, 16, by = 2),
#   method = "gibbs",
#   plot = TRUE
# )
# 
# # View results
# print(k_selection$results)
# print(paste("Optimal k:", k_selection$optimal_k))

## ----train-lda----------------------------------------------------------------
# # Train the model
# lda_model <- sm_train_lda(
#   dtm = dtm,
#   k = k_selection$optimal_k,
#   method = "gibbs",
#   iter = 500
# )

## ----plot-terms---------------------------------------------------------------
# # Plot top terms
# sm_plot_topic_terms(
#   model = lda_model,
#   n_terms = 10
# )

## ----plot-frequency-----------------------------------------------------------
# # Plot document distribution
# sm_plot_topic_frequency(
#   model = lda_model,
#   dtm = dtm
# )

## ----plot-trends--------------------------------------------------------------
# # Add doc_id to papers for joining
# papers$doc_id <- paste0("doc_", seq_len(nrow(papers)))
# 
# # Plot trends
# sm_plot_topic_trends(
#   model = lda_model,
#   dtm = dtm,
#   metadata = papers,
#   doc_id_col = "doc_id"
# )

## ----keyword-network----------------------------------------------------------
# # Create network
# network_plot <- sm_keyword_network(
#   data = papers,
#   keyword_col = "author_keywords",
#   min_cooccurrence = 2,
#   top_n = 30
# )
# 
# print(network_plot)

## ----compare-models-----------------------------------------------------------
# # Run comparison
# comparison <- sm_compare_models(
#   dtm = dtm,
#   k = 10,
#   seed = 1729,
#   verbose = TRUE
# )
# 
# # View metrics
# print(comparison$metrics)
# 
# # Get recommendation
# print(paste("Recommended model:", comparison$recommendation))
# 
# # Use the recommended model
# best_model <- comparison$models[[tolower(comparison$recommendation)]]

## ----custom-theme-------------------------------------------------------------
# library(ggplot2)
# 
# # Create a plot with custom theme settings
# p <- sm_plot_topic_frequency(lda_model, dtm)
# 
# # Add customizations
# p +
#   labs(
#     title = "Distribution of Research Topics in Sport Science",
#     subtitle = "Based on 100 papers from Scopus (2011-2025)"
#   ) +
#   theme_sportminer(base_size = 14, grid = FALSE)