## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(reticulate)

# Replace the path below with the path of your Python environment
# Then uncomment the command below:
# Tip: BERTOPICR_VENV should be the folder that contains `pyvenv.cfg`.
# Sys.setenv(
#   BERTOPICR_VENV = "C:/Users/teodo/Documents/R/bertopic/bertopic4r",
#   NOT_CRAN = "true"
# )

# 1. Define the libraries you need
required_modules <- c("bertopic", "umap", "hdbscan", "sklearn", "numpy", "plotly", "datetime", "sentence_transformers", "openai", "ollama")

# macOS: if reticulate fails to load Python libraries, run once per session.
if (identical(Sys.info()[["sysname"]], "Darwin")) {
  bertopicr::configure_macos_homebrew_zlib()
}

# Optional: point reticulate at a user-specified virtualenv
venv <- Sys.getenv("BERTOPICR_VENV")
if (nzchar(venv)) {
  venv_cfg <- file.path(venv, "pyvenv.cfg")
  if (file.exists(venv_cfg)) {
    reticulate::use_virtualenv(venv, required = TRUE)
  } else {
    message("Warning: BERTOPICR_VENV does not point to a valid virtualenv: ", venv)
  }
}

# Try to find python, but don't crash if it's missing (e.g. on another user's machine)
if (!reticulate::py_available(initialize = TRUE)) {
  try(reticulate::use_python(Sys.which("python"), required = FALSE), silent = TRUE)
}

# 2. Check if they are installed
python_ready <- tryCatch({
  # Attempt to initialize python and check modules
  py_available(initialize = TRUE) &&
  all(vapply(required_modules, py_module_available, logical(1)))
}, error = function(e) FALSE)

# 3. Only evaluate chunks when Python is ready and NOT_CRAN is set
run_chunks <- python_ready && identical(Sys.getenv("NOT_CRAN"), "true")
knitr::opts_chunk$set(eval = run_chunks)

if (!python_ready) {
  message("Warning: Required Python modules (bertopic, umap-learn) not found. Vignette code will not run.")
} else {
  message("Python environment ready: ", reticulate::py_config()$python)
  if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
    message("Note: Set NOT_CRAN=true to run Python-dependent chunks locally.")
  }
}

## -----------------------------------------------------------------------------
# library(dplyr)
# library(tidyr)
# library(purrr)
# library(utils)
# library(tibble)
# library(readr)
# library(tictoc)
# library(htmltools)
# library(bertopicr)

## ----eval=run_chunks----------------------------------------------------------
# # Import necessary Python modules
# py <- import_builtins()
# np <- import("numpy")
# umap <- import("umap")
# UMAP <- umap$UMAP
# hdbscan <- import("hdbscan")
# HDBSCAN <- hdbscan$HDBSCAN
# sklearn <- import("sklearn")
# CountVectorizer <- sklearn$feature_extraction$text$CountVectorizer
# bertopic <- import("bertopic")
# plotly <- import("plotly")
# datetime <- import("datetime")
# 

## -----------------------------------------------------------------------------
# rds_path <- file.path("inst/extdata", "spiegel_sample.rds")
# dataset <- read_rds(rds_path)
# names(dataset)
# dim(dataset)

## -----------------------------------------------------------------------------
# stopwords_path <- file.path("inst/extdata", "all_stopwords.txt")
# all_stopwords <- read_lines(stopwords_path)

## -----------------------------------------------------------------------------
# texts_cleaned = dataset$text_clean
# titles = dataset$doc_id
# timestamps <- as.list(dataset$date)
# # timestamps <- as.integer(dataset$year)
# 
# texts_cleaned[[1]]

## ----eval=run_chunks----------------------------------------------------------
# # Embed the sentences
# sentence_transformers <- import("sentence_transformers")
# SentenceTransformer <- sentence_transformers$SentenceTransformer
# # choose an appropriate embeddings model
# embedding_model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
# embeddings = embedding_model$encode(texts_cleaned, show_progress_bar=TRUE)
# 

## ----eval=run_chunks----------------------------------------------------------
# # Initialize UMAP and HDBSCAN models
# umap_model <- UMAP(n_neighbors=15L, n_components=5L, min_dist=0.0, metric='cosine', random_state=42L)
# 

## ----eval=run_chunks----------------------------------------------------------
# hdbscan_model <- HDBSCAN(min_cluster_size=50L, min_samples = 20L, metric='euclidean', cluster_selection_method='eom', gen_min_span_tree=TRUE, prediction_data=TRUE, core_dist_n_jobs = 1)
# 

## ----eval=run_chunks----------------------------------------------------------
# # Initialize CountVectorizer
# vectorizer_model <- CountVectorizer(min_df=2L, ngram_range=tuple(1L, 3L),
#                                     max_features = 10000L, max_df = 50L,
#                                     stop_words = all_stopwords)
# sentence_vectors <- vectorizer_model$fit_transform(texts_cleaned)
# sentence_vectors_dense <- np$array(sentence_vectors)
# sentence_vectors_dense <- py_to_r(sentence_vectors_dense)
# 

## ----eval=run_chunks && identical(Sys.getenv("BERTOPICR_ENABLE_REPR"), "true")----
# # Initialize representation models
# keybert_model <- bertopic$representation$KeyBERTInspired()
# openai <- import("openai")
# OpenAI <- openai$OpenAI
# ollama <- import("ollama")
# # lmstudio <- import("lmstudio")
# 
# # Point to the local server (ollama or lm-studio)
# client <- OpenAI(base_url = 'http://localhost:11434/v1', api_key='ollama')
# # client <- OpenAI(base_url = 'http://localhost:1234/v1', api_key='lm-studio')
# 
# prompt <- "
# I have a topic that contains the following documents:
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]
# 
# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic: <topic label>
# "
# 
# # download an appropriate LLM to be hosted by ollama or lm-studio
# openai_model <- bertopic$representation$OpenAI(client,
#                                                model = "gpt-oss:20b",
#                                                exponential_backoff = TRUE,
#                                                chat = TRUE,
#                                                prompt = prompt)
# 
# # downlaod a language model from spacy.io before use here
# # Below a German spacy model is used
# pos_model <- bertopic$representation$PartOfSpeech("de_core_news_lg")
# # diversity set relatively high to reduce repetition of keyword word forms
# mmr_model <- bertopic$representation$MaximalMarginalRelevance(diversity = 0.5)
# 
# # Combine all representation models
# representation_model <- list(
#   "KeyBERT" = keybert_model,
#   "OpenAI" = openai_model,
#   "MMR" = mmr_model,
#   "POS" = pos_model
# )
# 

## ----eval=run_chunks----------------------------------------------------------
# # We can define a number of topics of interest
# zeroshot_topic_list  <- list("german national identity", "minority issues in germany")
# 

## ----eval=run_chunks----------------------------------------------------------
# # Initialize BERTopic model with pipeline models and hyperparameters
# BERTopic <- bertopic$BERTopic
# topic_model <- BERTopic(
#   embedding_model = embedding_model,
#   umap_model = umap_model,
#   hdbscan_model = hdbscan_model,
#   vectorizer_model = vectorizer_model,
#   # zeroshot_topic_list = zeroshot_topic_list,
#   # zeroshot_min_similarity = 0.85,
#   representation_model = representation_model,
#   calculate_probabilities = TRUE,
#   top_n_words = 200L, # if you need more top words, insert the desired number here!!!
#   verbose = TRUE
# )
# 

## ----eval=run_chunks----------------------------------------------------------
# tictoc::tic()
# 
# # Fit the model and transform the texts
# fit_transform <- topic_model$fit_transform(texts_cleaned, embeddings)
# topics <- fit_transform[[1]]
# 
# # Now transform the texts to get the updated probabilities
# transform_result <- topic_model$transform(texts_cleaned)
# probs <- transform_result[[2]]  # Extract the updated probabilities
# 
# tictoc::toc()
# 

## ----eval=run_chunks----------------------------------------------------------
# # Converting R Date to Python datetime
# datetime <- import("datetime")
# 
# timestamps <- as.list(dataset$date)
# # timestamps <- as.integer(dataset$year)
# 
# # Convert each R date object to an ISO 8601 string
# timestamps <- lapply(timestamps, function(x) {
#   format(x, "%Y-%m-%dT%H:%M:%S")  # ISO 8601 format
# })
# 
# # Dynamic topic model
# topics_over_time  <- topic_model$topics_over_time(texts_cleaned, timestamps, nr_bins=20L, global_tuning=TRUE, evolution_tuning=TRUE)
# 

## -----------------------------------------------------------------------------
# # Combine results with additional columns
# results <- dataset |>
#   mutate(Topic = topics,
#          Probability = apply(probs, 1, max))  # Assuming the highest probability for each sentence
# 
# results <- results |>
#   mutate(row_id = row_number()) |>
#   select(row_id, everything())
# 
# head(results,10) |> rmarkdown::paged_table()
# 

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# results |>
#   saveRDS("inst/extdata/spiegel_topic_results_df.rds", version = 2)
# results |>
#   write_csv("inst/extdata/spiegel_topic_results_df.csv")
# 

## ----eval=run_chunks----------------------------------------------------------
# library(bertopicr)
# document_info_df <- get_document_info_df(model = topic_model,
#                                          texts = texts_cleaned,
#                                          drop_expanded_columns = TRUE)
# document_info_df |> head() |> rmarkdown::paged_table()

## ----eval=run_chunks----------------------------------------------------------
# # Create a data frame similar to df_docs
# df_docs <- tibble(Topic = results$Topic,
#                   Document = results$text_clean,
#                   probs = results$Probability)
# rep_docs <- get_most_representative_docs(df = df_docs,
#                                          topic_nr = 3,
#                                          n_docs = 5)
# unique(rep_docs)

## ----eval=run_chunks----------------------------------------------------------
# topic_info_df <- get_topic_info_df(model = topic_model,
#                                    drop_expanded_columns = TRUE)
# head(topic_info_df) |> rmarkdown::paged_table()

## ----eval=run_chunks----------------------------------------------------------
# topics_df <- get_topics_df(model = topic_model)
# head(topics_df, 10)

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_barchart(model = topic_model,
#                    filename = "topics_topwords_interactive_barchart.html", # default
#                    open_file = FALSE) # TRUE enables output in browser

## ----eval=run_chunks----------------------------------------------------------
# library(ggplot2)
# 
# barchart <- topics_df |>
#   group_by(Topic) |>
#   filter(Topic >= 0 & Topic <= 8) |>
#   slice_head(n=5) |>
#   mutate(Topic = paste("Topic", as.character(Topic)),
#          Word = reorder(Word, Score)) |>
#   ggplot(aes(Score, Word, fill = Topic)) +
#   geom_col() +
#   facet_wrap(~ Topic, scales = "free") +
#   theme(legend.position = "none")
# 
# # # Disabled to avoid poential conflicts
# # library(plotly)
# # ggplotly(barchart)

## ----eval=run_chunks----------------------------------------------------------
# find_topics_df(model = topic_model,
#                queries = "migration", # user input
#                top_n = 10, # default
#                return_tibble = TRUE) # default

## ----eval=run_chunks----------------------------------------------------------
# find_topics_df(model = topic_model,
#                                queries = c("migranten", "asylanten"),
#                                top_n = 5)

## ----eval=run_chunks----------------------------------------------------------
# get_topic_df(model = topic_model,
#                            topic_number = 0,
#                            top_n = 5, # default is 10
#                            return_tibble = TRUE) # default

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # default filename: topic_dist_interactive.html
# visualize_distribution(model = topic_model,
#                        text_id = 1, # user input
#                        probabilities = probs) # see model training

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_topics(model = topic_model,
#                  filename = "intertopic_distance_map") # default name

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_heatmap(model = topic_model,
#                   filename = "topics_similarity_heatmap",
#                   auto_open = FALSE)

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_hierarchy(model = topic_model,
#                     hierarchical_topics = NULL, # default
#                     filename = "topic_hierarchy", # default name, html extension
#                     auto_open = FALSE) # TRUE enables output in browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# hierarchical_topics = topic_model$hierarchical_topics(texts_cleaned)
# visualize_hierarchy(model = topic_model,
#                     hierarchical_topics = hierarchical_topics,
#                     filename = "topic_hierarchy", # default name, html extension
#                     auto_open = FALSE) # TRUE enables output in browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce dimensionality of embeddings using UMAP
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 2L, min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
# 
# visualize_documents(model = topic_model,
#                     texts = texts_cleaned,
#                     reduced_embeddings = reduced_embeddings,
#                     filename = "visualize_documents", # default extension html
#                     auto_open = FALSE) # TRUE enables output in browser
# 

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce dimensionality of embeddings using UMAP (n_components = 2L !!!)
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 2L, min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
# 
# visualize_documents_2d(model = topic_model,
#                        texts = texts_cleaned,
#                        reduced_embeddings = reduced_embeddings,
#                        custom_labels = FALSE, # default
#                        hide_annotation = TRUE, # default
#                        tooltips = c("Topic", "Name", "Probability", "Text"), # default
#                        filename = "visualize_documents_2d", # default name
#                        auto_open = FALSE) # TRUE enables output in browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce dimensionality of embeddings using UMAP
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 3L, min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
# 
# visualize_documents_3d(model = topic_model,
#                        texts = texts_cleaned,
#                        reduced_embeddings = reduced_embeddings,
#                        custom_labels = FALSE, # default
#                        hide_annotation = TRUE, # default
#                        tooltips = c("Topic", "Name", "Probability", "Text"), # default
#                        filename = "visualize_documents_3d", # default name
#                        auto_open = FALSE) # TRUE enables output in browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_topics_over_time(model = topic_model,
#                            # see Topic Dynamics section above
#                            topics_over_time_model = topics_over_time,
#                            top_n_topics = 10, # default is 20
#                            filename = "topics_over_time") # default, html extension

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# classes = as.list(dataset$genre) # text types
# topics_per_class = topic_model$topics_per_class(texts_cleaned, classes=classes)
# 
# visualize_topics_per_class(model = topic_model,
#                            topics_per_class = topics_per_class,
#                            start = 0, # default
#                            end = 10, # default
#                            filename = "topics_per_class", # default, html extension
#                            auto_open = FALSE) # TRUE enables output in browser

## ----eval=run_chunks----------------------------------------------------------
# BERTopic200 <- bertopic$BERTopic
# topic_model200 <- BERTopic200(
#   embedding_model = embedding_model,
#   umap_model = umap_model,
#   hdbscan_model = hdbscan_model,
#   vectorizer_model = vectorizer_model,
#   # zeroshot_topic_list = zeroshot_topic_list,
#   # zeroshot_min_similarity = 0.85,
#   representation_model = representation_model,
#   calculate_probabilities = TRUE,
#   top_n_words = 200L, # !!!
#   verbose = TRUE
# )
# 
# tictoc::tic()
# 
# # Fit the model and transform the texts
# py_fit <- topic_model200$fit(texts_cleaned, embeddings)
# 
# # ask Python for the top-200 of the desired topic:
# py_topic200 <- py_fit$get_topic(1L, 200L)    # list of (word, score)
# 
# names(py_topic200)
# 
# rep_list <- py_topic200[["Main"]]
# 
# tictoc::toc()
# 

## ----eval=run_chunks----------------------------------------------------------
# df_wc <- data.frame(
#   name = sapply(rep_list, `[[`, 1),
#   freq = as.numeric(sapply(rep_list, `[[`, 2)),
#   stringsAsFactors = FALSE
# )
# 
# library(wordcloud2)
# source("inst/extdata/wordcloud2a.R")
# 
# wordcloud2a(
#   data            = df_wc,
#   size            = 0.5,
#   minSize         = 0,
#   gridSize        = 1,
#   fontFamily      = "Segoe UI",
#   fontWeight      = "bold",
#   color           = "random-dark",
#   backgroundColor = "white",
#   shape           = "circle",
#   ellipticity     = 0.65
# )
# 

