## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE  # Set to FALSE to skip running the examples when building
)

## ----setup--------------------------------------------------------------------
library(contentanalysis)
library(dplyr)

## ----download-----------------------------------------------------------------
# Fetch the example paper used throughout this vignette
paper_url <- paste0(
  "https://raw.githubusercontent.com/massimoaria/contentanalysis/",
  "master/inst/examples/example_paper.pdf"
)
download.file(url = paper_url, destfile = "example_paper.pdf", mode = "wb")

## ----import-basic-------------------------------------------------------------
# Import with automatic section detection
# (two-column layout; citations in the paper use the author-year style)
doc <- pdf2txt_auto("example_paper.pdf", n_columns = 2, citation_type = "author_year")

# Check what sections were detected
# pdf2txt_auto() returns a named list, one element per detected section
# (plus Full_text — see the readability chunk below)
names(doc)

## ----import-manual, eval=FALSE------------------------------------------------
# # Single column
# doc_single <- pdf2txt_auto("example_paper.pdf", n_columns = 1)
# 
# # Three columns
# doc_three <- pdf2txt_auto("example_paper.pdf", n_columns = 3)
# 
# # Without section splitting
# text_only <- pdf2txt_auto("example_paper.pdf", sections = FALSE)

## ----analysis-----------------------------------------------------------------
# Run the full content analysis: extract in-text citations, parse the
# reference list, and enrich it via the CrossRef / OpenAlex APIs
# (network access required for the DOI lookup)
analysis <- analyze_scientific_content(
  text = doc,
  doi = "10.1016/j.mlwa.2021.100094",        # Paper's DOI for CrossRef lookup
  mailto = "your@email.com",                 # Required for CrossRef API
  citation_type = "author_year",             # Citation style  
  window_size = 10,                          # Words around citations
  remove_stopwords = TRUE,
  ngram_range = c(1, 3),                     # Extract 1-grams through 3-grams
  use_sections_for_citations = TRUE          # Attribute citations to sections
)

## ----results-structure--------------------------------------------------------
# Top-level components returned by analyze_scientific_content()
names(analysis)

## ----summary------------------------------------------------------------------
# Quick overview of the analysis results
analysis$summary

## ----reference-sources--------------------------------------------------------
# Inspect the enriched reference table
ref_cols <- c("ref_first_author", "ref_year", "ref_journal", "ref_source")
head(analysis$parsed_references[, ref_cols])

# Tabulate where each reference's metadata came from
table(analysis$parsed_references[["ref_source"]])

## ----openalex-data------------------------------------------------------------
# Check if OpenAlex data is available (NULL when the lookup failed/was skipped)
if (!is.null(analysis$references_oa)) {
  # View enriched metadata.
  # print() is required: expressions inside an `if` block are not
  # auto-printed, so a bare head() here would display nothing.
  print(head(analysis$references_oa[, c("title", "publication_year", "cited_by_count", 
                                        "type", "is_oa")]))
  
  # Analyze citation impact
  cat("Citation impact statistics:\n")
  print(summary(analysis$references_oa$cited_by_count))
  
  # Open access status (guarded: the column may be absent)
  if ("is_oa" %in% names(analysis$references_oa)) {
    oa_count <- sum(analysis$references_oa$is_oa, na.rm = TRUE)
    cat("\nOpen Access references:", oa_count, "out of", 
        nrow(analysis$references_oa), "\n")
  }
}

## ----matching-quality---------------------------------------------------------
# Citation-to-reference matches, each labelled with a confidence level
matched <- select(
  analysis$citation_references_mapping,
  citation_text_clean, cite_author, cite_year,
  ref_authors, ref_year, match_confidence
)

head(matched)

# How the matches break down by confidence level
cat("Match quality distribution:\n")
print(table(matched$match_confidence))

# Keep only the high-confidence matches
high_conf <- filter(
  matched,
  match_confidence %in% c("high", "high_second_author")
)
cat("\nHigh-confidence matches:", nrow(high_conf), "out of", 
    nrow(matched), "\n")

## ----citations----------------------------------------------------------------
# First few extracted in-text citations
head(analysis$citations)

# Frequency of each citation type found
table(analysis$citations$citation_type)

# How citations are distributed across the detected sections
analysis$citation_metrics$section_distribution

## ----citation-types-----------------------------------------------------------
# Share of narrative vs. parenthetical citations
analysis$citation_metrics$narrative_ratio

# Citation density, normalised per 1000 words of text
cat("Citation density:", 
    analysis$citation_metrics$density$citations_per_1000_words,
    "citations per 1000 words\n")

## ----contexts-----------------------------------------------------------------
# Each in-text citation together with its surrounding text, section,
# matched reference, and match confidence
context_cols <- c("citation_text_clean", "section", "ref_full_text",
                  "full_context", "match_confidence")
contexts <- select(analysis$citation_contexts, all_of(context_cols))

head(contexts)

# Restrict to citations that appear in a given section
intro_citations <- filter(analysis$citation_contexts, section == "Introduction")

cat("Citations in Introduction:", nrow(intro_citations), "\n")

## ----network-create, fig.width=8, fig.height=6--------------------------------
# Create interactive citation network
# (edges connect citations that co-occur within max_distance characters)
network <- create_citation_network(
  citation_analysis_results = analysis,
  max_distance = 800,          # Maximum distance in characters
  min_connections = 2,          # Minimum connections to include a node
  show_labels = TRUE            # Show citation labels
)

# Display the network (auto-prints the returned widget/object)
network

## ----network-stats------------------------------------------------------------
# Get network statistics, stored by create_citation_network() as the
# "stats" attribute of the returned object.
# NOTE(review): the name `stats` shadows the base `stats` package;
# harmless here, but consider a more specific name.
stats <- attr(network, "stats")

# Network size
cat("Number of nodes:", stats$n_nodes, "\n")
cat("Number of edges:", stats$n_edges, "\n")
cat("Average distance:", stats$avg_distance, "characters\n")
cat("Maximum distance:", stats$max_distance, "characters\n")

# Distribution by section (print() needed only inside blocks; kept for clarity)
print(stats$section_distribution)

# Citations appearing in multiple sections (may be empty, hence the guard)
if (nrow(stats$multi_section_citations) > 0) {
  cat("\nCitations appearing in multiple sections:\n")
  print(stats$multi_section_citations)
}

# Color mapping used for the section legend
cat("\nSection colors:\n")
print(stats$section_colors)

## ----network-custom, eval=FALSE-----------------------------------------------
# # Focus on very close citations only
# network_close <- create_citation_network(
#   analysis,
#   max_distance = 300,
#   min_connections = 1
# )
# 
# # Show only highly connected "hub" citations
# network_hubs <- create_citation_network(
#   analysis,
#   max_distance = 1000,
#   min_connections = 5,
#   show_labels = TRUE
# )
# 
# # Clean visualization without labels
# network_clean <- create_citation_network(
#   analysis,
#   max_distance = 800,
#   min_connections = 2,
#   show_labels = FALSE
# )

## ----network-analysis---------------------------------------------------------
# Find hub citations (most connected)
# NOTE(review): assumes stats$section_distribution has a count column `n` —
# confirm against create_citation_network()'s documentation
hub_threshold <- quantile(stats$section_distribution$n, 0.75)
cat("Hub citations (top 25%):\n")
print(stats$section_distribution %>% filter(n >= hub_threshold))

# Analyze network density: edges divided by possible edges in an
# undirected graph, n*(n-1)/2.
# NOTE(review): divides by zero when the network has fewer than 2 nodes
network_density <- stats$n_edges / (stats$n_nodes * (stats$n_nodes - 1) / 2)
cat("\nNetwork density:", round(network_density, 3), "\n")

## ----network-data-------------------------------------------------------------
# Raw pairwise co-occurrence data underlying the network
cooccurrences <- analysis$network_data
head(cooccurrences)

# Pairs of citations appearing within 100 characters of each other
close_citations <- filter(cooccurrences, distance < 100)

cat("Number of very close citation pairs:", nrow(close_citations), "\n")

## ----word-freq----------------------------------------------------------------
# Top 20 most frequent words
head(analysis$word_frequencies, 20)

## ----ngrams-------------------------------------------------------------------
# Bigrams (two-word sequences); backticks needed for the non-syntactic name
head(analysis$ngrams$`2gram`)

# Trigrams (three-word sequences)
head(analysis$ngrams$`3gram`)

## ----readability--------------------------------------------------------------
# Calculate readability for the full text
readability <- calculate_readability_indices(
  doc$Full_text,
  detailed = TRUE
)

print(readability)

# Compare readability across sections.
# Keep only sections actually detected in the document, so the result list
# contains no NULL entries and the do.call(rbind, ...) rows below stay
# correctly labelled (NULL elements would be silently dropped by rbind
# while their names could mislabel the remaining rows).
sections_to_analyze <- c("Abstract", "Introduction", "Methods", "Discussion")
sections_present <- intersect(sections_to_analyze, names(doc))
readability_by_section <- lapply(sections_present, function(section) {
  calculate_readability_indices(doc[[section]], detailed = FALSE)
})
names(readability_by_section) <- sections_present

# View results: one row per detected section
do.call(rbind, readability_by_section)

## ----word-dist----------------------------------------------------------------
# Words and phrases whose distribution across the paper we want to track
terms <- c("random forest", "machine learning", "accuracy", "tree")

# Count each term's occurrences section by section
dist <- calculate_word_distribution(
  text = doc,
  selected_words = terms,
  use_sections = TRUE
)

# Show counts and percentages, highest percentage first within each section
dist %>%
  arrange(segment_name, desc(percentage)) %>%
  select(segment_name, word, count, percentage)

## ----plot, fig.width=8, fig.height=5, eval=TRUE-------------------------------
# Interactive line plot: one trace per term across the sections
plot_word_distribution(
  dist,
  plot_type = "line",
  show_points = TRUE,
  smooth = TRUE
)

# Area-plot variant of the same distribution
plot_word_distribution(
  dist,
  plot_type = "area"
)

## ----find-citations-----------------------------------------------------------
# Citations to a specific author (case-insensitive match on ref_authors)
analysis$citation_references_mapping %>%
  filter(grepl("Breiman", ref_authors, ignore.case = TRUE))

# Citations located in the Discussion section
analysis$citations %>%
  filter(section == "Discussion") %>%
  select(citation_text, citation_type, section)

## ----citation-impact----------------------------------------------------------
# Rank the cited references by their OpenAlex citation counts
if (!is.null(analysis$references_oa)) {
  # Ten most-cited references (print() needed inside the `if` block)
  most_cited <- analysis$references_oa %>%
    select(title, publication_year, cited_by_count, is_oa) %>%
    arrange(desc(cited_by_count)) %>%
    head(10)
  
  print(most_cited)
}

## ----custom-stop, eval=FALSE--------------------------------------------------
# custom_stops <- c("however", "therefore", "thus", "moreover")
# 
# analysis_custom <- analyze_scientific_content(
#   text = doc,
#   doi = "10.1016/j.mlwa.2021.100094",
#   mailto = "your@email.com",
#   custom_stopwords = custom_stops,
#   remove_stopwords = TRUE
# )

## ----segments, fig.height=5, fig.width=8, eval=FALSE--------------------------
# # Divide into 20 equal segments
# dist_segments <- calculate_word_distribution(
#   text = doc,
#   selected_words = terms,
#   use_sections = FALSE,
#   n_segments = 20
# )
# 
# plot_word_distribution(dist_segments, smooth = TRUE)

## ----crossref-setup, eval=FALSE-----------------------------------------------
# # Always provide your email for the polite pool
# analysis <- analyze_scientific_content(
#   text = doc,
#   doi = "10.xxxx/xxxxx",
#   mailto = "your@email.com"  # Required for CrossRef polite pool
# )

## ----openalex-setup, eval=FALSE-----------------------------------------------
# # Optional: Set API key for higher rate limits
# # Get free key at: https://openalex.org/
# openalexR::oa_apikey("your-api-key-here")
# 
# # Then run your analysis as usual
# analysis <- analyze_scientific_content(
#   text = doc,
#   doi = "10.xxxx/xxxxx",
#   mailto = "your@email.com"
# )

## ----export, eval=FALSE-------------------------------------------------------
# # Export citations
# write.csv(analysis$citations, "citations.csv", row.names = FALSE)
# 
# # Export matched references with confidence scores
# write.csv(analysis$citation_references_mapping,
#           "matched_citations.csv", row.names = FALSE)
# 
# # Export enriched references
# write.csv(analysis$parsed_references,
#           "enriched_references.csv", row.names = FALSE)
# 
# # Export OpenAlex metadata (if available)
# if (!is.null(analysis$references_oa)) {
#   write.csv(analysis$references_oa,
#             "openalex_metadata.csv", row.names = FALSE)
# }
# 
# # Export word frequencies
# write.csv(analysis$word_frequencies,
#           "word_frequencies.csv", row.names = FALSE)
# 
# # Export network statistics
# if (!is.null(network)) {
#   stats <- attr(network, "stats")
#   write.csv(stats$section_distribution,
#             "network_section_distribution.csv", row.names = FALSE)
#   if (nrow(stats$multi_section_citations) > 0) {
#     write.csv(stats$multi_section_citations,
#               "network_multi_section_citations.csv", row.names = FALSE)
#   }
# }

## ----batch, eval=FALSE--------------------------------------------------------
# # Process multiple papers with API enrichment
# papers <- c("paper1.pdf", "paper2.pdf", "paper3.pdf")
# dois <- c("10.xxxx/1", "10.xxxx/2", "10.xxxx/3")
# 
# results <- list()
# networks <- list()
# 
# for (i in seq_along(papers)) {
#   # Import PDF
#   doc <- pdf2txt_auto(papers[i], n_columns = 2)
# 
#   # Analyze with API enrichment
#   results[[i]] <- analyze_scientific_content(
#     doc,
#     doi = dois[i],
#     mailto = "your@email.com"
#   )
# 
#   # Create network for each paper
#   networks[[i]] <- create_citation_network(
#     results[[i]],
#     max_distance = 800,
#     min_connections = 2
#   )
# }
# 
# # Combine citation counts
# citation_counts <- sapply(results, function(x) x$summary$citations_extracted)
# names(citation_counts) <- papers
# 
# # Compare network statistics
# network_stats <- lapply(networks, function(net) {
#   stats <- attr(net, "stats")
#   c(nodes = stats$n_nodes,
#     edges = stats$n_edges,
#     avg_distance = stats$avg_distance)
# })
# 
# do.call(rbind, network_stats)
# 
# # Analyze reference sources across papers
# ref_sources <- lapply(results, function(x) {
#   if (!is.null(x$parsed_references)) {
#     table(x$parsed_references$ref_source)
#   }
# })
# names(ref_sources) <- papers
# ref_sources

