## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# handle package dependencies in Suggests gracefully
need <- function(pkgs) all(vapply(pkgs, requireNamespace, logical(1), quietly = TRUE))
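# e.g. need(c("igraph", "stringr")) is TRUE only when every listed package is
# installed; chunks below use eval = need(...) to skip gracefully when one isn't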



## ----embedding----------------------------------------------------------------
library(textmineR)

# load the data
data(movie_review, package = "text2vec")

# let's take a small sample so the demo will run quickly
# note: textmineR itself is generally quite scalable, depending on your system
set.seed(123)
s <- sample(1:nrow(movie_review), 200)

movie_review <- movie_review[ s , ]

# let's get those nasty "<br />" symbols out of the way
movie_review$review <- stringr::str_replace_all(movie_review$review, "<br */>", "")

# First create a TCM using skip-grams with a 10-word window
# most options available on CreateDtm are also available for CreateTcm
tcm <- CreateTcm(doc_vec = movie_review$review,
                 skipgram_window = 10,
                 verbose = FALSE,
                 cpus = 1)

# use LDA to get embeddings into probability space
# This will take considerably longer than fitting on a DTM, as the TCM has
# a row for every vocabulary term rather than for every document
embeddings <- FitLdaModel(dtm = tcm,
                          k = 50,
                          iterations = 200,
                          burnin = 180,
                          alpha = 0.1,
                          beta = 0.05,
                          optimize_alpha = TRUE,
                          calc_likelihood = FALSE,
                          calc_coherence = FALSE,
                          calc_r2 = FALSE,
                          cpus = 1)
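
## ----eval = FALSE-------------------------------------------------------------
#   # optional sanity checks on the objects above: the TCM is square, with one
#   # row and one column per vocabulary term
#   dim(tcm)
# 
#   # gamma maps words into probability space: a topics-by-vocabulary matrix
#   # of P(topic | word), so its 50 rows are the embedding dimensions
#   dim(embeddings$gamma)
# 
#   # peek at the words that load most heavily on the first few dimensions
#   GetTopTerms(phi = embeddings$phi, M = 5)[ , 1:5 ]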


## ----eval = FALSE-------------------------------------------------------------
#   # these steps assume a single document `doc` and the `gamma` matrix from
#   # the embeddings fit above; they are assembled into a full function below
# 
#   # parse it into sentences
#   sent <- stringi::stri_split_boundaries(doc, type = "sentence")[[ 1 ]]
# 
#   names(sent) <- seq_along(sent) # so we know index and order
# 
#   # embed the sentences in the model
#   e <- CreateDtm(sent, ngram_window = c(1,1), verbose = FALSE, cpus = 1)
# 
#   # remove any sentences with two or fewer words
#   e <- e[ rowSums(e) > 2 , , drop = FALSE ]
# 
#   # keep only the words shared by the sentences and the embedding model
#   vocab <- intersect(colnames(e), colnames(gamma))
# 
#   # turn word counts into probability distributions over words
#   e <- e / rowSums(e)
# 
#   # project the sentences into the embedding (topic) space
#   e <- e[ , vocab ] %*% t(gamma[ , vocab ])
# 
#   e <- as.matrix(e)
# 

## ----eval = FALSE-------------------------------------------------------------
#   # get the pairwise distances between the embedded sentences
#   e_dist <- CalcHellingerDist(e)

## ----eval = FALSE-------------------------------------------------------------
#   # Hellinger distance is bounded between 0 and 1, so 1 - distance is a
#   # valid similarity; scale it to 0 - 100
#   g <- (1 - e_dist) * 100

## ----eval = FALSE-------------------------------------------------------------
#   # we don't need sentences connected to themselves
#   diag(g) <- 0
# 
#   # keep only each sentence's three most similar neighbors, giving a
#   # nearest-neighbor graph
#   g <- apply(g, 1, function(x){
#     x[ x < sort(x, decreasing = TRUE)[ 3 ] ] <- 0
#     x
#   })
# 
#   # by taking pointwise max, we'll make the matrix symmetric again
#   g <- pmax(g, t(g))

## ----eval = FALSE-------------------------------------------------------------
#   g <- igraph::graph_from_adjacency_matrix(g, mode = "undirected", weighted = TRUE)
# 
#   # calculate eigenvector centrality
#   ev <- igraph::eigen_centrality(g)
# 
#   # take the three most central sentences as the summary
#   result <- sent[ names(ev$vector)[ order(ev$vector, decreasing = TRUE)[ 1:3 ] ] ]
# 
#   # put them back in their original document order
#   result <- result[ order(as.numeric(names(result))) ]
# 
#   paste(result, collapse = " ")

## ----eval = need("igraph"), summaries-----------------------------------------
# 
# # now let's assemble all of those steps into a single function
# 
# summarizer <- function(doc, gamma) {
# 
#   # recursive fanciness to handle multiple docs at once
#   if (length(doc) > 1 )
#     # use a try statement to catch any weirdness that may arise
#     return(sapply(doc, function(d) try(summarizer(d, gamma))))
# 
#   # parse it into sentences
#   sent <- stringi::stri_split_boundaries(doc, type = "sentence")[[ 1 ]]
# 
#   names(sent) <- seq_along(sent) # so we know index and order
# 
#   # embed the sentences in the model
#   e <- CreateDtm(sent, ngram_window = c(1,1), verbose = FALSE, cpus = 1)
# 
#   # remove any sentences with two or fewer words
#   e <- e[ rowSums(e) > 2 , , drop = FALSE ]
# 
#   # keep only the words shared by the sentences and the embedding model
#   vocab <- intersect(colnames(e), colnames(gamma))
# 
#   # turn word counts into probability distributions over words
#   e <- e / rowSums(e)
# 
#   # project the sentences into the embedding (topic) space
#   e <- e[ , vocab ] %*% t(gamma[ , vocab ])
# 
#   e <- as.matrix(e)
# 
#   # get the pairwise distances between the embedded sentences
#   e_dist <- CalcHellingerDist(e)
# 
#   # Hellinger distance is bounded between 0 and 1, so 1 - distance is a
#   # valid similarity; scale it to 0 - 100
#   g <- (1 - e_dist) * 100
# 
#   # we don't need sentences connected to themselves
#   diag(g) <- 0
# 
#   # keep only each sentence's three most similar neighbors, giving a
#   # nearest-neighbor graph
#   g <- apply(g, 1, function(x){
#     x[ x < sort(x, decreasing = TRUE)[ 3 ] ] <- 0
#     x
#   })
# 
#   # by taking pointwise max, we'll make the matrix symmetric again
#   g <- pmax(g, t(g))
# 
#   g <- igraph::graph_from_adjacency_matrix(g, mode = "undirected", weighted = TRUE)
# 
#   # calculate eigenvector centrality
#   ev <- igraph::eigen_centrality(g)
# 
#   # take the three most central sentences as the summary
#   result <- sent[ names(ev$vector)[ order(ev$vector, decreasing = TRUE)[ 1:3 ] ] ]
# 
#   # put them back in their original document order
#   result <- result[ order(as.numeric(names(result))) ]
# 
#   paste(result, collapse = " ")
# }

## ----eval = need("igraph")----------------------------------------------------
# # Let's see the summaries of the first three reviews
# docs <- movie_review$review[ 1:3 ]
# names(docs) <- movie_review$id[ 1:3 ]
# 
# sums <- summarizer(docs, gamma = embeddings$gamma)
# 
# sums
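
## ----eval = FALSE-------------------------------------------------------------
# # a sketch of scaling up: the recursive branch of summarizer() accepts a
# # vector of documents, and the try() wrapper means a review that fails
# # (e.g. too few usable sentences) comes back as a "try-error" we can drop
# all_sums <- summarizer(movie_review$review, gamma = embeddings$gamma)
# 
# all_sums <- all_sums[ !sapply(all_sums, inherits, what = "try-error") ]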

