## ----global_options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(echo=TRUE, eval=FALSE, warning=FALSE, message=FALSE)

## -----------------------------------------------------------------------------
#  library(text2vec)
#  text8_file = "~/text8"
#  if (!file.exists(text8_file)) {
#    download.file("http://mattmahoney.net/dc/text8.zip", "~/text8.zip")
#    unzip ("~/text8.zip", files = "text8", exdir = "~/")
#  }
#  wiki = readLines(text8_file, n = 1, warn = FALSE)

## -----------------------------------------------------------------------------
#  # Create iterator over tokens
#  tokens <- space_tokenizer(wiki)
#  # Create vocabulary. Terms will be unigrams (simple words).
#  it = itoken(tokens, progressbar = FALSE)
#  vocab <- create_vocabulary(it)

## -----------------------------------------------------------------------------
#  vocab <- prune_vocabulary(vocab, term_count_min = 5L)

## -----------------------------------------------------------------------------
#  # Use our filtered vocabulary
#  vectorizer <- vocab_vectorizer(vocab)
#  # use window of 5 for context words
#  tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)

## ----message=TRUE-------------------------------------------------------------
#  glove = GlobalVectors$new(rank = 50, x_max = 10)
#  wv_main = glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01, n_threads = 8)
#  # INFO  [09:35:20.779] epoch 1, loss 0.1758
#  # INFO  [09:35:28.212] epoch 2, loss 0.1223
#  # INFO  [09:35:35.500] epoch 3, loss 0.1081
#  # INFO  [09:35:43.100] epoch 4, loss 0.1003
#  # INFO  [09:35:50.848] epoch 5, loss 0.0953
#  # INFO  [09:35:58.593] epoch 6, loss 0.0917
#  # INFO  [09:36:06.346] epoch 7, loss 0.0890
#  # INFO  [09:36:14.123] epoch 8, loss 0.0868
#  # INFO  [09:36:21.862] epoch 9, loss 0.0851
#  # INFO  [09:36:29.610] epoch 10, loss 0.0836

## -----------------------------------------------------------------------------
#  wv_context = glove$components
#  word_vectors = wv_main + t(wv_context)

## -----------------------------------------------------------------------------
#  berlin <- word_vectors["paris", , drop = FALSE] -
#    word_vectors["france", , drop = FALSE] +
#    word_vectors["germany", , drop = FALSE]
#  cos_sim = sim2(x = word_vectors, y = berlin, method = "cosine", norm = "l2")
#  head(sort(cos_sim[,1], decreasing = TRUE), 5)
#  #     paris    berlin    munich    madrid   germany
#  # 0.7859821 0.7410693 0.6490518 0.6216343 0.6160014

