## ----include = FALSE----------------------------------------------------------
# knitr chunk defaults for this document: collapse each chunk's source and
# output into a single block, and prefix printed output lines with "#>"
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## -----------------------------------------------------------------------------
# encoder representations of four different words; each one is stored as a
# 1 x 3 row vector
word_1 = matrix(c(1, 0, 0), ncol = 3)
word_2 = matrix(c(0, 1, 0), ncol = 3)
word_3 = matrix(c(1, 1, 0), ncol = 3)
word_4 = matrix(c(0, 0, 1), ncol = 3)

## -----------------------------------------------------------------------------
# stacking the word embeddings row-wise into a single 4 x 3 matrix
# (one row per word)
words = do.call(rbind, list(word_1, word_2, word_3, word_4))

## -----------------------------------------------------------------------------
print(words)

## -----------------------------------------------------------------------------
# initializing the 3 x 3 query, key and value weight matrices with random
# whole numbers (uniform draws on (0, 3) rounded down); the seed is fixed so
# the same matrices are produced on every run
set.seed(0)
W_Q = floor(matrix(runif(9, min = 0, max = 3), nrow = 3, ncol = 3))
W_K = floor(matrix(runif(9, min = 0, max = 3), nrow = 3, ncol = 3))
W_V = floor(matrix(runif(9, min = 0, max = 3), nrow = 3, ncol = 3))

## -----------------------------------------------------------------------------
# projecting the word embeddings through the three weight matrices to obtain
# one query, key and value row per word
Q = words %*% W_Q
K = words %*% W_K
V = words %*% W_V

## -----------------------------------------------------------------------------
# scoring the query vectors against all key vectors: entry [i, j] is the dot
# product of query i with key j (tcrossprod(Q, K) computes Q %*% t(K))
scores = tcrossprod(Q, K)
print(scores)

## -----------------------------------------------------------------------------
# calculate the max for each row of the scores matrix (kept as a one-column
# matrix); it is subtracted from the scores before exponentiating, which
# leaves the softmax unchanged because the shift cancels in the ratio
maxs = as.matrix(apply(scores, MARGIN = 1, FUN = max))
print(maxs)

## -----------------------------------------------------------------------------
# computing the weights by a row-wise softmax of the scores, scaled by the
# square root of the key dimension (ncol(K)).
# The whole matrix is processed at once, so the result takes its shape from
# `scores` instead of a hard-coded 4 x 4 matrix filled row by row.
weights = exp(sweep(scores, 1, maxs[, 1]) / sqrt(ncol(K)))
# normalising: every row of weights sums to 1
weights = weights / rowSums(weights)

## -----------------------------------------------------------------------------
print(weights)

## -----------------------------------------------------------------------------
# computing the attention by a weighted sum of the value vectors: row i of
# the result combines the rows of V using the weights in row i
attention = weights %*% V

## -----------------------------------------------------------------------------
print(attention)