## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  fig.width  = 6,
  fig.height = 6
)

## ----setup, message = FALSE---------------------------------------------------
library(gghinton)
library(ggplot2)

## ----theme-mackay-------------------------------------------------------------
theme_mackay <- function() {
  theme_hinton() +
    theme(
      panel.background = element_rect(fill = "black", colour = NA),
      panel.border = element_rect(colour = "grey30", fill = NA,
                                  linewidth = 0.4), 
      axis.text = element_text(size = 12, family = "mono")
    )
}

## ----unigram------------------------------------------------------------------
chars27     <- c(letters, " ")
axis_labels <- c(letters, "_")

# Probabilities from MacKay ITILA Table / Figure 2.1
p_char <- c(
  a = 0.0575, b = 0.0128, c = 0.0263, d = 0.0285, e = 0.0913,
  f = 0.0173, g = 0.0133, h = 0.0313, i = 0.0599, j = 0.0006,
  k = 0.0084, l = 0.0335, m = 0.0235, n = 0.0596, o = 0.0689,
  p = 0.0192, q = 0.0008, r = 0.0508, s = 0.0567, t = 0.0706,
  u = 0.0334, v = 0.0069, w = 0.0119, x = 0.0073, y = 0.0164,
  z = 0.0007, ` ` = 0.1928
)

# Display as a single-column Hinton diagram (1x27 matrix, one column)
unigram_mat <- matrix(p_char, nrow = length(p_char), ncol = 1,
                      dimnames = list(chars27, "p"))
df_uni <- matrix_to_hinton(unigram_mat)

ggplot(df_uni, aes(x = col, y = row, weight = weight)) +
  geom_hinton() +
  scale_fill_hinton(values = c(unsigned = "white")) +
  scale_y_continuous(breaks = seq_along(chars27),
                     labels = rev(axis_labels),
                     expand = c(0.02, 0.02)) +
  scale_x_continuous(breaks = NULL) +
  coord_fixed() +
  theme_mackay() +
  theme(axis.text.y = element_text(size = 8, family = "mono")) +
  labs(
    x        = NULL,
    y        = NULL
  )

## ----bigram-compute-----------------------------------------------------------
# alice_bigrams[x, y] = count of character x immediately followed by y
bg_prob <- alice_bigrams / sum(alice_bigrams)

# Axis labels: a-z then "-" for space (MacKay's convention)
chars27     <- c(letters, " ")
axis_labels <- c(letters, "_")

df_bg <- matrix_to_hinton(bg_prob)

## ----bigram-plot, fig.width = 7.5, fig.height = 7.5---------------------------
ggplot(df_bg, aes(x = col, y = row, weight = weight)) +
  geom_hinton() +
  scale_fill_hinton(values = c(unsigned = "white")) +
  # x: column 1 = 'a', column 27 = '-' (space)
  scale_x_continuous(
    breaks = seq_along(chars27),
    labels = axis_labels,
    expand = c(0.02, 0.02)
  ) +
  # y: row 1 (matrix row 'a') maps to highest y; labels reversed so 'a' is at top
  scale_y_continuous(
    breaks = seq_along(chars27),
    labels = rev(axis_labels),
    expand = c(0.02, 0.02)
  ) +
  coord_fixed() +
  theme_mackay() +
  labs(
    title    = "English letter bigrams: joint probability P(x, y)",
    subtitle = "Recreating MacKay ITILA Figure 2.2 (source: Alice in Wonderland)",
    x        = "y (second character)",
    y        = "x (first character)"
  )

## ----corpus-size--------------------------------------------------------------
# Fraction of the 729 cells with at least one observed bigram
mean(alice_bigrams > 0)
# Total bigrams observed
sum(alice_bigrams)

## ----cond-compute-------------------------------------------------------------
# P(y|x): row-normalise -- each row sums to 1
row_sums <- rowSums(alice_bigrams)
cond_yx  <- alice_bigrams / row_sums          # M[x, y] = P(y | first = x)

# P(x|y): column-normalise -- each column sums to 1
col_sums <- colSums(alice_bigrams)
cond_xy  <- sweep(alice_bigrams, 2, col_sums, "/")  # M[x, y] = P(x | second = y)

# Combine into one data frame for faceting
df_yx <- matrix_to_hinton(cond_yx)
df_xy <- matrix_to_hinton(cond_xy)
df_yx$panel <- "(a) P(y | x)"
df_xy$panel <- "(b) P(x | y)"
df_cond <- rbind(df_yx, df_xy)

## ----cond-plot, fig.width = 15, fig.height = 7.5------------------------------
ggplot(df_cond, aes(x = col, y = row, weight = weight)) +
  geom_hinton() +
  scale_fill_hinton(values = c(unsigned = "white")) +
  scale_x_continuous(breaks = seq_along(chars27), labels = axis_labels,
                     expand = c(0.02, 0.02)) +
  scale_y_continuous(breaks = seq_along(chars27), labels = rev(axis_labels),
                     expand = c(0.02, 0.02)) +
  coord_fixed() +
  facet_wrap(~ panel, ncol = 2) +
  theme_mackay() +
  labs(
    title    = "English letter bigrams: conditional probability P(x|y) and P(y|x)",
    subtitle = "Recreating MacKay ITILA Figure 2.3",
    x        = "y (second character)",
    y        = "x (first character)"
  )

## ----urn-compute--------------------------------------------------------------
N       <- 10L
u_vals  <- 0:N   # number of black balls in the urn (Fred's choice)
nB_vals <- 0:N   # number of black balls observed in N draws (Bill's data)

# Rows = u (0..10), columns = n_B (0..10)
joint_mat <- outer(u_vals, nB_vals, function(u, nB) {
  (1 / (N + 1)) * dbinom(nB, size = N, prob = u / N)
})
rownames(joint_mat) <- u_vals
colnames(joint_mat) <- nB_vals

df_urn <- matrix_to_hinton(joint_mat)

## ----urn-plot, fig.width = 6, fig.height = 6----------------------------------
ggplot(df_urn, aes(x = col, y = row, weight = weight)) +
  geom_hinton() +
  scale_fill_hinton(values = c(unsigned = "white")) +
  # row 1 of the matrix (u = 0) maps to the highest y, so labels are reversed
  scale_x_continuous(breaks = 1:(N + 1L), labels = nB_vals,
                     expand = c(0.04, 0.04)) +
  scale_y_continuous(breaks = 1:(N + 1L), labels = rev(u_vals),
                     expand = c(0.04, 0.04)) +
  coord_fixed() +
  theme_mackay() +
  labs(
    title    = "Joint probability P(u, n_B | N = 10)",
    subtitle = "Recreating MacKay ITILA Figure 2.5",
    x        = expression(n[B]~~"(observed black balls)"),
    y        = expression(u~~"(black balls in urn)")
  )

