---
title: "Adaptive fitting"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{adaptive-fitting}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r, include = FALSE}
ggplot2dplyr_available <- requireNamespace("ggplot2", quietly = TRUE) && 
  requireNamespace("dplyr", quietly = TRUE)
```

For reproducibility:

```{r}
set.seed(1)
```

# Data

```{r}
library(disclapmix)
data(danes)
db <- as.matrix(danes[rep(1:nrow(danes), danes$n), 1:(ncol(danes)-1)])
str(db)
```


# Using default parameters: partition around medoids (PAM)

Using partition around medoids (PAM) cluster method to find initial clusters:

```{r}
default_fits <- disclapmix_adaptive(db, label = "PAM", margin = 5L)
```

The `label` argument is added to the resulting fits (the advantage is demonstrated later).

# Using custom `init_y_method`: clustering large applications (CLARA)

```{r}
clara_fits <- disclapmix_adaptive(db, label = "CLARA", margin = 5L, init_y_method = "clara")
```

# Using custom `init_y`

Note the argument `init_y_generator` for `disclapmix_adaptive()`:

```{r}
# Random observations:
my_init_y_generator <- function(k) {
  # Or cluster::pam(), cluster::clara() or something else
  db[sample(seq_len(nrow(db)), k, replace = FALSE), , drop = FALSE]
}

my_init_y_generator(1)
my_init_y_generator(2)
```

```{r}
custom_fits <- disclapmix_adaptive(db, label = "Custom", 
                                   margin = 1L, # Just demonstrating my_init_y_generator()
                                   init_y_generator = my_init_y_generator)
rm(custom_fits_best) # To avoid using it by accident later
```

Now, we can do multiple and take the best:

```{r}
set.seed(2) # For reproducibility
custom_fits_extra <- replicate(5, 
                               disclapmix_adaptive(db, 
                                                   label = "Custom", 
                                                   margin = 5L, 
                                                   init_y_generator = my_init_y_generator, 
                                                   # Random starting points may need more iterations
                                                   glm_control_maxit = 100L)
)
str(custom_fits_extra, 2)

custom_fits_max_n <- max(sapply(custom_fits_extra, length))
custom_fits_best <- vector("list", custom_fits_max_n)

for (i in seq_len(custom_fits_max_n)) {
  best_fit_i <- NULL
  
  for (j in seq_along(custom_fits_extra)) {
    if (length(custom_fits_extra[[j]]) < i) {
      next
    }
    
    if (is.null(best_fit_i) || 
        best_fit_i$BIC_marginal > custom_fits_extra[[j]][[i]]$BIC_marginal) {
      
      best_fit_i <- custom_fits_extra[[j]][[i]]
    }
  }
  
  custom_fits_best[[i]] <- best_fit_i
}
```



# Visualising

First we put all fits into a single list:

```{r}
fits <- c(default_fits, clara_fits, custom_fits_best)
```

And then construct a data frame with summary results:

```{r}
d <- data.frame(
  Label = sapply(fits, function(x) x$label),
  BIC = sapply(fits, function(x) x$BIC_marginal),
  Clusters = sapply(fits, function(x) nrow(x$y))
)
```


```{r, eval = ggplot2dplyr_available, fig.width=8, fig.height=5}
library(ggplot2)
ggplot(d, aes(Clusters, BIC, color = Label)) + 
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = unique(d$Clusters)) + 
  theme_bw()
```

```{r, eval = ggplot2dplyr_available}
library(dplyr)

d %>% 
  group_by(Label) %>% 
  summarise(best_clusters = Clusters[which.min(BIC)])
```


# Saving fits

For all of the above, you can save the objects:

```{r, eval = FALSE}
saveRDS(default_fits, "obj-default_fits.Rdata")
saveRDS(clara_fits, "obj-clara_fits.Rdata")
saveRDS(custom_fits_best, "obj-custom_fits_best.Rdata")
```


# Multiple criteria

```{r}
fits <- disclapmix_adaptive(db, criteria = c("BIC", "AIC", "AICc"), margin = 5L)
length(fits)
d <- data.frame(
  BIC = sapply(fits, function(x) x$BIC_marginal),
  AIC = sapply(fits, function(x) x$AIC_marginal),
  AICc = sapply(fits, function(x) x$AICc_marginal),
  Clusters = sapply(fits, function(x) nrow(x$y))
)
best_BIC <- d$Clusters[which.min(d$BIC)]
best_AIC <- d$Clusters[which.min(d$AIC)]
best_AICc <- d$Clusters[which.min(d$AICc)]
```

```{r, eval = ggplot2dplyr_available, fig.width=8, fig.height=5}
library(ggplot2)
ggplot(d) + 
  
  geom_vline(aes(xintercept = best_BIC, color = "BIC"), linetype = "dashed") +
  geom_vline(aes(xintercept = best_AIC, color = "AIC"), linetype = "dashed") +
  geom_vline(aes(xintercept = best_AICc, color = "AICc"), linetype = "dashed") +
  
  geom_line(aes(Clusters, BIC, color = "BIC")) +
  geom_line(aes(Clusters, AIC, color = "AIC")) +
  geom_line(aes(Clusters, AICc, color = "AICc")) +
  
  scale_x_continuous(breaks = unique(d$Clusters)) + 
  
  labs(y = "Information criteria value", color = "Information criteria") + 
  
  theme_bw()
```
