---
title: "Regression Example"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{RegressionEg}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=6, fig.height=4
)
```

```{r setup}
library(ensModelVis)
```



```{r, include = FALSE}
if (rlang::is_installed("stacks") && 
    rlang::is_installed("tidymodels") && 
    rlang::is_installed("kernlab")) {
  run <- TRUE
} else {
  run <- FALSE
}

knitr::opts_chunk$set(
  eval = run
)
```

An example of fitting a stacked regression ensemble from `stacks` package vignette and using `ensModelVis` for visualising the models.

Packages we will need:

```{r libs, message=FALSE}
library(tidymodels)
library(stacks)
```

Dataset: predict `mpg` based on other attributes in `mtcars` data. 

```{r}
data("mtcars")
mtcars <- mtcars |> mutate(cyl = as.factor(cyl), vs = as.factor(vs), am = as.factor(am))
```



Split the training data, generate resamples, set the recipe and metric.

```{r}

set.seed(1)
mtcars_split <- initial_split(mtcars)
mtcars_train <- training(mtcars_split)
mtcars_test  <- testing(mtcars_split)

set.seed(1)
folds <- vfold_cv(mtcars_train, v = 5)

mtcars_rec <- 
  recipe(mpg ~ ., 
         data = mtcars_train)

metric <- metric_set(rmse)

ctrl_grid <- control_stack_grid()
ctrl_res <- control_stack_resamples()
```

Fit a linear model and a support vector machine model (with hyperparameters to tune).


```{r}
# LINEAR REG
lin_reg_spec <-
  linear_reg() |>
  set_engine("lm")

# extend the recipe
lin_reg_rec <-
  mtcars_rec |>
  step_dummy(all_nominal()) 

# add both to a workflow
lin_reg_wflow <- 
  workflow() |>
  add_model(lin_reg_spec) |>
  add_recipe(lin_reg_rec)

# fit to the 5-fold cv
set.seed(2020)
lin_reg_res <- 
  fit_resamples(
    lin_reg_wflow,
    resamples = folds,
    metrics = metric,
    control = ctrl_res
  )

# SVM
svm_spec <- 
  svm_rbf(
    cost = tune("cost"), 
    rbf_sigma = tune("sigma")
  ) |>
  set_engine("kernlab") |>
  set_mode("regression")

# extend the recipe
svm_rec <-
  mtcars_rec |>
  step_dummy(all_nominal()) |>
  step_impute_mean(all_numeric(), skip = TRUE) |>
  step_corr(all_predictors(), skip = TRUE) |>
  step_normalize(all_numeric(), skip = TRUE)

# add both to a workflow
svm_wflow <- 
  workflow() |> 
  add_model(svm_spec) |>
  add_recipe(svm_rec)

# tune cost and sigma and fit to the 5-fold cv
set.seed(2020)
svm_res <- 
  tune_grid(
    svm_wflow, 
    resamples = folds, 
    grid = 6,
    metrics = metric,
    control = ctrl_grid
  )

```

Use stacks to get the ensemble:

```{r}
mtcars_model_st <- 
  stacks() |>
  add_candidates(lin_reg_res) |>
  add_candidates(svm_res) |>
  blend_predictions() |>
  fit_members()
```


Predict with test data:



```{r}
member_preds <- 
  mtcars_test |>
  select(mpg) |>
  bind_cols(predict(mtcars_model_st, mtcars_test, members = TRUE))
```

Evaluate RMSE from each model (Stacking decreases RMSE):

```{r}
map(member_preds, rmse_vec, truth = member_preds$mpg) 
```


SVM does not make useful predictions here. We can see this from the RMSE and more clearly from the plots:

```{r}
p1 <- plot_ensemble(truth = member_preds$mpg, tibble_pred = member_preds |> select(-mpg))
p1 + geom_abline()

plot_ensemble(truth = member_preds$mpg, tibble_pred = member_preds |> select(-mpg), facet = TRUE)
```

