## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
library(CEOdata)
library(knitr)
library(tibble)
library(dplyr)
library(haven)

# Locate an example file inside the "extdata" directory of the installed
# CEOdata package. `mustWork = TRUE` makes system.file() raise an error
# when no matching file is found.
example_path <- function(filename) {
  located <- system.file(
    "extdata",
    filename,
    package = "CEOdata",
    mustWork = TRUE
  )
  located
}

# Read the bundled BOP example (SPSS .sav) and pass every column of class
# "haven_labelled" through haven::as_factor(); other columns are untouched.
d <- example_path("BOP_presencial_example.sav") |>
  haven::read_sav() |>
  tibble::as_tibble() |>
  dplyr::mutate(
    dplyr::across(
      where(\(column) inherits(column, "haven_labelled")),
      \(column) haven::as_factor(column)
    )
  )
# TRUE only when both columns needed by the chunks gated on this flag exist.
d_available <- all(is.element(c("SEXE", "BOP_NUM"), names(d)))
# Example metadata object stored as an RDS file in the package.
meta <- readRDS(example_path("REO_meta_example.rds"))
# Flags recording whether `meta` carries the columns the chunks gated on
# them rely on.
meta_tags_available <- all(is.element(c("Descriptors", "REO"), names(meta)))
meta_fieldwork_available <- all(is.element(
  c(
    "Dia inici treball de camp",
    "Dia final treball de camp",
    "REO",
    "microdata_available"
  ),
  names(meta)
))

## ----message = FALSE, echo = TRUE, eval = FALSE-------------------------------
# library(CEOdata)
# d <- haven::read_sav("../data/BOP_presencial_example.sav")

## ----message = FALSE, warning = FALSE-----------------------------------------
library(dplyr)
library(tidyr)
library(ggplot2)

## ----eval = d_available-------------------------------------------------------
# Number of rows for each value of SEXE.
count(d, SEXE)

## ----prop-females, eval = d_available, fig.width = 8, fig.height = 4, fig.cap = 'Proportion of females in the different Barometers.'----
# Proportion of female respondents per barometer (BOP_NUM), drawn as points
# joined by a line. Both labels used in this file for the female category
# ("Femení" and "Dona") are accepted, so the series is not silently zero
# when the data carry the other label.
d |>
  group_by(BOP_NUM) |>
  # mean() of a logical vector is the share of TRUE values. `%in%` never
  # returns NA, so rows with missing SEXE stay in the denominator.
  summarize(propFemales = mean(SEXE %in% c("Femení", "Dona"))) |>
  # group = 1 joins all barometers into a single line.
  ggplot(aes(x = BOP_NUM, y = propFemales, group = 1)) +
  geom_point() +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  # Force the y axis to span the full 0-1 range of a proportion.
  expand_limits(y = c(0, 1))

## ----tags, eval = meta_tags_available, fig.width = 6, fig.height = 6, fig.cap = 'Prevalence of topics covered.'----
# One row per (survey, descriptor): split the ";"-separated Descriptors
# field, trim surrounding whitespace, and keep the tag as a factor.
tags <- meta |>
  separate_rows(Descriptors, sep = ";") |>
  mutate(tag = Descriptors |> stringr::str_trim() |> factor()) |>
  select(REO, tag)

# Dot plot of tags occurring more than five times, ordered by frequency.
tags |>
  count(tag) |>
  filter(n > 5) |>
  ggplot(aes(x = n, y = reorder(tag, n))) +
  geom_point() +
  labs(y = "Topic")

## ----fieldwork, eval = meta_fieldwork_available, fig.width = 8, fig.height = 10, fig.cap = 'Fieldwork periods.'----
# One horizontal range per survey, from fieldwork start to fieldwork end,
# for surveys whose fieldwork started after 2018-01-01. Surveys are ordered
# by their end date and coloured by `microdata_available`.
meta |>
  filter(`Dia inici treball de camp` > "2018-01-01") |>
  ggplot(
    aes(
      xmin = `Dia inici treball de camp`,
      xmax = `Dia final treball de camp`,
      y = reorder(REO, `Dia final treball de camp`),
      colour = microdata_available
    )
  ) +
  geom_linerange() +
  labs(x = "Date", y = "Surveys with fieldwork") +
  # Hide the y-axis text and ticks (one entry per survey).
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

## ----eval = d_available-------------------------------------------------------
# Build a compact analysis dataset from `d`: recode a handful of variables,
# give them English names, and keep only those columns.
survey.data <- d |>
  mutate(
    # 1 = female, 0 = any other answer, NA when SEXE is missing. Both labels
    # used in this file for the female category are accepted.
    Female = case_when(
      is.na(SEXE) ~ NA_real_,
      SEXE %in% c("Dona", "Femení") ~ 1,
      TRUE ~ 0
    ),
    Age = EDAT,
    # Non-answers become NA. ifelse() drops attributes, so if INGRESSOS_1_15
    # is a factor the result holds its integer level codes, not the labels.
    # NOTE(review): confirm the level order matches the income brackets.
    Income = ifelse(
      INGRESSOS_1_15 %in% c("No ho sap", "No contesta"),
      NA,
      INGRESSOS_1_15
    ),
    Date = DATA_FIN,
    # Collapse place of birth into two categories. Missing values and
    # non-answers are kept as NA instead of falling into the catch-all
    # "Outside Catalonia" branch.
    `Place of birth` = factor(case_when(
      is.na(LLOC_NAIX) |
        LLOC_NAIX %in% c("No ho sap", "No contesta") ~ NA_character_,
      LLOC_NAIX == "Catalunya" ~ "Catalonia",
      TRUE ~ "Outside Catalonia"
    )),
    # Ordinal 0-3 integer scale; any other answer becomes NA.
    `Interest in politics` = case_when(
      INTERES_POL == "Gens" ~ 0L,
      INTERES_POL == "Poc" ~ 1L,
      INTERES_POL == "Bastant" ~ 2L,
      INTERES_POL == "Molt" ~ 3L,
      TRUE ~ NA_integer_
    ),
    # Numeric (double) with non-answers set to NA.
    # NOTE(review): if SATIS_DEMOCRACIA is a factor, as.numeric() returns
    # its level codes - confirm they match the intended scale.
    `Satisfaction with democracy` = ifelse(
      SATIS_DEMOCRACIA %in% c("No ho sap", "No contesta"),
      NA,
      as.numeric(SATIS_DEMOCRACIA)
    )
  ) |>
  # Center income on its median.
  mutate(Income = Income - median(Income, na.rm = TRUE)) |>
  # Keep only the variables used in the summaries.
  select(
    Date, Female, Age, Income,
    `Place of birth`, `Interest in politics`,
    `Satisfaction with democracy`
  )



## ----eval = FALSE-------------------------------------------------------------
# save(survey.data, file = "my_cleaned_dataset.RData")

## ----eval = FALSE, echo = TRUE------------------------------------------------
# library(vtable)
# st(survey.data)

## ----eval = exists("survey.data"), echo = FALSE-------------------------------
# Summary-statistics table of the cleaned data, rendered through kable.
survey.data |>
  vtable::st(out = "kable")

## ----eval = FALSE, echo = TRUE------------------------------------------------
# library(compareGroups)
# createTable(compareGroups(Female ~ . -Date, data = survey.data))

## ----eval = exists("survey.data"), echo = FALSE-------------------------------
library(compareGroups)
# Descriptive table of every variable except Date, split by Female.
compareGroups(Female ~ . - Date, data = survey.data) |>
  createTable()