## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.dim = c(6, 4)
)

## ----setup--------------------------------------------------------------------
library(EZbakR)

# Going to show one tidyr trick to help with cB filtering
library(tidyr)

# Going to use a bit of dplyr magic in my demonstrations
library(dplyr)

## -----------------------------------------------------------------------------
simdata <- EZSimulate(nfeatures = 300, nreps = 2)

ezbdo <- EZbakRData(simdata$cB, simdata$metadf)

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo,
                           features = "feature")

# NOTE:
# For real data from fastq2EZbakR, you will often
# want to set `features = "XF"`, specifying analysis
# of reads mapping to exonic regions of genes.


## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo, mutrate_populations = "TC")

## ----echo = T, results = 'hide'-----------------------------------------------
# Observe contents of cB
standard_fraction_design


## ----echo = FALSE, warning = FALSE--------------------------------------------

knitr::kable(standard_fraction_design, "pipe")


## ----results = "hide"---------------------------------------------------------
# Observe contents of cB
tilac_fraction_design


## ----echo = FALSE, warning = FALSE--------------------------------------------

knitr::kable(tilac_fraction_design, "pipe")


## ----results = "hide"---------------------------------------------------------
# Three populations for fun:
fd_table <- create_fraction_design(c("TC", "GA", "CG"))
fd_table

## ----echo = FALSE, warning = FALSE--------------------------------------------

knitr::kable(fd_table, "pipe")


## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo, features = 'feature')

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo, remove_features = c("__no_feature", "feature1"))

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo, filter_condition = `|`)

## ----results = 'hide'---------------------------------------------------------
example_df <- data.frame(feature = c('A', NA, 'C'), 
                         other_feature = c(NA, 'Y', 'Z'))

replaced_df <- replace_na(example_df,
                          list(feature = '__no_feature',
                               other_feature = 'NA'))
replaced_df


## ----echo = FALSE, warning = FALSE--------------------------------------------
example_df <- data.frame(feature = c('A', NA, 'C'), 
                         other_feature = c(NA, 'Y', 'Z'))

replaced_df <- replace_na(example_df,
                          list(feature = '__no_feature',
                               other_feature = 'NA'))

knitr::kable(replaced_df, "pipe")


## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo,
                           split_multi_features = TRUE,
                           multi_feature_cols = "feature")

## -----------------------------------------------------------------------------
ezbdo <- CorrectDropout(ezbdo, 
                        grouping_factors = "treatment")

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo,
                           pold_from_nolabel = TRUE)

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo,
                           pold_from_nolabel = TRUE,
                           grouping_factors = 'treatment')

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo, 
                           strategy = 'hierarchical')


## -----------------------------------------------------------------------------
est <- EZget(ezbdo, type = 'fractions')
truth <- simdata$PerRepTruth

compare <- dplyr::inner_join(est, truth, by = c('sample', 'feature'))

plot(compare$true_fraction_highTC,
     compare$fraction_highTC)
abline(0,1)

## -----------------------------------------------------------------------------
# Simulates a single sample worth of data
simdata_iso <- SimulateIsoforms(nfeatures = 300)

# We have to manually create the metadf in this case
metadf <- data.frame(sample = 'sampleA', 
                     tl = 4, 
                     condition = 'A')

ezbdo <- EZbakRData(simdata_iso$cB,
                    metadf)

## -----------------------------------------------------------------------------
ezbdo <- EstimateFractions(ezbdo)

## -----------------------------------------------------------------------------
### Hack in the true, simulated isoform levels
reads <- simdata_iso$ground_truth %>%
  dplyr::select(transcript_id, true_count, true_TPM) %>%
  dplyr::mutate(sample = 'sampleA',
                effective_length = 10000) %>%
  dplyr::rename(expected_count = true_count,
                TPM = true_TPM)

# Name of table needs to have "isoform_quant" in it
ezbdo[['readcounts']][['simulated_isoform_quant']] <- reads

### Perform deconvolution
ezbdo <- EstimateIsoformFractions(ezbdo)


## ----fig.align='center'-------------------------------------------------------
est <- EZget(ezbdo, 
             type = 'fractions',
             features = "transcript_id")
truth <- simdata_iso$ground_truth

compare <- truth %>%
  dplyr::inner_join(est, by = c("feature", "transcript_id"))

plot(compare$true_fn,
     compare$fraction_highTC)
abline(0,1)

## ----eval=FALSE---------------------------------------------------------------
# library(arrow)
# 
# ### Move into the directory with your cB file
# setwd("Path/to/cB/containing/directory")
# 
# 
# ### This will not load the cB into memory
# # You can run `read_csv("cB.csv.gz", n_max = 1)` to check to see what
# # the order of the columns are, as this order needs to match your provided
# # schema.
# ds <- open_dataset("cB.csv.gz",
#                    format = "csv",
#                    schema = schema(
#                      sample = string(),
#                      rname = string(),
#                      GF = string(),
#                      XF = string(),
#                      exon_bin = string(),
#                      bamfile_transcripts = string(),
#                      junction_start = string(),
#                      junction_end = string(),
#                      TC = int64(),
#                      nT = int32(),
#                      sj = bool(),
#                      n = int64()
#                    ),
#                    skip_rows=1)
# 
# 
# ### Create Arrow dataset
# setwd("Path/to/where/you/want/to/write/Arrow/Dataset")
# ds %>%
#   group_by(sample) %>%
#   write_dataset("fulldataset/",
#                 format = "parquet")

## ----eval = FALSE-------------------------------------------------------------
# ds <- arrow::open_dataset("Path/to/where/you/want/to/Arrow/Dataset/")
# 
# 
# metadf <- tibble(sample = c("WT_1", "WT_2", "WT_ctl",
#                             "KO_1", "KO_2", "KO_ctl"),
#                  tl = c(2, 2, 0,
#                         2, 2, 0),
#                  genotype = rep(c('WT', 'KO'), each = 3))
# 
# 
# ezbado <- EZbakRArrowData(ds, metadf)
# 

## ----eval = FALSE-------------------------------------------------------------
# ezbado <- EstimateFractions(ezbado,
#                             features = c("GF", "XF",
#                                          "junction_start", "junction_end"),
#                             filter_cols = c("XF", "junction_start",
#                                             "junction_end"),
#                             filter_condition = `|`,
#                             split_multi_features = TRUE,
#                             multi_feature_cols = c("junction_start",
#                                                    "junction_end"))
# 
# 

## -----------------------------------------------------------------------------
library(arrow)

simdata <- EZSimulate(nfeatures = 250)

outdir <- tempdir()
dataset_dir <- file.path(outdir, "arrow_dataset")

write_dataset(
  simdata$cB,
  path = dataset_dir,
  format = "parquet",
  partitioning = "sample"
)

ds <- open_dataset(dataset_dir)

ezbdo <- EZbakRArrowData(ds,
                    simdata$metadf)


ezbdo <- EstimateFractions(ezbdo)