\documentclass[article,nojss]{jss}
\pdfcompresslevel=9
\pdfobjcompresslevel=3
\usepackage{amsmath,amssymb,enumerate,subcaption,thumbpdf,lmodern}
\graphicspath{{Figures/}}

\author{Adam Kapelner\\Queens College,\\City University of New York \And 
        Justin Bleich\\The Wharton School of the\\University of Pennsylvania}
\Plainauthor{Adam Kapelner, Justin Bleich}

\title{\pkg{bartMachine}: Machine Learning with Bayesian Additive Regression Trees}
\Plaintitle{bartMachine: Machine Learning with Bayesian Additive Regression Trees}

\Abstract{
  We present a new package in \proglang{R} implementing
  Bayesian additive regression trees (BART). The package introduces
  many new features for data analysis using BART such as variable
  selection, interaction detection, model diagnostic plots,
  incorporation of missing data and the ability to save trees for
  future prediction. It is significantly faster than the current
  \proglang{R} implementation, parallelized, and capable of handling
  both large sample sizes and high-dimensional data.
}

\Keywords{Bayesian, machine learning, statistical learning, non-parametric, \proglang{R}, \proglang{Java}}
\Plainkeywords{Bayesian, machine learning, statistical learning, non-parametric, R, Java,}


\Address{
  Adam Kapelner\\
  Department of Mathematics \\
  Queens College,  City University of New York \\
  65-30 Kissena Blvd Room 604 \\
  Flushing, NY, 11367 \\
  E-mail: \email{kapelner@qc.cuny.edu}\\
  URL: \url{https://kapelner.com/}
}

\newcommand{\bv}[1]{\boldsymbol{#1}}
\newcommand{\qu}[1]{``#1''}
\renewcommand{\min}[1]{\text{min}\braces{#1}}
\renewcommand{\max}[1]{\text{max}\braces{#1}}
\newcommand{\treet}[1]{\mathcal{T}_{#1}}
\newcommand{\treeleaft}[1]{\treet{#1}^{\mathcal{M}}}
\newcommand{\leaf}{\mathcal{M}}
\newcommand{\beqn}{\begin{eqnarray*}}
\newcommand{\eeqn}{\end{eqnarray*}}
\newcommand{\bneqn}{\begin{eqnarray}}
\newcommand{\eneqn}{\end{eqnarray}}
\newcommand{\parens}[1]{\left(#1\right)}
\newcommand{\braces}[1]{\left\{#1\right\}}
\newcommand{\bracks}[1]{\left[#1\right]}
\newcommand{\squared}[1]{\parens{#1}^2}
\newcommand{\tothepow}[2]{\parens{#1}^{#2}}
\newcommand{\errorrv}{\mathcal{E}}
\newcommand{\berrorrv}{\bv{\errorrv}}
\newcommand{\normnot}[2]{\mathcal{N}\parens{#1,\,#2}}
\newcommand{\multnormnot}[3]{\mathcal{N}_{#1}\parens{#2,\,#3}}
\newcommand{\invgammanot}[2]{\text{InvGamma}\parens{#1,\,#2}}
\newcommand{\uniform}[2]{\mathrm{U}\parens{#1,\,#2}}
\newcommand{\zerovec}{\bv{0}}
\newcommand{\0}{\zerovec}
\newcommand{\sigsq}{\sigma^2}
\newcommand{\I}{\bv{I}}
\newcommand{\X}{\bv{X}}
\newcommand{\R}{\bv{R}}
\newcommand{\Z}{\bv{Z}}
\newcommand{\Y}{\bv{Y}}
\newcommand{\M}{\bv{M}}
\newcommand{\x}{\bv{x}}
\newcommand{\y}{\bv{y}}
\newcommand{\prob}[1]{\mathbb{P}\parens{#1}}
\newcommand{\cprob}[2]{\prob{#1~|~#2}}
\newcommand{\expe}[1]{\mathbb{E}\bracks{#1}}
\newcommand{\expesub}[2]{\mathbb{E}_{#1}\bracks{#2}}
\newcommand{\cexpe}[2]{\expe{#1 ~ | ~ #2}}
\newcommand{\iid}{~{\buildrel \text{iid} \over \sim}~}
\newcommand{\mathand}{~~\text{and}~~}
\newcommand{\yhat}{\hat{y}}
\newcommand{\oneover}[1]{\frac{1}{#1}}
\newcommand{\Hint}{H_{\text{internals}}}
\newcommand{\Hterminals}{H_{\text{terminals}}}
\newcommand{\Hleavesbelow}{H_{\text{leaves}\cdot\text{below}}}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\padjeta}{p_{\text{adj}}(\eta)}
\newcommand{\padjetastar}{p_{\text{adj}}(\eta^*)}
\newcommand{\nadjeta}{n_{j\cdot\text{adj}}(\eta)}
\newcommand{\nadjetastar}{n_{j^*\cdot\text{adj}}(\eta^*)}
\newcommand{\nadjcheta}{n_{j\cdot\text{adj}\cdot\text{ch}}(\eta)}
\newcommand{\nadjchetastar}{n_{j\cdot\text{adj}\cdot\text{ch}}(\eta^*)}
\newcommand{\probsplit}[1]{\mathbb{P}_{\text{SPLIT}}\parens{#1}}
\newcommand{\probrule}[1]{\mathbb{P}_{\text{RULE}}\parens{#1}}
\newcommand{\Roneton}{R_1, \ldots, R_n}
\newcommand{\Rlonetonl}{R_{\ell_1}, \ldots, R_{\ell_{n_\ell}}}
\newcommand{\RLlonetonlL}{R_{\ell_{L,1}}, \ldots, R_{\ell_{L, n_{\ell,L}}}}
\newcommand{\RRlonetonlR}{R_{\ell_{R,1}}, \ldots, R_{R,\ell_{n_{\ell,R}}}}
\newcommand{\Rbar}{\bar{R}}
\newcommand{\sigsqmu}{\sigsq_\mu}
\newcommand{\doneover}[1]{\dfrac{1}{#1}}
\newcommand{\Rones}{R_{1,1}, \ldots, R_{1, \none}}
\newcommand{\Rtwos}{R_{2,1}, \ldots, R_{2, \ntwo}}
\newcommand{\Ronestars}{R_{1^*,1}, \ldots, R_{1^*, \nonestar}}
\newcommand{\Rtwostars}{R_{2^*,1}, \ldots, R_{2^*, \ntwostar}}
\newcommand{\none}{n_{1}}
\newcommand{\ntwo}{n_{2}}
\newcommand{\nonestar}{n_{1^*}}
\newcommand{\ntwostar}{n_{2^*}}
\newcommand{\etastar}{\eta_*}
\newcommand{\etaone}{\eta_1}
\newcommand{\etatwo}{\eta_2}
\newcommand{\etaonestar}{\eta_{1^*}}
\newcommand{\etatwostar}{\eta_{2^*}}
\renewcommand{\exp}[1]{\text{exp}\parens{#1}}

%\VignetteIndexEntry{bartMachine}

\begin{document}

\section{Introduction}

Ensemble-of-trees methods have become popular choices for forecasting in both regression and classification problems. Algorithms such as random forests \citep{Breiman2001} and stochastic gradient boosting \citep{Friedman2002} are two well-established and widely employed procedures. Recent advances in ensemble methods include dynamic trees \citep{Taddy2011} and Bayesian additive regression trees \citep[BART;][]{Chipman2010}, which depart from predecessors in that they rely on an underlying Bayesian probability model rather than a pure algorithm. BART has demonstrated substantial promise in a wide variety of simulations and real world applications such as predicting avalanches on mountain roads \citep{Blattenberger2014}, predicting how transcription factors interact with DNA \citep{Zhou2008} and predicting movie box office revenues \citep{Eliashberg2010}. This paper introduces \pkg{bartMachine} \citep{bartMachine}, a new \proglang{R} \citep{Rlang} package available from the Comprehensive \proglang{R} Archive Network (CRAN) at \url{https://CRAN.R-project.org/package=bartMachine} that significantly expands the capabilities of using BART for data analysis. 

Currently, there exists one other implementation of BART on CRAN: \pkg{BayesTree} \citep{BayesTree}, the package developed by the algorithm's original authors. One of the major drawbacks of this implementation is its lack of a \code{predict} function. Test data must be provided as an argument during the training phase of the model. Hence it is impossible to generate forecasts on future data without re-fitting the entire model. Since the run time is not trivial, forecasting becomes an arduous exercise. A significantly faster implementation of BART that contains master-slave parallelization was introduced in \citet{Pratola2013}, but this is only available as standalone \proglang{C++} source code and not integrated with \proglang{R}. Additionally, a recent package \pkg{dbarts} \citep{dbarts} allows updating of BART with new predictors and response values to incorporate BART into a larger Bayesian model. \pkg{dbarts} relies on \pkg{BayesTree} as its BART engine.

The goal of \pkg{bartMachine} is to provide a fast, easy-to-use, visualization-rich machine learning package for \proglang{R} users. Our implementation of BART is in \proglang{Java} and is integrated into \proglang{R} via \pkg{rJava} \citep{rJava}. From a run time perspective, our algorithm is significantly faster and is parallelized, allowing computation on as many cores as desired. Not only is the model construction itself parallelized, but the additional features such as prediction, variable selection, and many others can be divided across cores as well.  

Additionally, we include a variety of expanded and new features. We implement the ability to save trees in memory and provide convenience functions for prediction on test data as well as the ability to save models across \proglang{R} sessions. We also include plotting functions for both posterior credible and predictive intervals and plots for visually inspecting the convergence of BART's MCMC chain. We expand variable importance exploration to include permutation tests and interaction detection. We implement recently developed features for BART including a formal approach to variable selection and the ability to incorporate prior information for covariates \citep{Bleich2014}. We also implement the strategy found in \citet{Kapelner2013} to incorporate missing data during training and handle missingness during prediction.  Table~\ref{tab:bartcomp} emphasizes the differences in features between \pkg{bartMachine} and \pkg{BayesTree}, the two existing \proglang{R} implementations of BART.

\begin{table}[t!]
\centering
\begin{tabular}{lll}
  \hline
Feature & \pkg{bartMachine} & \pkg{BayesTree} \\ 
  \hline
Implementation language &  \proglang{Java} &  \proglang{C++} \\
External predict function & Yes & No \\
Model persistence across sessions & Yes & No \\ 
Parallelization & Yes & No \\ 
Native missing data mechanism & Yes & No \\
Built-in cross-validation & Yes & No \\
Variable importance & Statistical tests & Exploratory \\
Tree proposal types & 3 types & 4 types \\
Partial dependence plots & Yes & Yes \\
Convergence plots & Assess trees and $\sigsq$ & Assess $\sigsq$ \\
Model diagnostics & Yes & No \\
Incorporation into larger model & No & Through \pkg{dbarts} \\ 
\hline
\end{tabular}
\caption{Comparison of features between \pkg{bartMachine} and \pkg{BayesTree}.}
\label{tab:bartcomp}
\end{table}

In Section~\ref{sec:background}, we provide an overview of BART with a special emphasis on the features that have been extended. In Section~\ref{sec:package} we provide a general introduction to the package, highlighting the novel features. Section~\ref{sec:regression_features} provides step-by-step examples of the regression capabilities and Section~\ref{sec:classification_features} introduces additional step-by-step examples of features unique to classification problems. We conclude in Section \ref{sec:discussion}. Appendix~\ref{app:implementation} discusses the details of our algorithm implementation and how it differs from \pkg{BayesTree}. Appendix~\ref{app:bakeoff} offers predictive performance comparisons.

\section{Overview of BART}\label{sec:background}

BART is a Bayesian approach to nonparametric function estimation using regression trees. Regression trees rely on recursive binary partitioning of predictor space into a set of hyperrectangles in order to approximate some unknown function $f$. The predictor space has dimension equal to the number of variables, which we denote $p$. Tree-based regression models have an ability to flexibly fit interactions and nonlinearities. Models composed of sums of regression trees have an even greater ability than single trees to capture interactions and non-linearities as well as additive effects in $f$. 

BART can be considered a sum-of-trees ensemble, with a novel estimation approach relying on a fully Bayesian probability model.  Specifically, the BART model can be expressed as:
\bneqn\label{eq:bart}
\Y = f(\X) + \berrorrv \approx \treeleaft{1}(\X) + \treeleaft{2}(\X) + \ldots + \treeleaft{m}(\X) + \berrorrv, \quad\quad \berrorrv \sim \multnormnot{n}{\zerovec}{\sigsq\I_n},
\eneqn
where $\Y$ is the $n \times 1$ vector of responses, $\X$ is the $n \times p$ design matrix (the predictors column-joined) and $\berrorrv$ is the $n \times 1$ vector of noise. Here we have $m$ distinct regression trees, each composed of a tree structure, denoted by $\treet{}$, and the parameters at the terminal nodes (also called leaves), denoted by $\leaf$. The two together, denoted as $\treeleaft{}$ represents an entire tree with both its structure and set of leaf parameters.

The structure of a given tree $\treet{t}$ includes information on how any observation recurses down the tree. For each nonterminal (internal) node of the tree, there is a \qu{splitting rule} taking the form $\x_j < c$, which consists of the \qu{splitting variable} $\x_j$ and the \qu{splitting value} $c$. An observation moves to the left child node if the condition set by the splitting rule is satisfied (and it moves to the right child node otherwise). The process continues until a terminal node is reached. Then, the observation receives the leaf value of the terminal node. We denote the set of the tree's leaf parameters as  $\leaf_t = \{\mu_{t,1}, \mu_{t,2}, \ldots, \mu_{t_{b_t}}\}$ where $b_t$ is the number of terminal nodes for a given tree. The observation's predicted value is then the sum of the $m$ leaf values arrived at by recursing down all $m$ trees. 

BART can be distinguished from other ensemble-of-trees models due to its underlying probability model. As a Bayesian model, BART consists of a set of priors for the structure and the leaf parameters and a likelihood for data in the terminal nodes. The aim of the priors is to provide regularization, preventing any single regression tree from dominating the total fit. 

We provide an overview of the BART priors and likelihood and then discuss how draws from the posterior distribution are made. A more complete exposition can be found in \citet{Chipman2010}. 

\subsection{Priors and likelihood}\label{subsec:prior_likelihood}

The prior for the BART model has three components: (1) the tree structure itself, (2) the leaf parameters given the tree structure, and (3) the error variance $\sigsq$ which is independent of the tree structure and leaf parameters
\beqn 
\prob{\treeleaft{1},\ldots,\treeleaft{m}, \sigsq} &=& \bracks{\prod_{t}\prob{\treeleaft{t}}}\prob{\sigsq} 
   = \bracks{\prod_{t}\cprob{\leaf_t}{\treet{t}}\prob{\treet{t}}}\prob{\sigsq}  \\
&=& \bracks{\prod_{t}\prod_{\ell}\cprob{ \mu_{t,\ell}}{\treet{t}}\prob{\treet{t}}}\prob{\sigsq},
\eeqn
\noindent where the last equality follows from an additional assumption of conditional independence of the leaf parameters given the tree's structure. 

We first describe $\prob{\treet{t}}$, the component of the prior which affects the locations of nodes within the tree. Node depth is defined as distance from the root. Thus, the root itself has depth 0, its first child node has depth 1, etc. Nodes at depth $d$ are nonterminal with prior probability $\alpha(1+d)^{-\beta}$ where $\alpha \in (0,1)$ and $\beta \in [0, \infty]$. This component of the tree structure prior has the ability to enforce shallow tree structures, thereby limiting complexity of any single tree and resulting in more model regularization. Default values for these hyperparameters of $\alpha = 0.95$ and $\beta = 2$ are recommended by \citet{Chipman2010}. 

For nonterminal nodes, splitting rules occur in two parts. First, the predictor is randomly selected to serve as the splitting variable. In the original formulation, each available predictor is equally likely to be chosen from a discrete uniform distribution with probability that each variable is selected with probability $1/p$. This is relaxed in our implementation to allow for a generalized Bernoulli distribution where the user specifies $p_1, p_2, \ldots, p_p$ (such that $\sum_{j=1}^p p_j = 1$), where each denotes the probability of the $j$th variable being selected a priori. See Section~\ref{subsec:cov_prior} on the covariate prior feature for further details. Additionally, note that \qu{structural zeroes,} variables that do not have any valid split values, are assigned probability zero in the implementation (see Appendix \ref{subapp:grow_step} for details). Once the splitting variable is chosen, the splitting value is chosen from the multiset (the non-unique set) of available values at the node via the discrete uniform distribution. 

We now describe the prior component $\cprob{\leaf_t}{\treet{t}}$ which controls the leaf parameters. Given a tree with a set of terminal nodes, each terminal node (or leaf) has a continuous parameter (the leaf parameter) representing the \qu{best guess} of the response in this partition of predictor space. This parameter is the fitted value assigned to any observation that lands in that node. The prior on each of the leaf parameters is given as: $\mu_\ell \iid \normnot{\mu_\mu / m}{\sigsq_\mu}$. The expectation, $\mu_\mu$, is picked to be the range center, $(y_{\text{min}} + y_{\text{max}}) / 2$. The range center can be affected by outliers. If this is a concern, the user can log-transform or windsorize the response before model construction. 

The variance hyperparameter $\sigsq_\mu$ is empirically chosen so that the range center plus or minus $k = 2$ variances cover 95\% of the provided response values in the training set (where $k=2$ corresponding to 95\% coverage is only by default and can be customized). Thus, since there are $m$ trees, we then choose $\sigma_\mu$ such that $m \mu_\mu - k\sqrt{m} \sigma_\mu = y_{\text{min}}$ and  $m \mu_\mu + k\sqrt{m} \sigma_\mu = y_{\text{max}}$. The aim of this prior is to provide model regularization by shrinking the leaf parameters towards the center of the distribution of the response. The larger the value of $k$, the smaller the value of $\sigsq_\mu$, resulting in more model regularization.

The final prior is on the error variance and is chosen to be $\sigsq \sim \invgammanot{\nu / 2}{\nu\lambda / 2}$. $\lambda$ is determined from the data so that there is a $q = 90\%$ a priori chance (by default) that the BART model will improve upon the RMSE from an ordinary least squares regression. Therefore, the majority of the prior probability mass lies below the RMSE from least squares regression. Additionally, this prior limits the probability mass placed on small values of $\sigsq$ to prevent overfitting. Thus, the higher the value of $q$, the larger the values of the sampled $\sigsq$'s, resulting in more model regularization.

Note that the adjustable hyperparameters are $\alpha$, $\beta$, $k$, $\nu$ and $q$. Additionally, $m$, the number of trees, must be chosen. Default values generally provide good performance, but optimal tuning can be achieved automatically via cross-validation, a feature implemented and described in Section~\ref{subsec:model_building}.

Along with a set of priors, BART specifies the likelihood of responses in the terminal nodes. They are assumed a priori normal with the mean being the \qu{best guess} in the leaf at the moment (i.e., in the current MCMC iteration) and variance being the best guess of the variance at the moment, $y_\ell \sim \normnot{\mu_\ell}{\sigsq}$.

\subsection{Posterior distribution and prediction} \label{subsec:posterior}

A Metropolis-within-Gibbs sampler \citep{Geman1984, Hastings1970} is employed to generate draws from the posterior distribution of $\mathbb{P}(\treeleaft{1}, \ldots, \treeleaft{m}, \sigsq ~|~\y)$. A key feature of this sampler for BART is to employ a form of ``Bayesian backfitting'' \citep{Hastie2000} where the $j$th tree is fit iteratively, holding all other $m-1$ trees constant by exposing only the residual response that remains unfitted: 
%
\bneqn\label{eq:response_unfitted}
\R_{-j} := \y - \sum_{t \neq j} \treeleaft{t}(\X).
\eneqn
%
The sampler,
%
\bneqn\label{eq:gibbs_sampler}
1: && \treet{1} ~|~ \R_{-1}, \sigsq \\ \nonumber
2: && \leaf_1 ~|~ \treet{1}, \R_{-1}, \sigsq \\ \nonumber
3: && \treet{2} ~|~ \R_{-2}, \sigsq \\ \nonumber
4: && \leaf_2 ~|~ \treet{2}, \R_{-2}, \sigsq \\  \nonumber
\vdots &&  \\\nonumber
2m -1: && \treet{m} ~|~ \R_{-m}, \sigsq \\\nonumber
2m: && \leaf_m ~|~ \treet{m}, \R_{-m}, \sigsq \\ \nonumber
2m + 1: && ~\,\sigsq ~|~ \treet{1}, \leaf_1, \ldots, \treet{m}, \leaf_m, \berrorrv, \nonumber
\eneqn
%
\noindent proceeds by first proposing a change to the first tree's structure $\treet{}$ which is accepted or rejected via a Metropolis-Hastings step. Note that sampling from the posterior of the tree structure does not depend on the leaf parameters, as they can be analytically integrated out of the computation (see Appendix ~\ref{subapp:grow_step}). Given the tree structure, samples from the posterior of the $b$ leaf parameters $\leaf_1 := \braces{\mu_1, \ldots, \mu_b}$  are then drawn. This procedure progresses iteratively for each tree, using the updated set of partial residuals $\R_{-j}$. Finally, conditional on the updated set of tree structures and leaf parameters, a draw from the posterior of $\sigsq$ is made based on the full model residuals $\berrorrv := \y - \sum_{t = 1}^m \treeleaft{t}(\X)$.

Within a given terminal node, since both the prior and likelihood are normally distributed, the posterior of each of the leaf parameters in $\leaf$ is conjugate normal with its mean being a weighted combination of the likelihood and prior parameters (lines $2,~4, \ldots, 2m$ in Equation set~\ref{eq:gibbs_sampler}). Due to the normal-inverse-gamma conjugacy, the posterior of $\sigsq$ is inverse gamma as well (line $2m+1$ in Equation set~\ref{eq:gibbs_sampler}). The complete expressions for these posteriors can be found in \citet{Gelman2004}. 

Lines $1,~3,\ldots,~2m-1$ in Equation set~\ref{eq:gibbs_sampler} rely on Metropolis-Hastings draws from the posterior of the tree distributions. These involve introducing small perturbations to the tree structure: growing a terminal node by adding two child nodes, pruning two child nodes (rendering their parent node terminal), or changing a split rule. We denote the three possible tree alterations as: GROW, PRUNE, and CHANGE.\footnote{In the original formulation, \citet{Chipman2010} include an additional alteration called SWAP. Due to the complexity of the bookkeeping associated with this alteration, we do not implement it.} The mathematics associated with the Metropolis-Hastings step are tedious. Appendix~\ref{app:implementation} contains the explicit calculations. Once again, over many MCMC iterations, trees evolve to capture the fit left currently unexplained.

\citet{Pratola2013} argue that a CHANGE step is unnecessary for sufficient mixing of the Gibbs sampler. While we too observed this to be true for estimates of the posterior means, we found that omitting CHANGE can negatively impact the variable inclusion proportions (the feature introduced in Section~\ref{subsec:variable_importance}). As a result, we implement a modified CHANGE step where we only propose new splits for nodes that are singly internal. These are nodes where both children nodes are terminal nodes (details are given in Appendix~\ref{subapp:change_step}). After a singly internal node is selected we (1) select a new split attribute from the set of available predictors and (2) select a new split value from the multiset of available values (these two uniform splitting rules were explained in detail previously). We emphasize that the CHANGE step does not alter the tree structure.

All $2m+1$ steps represent a \textit{single} Gibbs iteration. We have observed that generally no more than 1,000 iterations are needed as ``burn-in'' (the default package setting is 250). See Section~\ref{subsec:assumption_checking} for the use of convergence diagnostics. An additional 1,000 iterations are usually sufficient to serve as draws from the posterior for $f(\x)$. A single predicted value $\hat{f}(\x)$ can be obtained by taking the average of the posterior values and a quantile estimate can be obtained by computing the appropriate quantile of the posterior values. Additional features of the posterior distribution will be discussed in Section~\ref{sec:regression_features}.

\subsection{BART for classification} \label{subsec:probit_bart}

BART can easily be modified to handle classification problems for categorical response variables. In \citet{Chipman2010}, only binary outcomes were explored but recent work has extended BART to the multiclass problem \citep{Kindo2013}. Our implementation handles binary classification and we plan to implement multiclass outcomes in a future release.

For the binary classification problem (coded with outcomes ``0'' and ``1''), we assume a probit model,
\beqn
\cprob{Y = 1}{\X} = \Phi\parens{\treeleaft{1}(\X) + \treeleaft{2}(\X) + \ldots + \treeleaft{m}(\X)},
\eeqn
\noindent where $\Phi$ denotes the cumulative density function of the standard normal distribution. In this formulation, the sum-of-trees model serves as an estimate of the conditional probit at $\x$ which can be easily transformed into a conditional probability estimate of $Y=1$. 

In the classification setting, the prior on $\sigsq$ is not needed as the model assumes $\sigsq=1$. The prior on the tree structure remains the same as in the regression setting and a few minor modifications are required for the prior on the leaf parameters. 

Sampling from the posterior distribution is again obtained via Gibbs sampling in combination with a Metropolis-Hastings step outlined in Section~\ref{subsec:posterior}. Following the data augmentation approach of \citet{Albert1993}, an additional vector of latent variables $\Z$ is introduced into the Gibbs sampler. Then, a new step is created in the Gibbs sampler where draws of $\Z\,|\,\y$ are obtained by conditioning on the sum-of-trees model:
%
\beqn
Z_i~|~y_i=1 &\sim& \max{N\parens{\sum_t \treeleaft{t}\parens{\X},1}, \,0} \mathand \\
Z_i~|~y_i = 0  &\sim& \min{N\parens{\sum_t \treeleaft{t}\parens{\X},1}, \,0}.
\eeqn
%
\noindent Next, $\Z$ is used as the response vector instead of $\y$ in all steps of  Equation~\ref{eq:gibbs_sampler}.

Upon obtaining a sufficient number of samples from the posterior, inferences can be made using the posterior distribution of conditional probabilities and classification can be undertaken by applying a threshold to the averages (or another summary) of these posterior probabilities. The relevant classification features of \pkg{bartMachine} are discussed in Section~\ref{sec:classification_features}.



\section[The bartMachine package]{The \pkg{bartMachine} package}\label{sec:package}

The package \pkg{bartMachine} provides a novel implementation of Bayesian additive regression trees in \proglang{R}. The algorithm is substantially faster than the current \proglang{R} package \pkg{BayesTree} and our implementation is parallelized at the MCMC iteration level during prediction.  Additionally, the interface with \pkg{rJava} allows for the entire posterior distribution of tree ensembles to persist throughout the \proglang{R} session, allowing for prediction and other calls to the trees without having to re-run the Gibbs sampler (a limitation in the current \pkg{BayesTree} implementation). The \pkg{bartMachine} object can be serialized, thereby persisting across \proglang{R} sessions as well (a feature discussed in Section~\ref{subsec:persistence}). Since our implementation is different from \pkg{BayesTree}, we provide a predictive accuracy bakeoff on different datasets in Appendix~\ref{app:bakeoff} which illustrates that the two exhibit similar performance.

\subsection{Speed improvements and parallelization}\label{subsec:speed_and_parallelization}

We make a number of significant speed improvements over the original implementation. 

First, \pkg{bartMachine} is fully parallelized (with the number of
cores customizable) during model creation, prediction, and many of the
other features. During model creation, we chose to parallelize by
creating one independent Gibbs chain per core. Thus, if we employ the
default 250 burn-in samples and 1,000 post-burn-in samples and four
cores, each core would sample 500 samples: 250 for a burn-in and 250
post-burn-in samples. The final model will aggregate the four post
burn-in chains for the four cores yielding the desired 1,000 total
post-burn-in samples. This has the drawback of effectively running the
burn-in serially (which suffers from Amdahl's Law), but has the added
benefit of reducing auto-correlation of the sum-of-trees samples in
the posterior samples since the chains are independent which may
provide greater predictive performance. Parallelization at the level
of likelihood calculations is left for a future release as we were
unable to address the costs of thread overhead. Parallelization for
prediction and other features scale linearly in the number of cores
without Amdahl's diminishing returns.

Additionally, we take advantage of a number of additional computational shortcuts:
%
\begin{enumerate}[1.]
\item Computing the unfitted responses for each tree
  (Equation~\ref{eq:response_unfitted}) can be accomplished by keeping
  a running vector and making entry-wise updates as the Gibbs sampler
  (Equation~\ref{eq:gibbs_sampler}) progresses from step 1 to
  $2m$. Additionally, during the $\sigsq$ sampling (step $2m + 1$),
  the residuals do not have to be computed by dropping the data down
  all the trees.
\item Each node caches its acceptable variables for split rules and
  the acceptable unique split values so they do not need to be
  calculated at each tree sampling step. Recall from the discussion
  concerning uniform splitting rules in
  Section~\ref{subsec:prior_likelihood} that acceptable predictors and
  values change based on the data available at an arbitrary location
  in the tree structure. This speed enhancement, which we call
  \textit{memcache} comes at the expense of memory and may cause
  issues for large data sets. We include a toggle in our
  implementation defaulted to ``on.''
\item Careful calculations in Appendix~\ref{app:implementation}
  eliminate many unnecessary computations. For instance, the
  likelihood ratios are only functions of the squared sum of responses
  and no longer require computing the sum of the responses squared.
\end{enumerate}
%
Figure~\ref{fig:time_plots} displays model creation speeds for
different values of $n$ on a linear regression model with $p=20$,
normally distributed covariates,
$\beta_1, \ldots, \beta_{20} \iid \uniform{-1}{1}$, and standard
normal noise. Note that we do not vary $p$ as it was already shown in
\citet{Chipman2010} that BART's computation time is largely unaffected
by the dimensionality of the problem. We include results for BART
using \pkg{BayesTree}, \pkg{bartMachine} with one and four cores, the
\textit{memcache} option on and off, as well as four cores,
\textit{memcache} off and computation of in-sample statistics off (all
with $m=50$ trees). In-sample statistics by default are computed consisting of 
in-sample predictions ($\yhat$), residuals ($e := y - \yhat$), $L1$
error which is defined as $\sum_{i=1}^{n_{\text{train}}} |e_i|$, $L2$
error which is defined as $\sum_{i=1}^{n_{\text{train}}} e_i^2$,
pseudo-$R^2$ which is defined as
$1 - L2 / (\sum_{i=1}^{n_{\text{train}}} \squared{y_i - \bar{y}})$ and
root mean squared error which is defined as
$\sqrt{L2 / n_{\text{train}}}$. We also include random forests model
creation times via the package \pkg{randomForest} \citep{Liaw2002}
with its default settings.

\begin{figure}[t!]
\centering
\begin{subfigure}[c]{.48\textwidth}
                \centering
                \includegraphics[width=3.2in, trim = 0 10 0 50, clip]{speed_full4}
                \caption{Large sample sizes}
                \label{fig:time_plots_large_sample}
        \end{subfigure}
\begin{subfigure}[c]{.48\textwidth}
                \centering
                \includegraphics[width=3.2in, trim = 0 10 0 50, clip]{speed_zoomed4}
                \caption{Small sample sizes}
                \label{fig:time_plots_small_sample}
        \end{subfigure}
\caption{Model creation times as a function of sample size for a number of settings of \pkg{bartMachine}, \pkg{BayesTree} and \pkg{randomForest}. Simulations were run on a quad-core 3.4GHz Intel i5 desktop with 24GB of RAM running the Windows 7 64bit operating system.}
\label{fig:time_plots}
\end{figure}

We first note that Figure~\ref{fig:time_plots_large_sample} demonstrates that the \pkg{bartMachine} model creation run time is approximately linear in $n$ (without in-sample statistics computed). There is about a 30\% speed-up when using four cores instead of one. The \textit{memcache} enhancement should only be turned off when regressing \qu{large} sample sizes. (We cannot give general advice here; the user should experiment on their specific dataset and specific hardware to find when the RAM requirement of this feature exceeds capacity). Noteworthy is the 50\% reduction in time of constructing the model when not computing in-sample statistics. In-sample statistics are computed by default because the user generally wishes to see them. Also, for the purposes of this comparison, \pkg{BayesTree} models compute the in-sample statistics by necessity since the trees are not saved. The \pkg{randomForest} implementation becomes slower just after $n=1,000$ due to its reliance on greedy exhaustive search at each node.

Figure~\ref{fig:time_plots_small_sample} displays results for smaller sample sizes ($n \leq 2,000$) that are often encountered in practice. We observe the \textit{memcache} enhancement provides about a 10\% speed improvement. Thus, if memory is an issue, it can be turned off with little performance degradation.

\subsection[Missing data in bartMachine]{Missing data in \pkg{bartMachine}}\label{subsec:missing_data}

\pkg{bartMachine} implements a native method for incorporating missing data into both model creation and future prediction with test data. The details are given in \citet{Kapelner2013} but we provide a brief summary here.

There are a number of ways to incorporate missingness into tree-based methods (see \citealp{Ding2010} for a review). The method implemented here is known as \qu{Missing Incorporated in Attributes} \citep[MIA;][Section 2]{Twala2008} which natively incorporates missingness by augmenting the nodes' splitting rules to (a) also handle sorting the missing data to the left or right and (b) use missingness \textit{itself} as a variable to be considered in a splitting rule. Table \ref{tab:mia} summarizes these new splitting rules as they are implemented within the package.

Implementing MIA into the BART procedure is straightforward. These new splitting rules are sampled uniformly during the GROW or CHANGE steps. For example, a splitting rule might be \qu{$\x_j < c$ or $\x_j$ is missing.} To account for splitting on missingness itself, we create dummy vectors of length $n$ for each of the $p$ attributes, denoted $\M_1, \ldots, \M_p$, which assume the value 1 when the entry is missing and 0 when the entry is present. The original training matrix is then augmented with these dummies, giving the opportunity to select missingness \textit{itself} when choosing a new splitting rule during the grow or change steps. Note that this can increase the number of predictors by up to a factor of 2. We illustrate building a \pkg{bartMachine} model with missing data in Section~\ref{subsec:incorporating_missing_data}. As described in \citet[][Section 6]{Chipman2010}, BART's run time increases negligibly in the number of covariates and this has been our experience using the augmented training matrix.

\begin{table}[t!]
\centering
\begin{tabular}{ll}
\hline
1: & If $x_{ij}$ is missing, send it $\longleftarrow$; if it is present and $x_{ij} \leq x^*_{ij}$, send it $\longleftarrow$, otherwise $\longrightarrow$. \\
2: & If $x_{ij}$ is missing, send it $\longrightarrow$; if it is present and $x_{ij} \leq x^*_{ij}$, send it $\longleftarrow$, otherwise $\longrightarrow$. \\
3: & If $x_{ij}$ is missing, send it $\longleftarrow$; if it is present, send it $\longrightarrow$. \\
\hline
\end{tabular}
\caption{The MIA choices for all attributes $j \in \braces{1, \ldots, p}$ and all split points $x^*_{ij}$ where $i \in \braces{1,\ldots,n}$ during a GROW or CHANGE step in \pkg{bartMachine}.}
\label{tab:mia}
\end{table}

\subsection{Variable selection}\label{subsec:variable_selection}

Our package also implements the variable selection procedures developed in \citet{Bleich2014}, which are best applied to data problems where the number of covariates influencing the response is small relative to the total number of covariates. We give a brief summary of the procedures here. 

 In order to select variables, we make use of the \qu{variable inclusion proportions:} the proportion of times each predictor is chosen as a splitting rule divided by the total number of splitting rules appearing in the model (see Section~\ref{subsec:variable_importance} for more details). The variable selection procedure can be outlined as follows: 
%
\begin{enumerate}[1.]
\item Compute the model's variable inclusion proportions.
\item Permute the response vector, thereby breaking the relationship between the covariates and the response. Rebuild the model and compute the \qu{null} variable inclusion proportions. Repeat this a number of times to create a null permutation distribution. 
\item Three selection rules can be used depending on the desired stringency of selection:
\begin{enumerate}
\item Local Threshold: Include a predictor $\x_k$ if its variable inclusion proportion exceeds the $1-\alpha$ quantile of its own null distribution.
\item Global Max Threshold: Include a predictor $\x_k$ if its variable inclusion proportion exceeds the $1-\alpha$ quantile of the distribution of the maximum of the null variable inclusion proportions from each permutation of the response.
\item Global SE Threshold: Select $\x_k$ if its variable inclusion proportion exceeds a threshold based from its own null distribution mean and SD with a global multiplier shared by all predictors.
\end{enumerate}
\end{enumerate}
%
The Local procedure is the least stringent in terms of selection and the Global Max procedure the most. The Global SE procedure is a compromise, but behaves more similarly to the Global Max. \citet{Bleich2014} demonstrate that the best procedure depends on the underlying sparsity of the problem, which is often unknown. Therefore, the authors include an additional procedure that chooses the best of these thresholds via cross-validation and this method is also implemented in \pkg{bartMachine}.  As highlighted in \citet{Bleich2014}, this method performs favorably compared to variable selection using random forests' \qu{importance scores}, which rely on the reduction in out-of-bag forecasting accuracy that occurs from shuffling the values for a particular predictor and dropping the out-of-bag observations down each tree.  Examples of these procedures for variable selection are provided in Section~\ref{subsec:variable_selection_regression}.

\section[bartMachine package features for regression]{\pkg{bartMachine} package features for regression}\label{sec:regression_features}

We illustrate the package features by using both real and simulated data, focusing first on regression problems.

\subsection{Computing parameters}\label{subsec:computing_parameters}

We first set some computing parameters. In this exploration, we allow
up to 5GB of RAM for the \proglang{Java} heap\footnote{Note that the maximum
  amount of memory can be set only \textit{once} at the beginning of
  the \proglang{R} session (a limitation of \pkg{rJava} since only one
  \proglang{Java} Virtual Machine can be initiated per \proglang{R} session), but
  the number of cores can be respecified at any time.} and we set the
number of computing cores available for use to 4.
%
\begin{CodeChunk}
\begin{CodeInput}
R> options(java.parameters = c("-Xmx20g", "--add-modules=jdk.incubator.vector", "-XX:+UseZGC"))
R> library("bartMachine")
R> set_bart_machine_num_cores(4)
\end{CodeInput}
\begin{CodeOutput}
bartMachine now using 4 cores.
\end{CodeOutput}
\end{CodeChunk}
%
The following
Sections~\ref{subsec:model_building}--\ref{subsec:variable_selection_regression}
use a dataset obtained from the UCI Machine Learning Repository \citep{Bache2013}. The $n=201$
observations are automobiles and the goal is to predict each
automobile's price from 25 features (15 continuous and 10 nominal),
first explored by \citet{Kibler1989}.\footnote{We first preprocess the
  data. We first drop one of the nominal predictors (car company) due
  to too many categories (22). We then coerce two of the of the
  nominal predictors to be continuous. Further, the response variable,
  price, was logged to reduce right skew in its distribution.} This
dataset also contains missing data. We omit missing data for now (41
observations that will later be retained in
Section~\ref{subsec:incorporating_missing_data}) and we create a
variable for the design matrix $\X$ and the response $\y$. The
following code loads the data.
%
\begin{CodeChunk}
\begin{CodeInput}
R> data("automobile", package = "bartMachine")
R> automobile <- na.omit(automobile)
R> y <- automobile$log_price
R> X <- automobile; X$log_price <- NULL
\end{CodeInput}
\end{CodeChunk}
%
\subsection{Model building}\label{subsec:model_building}
We are now ready to construct a \pkg{bartMachine} model. The default
hyperparameters generally follow the recommendations of
\citet{Chipman2010} and provide a ready-to-use algorithm for many data
problems. Our hyperparameter settings are $m = 50$,\footnote{In
  contrast to \citet{Chipman2010}, we recommend this default as a good
  starting point rather than $m=200$ due to our experience
  experimenting with the \qu{RMSE by number of trees} feature found
  later in this section. Performance is often similar and
  computational time and memory requirements are dramatically
  reduced.} $\alpha = 0.95$, $\beta = 2$, $k = 2$, $q = 0.9$,
$\nu = 3$, and probabilities of the GROW / PRUNE / CHANGE steps is
28\% / 28\% /44\%. We retain the default number of burn-in Gibbs
samples (250) as well as the default number of post-burn-in samples
(1,000). We default the covariates to be equally important \textit{a
  priori}. Other parameters and their defaults can be found in the
package's online manual. Below is a default \pkg{bartMachine}
model. Here, $\X$ denotes automobile attributes and $\y$ denotes the
log price of the automobile.
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine <- bartMachine(X, y)
\end{CodeInput}
\begin{CodeOutput}
Building bartMachine for regression ... evaluating in sample data...done
\end{CodeOutput}
\end{CodeChunk}
%
If one wishes to see more information about the individual iterations
of the Gibbs sampler of Equation~\ref{eq:gibbs_sampler}, the flag
\texttt{verbose} can be set to \code{TRUE}. One can see more debug
information from the \proglang{Java} program by setting the flag
\texttt{debug\_log} to \code{TRUE} and the program will print to
\texttt{unnamed.log} in the current working directory.

Below we inspect the model object to query its in-sample performance
and to be reminded of the input data and model hyperparameters.
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine
\end{CodeInput}
\begin{CodeOutput}
bartMachine v1.2.1 for regression

training data n = 160 and p = 41 
built in 0.5 secs on 4 cores, 50 trees, 250 burn-in and 1000 post. samples

sigsq est for y beforehand: 0.014 
avg sigsq estimate after burn-in: 0.00751 

in-sample statistics:
 L1 = 7.86 
 L2 = 0.62 
 rmse = 0.06 
 Pseudo-Rsq = 0.9798
p-val for shapiro-wilk test of normality of residuals: 0.06781 
p-val for zero-mean noise: 0.95335
\end{CodeOutput}
\end{CodeChunk}
%
\label{code:default_bart_summary}

Since the response was continuous, \pkg{bartMachine} for regression
was employed automatically (line 1). Line 3 prints the dimensions of
the design matrix. Line 4 records the creation time along with other
model parameters. Line 6 records the MSE for the OLS model and Line 7
displays the \pkg{bartMachine} model's estimate of $\sigsq_e$. We are
then given in-sample statistics on error in lines 10--13. The
pseudo-$R^2$ is calculated via $1 - SSE/SST$. Also provided are
outputs from tests of the error distribution being normal (line 14)
and mean centered (line 15). Note that the \qu{p-val for shapiro-wilk
  test of normality of residuals} is marginally less than 5\%. Thus we
conclude that the noise of Equation~\ref{eq:bart} is not normally
distributed. Just as when interpreting the results from a linear
model, non-normality implies we should be circumspect concerning
\pkg{bartMachine} output that relies on this distributional assumption
such as the credible and prediction intervals of
Section~\ref{subsec:credible_and_prediction_intervals}.

We can also obtain out-of-sample statistics to assess level of
overfitting by using $k$-fold cross-validation. Using 10 randomized
folds we find:
%
\begin{CodeChunk}
\begin{CodeInput}
R> k_fold_cv(X, y, k_folds = 10)
\end{CodeInput}
\begin{CodeOutput}
..........
$L1_err
[1] 15.96878

$L2_err
[1] 2.682349

$rmse
[1] 0.1294785

$PseudoRsq
[1] 0.9133438
\end{CodeOutput}
\end{CodeChunk}
%
The code provides the out-of-sample statistics for the model built
above. This function also returns the $\yhat$ predictions as well as
the vector of the fold indices (which are omitted in the output shown
above).

The Pseudo-$R^2$
being lower out-of-sample (above) versus in-sample suggests that
\pkg{bartMachine} is slightly overfitting (note also that the training
sample during cross-validation is 10\% smaller).

It may also be of interest how the number of trees $m$ affects
performance. One can examine how out-of-sample predictions vary by the
number of trees via
%
\begin{CodeChunk}
\begin{CodeInput}
R> rmse_by_num_trees(bart_machine, num_replicates = 20)
\end{CodeInput}
\end{CodeChunk}
%
\noindent and the output is shown in
Figure~\ref{fig:rmse_by_trees}. This illustration suggests that
predictive performance levels off around $m=50$. We observe similar
illustrations across a wide variety of datasets and hyperparameter
choices which is the reason we have set $m=50$ as the default value in
the package.

\begin{figure}[t!]
\centering
\includegraphics[width=3.5in]{rmse_num_trees_3}
\caption{Out-of-sample predictive performance by number of trees.}
\label{fig:rmse_by_trees}
\end{figure}

Note that there is nominal improvement at $m=200$. There may also be
improvement if other hyperparameters are varied. We can attempt to
build a better \pkg{bartMachine} model using the procedure
\code{bartMachineCV} by grid-searching over a set of hyperparameter
combinations, including $m$ \citep[for more details, see BART-cv
in][]{Chipman2010}. The grid of interest can be customized by the user
and defaults to a small grid.
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine_cv <- bartMachineCV(X, y)
\end{CodeInput}
\begin{CodeOutput}
...
bartMachine CV win: k: 2 nu, q: 3, 0.9 m: 200
\end{CodeOutput}
\end{CodeChunk}
%
This function returns the \qu{winning} model, which is the one with
lowest out-of-sample RMSE over a 5-fold (by default)
cross-validation. Here, the cross-validated \pkg{bartMachine} model
has slightly better in-sample performance (L1 = 8.18, L2 = 0.68 and
Pseudo-$R^2 = 0.978$) in comparison to the default model (output in
this section, page~\pageref{code:default_bart_summary}) as well as
slightly better out-of-sample performance (L1 = 21.05, L2 = 4.40 and
Pseudo-$R^2 = 0.858$) in comparison to the vanilla BART performance
above when assessed via:
%
\begin{CodeChunk}
\begin{CodeInput}
R> k_fold_cv(X, y, k_folds = 10, k = 2, nu = 3, q = 0.9, num_trees = 200)
\end{CodeInput}
\end{CodeChunk}
%
\noindent Predictions are handled with the \texttt{predict} function. Below are fits for the first seven observations.
%
\begin{CodeChunk}
\begin{CodeInput}
R> predict(bart_machine_cv, X[1:7, ])
\end{CodeInput}
\begin{CodeOutput}
[1]  9.494941  9.780791  9.795532 10.058445  9.670211  9.702682  9.911394
\end{CodeOutput}
\end{CodeChunk}
%
\noindent We also include a convenience method
\texttt{bart\_predict\_for\_test\_data} that will predict and return
out-of-sample error metrics when the test outcomes are known.

\subsection{Assumption checking}\label{subsec:assumption_checking}

The package includes features that assess the plausibility of the BART
model assumptions. Checking the mean-centeredness of the noise is
addressed in the summary output of
Section~\ref{subsec:model_building},
page~\pageref{code:default_bart_summary} and is simply a one-sample
$t$-test of the average residual value against a null hypothesis of
true mean zero. We assess both normality and heteroskedasticity via:
%
\begin{CodeChunk}
\begin{CodeInput}
R> check_bart_error_assumptions(bart_machine_cv)
\end{CodeInput}
\end{CodeChunk}
%
This will display a plot similar to
Figure~\ref{fig:bart_normality_heteroskedasticity} which contains a
QQ-plot (to assess normality) as well as a residual-by-predicted plot
(to assess homoskedasticity). There is little evidence of the errors
violating normality and homoskedasticity.

\begin{figure}[t!]
\centering
\includegraphics[width=0.8\textwidth, trim = 0 10 0 10, clip]{bart_normality_heteroskedasticity_2.pdf}
\caption{Test of normality of errors using QQ-plot and the Shapiro-Wilk test (top), residual plot to assess heteroskedasticity (bottom).}
\label{fig:bart_normality_heteroskedasticity}
\end{figure}

In addition to the model assumptions, BART requires convergence of its
Gibbs sampler which can be investigated via:
%
\begin{CodeChunk}
\begin{CodeInput}
R> plot_convergence_diagnostics(bart_machine_cv)
\end{CodeInput}
\end{CodeChunk}
%
Figure~\ref{fig:convergence_diagnostics} displays the plot which
features four types of convergence diagnostics (each one is detailed in
the figure caption). It appears via visual inspection that the
\pkg{bartMachine} model has been sufficiently burned-in as each of the
plots seems to exhibit a stationary process.

\begin{figure}[t!]
\centering
\includegraphics[width=0.98\textwidth, trim = 0 10 0 10, clip]{convergence_diagnostics4.pdf}
\caption{Convergence diagnostics for the cross-validated
  \pkg{bartMachine} model. Top left: $\sigsq$ by MCMC
  iteration. Samples to the left of the first vertical gray line are
  burn-in from the first computing core's MCMC chain. The four
  subsequent plots separated by gray lines are the post-burn-in
  iterations from each of the four computing cores employed during
  model construction. Top right: percent acceptance of
  Metropolis-Hastings proposals across the $m$ trees where each point
  plots one iteration. Points before the gray vertical line illustrate
  burn-in iterations and points after illustrate post-burn-in
  iterations. Each computing core is colored differently. Bottom left:
  average number of leaves across the $m$ trees by iteration (post
  burn-in only where computing cores are separated by vertical gray
  lines). Bottom right: average tree depth across the $m$ trees by
  iteration (post-burn-in only where computing cores are separated by
  vertical gray lines).}
\label{fig:convergence_diagnostics}
\end{figure}

\pagebreak

\subsection{Credible intervals and prediction intervals}\label{subsec:credible_and_prediction_intervals}

An advantage of BART is that if we believe the priors and model
assumptions, the Bayesian probability model and corresponding
burned-in MCMC iterations provide the approximate posterior
distribution of $f\parens{\x}$. Thus, one can compute uncertainty
estimates via quantiles of the posterior samples. These provide
Bayesian \qu{credible intervals} which are intervals for the
conditional expectation function, $\cexpe{\y}{\X}$.

Another useful uncertainty interval can be computed for individual
predictions by combining uncertainty from the conditional expectation
function with the systematic, homoskedastic normal noise produced by
$\errorrv$. This is accomplished by generating 1,000 samples (by
default) from the posterior predictive distribution and then reporting
the appropriate quantiles.

Below is an example of how both types of intervals are computed in the
package (for the 100th observation of the training data):
%
\begin{CodeChunk}
\begin{CodeInput}
R> round(calc_credible_intervals(bart_machine_cv, X[100, ], 
+    ci_conf = 0.95), 2)
\end{CodeInput}
\begin{CodeOutput}
     ci_lower_bd ci_upper_bd
[1,]        8.74        8.98
\end{CodeOutput}
\begin{CodeInput}
R> round(calc_prediction_intervals(bart_machine_cv, X[100, ], 
+    pi_conf = 0.95), 2)
\end{CodeInput}
\begin{CodeOutput}
     pi_lower_bd pi_upper_bd
[1,]        8.65        9.07
\end{CodeOutput}
\end{CodeChunk}
%
Note that the prediction intervals are wider than the credible
intervals because they reflect the uncertainty from the error term.
%
\begin{figure}[t!]
\centering
\begin{subfigure}[c]{\textwidth}
                \centering
                \includegraphics[width=3.5in, trim = 0 10 0 10, clip]{plot_y_vs_y_hat_cred_ints2.pdf}
                \caption{Segments illustrate credible intervals}
                \label{fig:plot_y_vs_y_hat_with_credible_intervals}
        \end{subfigure}\\
\begin{subfigure}[c]{\textwidth}
                \centering
                \includegraphics[width=3.5in, trim = 0 10 0 10, clip]{plot_y_vs_y_hat_pred_ints2.pdf}
                \caption{Segments illustrate prediction intervals}
                \label{fig:plot_y_vs_y_hat_with_prediction_intervals}
        \end{subfigure}
        \caption{Fitted versus actual response values for the
          automobile dataset. Segments are 95\% credible intervals (a)
          or 95\% prediction intervals (b). Green dots indicate the
          true response is within the stated interval and red dots
          indicate otherwise. Note that the percent coverage in (a) is
          not expected to be 95\% because the response includes a
          noise term.}
\label{fig:plot_y_vs_y_hat}
\end{figure}
%
We can then plot these intervals in-sample:
%
\begin{CodeChunk}
\begin{CodeInput}
R> plot_y_vs_yhat(bart_machine_cv, credible_intervals = TRUE)
R> plot_y_vs_yhat(bart_machine_cv, prediction_intervals = TRUE)
\end{CodeInput}
\end{CodeChunk}
%
Figure~\ref{fig:plot_y_vs_y_hat_with_credible_intervals} shows how our
prediction fared against the original response (in-sample) with 95\%
credible intervals.
Figure~\ref{fig:plot_y_vs_y_hat_with_prediction_intervals} shows the
same prediction versus the original response plot now with 95\%
prediction intervals.

\subsection{Variable importance}\label{subsec:variable_importance}

After a \pkg{bartMachine} model is built, it is natural to ask the
question: which variables are most important? This is assessed by
examining the splitting rules in the $m$ trees across the post-burn-in
MCMC iterations which are known as \qu{inclusion proportions}
\citep{Chipman2010}. The inclusion proportion for any given predictor
represents the proportion of times that variable is chosen as a
splitting rule out of all splitting rules among the posterior draws of
the sum-of-trees model. Figure~\ref{fig:var_imp_automobile_cc}
illustrates the inclusion proportions for all variables obtained via:
%
\begin{CodeChunk}
\begin{CodeInput}
R> investigate_var_importance(bart_machine_cv, num_replicates_for_avg = 20)
\end{CodeInput}
\end{CodeChunk}
%
\begin{figure}[t!]
\centering
\includegraphics[width=5.9in, trim = 0 0 0 20, clip]{var_imp_automobile_cc2.pdf}
\caption{Average variable inclusion proportions in the cross-validated
  \pkg{bartMachine} model for the automobile data averaged over 100
  model constructions to obtain stable estimates across many posterior
  modes in the sum-of-trees distribution (as recommended in
  \citealp{Bleich2014}). The segments atop the bars represent 95\%
  confidence intervals. The eight predictors with inclusion
  proportions of zero are predictors with identically one value (after
  missing data was dropped).}
\label{fig:var_imp_automobile_cc}
\end{figure}

Selection of variables which \textit{significantly} affect the
response is addressed briefly in
Section~\ref{subsec:variable_selection}, examples are provided in
Section~\ref{subsec:variable_selection_regression} but for full
treatment of this feature, please see \citet{Bleich2014}.

\subsection{Variable effects}\label{subsec:variable_effects}

It is also natural to ask: does $\x_j$ affect the response,
controlling for other variables in the model? This is roughly
analogous to the $t$-test in ordinary least squares regression of no
linear effect of $\x_j$ on $\y$ while controlling for all other
variables, $\x_{-j}$. The null hypothesis here is the same but the
linearity constraint is relaxed. To test this, we employ a permutation
approach where we record the observed Pseudo-$R^2$ from the
\pkg{bartMachine} model built with the original data. Then we permute
the $\x_j$th column, thereby destroying any relationship between
$\x_j$ and $\y$, construct a new duplicate \pkg{bartMachine} model
from this permuted design matrix and record a ``null''
Pseudo-$R^2$. We then repeat this process to obtain a null
distribution of Pseudo-$R^2$'s. Since the alternative hypothesis is
that $\x_j$ has an effect on $\y$ in terms of predictive power, our
$p$ value is the proportion of null Pseudo-$R^2$'s greater than the
observed Pseudo-$R^2$, making our procedure a natural one-sided
test. Note, however, that this test is conditional on the BART model
and its selected priors being true, similar to the assumptions of the
linear model.

If we wish to test if a set of covariates
$A \subset \braces{\x_{1}, \ldots, \x_{p}}$ affect the response after
controlling for other variables, we repeat the procedure outlined in
the above paragraph by permuting the predictors in $A$ in every null
sample. We do not permute each column separately, but instead permute the rows
as a unit in order to preserve collinearity structure. This is roughly
analogous to the partial $F$-test in ordinary least squares
regression.

If we wish to test if \textit{any} of the covariates matter in
predicting $\y$, we simply permute $\y$ during the null sampling. This
procedure breaks the relationship between the response and the
predictors but does not alter the existing associations between
predictors. This is roughly analogous to the omnibus $F$-test in
ordinary least squares regression.

At $\alpha = 0.05$, Figure~\ref{fig:cov_test_width} demonstrates an
insignificant effect of the variable \texttt{width} of car on
price. Even though \texttt{width} is putatively the \qu{most
  important} variable as measured by proportions of splits in the
posterior sum-of-trees model (Figure~\ref{fig:var_imp_automobile_cc}),
note that this is largely an easy prediction problem with many
collinear predictors. Figure~\ref{fig:cov_test_body_style} shows the
results of a test of the putatively most important categorical
variable, \texttt{body style} (which involves permuting the
categories, then dummifying the levels to preserve the structure of
the variable). We find a marginally significant effect ($p =
0.0495$).
A test of the top ten most important variables is convincingly
significant (Figure~\ref{fig:cov_test_top_10}). For the omnibus test,
Figure~\ref{fig:cov_test_omnibus} illustrates an extremely
statistically significant result, as would be expected. The code to
run these tests is shown below (output suppressed).

\begin{figure}[t!]
\centering
\begin{subfigure}[c]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in]{cov_test_width2.pdf}
                \caption{\texttt{width}}
                \label{fig:cov_test_width}
        \end{subfigure}~~
\begin{subfigure}[c]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in]{cov_test_body_style2.pdf}
                \caption{\texttt{body style}}
                \label{fig:cov_test_body_style}
        \end{subfigure}\\
\begin{subfigure}[b]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in]{cov_test_top_10_2.pdf}
                \caption{The 10 most split-on variables}
                \label{fig:cov_test_top_10}
        \end{subfigure}~~
\begin{subfigure}[b]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in]{cov_test_omnibus2.pdf}
                \caption{All covariates}
                \label{fig:cov_test_omnibus}
        \end{subfigure}
\caption{Tests of covariate importance conditional on the cross-validated \pkg{bartMachine} model. All tests performed with 100 null samples.}
\label{fig:cov_tests}
\end{figure}

%
\begin{CodeChunk}
\begin{CodeInput}
R> cov_importance_test(bart_machine_cv, covariates = "width")
R> cov_importance_test(bart_machine_cv, covariates = "body_style")
R> cov_importance_test(bart_machine_cv, covariates = c("width",
+    "curb_weight", "city_mpg", "length", "horsepower", "body_style", 
+    "engine_size", "highway_mpg", "peak_rpm", "normalized_losses"))
R> cov_importance_test(bart_machine_cv)
\end{CodeInput}
\end{CodeChunk}

\subsection{Partial dependence}\label{subsec:partial_dependence}

A data analyst may also be interested in understanding how $\x_j$
affects the response on average, after controlling for other
predictors. This can be examined using \citet{Friedman2001}'s partial
dependence function (PDP),

\bneqn\label{eq:true_pdp} f_j(\x_j) =
\expesub{\x_{-j}}{f(\x_j,\x_{-j})} := \int f(\x_j,\x_{-j})
\mathrm{dP}\parens{\x_{-j}}.  \eneqn

\noindent The PDP of predictor $\x_j$ gives the average value of $f$
when $\x_j$ is fixed and $\x_{-j}$ varies over its marginal
distribution, $\mathrm{dP}\parens{\x_{-j}}$. As neither the true model
$f$ nor the distribution of the predictors
$\mathrm{dP}\parens{\x_{-j}}$ are known, we estimate
Equation~\ref{eq:true_pdp} by computing
%
\begin{figure}[t!]
\centering
\begin{subfigure}[c]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in, trim = 0 10 0 20, clip]{pdp_horsepower2.pdf}
                \caption{\texttt{horsepower}}
                \label{fig:pdp_horsepower}
        \end{subfigure}~~
\begin{subfigure}[c]{0.48\textwidth}
                \centering
                \includegraphics[width=3.2in, trim = 0 10 0 20, clip]{pdp_stroke2.pdf}
                \caption{\texttt{stroke}}
                \label{fig:pdp_stroke}
        \end{subfigure}
        \caption{PDPs plotted in black and 95\% credible intervals
          plotted in blue for variables in the automobile
          dataset. Points plotted are at the 5\%ile, 10\%ile, 20\%ile,
          \ldots, 90\%ile and 95\%ile of the values of the
          predictor. Lines plotted between the points approximate the
          PDP by linear interpolation.}
\label{fig:pdps}
\end{figure}
%
\bneqn\label{eq:est_pdp}
\hat{f}_j(\x_j) = \frac{1}{n}\sum\limits_{i=1}^n \hat{f}(\x_{j},\x_{-j,i})
\eneqn
%
\noindent where $n$ is the number of observations in the training data
and $\hat{f}$ denotes predictions via the \pkg{bartMachine}
model. Since BART provides an estimated posterior distribution, we can
plot credible bands for the PDP function. Credible bands are computed
as follows: in Equation~\ref{eq:est_pdp}, $\hat{f}$ can be
replaced with a function that calculates the $q$th quantile of the
post-burn-in MCMC iterations for
$\yhat$. Figure~\ref{fig:pdp_horsepower} plots the PDP along with the
2.5\%ile and the 97.5\%ile for the variable \texttt{horsepower}. By
varying over most of the range of \texttt{horsepower}, the price is
predicted to increase by about \$1000, holding all else
constant. Figure~\ref{fig:pdp_stroke} plots the PDP along with the
2.5\%ile and the 97.5\%ile for the variable \texttt{stroke}. This
predictor seemed to be relatively unimportant according to
Figure~\ref{fig:var_imp_automobile_cc} and the PDP confirms this, with
a very small, yet nonlinear average partial effect. The code for both
plots is below.
%
\begin{CodeChunk}
\begin{CodeInput}
R> pd_plot(bart_machine_cv, j = "horsepower")
R> pd_plot(bart_machine_cv, j = "stroke")
\end{CodeInput}
\end{CodeChunk}

\subsection{Incorporating missing data}\label{subsec:incorporating_missing_data}

The procedure for incorporating missing data was introduced in
Section~\ref{subsec:missing_data}. We now build a \pkg{bartMachine}
model using this procedure below:
%
\begin{CodeChunk}
\begin{CodeInput}
R> y <- automobile$log_price
R> X <- automobile; X$log_price <- NULL
R> bart_machine <- bartMachine(X, y, use_missing_data = TRUE, 
+    use_missing_data_dummies_as_covars = TRUE)
\end{CodeInput}
\end{CodeChunk}
%
The model output below parallels the model output with the missing
rows omitted (Section~\ref{subsec:model_building}).
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine
\end{CodeInput}
\begin{CodeOutput}
bartMachine v1.2.1 for regression

Missing data feature ON
training data n = 201 and p = 50 
built in 1.4 secs on 1 core, 50 trees, 250 burn-in and 1000 post. samples

sigsq est for y beforehand: 0.016 
avg sigsq estimate after burn-in: 0.00939 

in-sample statistics:
 L1 = 11.49 
 L2 = 1.04 
 rmse = 0.07 
 Pseudo-Rsq = 0.9794
p-val for shapiro-wilk test of normality of residuals: 0.69814 
p-val for zero-mean noise: 0.96389 
\end{CodeOutput}
\end{CodeChunk}
%
Note that the output reflects the use of the complete data set. There
are 41 observations now included for which there are missing
features. Also note that $p$ has now increased from 41 to 50. The nine
\qu{new} predictors are:
%
\begin{CodeChunk}
\begin{CodeOutput}
[1] "engine_location_rear" "engine_type_rotor"    "fuel_system_4bbl"    
[4] "fuel_system_spfi"     "M_normalized_losses"  "M_bore"              
[7] "M_stroke"             "M_horsepower"         "M_peak_rpm"
\end{CodeOutput}
\end{CodeChunk}
%
The first two predictors are two new levels for the variable
\texttt{engine\_location} which appear in the 41 rows with
missingness. The next two predictors are two new levels for the
variable \texttt{fuel\_system} which appear in the 41 rows with
missingness as well. The last five new predictors are dummy variables
which indicate missingness constructed from the predictors which
exhibited missingness (due to the
\texttt{use\_missing\_data\_dummies\_as\_covars} parameter being set
to true).

The procedure of Section~\ref{subsec:missing_data} also natively
incorporates missing data during prediction. Missingness will yield
larger credible intervals. In the example below, we suppose that the
\texttt{curb\_weight} and \texttt{symboling} values were suddenly
unavailable for the 20th automobile and we observe their credible
intervals widening as a result.
%
\begin{CodeChunk}
\begin{CodeInput}
R> x_star <- X[20, ]
R> calc_credible_intervals(bart_machine, x_star, ci_conf = 0.95)
\end{CodeInput}
\begin{CodeOutput}
     ci_lower_bd ci_upper_bd
[1,]    8.650093    8.824515
\end{CodeOutput}
\begin{CodeInput}
R> is.na(x_star[c("curb_weight", "symboling")]) <- TRUE
R> calc_credible_intervals(bart_machine, x_star, ci_conf = 0.95)
\end{CodeInput}
\begin{CodeOutput}
     ci_lower_bd ci_upper_bd
[1,]    8.622582    8.978313
\end{CodeOutput}
\end{CodeChunk}
%

\subsection{Variable selection}\label{subsec:variable_selection_regression}

\begin{figure}[p!]
\centering
\includegraphics[width=5.6in, trim = 0 0 0 15, clip]{var_selection_plot2.pdf}
\caption{Visualization of the three variable selection procedures
  outlined in Section~\ref{subsec:variable_selection} with $\alpha
  =
  0.05$. The top plot illustrates the ``Local'' procedure. The green
  lines are the threshold levels determined from the permutation
  distributions that must be exceeded for a variable to be
  selected. The plotted points are the variable inclusion proportions
  for the observed data (averaged over five duplicate
  \pkg{bartMachine} models). If the observed value is higher than the
  green bar, the variable is included and is displayed as a solid dot;
  if not, it is not included and it is displayed as an open dot. The
  bottom plot illustrates both the ``Global SE'' and ``Global Max''
  thresholds. The red line is the cutoff for ``Global Max'' and
  variables passing this threshold are displayed as solid dots. The blue
  lines represent the thresholds for the ``Global SE''
  procedure. Variables that exceed this cutoff but not the ``Global
  Max'' threshold are displayed as asterisks. Open dots exceed neither
  threshold.}
\label{fig:var_selection_plot}
\end{figure}

In this section we demonstrate the variable selection procedures
introduced in Section~\ref{subsec:variable_selection} and developed in
detail in \citet{Bleich2014}. The following code will select variables
based on the three thresholds and also displays the plot in
Figure~\ref{fig:var_selection_plot}.\footnote{By default, variable
  selection is performed individually on dummy variables for a
  factor. The variable selection procedures return the permutation
  distribution and an aggregation of the dummy variables' inclusion
  proportions can allow for variable selection to be performed on an
  entire factor.}
%
\begin{CodeChunk}
\begin{CodeInput}
R> vs <- var_selection_by_permute(bart_machine, 
+    bottom_margin = 10, num_permute_samples = 10)
R> vs$important_vars_local_names
\end{CodeInput}
\begin{CodeOutput}
  "curb_weight"  "city_mpg" "engine_size"   "horsepower"            
  "length"       "width"    "num_cylinders" "body_style_convertible"
  "wheel_base"   "peak_rpm" "highway_mpg"   "wheel_drive_fwd"       
\end{CodeOutput}
\begin{CodeInput}
R> vs$important_vars_global_max_names
\end{CodeInput}
\begin{CodeOutput}
  "curb_weight" "city_mpg"    "engine_size" "horsepower"  "length"     
\end{CodeOutput}
\begin{CodeInput}
R> vs$important_vars_global_se_names
\end{CodeInput}
\begin{CodeOutput}
  "curb_weight"   "city_mpg"      "engine_size" "horsepower" "length"           
  "width"         "num_cylinders" "wheel_base"  "wheel_drive_fwd"
\end{CodeOutput}
\end{CodeChunk}
%
Usually, ``Global Max'' and ``Global SE'' perform similarly, as they
are both more stringent in selection. However, in many situations it
will not be clear to the data analyst which threshold is most
appropriate. The ``best'' procedure can be chosen via cross-validation
on out-of-sample RMSE as follows:
%
\begin{CodeChunk}
\begin{CodeInput}
var_selection_by_permute_response_cv(bart_machine)
\end{CodeInput}
\begin{CodeOutput}
$best_method
[1] "important_vars_local_names"

$important_vars_cv
 [1] "body_style_convertible" "city_mpg"               "curb_weight"           
 [4] "engine_size"            "engine_type_ohc"        "horsepower"            
 [7] "length"                 "num_cylinders"          "peak_rpm"              
[10] "wheel_base"             "wheel_drive_fwd"        "wheel_drive_rwd"       
[13] "width" 
\end{CodeOutput}
\end{CodeChunk}
%
On this dataset, the \qu{best} approach (as defined by out-of-sample
prediction error) is the ``Local'' procedure.

\subsection{Informed prior information on covariates}\label{subsec:cov_prior}

\citet{Bleich2014} propose a method for incorporating informed prior
information about the predictors into the BART model. This can be
achieved by modifying the prior on the splitting rules as well as the
corresponding calculations in the Metropolis-Hastings step. In
particular, covariates believed to influence the response can a priori
be proposed more often as candidates for splitting rules. Useful prior
information can aid in both variable selection and prediction
tasks. We demonstrate the impact of a correctly informed prior in the
context of the \citet{Friedman1991} function
(Equation~\ref{eq:friedman}).
%
\bneqn\label{eq:friedman}
\y = 10\sin\parens{\pi\x_1\x_2}+20(\x_3-.5)^2+10\x_4+5\x_5 + \berrorrv, \qquad \berrorrv \sim \multnormnot{n}{\0}{\sigsq\bv I}.
\eneqn
%
To illustrate, we construct a design matrix $\X$ where the first five
columns are predictors which influence the response
($\x_1, \ldots, \x_5$ in Equation~\ref{eq:friedman}) and the next 95
columns are predictors that do not affect the response.

All that is required is a specification of relative weights for each
predictor. These weights are normalized internally to become
probabilities. As an example, we assign ten times the weight to the 5
true covariates of the model relative to the 95 useless covariates.
%
\begin{CodeChunk}
\begin{CodeInput}
R> p <- 5
R> p0 <- 95
R> prior <- c(rep(10, times = p), rep(1, times = p0))
\end{CodeInput}
\end{CodeChunk}
%
We now sample 500 observations from the Friedman function with
$\sigma = 1$ for training and another 500 observations for testing.
%
\begin{CodeChunk}
\begin{CodeInput}
R> gen_friedman_data = function(n, p, sigma){
+    if (p < 5){stop("p must be greater than or equal to 5")}
+    X <- matrix(runif(n * p), nrow = n, ncol = p)
+    y <- 10 * sin(pi * X[, 1] * X[, 2]) + 20 * (X[, 3] - .5)^2 + 
+      10 * X[, 4] + 5 * X[, 5] + rnorm(n, 0, sigma)
+    data.frame(y, X)
+  }
R> ntrain <- 500
R> sigma <- 1
R> fr_data <- gen_friedman_data(ntrain, p + p0, sigma)
R> y <- fr_data$y
R> X <- fr_data[, 2:101]
R> ntest <- 500
R> fr_data <- gen_friedman_data(ntest, p + p0, sigma)
R> Xtest <- fr_data[, 2:101]
R> ytest <- fr_data$y
\end{CodeInput}
\end{CodeChunk}
%
We construct a default \pkg{bartMachine} model as well as a
\pkg{bartMachine} model with the informed prior and compare their
performance using the RMSE metric on a test set of another 500
observations.
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine <- bartMachine(X, y)
R> bart_machine_informed <- bartMachine(X, y, cov_prior_vec = prior)
R> bart_predict_for_test_data(bart_machine, Xtest, ytest)$rmse
\end{CodeInput}
\begin{CodeOutput}
[1] 1.661159
\end{CodeOutput}
\begin{CodeInput}
R> bart_predict_for_test_data(bart_machine_informed, Xtest, ytest)$rmse
\end{CodeInput}
\begin{CodeOutput}
[1] 1.232925
\end{CodeOutput}
\end{CodeChunk}
%
There is a substantial improvement in out-of-sample predictive
performance when a properly informed prior is used.

Note that by default the prior vector down-weights the indicator
variables that result from dummifying factors so that the total set of
dummy variables has the same weight as a continuous covariate.

\subsection{Interaction effect detection}\label{subsec:interaction_detection}

In Section~\ref{subsec:variable_importance}, we explored using
variable inclusion proportions to understand the relative influences
of different covariates. A similar procedure can be carried out for
examining interaction effects within a BART model. This question was
initially explored in \citet{Damien2013} where an interaction was
considered to exist between two variables if they both appeared in at
least one splitting rule in a given tree. We refine the definition of
an interaction as follows.

We first begin with a $p \times p$ matrix of zeroes. Within a given
tree, for each split rule variable $j$, we look at all split rule
variables of child nodes, $k$, and we increment the $j, k$ element of
the matrix. Hence variables are considered to interact in a given tree
\textit{only if} they appear together in a contiguous downward path
from the root node to a terminal node.  Note that a variable may
interact with itself (when fitting a linear effect, for
instance). Since there is no order between the parent and child, we
then add the $j, k$ counts together with the $k, j$ counts (if
$j \neq k$). Summing across trees and MCMC iterations gives the total
number of interactions for each pair of variables from which relative
importance can be assessed.

We demonstrate interaction detection on the Friedman function using 10
covariates using the code below:
%
\begin{CodeChunk}
\begin{CodeInput}
R> interaction_investigator(bart_machine, num_replicates_for_avg = 25, 
+    num_var_plot = 10, bottom_margin = 5)
\end{CodeInput}
\end{CodeChunk}
%
Figure~\ref{fig:friedman_function_interactions} shows the ten most
important interactions in the model. The illustration is averaged over
25 model constructions to obtain stable estimates across many
posterior modes in the sum-of-trees distribution. Notice that the
interaction between $\x_1$ and $\x_2$ dominates all other interaction
terms, as \pkg{bartMachine} is correctly capturing the single true
interaction effect: the $\sin\parens{\pi\x_1\x_2}$ term in
Equation~\ref{eq:friedman}. Choosing which of these interactions
\textit{significantly} affect the response is not addressed in this
paper. The methods suggested in
Section~\ref{subsec:variable_selection} may be applicable here and we
consider this to be fruitful future work.

\begin{figure}[t!]
\centering
\includegraphics[width=\textwidth, trim = 0 10 0 30, clip]{friedman_function_interactions2.pdf}
\caption{The top 10 average variable interaction counts (termed
  \qu{relative importance}) in the default \pkg{bartMachine} model for
  the Friedman function data averaged over 25 model constructions. The
  segments atop the bars represent 95\% confidence intervals.}
\label{fig:friedman_function_interactions}
\end{figure}


\subsection[bartMachine model persistence across R
sessions]{\pkg{bartMachine} model persistence across \proglang{R}
  sessions}\label{subsec:persistence}

A convenient feature of \pkg{bartMachine} is its ability to
\qu{serialize} the model. Serialization allows the user to construct
models and have them persist across \proglang{R} sessions. The cost is
time during model creation and hard drive space. Thus, the serialize
feature is defaulted to \qu{off.} We demonstrate the serialize feature using the code below:
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine <- bartMachine(X, y, serialize = TRUE)
R> save.image("bart_demo.RData")
R> q("no")
\end{CodeInput}
\end{CodeChunk}
%
Start a new \proglang{R} session:
%
\begin{CodeChunk}
\begin{CodeInput}
R> options(java.parameters = "-Xmx2500m")
R> library(bartMachine)
R> load("bart_demo.RData")
R> predict(bart_machine, X[1:4, ])
[1] 20.0954617 14.8860727 10.9483889 11.4350277
\end{CodeInput}
\end{CodeChunk}
%
The training data is the same as in the previous section: $n=100$ and
$p=10$. For the default \pkg{bartMachine} settings, $m=50$, number of
burn-in MCMC iterations is 250 and number of posterior samples is
1000. These settings yielded an almost instant serialization and a
2.1MB RData image file. For a more realistic dataset with $n=5000$,
$p=1000$, $m=100$ and 5000 posterior samples, the serialization and
saving of the file took a few minutes and required 100MB.

Note that the package throws an error if the user attempts to make use
of a \pkg{bartMachine} object in another session which was not
serialized:
%
\begin{CodeChunk}
\begin{CodeInput}
R> bart_machine <- bartMachine(X, y) #serialize is FALSE here
R> save.image("bart_demo.RData")
R> q("no")
\end{CodeInput}
\end{CodeChunk}
%
Start a new \proglang{R} session:
%
\begin{CodeChunk}
\begin{CodeInput}
R> options(java.parameters = "-Xmx2500m")
R> library(bartMachine)
R> load("bart_demo.RData")
R> predict(bart_machine, X[1:4, ])
\end{CodeInput}
\begin{CodeOutput}
Error in check_serialization(object) : 
  This bartMachine object was loaded from an R image but was not serialized.
  Please build bartMachine using the option "serialize = TRUE" next time.
\end{CodeOutput}
\end{CodeChunk}

\pagebreak

\section[bartMachine package features for
classification]{\pkg{bartMachine} package features for
  classification}\label{sec:classification_features}

In this section we highlight the features that differ from
\pkg{bartMachine} for regression when the response is dichotomous and
\pkg{bartMachine} for classification is employed. The illustrative
dataset consists of 332 Pima Indians obtained from the UCI Machine
Learning Repository. Of the 332 subjects, 109 were diagnosed with
diabetes, the binary response variable. There are seven continuous
predictors which are body metrics such as blood pressure, glucose
concentration, etc.~and there is no missing data.
	
Building a \pkg{bartMachine} model for classification has the same
computing parameters except that $q, \nu$ cannot be specified since
there is no longer a prior on $\sigsq$ (see
Section~\ref{subsec:probit_bart}). We first build a cross-validated
model below.
%
\begin{CodeChunk}
\begin{CodeInput}
R> data("Pima.te", package = "MASS")
R> X <- data.frame(Pima.te[, -8])
R> y <- Pima.te[, 8]
R> bart_machine_cv <- bartMachineCV(X, y)
\end{CodeInput}
\begin{CodeOutput}
... bartMachine CV win: k: 3 m: 50
\end{CodeOutput}
\begin{CodeInput}
R> bart_machine_cv
\end{CodeInput}
\begin{CodeOutput}
bartMachine v1.2.1 for regression

training data n = 332 and p = 7 
built in 0.7 secs on 4 cores, 50 trees, 250 burn-in and 1000 post. samples

confusion matrix:

           predicted No predicted Yes model errors
actual No       209.000        14.000        0.063
actual Yes       35.000        74.000        0.321
use errors        0.143         0.159        0.148
\end{CodeOutput}
\end{CodeChunk}
%
Classification models have an added hyperparameter,
\texttt{prob\_rule\_class}, which is the rule for determining if the
probability estimate is great enough to be classified into the
positive category. We can see above that the \pkg{bartMachine} at
times predicts ``NO'' for true ``YES'' outcomes and we suffer from a
37.6\% error rate for this outcome. We can try to mitigate this error
by lowering the threshold to increase the number of ``YES'' labels
predicted:
%
\begin{CodeChunk}
\begin{CodeInput}
R> bartMachine(X, y, prob_rule_class = 0.3)
\end{CodeInput}
\begin{CodeOutput}
bartMachine v1.2.1 for classification

training data n = 332 and p = 7 
built in 0.7 secs on 4 cores, 50 trees, 250 burn-in and 1000 post. samples

confusion matrix:

           predicted No predicted Yes model errors
actual No       176.000        47.000        0.211
actual Yes       13.000        96.000        0.119
use errors        0.069         0.329        0.181
\end{CodeOutput}
\end{CodeChunk}
%
This lowers the model error to 10\% for the ``YES'' class, but at the
expense of increasing the error rate for the ``NO'' class. We
encourage the user to cross-validate this rule based on an appropriate
objective function for the problem at hand.

We can also check out-of-sample statistics:
%
\begin{CodeChunk}
\begin{CodeInput}
R> oos_stats <- k_fold_cv(X, y, k_folds = 10)
R> oos_stats$confusion_matrix
\end{CodeInput}
\begin{CodeOutput}
           predicted No predicted Yes model errors
actual No       197.000        26.000        0.117
actual Yes       44.000        65.000        0.404
use errors        0.183         0.286        0.211
\end{CodeOutput}
\end{CodeChunk}
%
\noindent Note that it is possible to predict both class labels and
probability estimates for given observations:
%
\begin{CodeChunk}
\begin{CodeInput}
R> round(predict(bart_machine_cv, X[1 : 2, ], type = "prob"), 3)
\end{CodeInput}
\begin{CodeOutput}
[1] 0.599 0.100
\end{CodeOutput}
\begin{CodeInput}
R> predict(bart_machine_cv, X[1:2, ], type = "class")
\end{CodeInput}
\begin{CodeOutput}
[1] Yes No 
Levels: No Yes
\end{CodeOutput}
\end{CodeChunk}
%
When using the covariate tests of
Section~\ref{subsec:variable_effects}, total misclassification error
becomes the statistic of interest instead of Pseudo-$R^2$.
The $p$
value is calculated now as the proportion of null samples with
\textit{lower} misclassification
error. Figure~\ref{fig:covariate_test_age} illustrates the test
showing that predictor \texttt{age} seems to matter in the prediction
of \texttt{Diabetes}, controlling for other predictors.

\begin{figure}[t!]
\centering
\includegraphics[width=3.9in, trim = 0 15 0 15, clip]{covariate_test_age3.pdf}
\caption{Test of covariate importance for predictor \texttt{age} on whether or not the subject will contract \texttt{Diabetes}.}
\label{fig:covariate_test_age}
\end{figure}

The partial dependence plots of
Section~\ref{subsec:partial_dependence} are now scaled as probit of
the probability estimate. Figure~\ref{fig:glucose_partial_dependence} is
generated via
%
\begin{CodeChunk}
\begin{CodeInput}
R> pd_plot(bart_machine_cv, j = "glu")
\end{CodeInput}
\end{CodeChunk}
%
illustrates that as glucose increases, the probability of contracting
\texttt{Diabetes} increases linearly on a probit scale.

\begin{figure}[t!]
\centering
\includegraphics[width=3.4in, trim = 0 10 0 20, clip]{glucose_partial_dependence2.pdf}
\caption{PDP for predictor \texttt{glu}. The blue lines are 95\% credible intervals.}
\label{fig:glucose_partial_dependence}
\end{figure}

Credible intervals are implemented for classification
\pkg{bartMachine} and are displayed on the probit scale. Note that the
prediction intervals of
Section~\ref{subsec:credible_and_prediction_intervals} do not exist
for classification since $\berrorrv$ noise is not part of the probit
model.
%
\begin{CodeChunk}
\begin{CodeInput}
R> round(calc_credible_intervals(bart_machine_cv, X[1:2, ]), 3)
\end{CodeInput}
\begin{CodeOutput}
     ci_lower_bd ci_upper_bd
[1,]       0.273       0.878
[2,]       0.009       0.291
\end{CodeOutput}
\end{CodeChunk}
%
Other functions work similarly to regression except those that plot
the responses and those that explicitly depend on RMSE as an error
metric.

\section{Discussion}\label{sec:discussion}

This article introduced \pkg{bartMachine}, a new \proglang{R} package
which implements Bayesian additive regression trees. The goal of this
package is to provide a fast, extensive and user-friendly
implementation accessible to a wide range of data analysts, and
increase the visibility of BART to a broader statistical audience. We
hope we have provided organized, well-documented open-source code and
we encourage the community to make innovations on this package.

\section*{Replication}

The stable version of \pkg{bartMachine} is on CRAN and the development
version is located at
\url{https://github.com/kapelner/bartMachine}. The package code is
under the General Public License, version 3 (GPL-3). Results, tables, and figures found in this
paper can be replicated via the scripts located in the
\texttt{bart\_package\_paper} folder within the \texttt{git}
repository.

\section*{Acknowledgments}

We thank Richard Berk, Andreas Buja, Zachary Cohen, Ed George, Alex
Goldstein, Shane Jensen, Abba Krieger, and Robert DeRubeis for helpful
discussions. We thank Simon Urbanek for his very generous help with
\pkg{rJava} at many stages of this project. Adam Kapelner acknowledges
support from the National Science Foundation's Graduate Research
Fellowship Program.

\begin{thebibliography}{29}
\newcommand{\enquote}[1]{``#1''}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\providecommand{\urlprefix}{URL }
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else
  \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup
  \urlstyle{rm}\Url}\fi
\providecommand{\eprint}[2][]{\url{#2}}

\bibitem[{Albert and Chib(1993)}]{Albert1993}
Albert JH, Chib S (1993).
\newblock \enquote{{Bayesian Analysis of Binary and Polychotomous Response
  Data}.}
\newblock \emph{Journal of the American Statistical Association},
  \textbf{88}(422), 669--679.
\newblock \doi{10.1080/01621459.1993.10476321}.

\bibitem[{Bache and Lichman(2013)}]{Bache2013}
Bache K, Lichman M (2013).
\newblock \enquote{{UCI} Machine Learning Repository.}
\newblock \urlprefix\url{http://archive.ics.uci.edu/ml}.

\bibitem[{Blattenberger and Fowles(2014)}]{Blattenberger2014}
Blattenberger G, Fowles R (2014).
\newblock \enquote{Avalanche Forecasting: Using Bayesian Additive Regression
  Trees ({BART}).}
\newblock In \emph{Demand for Communications Services -- Insights and
  Perspectives}, pp. 211--227. Springer-Verlag.

\bibitem[{Bleich \emph{et~al.}(2014)Bleich, Kapelner, Jensen, and
  George}]{Bleich2014}
Bleich J, Kapelner A, Jensen ST, George EI (2014).
\newblock \enquote{Variable Selection for BART: An Application to Gene
  Regulation.}
\newblock \emph{The Annals of Applied Statistics}, \textbf{8}(3), 1750--1781.
\newblock \doi{10.1214/14-aoas755}.

\bibitem[{Breiman(2001)}]{Breiman2001}
Breiman L (2001).
\newblock \enquote{{Statistical Modeling: The Two Cultures}.}
\newblock \emph{Statistical Science}, \textbf{16}(3), 199--231.
\newblock \doi{10.1214/ss/1009213726}.

\bibitem[{Chipman and McCulloch(2016)}]{BayesTree}
Chipman H, McCulloch R (2016).
\newblock \emph{\pkg{BayesTree}: Bayesian Additive Regression Trees}.
\newblock \proglang{R} package version 0.3-1.3,
  \urlprefix\url{https://CRAN.R-project.org/package=BayesTree}.

\bibitem[{Chipman \emph{et~al.}(2014)Chipman, McCulloch, and Dorie}]{dbarts}
Chipman H, McCulloch R, Dorie V (2014).
\newblock \emph{\pkg{dbarts}: Discrete Bayesian Additive Regression Trees
  Sampler}.
\newblock \proglang{R} package version 0.8-5,
  \urlprefix\url{https://CRAN.R-project.org/package=dbarts}.

\bibitem[{Chipman \emph{et~al.}(2010)Chipman, George, and
  McCulloch}]{Chipman2010}
Chipman HA, George EI, McCulloch RE (2010).
\newblock \enquote{{BART: Bayesian Additive Regressive Trees}.}
\newblock \emph{The Annals of Applied Statistics}, \textbf{4}(1), 266--298.
\newblock \doi{10.1214/09-aoas285}.

\bibitem[{Damien \emph{et~al.}(2013)Damien, Dellaportas, Polson, and
  Stephens}]{Damien2013}
Damien P, Dellaportas P, Polson N, Stephens D (2013).
\newblock \emph{Bayesian Theory and Applications}.
\newblock 1st edition. Oxford University Press.

\bibitem[{Ding and Simonoff(2010)}]{Ding2010}
Ding Y, Simonoff JS (2010).
\newblock \enquote{{An Investigation of Missing Data Methods for Classification
  Trees Applied to Binary Response Data}.}
\newblock \emph{Journal of Machine Learning Research}, \textbf{11}, 131--170.

\bibitem[{Eliashberg(2010)}]{Eliashberg2010}
Eliashberg J (2010).
\newblock \emph{Green-Lighting Movie Scripts: Revenue Forecasting and Risk
  Management}.
\newblock Ph.D. thesis, University of Pennsylvania.

\bibitem[{Friedman(2001)}]{Friedman2001}
Friedman J (2001).
\newblock \enquote{{Greedy Function Approximation: A Gradient Boosting
  Machine}.}
\newblock \emph{The Annals of Statistics}, \textbf{29}(5), 1189--1232.
\newblock \doi{10.1214/aos/1013203451}.

\bibitem[{Friedman(1991)}]{Friedman1991}
Friedman JH (1991).
\newblock \enquote{{Multivariate Adaptive Regression Splines}.}
\newblock \emph{The Annals of Statistics}, \textbf{19}, 1--67.
\newblock \doi{10.1214/aos/1176347963}.

\bibitem[{Friedman(2002)}]{Friedman2002}
Friedman JH (2002).
\newblock \enquote{{Stochastic Gradient Boosting}.}
\newblock \emph{Computational Statistics \& Data Analysis}, \textbf{38}(4),
  367--378.
\newblock \doi{10.1016/s0167-9473(01)00065-2}.

\bibitem[{Gelman \emph{et~al.}(2004)Gelman, Carlin, Stern, and
  Rubin}]{Gelman2004}
Gelman A, Carlin JB, Stern HS, Rubin DB (2004).
\newblock \emph{Bayesian Data Analysis}.
\newblock 2nd edition. Chapman \& Hall / CRC.

\bibitem[{Geman and Geman(1984)}]{Geman1984}
Geman S, Geman D (1984).
\newblock \enquote{Stochastic Relaxation, {G}ibbs Distributions, and the
  {B}ayesian Restoration of Images.}
\newblock \emph{IEEE Transaction on Pattern Analysis and Machine Intelligence},
  \textbf{6}, 721--741.
\newblock \doi{10.1109/tpami.1984.4767596}.

\bibitem[{Hastie and Tibshirani(2000)}]{Hastie2000}
Hastie T, Tibshirani R (2000).
\newblock \enquote{{Bayesian Backfitting}.}
\newblock \emph{Statistical Science}, \textbf{15}(3), 196--213.
\newblock \doi{10.1214/ss/1009212815}.

\bibitem[{Hastings(1970)}]{Hastings1970}
Hastings WK (1970).
\newblock \enquote{{Monte Carlo Sampling Methods Using Markov Chains and Their
  Applications}.}
\newblock \emph{Biometrika}, \textbf{57}(1), 97--109.
\newblock \doi{10.2307/2334940}.

\bibitem[{Kapelner and Bleich(2015)}]{Kapelner2013}
Kapelner A, Bleich J (2015).
\newblock \enquote{Prediction with Missing Data via Bayesian Additive
  Regression Trees.}
\newblock \emph{Canadian Journal of Statistics}, \textbf{43}(2), 224--239.
\newblock \doi{10.1002/cjs.11248}.

\bibitem[{Kapelner and Bleich(2016)}]{bartMachine}
Kapelner A, Bleich J (2016).
\newblock \enquote{{bartMachine}: Machine Learning with {B}ayesian Additive
  Regression Trees.}
\newblock \emph{Journal of Statistical Software}, \textbf{70}(4), 1--40.
\newblock \doi{10.18637/jss.v070.i04}.

\bibitem[{Kibler \emph{et~al.}(1989)Kibler, Aha, and Albert}]{Kibler1989}
Kibler DF, Aha DW, Albert MK (1989).
\newblock \enquote{Instance Based Prediction of Real Valued Attributes.}
\newblock \emph{Computational Intelligence}, \textbf{5}, 51.
\newblock \doi{10.1111/j.1467-8640.1989.tb00315.x}.

\bibitem[{Kindo \emph{et~al.}(2013)Kindo, Wang, and Pe}]{Kindo2013}
Kindo B, Wang H, Pe E (2013).
\newblock \enquote{{MBACT -- Multiclass Bayesian Additive Classification
  Trees}.}
\newblock {arXiv}:1309.7821 [stat.ML],
  \urlprefix\url{https://arxiv.org/abs/1309.7821v1}.

\bibitem[{Liaw and Wiener(2002)}]{Liaw2002}
Liaw A, Wiener M (2002).
\newblock \enquote{Classification and Regression by \pkg{randomForest}.}
\newblock \emph{\proglang{R} News}, \textbf{2}(3), 18--22.
\newblock \urlprefix\url{https://CRAN.R-project.org/doc/Rnews/}.

\bibitem[{Pratola \emph{et~al.}(2013)Pratola, Chipman, Higdon, McCulloch, and
  Rust}]{Pratola2013}
Pratola MT, Chipman H, Higdon D, McCulloch R, Rust W (2013).
\newblock \enquote{Parallel Bayesian Additive Regression Trees.}
\newblock \emph{Technical report}, University of Chicago.

\bibitem[{{\proglang{R} Core Team}(2016)}]{Rlang}
{\proglang{R} Core Team} (2016).
\newblock \emph{\proglang{R}: A Language and Environment for Statistical
  Computing}.
\newblock \proglang{R} Foundation for Statistical Computing, Vienna, Austria.
\newblock \urlprefix\url{https://www.R-project.org/}.

\bibitem[{Taddy \emph{et~al.}(2011)Taddy, Gramacy, and Polson}]{Taddy2011}
Taddy MA, Gramacy RB, Polson NG (2011).
\newblock \enquote{{Dynamic Trees for Learning and Design}.}
\newblock \emph{Journal of the American Statistical Association},
  \textbf{106}(493), 109--123.
\newblock \doi{10.1198/jasa.2011.ap09769}.

\bibitem[{Twala \emph{et~al.}(2008)Twala, Jones, and Hand}]{Twala2008}
Twala B, Jones M, Hand DJ (2008).
\newblock \enquote{{Good Methods for Coping with Missing Data in Decision
  Trees}.}
\newblock \emph{Pattern Recognition Letters}, \textbf{29}(7), 950--956.
\newblock \doi{10.1016/j.patrec.2008.01.010}.

\bibitem[{Urbanek(2013)}]{rJava}
Urbanek S (2013).
\newblock \emph{\pkg{rJava}: Low-Level \proglang{R} to \proglang{Java}
  Interface}.
\newblock \proglang{R} package version 0.9-6,
  \urlprefix\url{https://CRAN.R-project.org/package=rJava}.

\bibitem[{Zhou and Liu(2008)}]{Zhou2008}
Zhou Q, Liu J (2008).
\newblock \enquote{Extracting Sequence Features to Predict Protein-{DNA}
  Interactions: A Comparative Study.}
\newblock \emph{Nucleic Acids Research}, \textbf{36}(12), 4137--4148.
\newblock \doi{10.1093/nar/gkn361}.

\end{thebibliography}


\newpage

\begin{appendix}

\section{Sampling to modify tree structure}\label{app:implementation}

This section provides details on the implementation of Equation~\ref{eq:gibbs_sampler} (steps 1, 3, \ldots, $2m-1$), the Metropolis-Hastings step for sampling new trees.  Recall from Section~\ref{subsec:posterior} that trees can be altered via growing new child nodes from an existing terminal node, pruning two terminal nodes such that their parent becomes terminal, or changing the splitting rule in a node. 

Below is the Metropolis ratio \citep[p.~291]{Gelman2004} where the parameter sampled is the tree and the data are the responses unexplained by other trees denoted by $\R$. We denote the new, proposal tree with an asterisk and the original tree without the asterisk.
%
\bneqn\label{eq:naive_metropolis}
r = \frac{\prob{\treet{*} \rightarrow \treet{}}}{\prob{\treet{} \rightarrow \treet{*}}} \frac{\cprob{\treet{*}}{\R, \sigsq}}{\cprob{\treet{}}{\R, \sigsq}}.
\eneqn
%
We accept a draw from the posterior distribution of trees if a draw from a standard uniform distribution is less than the value of $r$. Immediately we note that it is difficult (if not impossible) to calculate the posterior probabilities for the trees themselves. Instead, we employ Bayes' rule, 
%
\beqn
\cprob{\treet{}}{\R, \sigsq} = \frac{\cprob{\R}{\treet{}, \sigsq} \cprob{\treet{}}{\sigsq}}{\cprob{\R}{\sigsq}},
\eeqn
%
and plug the result into Equation~\ref{eq:naive_metropolis} to obtain:
%
\beqn
r &=& \underbrace{\frac{\prob{\treet{*} \rightarrow \treet{}}}{\prob{\treet{} \rightarrow \treet{*}}}}_{\text{transition ratio}} ~~~\times~~~ \underbrace{\frac{\cprob{\R}{\treet{*}, \sigsq}}{\cprob{\R}{\treet{}, \sigsq}}}_{\text{likelihood ratio}} ~~~\times \underbrace{\frac{\prob{\treet{*}}}{\prob{\treet{}}}}_{\text{tree structure ratio}}.
\eeqn
%
\noindent Note that the probability of the tree structure is independent of $\sigsq$. 

The goal of this section is to explicitly calculate $r$ for all possible tree proposals --- GROW, PRUNE and CHANGE. For each proposal, the calculations are organized into separate sections detailing each of the three ratios --- transition, likelihood and tree structure. Note that our actual implementation uses the following expressions in log form for numerical accuracy.

\subsection{Grow proposal}\label{subapp:grow_step}

\subsubsection*{Transition ratio}

Transitioning from the original tree to a new tree involves growing two child nodes from a current terminal node:
%
\bneqn\label{eq:grow_transition}
\prob{\treet{} \rightarrow \treet{*}} &=& \prob{\text{GROW}} \prob{\text{selecting $\eta$ to grow from}} \times \\ \nonumber
&& \prob{\text{selecting the $j$th attribute to split on}} \times \\ \nonumber
&& \prob{\text{selecting the $i$th value to split on}} \\ \nonumber
&=& \prob{\text{GROW}} \oneover{b} \oneover{\padjeta} \frac{1}{\nadjeta}.
\eneqn
%
We chose one of the current $b$ terminal nodes which we denote the $\eta$th node, and then we pick an attribute and split point. $\padjeta$ denotes the number of predictors left available to split on. This can be less than $p$ if certain predictors do not have two or more unique values once the data reaches the $\eta$th node. For example, this regularly occurs if a dummy variable was split on in some node higher up in the lineage. $\nadjeta$ denotes the number of \textit{unique} values left in the $p$th attribute after adjusting for parents' splits.

Transitioning from the new tree back to the original tree involves pruning that node:
%
\beqn
\prob{\treet{*} \rightarrow \treet{}} = \prob{\text{PRUNE}} \prob{\text{selecting $\eta$ to prune from}} =  \prob{\text{PRUNE}}\oneover{w_2^*},
\eeqn
%
\noindent where $w_2^*$ denotes the number of second generation internal nodes (nodes with two terminal child nodes) in the new tree. Thus, the full transition ratio is:
%
\beqn
\frac{\prob{\treet{*} \rightarrow \treet{}}}{\prob{\treet{} \rightarrow \treet{*}}} = \frac{\prob{\text{PRUNE}}}{\prob{\text{GROW}}} \frac{b ~ \padjeta ~ \nadjeta}{w_2^*}.
\eeqn
%
Note that when there are no variables with more than two or more unique values, the probability of GROW is set to zero and the step will be automatically rejected.

\subsubsection*{Likelihood ratio}

To calculate the likelihood, the tree structure determines which responses fall into which of the $b$ terminal nodes. Thus,
%
\beqn
\cprob{\Roneton}{\treet{}, \sigsq} = \prod_{\ell=1}^{b} \cprob{\Rlonetonl}{\sigsq},
\eeqn
%
\noindent where each term on the right hand side is the probability of responses in one of the $b$ terminal nodes, which are independent by assumption. The $R_\ell$'s denote the data in the $\ell$th terminal node and where $n_\ell$ denotes how many observations are in each terminal node and $n = \sum_{\ell=1}^b n_\ell$. 

We now find an analytic expression for the node likelihood term. Remember, if the mean in each terminal node, which we denote $\mu_\ell$, was known, then we would have $\Rlonetonl \,|\, \mu_\ell \sigsq ~\iid$ $\normnot{\mu_\ell}{\sigsq}$. BART requires $\mu_\ell$ to be integrated out, allowing the Gibbs sampler in Equation~\ref{eq:gibbs_sampler} to avoid dealing with jumping between continuous spaces of varying dimensions \citep[p.~275]{Chipman2010}. Recall that one of the BART model assumptions is a prior on the average value of $\mu \sim \normnot{0}{\sigsq_\mu}$ and thus,
%
\beqn
\cprob{\Rlonetonl}{\sigsq} = \int_\reals \cprob{\Rlonetonl}{\mu_\ell, \sigsq} \prob{\mu_\ell; \sigsq_\mu} d\mu_\ell,
\eeqn
%
\noindent which can be shown via completion of the square or convolution to be
%
\bneqn\label{eq:likelihood_margined}
\cprob{\Rlonetonl}{\sigsq} &=& \oneover{\tothepow{2\pi\sigsq}{n_\ell / 2}} \sqrt{ \frac{\sigsq}{\sigsq + n_\ell \sigsqmu}}  \times \\ 
&& \exp{-\oneover{2\sigsq} \parens{\sum_{i=1}^{n_\ell} \squared{R_{\ell_i} - \Rbar_\ell} - \frac{\Rbar_\ell^2 n_\ell^2}{n_\ell + \frac{\sigsq}{\sigsqmu}}+ n_\ell \Rbar_\ell^2}}, \nonumber
\eneqn
%
\noindent where $\Rbar_\ell$ denotes the mean response in the node and $R_{\ell_i}$ denotes the observations $i=1\ldots n_\ell$ in the node.

Since the likelihoods are solely determined by the terminal nodes, the proposal tree differs from the original tree by only the selected node to be grown, denoted by $\ell$, which becomes two children after the GROW step denoted by $\ell_L$ and $\ell_R$. Hence, the likelihood ratio becomes:
%
\bneqn\label{eq:grow_likelihood_ratio}
&& \frac{\cprob{\R}{\treet{*}, \sigsq}}{\cprob{\R}{\treet{}, \sigsq}} = \frac{\cprob{\RLlonetonlL}{\sigsq} \cprob{\RRlonetonlR}{\sigsq}}{\cprob{\Rlonetonl}{\sigsq}}.
\eneqn
%
Plugging Equation~\ref{eq:likelihood_margined} into Equation~\ref{eq:grow_likelihood_ratio} three times yields the ratio for the GROW step:
%
\small
\beqn
 \sqrt{\frac{\sigsq \parens{\sigsq + n_\ell \sigsqmu}}{\parens{\sigsq + n_{\ell_L} \sigsqmu}\parens{\sigsq + n_{\ell_R} \sigsqmu}}}~~\exp{\frac{\sigsq_\mu}{2\sigsq} \parens{\frac{\squared{\sum_{i=1}^{n_{\ell_L}} R_{\ell_L, i}}}{\sigsq + n_{\ell_L}\sigsq_\mu} + \frac{\squared{\sum_{i=1}^{n_{\ell_R}} R_{\ell_R, i}}}{\sigsq + n_{\ell_R}\sigsq_\mu} - \frac{\squared{\sum_{i=1}^{n_{\ell}} R_{\ell, i}}}{\sigsq + n_\ell \sigsq_\mu}}},
\eeqn
\normalsize
%
\noindent where $n_{\ell_L}$ and $n_{\ell_R}$ denote the number of data points in the newly grown left and right child nodes.

\subsubsection*{Tree structure ratio}

In Section~\ref{subsec:prior_likelihood} we discussed the prior on the tree structure (where the splits occur) as well as the tree rules. For the entire tree,
%
\beqn
\prob{\treet{}} &=& \prod_{\eta \in \Hterminals} \parens{1 - \probsplit{\eta}}  \prod_{\eta \in \Hint} \probsplit{\eta} \prod_{\eta \in \Hint} \probrule{\eta},
\eeqn
%
\noindent where $\Hterminals$ denotes the set of terminal nodes and $\Hint$ denotes the internal nodes.

Recall that the probability of splitting on a given node $\eta$ is $\probsplit{\eta} = \alpha / \tothepow{1 + d_\eta}{\beta}$. The probability is controlled by two hyperparameters, $\alpha$ and $\beta$, and $d_\eta$ is the depth (number of parent generations) of node $\eta$. When assigning a rule, recall that BART picks from all available attributes and then from all available unique split points. Using the notation from the transition ratio section, $\probrule{\eta} = 1 / \padjeta \times 1 / \nadjeta$.

Once again, the original tree features a node $\eta$ that was selected to be grown. The proposal tree differs with two child nodes denoted $\eta_L$ and $\eta_R$. We can now form the ratio:
%
\beqn
\frac{\prob{\treet{*}}}{\prob{\treet{}}} &=& \frac{\parens{1 - \probsplit{\eta_L}} \parens{1 - \probsplit{\eta_R}} \probsplit{\eta} \probrule{\eta}}{\parens{1 - \probsplit{\eta}}}\\
&=& \frac{\parens{1 - \dfrac{\alpha}{\tothepow{1 + d_{\eta_L}}{\beta}}}\parens{1 - \dfrac{\alpha}{\tothepow{1 + d_{\eta_R}}{\beta}}} \dfrac{\alpha}{\tothepow{1 + d_{\eta}}{\beta}} \doneover{\padjeta}\dfrac{1}{\nadjeta}}{1 -\frac{\alpha}{\tothepow{1 + d_{\eta}}{\beta}}} \\
&=& \alpha \frac{\squared{1 - \frac{\alpha}{\tothepow{2 + d_\eta}{\beta}}}}{\parens{\tothepow{1+d_\eta}{\beta} - \alpha} \padjeta \nadjeta}.
\eeqn
%
The last line follows from some algebra and using the fact that the depth of the grown nodes is the depth of the parent node incremented by one ($d_{\eta_L} = d_{\eta_R} = d_{\eta} + 1$).

\pagebreak

\subsection{Prune proposal}\label{subapp:prune_step}

A prune proposal is the \qu{opposite} of a grow proposal. Prune selects a singly internal node (a node whose children are both terminal) and removes both of its children. Thus, each ratio will be approximately the inverse of the ratios found in the previous section concerning the grow proposal. Note also that prune steps are not considered in trees that consist of a single root node.

\subsubsection*{Transition ratio}

We begin with transitioning from the original tree to the proposal tree:
%
\beqn
\prob{\treet{} \rightarrow \treet{*}} = \prob{\text{PRUNE}} \prob{\text{selecting $\eta$ to prune from}} =  \prob{\text{PRUNE}}\oneover{w_2},
\eeqn
%
\noindent where $w_2$ denotes the number of singly internal parent nodes which have two terminal children (thus no grandchildren). To transition in the opposite direction, we are obligated to grow from node $\eta$. This is similar to Equation~\ref{eq:grow_transition} except the proposed tree has one less terminal node due to the pruning of the original tree, resulting in a  $1 / (b-1)$ term:
%
\beqn
\prob{\treet{*} \rightarrow \treet{}} &=&  \prob{\text{GROW}} \oneover{b-1} \oneover{\padjetastar} \frac{1}{\nadjetastar}.
\eeqn
%
Thus, the transition ratio is:
%
\beqn
\frac{\prob{\treet{*} \rightarrow \treet{}}}{\prob{\treet{} \rightarrow \treet{*}}} = \frac{\prob{\text{GROW}}}{\prob{\text{PRUNE}}} \frac{w_2 }{(b-1) \padjetastar \nadjetastar}.
\eeqn

\subsubsection*{Likelihood ratio}

This is simply the inverse of the likelihood ratio for the grow proposal:
%
\beqn
\frac{\cprob{\R}{\treet{*}, \sigsq}}{\cprob{\R}{\treet{}, \sigsq}} &=&  \sqrt{\frac{\parens{\sigsq + n_{\ell_L} \sigsqmu}\parens{\sigsq + n_{\ell_R} \sigsqmu}}{\sigsq \parens{\sigsq + n_\ell \sigsqmu}}} \times \\
&& \exp{\frac{\sigsq_\mu}{2\sigsq} \parens{\frac{\squared{\sum_{i=1}^{n_{\ell_{~}}} R_{\ell, i}}}{\sigsq + n_\ell \sigsq_\mu} - \frac{\squared{\sum_{i=1}^{n_{\ell_L}} R_{\ell_L, i}}}{\sigsq + n_{\ell_L}\sigsq_\mu} - \frac{\squared{\sum_{i=1}^{n_{\ell_R}} R_{\ell_R, i}}}{\sigsq + n_{\ell_R}\sigsq_\mu}}}. \\
\eeqn

\subsubsection*{Tree structure ratio}

This is also simply the inverse of the tree structure ratio for the grow proposal:
%
\beqn
\frac{\prob{\treet{*}}}{\prob{\treet{}}}  &=& \frac{\parens{\tothepow{1+d_\eta}{\beta} - \alpha} \padjetastar \nadjetastar}{\alpha\squared{1 - \frac{\alpha}{\tothepow{2 + d_\eta}{\beta}}}}.
\eeqn

\pagebreak

\subsection{Change proposal}\label{subapp:change_step}

A change proposal involves picking an internal node and changing its rule by picking both a new available predictor to split on and a new valid split value among values of the selected predictor.  Although this could be implemented for use in any internal node in the tree, for simplicity we limit our implementation to singly internal nodes: those that have two terminal child nodes.

\subsubsection*{Transition ratio}

The transition to a proposal tree is below:
%
\beqn
\prob{\treet{} \rightarrow \treet{*}} &=& \prob{\text{CHANGE}} \prob{\text{selecting node $\eta$ to change}} \times \\
&& \prob{\text{selecting the new attribute to split on}} \times \\
&& \prob{\text{selecting the new value to split on}}.
\eeqn
%
When calculating the ratio, the first three terms are shared in both numerator and denominator. The probability of selecting the new value to split on will differ as different split features have different numbers of unique values available. Thus we are left with
%
\beqn
\frac{\prob{\treet{*} \rightarrow \treet{}}}{\prob{\treet{} \rightarrow \treet{*}}} = \frac{\nadjetastar}{\nadjeta},
\eeqn
%
\noindent where $\nadjetastar$ is the number of split values available under the proposal tree's splitting rule and $\nadjeta$ is the number of split values available under the original tree's splitting rule.


\subsubsection*{Likelihood ratio}

The proposal tree differs from the original tree only in the two child nodes of the selected change node. These two terminal nodes have the unexplained responses apportioned differently. Denote $R_{1\cdot}$ as the residuals of the first child node and $R_{2\cdot}$ as the unexplained responses in the second child node. Thus we begin with:
%
\beqn
\frac{\cprob{\R}{\treet{*}, \sigsq}}{\cprob{\R}{\treet{}, \sigsq}} &=& \frac{\cprob{\Ronestars}{\sigsq}\cprob{\Rtwostars}{\sigsq}}{\cprob{\Rones}{\sigsq}\cprob{\Rtwos}{\sigsq}},
\eeqn
%
\noindent where the responses denoted with an asterisk are the responses in the proposal tree's child nodes.

Substituting Equation~\ref{eq:likelihood_margined} four times and using some algebra, the following expression is obtained for the ratio:
%
\beqn
&& \sqrt{\frac{\parens{\frac{\sigsq}{\sigsqmu} + n_1} \parens{\frac{\sigsq}{\sigsqmu} + n_2}}{\parens{\frac{\sigsq}{\sigsqmu} + n_1^*} \parens{\frac{\sigsq}{\sigsqmu} + n_2^*}}} \times \\
&&  \exp{\oneover{2\sigsq} \parens{\frac{\squared{\sum_{i=1}^{\nonestar} R_{1^*, i}}}{\nonestar + \frac{\sigsq}{\sigsqmu}} + \frac{\squared{\sum_{i=1}^{\ntwostar} R_{2^*, i}}}{\ntwostar + \frac{\sigsq}{\sigsqmu}} - \frac{\squared{\sum_{i=1}^{\none} R_{1, i}}}{\none + \frac{\sigsq}{\sigsqmu}} - \frac{\squared{\sum_{i=1}^{\ntwo} R_{2, i}}}{\ntwo + \frac{\sigsq}{\sigsqmu}}}},
\eeqn
%
\noindent which simplifies to
%
\beqn
\exp{\oneover{2\sigsq} \parens{\frac{\squared{\sum_{i=1}^{\nonestar} R_{1^*, i}} - \squared{\sum_{i=1}^{\nonestar} R_{1, i}}}{\none + \frac{\sigsq}{\sigsqmu}} + \frac{\squared{\sum_{i=1}^{\nonestar} R_{2^*, i}} - \squared{\sum_{i=1}^{\nonestar} R_{2, i}}}{\ntwo + \frac{\sigsq}{\sigsqmu}}}},
\eeqn
%
\noindent if the number of responses in the children do not change in the proposal ($n_1 = n_1^*$ and $n_2 = n_2^*$).

\subsubsection*{Tree structure ratio}

The proposal tree has the same structure as the original tree. Thus we only need to take into account the changed node's children:
%
\beqn
\frac{\prob{\treet{*}}}{\prob{\treet{}}} = \frac{\parens{1 - \probsplit{\etaonestar}} \parens{1 - \probsplit{\etatwostar}} \probsplit{\etastar} \probrule{\etastar}}{\parens{1 - \probsplit{\etaone} \parens{1 - \probsplit{\etatwo}}} \probsplit{\eta} \probrule{\eta}}.
\eeqn
%
The probability of splits remains the same because the child nodes are at the same depths. Thus we only need to consider the ratio of the probability of the rules. Once again, the probability of selecting the new value to split on will differ as different split features have different numbers of unique values available. We are left with $\mathbb{P}(\treet{*}) / \mathbb{P}(\treet{}) = \nadjeta / \nadjetastar$.

Note that this is the inverse of the transition ratio. Hence, for the change step, only the likelihood ratio needs to be computed to determine the Metropolis-Hastings ratio $r$.

\section{Bakeoff}\label{app:bakeoff}

We baked off nine regression data sets and assessed out-of-sample RMSE using 10-fold cross-validation. We average the results across 20 replications of cross-validation. The results are displayed in Table~\ref{tab:bakeoff}.
\begin{table}[t!]
\centering
\begin{tabular}{rrrr}
  \hline
 & \pkg{bartMachine} & \pkg{BayesTree} & \pkg{randomForest} \\ 
  \hline
boston & 3.003* & 4.506~ & 4.581 \\ 
  triazine & 0.130~ & 0.130~ & 0.119 \\ 
  ozone & 4.105~ & 4.129~ & 4.068 \\ 
  baseball & 700.453* & 713.488~ & 730.359 \\ 
  wine.red & 0.627* & 0.653~ & 0.641 \\ 
  ankara & 1.303* & 1.462~ & 1.571 \\ 
  wine.white & 0.715* & 0.764~ & 0.745 \\ 
  pole & 11.731* & 12.764~ & 10.699 \\ 
  compactiv & 3.261~ & 2.820* & 2.994 \\ 
   \hline
\end{tabular}
\caption{RMSE values for three machine learning algorithms averaged across 20 replicates. Asterisks indicate a significant difference between \pkg{bartMachine} and \pkg{BayesTree} at a significance level of 5\% with a Bonferroni correction. Comparisons with \pkg{randomForest}'s performance were not conducted.}
\label{tab:bakeoff}
\end{table}
We conclude that the implementation outlined in this paper performs
approximately the same as the previous implementation with regards to
predictive accuracy.  Table~\ref{tab:times} shows the average run time
for each algorithm. Note that \pkg{bartMachine} is run using 4 cores.

\begin{table}[t!]
\centering
\begin{tabular}{rrrr}
  \hline
 & \pkg{bartMachine} & \pkg{BayesTree} & \pkg{randomForest} \\ 
  \hline
boston & 7.8 & 28.5 & 5.1 \\ 
  triazine & 5.7 & 10.7 & 2.6 \\ 
  ozone & 4.7 & 17.6 & 2.1 \\ 
  baseball & 5.6 & 18.6 & 3.3 \\ 
  wine.red & 13.5 & 51.1 & 10.6 \\ 
  ankara & 12.8 & 27.0 & 10.9 \\ 
  wine.white & 13.5 & 56.0 & 11.0 \\ 
  pole & 18.2 & 7.0 & 12.0 \\ 
  compactiv & 16.3 & 18.4 & 19.2 \\ 
   \hline
\end{tabular}
\caption{Average run times (in seconds) for each complete $k$-fold estimation for three machine learning algorithms on a Windows 7 desktop with four cores at 3.4GHz, 24GB RAM, running \proglang{R} 3.3.0 development build.}
\label{tab:times}
\end{table}

\end{appendix}

\end{document}
