\documentclass[a4paper]{scrartcl}
\usepackage[T1]{fontenc}
\usepackage[bookmarks=TRUE,
            colorlinks,
            pdfpagemode=none,
            pdfstartview=FitH,
            citecolor=black,
            filecolor=black,
            linkcolor=black,
            urlcolor=black,
            ]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage{natbib}
\usepackage{amsmath}

\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\loglik}{\ell}% log likelihood
\newcommand{\var}{\mathrm{Var}\,}
\renewcommand*{\vec}[1]{\boldsymbol{#1}}% vector
\allowdisplaybreaks

\DeclareMathOperator{\artanh}{arctanh}

\title{Interval Regression with Sample Selection}
\author{Arne Henningsen, Sebastian Petersen, G\'eraldine Henningsen}

\begin{document}
\SweaveOpts{concordance=TRUE}
%\VignetteIndexEntry{Interval Regression with Sample Selection}
%\VignetteKeyword{models}
%\VignetteKeyword{regression}

\maketitle

This `vignette' is largely based on \citet*{petersen17}. 

\section{Model Specification}

The general specification
of an interval regression model with sample selection is:
\begin{align}
   y_i^{S*} &= {\vec{\beta}^S}' \vec{x}_i^S + \varepsilon_i^S
   \label{eq:selection*}
   \\
   y_i^S & = 
   \begin{cases}
      0 & \quad \text{if } y_i^{S*} \leq 0
      \label{eq:selection}
      \\
      1 & \quad \text{otherwise}
   \end{cases}
   \\
   y_i^{O*} &= {\vec{\beta}^O}' \vec{x}_i^O + \varepsilon_i^O
   \label{eq:outcome*}
   \\
   y_i^O &= 
   \begin{cases}
      \text{unknown} & \quad \text{if } y_i^{S} = 0\\
      1 & \quad \text{if } \alpha_1 < y_i^{O*} \leq \alpha_2
         \text{ and } y_i^{S} = 1\\
      2 & \quad \text{if } \alpha_2 < y_i^{O*} \leq \alpha_3
         \text{ and } y_i^{S} = 1\\
      \vdots & \\
      M & \quad \text{if } \alpha_{M} < y_i^{O*} \leq \alpha_{M+1}
         \text{ and } y_i^{S} = 1
   \end{cases}
   \\
   \left(
      \begin{array}{c} \varepsilon_i^S \\ \varepsilon_i^O \end{array}
   \right)
   &\sim N_2 \left( \left( \begin{array}{c} 0 \\ 0 \end{array} \right),
      \left[ \begin{matrix}
         1 & \rho \sigma \\ 
         \rho \sigma & \sigma^2
      \end{matrix} \right]
   \right),
\end{align}
where subscript~$i$ indicates the observation,
$y_i^{O*}$~is a latent outcome variable,
$y_i^O$~is a partially observed categorical variable
that indicates in which interval $y_i^{O*}$ lies,
$M$~is the number of intervals,
$\alpha_1 , \ldots , \alpha_{M+1}$~are the boundaries of the intervals
(whereas frequently but not necessarily
$\alpha_1 = - \infty$ and $\alpha_{M+1} = \infty$),
$y_i^S$~is a binary variable
that indicates whether $y_i^O$ is observed,
$y_i^{S*}$ is a latent variable
that indicates the ``tendency'' that $y_i^S$ is one,
$\vec{x}_i^S$ and $\vec{x}_i^O$ are (column) vectors of explanatory variables
for the selection equation and outcome equation, respectively,
$\varepsilon_i^S$ and $\varepsilon_i^O$
are random disturbance terms
that have a joint bivariate normal distribution,
and $\vec{\beta}^S$ and $\vec{\beta}^O$ are (column) vectors
and $\rho$ and $\sigma$ are scalars of unknown model parameters.


\section{Log-Likelihood Function}

The probability that $y_i^O$ is unobserved is:
\begin{align}
P \left( y_i^S = 0 \right)
& = P \left( y_i^{S*} \leq 0 \right)\\
& = P \left( {\vec{\beta}^S}' \vec{x}_i^S + \varepsilon_i^S \leq 0 \right)\\
& = P \left( \varepsilon_i^S \leq - {\vec{\beta}^S}' \vec{x}_i^S \right)
\end{align}

The probability that $y_i^O$ is observed and indicates
that $y_i^{O*}$ lies in the $m$th interval is:
\begin{align}
P \left( y_i^S = 1 \wedge y_i^O = m \right)
& = P \left( y_i^{S*} > 0 \wedge
   \alpha_{m} < y_i^{O*} \leq \alpha_{m+1} \right)\\
& = P \left( {\vec{\beta}^S}' \vec{x}_i^S + \varepsilon_i^S > 0 \wedge
   \alpha_{m} < {\vec{\beta}^O}' \vec{x}_i^O + \varepsilon_i^O
   \leq \alpha_{m+1} \right)\\
& = P \left( \varepsilon_i^S > - {\vec{\beta}^S}' \vec{x}_i^S \wedge
   \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O < \varepsilon_i^O
   \leq \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O \right)
\end{align}

The log-likelihood contribution of the $i$th observation is:
\begin{align}
\ell_i = & ( 1 - y_i^S )
   \ln \left[ \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) \right]\\
   & + \sum_{m=1}^M y_i^S ( y_i^O = m ) \ln \left[
      \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right.\nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \nonumber
   \right],
\end{align}
where $\Phi(.)$ indicates the cumulative distribution function
of the univariate standard normal distribution
and $\Phi_2(.)$ indicates the cumulative distribution function
of the bivariate standard normal distribution.


\section{Restricting coefficients $\rho$ and $\sigma^2$}

The parameter $\rho$ needs to be in the interval $(-1,1)$.
In order to restrict $\rho$ to be in this interval,
we estimate $\arctan( \rho )$ instead of $\rho$
so that the derived parameter $\rho = \tan ( \arctan( \rho ) )$
is always in the interval $(-1,1)$.
We use the delta method to calculate approximate standard errors
of the derived parameter~$\rho$,
whereas the corresponding element of the Jacobian matrix is:
\begin{align}
\frac{\partial \tan ( \arctan( \rho ) )}{\partial \arctan( \rho )}
= \frac{\partial \rho }{\partial \arctan( \rho )}
= ( 1 + \rho^2 )
\end{align}

The parameter $\sigma$ needs to be strictly positive, i.e.~$\sigma > 0$.
In order to restrict $\sigma$ to be strictly positive,
we estimate $\log( \sigma )$ instead of $\sigma$ or $\sigma^2$
so that the derived parameters
$\sigma = \exp( \log( \sigma ) )$
and $\sigma^2 = \exp( 2 \; \log( \sigma ) )$
are always strictly positive.
We use the delta method to calculate approximate standard errors
of the derived parameters~$\sigma$ and $\sigma^2$,
whereas the corresponding elements of the Jacobian matrix are:
\begin{align}
\frac{\partial \exp( \log( \sigma ) ) }{
   \partial \log( \sigma )}
& = \exp( \log( \sigma ) ) = \sigma \\[3ex]
\frac{\partial \exp( 2 \; \log( \sigma ) )}{\partial \log( \sigma )}
& = 2 \; \exp( 2 \; \log( \sigma ) )
= 2 \; \sigma^2
\end{align}


\section{Gradients of the CDF of the bivariate standard normal distribution}
\label{sec:gradBiv}

In order to facilitate the calculation of the gradients
of the log-likelihood function,
we calculate the partial derivatives
of the cumulative distribution function~(CDF)
of the bivariate standard normal distribution:
\begin{align}
\Phi_2 ( x_1, x_2 , \rho ) 
& = \int_{- \infty}^{x_2} \int_{- \infty}^{x_1}
   \phi_2 ( a_1 , a_2 , \rho ) \; d a_1 \; d a_2
   \label{eq:biv},
\end{align}
where $\phi_2 ( . )$ is the probability density function~(PDF)
of the bivariate standard normal distribution:
\begin{align}
\phi_2 ( x_1 , x_2 , \rho )
& = \frac{1}{2 \pi \sqrt{ 1 - \rho^2 }} \cdot
   \exp \left( - \frac{x_1^2 - 2 \rho x_1 x_2 + x_2^2}{2 (1-\rho^2)} \right)
\label{eq:bivPdf}
\end{align}

In the following, we check equation~(\ref{eq:bivPdf})
by a simple numerical example:
<<>>=
library( "mvtnorm" )
library( "maxLik" )
x1 <- 0.4
x2 <- -0.3
rho <- -0.6
sigma <-  matrix( c( 1, rho, rho, 1 ), nrow = 2 )

dens <- dmvnorm( c( x1, x2 ), sigma = sigma )
print( dens )
all.equal( dens, ( 2 * pi * sqrt( 1 - rho^2 ) )^(-1) *
   exp( - ( x1^2 - 2 * rho * x1 * x2 + x2^2 ) / ( 2 * ( 1 - rho^2 ) ) ) )
@


\subsection{Gradients with respect to the limits~($x_1$ and~$x_2$)}

\begin{align}
\frac{\partial \Phi_2 ( x_1, x_2 , \rho )}{\partial x_2} 
& = \int_{- \infty}^{x_1} \phi_2( a_1 , x_2 , \rho ) \; d a_1
   \label{eq:derivBivFirst}\\
& = \int_{- \infty}^{x_1} \phi( a_1 | x_2, \rho ) \phi( x_2 ) \; d a_1
   \label{eq:derivBivCondFirst}\\
& = \int_{- \infty}^{x_1}
   \tilde{\phi} \left( a_1, \rho x_2, 1 - \rho^2 \right) \phi( x_2 ) \; d a_1
   \label{eq:derivBivCondNonNormal}\\
& = \int_{- \infty}^{x_1}
   \phi \left( \frac{a_1 - \rho x_2}{\sqrt{1 - \rho^2}} \right)
   \left( \sqrt{1 - \rho^2} \right)^{-1} \phi( x_2 ) \; d a_1
   \label{eq:derivBivCondFinal}\\
& = \int_{- \infty}^{x_1}
      \phi \left( \frac{a_1 - \rho x_2}{\sqrt{1 - \rho^2}} \right)
      \left( \sqrt{1 - \rho^2} \right)^{-1} d a_1 \; \phi( x_2 )
   \label{eq:derivBivCondX2out}\\
& = \int_{- \infty}^{\frac{x_1 - \rho x_2}{\sqrt{1 - \rho^2}}}
      \phi ( a_1 ) \; d a_1 \; \phi( x_2 )
   \label{eq:derivBivBorders}\\
& = \Phi \left( \frac{x_1 - \rho x_2}{\sqrt{1 - \rho^2}} \right)
   \phi( x_2 ),
   \label{eq:derivBivFinal}
\end{align}
where $\tilde{\phi}( \; , \mu , \sigma^2 )$
indicates the density function of a normal distribution
with mean~$\mu$ and variance~$\sigma^2$.

In the following, we use the same simple numerical example
as in the beginning of section~\ref{sec:gradBiv}
to check the above derivations.
First, we check whether the PDF
of the bivariate standard normal distribution,
i.e.\ $\phi_2 ( x_1 , x_2 , \rho )$
(part of equation~\ref{eq:derivBivFirst}),
is equal to $\tilde{\phi} \left( x_1, \rho x_2, 1 - \rho^2 \right) \phi( x_2 )$
(part of equation~\ref{eq:derivBivCondNonNormal})
and equal to $\phi \left( ( x_1 - \rho x_2 ) / ( \sqrt{1 - \rho^2} ) \right)$
   $\left( \sqrt{1 - \rho^2} \right)^{-1} \phi( x_2 )$
(part of equations~\ref{eq:derivBivCondFinal} and~\ref{eq:derivBivCondX2out}):
<<>>=
all.equal( dens, dnorm( x1, rho * x2, sqrt( 1 - rho^2 ) ) * dnorm(x2) )
all.equal( dens, ( dnorm( ( x1 - rho * x2 ) / sqrt( 1 - rho^2 ) ) /
   sqrt( 1 - rho^2 ) ) * dnorm(x2) )
@

In the following, we will numerically calculate the derivative
of the cumulative distribution function of the bivaraite normal distribution
(equation~\ref{eq:biv}) with respect to~$x_2$
and check wehther this partial derivative
is equal to the right-hand sides of equations~(\ref{eq:derivBivFirst}),
(\ref{eq:derivBivCondFinal}), (\ref{eq:derivBivCondX2out}), 
and~(\ref{eq:derivBivFinal}):
<<>>=
funX2 <- function( a2 ) {
   prob <- pmvnorm( upper = c( x1, a2 ), sigma = sigma )
   return( prob )
}
grad <- c( numericGradient( funX2, x2 ) )
print( grad )

funX1 <- function( a1 ) {
   dens <- rep( NA, length( a1 ) )
   for( i in 1:length( a1 ) ) {
      dens[i] <- dmvnorm( c( a1[i], x2 ), sigma = sigma )
   }
   return( dens )
}
all.equal( grad, integrate( funX1, lower = -Inf, upper = x1 )$value )


funX1a <- function( a1 ) {
   dens <- rep( NA, length( a1 ) )
   for( i in 1:length( a1 ) ) {
      dens[i] <- ( dnorm( ( a1[i] - rho * x2 ) / sqrt( 1 - rho^2 ) ) /
            sqrt(1-rho^2) ) * dnorm(x2)
   }
   return( dens )
}
all.equal( grad, integrate( funX1a, lower = -Inf, upper = x1 )$value )

funX1b <- function( a1 ) {
   dens <- rep( NA, length( a1 ) )
   for( i in 1:length( a1 ) ) {
      dens[i] <- dnorm( ( a1[i] - rho * x2 ) / sqrt( 1 - rho^2 ) ) /
         sqrt(1-rho^2)
   }
   return( dens )
}
all.equal( grad,
   integrate( funX1b, lower = -Inf, upper = x1 )$value * dnorm(x2) )

all.equal( grad,
   pnorm( ( x1 - rho * x2 ) / sqrt( 1 - rho^2 ) ) * dnorm( x2 ) )
@


\subsection{Gradients with respect to the coefficient of correlation~($\rho$)}

\begin{align}
& \frac{\partial \Phi_2 ( x_1, x_2 , \rho )}{\partial \rho} \\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \frac{\partial \left[ \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \phi_2 ( a_1, a_2, \rho ) \; d a_2 \; d a_1 \right]}{\partial \rho}
   \label{eq:derivBivrho_start} \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%   
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2} 
   \frac{\partial \phi_2 (a_1, a_2, \rho)}{\partial \rho}
   \; d a_2 \; d a_1  \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
  \frac{\partial}{\partial \rho} \left(
  \frac{\exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}
  \right)}{2 \pi \sqrt{ 1 - \rho^2 }} \right)
  \; d a_2 \; d a_1  \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \frac{\partial}{\partial \rho} \left(
   \frac{\exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}
   \right)}{\sqrt{ 1 - \rho^2 }} \right)
   \; d a_2 \; d a_1  \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \dfrac{\partial}{\partial \rho}
   \left( \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}
   \right)\right) \cdot \sqrt{1-\rho^2}}{ 1 - \rho^2 }  \right. \\
  & \left. \qquad - \dfrac{ 
   \dfrac{\partial}{\partial \rho} (\sqrt{1-\rho^2}) \cdot 
   \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}\right)}
   { 1 - \rho^2 } \right)
   \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%   
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \dfrac{\partial}{\partial \rho} \left( -\dfrac{a_1^2 - 2
   \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right) \cdot       
    \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right) 
    \cdot \sqrt{1-\rho^2}} { 1 - \rho^2 } \right. \\
  & \left. \qquad - \dfrac {
    \left(-\dfrac{\rho}{\sqrt{1-\rho^2}}\right)  \cdot \exp \left( -\dfrac{a_1^2 - 
    2\rho a_1 a_2 + a_2^2}{2(1-\rho^2)}\right)}
    { 1 - \rho^2 } \right)
    \; d a_2 \; d a_1 \nonumber \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \dfrac{ \left( -4\rho (a_1^2 - 2 \rho a_1 a_2 + a_2^2 ) 
   - 2( 1 - \rho^2 ) ( -2 a_1 a_2 ) \right)}{4(1-\rho^2)^2} \cdot       
   \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right) \cdot 
   \sqrt{1-\rho^2}}
   { 1 - \rho^2 } \right. \\
  & \left. \qquad -\dfrac{
   \left(-\dfrac{\rho}{ \sqrt{1-\rho^2}} \right) \cdot 
    \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}\right)}
    { 1 - \rho^2 } \right)
   \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \dfrac{ \left( -4\rho (a_1^2 - 2 \rho a_1 a_2 + a_2^2 ) 
   - 2( 1 - \rho^2 ) ( -2 a_1 a_2 ) \right)}{4(1-\rho^2)^2} \cdot 
   \sqrt{1-\rho^2}}
   { 1 - \rho^2 } 
   -\dfrac{
   \left(-\dfrac{\rho}{ \sqrt{1-\rho^2}} \right)}
   { 1 - \rho^2 } \right) \\
  &\qquad \cdot       
   \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right)
   \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \left( -4\rho (a_1^2 - 2 \rho a_1 a_2 + a_2^2 ) 
   - 2( 1 - \rho^2 ) ( -2 a_1 a_2 ) \right)}
   {4(1-\rho^2)^{\frac{5}{3}}}
   +\dfrac{\rho}{ (1-\rho^2)^{\frac{3}{2}}}
   \right) \\
  &\qquad \cdot       
   \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right)
   \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
   \frac{1}{2\pi}
   \left( \dfrac{
   \left( -4\rho (a_1^2 - 2 \rho a_1 a_2 + a_2^2 ) 
   - 2( 1 - \rho^2 ) ( -2 a_1 a_2 ) \right)}{4(1-\rho^2)^{\frac{5}{2}}}
   +\dfrac{\rho}{ (1-\rho^2)^{\frac{3}{2}}}
   \right) \\
  &\qquad \cdot       
   \exp \left( -\dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right)
   \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%   
= & \int_{- \infty}^{x_1} \int_{- \infty}^{x_2}
  \frac{1}{2\pi}
  \left( \dfrac{\rho}
  {( 1 - \rho^2)^{\frac{3}{2}}}  
  - \frac{ \rho( a_1^2 - \rho a_1 a_2  + a_2^2 ) - a_1 a_2 }
  { ( 1 - \rho^2 )^{\frac{5}{2}}} \right) \\
  & \qquad \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}
  \right) \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \int_{- \infty}^{x_1} \int_{- \infty}^{x_2} 
 \left( \frac{ \rho}{ 1 - \rho^2 } 
 - \frac{ \rho( a_1^2 - \rho a_1 a_2 + a_2^2 ) - a_1 a_2 }{ (1 - \rho^2)^2 } 
 \right) \\
 & \qquad \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)}
    \right)
    \; d a_2 \; d a_1 \nonumber \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \int_{- \infty}^{x_1} 
 \left| \left( - \frac{ 2 a_1 - 2 \rho a_2 }{ 2( 1 - \rho^2 )} \right) 
 \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right) 
 \right|_{-\infty}^{x_2}
 \; d a_1  \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \int_{- \infty}^{x_1} 
 \left( \left( - \frac{ 2 a_1 - 2 \rho x_2 }{ 2( 1 - \rho^2 )} \right) 
 \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 x_2 + x_2^2}{2(1-\rho^2)} \right) \right.\\
 & \left. - \displaystyle{\lim_{a_2 \rightarrow -\infty}} 
 \frac{1}{2( 1 - \rho^2 )} \frac{ - 2 a_1 + 2 \rho a_2} 
 {\exp \left( \dfrac{a_1^2 - 2 \rho a_1 a_2 + a_2^2}{2(1-\rho^2)} \right)} \right)
 \; d a_1 \nonumber 
 \end{align}
 Applying L'Hospital on the last term leads to
\begin{align} 
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \int_{- \infty}^{x_1} 
 \left( \left( - \frac{ 2 a_1 - 2 \rho x_2 }{ 2( 1 - \rho^2 )} \right) 
 \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 x_2 + x_2^2}{2(1-\rho^2)} \right) 
 - 0 \right)
 \; d a_1 \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \int_{- \infty}^{x_1} 
 \left( - \frac{ 2 a_1 - 2 \rho x_2 }{ 2( 1 - \rho^2 )} \right) 
 \cdot \exp \left( -\frac{a_1^2 - 2 \rho a_1 x_2 + x_2^2}{2(1-\rho^2)} \right) 
 \; d a_1 \label{eq:x22} \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} 
 \left|
 \exp \left( -\frac{a_1^2 - 2 \rho a_1 x_2 + x_2^2}{2(1-\rho^2)} \right) 
 \right|_{-\infty}^{x_1} \label{eq:x11} \\[3ex] 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \frac{1}{2 \pi \sqrt{( 1 - \rho^2)}} \cdot
 \exp \left( -\frac{x_1^2 - 2 \rho x_1 x_2 + x_2^2}{2(1-\rho^2)} \right) \label{eq:x12} \\[3ex]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
=& \phi_2 ( x_1, x_2, \rho ) \label{eq:derivBivrho_final}
\end{align}
This result is in line with \citet{Sibuya1960} and \citet{Sungur1990}.

In the following, we will numerically calculate the derivative
of the cumulative distribution function of the bivariate normal distribution
(equation~\ref{eq:derivBivrho_start}) with respect to~$\rho$
and check whether this partial derivative
is equal to the right-hand sides of equation~(\ref{eq:derivBivrho_final}):
<<>>=
# Numerical gradient of the PDF w.r.t. rho
funrho <- function( p ) {
   prob <- dmvnorm( x = c( x1, x2 ),
      sigma = matrix( c( 1, p, p, 1 ), nrow = 2 ) )
   return( prob )
}
grad <- c( numericGradient( funrho, rho ) )
print( grad )

# Comparison with analytical gradient for rho
efun <- exp(-(x1^2 - 2 * rho * x1 * x2 + x2^2)/(2*(1 - rho^2)))

all.equal( grad, 
( (-((2*rho*(-2*rho*x1*x2+x1^2+x2^2) - 2*x1*x2*(1-rho^2)) * efun)/
    (2*(1-rho^2)^(3/2) )) +
    ((rho*efun)/(sqrt(1-rho^2))) ) /
    (2*pi*(1-rho^2)) )
@
<<echo=FALSE>>=
#Eq??
all.equal(grad,
    (1/(2*pi)) * (
    ((((-4*rho*(x1^2-2*rho*x1*x2+x2^2)-2*(1-rho^2)*(-2*x1*x2))/(4*(1-rho^2)^2)) * 
            efun * sqrt(1-rho^2))/(1-rho^2)) - 
            ((-(rho/(sqrt(1-rho^2)))*efun)/(1-rho^2))
    ))
#Eq??
all.equal(grad,
    (1/(2*pi)) * 
    ((rho/((1-rho^2)^(3/2))) - ((rho*(x1^2-rho*x1*x2+x2^2)-x1*x2)/
            ((1-rho^2)^(5/2)))) * efun
    )

#Eq??
all.equal(grad,
    (1/(2*pi*sqrt(1-rho^2))) * 
    (((rho/(1-rho^2)) - ((rho*(x1^2-rho*x1*x2+x2^2)-x1*x2)/
            ((1-rho^2)^2))) * efun)
    )
@

<<>>=
# Numerical gradient of the CDF w.r.t. rho
cdfRho <- function( p, xa = x1, xb = x2 ) {
   prob <- pmvnorm( upper = c( xa, xb ),
      sigma = matrix( c( 1, p, p, 1 ), nrow = 2 ) )
   return( prob )
}
grad <- c( numericGradient( cdfRho, rho ) )
print( grad )

# comparison with analytical gradient
all.equal( grad, dmvnorm( x = c( x1, x2 ),
      sigma = matrix( c( 1, rho, rho, 1 ), nrow = 2 ) ) )

# comparisons with other values
compDerivRho <- function( xa, xb, p ) {
   dn <- c( numericGradient( cdfRho, p, xa = xa, xb = xb ) )
   da <- dmvnorm( x = c( xa, xb ),
      sigma = matrix( c( 1, p, p, 1 ), nrow = 2 ) )
   return( all.equal( dn, da ) )
}
compDerivRho( x1, x2, rho )
compDerivRho( 0.5, x2, rho )
compDerivRho( 2.5, x2, rho )
compDerivRho( x1, -2, rho )
compDerivRho( x1, x2, 0.2 )
compDerivRho( x1, x2, 0.98 ) 
@

\pagebreak
\section{Gradients of the Log-Likelihood Function}
\subsection{Gradients with respect to the parameters of the selection 
equation~($\vec{\beta}^S$)}

First, we use equation~(\ref{eq:derivBivFinal}), to determine the derivative of 
the bivariate standard normal distribution with respect to the parameter 
$\vec{\beta}^S$ as part of the loglikelihood function:

\begin{align}
& \frac{\partial \Phi_2 \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
    {\sigma}, {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}{\partial \beta^S} 
   = \Phi \left( \frac{\frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma} 
      - \rho {\vec{\beta}^S}' \vec{x}_i^S}{\sqrt{1 - \rho^2}} \right)
      \phi( {\vec{\beta}^S}' \vec{x}_i^S ) \cdot 
      \frac{\partial{\vec{\beta}^S}' \vec{x}_i^S}{\partial \vec{\beta}^S} \\
      & = \Phi \left( \frac{\frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
      {\sigma} + \rho {\vec{\beta}^S}' \vec{x}_i^S}{\sqrt{1 - \rho^2}} \right)
      \phi( {\vec{\beta}^S}' \vec{x}_i^S ) \cdot \vec{x}_i^S
\end{align}

Using this result we can now derive the gradient for $\vec{\beta}^S$ in the 
log-likelihood function:

\begin{align}
& \frac{\partial \ell_i}{\partial \beta^S} = \frac{\partial}{\partial \beta^S} 
    \Bigg( ( 1 - y_i^S )
    \ln \left[ \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) \right] \\
    & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) \ln \left[
      \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right.\nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = ( 1 - y_i^S )
    \frac{\partial}{\partial \beta^S} \Bigg( \ln \left[ \Phi 
    \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) \right] \Bigg) \\
    & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) \frac{\partial}{\partial \beta^S} 
    \Bigg( \ln \left[\Phi_2 \left(
        \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right. \nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = ( 1 - y_i^S )
   \frac{\phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right)
         \cdot \left( - \vec{x}_i^S \right) }{
      \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right)}  \\
    & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m )
      \frac{\frac{\partial \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) }{\partial \beta^S}
         - \frac{\partial \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) }{\partial \beta^S}}
         {\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \nonumber \\
& = ( 1 - y_i^S )
    \frac{ \phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right)
         \cdot \left( - \vec{x}_i^S \right) }{
      \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) } \\
   & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) 
   \frac{ 1 }{
      \Phi_2 \left(
         \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
      -\Phi_2 \left(
         \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \nonumber \\
   & \qquad \left(
      \Phi \left( \frac{
         \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma } 
            + \rho {\vec{\beta}^S}' \vec{x}_i^S }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( {\vec{\beta}^S}' \vec{x}_i^S \right)
      \cdot \vec{x}_i^S \right. \nonumber \\
   & \qquad \qquad - \left. \Phi \left( \frac{
         \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma } 
            + \rho {\vec{\beta}^S}' \vec{x}_i^S) }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( {\vec{\beta}^S}' \vec{x}_i^S \right)
      \cdot \vec{x}_i^S \right) \nonumber \\
& = ( 1 - y_i^S )
    \frac{ \phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right)
         \cdot \left( - \vec{x}_i^S \right) }{
      \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) } 
    \\
   & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) 
   \frac{ \left(
      \Phi \left( \frac{
         \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma } 
            + \rho {\vec{\beta}^S}' \vec{x}_i^S }{ \sqrt{1 - \rho^2} } \right)
      - \Phi \left( \frac{
         \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma } 
            + \rho {\vec{\beta}^S}' \vec{x}_i^S }{ \sqrt{1 - \rho^2} } \right)
      \right)
      \phi \left( {\vec{\beta}^S}' \vec{x}_i^S \right)
      \cdot \vec{x}_i^S \nonumber
   }{
      \Phi_2 \left(
         \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
      -\Phi_2 \left(
         \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \nonumber
\end{align}


\pagebreak

\subsection{Gradients with respect to the parameters in the outcome equation~%
($\vec{\beta}^O$)}

%\frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
%{\vec{\beta}^S}' \vec{x}_i^S
Analogous to $\vec{\beta}^S$ and by using equation~(\ref{eq:derivBivFinal}) we 
derive the gradient of $\vec{\beta}^O$:
\begin{align}
& \frac{\partial \Phi_2 \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
    {\sigma}, {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}{\partial \beta^O} 
    = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \left(-\frac{\vec{x}_i^O}{\sigma} \right)
\end{align}

Using this result we derive the gradient for the outcome parameter 
$\vec{\beta}^O$ for the log-likelihood function:

\begin{align}
\frac{\partial \ell_i}{\partial \beta^O} & = \frac{\partial}{\partial \beta^O} 
    \Bigg( ( 1 - y_i^S )
    \ln \left[ \Phi \left( - {\vec{\beta}^S}' \vec{x}_i^S \right) \right] \\
   & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) \ln \left[
      \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right.\nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m ) \frac{\partial}{\partial \beta^O} 
        \Bigg( \ln \left[\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right. \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m )
      \frac{
         \frac{ \partial \Phi_2 \left(
            \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
               {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         }{ \partial \beta^O }
         - \frac{ \partial \Phi_2 \left(
            \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
               {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         }{ \partial \beta^O }
      }{ \Phi_2 \left(
            \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
               {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         - \Phi_2 \left(
            \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O }{ \sigma },
               {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m ) \cdot
        \frac{ 1 }{
        \Phi_2 \left(
            \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         - \Phi_2 \left(
            \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \\
   & \quad \left(
      \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S
            + \rho \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         \right) \cdot
      \left( -\frac{ \vec{x}_i^O }{ \sigma } \right) \right. \nonumber \\
   & \qquad \left. - \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
            + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         \right) \cdot
      \left(-\frac{ \vec{x}_i^O }{ \sigma} \right) \right) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m ) \cdot
        \frac{ 1 }{
        \Phi_2 \left(
            \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         - \Phi_2 \left(
            \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
            {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \\
   & \quad \left(
      \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S
            + \rho \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( \frac{ \alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         \right) \right. \nonumber \\
   & \qquad \left. - \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
            + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         }{ \sqrt{1 - \rho^2} } \right)
      \phi \left( \frac{ \alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         \right)
      \right) \cdot
      \left( - \frac{ \vec{x}_i^O }{ \sigma} \right) \nonumber
\end{align}

\subsection{Gradients with respect to the coefficient of correlation~($\rho$)}

Given the result that the derivative of the CDF with respect to $\rho$ is equal 
to the PDF (see equation \ref{eq:derivBivrho_final}), we can also derive the 
gradient of the correlation parameter ($\rho$):

\begin{align}
& \frac{\partial \ell_i}{\partial \rho} = \frac{\partial}{\partial \rho} 
    \Bigg( ( 1 - y_i^S ) \ln \left[ \Phi \left( - {\vec{\beta}^S}' 
    \vec{x}_i^S \right) \right] \\
   & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) \ln \left[
      \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right.\nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m ) \frac{\partial}{\partial \rho} 
        \Bigg( \ln \left[\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right. \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m )
      \frac{ \phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)
         - \phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}
         {\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) }
\end{align}


\begin{align}
\frac{\partial \ell_i}{\partial \artanh(\rho)}
& = \frac{\partial \ell_i}{\partial \rho}
   \frac{\partial \rho}{\partial \artanh(\rho)}
= \frac{\partial \ell_i}{\partial \rho}
   \; (1 - \rho^2)
\end{align}

\subsection{Gradients with respect to the standard deviation used for 
normalisation~($\sigma$)}

Finally, we derive the gradient for $\sigma$ in the same way as we did for
$\beta^S$ and $\beta^O$:

\begin{align}
& \frac{\partial \Phi_2 \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
    {\sigma}, {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}{\partial \sigma} \nonumber \\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \frac{{\vec{\beta}^O}' \vec{x}_i^O - 
      \alpha_{m}}{\sigma^2}
\end{align}

\begin{align}
& \lim_{\alpha_m \rightarrow \infty}
\frac{\partial \Phi_2 \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
    {\sigma}, {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}{\partial \sigma}
    \nonumber \\
& = \lim_{\alpha_m \rightarrow \infty}
   \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \frac{{\vec{\beta}^O}' \vec{x}_i^O - 
      \alpha_{m}}{\sigma^2}\\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right)
      \lim_{\alpha_m \rightarrow \infty}
      \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \frac{{\vec{\beta}^O}' \vec{x}_i^O - 
      \alpha_{m}}{\sigma^2}\\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right)
      \lim_{\alpha_m \rightarrow \infty}
      \frac{ \frac{{\vec{\beta}^O}' \vec{x}_i^O - \alpha_{m}}{\sigma^2} }{ 
         \left( \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{
            \sigma} \right) \right)^{-1} } \\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \\
& \qquad \lim_{\alpha_m \rightarrow \infty}
      \frac{ - \frac{1}{\sigma^2} }{ 
         - \left( \phi \left(
               \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
            \right) \right)^{-2} 
         \left( - \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
         \right)
         \phi \left(
               \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
            \right)
         \frac{1}{\sigma}
      } \nonumber \\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right)
   \lim_{\alpha_m \rightarrow \infty}
      \frac{ - \frac{1}{\sigma} }{ 
         \left( \phi \left(
               \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
            \right) \right)^{-1} 
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
      } \\
& = \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right)
   \lim_{\alpha_m \rightarrow \infty}
      \frac{ - \frac{1}{\sigma} 
         \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
            \right) }{ 
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}
      } \\
& = 0
\end{align}

Similarly:
\begin{align}
\lim_{\alpha_m \rightarrow - \infty}
\frac{\partial \Phi_2 \left( \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}
    {\sigma}, {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right)}{\partial \sigma}
= 0
\end{align}

\begin{align}
\frac{\partial \ell_i}{\partial \sigma} & = \frac{\partial}{\partial \sigma} 
    \Bigg( ( 1 - y_i^S ) \ln \left[ \Phi \left( - {\vec{\beta}^S}' 
    \vec{x}_i^S \right) \right] \\
   & \quad + \sum_{m=1}^M y_i^S ( y_i^O = m ) \ln \left[
      \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right.\nonumber \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m ) \frac{\partial}{\partial \sigma} 
        \Bigg( \ln \left[\Phi_2 \left(
        \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
        {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \right. \\
      & \qquad \left. - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) 
   \right] \Bigg) \nonumber \\
& = \sum_{m=1}^M y_i^S ( y_i^O = m )
      \frac{\frac{\partial \Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) }{\partial \sigma}
         - \frac{\partial \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) \Bigg)}{\partial \sigma}}
         {\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \\
& = \sum_{m=1}^M \frac{y_i^S ( y_i^O = m )}{\Phi_2 \left(
         \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) - \Phi_2 \left(
         \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma},
         {\vec{\beta}^S}' \vec{x}_i^S, - \rho \right) } \\
   & \quad \left( 
    \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m+1} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \phi \left( \frac{\alpha_{m+1} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \frac{{\vec{\beta}^O}' 
      \vec{x}_i^O - \alpha_{m+1}}{\sigma^2} \right. \nonumber \\
   & \qquad - \left. \Phi \left( \frac{{\vec{\beta}^S}' \vec{x}_i^S 
      + \rho \frac{\alpha_{m} - {\vec{\beta}^O}' \vec{x}_i^O}{\sigma}}
      {\sqrt{1 - \rho^2}} \right) \phi \left( \frac{\alpha_{m} - {\vec{\beta}^O}' 
      \vec{x}_i^O}{\sigma} \right) \cdot \frac{{\vec{\beta}^O}' 
      \vec{x}_i^O - \alpha_{m}}{\sigma^2} \right) \nonumber
\end{align}

\begin{align}
\frac{\partial \ell_i}{\partial \log(\sigma)}
& = \frac{\partial \ell_i}{\partial \sigma}
   \frac{\partial \sigma}{\partial \log(\sigma)}
= \frac{\partial \ell_i}{\partial \sigma}
   \; \sigma
\end{align}


\section{Example with a Generated Dataset}

\section{Generate Dataset}

<<>>=
library( "mvtnorm" )
# number of observations
nObs <- 300
# parameters
betaS <- c( 1, 1, -1 )
betaO <- c( 10, 4 )
rho <- 0.4
sigma <- 5
# boundaries of the intervals
bound <- c(-Inf,5,15,Inf)

# set 'seed' of the pseudo random number generator
# in order to always generate the same pseudo random numbers
set.seed(123)
# generate variables x1 and x2
dat <- data.frame( x1 = rnorm( nObs ), x2 = rnorm( nObs ) )
# variance-covariance matrix of the two error terms
vcovMat <- matrix( c( 1, rho*sigma, rho*sigma, sigma^2 ), nrow = 2 )
# generate the two error terms
eps <- rmvnorm( nObs, sigma = vcovMat )
dat$epsS <- eps[,1]
dat$epsO <- eps[,2]
# generate the selection variable
dat$yS <- with( dat, betaS[1] + betaS[2] * x1 + betaS[3] * x2 + epsS ) > 0
table( dat$yS )
# generate the unobserved/latent outcome variable
dat$yOu <- with( dat, betaO[1] + betaO[2] * x1 + epsO )
dat$yOu[ !dat$yS ] <- NA
# obtain the intervals of the outcome variable
dat$yO <- cut( dat$yOu, bound )
table( dat$yO )
@

\subsection{Estimation of the Model}

In the following estimation, 
the starting values are obtained by a maximum-likelihood (ML) estimation
of a tobit-2 model, 
where the dependent variable of the outcome equation
is set to the mid points of the intervals:
<<>>=
library( "sampleSelection" )
res <- selection( yS ~ x1 + x2, yO ~ x1, data = dat, boundaries = bound )
res
summary( res )
@


In the following estimation, 
the starting values are obtained by a two-step estimation
of a tobit-2 model, 
where the dependent variable of the outcome equation
is set to the mid points of the intervals:
<<>>=
res2 <- selection( yS ~ x1 + x2, yO ~ x1, data = dat, boundaries = bound, 
   start = "2step" )
res2
summary( res2 )
@

The following commands compare the starting values
and the estimated coefficients:
<<>>=
# compare starting values (small differences)
cbind( res$start, res2$start, res$start - res2$start )

# combare estimated coefficients (virtually identical)
cbind( coef( res ), coef( res2 ), coef( res ) - coef( res2 ) )
@


\section{Example with the `Smoke' dataset}

The following command loads the dataset:
<<>>=
data( "Smoke" )
@

The following command creates a vector with the boundaries of the intervals:
<<>>=
bounds <- c( 0, 5, 10, 20, 50, Inf )
@

The following command estimates the model with few explanatory variables:
<<>>=
SmokeRes1 <- selection( smoker ~ educ + age, 
  cigs_intervals ~ educ, data = Smoke, boundaries = bounds )
@

The following command estimates the model with more explanatory variables:
<<>>=
SmokeRes2 <- selection( smoker ~ educ + age + restaurn, 
  cigs_intervals ~ educ + income + restaurn, data = Smoke, 
  boundaries = bounds )
@

The following commands test 
whether adding further explanatory variables
significantly improves the explanatory power of the model:
<<>>=
library( "lmtest" )
lrtest( SmokeRes1, SmokeRes2 )
waldtest( SmokeRes1, SmokeRes2 )
@
Both tests indicate
that---at 5\%~significance level---%
the model with more explanatory variables (\code{SmokeRes2})
has significantly higher explanatory power
than the model with fewer explanatory variables~(\code{SmokeRes1}).


\bibliographystyle{apalike}
\bibliography{selection}
\end{document}
