%\VignetteIndexEntry{Description of the algorithm}
%\VignettePackage{ifit}
%\VignetteEncoding{UTF-8}
\documentclass[nojss]{jss}
% no need of \usepackage{Sweave}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsfonts,bm}
\usepackage{enumerate}
\usepackage{algorithm,algpseudocode}
\title{Description of the \pkg{ifit} algorithm}
\author{Guido Masarotto\\University of Padova}
\Address{%
Guido Masarotto\\
Department of Statistical Sciences\\
University of Padova, Italy\\
Email: \email{guido.masarotto@unipd.it}
}

\Abstract{%
    This brief document details the algorithm implemented in the \pkg{ifit} package and 
    provides an overview of the optional arguments for the \code{ifit::ifit} function.
}
\Keywords{%
Indirect inference based on intermediate statistics;
Generalized method of moments;
Parametric estimation;
Simulation-based model fitting}


\begin{document}
\maketitle



Package \pkg{ifit} addresses the
problem of fitting a parametric model
$\bigl\{p_{\vartheta}: \vartheta \in R^p\bigr\}$
to some data $y_{obs}$.
It operates under the following assumptions:
\begin{enumerate} 
    \item 
    It is not possible to analytically compute the
    likelihood function, moments, or other quantities typically used for
    statistical inference. However, for any given $\vartheta$, it is
    possible to simulate pseudo-data $y \sim p_\vartheta$ (a toy example
    of such a setting is sketched below).
    \item 
    The only prior
    information available on the parameters consists of \emph{boundary
    constraints}: each parameter $\vartheta_i$ is known to lie within a
    specific interval $[l_i, u_i]$, i.e., it is possible to assume that
    $\vartheta$ belongs to the hypercube $\Theta=\{(\vartheta_1, \ldots,
    \vartheta_p) \in R^p: l_i \le \vartheta_i \le u_i\}$.
\end{enumerate}
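
To fix ideas, the following toy example (purely illustrative; it is not
part of \pkg{ifit}) simulates from a Ricker population model observed
with Poisson noise, a classic case where the likelihood is intractable
but simulation is trivial, and returns a vector of summary statistics.
\begin{Code}
## Toy example (not part of ifit): Ricker dynamics with Poisson
## observations; theta = (r, phi, sigma).
simulate_summaries <- function(theta, n = 100) {
  r <- theta[1]; phi <- theta[2]; sigma <- theta[3]
  N <- numeric(n); N[1] <- 1
  for (t in 2:n)                       # latent population dynamics
    N[t] <- r * N[t - 1] * exp(-N[t - 1] + rnorm(1, 0, sigma))
  y <- rpois(n, phi * N)               # observed counts
  c(mean(y), var(y),                   # q = 5 summary statistics
    acf(y, 2, plot = FALSE)$acf[2:3], mean(y == 0))
}
t_obs <- simulate_summaries(c(exp(3.8), 10, 0.3))
\end{Code}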

The implemented estimator is a special case 
of the class of methods  that \cite{jiang_indirect_2004} term
\emph{indirect inference based on intermediate statistics}. It
can also be viewed as an application of the \emph{simulated generalized
method of moments} \citep{McFadden1989, Pakes1989}. Additionally,  
it is closely related to the work of \cite{cox_fitting_2012} 
on fitting complex parametric models.

We start by defining the score 
$$ u\bigl(y_{obs};\vartheta\bigr) =t_{obs}-\tau\bigl(\vartheta\bigr),$$ 
where $t_{obs}=\Psi(y_{obs})$ is a
vector of $q$ features ($q\ge p$) intended to summarize the evidence
from the data, and
$\tau(\vartheta)=E_{\vartheta}\bigl(\Psi(y)\bigr)$ is its expected value. The
package then aims to find 
$$ \hat\vartheta = \arg \min_{\vartheta \in \Theta}
u'\bigl(y_{obs};\vartheta\bigr) V^{-1} u\bigl(y_{obs};\vartheta\bigr)$$
where $V$ is a positive-definite weighting matrix.
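
For illustration only, a naive Monte Carlo version of this criterion
replaces $\tau(\vartheta)$ by an average over fresh simulations (the
package's actual algorithm, described below, reuses past simulations
instead); here \code{simulate\_t} stands for any user-supplied function
returning $\Psi(y)$ for one draw $y \sim p_\vartheta$.
\begin{Code}
## Naive sketch, not ifit's algorithm: Monte Carlo estimate of
## tau(theta) and the quadratic criterion u' V^{-1} u.
mc_tau <- function(theta, simulate_t, R = 200)
  rowMeans(replicate(R, simulate_t(theta)))
gmm_criterion <- function(theta, t_obs, simulate_t, V, R = 200) {
  u <- t_obs - mc_tau(theta, simulate_t, R)  # score u(y_obs; theta)
  drop(t(u) %*% solve(V, u))                 # quadratic form
}
\end{Code}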

Assuming the model is well specified
(i.e., there exists $\vartheta_0$ such that $y_{obs}\sim p_{\vartheta_0}$), 
it can be shown that, under the usual regularity and identifiability conditions, 
the estimator is consistent for every positive-definite matrix $V$.
However, its efficiency depends on $V$, and the most efficient choice 
for the weighting matrix is $V=\Sigma\bigl(\vartheta_0\bigr)$, 
where $\Sigma\bigl(\vartheta\bigr)=\text{var}_{\vartheta}\bigl(\Psi(y)\bigr)$. 
Since $\vartheta_0$ is unknown, this choice of weighting matrix is not
directly possible. However, it is easy to prove under the usual
assumptions that an estimator asymptotically equivalent to the optimal
one can be obtained by solving the quasi-likelihood equation
\begin{equation}
g(\hat\vartheta) =
J'\bigl(\hat\vartheta\bigr)\Sigma^{-1}\bigl(\hat\vartheta\bigr)u\bigl(y_{obs};\hat\vartheta\bigr) 
=0_p
\label{est:eq}
\end{equation}
where $J\bigl(\vartheta\bigr)$ denotes the Jacobian matrix $J\bigl(\vartheta\bigr)=\partial
\tau(\vartheta)/\partial \vartheta$. 
In addition, 
continuing to assume that the model is well-specified and that all 
necessary regularity conditions hold, 
the dispersion matrix of $\hat\vartheta$ can be estimated by
$
\widehat{\text{var}}\bigl(\hat\vartheta\bigr) =
\Omega\bigl(\hat\vartheta\bigr)^{-1}
$ 
where
$ \Omega\bigl(\vartheta\bigr) =
J'\bigl(\vartheta\bigr)
\Sigma^{-1}\bigl(\vartheta\bigr)
J\bigl(\vartheta\bigr)
$.
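
A minimal R sketch of these quantities, assuming for simplicity that
$\tau(\cdot)$ and $\Sigma(\cdot)$ can be evaluated directly (in
\pkg{ifit} they are instead estimated from simulations, as described
below) and approximating the Jacobian by forward differences:
\begin{Code}
## Sketch only: quasi-score g and information Omega, with tau() and
## Sigma() assumed computable and J approximated by forward differences.
quasi_score <- function(theta, t_obs, tau, Sigma, eps = 1e-4) {
  t0 <- tau(theta)
  J <- vapply(seq_along(theta), function(j) {
    th <- theta; th[j] <- th[j] + eps
    (tau(th) - t0) / eps
  }, t0)
  A <- t(solve(Sigma(theta), J))      # J' Sigma^{-1}
  list(g = drop(A %*% (t_obs - t0)),  # quasi-score, cf. eq. (1)
       Omega = A %*% J)               # var-hat(theta-hat) = solve(Omega)
}
\end{Code}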

The implemented algorithm first performs 
a global search to find a promising starting point.
This is followed by a local search, which refines the solution
using a trust-region version of
the Fisher scoring iteration for solving \eqref{est:eq}.
This basic iteration is defined as
$$ 
\vartheta_{new} = \vartheta_{old} +
\Omega\bigl(\vartheta_{old}\bigr)^{-1}g\bigl(\vartheta_{old}\bigr).
$$
Because $\tau\bigl(\vartheta\bigr)$, $J\bigl(\vartheta\bigr)$ and
$\Omega\bigl(\vartheta\bigr)$ are
unknown, they are approximated through simulations. 
The approach is
sequential. In particular, after step $k$ of the algorithm,
$N_k$ pairs $(\vartheta_i, t_i)$, where $t_i=\Psi(y_i)$ with $y_i
\sim p_{\vartheta_i}$, are available. These simulated values are then used
to decide the location of the next set of simulations, i.e., 
to choose $\vartheta_{N_k+1}, \ldots, \vartheta_{N_{k+1}}$. When needed,
the approximations of $\tau\bigl(\vartheta\bigr)$, $J\bigl(\vartheta\bigr)$, and
$\Omega\bigl(\vartheta\bigr)$ are
computed from these simulated pairs using local regression techniques.
Specifically, a simple kNN average is used to compute the necessary
quantities during the global search. In contrast, during the local
search, they are obtained by fitting linear models on neighborhoods of
the current guess that include a progressively larger number of points.
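
The local-regression idea can be sketched as follows (illustrative
only; \code{thetas} and \code{ts} are assumed to hold the $N_k$
simulated pairs as rows):
\begin{Code}
## Sketch of the local linear fit used in the local search: regress the
## summaries of the L pairs nearest to theta0 on the centered parameters.
local_fit <- function(theta0, thetas, ts, L = 100) {
  d2 <- colSums(((t(thetas) - theta0) / pmax(1, abs(theta0)))^2)
  idx <- order(d2)[seq_len(L)]                  # L nearest pairs
  X <- sweep(thetas[idx, , drop = FALSE], 2, theta0)
  fit <- lm(ts[idx, , drop = FALSE] ~ X)        # multivariate regression
  list(tau = coef(fit)[1, ],                    # intercepts ~ tau(theta0)
       J   = t(coef(fit)[-1, , drop = FALSE]),  # slopes ~ Jacobian J
       V   = crossprod(resid(fit)) / (L - ncol(X) - 1))
}
\end{Code}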

The details are as follows.
\newcommand{\pp}[2]{\text{\texttt{#1}}_{#2}}
\algblock{Start}{End}
\begin{algorithmic}[1]
    \State \textbf{Input} (default values in brackets): $\pp{N}{init}$
    (1000), $\pp{N}{elite}$ (100), $\pp{A}{elite}$ ($0.5$), 
    $\pp{Tol}{global}$ ($0.1$), $\pp{NAdd}{global}$ (100), $\pp{NTot}{global}$
    (20000), $\pp{Rho}{max}$ ($0.1$), $\pp{Lambda}{}$ ($0.1$), $\pp{Tol}{local}$ ($1$), $\pp{NFit}{local}$
    (4000), $\pp{NAdd}{local}$ (10), $\pp{Tol}{model}$ ($1.5$). 

    \State Set $k \gets 0$. 

    \Start\textbf{ Global Search}
    \State \emph{Initialization.}  Set $N_0 \gets \pp{N}{init}$ and draw $\vartheta_1, \ldots,
    \vartheta_{N_{0}}$  in
    $\Theta$ using Latin hypercube sampling. Simulate the
    corresponding summary statistics $t_1, \ldots, t_{N_0}$.
    
    \State \label{glob:begin} \emph{Estimation of $\tau(\vartheta)$ and
    $V$.}
    Estimate $\tau(\vartheta)$ using kNN
    regression (an R sketch of this step is given after the listing).
    Specifically, for $i=1, \ldots, N_k$, calculate the tricube-weighted
    average
    $$ 
    \hat{\tau}_{k, i} =
    \frac{\sum_{r \in \mathcal{N}_{k,i}} w_{k,i,r}\, t_r}
         {\sum_{r \in \mathcal{N}_{k,i}} w_{k,i,r}},
    \qquad
    w_{k,i,r} = \left[1-
        \left(\dfrac{d_G(\vartheta_r,\vartheta_i)}{\overline{d}_{k,i}}
    \right)^3\right]^3,
    $$ 
    where $d_G(\vartheta', \vartheta'')=\sqrt{\sum_{i=1}^p
    [(\vartheta'_i-\vartheta''_i)/(u_i-l_i)]^2}$ and
    $\overline{d}_{k,i}$
    is determined such that the size of the neighbourhood
    $\mathcal{N}_{k,i}=\{r: d_G(\vartheta_r, \vartheta_i) \le \overline{d}_{k,i}\}$ is equal
    to $\lfloor\sqrt{N_k}\rfloor$.
    In addition, set $V_k \gets S_k R_k S_k$ 
    where $S_k$ is the $q \times q$ diagonal matrix containing the Median Absolute
    Deviations (MADs) of the elements of the residuals $t_i-\hat\tau_{k,i}$ 
    (specifically, one MAD for each summary statistic), 
    and $R_k$ is the correlation matrix among the normal scores 
    of the same residuals \citep{gaussian_2012}. 
    
    \State \emph{Elite sample.} Determine the indexes $i_{k,1}, \ldots,
    i_{k,E_k}$ of the
$$E_k=\left\lfloor\pp{N}{elite}+(\pp{N}{init}-\pp{N}{elite})\pp{A}{elite}^{(N_k/\pp{N}{init})^2}\right\rfloor$$
            pairs $(\vartheta_i, t_i)$ with the smallest Mahalanobis distances 
$\bigl(t_{obs}-\hat\tau_{k,i}\bigr)'V_k^{-1}\bigl(t_{obs}-\hat\tau_{k, i}\bigr)$. 
            Then, compute
            \begin{align*}
                M_k &\gets \text{ sample mean of } 
                \vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}},\\ 
                C_k &\gets \text{ sample covariance matrix of } 
                \vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}},
            \end{align*}  
            and denote with $s_{k,1}, \ldots, s_{k, p}$ the corresponding sample
            standard deviations, i.e., the square roots of the diagonal of
            $C_k$. 
    
            \State \emph{Convergence?} If either
            \begin{enumerate}[(i)]
            \item $s_{k, r} < \max\bigl(1,
                |M_{k, r}|\bigr) \pp{Tol}{global}\;\forall r=1,\ldots,p$, or
                \item
                    $N_k = \pp{NTot}{global}$,
            \end{enumerate}  
            exit from the global search and go to \ref{glo:exit}.
            
            \State \emph{Elite sample reproduction.} Set $N_{k+1} \gets
            \min\bigl(N_k+\pp{NAdd}{global},
            \pp{NTot}{global}\bigr)$ and sample $\vartheta_{N_k+1}, \ldots,
            \vartheta_{N_{k+1}}$
            from a mixture of $E_k$ multivariate normal
            distributions (truncated to $\Theta$) with means
            $\vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}}$ 
            and common dispersion matrix $C_k$. Simulate the corresponding
            summary statistics $t_{N_k+1}, \ldots, t_{N_{k+1}}$.
            \State Set $k \gets k+1$ and go to \ref{glob:begin}.     
    \End\textbf{ Global Search}
    
    \State \label{glo:exit}
    Set $L_k \gets \pp{N}{elite}$, $\rho_k \gets \pp{Rho}{max} / 10$ and  
    $\hat\vartheta_k \gets \vartheta_{best}$ where $\vartheta_{best}$  is the ``best point'' sampled
    during the previous phase, i.e., it satisfies
    $$\bigl(t_{obs}-\hat\tau_{k,best}\bigr)'V_k^{-1}
    \bigl(t_{obs}-\hat\tau_{k, best}\bigr)
    = \min_{i=1, \ldots, N_k} 
    \bigl(t_{obs}-\hat\tau_{k, i}\bigr)'V_k^{-1}
    \bigl(t_{obs}-\hat\tau_{k, i}\bigr).$$

    \Start{} \textbf{Local Search}

    \State \label{loc:begin} \emph{Estimation of $g(\hat\vartheta_k)$,
$J(\hat\vartheta_k)$ and $\Omega(\hat\vartheta_k)$.} 
Fit to the $L_k$ pairs $(\vartheta_i, t_i)$ 
closest to $\hat\vartheta_k$, as measured by the distance
$d_L(\vartheta, \hat\vartheta_k)=
\sum_{j=1}^p \bigl[(\vartheta_j-\hat\vartheta_{k,j})/
    \max(1, |\hat\vartheta_{k,j}|)\bigr]^2$,  
the multivariate linear regression model 
$t_i = \tau+B(\vartheta_i-\hat\vartheta_k)+\text{Err}_i$.
Denote with $\hat\tau_k$, $B_k$, $W_k$, $H_k$ the least squares
estimates of $\tau$, $B$, $\text{var}(\text{Err}_i)$ and
$\text{var}(\hat\tau_k)$, respectively. 
Further, set
\begin{align*}
J_k &\gets \begin{cases}
    B_k & \text{if } L_k=\pp{N}{elite} \\
    (1-\pp{Lambda}{})J_{k-1}+\pp{Lambda}{} B_k & \text{otherwise}
\end{cases},\\  
V_k &\gets \begin{cases}
    W_k & \text{if } L_k=\pp{N}{elite} \\
    (1-\pp{Lambda}{})V_{k-1}+\pp{Lambda}{} W_k & \text{otherwise}
\end{cases},\\
    \Omega_k &\gets J'_k V_k^{-1} J_k,\\
    \hat{g}_k &\gets J'_k V_k^{-1}\bigl(t_{obs}-\hat\tau_k\bigr),\\
    \widehat{\text{var}}\bigl(\hat{g}_k\bigr) & \gets
    J'_kV_k^{-1}H_kV_k^{-1}J_k.
\end{align*}  

\State \emph{New guess.} Compute $\tilde\vartheta_k \gets
\hat\vartheta_k+\delta_k$ where $\delta_k$ is the solution
of the linear programming problem
$$ \min_{\delta \in \Delta_k} \sum_{i=1}^p |r_{k,i}(\delta)|,$$ 
where
$r_k(\delta)=\Omega_k \delta-\hat{g}_k$
and $\Delta_k=
\bigl\{\delta \in R^p: \hat\vartheta_k+\delta \in \Theta \text{ and }
|\delta_i| \le \max(1, |\hat\vartheta_{k,i}|)\rho_k\bigr\}
$. A sketch of this linear-programming reformulation is given after the
listing.

\State \emph{Convergence?} 
If $L_k=\pp{NFit}{local}$ and 
$
\hat{g}'_k\widehat{\text{var}}\bigl(\hat{g}_k\bigr)^{-1}\hat{g}_k <
p \pp{Tol}{local}
$ exit from the local search and go to \ref{end:local}.

\State \emph{Sampling near the new guess.}
Set $N_{k+1} \gets N_{k}+\pp{NAdd}{local}$ and draw $\vartheta_{N_k+1},
\ldots, \vartheta_{N_{k+1}}$ uniformly in 
$\bigl\{\vartheta \in \Theta:
(\vartheta-\tilde\vartheta_k)'\Omega_k(\vartheta-\tilde\vartheta_k) \le
1\bigr\}$. Simulate the corresponding summary statistics $t_{N_k+1}, \ldots, t_{N_{k+1}}$.

\State \emph{Accept/reject the guess. Adjust the size of the trust
region.} Compute the differences between the last sampled
summary statistics and their predictions obtained from the current linear
model, $D_i =t_i-\hat\tau_k-B_k(\vartheta_i-\tilde\vartheta_k)$ for
$i=N_k+1,\ldots,N_{k+1}$.
If $$\sum_{i=N_k+1}^{N_{k+1}} D'_i V_k^{-1} D_i < q (N_{k+1}-N_k)\pp{Tol}{model}$$ 
accept the proposal and set
$\hat\vartheta_{k+1} \gets \tilde\vartheta_k$ and $\rho_{k+1} \gets
\min(2\rho_k, \pp{Rho}{max})$. Otherwise, set 
$\hat\vartheta_{k+1} \gets \hat\vartheta_k$ and $\rho_{k+1} \gets \rho_k
/4$.

\State Set $k \gets k+1$, $L_k \gets \min(\pp{NFit}{local},
L_{k-1}+\pp{NAdd}{local})$ and go to \ref{loc:begin}.  
\End{} \textbf{Local Search}

\State \label{end:local}
Set $\hat\vartheta \gets \tilde{\vartheta}_k$ and
$\widehat{\text{var}}(\hat\vartheta)\gets\Omega_k^{-1}$.
\end{algorithmic}
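
The following R fragment sketches the kNN step of the global search
(step 4 above); it is illustrative only, assuming \code{thetas} is the
$N_k \times p$ matrix of sampled parameters already rescaled to
$[0,1]^p$ (so that $d_G$ reduces to the Euclidean distance) and
\code{ts} the $N_k \times q$ matrix of simulated summaries.
\begin{Code}
## Sketch of the kNN estimate of tau and of the robust weighting
## matrix V = S R S (assumes thetas rescaled componentwise to [0,1]).
knn_tau <- function(thetas, ts, k = floor(sqrt(nrow(thetas)))) {
  tau <- ts
  for (i in seq_len(nrow(thetas))) {
    d <- sqrt(colSums((t(thetas) - thetas[i, ])^2))
    dbar <- sort(d)[k]                           # kNN bandwidth
    w <- ifelse(d <= dbar, (1 - (d / dbar)^3)^3, 0)
    tau[i, ] <- colSums(w * ts) / sum(w)         # tricube average
  }
  tau
}
robust_V <- function(e) {                        # e = ts - tau-hat
  S <- diag(apply(e, 2, mad))                    # one MAD per summary
  z <- apply(e, 2,                               # normal scores
             function(x) qnorm((rank(x) - 0.5) / length(x)))
  S %*% cor(z) %*% S
}
\end{Code}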
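
Similarly, the $L_1$ trust-region subproblem of the local search can be
cast as a linear program via variable splitting
($\delta = \delta^+ - \delta^-$) and slack variables bounding the
residuals. Below is a hypothetical sketch using the CRAN package
\pkg{lpSolve}, not the package's actual solver; the box constraints
coming from $\Theta$, omitted here for brevity, would be added as
further rows.
\begin{Code}
## Hypothetical LP reformulation of min_delta sum |Omega delta - g|
## subject to |delta_i| <= bound_i (Theta constraints omitted).
l1_step <- function(Omega, g, bound) {
  p <- nrow(Omega); I <- diag(p); Z <- matrix(0, p, p)
  obj <- c(rep(0, 2 * p), rep(1, p))       # minimize the slacks s
  A <- rbind(cbind( Omega, -Omega, -I),    #  Omega delta - s <=  g
             cbind(-Omega,  Omega, -I),    # -Omega delta - s <= -g
             cbind( I,      I,      Z))    #  dp + dm <= bound
  sol <- lpSolve::lp("min", obj, A, rep("<=", 3 * p), c(g, -g, bound))
  sol$solution[1:p] - sol$solution[(p + 1):(2 * p)]   # delta = dp - dm
}
\end{Code}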

\bibliography{alg}

\end{document}
