% Copyright INRIA
%\documentstyle{book}
%\begin{document}
%\def\verbatok#1{\expandafter \begin{verbatim}
%\input{#1}}
\label{stochastic realization}
\chapter{Stochastic realization}
\index{stochastic!realization}
Let $y_k$ be a wide sense stationary\index{stationary} gaussian~\index{gaussian!process} process with covariance
function $\{R_n;n\in {\Bbb Z} \}$.
It is well-known that $y_k$ may be considered
as the output of a filter $F$ with a white noise $e_t$ as input. The {\em
Stochastic Realization \em} problem for $y_k$ is the construction of an
algorithm giving $F$ ; in addition, $F$ is usually asked to be causal
and minimum delay\index{minimum!delay} (i.e. with causal inverse),
for reasons of realizability. One major consequence of this additional
constraint is that $F$~, when it exists, is minimal and unique,
up to a transformation of coordinates.
The filter $F$ is called the {\em Modeling
Filter \em} \index{modeling filter} and its inverse $F^{-1}$
the {\em Whitening Filter \em} \index{whitening filter}.
Let us consider the following examples, in which the information
on $y_k$ is given two different ways.
First, let be given $S_y(z)$, the spectrum \index{spectrum} of $y_k$.
Then, the whitening filter of $y_k$ is the solution $E(z)$
of the {\em Spectral Factorization \em}\index{spectral!factorization}
problem of $S_y(z)$, that is the solution of :
\begin{eqnarray}
E(z)S_y(z)E^{T}(z^{-1})=I
\label{e.ricc.1}
\end{eqnarray}
such that $E(z)$ and $E^{-1}(z)$ are analytical in the unit disc,
that is that the modeling filter of $y_k$ is causal and minimum
delay. The stochastic realization problem is then the computation
of $E(z)$, given $S_y(e^{i\theta})$. Solutions to this
problem will be given in section \ref{spfact} with direct factorization of
matrix polynomials\index{matrix polynomial} and
in \ref{ricca} via a state-space approach.
Another example is when the covariance function
$R_n=E(y_ky_{k-n}^{T})$ of $y_k$
is given - the information on $y_k$ is then equivalent to that in
the previous example- The whitening filter giving the innovation\index{innovation}, or
prediction error, is obtained by minimizing with respect to the coefficients
$A_k$, the mean square prediction error :
\begin{eqnarray}
E(\| e_t \| ^2)=E\{ \| y_k - \sum_{k>0}A_k y_{t-k}\|^2\}
\label{e.ricc.2}
\end{eqnarray}
The stochastic realization problem is then the computation of the $A_k$
as the solution of the following {\em Yule-Walker \em}, \index{yule-walker}
normal equations :
\begin{equation}
\left[ \begin{array}[c]{ccc}
R_0 & R_1 & R_2 \ldots \\
R_1^T & R_0 & R_1 \ldots \\
R_2^T & R_1^T & R_0 \ldots \\
\vdots
\end{array} \right]
\left[ \begin{array}[c]{c}
A_1 \\A_2 \\A_3 \\ \vdots \end{array} \right]
= \left[ \begin{array}[c]{c}
R_1 \\ R_2 \\ R_3 \\ \vdots \end{array} \right]
\label{e.ricc.3}
\end{equation}
This system being To\"{e}plitz, \index{toeplitz} it may be efficiently solved
using a Levinson-type algorithm ,as will be exposed in section \ref{levin}.
\section{The {\tt sfact} primitive}
\label{spfact}
\index{macro,{\tt spfact}}
Given a matrix polynomial $H(z)$ with size $n \times m$ and rank $n$,
the Scilab function {\tt sfact}
gives the spectral factorization \index{spectral!factorization} of $H(z)$,
that is a matrix
polynomial $D$ such that:
\begin{equation}
H(z)=D(z)D(1/z)z^{n}
\label{e.spfact.1}
\end{equation}
Consider for example the following $2\times 2$ matrix polynomial $a(z)$,
generated from the simulation of a two-dimensional process with
three poles followed by a levinson\index{levinson} filtering (see section
\ref{levin}):
\verbatok{\Diary spfact.dia}
\end{verbatim}
\section{Spectral Factorization via state-space models}
\label{ricca}
As mentioned in the introduction, the stochastic realization
problem may be formulated in terms
of factorization of rational matrices - leading to factoring polynomials -
or in a state-variable framework. In this section, we will concentrate
on the state-space approach, with the underlying
markovian representation following the treatment of \cite{ruckebusch} or that uses
a gaussian spaces\index{gaussian!space} formulation of this problem.
Thus, given $y_k$ a zero-mean, stationary gaussian\index{gaussian!process} process,
we seek after markovian models\index{markovian!model} of the following type:
\begin{equation}
\left\{ \begin{array}{ll}
x_{k+1}=Fx_k + Ju_{k+1}\\
y_{k+1}=Hx_k + Lu_{k+1}
\end{array} \right.
\label{e.ricc.4}
\end{equation}
with the following hypotheses:\\
$x_k$ is q-dimensional.\\
$u_k$ is a q-dimensional, standard, gaussian\index{gaussian!white noise} white noise.\\
$F, H, J, L$ are arbitrary matrices with F asymptotically stable.
Furthermore we shall restrict ourselves to minimal models, that is with
$F$ having minimum dimensions.
\subsection{Spectral Study}
Let $\{R_k; k\in {\Bbb Z}\}$ be the covariance function of $y_k$.
Defining $G$ as :
\begin{eqnarray}
G=E(x_0 y_0^T) \label{e.ricc.5}
\end{eqnarray}
we have that
\begin{eqnarray}
\forall n \geq 1, & R_n=HF^{n-1}G \label{e.ricc.6}
\end{eqnarray}
Letting $Y(z)$ and $U(z)$ be the z-transforms of $y_k$ and $u_k$ respectively,
it follows from (\ref{e.ricc.4}) that :
\begin{eqnarray}
Y(z)=\Gamma (z)U(z)
\label{e.ricc.7}
\end{eqnarray}
where
\begin{eqnarray}
\Gamma (z)=J+Hz(I-Fz)^{-1}L \label{e.ricc.8}
\end{eqnarray}
is a rational matrix without poles in a vicinity of the unit disc.
Thus, the spectrum \index{spectrum} $S_y$ of $y_k$ may be written
in the following factored form:
\begin{eqnarray}
S_y(\theta)=\Gamma (e^{i\theta})\Gamma ^{*}(e^{-i\theta})
& \theta \in [-\pi, \pi]
\label{e.ricc.9}
\end{eqnarray}
where $\Gamma ^{*}$ denotes the transposed conjugate of $\Gamma$.
Furthermore, from (\ref{e.ricc.7}), we can write:
\begin{eqnarray}
U(z)=\Gamma ^{-1}(z)Y(z)
\label{e.ricc.10}
\end{eqnarray}
and when $J>0$ (in the sense of positive matrices) :
\begin{eqnarray}
\Gamma ^{-1}(z)=J^{-1}-J^{-1}Hz(I-(F-LJ^{-1}H)z)^{-1}LJ^{-1}
\label{e.ricc.11}
\end{eqnarray}
so that $u_k$ may be obtained from $y_k$ by a Laurent expansion of
$\Gamma ^{-1}$ in the vicinity of the unit circle, leading to the
whitening filter .
It is worth noting that when $y_k$ is scalar, then
$\Gamma (z)=B(z)/A(z)$, where $A$ and $B$ are coprime polynomials
and $B$ has no zero in a vicinity of the unit disc; in other words,
$y_k$ is an ARMA process \index{arma process}.
%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%
\subsection{The Filter Model}
\label{filter}
\index{filter!model}
Among the many markovian representations of $y_k$, one is of particular
importance, as we shall see later on:
The {\em Filter Model {\em or} Innovations Model} \index{innovations!model}.
To introduce it, we need some definitions: first, let us define
the {\em filter} $\tilde{x}_k$ of $x_k$ as:
\begin{eqnarray}
\tilde{x}_k=E(x_k/ Y^-_{k-1})
\label{e.ricc.12}
\end{eqnarray}
where $Y^-_{k}$
is the gaussian space \index{gaussian!space} generated by the
coordinates of $y_k$, $k\leq n$.
It is the filter of the process $x_k$ by the process $y_k$.
We need also the innovations process\index{innovations!process}
$\tilde{w}_k$ defined as follows :
\begin{eqnarray}
\tilde{w}_k=y_k-E(y_k/Y^-_{k-1})
\label{e.ricc.13}
\end{eqnarray}
$\tilde{w}_k$ is a stationary, gaussian white noise with covariance
$\tilde{R}$. From $\tilde{w}_k$ the standard gaussian white noise
$\tilde{u}_k$ may be obtained as: $\tilde{u}_k=\tilde{R}^{-1/2}\tilde{w}_k$.\\
We are now able to introduce the innovations model:
\begin{equation}
\left\{ \begin{array}{ll}
\tilde{x}_{k+1}=F\tilde{x}_k + T\tilde{w}_{k+1}\\
y_{k+1}=H\tilde{x}_k + \tilde{w}_{k+1}
\end{array} \right.
\label{e.ricc.14}
\end{equation}
where
\begin{eqnarray}
T=E(x_k\tilde{w}_k^T)\tilde{R}^{-1}
\label{e.ricc.15}
\end{eqnarray}
From (\ref{e.ricc.14}), we get the following model too :
\begin{equation}
\left\{
\begin{array}{ll}
\tilde{x}_{k+1}=F\tilde{x}_k + T\tilde{R}^{1/2}\tilde{u}_{k+1}\\
y_{k+1}=H\tilde{x}_k + \tilde{R}^{1/2}\tilde{u}_{k+1}
\end{array} \right.
\label{e.ricc.16}
\end{equation}
which is of the desired type (\ref{e.ricc.4}).
The transfer function matrix $\tilde{\Gamma }(z)$ of the model (\ref{e.ricc.16})
writes :
\begin{eqnarray}
\tilde{\Gamma }(z)=[I+Hz(I-Fz)^{-1}T]\tilde{R}^{1/2}
\label{e.ricc.17}
\end{eqnarray}
and is a maximal factorization of the spectral density\index{spectral!density} of $y_k$,
known as the {\em minimum-delay factorization}
\index{minimum!delay!factorization}. One consequence of this fact
is that the innovation\index{innovation} may be calculated as :
\begin{eqnarray}
\tilde{w}_k=y_{k+1}-H\tilde{x}_k
\label{e.ricc.18}
\end{eqnarray}
The importance of the filter model lies
in that {\em all the minimal markovian\index{markovian!representation} representations of $y_k$ have the same filter}, so that the problem we are faced with is to find this filter,
given the statistics of $y_k$. For this reason of uniqueness,
we will say that $\tilde{x}_k$ is {\em the} filter of $y_k$.
\section{Computing the solution}
We assume now that we could get in some way the covariance sequence of the
process $y_k$.
The models we consider being minimal with dimension $q$,
it is a well-known fact in automatic control that the observability matrix
\index{observability matrix}:
\begin{equation}
O=\left[ \begin{array}{c}H\\HF\\\vdots\\HF^{q-1}\end{array}\right]
\label{e.ricc.19}
\end{equation}
has its rank equal to $q$ and is defined up to a similarity transformation;
thus the pair $(H, F)$ is unique in the same sense.
For evident reasons G, defined in (\ref{e.ricc.5}), shares the same property.
Thus we can conclude that the triple $(H, F, G)$ is unique up
to a similarity transformation. It may be easily obtained from the empirical
covariance function $\{R_k\} $ with algorithms such that Ho's \cite{ho}
or Rissanen's \cite{rissanen}.
It is this point that we shall investigate now.
\subsection{Estimation of the matrices H F G}
Numerous algorithms exist for solving this problem and we shall focus
on just one of them: the so-called {\em Principal Hankel Component} (PHC)
\index{principal hankel component}
\cite{kung}
approximation method, which is singular value decomposition (SVD) based
\index{singular value decomposition}.
It is as follows: from the covariance sequence, form the Hankel matrix:\\
\index{hankel}
\begin{equation}
\cal{H}_{M,N}=\left[ \begin{array}[c]{ccccc}
R_0 & R_1 & R_2 & \ldots & R_{N-1}\\
R_1 & R_2 & R_3 & \ldots & R_{N}\\
R_2 & R_3 & R_4 & \ldots & R_{N+1}\\
\vdots & \vdots & \vdots & & \vdots\\
R_{M-1} & R_{M} & R_{M+1} & \ldots & R_{M+N-2} \end{array} \right]
\label{e.ricc.20}
\end{equation}
The reason for the wide use of this matrix is the Kronecker's
theorem which says that its rank is theoretically equal to the order of the
associated system, i.e. equal to the dimension of state-vectors
of minimal state-space realizations.
Defining the matrix $C$ as :
\begin{equation}
C=\left[ \begin{array}{cccc}
G & FG & \ldots & F^{q-1}G\end{array}\right]
\label{e.ricc.21}
\end{equation}
we have that :
\begin{eqnarray}
\cal{H}_{M,N}=OC
\label{e.ricc.22}
\end{eqnarray}
Now, from the observability matrix $O$, define the two following matrices:
\begin{equation}
O^{'}=\left[ \begin{array}{c}H\\HF\\ \vdots\\ HF^{q-2}\end{array}\right]
\label{e.ricc.23}
\end{equation}
and
\begin{equation}
O^{\uparrow}=\left[ \begin{array}{c}HF\\ \vdots \\HF^{q-1}\end{array}\right]
\label{e.ricc.24}
\end{equation}
It is straightforward that:
\begin{eqnarray}
O^{\uparrow}=O^{'}F
\label{e.ricc.25}
\end{eqnarray}
so that the matrix $F$ is obtained as the least-squares solution of
(\ref{e.ricc.25}). $H$ is obtained as the first bloc-row of $O$ and $G$
as the first bloc-column of $C$ : this is the PHC approximation method.
Numerically, the factorization (\ref{e.ricc.22}) is done via singular-value
decomposition:
\begin{eqnarray}
\cal{H}_{M,N}=USV^T
\label{e.ricc.26}
\end{eqnarray}
$O$ and $C$ are obtained as :
\begin{eqnarray}
O=US^{1/2} & & C=S^{1/2}V^{T}
\label{e.ricc.27}
\end{eqnarray}
\paragraph{The {\tt phc} macro}
This macro implements the preceding algorithm to find the triple
$(H, F, G)$. In the following example, a 64-point length covariance sequence
has been generated for a two-dimensional process, the first
component of which is the sum of two sinusoids with discrete frequencies
$\pi/10$ and $2\pi/10$, while the second component is the sum of
two sinusoids with frequencies $\pi/10$ and $1.9\pi/10$, both
being in additive, gaussian white noise. This is
done as follows:
\verbatok{\Diary simul1.dia}
\end{verbatim}
Then the Hankel matrix $\cal{H}_{M,N}$ is built with the
function {\tt hank}.
Finally, the Scilab function {\tt phc} gives the desired triple $(H, F, G)$.
\subsection{computation of the filter}
Now, we have obtained the triple $(H, F, G)$ and we shall
investigate the matrices $T$ and $\tilde{R}$ that have still
to be computed to completely determine the filter model (\ref{e.ricc.16}).
From this model, let us compute the convenient covariances:\\
\begin{eqnarray}
R_0=H\tilde{P}H^T+\tilde{R}
\label{e.ricc.28}
\end{eqnarray}
\begin{eqnarray}
G=F\tilde{P}H^T+T\tilde{R}
\label{e.ricc.29}
\end{eqnarray}
\begin{eqnarray}
\tilde{P}=F\tilde{P}F^T+T\tilde{R}T^T
\label{e.ricc.30}
\end{eqnarray}
from which the following relations hold:
\begin{eqnarray}
\tilde{R}=R_0-H\tilde{P}H^T
\label{e.ricc.31}
\end{eqnarray}
\begin{eqnarray}
T=(G-F\tilde{P}H^T)(R_0-H\tilde{P}H^T)^{-1}
\label{e.ricc.32}
\end{eqnarray}
Noting that $\tilde{R}$ and $T$ depend solely on $\tilde{P}$
and supposing that $\tilde{R}$ is positive, we can write after
elimination of $\tilde{R}$ between (\ref{e.ricc.28}), (\ref{e.ricc.29}) and (\ref{e.ricc.30}):
\begin{eqnarray}
\tilde{P}=F\tilde{P}F^T+(G-F\tilde{P}H^T)(R_0-H\tilde{P}H^T)^{-1}(G^T-H\tilde{P}F^T)
\label{e.ricc.33}
\end{eqnarray}
which is the well-known {\em algebraic Riccati equation}
\index{riccati equation}. A matrix
$\tilde{P}$ is called a solution of the Riccati equation if it is
positive definite, such that $R_0-H\tilde{P}H^T>0$ and satisfies equation (\ref{e.ricc.33}). Although this equation has several solutions,
the following result gives an answer to our problem: {\em the covariance}
$\tilde{P}$ of the filter is the minimal solution of the algebraic
Riccati equation. We shall give now two algorithms
giving this minimal solution : the Faurre algorithm \index{Faurre algorithm}
and the Lindquist algorithm \index{Lindquist algorithm}.
\paragraph{The Faurre algorithm} \cite{faurre}:
in this method, the solution $\tilde{P}$ is obtained as the growing limit
of the sequence of matrices $P_N$ such that:
\begin{equation}
P_{N+1}=FP_NF^T+(G-FP_NH^T)(R_0-HP_NH^T)^{-1}(G^T-HP_NF^T)
\label{e.ricc.34}
\end{equation}
with the initial condition:
\begin{equation}
P_0=GR_0^{-1}G^T
\label{e.ricc.35}
\end{equation}
Setting $P_0=0$ is allowed too for this leads to $P_1=GR_0^{-1}G^T$
hence to a simple delay in the iterations.
%The matrices $T_N$ and $R_N$ are obtained by the following recursions:
%\begin{equation}
%R_N=R_0-HP_NH^T
%\label{e.ricc.36}
%\end{equation}
%\begin{equation}
%T_N=(G-FP_NH^T)(R_0-HP_NH^T)^{-1}
%\label{e.ricc.37}
%\end{equation}
To conclude, having obtained $\tilde{P}$ via the recursion (\ref{e.ricc.34}),
the matrices $\tilde{R}$ and $T$ are computed with equations
(\ref{e.ricc.31}) and (\ref{e.ricc.32}) respectively.
This is done with the macro {\tt srfaur}. The recursion for $P_N$
is not implemented as in equation (\ref{e.ricc.34}) for it may not converge,
especially when the matrix $F$ has poles near the unit circle. To overcome
this difficulty, a factored form, Chandrasekhar\index{Chandrasekhar}
type recursion has
been implemented, leading to the sought after solution even in the
precedent defavourable situation\footnote{A mistake was found in the
initialization of the algorithm and could not be corrected in time for
the present document}.
The Scilab function {\tt srfaur} implements this sqare-root algorithm.
Finally, the filter and the
corresponding estimated output are calculated with the help
of the model (\ref{e.ricc.16}) by using the macro {\tt ltitr}
which simulates linear systems. To summarize, the preceding example
has been generated the following way:
\verbatok{\Diary simul3.dia}
\end{verbatim}
\paragraph{The Lindquist algorithm} \cite{faurre}:
This algorithm makes use of the fact that
$\tilde{R}$ and $T$ may be computed from $K=\tilde{P}H^T$ instead
of $\tilde{P}$ itself, leading to substantial computational
savings when the observation has a much
lower dimension that the state, the most frequent situation.
Refering the reader to \cite{faurre} for the derivations, we give now
the algorithm itself:
\begin{equation}
\left\{
\begin{array}{l}
K_{N+1}=K_N+ \Gamma _N \tilde{R}^{-1}_N \Gamma ^T_N H^T \\
\Gamma_ {N+1}= [F-(G-FK_N)(R_0-HK_N)^{-1}H]\Gamma _N \\
\tilde{R}_{N+1}=\tilde{R}_N-\Gamma ^T_NH^T(R_0-HK_N)^{-1}H\Gamma _N
\label{e.ricc.36}
\end{array}
\right.
\end{equation}
with the initial conditions:
\begin{eqnarray}
K_0=0 & \Gamma _0=G & \tilde{R}_0=R-0
\label{e.ricc.37}
\end{eqnarray}
Then the expression for $T$ is the following:
\begin{eqnarray}
T=(G-FK)(R_0-HK)^{-1}
\label{e.ricc.38}
\end{eqnarray}
and {\tt lindquist} is the corresponding Scilab function.
\section{Levinson filtering}
\label{levin}
\index{levinson filters!lattice filters}
%\cite{anderson}
We still consider here $y_k$, a stationary, vector-valued time-series
\index{time series},
from which we have available a sample $y_{k-1}, y_{k-2},
\ldots , y_{k-N}$; the scope here is to estimate $y_k$ from this
sample, by some $\hat{y}_k$, say, which may be written then
as a linear combination of the $y_{k-j}$, $j=1, \ldots , N$:
$\hat{y}_k=\sum_{j=1}^{N}A^N_j y_{k-j}$.
As mentioned in the introduction,
this problem may be stated as a stochastic realization problem:
attempting to minimize the mean-square prediction error of order
$N$ at time $k$:
\begin{eqnarray}
E(\| e_k(N) \| ^2)=E\{ \| y_k - \sum_{j=1}^NA^N_j y_{k-j}\|^2\}
\label{e.lev.1}
\end{eqnarray}
the filter coefficients $A^N_j$, where the superscript $N$
indicates the dependence on the sample length,
are found to be the solutions of the
following To\"{e}plitz system \index{toeplitz}:
\begin{equation}
\left[ \begin{array}[c]{ccccc}
R_0 & R_1 & R_2 & \ldots & R_{N-1}\\
R_1^T & R_0 & R_1 & \ldots & R_{N-2}\\
R_2^T & R_1^T & R_0 & \ldots & R_{N-3}\\
\vdots & \vdots & \vdots & \ddots & \vdots \\
R_{N-1}^T & R_{N-2}^T & R_{N-3}^T & \ldots & R_0
\end{array} \right]
\left[ \begin{array}[c]{c}
A_1 \\A_2 \\A_3 \\ \vdots \\A_N \end{array} \right]
= \left[ \begin{array}[c]{c}
R_1 \\ R_2 \\ R_3 \\ \vdots \\R_N \end{array} \right]
\label{e.lev.2}
\end{equation}
The mean-square error $\Sigma_N$ is easily calculated:
\begin{eqnarray}
\Sigma_N&=&E((y_k-\hat{y}_k)(y_k-\hat{y}_k)^T) \nonumber \\ \\
&=&E((y_k-\hat{y}_k)y^T_k) \nonumber \\ \\
&=&E(y_ky^T_k)-\sum_{j=1}^N A^N_{j}E(y_{k-j}y^T_k) \nonumber \\
&=&R_0-\sum_{j=1}^N A^N_{j}R_{-j}
\label{e.lev.3}
\end{eqnarray}
where the second equality holds from the orthogonality principle
between the estimate and the prediction error.
Classically, $\Sigma_N$ is taken as a measure of the quality of the
estimation, since it is a monotone decreasing function of $N$
(the longer the sample, the better the estimation). One may for example
compare it to a preassigned mean-square estimation error. So, the
problem is to find a procedure which calculates, successively for each $N$,
the coefficients $A^N_j$ and $\Sigma_N$. An answer is given with
Levinson-type\index{levinson} algorithms, an important property of which is
their recursivity in the order: one particularity, relative to
other input-output representations such as state-space models
or rational matrices, is that in this approach the filter
structure - its order for example- is considered as a parameter
as well as any other, leading to the idea of lattice filter and
to cascade-type realizations of transfer functions.
Let us describe now the Levinson algorithm in the case of a vector-valued
time-series, noting that in the scalar case the backward prediction
error is no use because $R_{-k}=R_k$, leading to simpler derivations,
albeit similar. For the exposition we shall make use of a Hilbert space
approach, following the treatment given in \ref{levalgo}.
\subsection{The Levinson algorithm}
\label{levalgor}
Suppose we are given the vector-valued time-series $y_k$.
We begin with some definitions:\\
$e_k(N)$ and $f_k(N)$ will denote respectively the forward and backward
prediction errors of order $N$ at time k:
\begin{eqnarray}
e_k(N)=y_k-\sum_{j=1}^N A^N_jy_{k-j}
\label{e.lev.4}
\end{eqnarray}
\begin{eqnarray}
f_k(N)=y_{k-N}-\sum_{j=1}^N B^N_jy_{k-N+j}
\label{e.lev.5}
\end{eqnarray}
with the convention that: $e_k(0)=f_k(0)=y_k$.
We shall need also the following linear space:
$Y^q_p=span\{y_p, \ldots , y_q\}$.
In the present geometrical approach, the covariance $E(xy^T)$ of
$x$ and $y$ will be noted $[x,y]$ (scalar product) and if
$E(xy^T)=0$, we shall write $x\; \bot \; y$ (x orthogonal to y), as well as
$A\; \bot \; B$ for two orthogonal linear spaces.
In view of these definitions, the following relations hold:
\begin{eqnarray}
e_k(N)\in Y^k_{k-N} & \mbox{and} & e_k(N) \; \bot \; Y^{k-1}_{k-N}
\label{e.lev.6}
\end{eqnarray}
\begin{eqnarray}
f_k(N)\in Y^k_{k-N} & \mbox{and} & f_k(N) \; \bot \; Y^{k}_{k-N+1}
\label{e.lev.7}
\end{eqnarray}
From (\ref{e.lev.4}), we get:
\begin{eqnarray}
e_k(N+1)-e_k(N)\in Y^{k-1}_{k-N-1}
\label{e.lev.8}
\end{eqnarray}
while $e_k(N+1) \; \bot \; Y^{k-1}_{k-N-1}$ and $e_k(N) \; \bot \; Y^{k-1}_{k-N}$
imply that:
\begin{eqnarray}
e_k(N+1)-e_k(N) \; \bot \; Y^{k-1}_{k-N}
\label{e.lev.9}
\end{eqnarray}
Recalling (\ref{e.lev.7}), relations (\ref{e.lev.8}) and (\ref{e.lev.9})
caracterize the space spanned by $f_{k-1}(N)$; hence there exists a
matrix $K_N$ such that:
\begin{eqnarray}
e_k(N+1)-e_k(N)=-K_Nf_{k-1}(N)
\label{e.lev.10}
\end{eqnarray}
$K_N$ is determined with the help of the orthogonality condition:
\begin{eqnarray}
e_k(N+1) \; \bot \; y_{k-N-1}
\label{e.lev.11}
\end{eqnarray}
this relation implies that:
\begin{equation}
\begin{array}{l}
[e_k(N+1),y_{k-N-1}] \\
=[e_k(N),y_{k-N-1}] \; -K_N.[f_{k-1}(N),y_{k-N-1}] \\=0
\end{array}
\label{e.lev.12}
\end{equation}
hence giving:
\begin{eqnarray}
K_N=[e_k(N),y_{k-N-1}][f_{k-1}(N),y_{k-N-1}]^{-1}
\label{e.lev.13}
\end{eqnarray}
We recognize the second scalar product as the backward
mean-square error $\Gamma_N$.
Relations for the backward prediction error may be obtained the same way;
they are:
\begin{eqnarray}
f_k(N+1)-f_{k-1}(N)\in Y^{k}_{k-N} \; \mbox{and} \; \bot \; Y^{k-1}_{k-N}
\label{e.lev.14}
\end{eqnarray}
which caracterize the space spanned by $e_k(N)$; thus there exists
a matrix $K^{*}_N$ such that:
\begin{eqnarray}
f_k(N+1)-f_{k-1}(N)=-K^{*}_Ne_{k}(N)
\label{e.lev.15}
\end{eqnarray}
and determined by the orthogonality condition:
\begin{eqnarray}
f_k(N+1) \; \bot \; y_k
\label{e.lev.16}
\end{eqnarray}
which leads to:
\begin{eqnarray}
K^{*}_N=[f_{k-1}(N),y_{k}][e_{k}(N),y_{k}]^{-1}
\label{e.lev.17}
\end{eqnarray}
Here too the second scalar product is seen to be the
forward mean-square error $\Sigma_N$.
Relations (\ref{e.lev.10}), (\ref{e.lev.13}), (\ref{e.lev.15}),
(\ref{e.lev.17}) give the sought after recursions; their
lattice structure may be explicited with the help of the following
matrix polynomials:
\begin{eqnarray}
A_N(z)=I-\sum_{j=1}^N A^N_jz^j\\
B_N(z)=z^NI-\sum_{j=1}^NB^N_jz^{N-j}
\label{e.lev.18}
\end{eqnarray}
and the covariance matrices: $R_n=[y_k,y_{k-n}]$, from which
$K_N$ and $K^{*}_N$ may be expressed:
\begin{eqnarray}
K_N&=&(R_{N+1}-\sum_{j=1}^NA^N_jR_{N+1-j})(R_0-\sum_{j=1}^NB^N_jR_j)^{-1}
\nonumber \\
&=&\beta_N \Gamma^{-1}_N
\label{e.lev.19}
\end{eqnarray}
\begin{eqnarray}
K^{*}_N&=&(R_{-N-1}-\sum_{j=1}^NB^N_jR_{-N-1+j})(R_0-\sum_{j=1}^NA^N_jR_{-j})^{-1}
\nonumber \\
&=&\alpha_N \Sigma^{-1}_N
\label{e.lev.20}
\end{eqnarray}
with evident definitions.
The last recursion to be given is that for $\Sigma_N$, for
which we will use the expressions of $K_N$ (\ref{e.lev.19})
and $K^{*}_N$ in (\ref{e.lev.20}),
and the definition of $\Sigma_N$:
\begin{eqnarray}
\Sigma_N=[e_k(N),y_k]
\label{e.lev.21}
\end{eqnarray}
Then we can compute $\Sigma_{N+1}$ as follows:
\begin{eqnarray}
\Sigma_{N+1}&=&[e_k(N+1),y_k] \nonumber \\
&=&[e_k(N)-\beta_N\Gamma^{-1}_Nf_{k-1}(N),y_k] \nonumber \\
&=&[e_k(N),y_k] - \beta_N\Gamma^{-1}_N[f_{k-1}(N),y_k] \nonumber \\
&=&\Sigma_N- \beta_N\Gamma^{-1}_N\alpha_N
\label{e.lev.22}
\end{eqnarray}
In the same fashion, $\Gamma_{N+1}$ will be written as:
\begin{eqnarray}
\Gamma_{N+1}=\Gamma_N- \alpha_N\Sigma^{-1}_N\beta_N
\label{e.lev.23}
\end{eqnarray}
To summarize,the algorithm is as follows:
\\
\begin{equation}
\left\{
\begin{array}{l}
\left[ \begin{array}{c}
A_{N+1}(z) \\
B_{N+1}(z) \end{array} \right]=\left[ \begin{array}{cc}
I & -K_N\; z \\
-K^{*}_N & z\; I
\end{array} \right]
\left[ \begin{array}{c}
A_{N}(z) \\
B_{N}(z)
\end{array} \right] \\ \\
K_N=\beta_N \Gamma^{-1}_N \\ \\
K^{*}_N=\alpha_N \Sigma^{-1}_N \\ \\
\Sigma_{N+1}=\Sigma_N- \beta_N\Gamma^{-1}_N\alpha_N \\ \\
\Gamma_{N+1}=\Gamma_N- \alpha_N\Sigma^{-1}_N\beta_N \\ \\
\alpha_N=(R_{-N-1}-\sum_{j=1}^NB^N_jR_{-N-1+j}) \\ \\
\beta_N=(R_{N+1}-\sum_{j=1}^NA^N_jR_{N+1-j})
\end{array}
\right.
\label{e.lev.24}
\end{equation}
with the initial conditions:
\begin{equation}
\left\{
\begin{array}{l}
\alpha_0=R_{-N-1};
\beta_0=R_{N+1}\\
\Sigma_0=\Gamma_0=R_0;
A_0(z)=B_0(z)=I
\end{array}
\right.
\label{e.lev.25}
\end{equation}
%To obtain the recursion for $\Sigma_N$, it is worth noting that
%$\Sigma_N$ appears in matrix $K^{*}_N$, so that with an evident definition,
%$K^{*}_N$ may be written as:
%\begin{eqnarray}
%K^{*}_N=\alpha_N \Sigma^{-1}_N
%\label{e.lev.20}
%\end{eqnarray}
%In the same way and with evident definitions too, $K_N$
%(\ref{e.lev.19} may be written as:
%\begin{eqnarray}
%K_N=\beta_N \Gamma^{-1}_N
%\label{e.lev.21}
%\end{eqnarray}
%From:
%\begin{eqnarray}
%\Sigma_N=[e_k(N),y_k]
%\label{e.lev.22}
%\end{eqnarray}
%we can compute $\Sigma_{N+1}$:
%\begin{eqnarray}
%\Sigma_{N+1}&=&[e_k(N+1),y_k]\\
%&=&[e_k(N)-\beta_N\Gamma^{-1}_Nf_{k-1}(N),y_k]\\
%&=&[e_k(N),y_k] - \beta_N\Gamma^{-1}_N[f_{k-1}(N),y_k]\\
%&=&\Sigma_N- \beta_N\Gamma^{-1}_N\alpha_N
%\label{e.lev.23}
%\end{eqnarray}
%In the same fashion, $\Gamma_{N+1}$ will be written as:
%\begin{eqnarray}
%\Gamma_{N+1}=\Gamma_N- \alpha_N\Sigma^{-1}_N\beta_N
%\label{e.lev.24}
%\end{eqnarray}
%This ends the derivation of the Levinson recursions
%for the coefficients $A^N_j$ and for the mean-square error $\Sigma_N$.
The corresponding Scilab function is {\tt levin}.
%\end{document}