% Copyright INRIA
\chapter{Optimal Filtering and Smoothing}
\index{optimal filtering and smoothing}
\section{The Kalman Filter}
\index{Kalman filter}
Consider the following discrete-time system,
%
\begin{eqnarray}
x_{k+1}&=&F_kx_k+G_kw_k\nonumber\\
y_k&=&H_kx_k+v_k
\label{e.kf1}
\end{eqnarray}
%
where
%
\begin{eqnarray}
w_k&\sim&N(0,Q_k)\nonumber\\
v_k&\sim&N(0,R_k)\nonumber\\
x_0&\sim&N(m_0,\Pi_0)\nonumber\\
\label{e.kf1a}
\end{eqnarray}
%
and $x_0$, $\{w_k\}$, and $\{v_k\}$ are independent random vectors.
The state vector, $x_k$, has dimension $N$ and the observation
vector, $y_k$, has dimension $M$. Furthermore, it is assumed that
$R_k>0$.
The problem to be addressed here is the estimation
of the state vector, $x_k$, given observations of the vectors
$Y_k=\{y_0,y_1,\ldots,y_k\}$. Because the collection of variables
$\{x_k,y_0,y_1,\ldots,y_k\}$ are jointly Gaussian
one could estimate $x_k$ by maximizing the likelihood
of the conditional probability distribution $p(x_k|Y_k)$
given the values of the conditional variables. Equivalently,
one could search the estimate, $\hat{x}_k$, which minimized the
mean square error, $\epsilon_k=x_k-\hat{x}_k$. In either case it
is known that the optimal estimate (maximum likelihood or least
squares) for the jointly Gaussian variables is the conditional mean.
The error in the estimate is the conditional covariance.
In what follows, the conditional mean and covariance
of $x_k$ given $Y_k$ is developed. This is followed by
a description of the Kalman filter, an extremely practical
recursive method for calculating the conditional mean and covariance.
Several different implementations of the Kalman filter are discussed
here: The steady-state Kalman filter which can be used when
the system matrices in (\ref{e.kf1}) and (\ref{e.kf2}) are non-time varying,
the non-stationary Kalman filter for use when the system matrices in
(\ref{e.kf1}) and (\ref{e.kf2})
are time-varying, and, finally, the square-root
Kalman filter which is used (for time-varying and non-time-varying
system matrices) when greater numerical accuracy is required.
\subsection{Conditional Statistics of a Gaussian Random Vector}
\index{Gaussian random vectors!conditional statistics}
The minimum mean square estimate of a
Gaussian random vector given observations of some of its
elements is the conditional mean of the remaining elements.
The error covariance of this estimate is the conditional covariance.
Consequently, assuming that $z$ is a Gaussian random vector
composed of two sub-vectors $x$ and $y$, then
%
\begin{equation}
z=\left[\begin{array}{c}x\\y\end{array}\right]
\sim
N\left(\left[\begin{array}{c}m_x\\m_y\end{array}\right],
\left[\begin{array}{cc}\Lambda_{x}&\Lambda_{xy}\\
\Lambda_{yx}&\Lambda_{y}\end{array}
\right]\right)
\label{e.kf2}
\end{equation}
%
where $m_x$ denotes the mean of $x$, $\Lambda_{xy}$ denotes the
covariance of $x$ with $y$, and $\Lambda_x$ is the covariance of $x$
with itself.
It is known that the marginal and conditional distributions
of a Gaussian random vector are also Gaussian. In particular,
the conditional distribution of $x$ given $y$, $p(x|y)$, is
%
\begin{equation}
p(x|y)=N(m_{x|y},\Lambda_{x|y})
\label{e.kf3}
\end{equation}
%
where $m_{x|y}$ denotes the conditional mean and $\Lambda_{x|y}$
denotes the conditional covariance. These two quantities may be
calculated by
%
\begin{eqnarray}
m_{x|y}=m_x+\Lambda_{xy}\Lambda_y^{-1}(y-m_y)\nonumber\\
\Lambda_{x|y}=\Lambda_x-\Lambda_{xy}\Lambda_y^{-1}\Lambda_{yx}.
\label{e.kf4}
\end{eqnarray}
%
(It is useful to note the $x$ and $y$ are not necessarily of
the same dimension). Equation (\ref{e.kf4}) is very important
in our development of the Kalman filter equations.
With regard to the problem posed in the introduction
to this section, the minimum mean square error is
calculated in a straight forward manner. One stacks the
individual observation vectors into a single vector, $Y_k$, and then,
since $x_k$ and $Y_k$ are jointly Gaussian , one applies
(\ref{e.kf4}) to obtain the conditional mean and covariance
of $x_k$ given $Y_k$. The problem with this approach is that for
increasing $k$ the vector $Y_k$ is of increasing dimension, and
consequently, the matrix inverse and multiplication operations in
(\ref{e.kf4}) become increasingly burdensome.
The next few sections are devoted to showing how
the linear system of (\ref{e.kf1}) and the special properties
of (\ref{e.kf4}) can be used to obtain a recursive update to the
estimation of $x_k$. That is, given the best estimate of $x_k$
based on the observations $Y_k$ (we denote this estimate by
$\hat{x}_{k|k}$) and a new observation $y_{k+1}$, it is shown
how to obtain the best estimate of $\hat{x}_{k+1|k+1}$ and its
error covariance matrix $P_{k+1|k+1}$.
\subsection{Linear Systems and Gaussian Random Vectors}
\index{Gaussian random vectors!filtered by linear systems}
Given a random vector, $x$, with a probability
distribution $p(x)$, the minimum mean square error estimate of
$x$ is denoted here by $\hat{x}$ and consists of the mean of
$x$. That is, $\hat{x}=m_x$.
The associated error covariance of the estimate, denoted
$P_x$, is the covariance of $x$. Now assume that $x$ is passed
through a linear system and is disturbed by an independent,
zero-mean Gaussian vector, $v$, of covariance $R$. This yields an
output which can be represented by
%
\begin{equation}
y=Hx+v.
\label{e.kf5}
\end{equation}
%
Since (\ref{e.kf5}) is a linear combination of independent
Gaussian random vectors, $y$ is also a Gaussian random
vector. The mean and covariance of $y$ are calculated
as follows,
%
\begin{eqnarray}
m_y&=&E\{y\}\nonumber\\
&=&E\{Hx+v\}\nonumber\\
&=&Hm_x
\label{e.kf6}
\end{eqnarray}
%
and
%
\begin{eqnarray}
\Lambda_y&=&E\{(y-m_y)(y-m_y)^T\}\nonumber\\
&=&E\{[H(x-m_x)+v][H(x-m_x)+v]^T\}\nonumber\\
&=&H\Lambda_xH^T+R.
\label{e.kf7}
\end{eqnarray}
%
Consequently, the minimum mean square error estimate of $y$
is $\hat{y}=Hm_x$ and the associated error covariance of this
estimate is $P_y=H\Lambda_xH^T+R=HP_xH^T+R$.
\subsection{Recursive Estimation of Gaussian Random Vectors}
\index{Gaussian random vectors!recursive estimation}
Now we assume that we have a Gaussian random
vector which is composed of three sub-vectors $x$, $y$, and $z$.
This is represented by
%
\begin{equation}
\left(\begin{array}{c}x\\y\\z\end{array}\right)
\sim
N\left(\left[\begin{array}{c}m_x\\m_y\\m_z\end{array}\right],
\left[\begin{array}{ccc}\Lambda_{x}&\Lambda_{xy}&\Lambda_{xz}\\
\Lambda_{yx}&\Lambda_{y}&\Lambda_{yz}\\
\Lambda_{zx}&\Lambda_{zy}&\Lambda_{z}
\end{array}
\right]\right).
\label{e.kf8}
\end{equation}
%
From (\ref{e.kf4}) the minimum mean square error estimate of
$x$ given observation of $y$ is
%
\begin{equation}
\hat{x}(y)=m_x+\Lambda_{xy}\Lambda_y^{-1}(y-m_y)
\label{e.kf9}
\end{equation}
%
and the associated error covariance is
%
\begin{equation}
P_x(y)=\Lambda_x-\Lambda_{xy}\Lambda_y^{-1}\Lambda_{yx}.
\label{e.kf10}
\end{equation}
%
It is valuable to note here that
%
\begin{equation}
E\{\hat{x}(y)\}=m_x.
\label{e.kf11}
\end{equation}
%
If $z$ is also observed, then the minimum mean squared error estimate
of $x$ given $y$ and $z$ is
%
\begin{equation}
\hat{x}(y,z)=m_x+[\begin{array}{cc}
\Lambda_{xy}&\Lambda_{xz}\end{array}]
\left[\begin{array}{cc}\Lambda_y & \Lambda_{yz}\\
\Lambda_{zy} & \Lambda_{z}\end{array}\right]^{-1}
\left[\begin{array}{c}
y-m_y\\z-m_z\end{array}\right]
\label{e.kf12}
\end{equation}
%
with error covariance
%
\begin{equation}
P_x(y,z)=\Lambda_x-[\begin{array}{cc}
\Lambda_{xy}&\Lambda_{xz}\end{array}]
\left[\begin{array}{cc}\Lambda_y & \Lambda_{yz}\\
\Lambda_{zy} & \Lambda_{z}\end{array}\right]^{-1}
\left[\begin{array}{c}
\Lambda_{yx}\\\Lambda_{zx}\end{array}\right].
\label{e.kf13}
\end{equation}
%
Now note that if $y$ and $z$ were independent that
$\Lambda_{yz}$ would be zero and then (\ref{e.kf12}) could be written
as
%
\begin{eqnarray}
\hat{x}(y,z)&=&m_x+\Lambda_{xy}\Lambda_y^{-1}(y-m_y)+\Lambda_{xz}
\Lambda_z^{-1}(z-m_z)\nonumber\\
&=&\hat{x}(y)+\Lambda_{xz}\Lambda_z^{-1}(z-m_z).
\label{e.kf14}
\end{eqnarray}
%
The result in (\ref{e.kf14}) is a recursive method for calculating
$\hat{x}(y,z)$ given $\hat{x}(y)$ and $z$. The problem is that (\ref{e.kf14})
depends on $y$ and $z$ being independent random vectors.
Fortunately, by a change of variables, one can always change the
estimation procedure in (\ref{e.kf12}) into that in (\ref{e.kf14}).
This is accomplished as follows. Let $\nu$ be a random vector
defined by
%
\begin{eqnarray}
\nu&=&z-\hat{z}(y)\nonumber\\
&=&z-[m_z+\Lambda_{zy}\Lambda_y^{-1}(y-m_y)]\nonumber\\
&=&(z-m_z)-\Lambda_{zy}\Lambda_y^{-1}(y-m_y)
\label{e.kf15}
\end{eqnarray}
%
where $\hat{z}(y)$ is the minimum mean square estimate of $z$
given observation of $y$ (obtained by using (\ref{e.kf4})).
The new random vector, $\nu$, has several interesting
properties. First, because
%
\begin{equation}
m_{\nu}=E\{(z-m_z)-\Lambda_{zy}\Lambda_y^{-1}(y-m_y)\}=0
\label{e.kf16}
\end{equation}
%
$\nu$ is a zero-mean random variable. Also, since
%
\begin{eqnarray}
\Lambda_{\nu y}&=&E\{\nu(y-m_y)^T\}\nonumber\\
&=&E\{(z-m_z)(y-m_y)^T
-\Lambda_{zy}\Lambda_y^{-1}(y-m_y)(y-m_y)^T\}\nonumber\\
&=&\Lambda_{zy}-\Lambda_{zy}\Lambda_y^{-1}\Lambda_y\nonumber\\
&=&0
\label{e.kf17}
\end{eqnarray}
%
and
%
\begin{eqnarray}
\Lambda_{\nu \hat{x}(y)}&=&
E\{\nu(m_x+\Lambda_{xy}\Lambda_y^{-1}(y-m_y)-m_x)^T\}\nonumber\\
&=&E\{\nu(y-m_y)^T\Lambda_y^{-1}\Lambda_{yx}\}\nonumber\\
&=&0
\label{e.kf18}
\end{eqnarray}
%
it follows that $\nu$ is independent of both $y$ and $\hat{x}(y)$
These properties
are very useful for developing the Kalman filter equations of the
next section.
Now (\ref{e.kf14}) can be rewritten so that
%
\begin{eqnarray}
\hat{x}(y,z)&=&\hat{x}(y,\nu)\nonumber\\
&=&m_x+[\begin{array}{cc}
\Lambda_{xy}&\Lambda_{x\nu}\end{array}]
\left[\begin{array}{cc}\Lambda_y & 0\\
0 & \Lambda_{\nu}\end{array}\right]^{-1}
\left[\begin{array}{c}
y-m_y\\\nu\end{array}\right]\nonumber\\
&=&m_x+\Lambda_{xy}\Lambda_y^{-1}(y-m_y)+\Lambda_{x\nu}
\Lambda_{\nu}^{-1}\nu\nonumber\\
&=&\hat{x}(y)+\Lambda_{x\nu}\Lambda_{\nu}^{-1}\nu
\label{e.kf19}
\end{eqnarray}
%
where, from (\ref{e.kf15}) we obtain
%
\begin{eqnarray}
\Lambda_{x\nu}&=&E\{(x-m_x)(z-m_z-\Lambda_{zy}\Lambda_y^{-1}(y-m_y))^T\}\nonumber\\
&=&\Lambda_{xz}-\Lambda_{xy}\Lambda_y^{-1}\Lambda_{yz}
\label{e.kf20}
\end{eqnarray}
%
and
%
\begin{eqnarray}
\Lambda_{\nu}&=&E\{(z-m_z-\Lambda_{zy}\Lambda_y^{-1}(y-m_y))(z-m_z-\Lambda_{zy}\Lambda_y^{-1}(y-m_y))^T\}\nonumber\\
&=&\Lambda_{z}-\Lambda_{zy}\Lambda_y^{-1}\Lambda_{yz}.
\label{e.kf21}
\end{eqnarray}
%
(Note that the equality of $\hat{x}(y,z)$ and $\hat{x}(y,\nu)$ is due to the
fact that no information is lost in making the change of
variables in (\ref{e.kf15}). We are simply adding a constant vector
to $z$ which renders $\nu$ and $y$ independent of each other).
The error covariance, $P_x(y,\nu)$, associated with (\ref{e.kf19}) is
%
\begin{eqnarray}
P_x(y,\nu)&=&
\Lambda_x-[\begin{array}{cc}
\Lambda_{xy}&\Lambda_{x\nu}\end{array}]
\left[\begin{array}{cc}\Lambda_y & 0\\
0 & \Lambda_{\nu}\end{array}\right]^{-1}
\left[\begin{array}{c}
\Lambda_{yx}\\\Lambda_{\nu x}\end{array}\right]\nonumber\\
&=&\Lambda_x-\Lambda_{xy}\Lambda_y^{-1}\Lambda_{yx}-\Lambda_{x\nu}
\Lambda_{\nu}^{-1}\Lambda_{\nu x}\nonumber\\
&=&P_x(y)-\Lambda_{x\nu}\Lambda_{\nu}^{-1}\Lambda_{\nu x}.
\label{e.kf22}
\end{eqnarray}
%
\subsection{The Kalman Filter Equations}
\index{Kalman filter!equations}
Here the results of the previous two sections
are combined to find the recursive estimation procedure
referred to in the introduction. Before detailing the
procedure we introduce some notation. We denote by
$\hat{x}_{k|l}$ the minimum mean square estimate of $x_k$
given the observations $Y_l=\{y_0,y_1,\ldots,y_l\}$. Furthermore,
$P_{k|l}$ represents the error covariance associated with
$\hat{x}_{k|l}$.
Now, the estimate $\hat{x}_{k|k}$ can be obtained
from the estimate $\hat{x}_{k|k-1}$ and the new observation $y_k$
in the following manner. From (\ref{e.kf19}) and (\ref{e.kf22})
we have
%
\begin{eqnarray}
\hat{x}_{k|k}&=&\hat{x}_{k|k-1}+\Lambda_{x_k\nu_k}\Lambda_{\nu_k}^{-1}\nu_k\nonumber\\
P_{k|k}&=&P_{k|k-1}-\Lambda_{x_k\nu_k}\Lambda_{\nu_k}^{-1}\Lambda_{\nu_k x_k}
\label{e.kf23}
\end{eqnarray}
%
where from (\ref{e.kf1}), (\ref{e.kf6}), and (\ref{e.kf7})
%
\begin{equation}
\nu_k=y_k-H_k\hat{x}_{k|k-1}.
\label{e.kf24}
\end{equation}
%
The covariance
matrices in (\ref{e.kf23}) may be calculated by
%
\begin{eqnarray}
\Lambda_{\nu_k}&=&E\{\nu_k\nu_k^T\}\nonumber\\
&=&E\{(y_k-H_k\hat{x}_{k|k-1})(y_k-H_k\hat{x}_{k|k-1})^T\}\nonumber\\
&=&E\{[H_k(x_k-\hat{x}_{k|k-1})+v_k][H_k(x_k-\hat{x}_{k|k-1})+v_k]^T\}\nonumber\\
&=&H_kP_{k|k-1}H_k^T+R_k
\label{e.kf25}
\end{eqnarray}
%
(where the final equality follows from the fact that $v_k$ and
$x_k$ are independent random vectors), and
%
\begin{eqnarray}
\Lambda_{x_k\nu_k}&=&E\{(x_k-E\{x_k\})\nu_k^T\}\nonumber\\
&=&E\{(x_k-E\{x_k\}+E\{x_k\}-\hat{x}_{k|k-1})\nu_k^T\}\nonumber\\
&=&E\{(x_k-\hat{x}_{k|k-1})\nu_k^T\}\nonumber\\
&=&E\{(x_k-\hat{x}_{k|k-1})(y_k-H_k\hat{x}_{k|k-1})^T\}\nonumber\\
&=&E\{(x_k-\hat{x}_{k|k-1})(x_k-\hat{x}_{k|k-1})^TH_k^T\}\nonumber\\
&=&P_{k|k-1}H_k^T
\label{e.kf26}
\end{eqnarray}
%
(where the second equality follows from (\ref{e.kf12}) and
(\ref{e.kf18})). Substituting (\ref{e.kf24}),
(\ref{e.kf25}), and (\ref{e.kf26}) into (\ref{e.kf23}) yields
%
\begin{eqnarray}
\hat{x}_{k|k}&=&\hat{x}_{k|k-1}+K_k(y_k-H_k\hat{x}_{k|k-1})\nonumber\\
P_{k|k}&=&P_{k|k-1}-K_kH_kP_{k|k-1}
\label{e.kf27}
\end{eqnarray}
%
where $K_k=P_{k|k-1}H_k^T[H_kP_{k|k-1}H_k^T+R_k]^{-1}$ is called the
Kalman gain of the filter. (Note: Since the model proposed in
the first section of this chapter assumes that $R_k>0$ it follows that
$K_k$ always exists. However, when $R_k$ is not strictly
positive definite there may be problems performing
the matrix inverse necessary to calculate $K_k$).
Using (\ref{e.kf6}) and (\ref{e.kf7}) in conjunction with (\ref{e.kf1})
gives the two auxiliary equations
%
\begin{eqnarray}
\hat{x}_{k+1|k}&=&F_k\hat{x}_{k|k}\nonumber\\
P_{k+1|k}&=&F_kP_{k|k}F_k^T+G_kQ_kG_k^T.
\label{e.kf28}
\end{eqnarray}
%
Combining the equations in (\ref{e.kf27}) and (\ref{e.kf28}) yields
one set of recursive equations
%
\begin{eqnarray}
\hat{x}_{k+1|k}&=&F_k\hat{x}_{k|k-1}+F_kK_k(y_k-H_k\hat{x}_{k|k-1})\nonumber\\
P_{k+1|k}&=&F_kP_{k|k-1}F_k^T-F_kK_kH_kP_{k|k-1}F_k^T+G_kQ_kG_k^T.
\label{e.kf29}
\end{eqnarray}
%
The only remaining detail is to identify the initial conditions
of the Kalman filter which are obtained from the {\it a priori}
statistics of $x_0$
%
\begin{eqnarray}
\hat{x}_{0|-1}&=&m_0\nonumber\\
P_{0|-1}&=&\Pi_0.
\label{e.kf30}
\end{eqnarray}
%
\subsection{Asymptotic Properties of the Kalman Filter}
\index{Kalman filter!asymptotic properties}
Depending on the problem formulation posed
in (\ref{e.kf1}) there are situations where the Kalman filter
does not perform well. That is to say, that for
certain formulations, the Kalman filter could provide
state estimates which diverge from the actual evolution
of the state vector. This divergence is by no
means a fault of the Kalman filter, but, rather,
is due to the process model provided by the user.
Consequently, in such a case where the Kalman filter
diverges, the user may wish to re-examine the
model formulation so that it
is better posed for the estimation problem.
The following results concerning the effectiveness of
the Kalman filter are only valid for time invariant
formulations of the model in (\ref{e.kf1}). That is to say
that $F_k=F$, $G_k=G$, $H_k=H$, $Q_k=Q$, and $R_k=R$ for all $k$.
We state the following properties of the
asymptotic behavior of the Kalman filter without
proof. The desirable asymptotic properties of the Kalman
filter depend on the controllability and observability of
the system model in (\ref{e.kf1}). A necessary
and sufficient condition for the system in (\ref{e.kf1}) to be
controllable is that
%
\begin{equation}
rank[\begin{array}{cccc}G&FG&\cdots&F^{N-1}G\end{array}]=N
\label{e.kf31}
\end{equation}
%
(recall that $F$ is an $N\times N$ matrix). A necessary and
sufficient condition that the system be observable is that
%
\begin{equation}
rank\left[\begin{array}{c}H\\HF\\\vdots\\HF^{N-1}\end{array}\right]=N
\label{e.kf32}
\end{equation}
%
Now if the system in (\ref{e.kf1}) is both controllable and observable
then
%
\begin{equation}
\lim_{k\rightarrow\infty}P_{k|k-1}=P<\infty
\label{e.kf33}
\end{equation}
%
and if $Q>0$ then
%
\begin{equation}
P>0.
\label{e.kf34}
\end{equation}
%
These results state that the error covariance matrix,
$P_{k|k-1}$, converges to a finite, positive definite constant matrix
when the system in (\ref{e.kf1}) is controllable and observable.
Consequently, the error in the estimate $\hat{x}_{k|k-1}$ is bounded
as $k\rightarrow\infty$ since $P<\infty$. Furthermore, because $P>0$
the Kalman filter gain is also positive definite. Consequently, the new
observations are always included in the new state estimate.
Another consequence of steady-state analysis of the Kalman
filter is that one can use the steady-state Kalman gain in place of
the time varying Kalman gain. The advantage of such an approach is
that considerable computational savings are possible due to the fact
that one is not re-calculating the Kalman gain for each new observation.
It is important to realize, however, that using the steady-state
Kalman gain does not yield the optimal estimate of the state until after
the transient behavior of the filter has died away.
To use the steady-state
Kalman gain to implement the Kalman filter algorithm there exists,
in Scilab, a function and a primitive which when
used in tandem yield the estimated state.
The function is {\tt optgain} and the primitive is {\tt ltitr}. The function
{\tt optgain} calculates the steady-state error covariance matrix
and the steady-state Kalman gain given the system matrices and the
noise covariance matrices. The primitive {\tt ltitr} generates
the state vectors from a linear dynamic system in state space form
given the system matrices of this system and the input vectors.
In the case of the Kalman filter, the input vectors are the observations
$y_k$ and the system dynamics are
%
\begin{equation}
\hat{x}_{k|k}=(I-KH)F\hat{x}_{k-1|k-1}+Ky_k
\label{e.kf34a}
\end{equation}
%
The following section describes how to use {\tt sskf} the steady-state
Kalman filter function. This is followed by an example using {\tt sskf}.
The following section describes the use of {\tt kalm} the time-varying
version of the Kalman filter function. Finally, several sections follow
describing the square-root Kalman filter algorithm and how to use the
associated function {\tt srkf}.
\subsection{How to Use the Macro {\tt sskf}}
\index{steady state Kalman filter}
\index{function syntax!sskf@{\tt sskf}}
\index{Kalman filter!steady state!function syntax}
The syntax of the {\tt sskf} is as follows
\begin{verbatim}
[xe,pe]=sskf(y,f,h,q,r,x0)
\end{verbatim}
where the system is assumed to take the following form
%
\begin{eqnarray}
x_{k+1}&=&\mtt{ f}x_k+w_k\nonumber\\
y_k&=&\mtt{ h}x_k+v_k\nonumber
\end{eqnarray}
%
where
%
\begin{eqnarray}
w_k&\sim& N(0,\mtt{ q})\nonumber\\
v_k&\sim& N(0,\mtt{ r})\nonumber\\
x_0&\sim& N(\mtt{ x0},\Pi_0).\nonumber
\end{eqnarray}
%
The remaining input, $\mtt{ y}=[y_0,y_1,\ldots,y_n]$, is a matrix where
the individual observations $y_k$ are contained in the matrix as
column vectors.
The outputs {\tt xe} and {\tt pe} are the estimated state
and the steady-state error covariance matrix. The form of the
estimates is $\mtt{ xe}=[\hat{x}_{0|0},\hat{x}_{1|1},\ldots,\hat{x}_{n|n}]$
where each $\hat{x}_{k|k}$ is a column in the matrix. The
steady-state error covariance matrix is a square matrix of the
dimension of the state.
\subsection{An Example Using the {\tt sskf} Macro}
\index{Kalman filter!steady state!example}
The example uses the following system model and prior
statistics:
%
\begin{eqnarray}
x_{k+1}&=&\left[\begin{array}{cc}
1.1 & .1\\
0 & .8\end{array}\right]x_k+
\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]w_k\nonumber\\
y_k&=&\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]x_k+v_k\nonumber
\end{eqnarray}
%
where
%
\begin{eqnarray}
E\{w_kw_k^T\}&=&\left[\begin{array}{cc}
.03 & .01\\
.01 & .03\end{array}\right]\nonumber\\
E\{v_kv_k^T\}&=&\left[\begin{array}{cc}
2 & 0\\
0 & 2\end{array}\right]\nonumber
\end{eqnarray}
%
and
%
\begin{eqnarray}
E\{x_0\}&=&\left[\begin{array}{c}
10\\
10\end{array}\right]\nonumber\\
E\{(x_0-m_0)(x_0-m_0)^T\}&=&\left[\begin{array}{cc}
2 & 0\\
0 & 2\end{array}\right]\nonumber
\end{eqnarray}
%
Observations from the above system were generated synthetically
using the primitive {\tt ltitr}.
The results of the steady-state Kalman filter estimates
are shown in Figure~\ref{f.kf0}
%
\input{\Figdir kf1.tex}
\dessin{{\tt exec('kf1.code')} Steady-State Kalman Filter Tracking}{f.kf0}
%
where the dotted line marked by stars indicates
the actual state path and the solid line marked
by circles marks the estimated state path.
\subsection{How to Use the Function {\tt kalm}}
\index{Kalman filter!function syntax}
\index{function syntax!kalm@{\tt kalm}}
The function {\tt kalm} takes as input the system description
matrices, the statistics of the noise processes, the prior estimate of
the state and error covariance matrix, and the new observation. The
outputs are the new estimates of the state and error covariance
matrix. The call to {\tt kalm} is as follows:
\begin{verbatim}
--> [x1,p1,x,p]=kf(y,x0,p0,f,g,h,q,r)
\end{verbatim}
where {\tt y} is the new observation, {\tt x0} and {\tt p0} are the
prior state and error covariance estimates at time $t=0$ based on observations
up to time $t=-1$, {\tt f}, {\tt g} and {\tt h} are
the dynamics, input, and observation matrices, respectively, and {\tt q}
and {\tt r} are the noise covariance matrices for the dynamics and
observations equations, respectively. The outputs {\tt x1} and {\tt p1}
are the new state and error covariance estimates at time $t=1$ given
observations up to time $t=0$, respectively, and {\tt x} and {\tt p}
are the new state and error covariance estimates at time $t=0$ given
observations up to time $t=0$, respectively.
\subsection{Examples Using the {\tt kalm} Function}
\index{Kalman filter!examples}
Three examples are illustrated in this section. All three
examples are for two-dimensional state vectors. The first example
illustrates Kalman tracking for a system model which is controllable
and observable. The second and third examples show the results of
uncontrollable and unobservable system models, respectively.
The first example uses the following system model and prior
statistics:
%
\begin{eqnarray}
x_{k+1}&=&\left[\begin{array}{cc}
1.1 & .1\\
0 & .8\end{array}\right]x_k+
\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]w_k\nonumber\\
y_k&=&\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]x_k+v_k\nonumber
\end{eqnarray}
%
where
%
\begin{eqnarray}
E\{w_kw_k^T\}&=&\left[\begin{array}{cc}
.03 & .01\\
.01 & .03\end{array}\right]\nonumber\\
E\{v_kv_k^T\}&=&\left[\begin{array}{cc}
2 & 0\\
0 & 2\end{array}\right]\nonumber
\end{eqnarray}
%
and
%
\begin{eqnarray}
E\{x_0\}&=&\left[\begin{array}{c}
10\\
10\end{array}\right]\nonumber\\
E\{(x_0-m_0)(x_0-m_0)^T\}&=&\left[\begin{array}{cc}
2 & 0\\
0 & 2\end{array}\right]\nonumber
\end{eqnarray}
%
Observations from the above system were generated synthetically
using the system formulation and values from a random number generator
for the dynamics and observations noise. These observations were then
used as input to the Kalman filter. The result of ten observations is
illustrated in Figure~\ref{f.kf1}.
%
\input{\Figdir kf2.tex}
\dessin{{\tt exec('kf2.code')} Kalman Filter Tracking}{f.kf1}
%
The figure shows the actual state path and the Kalman estimate of
the state path as a dotted and solid line, respectively.
The actual locations of the state and estimated values are
indicated by the star and circle symbols on the respective paths.
The ellipses in the figure are centered about the positions of
the actual state path and their borders represent two standard deviations
of estimation error calculated from the error covariance matrices.
The values of the standard deviations
for the above example are displayed below:
\verbatok{\Diary kf3.dia}
\end{verbatim}
Each row of the above vector represents the standard deviations of the
state vector error covariance matrix where the first row is the standard
deviation of the {\it a priori} error in the state vector and the last row
is the standard deviation of the state vector estimate at the last step
of the Kalman filter. The above standard deviation vector is instructive.
It should be noted that for both state values, the standard deviation is
converging to a steady state value. For the first element of the state
vector this value is .7800312 and for the second element of the state
vector the value is .2824549. The convergence is to be expected since
the above formulation of the system is both controllable and observable.
If we were to change the above system formulation so that the dynamics
equation were now
%
\begin{equation}
x_{k+1}=\left[\begin{array}{cc}
1.1 & .1\\
0 & .8\end{array}\right]x_k+
\left[\begin{array}{cc}
1 & 0\\
0 & 0\end{array}\right]w_k\nonumber
\end{equation}
%
the system would be uncontrollable. Re-running the Kalman filter
with this re-formulation yields the following sequence of standard
deviations:
\verbatok{\Diary kf4.dia}
\end{verbatim}
As can be seen, the second state variable has a standard deviation which
is not converging to a positive value. In fact the value of this
standard deviation converges to zero. As was discussed in the section
on the asymptotic behavior of the Kalman filter, this is what was to be
expected. The result of this behavior is that the Kalman
filter ignores any observed information regarding
the second state variable since the error variance
is going to zero (and, thus, the filter thinks that it has perfect information
concerning this state value). If the above model is perfectly accurate
then such an eventuality is not a problem. However, in practice there
are modeling errors and, consequently, if the new observations are
ignored, there is a danger that the Kalman filter estimates will begin
to diverge from the actual state of the process.
Now we change the original model formulation again so
that the observation equation is now
%
\begin{equation}
y_k=\left[\begin{array}{cc}
0 & 0\\
0 & 1\end{array}\right]x_k+v_k\nonumber.
\end{equation}
%
Under these conditions the system is not observable. The
evolution of the standard deviation for this example is:
\verbatok{\Diary kf5.dia}
\end{verbatim}
Here the standard deviation of the first state variable is growing
without bound. This is due to two things. First, the system is
unobservable in the first state variable. Consequently, the observations
provide no useful information concerning the estimate of this state.
Secondly, since the system matrix {\tt f} has un unstable eigenvalue
the standard deviation of this state, in the limit, is unbounded.
The Scilab code used to generate the examples in this section
is displayed below. The code for the steady-state Kalman filter example
is as follows:
\verbatok{\Figdir kf1.dia}
\end{verbatim}
The code used to generate the non-steady-state Kalman filter example
is:
\verbatok{\Figdir kf2.dia}
\end{verbatim}
\section{The Square Root Kalman Filter}
\index{Kalman filter!square root}
\index{square root Kalman filter}
The Kalman filter is known to have certain numerical
instabilities \cite{Bier}. For example, since the update
of the error covariance matrix involves taking differences
of matrices (see(\ref{e.kf27})) it is possible that machine
discretization error could result in a non positive semi-definite
error covariance. Such an event could lead
to severe divergence of the Kalman filter because this could
render the Kalman gain infinite. Fortunately, there
are a class of algorithms which are known collectively as
square root Kalman filters which avoid this problem.
The square root Kalman filter propagates the
state estimate and the square root of the error
covariance matrix. (Here, $S$ is defined to be the square
root of a matrix, $A$, if $A=SS^T$. The square root, $S$, is
not unique. For example for any other orthogonal matrix, $O$,
such that $OO^T=I$ it follows that $SO$ is also a square root of $A$.
We denote the square root of $A$ by $A^{1/2}$).
The advantage of propagating the square root of the error covariance
matrix is two-fold. First, the
error covariance is always positive semi-definite since
it is formed by squaring the square root. Second,
the dynamic range of the elements in the square
root of the error covariance is much smaller than
that in the error covariance matrix itself. Consequently,
the accuracy of calculations is improved since the machine
precision is better adapted to representing the dynamic range
of the square root. The details of a square root Kalman
filter algorithm base on using
Householder transformations is detailed below.
Restating the model for the dynamics and
observations of the process in (\ref{e.kf1}), we have that
%
\begin{eqnarray}
y_k&=&H_kx_k+v_k\nonumber\\
x_{k+1}&=&F_kx_k+w_k
\label{e.kf41}
\end{eqnarray}
%
where $w_k$ and $v_k$ are independent zero-mean Gaussian
vectors of covariance $Q_k$ and $R_k$, respectively.
The model in (\ref{e.kf41}) is modified so that
$w_k=G_k\mu_k$ and $v_k=L_k\mu_k$ where $\mu_k$ is
a zero-mean Gaussian random vector with
unit covariance matrix (i.e., $E\{\mu_k\mu_k^T\}=I$). Since $w_k$ and
$v_k$are independent $G_kL_k^T=L_kG_k^T=0.$
Now, the model in (\ref{e.kf41}) can be expressed as
%
\begin{equation}
\left[\begin{array}{c}y_k\\x_{k+1}\end{array}\right]
=
\left[\begin{array}{cc}H_k&L_k\\
F_k&G_k\end{array}\right]
\left[\begin{array}{c}x_k\\\mu_k\end{array}\right].
\label{e.kf42}
\end{equation}
%
Also, we recall from (\ref{e.kf6}) and (\ref{e.kf7}) that
%
\begin{equation}
\left[\begin{array}{c}\hat{y}_{k|k-1}\\\hat{x}_{k+1|k-1}\end{array}\right]
=
\left[\begin{array}{c}H_k\\F_k\end{array}\right]\hat{x}_{k|k-1}.
\label{e.kf43}
\end{equation}
%
Substracting (\ref{e.kf43}) from (\ref{e.kf42}) yields
%
\begin{equation}
\left[\begin{array}{c}\nu_k\\\epsilon_{k+1|k-1}\end{array}\right]
=
\left[\begin{array}{cc}H_kP_{k|k-1}^{1/2}&L_k\\
F_kP_{k|k-1}^{1/2}&G_k\end{array}\right]
\left[\begin{array}{c}P_{k|k-1}^{-1/2}\epsilon_{k|k-1}\\\mu_k\end{array}\right]
\label{e.kf44}
\end{equation}
%
where $\nu_k=y_k-H\hat{x}_{k|k-1}$, $\epsilon_{k+1|k-1}=x_{k+1}-\hat{x}_
{k+1|k-1}$, $\epsilon_{k|k-1}=x_k-\hat{x}_{k|k-1}$, and
where we have used the square root of the error
covariance, $P_{k|k-1}$, so that the vector $P_{k|k-1}^{-1/2}\hat{x}_{k|k-1}$
would have unit covariance.
Now we assume that a matrix, $T_k$, exists
such that $T_kT_k^T=I$ and such that
%
\begin{equation}
\left[\begin{array}{cc}H_kP_{k|k-1}^{1/2}&L_k\\
F_kP_{k|k-1}^{1/2}&G_k\end{array}\right]T_k
=
\left[\begin{array}{cc}A_k&0\\
B_k&C_k\end{array}\right].
\label{e.kf45}
\end{equation}
%
(The matrix $T_k$ can always be constructed as a combination
of Householder transformations. The Householder
transformation and the construction of $T_k$ are detailed
in the next section).
Using $T_k$, (\ref{e.kf44}) can be rewritten so that
%
\begin{eqnarray}
\left[\begin{array}{c}\nu_k\\\epsilon_{k+1|k-1}\end{array}\right]
&=&
\left[\begin{array}{cc}H_kP_{k|k-1}^{1/2}&L_k\\
F_kP_{k|k-1}^{1/2}&G_k\end{array}\right]T_kT_k^T
\left[\begin{array}{c}P_{k|k-1}^{-1/2}\epsilon_{k|k-1}\\\mu_k\end{array}\right]
\nonumber\\
&=&
\left[\begin{array}{cc}A_k&0\\
B_k&C_k\end{array}\right]\eta_k
\label{e.kf46}
\end{eqnarray}
%
where $\eta_k$ is a zero-mean Gaussian random vector with unit
covariance. We can now derive the
significance of (\ref{e.kf46}) by calculating the
conditional mean and conditional covariance of $\epsilon_{k+1|k-1}$
given $\nu_k$. Using (\ref{e.kf4}) we obtain
%
\begin{eqnarray}
E\{\epsilon_{k+1|k-1}|\nu_k\}&=&E\{x_{k+1}-\hat{x}_{k+1|k-1}|\nu_k\}\nonumber\\
&=&\Lambda_{\epsilon_{k+1|k-1}\nu_k}\Lambda_{\nu_k}^{-1}\nu_k
\label{e.kf47}
\end{eqnarray}
%
and
%
\begin{eqnarray}
\lefteqn{E\{[\epsilon_{k+1|k-1}-E\{\epsilon_{k+1|k-1}\}][\epsilon_{k+1|k-1}-E\{\epsilon_{k+1|k-1}\}]^T|\nu_k\}=}\nonumber\\
&&\Lambda_{\epsilon_{k+1|k-1}}-\Lambda_{\epsilon_{k+1|k-1}\nu_k}\Lambda_{\nu_k}^{-1}\Lambda_{\epsilon_{k+1|k-1}\nu_k}
\label{e.kf48}
\end{eqnarray}
%
Calculating the covariances in (\ref{e.kf47}) and (\ref{e.kf48}) we
obtain (using (\ref{e.kf46}))
%
\begin{eqnarray}
\Lambda_{\epsilon_{k+1|k-1}\nu_k}&=&B_kA_k^T\nonumber\\
\Lambda_{\nu_k}&=&A_kA_k^T\nonumber\\
\Lambda_{\epsilon_{k+1|k-1}}&=&B_kB_k^T+C_kC_k^T.
\label{e.kf49}
\end{eqnarray}
%
It can be seen that (\ref{e.kf47}) yields the Kalman filter equations.
Consequently, using (\ref{e.kf49}) we obtain the Kalman
gain, $K_k$, and the updated square root of the error covariance,
$P_{k+1|k}^{1/2}$, as
%
\begin{eqnarray}
K_k&=&B_kA_k^{-1}\nonumber\\
P_{k+1|k}^{1/2}&=&C_k.
\label{e.kf50}
\end{eqnarray}
%
The square root Kalman filter algorithm
works as follows. Given $\hat{x}_{k|k-1}$, $P_{k|k-1}^{1/2}$, and
$y_k$ we first form the matrix on the left hand side of (\ref{e.kf45}).
The matrices $G_k$ and $L_k$ are obtained by performing
a Cholesky decomposition of $Q_k$ and $R_k$ (that is $Q_k=G_kG_k^T$
and $R_k=L_kL_k^T$ where $G_k$ and $L_k$ are upper triangular).
Using the Householder transformation $T_k$ we obtain $A_k$, $B_k$,
and $C_k$. Then, the updates are calculated by
%
\begin{eqnarray}
\hat{x}_{k+1|k}&=&F_k\hat{x}_{k|k-1}+B_kA_k^{-1}(y_k-H_k\hat{x}_{k|k-1})\nonumber\\
P_{k+1|k}^{1/2}&=&C_k.
\label{e.kf51}
\end{eqnarray}
%
All that remains is specification of the Householder transformation
which is done in the following section.
\subsection{The Householder Transformation}
\index{Householder transformation}
Let $\beta$ be a vector in $R^N$. Then, we
define the $N\times N$ matrix, $T_{\beta}$, such that
%
\begin{equation}
T_{\beta}=I-\frac{2}{\beta^T\beta}\beta\beta^T.
\label{e.kf50a}
\end{equation}
%
The matrices defined by (\ref{e.kf50}) are called Householder
transformations (see \cite{Stewart}).
The $T_{\beta}$ have the following properties
%
\begin{eqnarray}
T_{\beta}&=&T_{\beta}^T\nonumber\\
T_{\beta}T_{\beta}^T&=&I\nonumber\\
T_{\beta}T_{\beta}&=&I.
\label{e.kf51a}
\end{eqnarray}
%
Furthermore, the matrix $T_{\beta}$ has a geometric interpretation.
The vector $\beta$ uniquely defines a hyper-plane
in $R^N$ as the sub-space in $R^N$ which is orthogonal to $\beta$.
The matrix $T_{\beta}$ acts as a reflector in that $T_{\beta}$ maps
vectors in $R^N$ to their reflected images through the
hyper-plane defined by $\beta$.
Householder transformations can be used to
upper triangularize matrices. This is accomplished as
follows. Let $A=[a_1,a_2,\cdots,a_M]$ be an $N\times M$ matrix
where $a_k$ is an $N$-vector and let $T_{\beta_1}$ be a
Householder transformation where $\beta_1$ is
constructed such that
%
\begin{equation}
T_{\beta_1}a_1=\tilde{a}_1=
\left[\begin{array}{c}
\sqrt{\sum_{k=1}^{N}a_{1k}^2}\\
0\\
\vdots\\
0
\end{array}\right]
\label{e.kf52}
\end{equation}
%
(i.e., we find the $\beta_1$ such that when $a_1$ is reflected
through the hyper-plane defined by $\beta_1$ its image is
as above). The choice for $\beta_1$ which accomplishes this goal is
%
\begin{equation}
\beta_1=\left[\begin{array}{c}
a_{11}+\sqrt{\sum_{k=1}^{N}a_{1k}^2}\\
a_{12}\\
\vdots\\
a_{1N}
\end{array}\right]
\label{e.kf53}
\end{equation}
%
where $a_{1k}$ is the $k^{th}$ element of the $N$-vector $a_1$.
The Householder transformation, $T_{\beta_1}$, can be
used now to transform the matrix $A$,
%
\begin{equation}
T_{\beta_1}A=[\tilde{a}_1,\tilde{a}_2,\cdots,\tilde{a}_M]
\label{e.kf54}
\end{equation}
%
where $\tilde{a}_1$ is as in (\ref{e.kf52}) and the
remaining columns of $A$ are transformed to $\tilde{a}_k=T_{\beta_1}a_k$.
Now we construct a second Householder transformation,
$T_{\beta_2}$ such that
%
\begin{equation}
T_{\beta_2}\tilde{a}_2=\tilde{\tilde{a}}_2=
\left[\begin{array}{c}
\tilde{a}_{21}\\
\sqrt{\sum_{k=2}^{N}\tilde{a}_{2k}^2}\\
0\\
\vdots\\
0
\end{array}\right]
\label{e.kf55}
\end{equation}
%
That is, the first element of $\tilde{a}_2$ is left unchanged,
the second element becomes the norm of the rest of the elements and
the remaining elements become zero. The choice for $\beta_2$ which
accomplishes this is
%
\begin{equation}
\beta_2=\left[\begin{array}{c}
0\\
\tilde{a}_{22}+\sqrt{\sum_{k=2}^{N}\tilde{a}_{2k}^2}\\
\tilde{a}_{23}\\
\vdots\\
\tilde{a}_{2N}
\end{array}\right].
\label{e.kf56}
\end{equation}
%
Calculating $T_{\beta_2}T_{\beta_1}A$ yields
%
\begin{equation}
T_{\beta_2}T_{\beta_1}A=[\tilde{a}_1,\tilde{\tilde{a}}_2,
\tilde{\tilde{a}}_3,
\cdots,
\tilde{\tilde{a}}_M]
\label{e.kf57}
\end{equation}
%
where $\tilde{a}_1$ is still as in (\ref{e.kf52}) and $\tilde{\tilde{a}}_2$
is as in (\ref{e.kf55}). Continuing recursively in this way we can
upper triangularize the $A$ matrix.
The Scilab primitive {\tt qr} performs a triangularization of
a matrix using Householder transformations. In particular, it is
the $r$ part of the qr-decomposition which yields the
desired triangularized matrix.
\subsection{How to Use the Macro {\tt srkf}}
\index{Kalman filter!square root!function syntax}
\index{function syntax!srkf@{\tt srkf}}
The function {\tt srkf} takes as input the system description
matrices, the statistics of the noise processes, the prior estimate of
the state and error covariance matrix, and the new observation. The
outputs are the new estimates of the state and error covariance
matrix. The call to {\tt kalm} is as follows:
\begin{verbatim}
--> [x1,p1]=srkf(y,x0,p0,f,h,q,r)
\end{verbatim}
where {\tt y} is the new observation, {\tt x0} and {\tt p0} are the
prior state and error covariance estimates, {\tt f} and {\tt h} are
the dynamics and observation matrices, respectively, and {\tt q}
and {\tt r} are the noise covariance matrices for the dynamics and
observations equations, respectively. The outputs {\tt x1} and {\tt p1}
are the new state and error covariance estimates, respectively.
\section{The Wiener Filter}
\index{Wiener filter}
The generalized Wiener filtering problem \cite{Kailath1}
can be posed as follows. It is desired
to estimate a zero-mean, Gaussian vector process, $x(t)$, in the
interval $[a,b]$ from observations of a statistically
related, zero-mean, Gaussian vector, $y(t)$. Furthermore, the covariance
functions $R_{yy}(t,s)=E\{y(t)y^T(s)\}$ and $R_{xy}(t,s)=E\{x(t)y^T(s)\}$
are assumed known for $t,s\in[a,b]$. Then, the least mean squared
error estimate of $x(t)$ given $y(t)$ is obtained
as a linear operator on $y(t)$ as
%
\begin{equation}
\hat{x}(t)=\int_{a}^{b}H(t,s)y(s)ds
\label{e.wf1}
\end{equation}
%
where $\hat{x}(t)$ denotes the least mean squared estimate and
$H(t,s)$ is an $N\times M$ matrix function of two arguments $t$
and $s$ (i.e., $[H(t,s)]_{ij}=h_{ij}(t,s)$ and $\hat{x}_i(t)=\int_{a}^{b}
\sum_{j=1}^{M}h_{ij}(t,s)y_j(s)ds$).
By the principle of orthogonality the error
in the estimate in (\ref{e.wf1}), $x(t)-\hat{x}(t)$, must
be orthogonal to $y(u)$ for $t,u\in[a,b]$. Thus,
%
\begin{equation}
0=E\{(x(t)-\hat{x}(t))y^T(u)\}=R_{xy}(t,u)-\int_{a}^{b}H(t,s)R_{yy}(s,u)ds.
\label{e.wf2}
\end{equation}
%
Solving the matrix integral equation in (\ref{e.wf2}) for $H(t,s)$ yields
the optimal estimator for $x(t)$ when used in (\ref{e.wf1}).
A general solution of (\ref{e.wf2}) is not possible, however,
for some very important special cases (\ref{e.wf2}) is
resolvable. For example, if the covariance functions $R_{xy}$
and $R_{yy}$ are wide sense
stationary and with rational spectral densities then
specific solutions techniques are available.
The sections which follow address Wiener filtering
for a special formulation of the relationship
between $x(t)$ and $y(t)$ (albeit, a relationship for which
many engineering problems of interest can be formulated) and,
consequently, permits a solution to the problem posed by
(\ref{e.wf1}) and (\ref{e.wf2}). This special formulation
consists of a state space difference equation identical to that
used in the development of the Kalman filter. This realization
has some advantages in that it allows finite interval estimation
of non-stationary processes. The disadvantage of this procedure
is that sometimes the only knowledge of the processes $x$ and $y$
is their covariance structures. In this case, the construction of a
state-space model which has the appropriate covariance
characteristics may not always be readily evident.
\subsection{Problem Formulation}
In our problem formulation it is assumed
that a dynamic model of the process $x$ is known,
that the process $y$ is related to the process $x$
through an observation equation, and that both
processes are discrete. Consequently, we have
the equations
%
\begin{eqnarray}
x_{k+1}&=&F_kx_k+G_ku_k\nonumber\\
y_k&=&H_kx_k+v_k
\label{e.wf3}
\end{eqnarray}
%
where $x_0$, $u_k$, and $v_k$ are independent and Gaussian
random vectors with statistics
%
\begin{eqnarray}
u_k&\sim&N(0,Q_k)\nonumber\\
v_k&\sim&N(0,R_k)\nonumber\\
x_0&\sim&N(m_0,\Pi_0)\nonumber\\
\label{e.wf4}
\end{eqnarray}
%
From the formulation in (\ref{e.wf3}) and (\ref{e.wf4})
one can determine the covariance functions $R_{xy}$ and $R_{yy}$
and, thus, it is reasonable to try and solve a discrete version of
(\ref{e.wf2}) to obtain the optimal filter.
However, there are certain aspects of the
dynamics and observations models which permit
a solution to the problem posed in the previous
section without explicitly solving for
the filter designated by (\ref{e.wf2}).
The solution to the discrete version of (\ref{e.wf1})
based on the modelization of the underlying processes
given by (\ref{e.wf3}) and (\ref{e.wf4}) can be achieved by
several different Kalman filter formulations. The particular
approach used here is known as the Rauch-Tung-Striebel
\index{Rauch-Tung-Striebel two filter smoother}
formulation \cite{Gelb}.
In this approach a Kalman filter is
used on the data in the usual way to obtain
estimates $\hat{x}_{k|k-1}$ and $\hat{x}_{k|k}$ along with their associated
error covariances $P_{k|k-1}$ and $P_{k|k}$. A second recursive
filter is then used on the estimates and their error
covariances to obtain the estimates $\hat{x}_{k|N}$ and their
error covariances $P_{k|N}$ (that is, the estimate and error covariance
of $x_k$ based on all the data in the interval $[0,N]$). This second
filter processes the estimates in the backward direction using
$\hat{x}_{N|N}$ and $P_{N|N}$ to initialize the filter,
then using $\hat{x}_{N-1|N-1}$, $\hat{x}_{N-1|N-2}$, $P_{N-1|N-1}$, and
$P_{N-1|N-2}$ to obtain $\hat{x}_{N-1|N}$ and $P_{N-1|N}$, etc.,
continuing recursively in this way over the entire interval.
A complete development of the Rauch-Tung-Striebel
approach is too involved
to recount here. We outline a procedure which may be used
to obtain the result without discussing all the
details. The interested reader can find a complete and
unified development in \cite{Nikoukhah}.
The approach we take is based on a constrained
maximum likelihood solution to the problem. We form a cost
function, $J(u,v)$, which we desire to minimize
with respect to its arguments. The function $J$ is
defined by
%
\begin{equation}
J(u,v)=u^TQ^{-1}u+v^TR^{-1}v
\label{e.wf5}
\end{equation}
%
where
%
\begin{eqnarray}
u&=&\left[\begin{array}{c}
m_0\\
u_0\\
u_1\\
\vdots\\
u_{N-1}\end{array}\right]\nonumber\\
v&=&\left[\begin{array}{c}
v_0\\
v_1\\
\vdots\\
v_N\end{array}\right]\nonumber\\
Q&=&\left[\begin{array}{cccc}
\Pi_0&0&\cdots&0\\
0&Q_0&\cdots&0\\
\vdots&\vdots&&\vdots\\
0&0&\cdots&Q_{N-1}\end{array}\right]\nonumber\\
R&=&\left[\begin{array}{cccc}
R_0&0&\cdots&0\\
0&R_1&\cdots&0\\
\vdots&\vdots&&\vdots\\
0&0&\cdots&R_N\end{array}\right]
\label{e.wf5a}
\end{eqnarray}
%
and where $m_0$ and $\Pi_0$ are the {\em a priori} statistics of $x_0$.
The functional in (\ref{e.wf5})
is minimized subject to the constraints
%
\begin{eqnarray}
Sx&=&Gu\nonumber\\
y&=&Hx+v
\label{e.wf6}
\end{eqnarray}
%
where
%
\begin{eqnarray}
x&=&\left[\begin{array}{c}
x_0\\
x_1\\
\vdots\\
x_N\end{array}\right]\nonumber\\
y&=&\left[\begin{array}{c}
y_0\\
y_1\\
\vdots\\
y_N\end{array}\right]\nonumber\\
S&=&\left[\begin{array}{ccccc}
I&0&\cdots&0&0\\
-F_0&I&\cdots&0&0\\
\vdots&\vdots&&\vdots&\vdots\\
0&0&\cdots&-F_{N-1}&I\end{array}\right]\nonumber\\
G&=&\left[\begin{array}{cccc}
I&0&\cdots&0\\
0&G_0&\cdots&0\\
\vdots&\vdots&&\vdots\\
0&0&\cdots&G_{N-1}\end{array}\right]\nonumber\\
H&=&\left[\begin{array}{cccc}
H_0&0&\cdots&0\\
0&G_1&\cdots&0\\
\vdots&\vdots&&\vdots\\
0&0&\cdots&H_N\end{array}\right].
\label{e.wf6a}
\end{eqnarray}
%
Minimizing (\ref{e.wf5}) subject to the system constraints
in (\ref{e.wf6}) yields the estimate
$\hat{x}^T=[\hat{x}_{0|N}^T,\hat{x}_{1|N}^T,\ldots,\hat{x}_{N|N}^T]^T$.
The optimization can be accomplished using the Lagrange
multiplier technique where a new functional
$\tilde{J}$ is formed from (\ref{e.wf5}) and (\ref{e.wf6})
%
\begin{equation}
\tilde{J}(u,v)=J(u,v)+\lambda^T(Sx-Gu)
\label{e.wf7}
\end{equation}
%
where $\lambda$, an unknown vector of the same dimension as $x$,
is called the Lagrange multiplier.
After some algebra and setting the partial derivatives of $\tilde{J}$
with respect to $u$ and $x$ to zero we obtain the
so-called Hamiltonian (see \cite{Kailath2})
%
\begin{equation}
\left[\begin{array}{cc}
S&-GQG^T\\
H^TR^{-1}H&S^T\end{array}\right]
\left[\begin{array}{c}
\hat{x}\\
\hat{\lambda}\end{array}\right]
=
\left[\begin{array}{c}
0\\
H^TR^{-1}y\end{array}\right].
\label{e.wf8}
\end{equation}
%
Solution of (\ref{e.wf8}) yields $\hat{x}_{k|N}$ for
$k=0,1,\ldots,N$. It can be seen that (\ref{e.wf8}) is a
sparse block matrix. In fact, after a re-ordering of the terms
in the vectors, the resulting matrix is tri-block-diagonal.
Solving this tri-block-diagonal system by using Gaussian
elimination, first on the lower diagonal, then on the
upper diagonal yields the following equations in matrix form
%
\begin{equation}
\left[\begin{array}{ccccccc}
-\Pi_0&I&0&0&0&\cdots&0\\
I&\Theta_0&-F_0^T&0&0&\cdots&0\\
0&-F_0&\Gamma_0&I&0&\cdots&0\\
0&0&I&\Theta_1&-F_1^T&\cdots&0\\
&&&\cdots\cdots\cdots\cdots\cdots&&&\\
&&&\cdots\cdots\cdots\cdots\cdots&&&\\
&&&\cdots\cdots\cdots\cdots\cdots&&&\\
0&0&\cdots&0&-F_{N-1}&\Gamma_{N-1}&I\\
0&0&\cdots&0&0&I&\Theta_N\end{array}\right]
\left[\begin{array}{c}
\hat{\lambda}_0\\
\hat{x}_0\\
\hat{\lambda}_1\\
\hat{x}_1\\
\vdots\\
\hat{\lambda}_N\\
\hat{x}_N\end{array}\right]
=\left[\begin{array}{c}
m_0\\
\psi_0\\
0\\
\psi_1\\
0\\
\vdots\\
\psi_N\end{array}\right]
\label{e.wf8a}
\end{equation}
%
where $\Theta_k=H_k^TR_k^{-1}H_k$, $\Gamma_k=-G_kQ_kG_k^T$, and
$\psi_k=H_kR_k^{-1}y_k$.
Beginning by eliminating the lower diagonal of the matrix
we obtain for the first
equation
%
\begin{equation}
\hat{x}_0=\Pi_0\hat{\lambda}_0+m_0
\label{e.wf8.b}
\end{equation}
%
which when $\hat{\lambda}_0=0$ yields the usual Kalman filter initial
condition $\hat{x}_{0|-1}=m_0$. The second equation obtained by the
Gaussian elimination is
%
\begin{equation}
\hat{x}_0=(H_0^TR_0^{-1}H_0+\Pi_0^{-1})^{-1}(F_0^T\hat{\lambda}_1+H_0R_0{-1}y_0+\Pi_0^{-1}m_0)
\label{e.wf8.c}
\end{equation}
%
which, when $\hat{\lambda}_1=0$ and after some algebra,
yields the Kalman filter
filter-update equation
$\hat{x}_{0|0}=\hat{x}_{0|-1}
+P_{0|-1}H_0^T(H_0P_{0|-1}H_0^T+R_0)^{-1}(y_0-H_0\hat{x}_{0|-1})$.
Continuing in this way the Gaussian elimination yields the $2N$ equations
%
\begin{eqnarray}
\hat{x}_{k|k}&=&\hat{x}_{k|k-1}+P_{k|k-1}H_k^T(H_kP_{k|k-1}H_k^T+R_k)^{-1}(y_k-H_k\hat{x}_{k|k-1})\nonumber\\
\hat{x}_{k+1|k}&=&F_k\hat{x}_{k|k}
\label{e.wf9}
\end{eqnarray}
%
where
%
\begin{eqnarray}
P_{k|k}&=&P_{k|k-1}-P_{k|k-1}H_k^T(H_kP_{k|k-1}H_k^T+R_k)^{-1}H_kP_{k|k-1}\nonumber\\
P_{k+1|k}&=&F_kP_{k|k}F_k^T+G_kQ_kG_k^T.
\label{e.wf10}
\end{eqnarray}
%
After eliminating the lower diagonal of the matrix in (\ref{e.wf8a})
the upper diagonal is eliminated in the same way, however, now the
$\hat{\lambda}_k$ are not set to zero which results in the equations
%
\begin{equation}
\hat{x}_{k|N}=\hat{x}_{k|k}+A_k[\hat{x}_{k+1|N}-\hat{x}_{k+1|k}]
\label{e.wf11}
\end{equation}
%
where
%
\begin{eqnarray}
P_{k|N}&=&P_{k|k}+A_k[P_{k+1|N}-P_{k|k-1}]A_k^T\nonumber\\
A_k&=&P_{k|k}F_k^TP_{k|k-1}^{-1}
\label{e.wf12}
\end{eqnarray}
%
and where the $\hat{\lambda}_k$ have been identified as the $\hat{x}_{k|N}$.
It can be seen that equations (\ref{e.wf9}) and (\ref{e.wf10}) specify
the standard Kalman filter. Equations (\ref{e.wf11}) and (\ref{e.wf12})
yield the Rauch-Tung-Striebel smoother.
\subsection{How to Use the Function {\tt wiener}}
\index{function syntax!wiener@{\tt wiener}}
\index{Wiener filter!function syntax}
The syntax for the function {\tt wiener} is as follows
\begin{verbatim}
-->[xs,ps,xf,pf]=wf(y,x0,p0,f,g,h,q,r)
\end{verbatim}
The inputs
{\tt f}, {\tt g}, and {\tt h} are the system matrices in the interval
$[t_0,t_f]$. The construction of, for example, {\tt f} is accomplished
by arranging the individual matrices $F_k$ into a block-row matrix
$[F_0,F_1,\ldots,F_N]$. The other system matrices are identical in
construction. The inputs {\tt q} and {\tt r}
are the covariance matrices of dynamics and observation noise. These
matrices are also constructed in a fashion identical to that for {\tt f}.
The inputs {\tt x0} and {\tt p0} are the initial state estimate vector
and error variance matrix. The vector {\tt x0} must be in column form.
Finally, the input {\tt y} are the observations in the interval $[t_0,t_f]$.
The form of {\tt y} is a matrix where the first column is the observation
$y_0$, the second column $y_1$, etc. The outputs are the smoothed
estimates {\tt xs} of $x_0$, $x_1$, etc. arranged in a matrix of similar form
to {\tt y} and {\tt ps} which is arranged identically to the form
of the input matrices.
\subsection{Example}
\index{Wiener filter!example}
In this section, the {\tt wiener} function is demonstrated on
a two-dimensional problem. The input to the filter is synthetic
data obtained using the same system dynamics given to the {\tt wiener}
function. The system used in the example is as follows.
%
\begin{eqnarray}
x_{k+1}&=&\left[\begin{array}{cc}
1.15 & .1\\
0 & .8\end{array}\right]x_k+
\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]w_k\nonumber\\
y_k&=&\left[\begin{array}{cc}
1 & 0\\
0 & 1\end{array}\right]x_k+v_k\nonumber
\end{eqnarray}
%
where
%
\begin{eqnarray}
E\{w_kw_k^T\}&=&\left[\begin{array}{cc}
.01 & 0\\
0 & .01\end{array}\right]\nonumber\\
E\{v_kv_k^T\}&=&\left[\begin{array}{cc}
20 & 0\\
0 & 20\end{array}\right]\nonumber
\end{eqnarray}
%
and
%
\begin{eqnarray}
E\{x_0\}&=&\left[\begin{array}{c}
10\\
10\end{array}\right]\nonumber\\
E\{(x_0-m_0)(x_0-m_0)^T\}&=&\left[\begin{array}{cc}
100 & 0\\
0 & 100\end{array}\right]\nonumber
\end{eqnarray}
%
Figure~\ref{f.wf1} shows the results of using the {\tt wiener}
function on the synthetic data generated by the model above. Here
the dotted line indicates the actual state values generated by the
system. The circles on the dotted line serve to mark the actual state
locations. The solid line marked by circles indicates the Kalman filter
estimate of the state. The estimate of $x_0$ is located in the upper
left corner of the figure and the estimate of $x_{12}$ is located in
the lower right corner of the figure. As can be seen the initial
estimates obtained by the Kalman filter are not so good with respect
to the final estimates obtained by the filter. This is due to the
large initial error covariance matrix given for the initial
estimate of the state vector. The
solid line marked by stars is the smoothed Kalman filter estimate.
As can be seen, the final smoothed estimate is identical to that
of the regular Kalman filter. However, as the smoothed Kalman filter
estimate works its way back to the initial state value, the estimate
becomes greatly improved, as is to be expected since these states
are now estimated based on all of the observed data.
%
\input{\Figdir wf1.tex}
\dessin{{\tt exec('wf1.code')} Wiener Smoothing Filter}{f.wf1}
%
The Scilab code which generated the example in this section
is as follows.
\verbatok{\Diary Swf1.dia}
\end{verbatim}
%\section{Stochastic realization}
%\subsection{Spectral Factorization}
%\subsection{The {\tt sfact} primitive}
%\subsection{Spectral Factorization via Riccati equations}
%\end{document}