From bae77c1d02c38895fdf6b102c55eca4a29f86eeb Mon Sep 17 00:00:00 2001 From: Roger Labbe Date: Sun, 11 May 2014 20:44:25 -0700 Subject: [PATCH] Format changes and multidimensional KF content. Modified the css style to use colors and fonts more to my liking. Still needs tweaking. Added a lot of content to the multidimensional KF chapter. --- DogSensor.py | 26 + Gaussians.ipynb | 47 +- Kalman Filters.ipynb | 7 +- KalmanFilter.py | 75 +++ Multidimensional Kalman Filters.ipynb | 740 ++++++++++++++++++++++---- Untitled0.ipynb | 320 ++++++++++- mkf_internal.py | 82 +++ styles/custom.css | 63 ++- styles/custom2.css | 203 +++++++ styles/custom3.css | 60 +++ 10 files changed, 1482 insertions(+), 141 deletions(-) create mode 100644 DogSensor.py create mode 100644 KalmanFilter.py create mode 100644 styles/custom2.css create mode 100644 styles/custom3.css diff --git a/DogSensor.py b/DogSensor.py new file mode 100644 index 0000000..2ad5ba9 --- /dev/null +++ b/DogSensor.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun May 11 13:21:39 2014 + +@author: rlabbe +""" + +from __future__ import print_function, division + +import numpy.random as random +import math + +class DogSensor(object): + + def __init__(self, x0=0, velocity=1, noise=0.0): + """ x0 - initial position + velocity - (+=right, -=left) + noise - scaling factor for noise, 0== no noise + """ + self.x = x0 + self.velocity = velocity + self.noise = math.sqrt(noise) + + def sense(self): + self.x = self.x + self.velocity + return self.x + random.randn() * self.noise diff --git a/Gaussians.ipynb b/Gaussians.ipynb index 88516e1..90e4130 100644 --- a/Gaussians.ipynb +++ b/Gaussians.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:20b4feb883a71cab956a175006257f6c548b074ace2a845bd9dc5c1c1f9b7a91" + "signature": "sha256:df50da4d6f87f1215c25b03c3c6a3aaa704a4ed1187da87af0018921ba07e577" }, "nbformat": 3, "nbformat_minor": 0, @@ -17,8 +17,8 @@ "### Introduction\n", "\n", "The last chapter ended by discussing some of the drawbacks of the Discrete Bayesian filter. For many tracking and filtering problems our desire is to have a filter that is *unimodal* and *continuous*. That is, we want to model our system using floating point math (continuous) and to have only one belief represented (unimodal). For example, we want to say an aircraft is at (12.34381, -95.54321,2389.5) where that is latitude, longitude, and altidue. We do not want our filter to tell us \"it might be at (1,65,78) or at (34,656,98)\" That doesn't match our physical intuition of how the world works, and as we discussed, it is prohibitively expensive to compute.\n", - "

\n", - "

So we desire a unimodal, continuous way to represent probabilities that models how the real world works, and that is very computationally efficient to calculate. As you might guess from the chapter name, Gaussian distributions provide all of these features.
\n", + "\n", + ">So we desire a unimodal, continuous way to represent probabilities that models how the real world works, and that is very computationally efficient to calculate. As you might guess from the chapter name, Gaussian distributions provide all of these features.\n", "\n", "Before we go into the math, lets just look at a graph of the Gaussian distribution to get a sense of what we are talking about." ] @@ -47,7 +47,14 @@ "Probably this is immediately recognizable to you as a 'bell curve'. This curve is ubiquitious because under real world conditions most observations are distributed in such a manner. In fact, this is the bell curve for IQ (Intelligence Quotient). You've probably seen this before, and understand it. It tells us that the average IQ is 100, and that the number of people that have IQs higher or lower than that drops off as they get further away from 100. It's hard to see the exact number, but we can see that very few people have an IQ over 150 or under 50, but a lot have an IQ of 90 or 110. \n", "\n", "This curve is not unique to IQ distributions - a vast amount of natural phenomena exhibits this sort of distribution, including the sensors that we use in filtering problems. As we will see, it also has all the attributes that we are looking for - it represents a unimodal belief or value as a probabilitiy, it is continuous, and it is computationally efficient. We will soon discover that it also other desirable qualities that we do not yet recognize we need.\n", - "\n", + "

\n", + "

" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "#### Nomenclature\n", "\n", "A bit of nomenclature before we continue - this chart depicts the probability of of a *random variable* having any value between ($-\\infty..\\infty)$. For example, for this chart the probability of the variable being 100 is roughly 2.7%, whereas the probability of it being 80 is around 1%.\n", @@ -74,9 +81,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "

Don't be dissuaded by the equation if you haven't seen it before; you will not need to memorize or manipulate it. The computation of this function is stored in gaussian.py. You are free to look at the code if interested by using the '%load gaussian.py' magic. \n", + "

Don't be dissuaded by the equation if you haven't seen it before; you will not need to memorize or manipulate it. The computation of this function is stored in stats.py. \n", "\n", - "We will plot a Gaussian with a mean of 22 ($\\mu=22$), with a variance of 4 ($\\sigma^2=4$), and then discuss what this means. " + "> **Optional:** Let's remind ourselves how to look at a function stored in a file by using the *%load* magic. If you type *%load -s gaussian stats.py* into a code cell and then press CTRL-Enter, the notebook will create a new input cell and load the function into it.\n", + "\n", + " %load -s gaussian stats.py\n", + " \n", + " def gaussian(x, mean, var):\n", + " \"\"\"returns normal distribution for x given a \n", + " gaussian with the specified mean and variance. \n", + " \"\"\"\n", + " return math.exp((-0.5*(x-mean)**2)/var) / \\\n", + " math.sqrt(_two_pi*var)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
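
Here is a quick, self-contained check of what the function returns. This is only a sketch: it re-declares the same formula inline so the cell does not depend on the import path of *stats.py*.

    import math

    def gaussian(x, mean, var):
        # same computation as the gaussian() shown above
        return math.exp(-0.5 * (x - mean)**2 / var) / math.sqrt(2 * math.pi * var)

    print(gaussian(22, 22, 4))   # value at the mean, about 0.1995
    print(gaussian(24, 22, 4))   # two degrees away, about 0.1210

These are the 19.95% and 12.10% figures that appear in the discussion below.
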

We will plot a Gaussian with a mean of 22 $(\\mu=22)$, with a variance of 4 $(\\sigma^2=4)$, and then discuss what this means. " ] }, { @@ -97,7 +120,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "So what does this curve *mean*? Assume for a moment that we have a themometer, which reads 22$\\,^{\\circ}C$. No thermometer is perfectly accurate, and so we normally expect that thermometer will read $\\pm$ that temperature by some amount each time we read it. Furthermore, a theorem called **Central Limit Theorem** states that if we make many measurements that the measurements will be normally distributed. If that is true, then this chart can be interpreted as a continuous curve depicting our belief that the temperature is any given temperature. In this curve, we assign a probability of the temperature being exactly $22\\,^{\\circ}$ is 19.95%. Looking to the right, we assign the probability that the temperature is $24\\,^{\\circ}$ is 12.10%. Because of the curve's symmetry, the probability of $20\\,^{\\circ}$ is also 12.10%.\n", + "So what does this curve *mean*? Assume for a moment that we have a themometer, which reads 22$\\,^{\\circ}C$. No thermometer is perfectly accurate, and so we normally expect that thermometer will read $\\pm$ that temperature by some amount each time we read it. Furthermore, a theorem called **Central Limit Theorem** states that if we make many measurements that the measurements will be normally distributed. If that is true, then this chart can be interpreted as a continuous curve depicting our belief that the temperature is any given temperature. In this curve, we assign a probability of the temperature being exactly $22\\,^{\\circ}C$ is $19.95%$. Looking to the right, we assign the probability that the temperature is $24\\,^{\\circ}C$ is $12.10%$. Because of the curve's symmetry, the probability of $20\\,^{\\circ}C$ is also $12.10%$.\n", "\n", "So the mean ($\\mu$) is what it sounds like - the average of all possible probabilities. Because of the symmetric shape of the curve it is also the tallest part of the curve. The thermometer reads $22\\,^{\\circ}C$, so that is what we used for the mean. \n", "\n", @@ -107,7 +130,7 @@ "\n", "$$temp = \\mathcal{N}(22,4)$$\n", "\n", - "This is an **extremely important** result. Gaussians allow me to capture an infinite number of possible values with only two numbers! With the values $\\mu=22$ and $\\sigma^2=4$ I can compute the probability of the temperature being $22\\,^{\\circ}C$, of $20\\,^{\\circ}C$, of $87.3429\\,^{\\circ}C$, or any other arbitrary value.\n", + "This is an **extremely important** result. Gaussians allow me to capture an infinite number of possible values with only two numbers! With the values $\\mu=22$ and $\\sigma^2=4$ I can compute the probability of the temperature being $22\\,^{\\circ}C$, $20\\,^{\\circ}C$, $87.34\\,^{\\circ}C$, or any other arbitrary value.\n", "\n", "###### The Variance\n", "\n", @@ -140,9 +163,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "So what is this telling us? The blue gaussian is very narrow. It is saying that we believe x=23, and that we are very sure about that (90%). In contrast, the red gaussian also believes that x=23, but we are much less sure about that (18%%). Our believe that x=23 is lower, and so our belief about the likely possible values for x is spread out - we think it is quite likely that x=20 or x=26, for example. 
The blue gaussian has almost completely eliminated 22 or 24 as possible value - their probability is almost 0%, whereas the red curve considers them nearly as likely as 23.\n", + "So what is this telling us? The blue gaussian is very narrow. It is saying that we believe $x=23$, and that we are very sure about that $(90%)$. In contrast, the red gaussian also believes that $x=23$, but we are much less sure about that $(18%)$. Our believe that $x=23$ is lower, and so our belief about the likely possible values for $x$ is spread out - we think it is quite likely that $x=20$ or $x=26$, for example. The blue gaussian has almost completely eliminated $22$ or $24$ as possible value - their probability is almost $0%$, whereas the red curve considers them nearly as likely as $23$.\n", "\n", - "If we think back to the thermometer, we can consider these three curves as representing the readings from 3 different thermometers. The blue curve represents a very accurate thermometer, and the red one represents a fairly inaccurate one. Green of course represents one in between the two others. Note the very powerful property the Gaussian distribution affords us - we can entirely represent both the reading and the error of a thermometer with only two numbers - the mean and the variance.\n", + "If we think back to the thermometer, we can consider these three curves as representing the readings from three different thermometers. The blue curve represents a very accurate thermometer, and the red one represents a fairly inaccurate one. Green of course represents one in between the two others. Note the very powerful property the Gaussian distribution affords us - we can entirely represent both the reading and the error of a thermometer with only two numbers - the mean and the variance.\n", "\n", "The standard notation for a normal distribution for a random variable $X$ is just $X \\sim\\ \\mathcal{N}(\\mu,\\sigma^2)$ where $\\mu$ is the mean and $\\sigma^2$ is the variance. It may seem odd to use $\\sigma$ squared - why not just $\\sigma$? We will not go into great detail about the math at this point, but in statistics $\\sigma$ is the *standard deviation* of a normal distribution. *Variance* is defined as the square of the standard deviation, hence $\\sigma^2$.\n", "\n", @@ -216,7 +239,7 @@ "\n", "A typical textbook would directly launch into a multipage proof of the behavior of Gaussians under these operations, but I don't see the value in that right now. I think the math will be much more intuitive and clear if we just start developing a Kalman filter using Gaussians. I will provide the equations for multiplying and shifting Gaussians at the appropriate time. You will then be able to develop a physical intuition for what these operations do, rather than be forced to digest a lot of fairly abstract math.\n", "\n", - "The key point, which I will only assert for now, is that all the operations are very simple, and that they preserve the properties of the Gaussian. This is somewhat remarkable, in that the Gaussian is a nonlinear function, and typically if you multiply a nonlinear equation with itself you end up with a different equation. For example, the shape of $sin(x)sin(x)$ is very different from $sin(x)$. But the result of multiplying two Gaussians is yet another Gaussian. This is a fundamental discovery, and the key reason why Kalman filters are possible" + "The key point, which I will only assert for now, is that all the operations are very simple, and that they preserve the properties of the Gaussian. 
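
If you would like to see that claim numerically before we get to the equations, here is a minimal check. It assumes only numpy, and the mean/variance formulas for the product are simply asserted here; they are derived later in the book.

    import numpy as np

    def gaussian(x, mean, var):
        # same formula as before, vectorized over x
        return np.exp(-0.5 * (x - mean)**2 / var) / np.sqrt(2 * np.pi * var)

    xs = np.linspace(0, 40, 4000)
    prod = gaussian(xs, 22, 4) * gaussian(xs, 26, 9)   # multiply two Gaussian curves
    prod /= np.trapz(prod, xs)                         # renormalize to unit area

    # mean and variance the product should have (formulas derived later)
    var_p  = 4. * 9. / (4. + 9.)
    mean_p = (9.*22. + 4.*26.) / (4. + 9.)
    print(np.max(np.abs(prod - gaussian(xs, mean_p, var_p))))  # ~0: the curves coincide
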
This is somewhat remarkable, in that the Gaussian is a nonlinear function, and typically if you multiply a nonlinear equation with itself you end up with a different equation. For example, the shape of $sin(x)sin(x)$ is very different from $sin(x)$. But the result of multiplying two Gaussians is yet another Gaussian. This is a fundamental property, and the key reason why Kalman filters are possible." ] }, { @@ -242,7 +265,7 @@ "#format the book\n", "from IPython.core.display import HTML\n", "def css_styling():\n", - " styles = open(\"./styles/custom.css\", \"r\").read()\n", + " styles = open(\"./styles/custom2.css\", \"r\").read()\n", " return HTML(styles)\n", "css_styling()" ], diff --git a/Kalman Filters.ipynb b/Kalman Filters.ipynb index 29aa158..44aff1f 100644 --- a/Kalman Filters.ipynb +++ b/Kalman Filters.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:f5b17ce3aa0755a19e9550b31e0a75c9c255bb570b57e1e26c533de072549821" + "signature": "sha256:3a7b44bd3874a8149fcf6a20285e8a0855662094c0cb4dbb11f3ec3a2e1e7a3c" }, "nbformat": 3, "nbformat_minor": 0, @@ -644,8 +644,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this example I only plotted 10 data points so the output from the print statements would not overwhelm us. Now let's look at the filter's performance with more data. This time we will plot both the output of the filter and the variance.\n", - "\n" + "In this example I only plotted 10 data points so the output from the print statements would not overwhelm us. Now let's look at the filter's performance with more data. This time we will plot both the output of the filter and the variance." ] }, { @@ -1385,7 +1384,7 @@ "#format the book\n", "from IPython.core.display import HTML\n", "def css_styling():\n", - " styles = open(\"./styles/custom.css\", \"r\").read()\n", + " styles = open(\"./styles/custom2.css\", \"r\").read()\n", " return HTML(styles)\n", "css_styling()" ], diff --git a/KalmanFilter.py b/KalmanFilter.py new file mode 100644 index 0000000..371e42c --- /dev/null +++ b/KalmanFilter.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Oct 18 18:02:07 2013 + +@author: rlabbe +""" + +import numpy as np +import scipy.linalg as linalg +import matplotlib.pyplot as plt +import numpy.random as random + +class KalmanFilter: + + def __init__(self, dim): + self.x = 0 # estimate + self.P = np.matrix(np.eye(dim)) # uncertainty covariance + self.Q = 0 # motion uncertainty + self.u = np.matrix(np.zeros((dim,1))) # motion vector + self.F = 0 # state transition matrix + self.H = 0 # Measurment function (maps state to measurements) + self.R = np.matrix(np.eye(1)) # state uncertainty + self.I = np.matrix(np.eye(dim)) + + + def measure(self, Z, R=None): + """ + Add a new measurement with an optional state uncertainty (R). + The state uncertainty does not alter the class's state uncertainty, + it is used only for this call. 
+ """ + if R is None: R = self.R + + # measurement update + y = Z - (self.H * self.x) # error (residual) between measurement and prediction + S = (self.H * self.P * self.H.T) + R # project system uncertainty into measurment space + measurement noise(R) + + + K = self.P * self.H.T * linalg.inv(S) # map system uncertainty into kalman gain + + self.x = self.x + (K*y) # predict new x with residual scaled by the kalman gain + self.P = (self.I - (K*self.H))*self.P # and compute the new covariance + + def predict(self): + # prediction + self.x = (self.F*self.x) + self.u + self.P = self.F * self.P * self.F.T + self.Q + + +if __name__ == "__main__": + f = KalmanFilter (dim=2) + + f.x = np.matrix([[200.], [0.]]) # initial np.matrix([[z],[0.]],dtype=float)state (location and velocity) + f.F = np.matrix([[1.,1.],[0.,1.]]) # state transition matrix + f.H = np.matrix([[1.,0.]]) # Measurement function + f.R *= 5 # state uncertainty + f.P *= 1000. # covariance matrix + f.R = 5 + f.Q = 0 + + + + + ps = [] + zs = [] + for t in range (1000): + z = t + random.randn()*10 + f.measure (z) + f.predict() + ps.append (f.x[0,0]) + zs.append(z) + plt.plot (ps) + # plt.ylim([0,2]) + plt.plot(zs) + plt.show() \ No newline at end of file diff --git a/Multidimensional Kalman Filters.ipynb b/Multidimensional Kalman Filters.ipynb index 1db4a93..15d820c 100644 --- a/Multidimensional Kalman Filters.ipynb +++ b/Multidimensional Kalman Filters.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:fd542341131473089289548f12ba7f78ac88663ce51e586ff357c5fb48c3eb91" + "signature": "sha256:5065da4ac6135de8bf6874ebb71a6914d772f4084a148dbe9d30cddd37f45030" }, "nbformat": 3, "nbformat_minor": 0, @@ -12,9 +12,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Multidimensional Kalman Filters\n", - "=====\n", + "

# Multidimensional Kalman Filters
\n", "\n", + "### Introduction\n", "The techniques in the last chapter are very powerful, but they only work in one dimension. The gaussians represent a mean and variance that are scalars - real numbers. They provide no way to represent multidimensional data, such as the position of a dog in a field. You may retort that you could use two Kalman filters for that case, one tracks the x coordinate and the other tracks the y coordinate. That does work in some cases, but put that thought aside, because soon you will see some enormous benefits to implementing the multidimensional case.\n", "\n", "\n", @@ -22,10 +23,13 @@ "\n", "\n", "What might a *multivariate normal distribution* look like? In this context, multivariate just means multiple variables. Our goal is to be able to represent a normal distribution across multiple dimensions. Consider the 2 dimensional case. Let's say we believe that $x = 2$ and $y = 7$. Therefore we can see that for $N$ dimensions, we need $N$ means, like so:\n", - "$$ \\mu = \\begin{bmatrix}{\\mu}_1\\\\{\\mu}_2\\\\ \\vdots \\\\{\\mu}_n\\end{bmatrix} \n", + "\n", + "$$\n", + "\\mu = \\begin{bmatrix}{\\mu}_1\\\\{\\mu}_2\\\\ \\vdots \\\\{\\mu}_n\\end{bmatrix}\n", "$$\n", "\n", "Therefore for this example we would have\n", + "\n", "$$\n", "\\mu = \\begin{bmatrix}2\\\\7\\end{bmatrix} \n", "$$\n", @@ -34,14 +38,19 @@ "\n", "$$\\sigma^2 = \\begin{bmatrix}10\\\\8\\end{bmatrix}$$ \n", "\n", - "While this is possible, it does not consider the more general case. For example, suppose we were tracking house prices vs total $m^2$ of the floor plan. These numbers are *correlated*. It is not an exact correlation, but in general houses in the same neighborhood are more expensive if they have a larger floor plan. We want a way to express not only what we think the variance is in the price and the $m^2$, but also the degree to which they are correlated. It turns out that we use the following matrix to denote *covariances* with multivariate normal distributions. You might guess, correctly, that *covariance* is short for *correlated variances*.\n", - "\n", + "While this is possible, it does not consider the more general case. For example, suppose we were tracking house prices vs total $m^2$ of the floor plan. These numbers are *correlated*. It is not an exact correlation, but in general houses in the same neighborhood are more expensive if they have a larger floor plan. We want a way to express not only what we think the variance is in the price and the $m^2$, but also the degree to which they are correlated. It turns out that we use the following matrix to denote *covariances* with multivariate normal distributions. You might guess, correctly, that *covariance* is short for *correlated variances*." 
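
Before writing the general matrix down, a concrete example may help. This is only a sketch with made-up numbers, and it assumes numpy; np.cov builds exactly the kind of matrix we are about to define.

    import numpy as np

    # hypothetical data: floor plan size in m^2 and price (in $1000s) for five houses
    size  = np.array([ 80., 100., 120., 140., 160.])
    price = np.array([200., 240., 230., 300., 320.])

    # variances appear on the diagonal, covariances off the diagonal
    print(np.cov(size, price))

The positive off-diagonal entries are the numerical signature of the correlation between size and price.
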
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "$$\n", "\\Sigma = \\begin{pmatrix}\n", - " {{\\sigma}_{1}}^2 & p{\\sigma}_{1}{\\sigma}_{2} & \\cdots & p{\\sigma}_{1}{\\sigma}_{n} \\\\\n", - " p{\\sigma}_{2}{\\sigma}_{1} &{{\\sigma}_{2}}^2 & \\cdots & p{\\sigma}_{2}{\\sigma}_{n} \\\\\n", + " \\sigma_1^2 & p\\sigma_1\\sigma_2 & \\cdots & p\\sigma_1\\sigma_n \\\\\n", + " p\\sigma_2\\sigma_1 &\\sigma_2^2 & \\cdots & p\\sigma_2\\sigma_n \\\\\n", " \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", - " p{\\sigma}_{n}{\\sigma}_{1} & p{\\sigma}_{n}{\\sigma}_{2} & \\cdots & {{\\sigma}_{n}}^2\n", + " p\\sigma_n\\sigma_1 & p\\sigma_n\\sigma_2 & \\cdots & \\sigma_n^2\n", " \\end{pmatrix}\n", "$$\n", "\n", @@ -52,11 +61,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, without explanation, here is the full equation for the multivarate normal distribution.\n", + "Now, without explanation, here is the full equation for the multivarate normal distribution in $n$ dimensions.\n", "\n", "$$\\mathcal{N}(\\mu,\\,\\Sigma) = (2\\pi)^{-\\frac{n}{2}}|\\Sigma|^{-\\frac{1}{2}}\\, e^{ -\\frac{1}{2}(\\mathbf{x}-\\mu)'\\Sigma^{-1}(\\mathbf{x}-\\mu) }$$\n", "\n", - "I urge you to not try to remember this function. We will program it once in a function and then call it when we need to compute a specific value. However, if you look at it briefly you will note that it looks quite similar to the *univarate normal distribution* except it uses matrices instead of scalar values, and the root of $\\pi$ is scaled by $N$.\n", + "I urge you to not try to remember this function. We will program it in a Python function and then call it when we need to compute a specific value. However, if you look at it briefly you will note that it looks quite similar to the *univarate normal distribution* except it uses matrices instead of scalar values, and the root of $\\pi$ is scaled by $n$. Here is the *univariate* equation for reference:\n", "\n", "$$ \n", "f(x, \\mu, \\sigma) = \\frac{1}{\\sigma\\sqrt{2\\pi}} e^{{-\\frac{1}{2}}{(x-\\mu)^2}/\\sigma^2 }\n", @@ -64,17 +73,16 @@ "\n", "If you are reasonably well-versed in linear algebra this equation should look quite managable; if not, don't worry! If you want to learn the math we will cover it in detail in the next optional chapter. If you choose to skip that chapter the rest of this book should still be managable for you\n", "\n", - "I have programmed it and saved it in the file gaussian.py with the function name multivariate_gaussian. I am not showing the code here because I have taken advantage of the linear algebra solving apparatus of numpy to efficiently compute a solution - the code does not correspond to the equation in a one to one manner. If you wish to view the code, I urge you to either load it in an editor, or load it into this worksheet by putting '%load gaussian.py' without the quotes in the next cell and executing it with ctrl-enter. " + "I have programmed it and saved it in the file *stats.py* with the function name *multivariate_gaussian*. I am not showing the code here because I have taken advantage of the linear algebra solving apparatus of numpy to efficiently compute a solution - the code does not correspond to the equation in a one to one manner. If you wish to view the code, I urge you to either load it in an editor, or load it into this worksheet by putting *%load -s multivariate_gaussian stats.py* in the next cell and executing it with ctrl-enter. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> **Note**: As of version 0.14 scipy has implemented the multivariate normal equation with the function **scipy.stats.multivariate_normal()**. It is superior to my function in several ways. First, it is implemented in Fortran, and is therefore faster than mine. Second, it implements a 'frozen' form where you set the mean and covariance once, and then calculate the probability for any number of values for x over any arbitrary number of calls. This is much more efficient then recomputing everything in each call. So, if you have version 0.14 or later you may want to substitute my function for the built in version. Use **scipy.version.version** to get the version number. Note that I deliberately named my function **multivariate_gaussian()** to ensure it is never confused with the built in version.\n", + ">As of version 0.14 scipy.stats has implemented the multivariate normal equation with the function **multivariate_normal()**. It is superior to my function in several ways. First, it is implemented in Fortran, and is therefore faster than mine. Second, it implements a 'frozen' form where you set the mean and covariance once, and then calculate the probability for any number of values for x over any arbitrary number of calls. This is much more efficient then recomputing everything in each call. So, if you have version 0.14 or later you may want to substitute my function for the built in version. Use **scipy.version.version** to get the version number. I deliberately named my function **multivariate_gaussian()** to ensure it is never confused with the built in version.\n", "\n", - "> If you intend to use Python for Kalman filters, you will want to read the tutorial for the stats module, which explains 'freezing' distributions and other very useful features. As of this date, it includes an example of using the multivariate_normal function, which does work a bit differently from my function.\n", - "http://docs.scipy.org/doc/scipy/reference/tutorial/stats.html" + "> If you intend to use Python for Kalman filters, you will want to read the tutorial for the scipy.stats module, which explains 'freezing' distributions and other very useful features. As of this date, it includes an example of using the multivariate_normal function, which does work a bit differently from my function." ] }, { @@ -93,9 +101,9 @@ "source": [ "Let's use it to compute a few values just to make sure we know how to call and use the function, and then move on to more interesting things.\n", "\n", - "First, let's find the probability for our dog being at (2.5, 7.3) if we believe he is at (2,7) with a variance of 8 for x and a variance of 10 for y. This function requires us to pass everything in as numpy arrays (we will soon provide a more robust version that works with numpy matrices, numpy arrays, and/or scalars in any combinations. That code contains a lot of boilerplate which obscures the algorithm).\n", + "First, let's find the probability for our dog being at (2.5, 7.3) if we believe he is at (2,7) with a variance of 8 for $x$ and a variance of 10 for $y$. This function requires us to pass everything in as numpy arrays (we will soon provide a more robust version that works with numpy matrices, numpy arrays, and/or scalars in any combinations. 
That code contains a lot of boilerplate which obscures the algorithm).\n", "\n", - "So we set x to (2.5,7.3)" + "Start by setting $x$ to (2.5,7.3):" ] }, { @@ -130,14 +138,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we have to define our covariance matrix. In the problem statement we did not mention any correlation between x and y, and we will assume there is none. This makes sense; a dog can choose to independently wander in either the x direction or y direction without affecting the other. If there is no correlation between the values you just fill in the diagonal of the covariance matrix with the variances:" + "Finally, we have to define our covariance matrix. In the problem statement we did not mention any correlation between $x$ and $y$, and we will assume there is none. This makes sense; a dog can choose to independently wander in either the $x$ direction or $y$ direction without affecting the other. If there is no correlation between the values you just fill in the diagonal of the covariance matrix with the variances. I will use the seemingly arbitrary name $P$ for the covariance matrix. The Kalman filters use the name $P$ for this matrix, so I will introduce the terminology now to avoid explaining why I change the name later. " ] }, { "cell_type": "code", "collapsed": false, "input": [ - "cov = np.array([[8.,0],[0,10.]])" + "P = np.array([[8.,0],[0,10.]])" ], "language": "python", "metadata": {}, @@ -154,7 +162,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "print(multivariate_gaussian(x,mu,cov))" + "print(multivariate_gaussian(x,mu,P))" ], "language": "python", "metadata": {}, @@ -174,7 +182,7 @@ "from __future__ import print_function\n", "\n", "x = np.array([2,7])\n", - "print(multivariate_gaussian(x,mu,cov))" + "print(\"Probability dog is at (2,7) is %.2f%%\" % (multivariate_gaussian(x,mu,P) * 100.))" ], "language": "python", "metadata": {}, @@ -184,7 +192,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These numbers are not easy to interpret. Let's plot this in 3D, with the z (up) coordinate being the probability." + "These numbers are not easy to interpret. Let's plot this in 3D, with the $z$ (up) coordinate being the probability." ] }, { @@ -200,29 +208,26 @@ "import numpy as np\n", "\n", "pylab.rcParams['figure.figsize'] = 12,6\n", - "cov = np.array([[8.,0],[0,10.]])\n", - "\n", + "P = np.array([[8.,0],[0,10.]])\n", "mu = np.array([2,7])\n", "\n", - "xs, ys = np.arange(-8, 13, .75), np.arange(-8, 20, .75)\n", + "xs, ys = np.arange(-8, 13, .5), np.arange(-8, 20, .5)\n", "xv, yv = np.meshgrid (xs, ys)\n", "\n", - "zs = np.array([100.* multivariate_gaussian(np.array([x,y]),mu,cov) \\\n", + "zs = np.array([100.* multivariate_gaussian(np.array([x,y]),mu,P) \\\n", " for x,y in zip(np.ravel(xv), np.ravel(yv))])\n", "zv = zs.reshape(xv.shape)\n", "\n", "ax = plt.figure().add_subplot(111, projection='3d')\n", - "ax.plot_surface(xv, yv, zv)\n", + "ax.plot_surface(xv, yv, zv, rstride=1, cstride=1, cmap=cm.autumn)\n", "\n", "ax.set_xlabel('X')\n", "ax.set_ylabel('Y')\n", "\n", "ax.contour(xv, yv, zv, zdir='x', offset=-9, cmap=cm.autumn)\n", "ax.contour(xv, yv, zv, zdir='y', offset=20, cmap=cm.BuGn)\n", - "plt.xlim((-10,20)) \n", - "plt.ylim((-10,20)) \n", - "\n", - "plt.show()\n" + "plt.xlim((-10,15))\n", + "plt.ylim((-10,20))" ], "language": "python", "metadata": {}, @@ -232,9 +237,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The result is clearly a 3D bell shaped curve. 
We can see that the gaussian is centered around (2,7), and that the probability quickly drops away in all directions. On the sides of the plot I have drawn the Gaussians for x in greens and for y in orange.\n", + "The result is clearly a 3D bell shaped curve. We can see that the gaussian is centered around (2,7), and that the probability quickly drops away in all directions. On the sides of the plot I have drawn the Gaussians for $x$ in greens and for $y$ in orange.\n", "\n", - "As beautiful as this is, it is perhaps a bit hard to get useful information. For example, it is not easy to tell if x and y both have the same variance or not. So for most of the rest of this book we will display multidimensional gaussian using contour plots. I will use some helper functions in gaussian.py to plot them. If you are interested in linear algebra go ahead and look at the code used to produce these contours, otherwise feel free to ignore it." + "As beautiful as this is, it is perhaps a bit hard to get useful information. For example, it is not easy to tell if $x$ and $y$ both have the same variance or not. So for most of the rest of this book we will display multidimensional Gaussian using contour plots. I will use some helper functions in *gaussian.py* to plot them. If you are interested in linear algebra go ahead and look at the code used to produce these contours, otherwise feel free to ignore it." ] }, { @@ -243,21 +248,22 @@ "input": [ "import stats\n", "\n", - "cov = np.array([[2,0],[0,2]])\n", - "e = stats.sigma_ellipse (cov, 2, 7)\n", + "P = np.array([[2,0],[0,2]])\n", + "e = stats.sigma_ellipse (P, 2, 7)\n", "plt.subplot(131)\n", "stats.plot_sigma_ellipse(e, '|2 0|\\n|0 2|')\n", "\n", "\n", - "cov = np.array([[2,0],[0,9]])\n", - "e = stats.sigma_ellipse (cov, 2, 7)\n", + "P = np.array([[2,0],[0,9]])\n", + "e = stats.sigma_ellipse (P, 2, 7)\n", "plt.subplot(132)\n", "stats.plot_sigma_ellipse(e, '|2 0|\\n|0 9|')\n", "\n", "plt.subplot(133)\n", - "cov = np.array([[2,1.2],[1.2,3]])\n", - "e = stats.sigma_ellipse (cov, 2, 7)\n", + "P = np.array([[2,1.2],[1.2,3]])\n", + "e = stats.sigma_ellipse (P, 2, 7)\n", "stats.plot_sigma_ellipse(e,'|2 1.2|\\n|1.2 2|')\n", + "plt.tight_layout()\n", "plt.show()" ], "language": "python", @@ -270,26 +276,48 @@ "source": [ "From a mathematical perspective these display the values that the multivariate gaussian takes for a specific sigma (in this case $\\sigma^2=1$. Think of it as taking a horizontal slice through the 3D surface plot we did above. However, thinking about the physical interpretation of these plots clarifies their meaning.\n", "\n", - "The first plot uses mean and the covariance matrices $\n", - "\\mu =\\begin{bmatrix}2\\\\7\\end{bmatrix}, cov = \\begin{bmatrix}2&0\\\\0&2\\end{bmatrix}$. Let this be our current belief about the position of our dog in a field. In other words, we believe that he is positioned at (2,7) with a variance of $\\sigma^2=2$ for both x and y. The contour plot shows where we believe the dog is located with the '+' in the center of the ellipse. The ellipse shows the boundary for the $1\\sigma^2$ probability - points where the dog is quite likely to be based on our current knowledge. Of course, the dog might be very far from this point, as Gaussians allow the mean to be any value. For example, the dog could be at (3234.76,189989.62), but that has vanishing low probability of being true. Generally speaking displaying the $1\\sigma^2$ to $2\\sigma^2$ contour captures the most likely values for the distribution. 
An equivelent way of thinking about this is the circle/ellipse shows us the amount of error in our belief. A tiny circle would indicate that we have a very small error, and a very large circle indicates a lot of error in our belief. We will use this throughout the rest of the book to display and evaluate the accuracy of our filters at any point in time. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The second plot uses mean and the covariance matrices $\n", - "\\mu =\\begin{bmatrix}2\\\\7\\end{bmatrix}, cov = \\begin{bmatrix}2&0\\\\0&9\\end{bmatrix}$. This time we use a different variance for x (2) vs y (9). The result is an ellipse. When we look at it we can immediately tell that we have a lot more uncertainty in the y value vs the x value. Our belief that the value is (2,7) is the same in both cases, but errors are different. This sort of thing happens naturally as we track objects in the world - one sensor has a better view of the object, or is closer, than another sensor, and so we end up with different error rates in the different axis." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The third plot uses mean and the covariance matrices $\n", - "\\mu =\\begin{bmatrix}2\\\\7\\end{bmatrix}, cov = \\begin{bmatrix}2&1.2\\\\1.2&2\\end{bmatrix}$. This is the first contour that has values in the off-diagonal elements of $cov$, and this is the first contour plot with a slanted ellipse. This is not a coincidence. The two facts are telling use the same thing. A slanted ellipse tells us that the x and y values are somehow **correlated**. We denote that in the covariance matrix with values off the diagonal. What does this mean in physical terms? Think of trying to park your car in a parking spot. You can not pull up beside the spot and then move sideways into the space because most cars cannot go purely sideways. $x$ and $y$ are not independent. This is a consequence of the steering system in a car. When your tires are turned the car rotates around its rear axle while moving forward. Or think of a horse attached to a pivoting exercise bar in a corral. The horse can only walk in circles, he cannot vary $x$ and $y$ independently, which means he cannot walk straight forward to to the side. If $x$ changes, $y$ must also change in a defined way. \n", + "The first plot uses the mean and covariance matrices of\n", + "$$\n", + "\\begin{align*}\n", + "\\mu &= \\begin{bmatrix}2\\\\7\\end{bmatrix} \\\\\n", + "\\sigma^2 &= \\begin{bmatrix}2&0\\\\0&2\\end{bmatrix}\n", + "\\end{align*}\n", + "$$\n", "\n", - "So when we see this ellipse we know that $x$ and $y$ are correlated, and that the correlation is \"strong\". I will not prove it here, but a 45 $^{\\circ}$ angle denotes complete correlation between $x$ and $y$, whereas $0$ and $90$ denote no correlation at all. Those who are familiar with this math will be objecting quite strongly, as this is actually quite sloppy language that does not adress all of the mathematical issues. They are right, but for now this is a good first approximation to understanding these ellipses from a physical interpretation point of view. The size of the ellipse shows how much error we have in each axis, and the slant shows how strongly correlated the values are.\n", + "Let this be our current belief about the position of our dog in a field. In other words, we believe that he is positioned at (2,7) with a variance of $\\sigma^2=2$ for both x and y. 
The contour plot shows where we believe the dog is located with the '+' in the center of the ellipse. The ellipse shows the boundary for the $1\\sigma^2$ probability - points where the dog is quite likely to be based on our current knowledge. Of course, the dog might be very far from this point, as Gaussians allow the mean to be any value. For example, the dog could be at (3234.76,189989.62), but that has vanishing low probability of being true. Generally speaking displaying the $1\\sigma^2$ to $2\\sigma^2$ contour captures the most likely values for the distribution. An equivelent way of thinking about this is the circle/ellipse shows us the amount of error in our belief. A tiny circle would indicate that we have a very small error, and a very large circle indicates a lot of error in our belief. We will use this throughout the rest of the book to display and evaluate the accuracy of our filters at any point in time. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The second plot uses the mean and covariance matrices of\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "\\mu &=\\begin{bmatrix}2\\\\7\\end{bmatrix} \\\\\n", + "\\sigma^2 &= \\begin{bmatrix}2&0\\\\0&9\\end{bmatrix}\n", + "\\end{align*}\n", + "$$\n", + "\n", + "This time we use a different variance for $x$ (2) vs $y$ (9). The result is an ellipse. When we look at it we can immediately tell that we have a lot more uncertainty in the $y$ value vs the $x$ value. Our belief that the value is (2,7) is the same in both cases, but errors are different. This sort of thing happens naturally as we track objects in the world - one sensor has a better view of the object, or is closer, than another sensor, and so we end up with different error rates in the different axis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The third plot uses the mean and covariance matrices of:\n", + "$$\n", + "\\begin{align*}\n", + "\\mu &=\\begin{bmatrix}2\\\\7\\end{bmatrix} \\\\\n", + "\\sigma^2 &= \\begin{bmatrix}2&1.2\\\\1.2&2\\end{bmatrix}\n", + "\\end{align*}\n", + "$$\n", + "\n", + "This is the first contour that has values in the off-diagonal elements of $cov$, and this is the first contour plot with a slanted ellipse. This is not a coincidence. The two facts are telling use the same thing. A slanted ellipse tells us that the $x$ and $y$ values are somehow **correlated**. We denote that in the covariance matrix with values off the diagonal. What does this mean in physical terms? Think of trying to park your car in a parking spot. You can not pull up beside the spot and then move sideways into the space because most cars cannot go purely sideways. $x$ and $y$ are not independent. This is a consequence of the steering system in a car. When your tires are turned the car rotates around its rear axle while moving forward. Or think of a horse attached to a pivoting exercise bar in a corral. The horse can only walk in circles, he cannot vary $x$ and $y$ independently, which means he cannot walk straight forward to to the side. If $x$ changes, $y$ must also change in a defined way. \n", + "\n", + "So when we see this ellipse we know that $x$ and $y$ are correlated, and that the correlation is \"strong\". I will not prove it here, but a $45^\\circ$ angle denotes complete correlation between $x$ and $y$, whereas $0^\\circ$ and $90^\\circ$ denote no correlation at all. 
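
If you want to put a number on "strong", divide the off-diagonal covariance by the product of the two standard deviations. This gives the correlation coefficient, which runs from $-1$ to $1$. A minimal sketch, assuming numpy and using the covariance from the third plot:

    import numpy as np

    P = np.array([[2.,  1.2],
                  [1.2, 2. ]])    # covariance matrix from the third plot

    rho = P[0, 1] / np.sqrt(P[0, 0] * P[1, 1])   # covariance scaled by the std devs
    print(rho)                                   # 0.6 - correlated, but far from perfectly

The slant of the ellipse is just a visual stand-in for this number.
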
Those who are familiar with this math will be objecting quite strongly, as this is actually quite sloppy language that does not adress all of the mathematical issues. They are right, but for now this is a good first approximation to understanding these ellipses from a physical interpretation point of view. The size of the ellipse shows how much error we have in each axis, and the slant shows how strongly correlated the values are.\n", "**IS THIS TRUE???**\n", "\n", "A word about **correlation** and **independence**. If variables are **independent** they can vary separately. If you walk in an open field, you can move in the $x$ direction (east-west), the $y$ direction(north-south), or any combination thereof. Independent variables are always also **uncorrelated**. Except in special cases, the reverse does not hold true. Variables can be uncorrelated, but dependent. For example, consider the pair$(x,y)$ where $y=x^2$. Correlation is a linear measurement, so $x$ and $y$ are uncorrelated. However, they are obviously dependent on each other. ** wikipedia article 'correlation and dependence' claims multivariate normals are a special case, where the correlation coeff $p$ completely defines the dependence. FIGURE THIS OUT!**" @@ -299,7 +327,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "###Kalman Filter Basics\n", + "###Unobserved Variables\n", "\n", "Let's say we are tracking an aircraft and we get the following data for the $x$ coordinate at time $t$=1,2, and 3 seconds. What does your intuition tell you the value of $x$ will be at time $t$=4 seconds?\n" ] @@ -308,10 +336,8 @@ "cell_type": "code", "collapsed": false, "input": [ - "plt.scatter ([1,2,3],[1,2,3])\n", - "plt.xlim([0,4])\n", - "plt.ylim([0,4])\n", - "plt.show()" + "import mkf_internal\n", + "mkf_internal.show_position_chart()" ], "language": "python", "metadata": {}, @@ -321,18 +347,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It appears that the aircraft is flying in a straight line because we can draw a line between the three points, and we know that aircraft cannot turn on a dime. The most reasonable guess is that $x$=4 at $t$=4. I will depict that below with a green square to depict the predictions." + "It appears that the aircraft is flying in a straight line because we can draw a line between the three points, and we know that aircraft cannot turn on a dime. The most reasonable guess is that $x$=4 at $t$=4. I will depict that with a green arrow." ] }, { "cell_type": "code", "collapsed": false, "input": [ - "plt.scatter ([1,2,3],[1,2,3])\n", - "plt.xlim([0,5]); plt.ylim([0,5])\n", - "plt.plot([0,5],[0,5],'r')\n", - "plt.scatter ([4], [4], c='g', marker='s',s=200)\n", - "plt.show()" + "mkf_internal.show_position_prediction_chart()" ], "language": "python", "metadata": {}, @@ -342,21 +364,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If this is data from a Kalman filter, then each point has both a mean and variance. Let's try to show that by showing the approximate error for each point. Don't worry about why I am using a covariance matrix to depict the variance at this point, it will become clear in a few paragraphs. The intent at this point is to show that while we have$x$=1,2,3 that there is a lot of error associated with each measurement." + "If this is data from a Kalman filter, then each point has both a mean and variance. Let's try to show that by showing the approximate error for each point. 
Don't worry about why I am using a covariance matrix to depict the variance at this point, it will become clear in a few paragraphs. The intent at this point is to show that while we have $x$=1,2,3 that there is a lot of error associated with each measurement." ] }, { "cell_type": "code", "collapsed": false, "input": [ - "cov = np.array([[0.003,0], [0,12]])\n", - "sigma=[0.5,1.,1.5,2]\n", - "e1 = stats.sigma_ellipses(cov, x=1, y=1, sigma=sigma)\n", - "e2 = stats.sigma_ellipses(cov, x=2, y=2, sigma=sigma)\n", - "e3 = stats.sigma_ellipses(cov, x=3, y=3, sigma=sigma)\n", - "stats.plot_sigma_ellipses([e1, e2, e3], axis_equal=True,x_lim=[0,4],y_lim=[0,15])\n", - "plt.ylim([0,11])\n", - "plt.show()" + "mkf_internal.show_x_error_char()" ], "language": "python", "metadata": {}, @@ -368,25 +383,16 @@ "source": [ "We can see that there is a lot of error associated with each value of $x$. We could write a 1D Kalman filter as we did in the last chapter, but suppose this is the output of that filter, and not just raw sensor measurements. Are we out of luck?\n", "\n", - "Let us think about how we predicted that $x$=4 at $t$=4. In one sense we just drew a straight line between the points and saw where it lay at $t$=4. My constant refrain: what is the physical interpretation of that? What is the difference in $x$ over time? What is $\\frac{\\partial x}{\\partial t}$? The derivative, or difference in distance over time is *velocity*. \n", + "Let us think about how we predicted that $x$=4 at $t$=4. In one sense we just drew a straight line between the points and saw where it lay at $t$=4. My constant refrain: what is the physical interpretation of that? What is the difference in $x$ over time? In other words, what is $\\frac{\\partial x}{\\partial t}$? The derivative, or difference in distance over time is *velocity*. \n", "\n", - "This is the **key point** in Kalman filters, so read carefully! Our sensor is only detecting the position of the aircraft (how doesn't matter). It does not have any kind of sensor that provides velocity to us. But based on the position estimates we can compute velocity. In Kalman filters we would call the velocity an **unobserved variable**. Unobserved means what it sounds like - there is no sensor that is measuring velocity directly. Since the velocity is based on the position, and the position has error, the velocity will have error as well. What happens if we draw the velocity errors over the positions errors?" + "This is the **key point** in Kalman filters, so read carefully! Our sensor is only detecting the position of the aircraft (how doesn't matter). It does not have any kind of sensor that provides velocity to us. But based on the position estimates we can compute velocity. In Kalman filters we would call the velocity an *unobserved variable*. Unobserved means what it sounds like - there is no sensor that is measuring velocity directly. Since the velocity is based on the position, and the position has error, the velocity will have error as well. What happens if we draw the velocity errors over the positions errors?" 
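
Before looking at that plot, here is a tiny numerical illustration of why the computed velocity inherits the position noise. This is a sketch with made-up numbers, assuming numpy.

    import numpy as np

    np.random.seed(3)
    true_pos = np.arange(1., 11.)                    # the target really moves 1 unit per step
    measured = true_pos + np.random.randn(10) * 0.5  # noisy position measurements

    velocity = np.diff(measured)                     # velocity estimated by differencing
    print(velocity)    # values scatter around 1.0 - the noise carries straight over
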
] }, { "cell_type": "code", "collapsed": false, "input": [ - "from matplotlib.patches import Ellipse\n", - "\n", - "cov = np.array([[1,1],[1,1.1]])\n", - "ev = stats.sigma_ellipses(cov, x=2, y=2, sigma=sigma)\n", - "\n", - "isct = Ellipse(xy=(2,2), width=.2, height=1.2, edgecolor='r', fc='None', lw=4)\n", - "plt.figure().gca().add_artist(isct)\n", - "stats.plot_sigma_ellipses([e1, e2, e3, ev], axis_equal=True,x_lim=[0,4],y_lim=[0,15])\n", - "plt.ylim([0,11])\n", - "plt.show()" + "mkf_internal.show_x_with_unobserved()" ], "language": "python", "metadata": {}, @@ -406,7 +412,11 @@ "metadata": {}, "source": [ "### Kalman Filter Algorithm\n", - "So in general terms we can show how a multidimensional Kalman filter works. In the example above, we compute velocity from the previous position measurements using something called the **measurement function**. Then we predict the next position by using the current estimate and something called the **state transition function**. In our example above, *new_position = old_position + velocity*time*. Next, we take the measurement from the sensor, and compare it to the prediction we just made. In a world with perfect sensors and perfect airplanes the prediction will always match the measured value. In the real world they will always be at least slightly different. We call the difference between the two the **residual**. Finally, we use something called the **Kalman gain** to update our estimate to be somewhere between the measured position and the predicted position. I will not describe how the gain is set, but suppose we had perfect confidence in our measurement - no error is possible. Then, clearly, we would set the gain so that 100% of the position came from the measurement, and 0% from the prediction. At the other extreme, if he have no confidence at all in the sensor (maybe it reported a hardware fault), we would set the gain so that 100% of the position came from the prediction, and 0% from the measurement. In normal cases, we will take a ratio of the two: maybe 53% of the measurement, and 47% of the prediction. The gain is updated on every cycle based on the variance of the variables (in a way yet to be explained). It should be clear that if the variance of the measurement is low, and the variance of the prediction is high we will favor the measurement, and vice versa. \n", + "So in general terms we can show how a multidimensional Kalman filter works. In the example above, we compute velocity from the previous position measurements using something called the **measurement function**. Then we predict the next position by using the current estimate and something called the **state transition function**. In our example above,\n", + "\n", + "$$new\\_position = old\\_position + velocity*time$$ \n", + "\n", + "Next, we take the measurement from the sensor, and compare it to the prediction we just made. In a world with perfect sensors and perfect airplanes the prediction will always match the measured value. In the real world they will always be at least slightly different. We call the difference between the two the **residual**. Finally, we use something called the **Kalman gain** to update our estimate to be somewhere between the measured position and the predicted position. I will not describe how the gain is set, but suppose we had perfect confidence in our measurement - no error is possible. Then, clearly, we would set the gain so that 100% of the position came from the measurement, and 0% from the prediction. 
At the other extreme, if he have no confidence at all in the sensor (maybe it reported a hardware fault), we would set the gain so that 100% of the position came from the prediction, and 0% from the measurement. In normal cases, we will take a ratio of the two: maybe 53% of the measurement, and 47% of the prediction. The gain is updated on every cycle based on the variance of the variables (in a way yet to be explained). It should be clear that if the variance of the measurement is low, and the variance of the prediction is high we will favor the measurement, and vice versa. \n", "\n", "The chart shows a prior estimate of $x=1$ and $\\dot{x}=1$ ($\\dot{x}$ is the shorthand for the derivative of x, which is velocity). Therefore we predict $\\hat{x}=2$. However, the new measurement $x^{'}=1.3$, giving a residual $r=0.7$. Finally, Kalman filter gain $k$ gives us a new estimate of $\\hat{x^{'}}=1.8$.\n", "\n", @@ -424,14 +434,552 @@ "metadata": {}, "outputs": [] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Equations\n", + "\n", + "The brilliance of the Kalman filter is taking the insights of the chapter up to this point and finding mathematical solution. The Kalman filter finds what is called a *least squared fit* to the set of measurements to produce an optimal output. We will not trouble ourselves with the derivation of these equations. It runs to several pages, and offers a lot less insight than the words above, in my opinion. Instead, I will just present the equations and then immediately provide the Python code that implements them. \n", + "> Kalman Filter Predict Step:\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "\\hat{x}_{t|t-1} &= F_t\\hat{x}_{t-1} + u_t \\\\\n", + "P_{t|t-1} &= F_tP_{t-1}F^T_t + Q_t\n", + "\\end{align*}\n", + "$$\n", + "\n", + "> Kalman Filter Update Step:\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "\\gamma &= z_t - H_t\\hat{x}_t \\\\\n", + "K_t &= P_t H^T_t (H_t P_t H^T_t + R_t)^{-1} \\\\\n", + "\\\\\n", + "\\hat{x}_t &= \\hat{x}_{t|t-1} + K_t \\gamma \\\\\n", + "P_{t|t} &= (I - K_t H_t)P_{t|t-1} \n", + "\\end{align*}\n", + "\\\\\n", + "$$\n", + "Dash off, wipe the blood out of your eyes, and we'll disuss what this means." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are nothing more than linear algebra equations that implement the algorithm we used in the last chapter, but using multidimensional Gaussians, and optimizing for a least squares fit. As you should be familiar with, each capital letter denotes a matrix or vector. The subscripts just let us denote from which time step the data comes from; $t$ is now, $t-1$ is the previous step. $A^T$ is the transpose of A, and $A^{-1}$ is the inverse. Finally, the hat denotes an estimate, so $\\hat{x}_t$ is the estimate of $x$ at time $t$.\n", + "\n", + "What do all of the variables mean? What is $P_t$, for example? Don't worry right now. Instead, I am just going to design a Kalman filter, and introduce the names as we go. Then we will just pass them into the Python function that implements the equations above, and we will have our solution.\n", + "\n", + "I will not present all of the Python code for the filter right now, but look at the code for the predict step. Notice how simple it really is. 
It really isn't much different from the predict step in the previous chapter.\n", + "\n", + " def predict():\n", + " x = F*x + u\n", + " P = F*P*F.T + Q\n", + " \n", + "> *Do not become discouraged when you come across a page of equations like I just gave for the Kalman Filter. They usually turn into very simple code. Take $x = F*x + u$. What does that mean? Clearly we are scaling x by F and then offsetting by u. Once you see that, you can start exploring **why** this is happening. This is really not as obscure as the symbology makes it appear.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tracking a Dog\n", + "\n", + "Let's go back to our tried and true problem of tracking our dog. This time we will include the fundamental insight of this chapter - that of using *unobserved variables* to improve our estimates. In simple terms, our algorithm is:\n", + "\n", + " 2. predict x using 'x + vel*time'\n", + " 3. get measurement for x\n", + " 4. compute residual as: 'x - predicted x'\n", + " 5. compute new position as 'residual * kalman gain'\n", + " \n", + "That is the entire Kalman filter algorithm. It is both what we described above in words, and it is what the rather obscure Kalman Filter equations do. The Kalman filter equations are just way of expresing this algorithm by using linear algebra.\n", + "\n", + "##### **Step 1:** Design State Transition Function\n", + "\n", + "We know from elementary physics that\n", + "\n", + "$$ x = vt + x_0$$\n", + "\n", + "In our problems we will be running the Kalman filter at fixed time intervals, so $t$ is a constant for us. We will just set it to $1$ and worry about the units later.\n", + "\n", + "We have two variables distance $(x)$ and velocity $(\\dot{x})$ which fully represent the state of our system. We will store them in a 1-dimensional array like so:\n", + "\n", + "$$\\begin{pmatrix}x \\\\ \\dot{x}\\end{pmatrix}$$\n", + "\n", + "Now we have to write a linear equation that performs the $ x = vt + x_0$ assignment with our state array.\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "{\\begin{pmatrix}x\\\\\\dot{x}\\end{pmatrix}}' &=\\begin{pmatrix}1&1 \\\\ 0&1\\end{pmatrix} \\times \\begin{pmatrix}x \\\\ \\dot{x}\\end{pmatrix}, \\mbox{or equivelently} \\\\\n", + "x' &= F \\times x\n", + "\\end{align*}\n", + "$$\n", + "\n", + "If we multiply this equation out, it yields:\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "x' &= x + \\dot{x} \\\\\n", + "\\dot{x}' &= \\dot{x}\n", + "\\end{align*}\n", + "$$\n", + "\n", + "You can see that our new $x$ is the old $x$ plus velocity times time, where time equals 1. Velocity is not changed by this equation, so the new $\\dot{x}$ is set to the old $\\dot{x}$.\n", + "\n", + "In the vocabulary of Kalman filters we call this *transforming the state matrix*. We take our state matrix, which for us is $(\\begin{smallmatrix}x \\\\ \\dot{x}\\end{smallmatrix})$,and multipy it by a matrix we will call $F$ to compute the new state. In this case, $F=(\\begin{smallmatrix}1&1\\\\0&1\\end{smallmatrix})$. \n", + "\n", + "\n", + "You will do this for every Kalman filter you ever design. Your state matrix will change depending on how many state random variables you have, and then you will create $F$ so that it updates your state based on whatever the physics of your problem dictates. $F$ is always a matrix of constants. 
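
To see the multiplication do exactly what we claimed, here is a minimal check with plain numpy arrays, no filter class involved:

    import numpy as np

    F = np.array([[1., 1.],
                  [0., 1.]])    # x' = x + velocity*1,  velocity' = velocity
    x = np.array([[10.],        # position
                  [ 1.]])       # velocity

    print(np.dot(F, x))         # [[11.] [ 1.]] - position advanced by one velocity step
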
If this is not fully clear, don't worry, we will do this many times in this book.\n", + "\n", + "I will not keep referring to the Kalman filter equations, but look at the first equation $\\hat{x}_{t|t-1} = F_t\\hat{x}_{t-1} + u_t$. There is an unexplained $u_t$ term in there, but shorn of all the diacritics it should be clear that we just designed $F$ for this equation! \n", + "\n", + "\n", + "##### ** Step 2 **: Design the Measurement Function\n", + "\n", + "Now we need a way to go from our measurement to our state matrix. In our problem we have one sensor for the position. We do not have a sensor for velocity. If we put this in linear algebra terms we get:\n", + "\n", + "$$\n", + "z = \\begin{pmatrix}1&0\\end{pmatrix} \\times \\begin{pmatrix}x \\\\ \\dot{x}\\end{pmatrix}\n", + "$$\n", + "\n", + "In other words, we take one times the sensor's $x$ measurement, and zero times the nonexistent velocity measurement. Simple!\n", + "\n", + "In the nomenclature of Kalman filters the $(\\begin{smallmatrix}1&0\\end{smallmatrix})$ matrix is called $H$. If you scroll up to the Kalman filter equations you will see an $H$ term in the update step.\n", + "\n", + "Believe it or not, we have designed the majority of our Kalman filter!! All that is left is to model the noise in our sensors.\n", + "\n", + "\n", + "##### ** Step 3** Design Noise Matrices\n", + "\n", + "In the last chapter we used a variance of 5 for our position sensor. Let's use the same value here. The Kalman filter calls this the *measurement uncertainty*, and uses the symbol $R$.\n", + "\n", + "$$R = 5$$\n", + "\n", + "That was pretty simple, yes? And we are done. There are more variables in the Kalman filter, and we will have to define them in more complicated problems, but for our problem we have done everything we need to do.\n", + "\n", + "As promised, the Kalman filter equations are already programmed for you. In most circumstances you will never have to write your own Kalman filter equations. We will look at the code later, but for now we will just import the code and use it. I have placed it in *KalmanFilter.py*, so let's start by importing it and creating a filter." + ] + }, { "cell_type": "code", "collapsed": false, "input": [ - "cov = np.array([[7,4],[4,7.]])\n", - "mu = np.array([0,0])\n", - "x = np.array([0,0])\n", - "print(multivariate_gaussian(x,mu,cov))" + "import numpy as np\n", + "from KalmanFilter import KalmanFilter\n", + "f = KalmanFilter (dim=2)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it. We import the filter, and create a filter that uses 2 state variables. We specify the number of state variables with the 'dim=2' expression (dim means dimensions).\n", + "\n", + "The Kalman filter class contains a number of variables that you need to set. x is the state, F is the state transition function, and so on. Rather than talk about it, let's just do it!" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "f.x = np.matrix([[0], [0]]) # initial state (location and velocity)\n", + "f.F = np.matrix([[1,1],[0,1]]) # state transition matrix\n", + "f.H = np.matrix([[1,0]]) # Measurement function\n", + "f.R = 5 # state uncertainty\n", + "f.P *= 500. # covariance matrix " + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at this line by line. \n", + "\n", + "**1**: We just assign the initial value for our state. 
Here we just initialize both the position and velocity to zero. \n", + "\n", + "**2**: We set $F=(\\begin{smallmatrix}1&1\\\\0&1\\end{smallmatrix})$, as in design step 1 above. \n", + "\n", + "**3**: We set $H=(\\begin{smallmatrix}1&0\\end{smallmatrix})$, as in design step 2 above.\n", + "\n", + "**4**: We set $R = 5$ as in step 3.\n", + "\n", + "**5**: Recall in the last chapter we set our initial belief to $\\mathcal{N}(\\mu,\\sigma^2)=\\mathcal{N}(0,500)$ to set the initial position to 0 with a very large variance to signify our lack of knowledge about the initial conditions. We implemented in Python with \n", + "\n", + " pos = (0,500)\n", + " \n", + "Multidimensional Kalman filters stores the state variables in $x$ and their *covariance* in $P$. Notionally, this is exactly the same as the one dimension case. We have a mean and variance. For the multidimensional case, we have $$\\mathcal{N}(\\mu,\\sigma^2)=\\mathcal{N}(x,P)$$\n", + "\n", + "$P$ is initialized to the identity matrix of size $n\\times n$, so multiplying by 500 assigns a variance of 500 to $x$ and $\\dot{x}$.\n", + "\n", + "> Summary: For our dog tracking problem, in the 1-D case $\\mu$ was the position, and $\\sigma^2$ was the variance. In the 2-D case $x$ is our position and velocity, and $P$ is the *covariance*. It is the same thing, just in higher dimensions!\n", + "\n", + ">| | 1D | 2D and up|\n", + ">|--|----|---|\n", + ">|state|$\\mu$|$x$|\n", + ">|uncertainty|$\\sigma^2$|$P$|\n", + "\n", + "All that is left is to run the code! The *DogSensor* class from the previous chapter has been placed in *DogSensor.py*. There is an extra variable, Q, which we have not discussed yet. For now it is fine to set it to zero." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from DogSensor import DogSensor\n", + "\n", + "def dog_tracking_filter(R,Q=0,cov=1.):\n", + " f = KalmanFilter (dim=2)\n", + " f.x = np.matrix([[0], [0]]) # initial state (location and velocity)\n", + " f.F = np.matrix([[1,1],[0,1]]) # state transition matrix\n", + " f.H = np.matrix([[1,0]]) # Measurement function\n", + " f.R = R # measurement uncertainty\n", + " f.P *= cov # covariance matrix \n", + " f.Q = Q\n", + " return f\n", + "\n", + "\n", + "def plot_track(noise, count, R, Q=0, plot_P=True, title='Kalman Filter'):\n", + " dog = DogSensor(velocity=1, noise=noise)\n", + " f = dog_tracking_filter(R=R, Q=Q, cov=500.)\n", + "\n", + " ps = []\n", + " zs = []\n", + " cov = []\n", + " for t in range (count):\n", + " z = dog.sense()\n", + " f.measure (z)\n", + " #print (t,z)\n", + " ps.append (f.x[0,0])\n", + " cov.append(f.P)\n", + " zs.append(z)\n", + " f.predict()\n", + "\n", + " p0, = plt.plot([0,count],[0,count],'g')\n", + " p1, = plt.plot(range(1,count+1),zs,c='r', linestyle='dashed')\n", + " p2, = plt.plot(range(1,count+1),ps, c='b')\n", + " plt.legend([p0,p1,p2], ['actual','measurement', 'filter'], 2)\n", + " plt.title(title)\n", + "\n", + " plt.show()\n", + " if plot_P:\n", + " plt.subplot(121)\n", + " plot_covariance(cov, (0,0))\n", + " plt.subplot(122)\n", + " plot_covariance(cov, (1,1))\n", + " plt.show()\n", + " \n", + "def plot_covariance(P, index=(0,0)):\n", + " ps = []\n", + " for p in P:\n", + " ps.append(p[index[0],index[1]])\n", + " plt.plot(ps)\n", + " \n", + "plot_track (noise=30, R=5, count=100)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is still a lot to learn, but we have implemented our first, full Kalman filter using 
the same theory and equations as published by Nobert Kalman! Code very much like this runs inside of your GPS and phone, inside every airliner, inside of robots, and so on. \n", + "\n", + "The first plot plots the output of the Kalman filter against the measurements and the actual position of our dog (drawn in green). After the initial settling in period the filter should track the dog's position very closely.\n", + "\n", + "The next two plots show the variance of $x$ and of $\\dot{x}$. If you look at the code, this is just a plot of the diagonals of $P$ over time. $P$ is just a covariance matrix, so the diagonal contains the variance of each state variable. So $P[0,0]$ is the variance of $x$, and $P[1,1]$ is the variance of $\\dot{x}$. You can see that despite initializing $P=(\\begin{smallmatrix}500&0\\\\0&500\\end{smallmatrix})$ we quickly converge to small variances for both the position and velocity. We will spend a lot of time on the covariance matrix later, so for now we just briefly point out that we quickly converge onto an accurate estimate despite the initial large uncertainty.\n", + "\n", + "You may not be impressed with these charts. After all, in the previous chapter we filtered very noisy signals with much simpler code. However, realize that right now we are working with a very simple example - an object moving through 1-D space and one sensor. That is about the limit of what we can compute with the code in the last chapter. Though we haven't yet seen it, we can compute very complicated things with this code. Perhaps we want to track 100 dimensions in financial models. Or we have an aircraft with a GPS, INS, TACAN, radar altimeter, baro altimeter, and airspeed indicator, and we want to integrate all those sensors into a model that predicts position, velocity, and accelerations in 3D (which requires 9 state variables). We can do that with the code in this chapter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Walking Through the Math\n", + "I promised that you would not have to understand how to derive Kalman filter equations, and that is true. However, I do think it is worth walking through the equations one by one and becoming familiar with the variables. If this is your first time through the material feel free to skip ahead to the next section. However, you will eventually want to work through this material, so why not now? You will need to have passing familarity with these equations to read material written about the Kalman filter, as they all presuppose that you are familiar with the equations.at \n", + "\n", + "I will start with the measurement step, as that is what we started with in the one dimensional Kalman filter case. Our first equation is\n", + "\n", + "$$\n", + "\\gamma = z_t - H_t\\hat{x}_t\n", + "$$\n", + "\n", + "On the right, shorn of the diacritics, we have $H\\times x$. That should be recognizable as the measurement function. We are taking our state $x$ and multiplying it by the measurement function $H$ to get the new state. The variable $z$ is just the measurement; it is typical, but not universal to use $z$ to denote measurements in the literature. Do you remember this chart?" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "show_residual_chart()" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The blue prediction line is the output of $H \\times x$, and $z$ is the measurement. 
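As a tiny numerical sketch, using the made-up numbers from the residual chart (predicted position 2, measurement 1.3):

    import numpy as np

    H = np.matrix([[1., 0.]])
    x = np.matrix([[2.], [1.]])   # predicted state: position 2, velocity 1
    z = 1.3                       # the measurement

    residual = z - H * x          # about -0.7; the 0.7 gap drawn in red on the chart
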
Therefore, $\\gamma = z - H\\times x$ is how we compute the residual, drawn in red. So $\\gamma$ is the residual.\n", + "\n", + "The next line is the formidable:\n", + "\n", + "$$K_t = P_t H^T_t (H_t P_t H^T_t + R_t)^{-1}$$\n", + "\n", + "Unfortunately it is a fair amount of linear algebra to derive this. But $K$ is just the *Kalman gain* - the ratio of how much measurement vs prediction we should use to create the new estimate. In the graph above we are favoring the prediction more than the measurement, so the new estimate is near the prediction. Other values of $K$ would choose a different point along the residual line. $R$ is our *measurement noise*, and $P$ is our *uncertainty covariance matrix*. \n", + "\n", + "I look at this equation as saying $K=A \\times B^{-1}$ which is just the way we compute a ratio in linear algebra. If these were scalars, we would write $k=a/b$. In other words, the *Kalman gain* equation is doing nothing more than computing the ratio of two values. Those values are some kind of product of our various uncertainties. As you might guess, if we are confident in our measurements and unconfident in our predictions K will favor the measurement, and vice versa. The math looks complicated, but the concept is simple - scale by a ratio.\n", + "\n", + "Without going into the derivation of $K$, I'll say that this equation is the result of finding a value of $K$ that optimizes the *mean-square estimation error*. It does this by finding the minimal values for $P$ along it's diagonal. Recall that the diagonal of $P$ is just the variance for each state variable. So, this equation for $K$ ensures that the Kalman filter output is optimal. To put this in concrete terms, for our dog tracking problem this means that the estimates for both position and velocity will be optimal - a value of $K$ that made the position extremely accurate but the velocity very inaccurate would be rejected in favor of a $K$ that made both position and velocity just somewhat accurate.\n", + "\n", + "\n", + "Our next line is:\n", + " $$ \\hat{x}_t = \\hat{x}_{t|t-1} + K_t \\gamma$$\n", + "\n", + "This just multiplies the residual by the Kalman gain, and adds it to the state variable. In other words, this is the computation of our new estimate.\n", + "\n", + "Finally, we have:\n", + "\n", + "$$P_{t|t} = (I - K_t H_t)P_{t|t-1} $$\n", + "\n", + "$I$ is the identity matrix, and is the way we represent $1$ in multiple dimensions. $H$ is our measurement function, and is a constant. So, simplified, this is simply $P = (1-cK)P$. $K$ is our ratio of how much prediction vs measurement we use. So, if $K$ is large then $(1-cK)$ is small, and P will be made smaller than it was. If $K$ is small, then $(1-cK)$ is large, and P will be made larger than it was. So we adjust the size of our uncertainty by some factor of the *Kalman gain*. \n", + "\n", + "Now we have the measurement steps. The first equation is\n", + "\n", + "$$\\hat{x}_{t|t-1} = F_t\\hat{x}_{t-1} + u_t$$\n", + "\n", + "In simple terms, we have $x' = Fx + u$. This is just our state transition equation. $u$ is new to us, but that is just our motion vector. We will use this in later problems, but for now consider using a Kalman filter to drive a car. We don't want to just track the car, but we will be issuing it direction - go in such and such direction as some speed. $u$ incorporates these instructions into the filter. If we are just passively tracking then we can set $u$ to zero. 
This equation is, for our dog tracking problem, just computing:\n", + "\n", + "$$ x'=(vt)+x$$\n", + "\n", + "The final equation is:\n", + "$$P_{t|t-1} = F_tP_{t-1}F^T_t + Q_t$$\n", + "\n", + "Here we are scaling the covariance matrix with our process matrix, and then adding in the uncertainty with our motion ($Q$). **FIX THIS. THIS IS STUPID EXPLANATION.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Adjusting the Filter\n", + "Your results will vary slightly depending on what numbers your random generator creates for the noise componenet of the noise, but the filter in the last section should track the actual position quite well. Typically as the filter starts up the first several predictions are quite bad, and varies a lot. But as the filter builds its state the estimates become much better. \n", + "\n", + "Let's start varying our parameters to see the effect of various changes. This is a *very normal* thing to be doing with Kalman filters. It is difficult, and often impossible to exactly model our sensors. An imperfect model means imperfect output from our filter. Engineers spend a lot of time tuning Kalman filters so that they perform well with real world sensors. We will spend time now to learn the effect of these changes. As you learn the effect of each change you will develop an intuition for how to design a Kalman filter. As I wrote earlier, designing a Kalman filter is as much art as science. The science is, roughly, designing the $H$ and $F$ matrices - they develop in an obvious manner based on the physics of the system we are modelling. The art comes in modelling the sensors and selecting appropriate values for the rest of our variables.\n", + "\n", + "Let's look at the effects of the noise parameters $R$ and $Q$.I will only run the filter for twenty steps to ensure we can see see the difference between the measurements and filter output. I will start by holding $R$ to 5 and vary $Q$. " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "plot_track (noise=30, R=5, Q=100,count=30, plot_P=False, title='R = 5, Q = 100')\n", + "plot_track (noise=30, R=5, Q=0.1,count=30, plot_P=False, title='R = 5, Q = 0.1')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The filter in the first plot should follow the noisy measurement almost exactly. In the second plot the filter should vary from the measurement quite a bit, and be much closer to a straight line than in the first graph. \n", + "\n", + "In the Kalman filter $R$ is the *measurement noise* and $Q$ is the *motion uncertainty*. $R$ is the same in both plots, so ignore it for the moment. Why does $Q$ affect the plots this way?\n", + "\n", + "In the first case we set $Q=100$, which is quite large. In physical terms this is telling the filter \"I don't trust my motion prediction step\". So the filter will be computing velocity ($\\dot{x}), but then mostly ignoring it because we are telling the filter that the computation is extremely suspect. Therefore the filter has nothing to use but the measurements, and thus it follows the measurements closely. \n", + "\n", + "In the second case we set $Q=0.1$, which is quite small. In physical terms we are telling the filter \"trust the motion computation, it is really good!\". So the filter ends up ignoring some of the measurement as it jumps up and down, because the variation in the measurement does not match our velocity prediction. 
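A one dimensional sketch makes the trade-off easy to see. Assume $F=1$ and $H=1$ so every matrix collapses to a scalar; this is only an analogy for intuition, not the code we are actually running:

    def scalar_gain(P, R, Q):
        # the predict step grows the uncertainty by Q, then the update
        # computes the gain as a ratio of prediction vs measurement noise
        P_pred = P + Q
        return P_pred / (P_pred + R)

    print(scalar_gain(P=1., R=5., Q=100.))   # ~0.95: large Q, trust the measurement
    print(scalar_gain(P=1., R=5., Q=0.1))    # ~0.18: small Q, trust the prediction

A gain near 1 means the estimate follows the measurements, and a gain near 0 means it follows the prediction, which matches the behavior in the two plots above.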
\n", + "\n", + "Now let's leave $Q=0.1$, but bump $R$ up to $1000$. This is telling the filter that the measurement noise is very large. " + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "plot_track (noise=30, R=1000, Q=0.1,count=50, plot_P=False, title='R = 1000, Q = 0.1')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The filter output should be much closer to the green line, especially after 10-20 cycles. If you are running this in Ipython Notebook, I strongly urge you to run this many times in a row (click inside the code box, and press CTRL-Enter). Most times the filter tracks almost exactly with the actual position, randomly going slightly above and below the green line, but sometimes it stays well over or under the green line for a long time. What is happening in the latter case?\n", + "\n", + "The filter is strongly preferring the motion update to the measurement, so if the prediction is off it takes a lot of measurements to correct it. It will eventually correct because the velocity is a hidden variable - it is computed from the measurements, but it will take awhile **I DON\"T LIKE THIS. I am not sure I have R and Q right**\n", + "\n", + "To some extent you can get similar looking output by varying either $R$ or $Q$, but I urge you to not 'magically' alter these until you get output that you like. Always think about the physical implications of these assignments, and vary $R$ and/or $Q$ based on your knowledge of the system you are filtering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### A Detailed Examination of the Covariance Matrix\n", + "\n", + "So far I have not given a lot of coverage of the covariance matrix. It is time to look at it more closely. Recall this table comparing the one dimensional and multidimensional Kalman filter.\n", + "\n", + "| | 1D | 2+D|\n", + "|--|----|---|\n", + "|state|$\\mu$|$x$|\n", + "|uncertainty|$\\sigma^2$|$P$|\n", + "\n", + "This should remind you that $P$, the covariance matrix is nothing more than the variance of our state - such as the position of our dog. It has many elements in it, but don't be daunted; we will learn how to interpret a very large $9\\times 9$ covariance matrix, or even larger.\n", + "\n", + "Recall the beginning of the chapter, where we provided the equation for the covariance matrix. It read:\n", + "\n", + "$$\n", + "P = \\begin{pmatrix}\n", + " {{\\sigma}_{1}}^2 & p{\\sigma}_{1}{\\sigma}_{2} & \\cdots & p{\\sigma}_{1}{\\sigma}_{n} \\\\\n", + " p{\\sigma}_{2}{\\sigma}_{1} &{{\\sigma}_{2}}^2 & \\cdots & p{\\sigma}_{2}{\\sigma}_{n} \\\\\n", + " \\vdots & \\vdots & \\ddots & \\vdots \\\\\n", + " p{\\sigma}_{n}{\\sigma}_{1} & p{\\sigma}_{n}{\\sigma}_{2} & \\cdots & {{\\sigma}_{n}}^2\n", + " \\end{pmatrix}\n", + "$$\n", + "\n", + "(I have subtituted $P$ for $\\Sigma$ because of the nomenclature used by the Kalman filter literature).\n", + "\n", + "The diagonal contains the variance of each of our state variables. So, if our state variables are\n", + "\n", + "$$\\begin{pmatrix}x\\\\\\dot{x}\\end{pmatrix}$$\n", + "\n", + "and the covariance matrix happens to be\n", + "$$\\begin{pmatrix}2&0\\\\0&6\\end{pmatrix}$$\n", + "\n", + "we know that the variance of $x$ is 2, and the variance of $\\dot{x}$ is 6. The off diagonal elements are all 0, so we also know that $x$ and $\\dot{x}$ are not correlated. Recall the ellipses that we drew of the covariance matrices. 
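If you ever want to check this numerically, the correlation can be read out of any 2x2 covariance matrix with a couple of lines (a small helper for illustration, not part of the filter code):

    import numpy as np

    def correlation(P):
        # off-diagonal divided by the product of the standard deviations
        return P[0, 1] / np.sqrt(P[0, 0] * P[1, 1])

    print(correlation(np.array([[2., 0.], [0., 6.]])))   # 0.0, i.e. uncorrelated
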
Let's look at the ellipse for the matrix." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "P = np.array([[2,0],[0,6]])\n", + "e = stats.sigma_ellipse (P, 0, 0)\n", + "stats.plot_sigma_ellipse(e, '|2 0|\\n|0 6|')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course it is unlikely that the position and velocity of an object remain uncorrelated for long. Let's look at a more typical covariance matrix" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "P = np.array([[2,2.4],[2.4,6]])\n", + "e = stats.sigma_ellipse (P, 0, 0)\n", + "stats.plot_sigma_ellipse(e, '|2.0 2.4|\\n|2.4 6.0|')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here the ellipse is slanted, signifying that $x$ and $\\dot{x}$ are correlated (and, of course, not independent - all correlated variables are dependent). You may or may not have noticed that the off diagonal elements were set to the same value, 2.4. This was not an accident. Let's look at the equation for the covariance for the case where the number of dimensions is two.\n", + "\n", + "$$\n", + "P = \\begin{pmatrix}\n", + " \\sigma_1^2 & p\\sigma_1\\sigma_2 \\\\\n", + " p\\sigma_2\\sigma_1 &\\sigma_2^2 \n", + " \\end{pmatrix}\n", + "$$\n", + "\n", + "Look at the computation for the off diagonal elements. \n", + "\n", + "$$\\begin{align*}\n", + "P_{0,1}&=p\\sigma_1\\sigma_2 \\\\\n", + "P_{1,0}&=p\\sigma_2\\sigma_1.\n", + "\\end{align*}$$\n", + "\n", + "If we re-arrange terms we get\n", + "$$\\begin{align*}\n", + "P_{0,1}&=p\\sigma_1\\sigma_2 \\\\\n", + "P_{1,0}&=p\\sigma_1\\sigma_1 \\mbox{, yielding} \\\\\n", + "P_{0,1}&=P_{1,0}\n", + "\\end{align*}$$\n", + "\n", + "In general, we can state that $P_{i,j}=P_{j,i}$.\n", + "\n", + "So for my example I multiplied the diagonals, 2 and 6, to get 12, and then scaled that with the arbitrarily chosen $p=.2$ to get 2.4.\n", + "\n", + "Let's get back to concrete terms. Lets do another Kalman filter for our dog, and this time plot the covariance ellipses on the same plot as the position." 
+ ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from DogSensor import DogSensor\n", + "\n", + "def dog_tracking_filter(R,Q=0,cov=1.):\n", + " f = KalmanFilter (dim=2)\n", + " f.x = np.matrix([[0], [0]]) # initial state (location and velocity)\n", + " f.F = np.matrix([[1,1],[0,1]]) # state transition matrix\n", + " f.H = np.matrix([[1,0]]) # Measurement function\n", + " f.R = R # measurement uncertainty\n", + " f.P *= cov # covariance matrix \n", + " f.Q = Q\n", + " return f\n", + "\n", + "\n", + "def plot_track(noise, count, R, Q=0, plot_P=True, title='Kalman Filter'):\n", + " dog = DogSensor(velocity=1, noise=noise)\n", + " f = dog_tracking_filter(R=R, Q=Q, cov=4.)\n", + "\n", + " ps = []\n", + " zs = []\n", + " cov = []\n", + " for t in range (count):\n", + " z = dog.sense()\n", + " f.measure (z)\n", + " #print (t,z)\n", + " ps.append (f.x[0,0])\n", + " cov.append(f.P)\n", + " zs.append(z)\n", + " f.predict()\n", + "\n", + " p0, = plt.plot([0,count],[0,count],'g')\n", + " p1, = plt.plot(range(1,count+1),zs,c='r', linestyle='dashed')\n", + " p2, = plt.plot(range(1,count+1),ps, c='b')\n", + " plt.legend([p0,p1,p2], ['actual','measurement', 'filter'], 2)\n", + " plt.title(title)\n", + " \n", + " for i,p in enumerate(cov):\n", + " print(i,p)\n", + " if i==0:\n", + " e = stats.sigma_ellipse (p, i, i)\n", + " stats.plot_sigma_ellipse(e)\n", + " plt.show()\n", + "\n", + " \n", + "plot_track (noise=30, R=5, count=2)" ], "language": "python", "metadata": {}, @@ -444,7 +992,7 @@ "#format the book\n", "from IPython.core.display import HTML\n", "def css_styling():\n", - " styles = open(\"./styles/custom.css\", \"r\").read()\n", + " styles = open(\"./styles/custom2.css\", \"r\").read()\n", " return HTML(styles)\n", "css_styling()" ], diff --git a/Untitled0.ipynb b/Untitled0.ipynb index 8cce331..4499a4a 100644 --- a/Untitled0.ipynb +++ b/Untitled0.ipynb @@ -1,13 +1,56 @@ { "metadata": { "name": "", - "signature": "sha256:889ce3306bc4b69fe3ca9aafb5eec924ed230e1f19586a991b69ad5ef43b60fc" + "signature": "sha256:7495976b98b260c51070e71c79ee140a97a941fe3212c3dd523578ed269b756c" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#format the book\n", + "from IPython.core.display import HTML\n", + "def css_styling():\n", + " styles = open(\"./styles/custom2.css\", \"r\").read()\n", + " return HTML(styles)\n", + "css_styling()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#One\n", + ">this is a big test asdf asdf asd fasd fasd;fj sadf;lkjas df;kasdj f;lasdkfj asd;lkfj asd;lkfj asd;lfkja sdf;ojasd f;lkasdj f;alsdkfj das;lkfjasdf; asdfkj; asf\n", + "\n", + "\n", + "Here is some text.\n", + "*italic*\n", + "**bold**\n", + "\n", + "##Two\n", + "###Three\n", + "####Four\n", + "#####Five\n", + "######Six" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + }, { "cell_type": "code", "collapsed": false, @@ -154,6 +197,281 @@ ], "language": "python", "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%load stats.py" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import math\n", + "import numpy as np\n", + "import numpy.linalg as linalg\n", + 
"import matplotlib.pyplot as plt\n", + "import scipy.sparse as sp\n", + "import scipy.sparse.linalg as spln\n", + "\n", + "_two_pi = 2*math.pi\n", + "\n", + "\n", + "def gaussian(x, mean, var):\n", + " \"\"\"returns normal distribution for x given a gaussian with the specified\n", + " mean and variance. All must be scalars\n", + " \"\"\"\n", + " return math.exp((-0.5*(x-mean)**2)/var) / math.sqrt(_two_pi*var)\n", + "\n", + "def mul (a_mu, a_var, b_mu, b_var):\n", + " m = (a_var*b_mu + b_var*a_mu) / (a_var + b_var)\n", + " v = 1. / (1./a_var + 1./b_var)\n", + " return (m, v)\n", + "\n", + "def add (a_mu, a_var, b_mu, b_var):\n", + " return (a_mu+b_mu, a_var+b_var)\n", + "\n", + "\n", + "def multivariate_gaussian(x, mu, cov):\n", + " \"\"\" This is designed to work the same as scipy.stats.multivariate_normal\n", + " which is not available before version 0.14. You may either pass in a\n", + " multivariate set of data:\n", + " multivariate_gaussian (array([1,1]), array([3,4]), eye(2)*1.4)\n", + " multivariate_gaussian (array([1,1,1]), array([3,4,5]), 1.4)\n", + " or unidimensional data:\n", + " multivariate_gaussian(1, 3, 1.4)\n", + "\n", + " In the multivariate case if cov is a scalar it is interpreted as eye(n)*cov\n", + "\n", + " The function gaussian() implements the 1D (univariate)case, and is much\n", + " faster than this function.\n", + " \"\"\"\n", + "\t\n", + " # force all to numpy.array type\n", + " x = np.array(x, copy=False, ndmin=1)\n", + " mu = np.array(mu,copy=False, ndmin=1)\n", + "\n", + " nx = len(mu)\n", + " cov = _to_cov(cov, nx)\n", + "\n", + " norm_coeff = nx*math.log(2*math.pi) + np.linalg.slogdet(cov)[1]\n", + "\n", + " err = x - mu\n", + " if (sp.issparse(cov)):\n", + " numerator = spln.spsolve(cov, err).T.dot(err)\n", + " else:\n", + " numerator = np.linalg.solve(cov, err).T.dot(err)\n", + "\n", + " return math.exp(-0.5*(norm_coeff + numerator))\n", + "\t\n", + "\t\n", + "def norm_plot(mean, var):\n", + " min_x = mean - var * 1.5\n", + " max_x = mean + var * 1.5\n", + "\n", + " xs = np.arange(min_x, max_x, 0.1)\n", + " ys = [gaussian(x,23,5) for x in xs]\n", + " plt.plot(xs,ys)\n", + "\n", + "\n", + "def sigma_ellipse(cov, x=0, y=0, sigma=1, num_pts=100):\n", + " \"\"\" Takes a 2D covariance matrix and generates an ellipse showing the\n", + " contour plot at the specified sigma value. Ellipse is centered at (x,y).\n", + " num_pts specifies how many discrete points are used to generate the\n", + " ellipse.\n", + "\n", + " Returns a tuple containing the ellipse,x, and y, in that order.\n", + " The ellipse is a 2D numpy array with shape (2, num_pts). 
Row 0 contains the\n", + " x components, and row 1 contains the y coordinates\n", + " \"\"\"\n", + " L = linalg.cholesky(cov)\n", + " t = np.linspace(0, _two_pi, num_pts)\n", + " unit_circle = np.array([np.cos(t), np.sin(t)])\n", + "\n", + " ellipse = sigma * L.dot(unit_circle)\n", + " ellipse[0] += x\n", + " ellipse[1] += y\n", + " return (ellipse,x,y)\n", + "\n", + "def sigma_ellipses(cov, x=0, y=0, sigma=[1,2], num_pts=100):\n", + " L = linalg.cholesky(cov)\n", + " t = np.linspace(0, _two_pi, num_pts)\n", + " unit_circle = np.array([np.cos(t), np.sin(t)])\n", + "\n", + " e_list = []\n", + " for s in sigma:\n", + " ellipse = s * L.dot(unit_circle)\n", + " ellipse[0] += x\n", + " ellipse[1] += y\n", + " e_list.append (ellipse)\n", + " return (e_list,x,y)\n", + "\n", + "def plot_sigma_ellipse(ellipse,title=None):\n", + " \"\"\" plots the ellipse produced from sigma_ellipse.\"\"\"\n", + "\n", + " plt.axis('equal')\n", + " e = ellipse[0]\n", + " x = ellipse[1]\n", + " y = ellipse[2]\n", + "\n", + " plt.plot(e[0], e[1])\n", + " plt.scatter(x,y,marker='+') # mark the center\n", + " if title is not None:\n", + " plt.title (title)\n", + "\n", + "def plot_sigma_ellipses(ellipses,title=None,axis_equal=True,x_lim=None,y_lim=None):\n", + " \"\"\" plots the ellipse produced from sigma_ellipse.\"\"\"\n", + "\n", + " if x_lim is not None:\n", + " axis_equal = False\n", + " plt.xlim(x_lim)\n", + "\n", + " if y_lim is not None:\n", + " axis_equal = False\n", + " plt.ylim(y_lim)\n", + "\n", + " if axis_equal:\n", + " plt.axis('equal')\n", + "\n", + " for ellipse in ellipses:\n", + " es = ellipse[0]\n", + " x = ellipse[1]\n", + " y = ellipse[2]\n", + "\n", + " for e in es:\n", + " plt.plot(e[0], e[1], c='b')\n", + "\n", + " plt.scatter(x,y,marker='+') # mark the center\n", + " if title is not None:\n", + " plt.title (title)\n", + "\n", + "\n", + "def _to_cov(x,n):\n", + " \"\"\" If x is a scalar, returns a covariance matrix generated from it\n", + " as the identity matrix multiplied by x. 
The dimension will be nxn.\n", + " If x is already a numpy array then it is returned unchanged.\n", + " \"\"\"\n", + " try:\n", + " x.shape\n", + " if type(x) != np.ndarray:\n", + " x = np.asarray(x)[0]\n", + " return x\n", + " except:\n", + " return np.eye(n) * x\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " from scipy.stats import norm\n", + "\n", + " # test conversion of scalar to covariance matrix\n", + " x = multivariate_gaussian(np.array([1,1]), np.array([3,4]), np.eye(2)*1.4)\n", + " x2 = multivariate_gaussian(np.array([1,1]), np.array([3,4]), 1.4)\n", + " assert x == x2\n", + "\n", + " # test univarate case\n", + " rv = norm(loc = 1., scale = np.sqrt(2.3))\n", + " x2 = multivariate_gaussian(1.2, 1., 2.3)\n", + " x3 = gaussian(1.2, 1., 2.3)\n", + "\n", + " assert rv.pdf(1.2) == x2\n", + " assert abs(x2- x3) < 0.00000001\n", + "\n", + " cov = np.array([[1,1],\n", + " [1,1.1]])\n", + "\n", + " sigma = [1,1]\n", + " ev = sigma_ellipses(cov, x=2, y=2, sigma=sigma)\n", + " plot_sigma_ellipses([ev], axis_equal=True,x_lim=[0,4],y_lim=[0,15])\n", + " #isct = plt.Circle((2,2),1,color='b',fill=False)\n", + " #plt.figure().gca().add_artist(isct)\n", + " plt.show()\n", + " print \"all tests passed\"\n" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%load -s gaussian stats.py\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def gaussian(x, mean, var):\n", + " \"\"\"returns normal distribution for x given a gaussian with the specified\n", + " mean and variance. All must be scalars\n", + " \"\"\"\n", + " return math.exp((-0.5*(x-mean)**2)/var) / math.sqrt(_two_pi*var)\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "%rerun -l 2" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": "" + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def gaussian(x, mean, var):\n", + " \"\"\"returns normal distribution for x given a gaussian with the specified\n", + " mean and variance. All must be scalars\n", + " \"\"\"\n", + " return math.exp((-0.5*(x-mean)**2)/var) / math.sqrt(_two_pi*var)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def gaussian(x, mean, var):\n", + " \"\"\"returns normal distribution for x given a gaussian with the specified\n", + " mean and variance. 
All must be scalars\n", + " \"\"\"\n", + " return math.exp((-0.5*(x-mean)**2)/var) / math.sqrt(_two_pi*var)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "-1" + ], + "language": "python", + "metadata": {}, "outputs": [] }, { diff --git a/mkf_internal.py b/mkf_internal.py index 0b29610..ac2837f 100644 --- a/mkf_internal.py +++ b/mkf_internal.py @@ -4,7 +4,10 @@ Created on Thu May 1 16:56:49 2014 @author: rlabbe """ +import numpy as np +from matplotlib.patches import Ellipse import matplotlib.pyplot as plt +import stats def show_residual_chart(): plt.xlim([0.9,2.5]) @@ -26,4 +29,83 @@ def show_residual_chart(): ec="r", shrinkA=5, shrinkB=5)) plt.title("Kalman Filter Prediction Update Step") + plt.show() + +def show_position_chart(): + """ Displays 3 measurements at t=1,2,3, with x=1,2,3""" + + plt.scatter ([1,2,3],[1,2,3]) + plt.xlim([0,4]); + plt.ylim([0,4]) + + plt.xlabel("Position") + plt.ylabel("Time") + + plt.xticks(np.arange(1,4,1)) + plt.yticks(np.arange(1,4,1)) + plt.show() + +def show_position_prediction_chart(): + """ displays 3 measurements, with the next position predicted""" + + plt.scatter ([1,2,3],[1,2,3],s=128) + + plt.xlim([0,5]) + plt.ylim([0,5]) + + plt.xlabel("Position") + plt.ylabel("Time") + + plt.xticks(np.arange(1,5,1)) + plt.yticks(np.arange(1,5,1)) + + plt.scatter ([4], [4], c='g',s=128) + ax = plt.axes() + ax.annotate('', xy=(4,4), xytext=(3,3), + arrowprops=dict(arrowstyle='->', ec='g',shrinkA=6, lw=3,shrinkB=5)) + plt.show() + + +def show_x_error_char(): + """ displays x=123 with covariances showing error""" + + cov = np.array([[0.003,0], [0,12]]) + sigma=[0.5,1.,1.5,2] + + e1 = stats.sigma_ellipses(cov, x=1, y=1, sigma=sigma) + e2 = stats.sigma_ellipses(cov, x=2, y=2, sigma=sigma) + e3 = stats.sigma_ellipses(cov, x=3, y=3, sigma=sigma) + + stats.plot_sigma_ellipses([e1, e2, e3], axis_equal=True,x_lim=[0,4],y_lim=[0,15]) + + plt.ylim([0,11]) + plt.xticks(np.arange(1,4,1)) + + plt.xlabel("Position") + plt.ylabel("Time") + + plt.show() + +def show_x_with_unobserved(): + """ shows x=1,2,3 with velocity superimposed on top """ + + sigma=[0.5,1.,1.5,2] + cov = np.array([[1,1],[1,1.1]]) + ev = stats.sigma_ellipses(cov, x=2, y=2, sigma=sigma) + + cov = np.array([[0.003,0], [0,12]]) + e1 = stats.sigma_ellipses(cov, x=1, y=1, sigma=sigma) + e2 = stats.sigma_ellipses(cov, x=2, y=2, sigma=sigma) + e3 = stats.sigma_ellipses(cov, x=3, y=3, sigma=sigma) + + isct = Ellipse(xy=(2,2), width=.2, height=1.2, edgecolor='r', fc='None', lw=4) + plt.figure().gca().add_artist(isct) + stats.plot_sigma_ellipses([e1, e2, e3, ev], axis_equal=True,x_lim=[0,4],y_lim=[0,15]) + + plt.ylim([0,11]) + plt.xticks(np.arange(1,4,1)) + + plt.xlabel("Position") + plt.ylabel("Time") + plt.show() \ No newline at end of file diff --git a/styles/custom.css b/styles/custom.css index ac5a409..c369ad9 100644 --- a/styles/custom.css +++ b/styles/custom.css @@ -1,23 +1,10 @@ + + + + diff --git a/styles/custom3.css b/styles/custom3.css new file mode 100644 index 0000000..3f80130 --- /dev/null +++ b/styles/custom3.css @@ -0,0 +1,60 @@ + +