# Reproducible synthetic data set: three 2-D Gaussian clusters whose sizes are
# each drawn from a Binomial(40, 0.5) distribution.
np.random.seed(36817)
p_n = stats.binom(40, 0.5)

# (mean, covariance) of each cluster, in the order k1, k2, k3.
_cluster_params = (
    ([0, 0], [[1, -0.5], [-0.5, 1]]),
    ([9, -1], [[1, 0.1], [0.1, 1]]),
    ([-8, 0.3], [[1, 0.9], [0.9, 1]]),
)

# The generator is consumed lazily and in order, so every cluster first draws
# its size and then its points -- the same sequence of RNG calls as writing
# the three sampling statements out one after another.
k1, k2, k3 = (
    np.random.multivariate_normal(mean, np.array(cov), p_n.rvs())
    for mean, cov in _cluster_params
)
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(k1[:, 0], k1[:, 1])\n", + "plt.scatter(k2[:, 0], k2[:, 1])\n", + "plt.scatter(k3[:, 0], k3[:, 1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gaussian mixture\n", + "Assume each data point comes from one of $K$ clusters, where $\\pi_k$ is the prior probability that a point belongs to cluster $k$.\n", + "\n", + "$$ z_i \\sim Categorical(\\pi) $$\n", + "\n", + "$$ x_i|z_i=k \\sim N(\\mu_k, \\sigma_k^2) $$\n", + "$$ p(x_i|\\theta) = \\sum_{k=1}^Kp(z_i=k) \\cdot p(x_i|z_i=k, \\mu_k, \\sigma_k^2)$$\n", + "\n", + "## maximization step\n", + "$$ \\pi_k = \\frac{1}{N}\\sum_i p(z_i = k| x_i, \\theta)$$\n", + "$$ \\mu_k = \\frac{\\sum_i p(z_i = k| x_i, \\theta) x_i}{\\sum_i p(z_i = k| x_i, \\theta)}$$\n", + "$$ cov_k = \\frac{\\sum_i p(z_i = k| x_i, \\theta) \\cdot (x_i - \\mu_k)(x_i - \\mu_k)^T}{\\sum_i p(z_i = k| x_i, \\theta)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "data = np.vstack([k1, k2, k3])" + ] + }, + { + "cell_type": "code", + "execution_count": 493, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.33846154 0.36923077 0.29230769]\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 493, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
def weighted_covariance(x, y, likelihoods, mu_k):
    """Return the weighted covariance between two coordinate vectors.

    Parameters
    ----------
    x, y : array_like, shape (n,)
        The two coordinates of every sample.
    likelihoods : array_like, shape (n,)
        Non-negative weight (responsibility) of every sample.
    mu_k : sequence of length 2
        ``mu_k[0]`` is subtracted from ``x`` and ``mu_k[1]`` from ``y``, so
        the caller must pass the means in the same order as the coordinates.

    Returns
    -------
    float
        ``sum(w * (x - mu_k[0]) * (y - mu_k[1])) / sum(w)``.
    """
    weights = np.asarray(likelihoods, dtype=float)
    return np.sum(weights * (x - mu_k[0]) * (y - mu_k[1])) / weights.sum()


class EM:
    """Gaussian mixture model fitted with expectation-maximization.

    Parameters
    ----------
    k : int
        Number of mixture components.

    Attributes
    ----------
    mu : ndarray of shape (k, d) or None
        Component means. ``None`` until `fit` initialises them from the
        training data (or until they are assigned by hand).
    cov : ndarray of shape (k, d, d)
        Component covariance matrices, identity matrices to start with.
    pi : ndarray of shape (k,)
        Mixing proportions, uniform to start with.
    log_likelihoods : ndarray of shape (k, n) or None
        Log responsibilities ``log p(z_i = k | x_i)`` written by the most
        recent `expectation_step`.
    """

    def __init__(self, k):
        self.k = k
        self.z = np.arange(k)
        # The means are drawn from the training data on the first call to
        # `fit`, so the model does not depend on a notebook-level variable.
        self.mu = None
        self.cov = np.stack([np.eye(2) for _ in range(k)])
        self.pi = np.ones(k) / k
        self.log_likelihoods = None

    def _initialise(self, x):
        """Start from k distinct training rows as means and identity covariances."""
        n_samples, n_features = x.shape
        if n_samples < self.k:
            raise ValueError(
                "need at least %d samples to initialise %d components, got %d"
                % (self.k, self.k, n_samples)
            )
        rows = np.random.choice(n_samples, self.k, replace=False)
        self.mu = x[rows].copy()
        self.cov = np.stack([np.eye(n_features) for _ in range(self.k)])

    def _log_joint(self, x):
        """Return ``log pi_k + log N(x_i | mu_k, cov_k)`` as a (k, n) array."""
        if self.mu is None:
            raise RuntimeError("the model has no means yet; call fit() first")
        x = np.asarray(x, dtype=float)
        # A component whose weight collapsed to zero gets log(0) = -inf, which
        # simply removes it from the mixture; silence the warning for that.
        with np.errstate(divide="ignore"):
            log_pi = np.log(self.pi)
        log_joint = np.empty((self.k, x.shape[0]))
        for z_i in self.z:
            component = stats.multivariate_normal(self.mu[z_i], self.cov[z_i])
            log_joint[z_i, :] = log_pi[z_i] + component.logpdf(x)
        return log_joint

    def expectation_step(self, x):
        """Compute the log responsibilities of every component for every sample.

        The result is stored in ``self.log_likelihoods``.

        Returns
        -------
        float
            Log-likelihood of ``x`` under the current parameters.
        """
        log_joint = self._log_joint(x)
        # Marginalise over the components in log space; exponentiating first
        # underflows to zero for samples far away from every component.
        log_marginal = np.logaddexp.reduce(log_joint, axis=0)
        self.log_likelihoods = log_joint - log_marginal
        return float(log_marginal.sum())

    def maximization_step(self, x):
        """Re-estimate ``pi``, ``mu`` and ``cov`` from the stored responsibilities."""
        if self.log_likelihoods is None:
            raise RuntimeError("call expectation_step() before maximization_step()")
        x = np.asarray(x, dtype=float)
        n_samples, n_features = x.shape
        if self.mu is None:
            self.mu = np.zeros((self.k, n_features))
        responsibilities = np.exp(self.log_likelihoods)

        for z_i in self.z:
            weights = responsibilities[z_i]
            total = weights.sum()

            # weighted pi
            self.pi[z_i] = total / n_samples
            if total <= 0:
                # No sample supports this component: keep its old mean and
                # covariance instead of dividing by zero.
                continue

            # weighted average
            self.mu[z_i] = weights @ x / total

            # weighted covariance: every coordinate is centred on its own mean
            centred = x - self.mu[z_i]
            cov = (weights[:, None] * centred).T @ centred / total
            cov = 0.5 * (cov + cov.T)  # remove floating-point asymmetry

            # Only accept a well-conditioned, positive-definite estimate.
            if np.all(np.linalg.eigvalsh(cov) > 0) and (1e-6 < np.linalg.det(cov)):
                self.cov[z_i] = cov

    def fit(self, x, max_iter=1000, tol=0.01):
        """Run EM until the data log-likelihood changes by at most ``tol``.

        Parameters
        ----------
        x : array_like, shape (n, d)
            Training samples.
        max_iter : int, optional
            Upper bound on the number of EM iterations.
        tol : float, optional
            Convergence threshold on the change in log-likelihood.

        Raises
        ------
        ValueError
            If ``x`` is not two-dimensional, or has fewer rows than components
            when the means still have to be initialised.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError("x must have shape (n_samples, n_features)")
        if self.mu is None:
            self._initialise(x)

        last_log_likelihood = -np.inf
        for _ in range(max_iter):
            log_likelihood = self.expectation_step(x)
            self.maximization_step(x)
            if np.abs(log_likelihood - last_log_likelihood) <= tol:
                break
            last_log_likelihood = log_likelihood

    def predict(self, x):
        """Return the index of the most probable component for every row of ``x``."""
        return self._log_joint(x).argmax(0)


# Inside a notebook kernel __name__ is "__main__", so the demo still runs
# there; the guard only keeps it from running when this code is imported.
if __name__ == "__main__":
    m = EM(3)
    m.fit(data)
    y = m.predict(data)

    # One scatter call per component so each cluster gets its own colour.
    for label in range(m.k):
        mask = y == label
        plt.scatter(data[mask][:, 0], data[mask][:, 1])