diff --git a/ensembles/gradient_boosting.ipynb b/ensembles/gradient_boosting.ipynb
new file mode 100644
index 0000000..95ad1ad
--- /dev/null
+++ b/ensembles/gradient_boosting.ipynb
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import datasets, tree, model_selection, metrics\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class GradientBooster:\n",
+    "    def __init__(self, n_trees=20):\n",
+    "        self.f = []\n",
+    "        self.learning_rates = []\n",
+    "        self.n_trees = n_trees\n",
+    "\n",
+    "    def fit(self, x, y, lr=0.1):\n",
+    "        # F0 predicts the mean of y. The class object itself is appended,\n",
+    "        # so its `predict` lambda behaves like a static method.\n",
+    "        class F0:\n",
+    "            predict = lambda x: np.mean(y) * np.ones(x.shape[0])\n",
+    "        self.f.append(F0)\n",
+    "        self.learning_rates.append(1)\n",
+    "\n",
+    "        for _ in range(self.n_trees):\n",
+    "            m = tree.DecisionTreeRegressor(max_depth=5)\n",
+    "            # Each new tree is fitted on the residuals of the current ensemble.\n",
+    "            res = y - self.predict(x)\n",
+    "            m.fit(x, res)\n",
+    "            self.f.append(m)\n",
+    "            self.learning_rates.append(lr)\n",
+    "\n",
+    "    def predict(self, x):\n",
+    "        return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))"
+   ]
+  },
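+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In equations, the class above implements the plain least-squares boosting recursion (a summary of what the code does; $\\eta$ is the `lr` argument): start from a constant model, then repeatedly fit a small tree to the current residuals and add it with a shrinkage factor:\n",
+    "\n",
+    "$$F_0(x) = \\bar{y}, \\qquad r_i^{(m)} = y_i - F_{m-1}(x_i), \\qquad F_m(x) = F_{m-1}(x) + \\eta\\, h_m(x),$$\n",
+    "\n",
+    "where $h_m$ is the depth-5 regression tree fitted on the residuals $r^{(m)}$."
+   ]
+  },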
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Some data\n",
+    "np.random.seed(123)\n",
+    "diabetes = datasets.load_diabetes()\n",
+    "x = diabetes['data']\n",
+    "y = diabetes['target']\n",
+    "x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate(m):\n",
+    "    print('Training score:', metrics.r2_score(y_train, m.predict(x_train)), '\\tTesting score:', metrics.r2_score(y_test, m.predict(x_test)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training score: 0.6595521969069875 \tTesting score: 0.14972533215961115\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n",
+      "  warnings.warn(CV_WARNING, FutureWarning)\n",
+      "/opt/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
+      "  DeprecationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Algorithm to beat\n",
+    "p = {'max_depth': [5, 10, 15, 20],\n",
+    "     'min_samples_split': [2, 3, 7],\n",
+    "     'min_samples_leaf': [1, 3, 7]}\n",
+    "\n",
+    "m = model_selection.GridSearchCV(tree.DecisionTreeRegressor(), p)\n",
+    "m.fit(x_train, y_train)\n",
+    "\n",
+    "evaluate(m)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class MAE:\n",
+    "    @staticmethod\n",
+    "    def loss(y_true, y_pred):\n",
+    "        return np.abs(y_true - y_pred)\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def prime(y_true, y_pred):\n",
+    "        return np.sign(y_pred - y_true)\n",
+    "\n",
+    "\n",
+    "class MSE:\n",
+    "    @staticmethod\n",
+    "    def loss(y_true, y_pred):\n",
+    "        return (y_true - y_pred)**2\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def prime(y_true, y_pred):\n",
+    "        # Gradient of 1/2 * MSE w.r.t. the prediction.\n",
+    "        return -(y_true - y_pred)\n",
+    "\n",
+    "\n",
+    "class L1GradientBooster:\n",
+    "    def __init__(self, n_trees=20):\n",
+    "        # Tree fitting can break ties between equally good splits at\n",
+    "        # random, so fix the seed for reproducibility.\n",
+    "        np.random.seed(132)\n",
+    "        self.f = []\n",
+    "        self.learning_rates = []\n",
+    "        self.n_trees = n_trees\n",
+    "\n",
+    "    def fit(self, x, y, lr=0.4):\n",
+    "        # F0 predicts the median of y, the constant minimizing the absolute loss.\n",
+    "        class F0:\n",
+    "            predict = lambda x: np.median(y) * np.ones(x.shape[0])\n",
+    "\n",
+    "        self.f.append(F0)\n",
+    "        self.learning_rates.append(1)\n",
+    "\n",
+    "        for _ in range(self.n_trees):\n",
+    "            m = tree.DecisionTreeRegressor(max_depth=5)\n",
+    "\n",
+    "            y_pred = self.predict(x)\n",
+    "            res = y - y_pred\n",
+    "            # Fit the tree to the pseudo-residuals (negative gradients).\n",
+    "            m.fit(x, -MAE.prime(y, y_pred))\n",
+    "\n",
+    "            # Re-label every leaf with the median of the residuals that fall\n",
+    "            # into it, which is the L1-optimal leaf value.\n",
+    "            leaf_idx = m.apply(x)\n",
+    "            for leaf in set(leaf_idx):\n",
+    "                current_leaf_idx = np.where(leaf_idx == leaf)[0]\n",
+    "                m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx])\n",
+    "\n",
+    "            self.f.append(m)\n",
+    "            self.learning_rates.append(lr)\n",
+    "\n",
+    "    def predict(self, x):\n",
+    "        return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training score: 0.8248281659615961 \tTesting score: 0.42262153419864723\n"
+     ]
+    }
+   ],
+   "source": [
+    "m = GradientBooster(20)\n",
+    "m.fit(x_train, y_train)\n",
+    "evaluate(m)"
+   ]
+  },
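+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The leaf re-labelling in `L1GradientBooster` relies on two things: `DecisionTreeRegressor.apply()` maps each sample to the leaf it lands in, and the fitted leaf values live in the writable array `tree_.value`. A minimal sketch of that trick in isolation (the arrays `x_demo`/`y_demo` are made up for this demo):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Toy data for illustration only.\n",
+    "x_demo = np.arange(10, dtype=float).reshape(-1, 1)\n",
+    "y_demo = np.array([1., 1., 1., 1., 1., 9., 9., 9., 9., 100.])\n",
+    "\n",
+    "t = tree.DecisionTreeRegressor(max_depth=1)\n",
+    "t.fit(x_demo, y_demo)\n",
+    "print(t.predict(x_demo))  # each leaf predicts the mean of its samples\n",
+    "\n",
+    "leaf_idx = t.apply(x_demo)  # leaf id for every sample\n",
+    "for leaf in set(leaf_idx):\n",
+    "    samples = np.where(leaf_idx == leaf)[0]\n",
+    "    # Overwrite the leaf's stored value with the median of its samples.\n",
+    "    t.tree_.value[leaf, 0, 0] = np.median(y_demo[samples])\n",
+    "\n",
+    "print(t.predict(x_demo))  # now each leaf predicts the median"
+   ]
+  },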
+ " for leaf in set(leaf_idx):\n", + " current_leaf_idx = np.where(leaf_idx == leaf)[0] \n", + " self.m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx]) \n", + " \n", + " self.f.append(m)\n", + " self.learning_rates.append(lr)\n", + "\n", + " def predict(self, x):\n", + " return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))\n", + " \n", + " \n", + "m = GenericGradientBooster(MAE, n_trees=20)\n", + "m.fit(x_train, y_train)\n", + "evaluate(m)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training score: 21.389953590779374 \tTesting score: 45.09319231571819\n" + ] + } + ], + "source": [ + "\n", + " \n", + "\n", + "class GenericGradientBooster:\n", + " def __init__(self, criterion=MAE, n_trees=20):\n", + " # It seems that the decision tree splits have a random process?\n", + " np.random.seed(132)\n", + " self.f = []\n", + " self.learning_rates = []\n", + " self.criterion = criterion\n", + " self.n_trees = n_trees\n", + "\n", + " def fit(self, x, y, lr=0.4):\n", + " class F0:\n", + " predict = lambda x: np.mean(y) * np.ones(x.shape[0])\n", + "\n", + " self.f.append(F0)\n", + " self.learning_rates.append(1)\n", + "\n", + " for _ in range(self.n_trees):\n", + " m = tree.DecisionTreeRegressor(max_depth=5)\n", + " \n", + " y_pred = self.predict(x)\n", + " res = y - y_pred\n", + " m.fit(x, -self.criterion.prime(y, y_pred)) \n", + " \n", + " m_wrap = WeakLearner(m, self.criterion)\n", + " # train on the residuals as y_m-1 + res = y\n", + " m_wrap.fit(x, res)\n", + " \n", + " self.f.append(m_wrap)\n", + " self.learning_rates.append(lr)\n", + "\n", + " def predict(self, x):\n", + " return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))\n", + " \n", + "class WeakLearner:\n", + " def __init__(self, m, loss):\n", + " \"\"\"\n", + " :param m: (DecisionTree) Trained on the derivate of the loss.\n", + " :param loss: Loss class that implements a `prime` method which returns the gradients.\n", + " \"\"\"\n", + " self.m = m\n", + " self.gamma = None\n", + " self.leaf_map = None\n", + " self.loss = loss\n", + " \n", + " def fit(self, x, y):\n", + " \"\"\"\n", + " :param x: (np.array) The features in shape (rows, columns)\n", + " :param y: (np.array) The residuals, not the gradients (pseudo-residuals)! 
\n", + " The overall residuals are used to optimize gradient descent, as Fm-1 + Fm-now = y --> Fm-now = y - Fm-1 --> y = residuals.\n", + " \"\"\"\n", + " # apply return the leafs that predicted y|x\n", + " leaf_idx = self.m.apply(x)\n", + " \n", + " for leaf in set(leaf_idx):\n", + " current_leaf_idx = np.where(leaf_idx == leaf)[0] \n", + " for _ in range(100):\n", + " y_pred_tree = self.m.predict(x)\n", + " gradient = self.loss.prime(y[current_leaf_idx], y_pred_tree[current_leaf_idx]).sum()\n", + " self.m.tree_.value[leaf, 0, 0] -= 0.05 * gradient\n", + "\n", + " def predict(self, x):\n", + " return self.m.predict(x) \n", + "\n", + "def evaluate(m):\n", + " print('Training score:', metrics.mean_squared_error(y_train, m.predict(x_train)), '\\tTesting score:', metrics.mean_squared_error(y_test, m.predict(x_test)))\n", + " \n", + " \n", + "# m = GenericGradientBooster(MAE, n_trees=10)\n", + "# m.fit(x_train, y_train)\n", + "# evaluate(m)\n", + "\n", + "def evaluate(m):\n", + " print('Training score:', metrics.mean_absolute_error(y_train, m.predict(x_train)), '\\tTesting score:', metrics.mean_absolute_error(y_test, m.predict(x_test)))\n", + " \n", + " \n", + "m = GenericGradientBooster(MAE, n_trees=20)\n", + "m.fit(x_train, y_train)\n", + "evaluate(m)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}