remove flaw from gbm code

ritchie46 2019-01-01 12:19:55 +01:00
parent ec6bbdcdf5
commit be12e61e15


@@ -78,7 +78,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"/opt/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n",
+"/opt/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n",
 " warnings.warn(CV_WARNING, FutureWarning)\n",
 "/opt/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
 " DeprecationWarning)\n"
@@ -101,9 +101,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": 107,
+"execution_count": 15,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Training score: 0.8588362070152573 \tTesting score: 0.4658251462106008\n"
+]
+}
+],
 "source": [
 "class MAE:\n",
 " def loss(y_true, y_pred):\n",
@@ -142,101 +150,23 @@
 " res = y - y_pred\n",
 " m.fit(x, -MAE.prime(y, y_pred))\n",
 " \n",
-" leaf_idx = self.m.apply(x)\n",
-" y_pred_tree = self.m.predict(x)\n",
+" leaf_idx = m.apply(x)\n",
+" y_pred_tree = m.predict(x)\n",
 " \n",
 " for leaf in set(leaf_idx):\n",
 " current_leaf_idx = np.where(leaf_idx == leaf)[0] \n",
-" self.m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx]) \n",
-" \n",
-" self.f.append(m)\n",
-" self.learning_rates.append(lr)\n",
-"\n",
-" def predict(self, x):\n",
-" return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 6,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Training score: 0.8248281659615961 \tTesting score: 0.42262153419864723\n"
-]
-}
-],
-"source": [
-"\n",
-"m = GradientBooster(20)\n",
-"m.fit(x_train, y_train)\n",
-"evaluate(m)\n",
-" "
-]
-},
-{
-"cell_type": "code",
-"execution_count": 108,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Training score: 18.028543270693472 \tTesting score: 48.24766238623177\n"
-]
-}
-],
-"source": [
-"class MAE:\n",
-" def loss(y_true, y_pred):\n",
-" return y_true - y_pred\n",
-" \n",
-" def prime(y_true, y_pred):\n",
-" return np.sign(y_pred - y_true)\n",
-" \n",
-"class L1GradientBooster:\n",
-" def __init__(self, n_trees=20):\n",
-" # It seems that the decision tree splits have a random process?\n",
-" np.random.seed(132)\n",
-" self.f = []\n",
-" self.learning_rates = []\n",
-" self.n_trees = n_trees\n",
-"\n",
-" def fit(self, x, y, lr=0.4):\n",
-" class F0:\n",
-" predict = lambda x: np.median(y) * np.ones(x.shape[0])\n",
-"\n",
-" self.f.append(F0)\n",
-" self.learning_rates.append(1)\n",
-"\n",
-" for _ in range(self.n_trees):\n",
-" m = tree.DecisionTreeRegressor(max_depth=5)\n",
-" \n",
-" y_pred = self.predict(x)\n",
-" res = y - y_pred\n",
-" m.fit(x, -MAE.prime(y, y_pred))\n",
-" \n",
-" leaf_idx = self.m.apply(x)\n",
-" y_pred_tree = self.m.predict(x)\n",
-" \n",
-" for leaf in set(leaf_idx):\n",
-" current_leaf_idx = np.where(leaf_idx == leaf)[0] \n",
-" self.m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx]) \n",
+" m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx]) \n",
 " \n",
 " self.f.append(m)\n",
 " self.learning_rates.append(lr)\n",
 "\n",
 " def predict(self, x):\n",
 " return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))\n",
 " \n",
-" \n",
-"m = GenericGradientBooster(MAE, n_trees=20)\n",
+"m = L1GradientBooster(20)\n",
 "m.fit(x_train, y_train)\n",
-"evaluate(m)"
+"evaluate(m)\n",
+" "
 ]
 },
 {
@@ -353,7 +283,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.0"
+"version": "3.7.1"
 }
 },
 "nbformat": 4,
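
For context, the flaw this commit removes: inside fit, the freshly trained tree is the local variable m, but the leaf-update lines referenced self.m, an attribute the class never sets, so the median leaf correction did not apply to the tree actually being trained. Below is a minimal, self-contained sketch of what the fixed cell computes, assuming numpy and scikit-learn. The make_regression demo data and the MAE.loss definition (written here as mean absolute error, unlike the raw residual in the notebook cell) are illustrative assumptions, since the notebook's x_train, y_train, and evaluate come from cells outside this diff. Each boosting round fits a depth-5 tree to the sign of the residuals (the negative L1 gradient), then overwrites every leaf value with the median residual of the samples in that leaf, the L1-optimal constant for that region.

import numpy as np
from sklearn import tree
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split


class MAE:
    # L1 loss; its negative gradient w.r.t. the prediction is a sign vector.
    @staticmethod
    def loss(y_true, y_pred):
        # Illustrative assumption: mean absolute error, not the notebook's raw residual.
        return np.abs(y_true - y_pred).mean()

    @staticmethod
    def prime(y_true, y_pred):
        return np.sign(y_pred - y_true)


class L1GradientBooster:
    def __init__(self, n_trees=20):
        self.f = []
        self.learning_rates = []
        self.n_trees = n_trees

    def fit(self, x, y, lr=0.4):
        # F0 is the best constant model under L1 loss: the median of y.
        class F0:
            predict = lambda x: np.median(y) * np.ones(x.shape[0])

        self.f.append(F0)
        self.learning_rates.append(1)

        for _ in range(self.n_trees):
            m = tree.DecisionTreeRegressor(max_depth=5)
            y_pred = self.predict(x)
            res = y - y_pred
            # Fit the tree to the negative L1 gradient (signs of the residuals).
            m.fit(x, -MAE.prime(y, y_pred))

            # The commit's fix: use the local tree m, not self.m.
            leaf_idx = m.apply(x)
            # Line-search step: overwrite each leaf with the median residual
            # of the training samples that fall into it.
            for leaf in set(leaf_idx):
                current_leaf_idx = np.where(leaf_idx == leaf)[0]
                m.tree_.value[leaf, 0, 0] = np.median(res[current_leaf_idx])

            self.f.append(m)
            self.learning_rates.append(lr)

    def predict(self, x):
        return sum(f.predict(x) * lr for f, lr in zip(self.f, self.learning_rates))


# Demo on synthetic data (an assumption; the notebook uses its own split and evaluate).
x, y = make_regression(n_samples=500, n_features=10, noise=10, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

m = L1GradientBooster(20)
m.fit(x_train, y_train)
print("Train MAE:", MAE.loss(y_train, m.predict(x_train)))
print("Test MAE:", MAE.loss(y_test, m.predict(x_test)))

Run as a script, this prints train and test MAE on the synthetic data; the scores shown in the diff (0.8588 train / 0.4658 test) come from the notebook's own data and evaluate helper, which are not part of this commit.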