policy network openai gym flagpole
parent 5b68b7ec43
commit 40dcf31329

0    linear_regression/__init__.py    Normal file
0    linear_regression/data.txt    Normal file
0    linear_regression/model.py    Normal file
321    reinforcement_learning/policy_gradients_flagpole.ipynb    Normal file
@@ -0,0 +1,321 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "import tensorflow as tf\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import math\n",
    "import gym\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Determining the indexes of the probabilities that are responsible for the actions taken during the learning phase\n",
    "\n",
    "``` python\n",
    "# NOTE 1. Example of how the probabilities for the loss function are found.\n",
    "# probabilities (3, 2)\n",
    "p = np.array([[ 0.43075615, 0.56924385],\n",
    "              [ 0.63075615, 0.36924385],\n",
    "              [ 0.43075615, 0.56924385]])\n",
    "\n",
    "# actions taken (3, )\n",
    "a = np.array([1, 0, 1])\n",
    "\n",
    "# indexes of the probabilities responsible for the action\n",
    "indexes = np.arange(0, a.shape[0])\n",
    "indexes = indexes * p.shape[1] + a\n",
    "np.take(p.ravel(), indexes)\n",
    "```\n",
    "\n",
    "Output:\n",
    "\n",
    "array([ 0.56924385, 0.63075615, 0.56924385])"
   ]
  },
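  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same selection can also be written with a one-hot mask, which is the form used by the `Agent` class below (`self.p * self.executed_actions` followed by `reduce_max`). A minimal numpy sketch, added here for illustration and reusing `p` and `a` from the example above:\n",
    "\n",
    "``` python\n",
    "# one-hot encode the taken actions: shape (3, 2)\n",
    "mask = np.eye(p.shape[1])[a]\n",
    "\n",
    "# zero out the probabilities of the actions that were not taken, then take the\n",
    "# row-wise maximum to recover the probabilities of the taken actions\n",
    "(p * mask).max(axis=1)\n",
    "# -> array([ 0.56924385, 0.63075615, 0.56924385])\n",
    "```"
   ]
  },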
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/\n",
    "\n",
    "class Agent:\n",
    "    def __init__(self, D, H, learning_rate):\n",
    "        # Step 1: Feed forward\n",
    "        # Neural net with one hidden layer. Outputs a probability of shape (1, 2) -> [[ 0.43075615, 0.56924385]].\n",
    "        # The argmax is the chosen action.\n",
    "        self.input_s = tf.placeholder(tf.float32, [None, D], name=\"input_s\")\n",
    "        self.w1 = tf.get_variable(\"w1\", shape=[D, H], initializer=tf.contrib.layers.xavier_initializer())\n",
    "        self.layer_1 = tf.nn.relu(tf.matmul(self.input_s, self.w1))\n",
    "\n",
    "        self.w2 = tf.get_variable(\"w2\", shape=[H, 2], initializer=tf.contrib.layers.xavier_initializer())\n",
    "        # probability\n",
    "        self.p = tf.nn.softmax(tf.matmul(self.layer_1, self.w2))\n",
    "\n",
    "        # Step 2: Determine loss / gradients.\n",
    "        # During learning, actions are sampled from these probabilities (instead of always taking\n",
    "        # the argmax), so the agent also executes some exploratory actions for learning purposes.\n",
    "\n",
    "        self.executed_actions = tf.placeholder(tf.float32, name=\"executed_actions\")\n",
    "        self.gained_reward = tf.placeholder(tf.float32, name=\"gained_reward\")\n",
    "\n",
    "        # Because actions are sampled, the taken action is not necessarily the argmax.\n",
    "        # https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/\n",
    "        # See note 1\n",
    "        # y * log(y')\n",
    "        log_prob = tf.log(tf.reduce_max(self.p * self.executed_actions, 1))\n",
    "        self.loss = -tf.reduce_mean(log_prob * self.gained_reward)\n",
    "        self.all_weights = tf.trainable_variables()\n",
    "        self.gradients = tf.gradients(self.loss, self.all_weights)\n",
    "\n",
    "        # Step 3: Update weights\n",
    "        self.batched_gradients = [tf.placeholder(tf.float32, name=\"batched_gradients_w1\"),\n",
    "                                  tf.placeholder(tf.float32, name=\"batched_gradients_w2\")]\n",
    "        optimizer = tf.train.AdagradOptimizer(learning_rate)\n",
    "        self.train = optimizer.apply_gradients(zip(self.batched_gradients, self.all_weights))\n"
   ]
  },
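  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the loss defined above, outside TensorFlow. This is an illustrative numpy sketch (not part of the original training code) that reuses the example `p` and the one-hot `mask` from above, together with made-up discounted returns, and mirrors `-reduce_mean(log(p_taken) * reward)`:\n",
    "\n",
    "``` python\n",
    "rewards = np.array([1.0, 2.0, 0.5])         # hypothetical discounted returns\n",
    "p_taken = (p * mask).max(axis=1)            # probabilities of the taken actions\n",
    "loss = -np.mean(np.log(p_taken) * rewards)  # same form as agent.loss\n",
    "\n",
    "# Taken actions with a high return but a low probability contribute the largest\n",
    "# positive terms, so minimising this loss pushes their probabilities up.\n",
    "```"
   ]
  },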
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def discounted_reward(r, gamma):\n",
    "    \"\"\"\n",
    "    The reward for a given state is the reward for that state + the discounted sum of future rewards.\n",
    "\n",
    "    :param r: (array) Rewards.\n",
    "    :param gamma: (flt) Discount factor\n",
    "    \"\"\"\n",
    "    # G_t = r_t + gamma * G_{t+1}: weight the rewards, take a reversed cumulative sum,\n",
    "    # and rescale so that every entry is discounted relative to its own time step.\n",
    "    r = np.asarray(r, dtype=np.float64)\n",
    "    weights = gamma ** np.arange(len(r))\n",
    "    return np.cumsum((r * weights)[::-1])[::-1] / weights\n"
   ]
  },
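  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small worked example (for illustration only) of the discounted reward-to-go that `discounted_reward` computes, assuming three unit rewards and `gamma = 0.99`:\n",
    "\n",
    "``` python\n",
    "discounted_reward([1.0, 1.0, 1.0], 0.99)\n",
    "# G_2 = 1\n",
    "# G_1 = 1 + 0.99 * 1    = 1.99\n",
    "# G_0 = 1 + 0.99 * 1.99 = 2.9701\n",
    "# -> approximately array([2.9701, 1.99, 1.0])\n",
    "```"
   ]
  },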
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2017-10-31 22:23:16,898] Making new env: CartPole-v0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.9451\n",
      "running_time 23.3216080402\n",
      "7.15753\n",
      "running_time 23.965\n",
      "7.85798\n",
      "running_time 27.11\n",
      "9.00564\n",
      "running_time 34.065\n",
      "9.13331\n",
      "running_time 34.56\n",
      "10.198\n",
      "running_time 41.785\n",
      "11.1733\n",
      "running_time 50.715\n",
      "10.3928\n",
      "running_time 43.225\n",
      "11.3053\n",
      "running_time 52.125\n",
      "11.7537\n",
      "running_time 54.355\n",
      "12.2253\n",
      "running_time 60.125\n",
      "12.4728\n",
      "running_time 63.035\n",
      "12.8118\n",
      "running_time 66.7\n",
      "12.5943\n",
      "running_time 66.055\n",
      "12.7572\n",
      "running_time 66.355\n",
      "13.5309\n",
      "running_time 77.23\n",
      "13.3261\n",
      "running_time 75.375\n",
      "13.427\n",
      "running_time 77.5\n",
      "14.5608\n",
      "running_time 94.195\n",
      "14.6467\n",
      "running_time 94.19\n",
      "14.5983\n",
      "running_time 95.485\n",
      "15.4137\n",
      "running_time 107.34\n",
      "15.6474\n",
      "running_time 112.155\n",
      "15.6686\n",
      "running_time 115.3\n",
      "15.9861\n",
      "running_time 124.815\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "\n",
    "H = 16 # hidden neurons\n",
    "D = 4 # input (state of the environment)\n",
    "learning_rate = 3e-2\n",
    "gamma = 0.99 # discount factor\n",
    "epochs = 5000\n",
    "max_frames = 999\n",
    "action_space = 2\n",
    "update_frequency = 5\n",
    "\n",
    "agent = Agent(D, H, learning_rate)\n",
    "env = gym.make(\"CartPole-v0\")\n",
    "\n",
    "init = tf.global_variables_initializer()\n",
    "\n",
    "#with tf.Session() as sess:\n",
    "sess = tf.Session()\n",
    "sess.run(init)\n",
    "\n",
    "running_time = []\n",
    "\n",
    "# element-wise zero buffers for accumulating the gradients of update_frequency episodes\n",
    "gradients_batch = [np.zeros_like(w) for w in sess.run(tf.trainable_variables())]\n",
    "\n",
    "for ep in range(epochs):\n",
    "    report_interval = 200\n",
    "    if (ep + 1) % report_interval == 0:\n",
    "        print(\"running_time\", np.mean(running_time[-report_interval:]))\n",
    "\n",
    "    s = env.reset()\n",
    "    states = [s]\n",
    "    actions = []\n",
    "    rewards = []\n",
    "    probs = []\n",
    "\n",
    "    for i in range(max_frames):\n",
    "        p = sess.run(agent.p, {agent.input_s: [s]})\n",
    "        probs.append(p)\n",
    "        a = np.random.choice((0, 1), p=p[0]) # choose an action index\n",
    "\n",
    "        s, r, done, _ = env.step(a)\n",
    "\n",
    "        actions.append(np.eye(action_space)[a])\n",
    "        rewards.append(r)\n",
    "\n",
    "        if not done:\n",
    "            states.append(s)\n",
    "\n",
    "        else: # game is done. Update weights.\n",
    "            running_time.append(i)\n",
    "\n",
    "            feed = {\n",
    "                agent.input_s: np.vstack(states),\n",
    "                agent.executed_actions: np.vstack(actions),\n",
    "                agent.gained_reward: discounted_reward(rewards, gamma)\n",
    "            }\n",
    "\n",
    "            loss, gradients = sess.run([agent.loss, agent.gradients], feed_dict=feed)\n",
    "            gradients_batch = [acc + g for acc, g in zip(gradients_batch, gradients)]\n",
    "\n",
    "            if (ep + 1) % update_frequency == 0:\n",
    "                # update weights\n",
    "                sess.run(agent.train, dict(zip(agent.batched_gradients, gradients_batch)))\n",
    "                gradients_batch = [acc * 0 for acc in gradients_batch]\n",
    "\n",
    "            break\n"
   ]
  },
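  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The weight update above does not call `optimizer.minimize` directly; instead the gradients are computed per episode, accumulated outside the graph, and fed back through the `batched_gradients` placeholders. A stripped-down sketch of that pattern (illustrative only; `collected_gradients` is a hypothetical list of per-episode gradient lists, and `agent`/`sess` are the objects defined above):\n",
    "\n",
    "``` python\n",
    "# start from element-wise zero buffers, one per trainable weight\n",
    "grad_buffer = [np.zeros_like(w) for w in sess.run(agent.all_weights)]\n",
    "\n",
    "# accumulate the gradients of several episodes\n",
    "for episode_gradients in collected_gradients:\n",
    "    grad_buffer = [acc + g for acc, g in zip(grad_buffer, episode_gradients)]\n",
    "\n",
    "# feed the accumulated gradients to the placeholders and apply them in one step\n",
    "feed = dict(zip(agent.batched_gradients, grad_buffer))\n",
    "sess.run(agent.train, feed)\n",
    "\n",
    "# reset the buffers for the next batch of episodes\n",
    "grad_buffer = [acc * 0 for acc in grad_buffer]\n",
    "```"
   ]
  },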
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 1000.0"
     ]
    }
   ],
   "source": [
    "s = env.reset()\n",
    "import time\n",
    "\n",
    "total_r = 0\n",
    "for _ in range(1000):\n",
    "    time.sleep(0.01)\n",
    "    a_dst = sess.run(agent.p, {agent.input_s: [s]})\n",
    "    a = np.argmax(a_dst)\n",
    "    env.render(close=0)\n",
    "    s, r, d, _ = env.step(a)\n",
    "    total_r += r\n",
    "\n",
    "    if d:\n",
    "        s = env.reset()\n",
    "\n",
    "    print(\"\\r\", total_r, end=\"\")"
   ]
  },
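  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "matplotlib is imported at the top but not used anywhere in this notebook; as an optional extra (not in the original code), the episode lengths collected in `running_time` during training can be plotted as a learning curve:\n",
    "\n",
    "``` python\n",
    "plt.plot(running_time)      # frames survived per episode\n",
    "plt.xlabel(\"episode\")\n",
    "plt.ylabel(\"frames survived\")\n",
    "plt.show()\n",
    "```"
   ]
  },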
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "env.render(close=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}