policy network openai gym flagpole
parent 5b68b7ec43
commit 40dcf31329

0    linear_regression/__init__.py    Normal file
0    linear_regression/data.txt    Normal file
0    linear_regression/model.py    Normal file
321    reinforcement_learning/policy_gradients_flagpole.ipynb    Normal file
@@ -0,0 +1,321 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pickle\n",
    "import tensorflow as tf\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import math\n",
    "import gym\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Determining the indexes of the probabilities that are responsible for the actions taken during the learning phase\n",
    "\n",
    "``` python\n",
    "# NOTE 1. Example of how the probabilities for the loss function are found.\n",
    "# probabilities (3, 2)\n",
    "p = np.array([[ 0.43075615, 0.56924385],\n",
    "              [ 0.63075615, 0.36924385],\n",
    "              [ 0.43075615, 0.56924385]])\n",
    "\n",
    "# actions taken (3, )\n",
    "a = np.array([1, 0, 1])\n",
    "\n",
    "# indexes of the probabilities responsible for the action\n",
    "indexes = np.arange(0, a.shape[0])\n",
    "indexes = indexes * p.shape[1] + a\n",
    "np.take(p.ravel(), indexes)\n",
    "```\n",
    "\n",
    "Output:\n",
    "\n",
    "array([ 0.56924385, 0.63075615, 0.56924385])"
   ]
  },
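  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same selection can also be written with a one-hot mask, which is the form used by the `Agent` class below (`self.p * self.executed_actions` followed by `reduce_max`). A minimal numpy sketch, added here for illustration and reusing `p` and `a` from the example above:\n",
    "\n",
    "``` python\n",
    "# one-hot encode the taken actions: shape (3, 2)\n",
    "mask = np.eye(p.shape[1])[a]\n",
    "\n",
    "# zero out the probabilities of the actions that were not taken, then take the\n",
    "# row-wise maximum to recover the probabilities of the taken actions\n",
    "(p * mask).max(axis=1)\n",
    "# -> array([ 0.56924385, 0.63075615, 0.56924385])\n",
    "```"
   ]
  },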
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/\n",
    "\n",
    "class Agent:\n",
    "    def __init__(self, D, H, learning_rate):\n",
    "        # Step 1: Feed forward\n",
    "        # Neural net with one hidden layer. Outputs a probability of shape (1, 2) -> [[ 0.43075615, 0.56924385]].\n",
    "        # The argmax is the chosen action.\n",
    "        self.input_s = tf.placeholder(tf.float32, [None, D], name=\"input_s\")\n",
    "        self.w1 = tf.get_variable(\"w1\", shape=[D, H], initializer=tf.contrib.layers.xavier_initializer())\n",
    "        self.layer_1 = tf.nn.relu(tf.matmul(self.input_s, self.w1))\n",
    "\n",
    "        self.w2 = tf.get_variable(\"w2\", shape=[H, 2], initializer=tf.contrib.layers.xavier_initializer())\n",
    "        # probability\n",
    "        self.p = tf.nn.softmax(tf.matmul(self.layer_1, self.w2))\n",
    "\n",
    "        # Step 2: Determine loss / gradients.\n",
    "        # During learning, actions are sampled from these probabilities (instead of always taking\n",
    "        # the argmax), so the agent also executes some exploratory actions for learning purposes.\n",
    "\n",
    "        self.executed_actions = tf.placeholder(tf.float32, name=\"executed_actions\")\n",
    "        self.gained_reward = tf.placeholder(tf.float32, name=\"gained_reward\")\n",
    "\n",
    "        # Because actions are sampled, the taken action is not necessarily the argmax.\n",
    "        # https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/\n",
    "        # See note 1\n",
    "        # y * log(y')\n",
    "        log_prob = tf.log(tf.reduce_max(self.p * self.executed_actions, 1))\n",
    "        self.loss = -tf.reduce_mean(log_prob * self.gained_reward)\n",
    "        self.all_weights = tf.trainable_variables()\n",
    "        self.gradients = tf.gradients(self.loss, self.all_weights)\n",
    "\n",
    "        # Step 3: Update weights\n",
    "        self.batched_gradients = [tf.placeholder(tf.float32, name=\"batched_gradients_w1\"),\n",
    "                                  tf.placeholder(tf.float32, name=\"batched_gradients_w2\")]\n",
    "        optimizer = tf.train.AdagradOptimizer(learning_rate)\n",
    "        self.train = optimizer.apply_gradients(zip(self.batched_gradients, self.all_weights))\n"
   ]
  },
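  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the loss defined above, outside TensorFlow. This is an illustrative numpy sketch (not part of the original training code) that reuses the example `p` and the one-hot `mask` from above, together with made-up discounted returns, and mirrors `-reduce_mean(log(p_taken) * reward)`:\n",
    "\n",
    "``` python\n",
    "rewards = np.array([1.0, 2.0, 0.5])         # hypothetical discounted returns\n",
    "p_taken = (p * mask).max(axis=1)            # probabilities of the taken actions\n",
    "loss = -np.mean(np.log(p_taken) * rewards)  # same form as agent.loss\n",
    "\n",
    "# Taken actions with a high return but a low probability contribute the largest\n",
    "# positive terms, so minimising this loss pushes their probabilities up.\n",
    "```"
   ]
  },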
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def discounted_reward(r, gamma):\n",
    "    \"\"\"\n",
    "    The reward for a given state is the reward for that state + the discounted sum of future rewards.\n",
    "\n",
    "    :param r: (array) Rewards.\n",
    "    :param gamma: (flt) Discount factor\n",
    "    \"\"\"\n",
    "    # G_t = r_t + gamma * G_{t+1}: weight the rewards, take a reversed cumulative sum,\n",
    "    # and rescale so that every entry is discounted relative to its own time step.\n",
    "    r = np.asarray(r, dtype=np.float64)\n",
    "    weights = gamma ** np.arange(len(r))\n",
    "    return np.cumsum((r * weights)[::-1])[::-1] / weights\n"
   ]
  },
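  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small worked example (for illustration only) of the discounted reward-to-go that `discounted_reward` computes, assuming three unit rewards and `gamma = 0.99`:\n",
    "\n",
    "``` python\n",
    "discounted_reward([1.0, 1.0, 1.0], 0.99)\n",
    "# G_2 = 1\n",
    "# G_1 = 1 + 0.99 * 1    = 1.99\n",
    "# G_0 = 1 + 0.99 * 1.99 = 2.9701\n",
    "# -> approximately array([2.9701, 1.99, 1.0])\n",
    "```"
   ]
  },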
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2017-10-31 22:23:16,898] Making new env: CartPole-v0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.9451\n",
      "running_time 23.3216080402\n",
      "7.15753\n",
      "running_time 23.965\n",
      "7.85798\n",
      "running_time 27.11\n",
      "9.00564\n",
      "running_time 34.065\n",
      "9.13331\n",
      "running_time 34.56\n",
      "10.198\n",
      "running_time 41.785\n",
      "11.1733\n",
      "running_time 50.715\n",
      "10.3928\n",
      "running_time 43.225\n",
      "11.3053\n",
      "running_time 52.125\n",
      "11.7537\n",
      "running_time 54.355\n",
      "12.2253\n",
      "running_time 60.125\n",
      "12.4728\n",
      "running_time 63.035\n",
      "12.8118\n",
      "running_time 66.7\n",
      "12.5943\n",
      "running_time 66.055\n",
      "12.7572\n",
      "running_time 66.355\n",
      "13.5309\n",
      "running_time 77.23\n",
      "13.3261\n",
      "running_time 75.375\n",
      "13.427\n",
      "running_time 77.5\n",
      "14.5608\n",
      "running_time 94.195\n",
      "14.6467\n",
      "running_time 94.19\n",
      "14.5983\n",
      "running_time 95.485\n",
      "15.4137\n",
      "running_time 107.34\n",
      "15.6474\n",
      "running_time 112.155\n",
      "15.6686\n",
      "running_time 115.3\n",
      "15.9861\n",
      "running_time 124.815\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "\n",
    "H = 16 # hidden neurons\n",
    "D = 4 # input (state of the environment)\n",
    "learning_rate = 3e-2\n",
    "gamma = 0.99 # discount factor\n",
    "epochs = 5000\n",
    "max_frames = 999\n",
    "action_space = 2\n",
    "update_frequency = 5\n",
    "\n",
    "agent = Agent(D, H, learning_rate)\n",
    "env = gym.make(\"CartPole-v0\")\n",
    "\n",
    "init = tf.global_variables_initializer()\n",
    "\n",
    "#with tf.Session() as sess:\n",
    "sess = tf.Session()\n",
    "sess.run(init)\n",
    "\n",
    "running_time = []\n",
    "\n",
    "# element-wise zero buffers for accumulating the gradients of update_frequency episodes\n",
    "gradients_batch = [np.zeros_like(w) for w in sess.run(tf.trainable_variables())]\n",
    "\n",
    "for ep in range(epochs):\n",
    "    report_interval = 200\n",
    "    if (ep + 1) % report_interval == 0:\n",
    "        print(\"running_time\", np.mean(running_time[-report_interval:]))\n",
    "\n",
    "    s = env.reset()\n",
    "    states = [s]\n",
    "    actions = []\n",
    "    rewards = []\n",
    "    probs = []\n",
    "\n",
    "    for i in range(max_frames):\n",
    "        p = sess.run(agent.p, {agent.input_s: [s]})\n",
    "        probs.append(p)\n",
    "        a = np.random.choice((0, 1), p=p[0]) # choose an action index\n",
    "\n",
    "        s, r, done, _ = env.step(a)\n",
    "\n",
    "        actions.append(np.eye(action_space)[a])\n",
    "        rewards.append(r)\n",
    "\n",
    "        if not done:\n",
    "            states.append(s)\n",
    "\n",
    "        else: # game is done. Update weights.\n",
    "            running_time.append(i)\n",
    "\n",
    "            feed = {\n",
    "                agent.input_s: np.vstack(states),\n",
    "                agent.executed_actions: np.vstack(actions),\n",
    "                agent.gained_reward: discounted_reward(rewards, gamma)\n",
    "            }\n",
    "\n",
    "            loss, gradients = sess.run([agent.loss, agent.gradients], feed_dict=feed)\n",
    "            gradients_batch = [acc + g for acc, g in zip(gradients_batch, gradients)]\n",
    "\n",
    "            if (ep + 1) % update_frequency == 0:\n",
    "                # update weights\n",
    "                sess.run(agent.train, dict(zip(agent.batched_gradients, gradients_batch)))\n",
    "                gradients_batch = [acc * 0 for acc in gradients_batch]\n",
    "\n",
    "            break\n"
   ]
  },
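  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The weight update above does not call `optimizer.minimize` directly; instead the gradients are computed per episode, accumulated outside the graph, and fed back through the `batched_gradients` placeholders. A stripped-down sketch of that pattern (illustrative only; `collected_gradients` is a hypothetical list of per-episode gradient lists, and `agent`/`sess` are the objects defined above):\n",
    "\n",
    "``` python\n",
    "# start from element-wise zero buffers, one per trainable weight\n",
    "grad_buffer = [np.zeros_like(w) for w in sess.run(agent.all_weights)]\n",
    "\n",
    "# accumulate the gradients of several episodes\n",
    "for episode_gradients in collected_gradients:\n",
    "    grad_buffer = [acc + g for acc, g in zip(grad_buffer, episode_gradients)]\n",
    "\n",
    "# feed the accumulated gradients to the placeholders and apply them in one step\n",
    "feed = dict(zip(agent.batched_gradients, grad_buffer))\n",
    "sess.run(agent.train, feed)\n",
    "\n",
    "# reset the buffers for the next batch of episodes\n",
    "grad_buffer = [acc * 0 for acc in grad_buffer]\n",
    "```"
   ]
  },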
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 1000.0"
     ]
    }
   ],
   "source": [
    "s = env.reset()\n",
    "import time\n",
    "\n",
    "total_r = 0\n",
    "for _ in range(1000):\n",
    "    time.sleep(0.01)\n",
    "    a_dst = sess.run(agent.p, {agent.input_s: [s]})\n",
    "    a = np.argmax(a_dst)\n",
    "    env.render(close=0)\n",
    "    s, r, d, _ = env.step(a)\n",
    "    total_r += r\n",
    "\n",
    "    if d:\n",
    "        s = env.reset()\n",
    "\n",
    "    print(\"\\r\", total_r, end=\"\")"
   ]
  },
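  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "matplotlib is imported at the top but not used anywhere in this notebook; as an optional extra (not in the original code), the episode lengths collected in `running_time` during training can be plotted as a learning curve:\n",
    "\n",
    "``` python\n",
    "plt.plot(running_time)      # frames survived per episode\n",
    "plt.xlabel(\"episode\")\n",
    "plt.ylabel(\"frames survived\")\n",
    "plt.show()\n",
    "```"
   ]
  },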
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "env.render(close=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}