Q algorithm learns
reinforcement_learning/deep_Q_bridge.ipynb (new file, 841 lines)
@@ -0,0 +1,841 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pickle\n",
"import tensorflow as tf\n",
"import sys\n",
"\n",
"if sys.platform == \"win32\":\n",
"    sys.path.append(r\"C:\\\Users\\\vik\\\Dropbox\\\Code\\\Python\\\structural_engineering\")\n",
"else:\n",
"    sys.path.append(\"/home/ritchie46/Dropbox/Code/Python/structural_engineering\")\n",
"\n",
"from anastruct.fem.system import SystemElements\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0. 0. 0. 0. 1. 0. 0. 0.] 8\n",
"(array([ 0., 2., 0., 0., 1., 0., 0., 0.]), -0.1, False)\n",
"[[ 0. 2. 3. 0.]\n",
" [ 1. 0. 0. 0.]] \n",
"\n",
"[[ 0. 2. 3. 0.]\n",
" [ 1. 0. 0. 4.]]\n"
]
}
],
"source": [
"class Environment:\n",
"    def __init__(self, length=3, height=2, optimize='moment'):\n",
"        self.length = length\n",
"        self.height = height\n",
"        self.state = None\n",
"        self.n = None\n",
"        self.actions_chosen = None\n",
"        self.no_action = None\n",
"        self.action_space = {0, 1, 2, 3, 4, 5, 6, 7}\n",
"        self.valid_actions = None\n",
"        self.optimize = optimize\n",
"        self.result_map = {}\n",
"\n",
"        # actions\n",
"        right = 0\n",
"        left = 4\n",
"        up = 2\n",
"        down = 6\n",
"        up_right = 1\n",
"        up_left = 3\n",
"        down_right = 7\n",
"        down_left = 5\n",
"\n",
"        # On the flattened state array, each move maps to a fixed index offset.\n",
"        self.move_map = {right: 1,\n",
"                         left: -1,\n",
"                         up: -length,\n",
"                         down: length,\n",
"                         up_right: -length + 1,\n",
"                         up_left: -length - 1,\n",
"                         down_right: length + 1,\n",
"                         down_left: length - 1}\n",
"\n",
"    def reset(self):\n",
"        self.state = np.zeros((self.height, self.length))\n",
"        self.n = 1\n",
"        self.actions_chosen = 0\n",
"        self.state[-1][0] = self.n\n",
"        self.det_valid_actions()\n",
"\n",
"        return self.state.ravel()\n",
"#         # valid action encoding\n",
"#         a = np.zeros(8)\n",
"#         a[np.array(self.valid_actions)] = 1\n",
"\n",
"#         return np.concatenate((self.state.ravel(), a))\n",
"\n",
"    def return_action(self, r):\n",
"        done = False\n",
"\n",
"        # Bridge is built\n",
"        if self.state[-1][-1] != 0:\n",
"            r = r + 10 - self.structure()**2  # i.e. the maximum moment, squared\n",
"            done = True\n",
"            return self.state, r, done\n",
"\n",
"        # (normalised state; only used by the commented-out encoding below)\n",
"        s = self.state / np.max(self.state)\n",
"        zero_mask = np.where(s == 0)\n",
"        s[s < 1] = 0.5\n",
"        s[zero_mask] = 0\n",
"        self.det_valid_actions()\n",
"\n",
"        # valid action encoding\n",
"        a = np.zeros(8)\n",
"\n",
"        try:\n",
"            a[np.array(self.valid_actions)] = 1\n",
"        except IndexError:\n",
"            done = True\n",
"            r -= 2\n",
"\n",
"        return self.state.ravel(), r, done\n",
"\n",
"#         return np.concatenate((self.state.ravel(), a)), r, done\n",
"\n",
"    def det_valid_actions(self):\n",
"        no_action = set()\n",
"        right = 0\n",
"        left = 4\n",
"        top = 2\n",
"        down = 6\n",
"        top_right = 1\n",
"        top_left = 3\n",
"        down_right = 7\n",
"        down_left = 5\n",
"\n",
"        # current location\n",
"        row, col = np.where(self.state == self.n)\n",
"\n",
"        # right:\n",
"        try:\n",
"            if self.state[row, col + 1] != 0:\n",
"                no_action.add(right)\n",
"        except IndexError:\n",
"            no_action.add(right)\n",
"\n",
"        if col - 1 < 0:\n",
"            no_action.add(left)\n",
"        elif self.state[row, col - 1] != 0:\n",
"            no_action.add(left)\n",
"\n",
"        if row - 1 < 0:\n",
"            no_action.add(top)\n",
"        elif self.state[row - 1, col] != 0:\n",
"            no_action.add(top)\n",
"\n",
"        try:\n",
"            if self.state[row + 1, col] != 0:\n",
"                no_action.add(down)\n",
"        except IndexError:\n",
"            no_action.add(down)\n",
"\n",
"        if col - 1 < 0 or row + 1 == self.height:\n",
"            no_action.add(down_left)\n",
"        elif self.state[row + 1, col - 1] != 0:\n",
"            no_action.add(down_left)\n",
"\n",
"        try:\n",
"            if self.state[row + 1, col + 1] != 0:\n",
"                no_action.add(down_right)\n",
"        except IndexError:\n",
"            no_action.add(down_right)\n",
"\n",
"        if row - 1 < 0 or col - 1 < 0:\n",
"            no_action.add(top_left)\n",
"        elif self.state[row - 1, col - 1] != 0:\n",
"            no_action.add(top_left)\n",
"\n",
"        if row - 1 < 0 or col + 1 == self.length:\n",
"            no_action.add(top_right)\n",
"        elif self.state[row - 1, col + 1] != 0:\n",
"            no_action.add(top_right)\n",
"\n",
"        self.no_action = no_action\n",
"        self.valid_actions = list(self.action_space - no_action)\n",
"\n",
"    def step(self, a):\n",
"        \"\"\"\n",
"        :param a: (int) action direction\n",
"\n",
"        → 0\n",
"        ↗ 1\n",
"        ↑ 2\n",
"        ↖ 3\n",
"        ← 4\n",
"        ↙ 5\n",
"        ↓ 6\n",
"        ↘ 7\n",
"        \"\"\"\n",
"        self.actions_chosen += 1\n",
"\n",
"        flat_location_index = np.argwhere(self.state.ravel() == self.n)\n",
"\n",
"        if a in self.no_action:\n",
"            return self.return_action(-0.2)\n",
"\n",
"        # there is a valid action\n",
"        self.n += 1\n",
"\n",
"        move = self.move_map[a]\n",
"        self.state.ravel()[flat_location_index + move] = self.n\n",
"\n",
"        return self.return_action(-0.1)\n",
"\n",
"    def structure(self):\n",
"        ss = SystemElements()\n",
"        last_loc = [0, 0]\n",
"        for i in range(2, self.n + 1):\n",
"            row, col = np.where(i == self.state)\n",
"\n",
"            y = self.height - 1 - row[0]\n",
"            x = col[0]\n",
"\n",
"            current_loc = [x, y]\n",
"            ss.add_element([last_loc, [x, y]])\n",
"            last_loc = current_loc\n",
"\n",
"        n_nodes = len(ss.node_map)\n",
"        forces = -5 / (n_nodes - 2)\n",
"        for i in range(2, n_nodes):\n",
"            ss.point_load(node_id=i, Fz=forces)\n",
"\n",
"        ss.add_support_hinged(1)\n",
"        ss.add_support_hinged(len(ss.node_map))\n",
"        ss.solve()\n",
"\n",
"        f_max = np.max(np.abs(ss.get_element_result_range(self.optimize)))\n",
"#         if f_max not in self.result_map:\n",
"#             self.result_map[f_max] = ss\n",
"\n",
"        return f_max\n",
"\n",
"\n",
"def test_env():\n",
"    env = Environment(4, 2)\n",
"    s = env.reset()\n",
"    print(s, s.size)\n",
"    print(env.step(1))\n",
"\n",
"    env.step(0)\n",
"    print(env.state, \"\\\n\")\n",
"    env.step(7)\n",
"    print(env.state)\n",
"    env.structure()\n",
"\n",
"test_env()"
]
},
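{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check (an added sketch, not part of the original notebook): on the flattened `height x length` grid, one move corresponds to a fixed index offset, e.g. `up = -length`, exactly as `move_map` assumes. The cell below verifies this on a hypothetical 2x3 grid."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify the flat-index offsets encoded in Environment.move_map\n",
"grid = np.arange(6).reshape(2, 3)     # 2 rows, length = 3\n",
"flat = grid.ravel()\n",
"i = 4                                 # the element at row 1, col 1\n",
"assert flat[i - 3] == grid[0, 1]      # 'up' is -length on the flat array\n",
"assert flat[i + 1] == grid[1, 2]      # 'right' is +1\n",
"assert flat[i - 3 + 1] == grid[0, 2]  # 'up_right' is -length + 1\n",
"print(\"move_map offsets check out\")"
]
},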
{
"cell_type": "code",
"execution_count": 158,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/\n",
"\n",
"class Agent:\n",
"    def __init__(self, data_size, hidden_size, action_space, learning_rate):\n",
"        \"\"\"\n",
"        :param data_size: (int) Columns of the data vector.\n",
"        :param hidden_size: (int) No. of hidden nodes per layer.\n",
"        :param action_space: (int) No. of outputs.\n",
"        :param learning_rate: (flt)\n",
"        \"\"\"\n",
"        # Step 1: feed forward.\n",
"        # The argmax over the output layer is the greedy action (the one with the maximum Q-value).\n",
"        self.input_s = tf.placeholder(tf.float32, [None, data_size], name=\"input_s\")\n",
"        self.w1 = tf.get_variable(\"w1\", shape=[data_size, hidden_size[0]], initializer=tf.contrib.layers.xavier_initializer())\n",
"        self.b1 = tf.get_variable(\"b1\", shape=(hidden_size[0], ), initializer=tf.zeros_initializer())\n",
"        self.layer_1 = tf.nn.relu(tf.matmul(self.input_s, self.w1) + self.b1)\n",
"\n",
"        self.w2 = tf.get_variable(\"w2\", shape=[hidden_size[0], hidden_size[1]], initializer=tf.contrib.layers.xavier_initializer())\n",
"        self.b2 = tf.get_variable(\"b2\", shape=(hidden_size[1], ), initializer=tf.zeros_initializer())\n",
"        self.layer_2 = tf.nn.relu(tf.matmul(self.layer_1, self.w2) + self.b2)\n",
"\n",
"        self.w_out = tf.get_variable(\"w_out\", shape=[hidden_size[1], action_space], initializer=tf.contrib.layers.xavier_initializer())\n",
"        self.b_out = tf.get_variable(\"b_out\", shape=(action_space, ), initializer=tf.zeros_initializer())\n",
"\n",
"        self.predict_Q = tf.matmul(self.layer_2, self.w_out) + self.b_out  # predicted Q-values per action\n",
"        self.p = tf.nn.softmax(self.predict_Q)\n",
"        self.Q_a = tf.argmax(self.predict_Q, 1)  # argmax_a Q(s, a)\n",
"\n",
"        # Step 2: determine the loss / gradients.\n",
"        # One-hot encoded actions select the Q-value of the action actually taken.\n",
"        self.executed_actions = tf.placeholder(tf.int32, name=\"executed_actions\")\n",
"\n",
"        self.one_hot = tf.one_hot(self.executed_actions, 8)\n",
"        self.Q = tf.reduce_sum(tf.multiply(self.predict_Q, self.one_hot), axis=1)\n",
"        self.next_Q_r = tf.placeholder(tf.float32, name=\"next_Q\")\n",
"\n",
"        # Loss: squared TD error, (target - prediction)^2\n",
"        # = (r + gamma * max_a' Q(s', a') - Q(s, a))^2\n",
"        self.loss = tf.reduce_sum(tf.square(self.next_Q_r - self.Q))\n",
"        optimizer = tf.train.AdamOptimizer(learning_rate)\n",
"        self.train_count = tf.Variable(0, trainable=False)\n",
"        self.train = optimizer.minimize(self.loss, global_step=self.train_count)\n",
"\n",
"        # Create the Saver last, so it captures all variables defined above.\n",
"        self.saver = tf.train.Saver()\n"
]
},
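{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, an added numpy sketch (independent of the TensorFlow graph above) of the quantity the loss minimises: the squared TD error between the target `r + gamma * max_a' Q(s', a')` and the predicted `Q(s, a)` of the executed action. All numbers are made up."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of one term of the Agent's squared TD-error loss, with hypothetical values\n",
"Q_s = np.array([0.1, 0.5, -0.2])     # hypothetical Q(s, .) for three actions\n",
"Q_s_new = np.array([0.0, 0.3, 0.7])  # hypothetical Q(s', .)\n",
"a, r, gamma = 1, -0.1, 0.99\n",
"target = r + gamma * Q_s_new.max()   # r + gamma * max_a' Q(s', a')\n",
"td_error = target - Q_s[a]           # target - Q(s, a)\n",
"td_error**2                          # one term of the summed loss"
]
},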
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def discounted_reward(r, gamma):\n",
"    \"\"\"\n",
"    The return for a given state is the reward for that state plus the discounted sum of future rewards.\n",
"\n",
"    :param r: (array) Rewards.\n",
"    :param gamma: (flt) Discount factor.\n",
"    \"\"\"\n",
"    g = np.zeros(len(r))\n",
"    running = 0.0\n",
"    # accumulate from the last reward backwards: G_t = r_t + gamma * G_{t+1}\n",
"    for t in reversed(range(len(r))):\n",
"        running = r[t] + gamma * running\n",
"        g[t] = running\n",
"    return g\n"
]
},
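{
"cell_type": "markdown",
"metadata": {},
"source": [
"An added check that `discounted_reward` satisfies the recursion `G_t = r_t + gamma * G_{t+1}`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"g = discounted_reward(np.array([1.0, 1.0, 1.0]), 0.9)\n",
"print(g)  # expected: [2.71, 1.9, 1.0]\n",
"assert np.isclose(g[0], 1.0 + 0.9 * g[1])\n",
"assert np.isclose(g[1], 1.0 + 0.9 * g[2])"
]
},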
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"env = Environment(4, 2)\n",
"env.reset().size"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3054 train_count 15 loss 3898.21\n",
"0.4478 train_count 70 loss 16990.0\n",
"0.7954 train_count 107 loss 48269.6\n",
"0.9927 train_count 130 loss 88762.8\n",
"1.2139 train_count 144 loss 129566.0\n",
"1.2376 train_count 145 loss 131087.0\n",
"1.2771 train_count 148 loss 135273.0\n",
"1.2852 train_count 152 loss 153039.0\n",
"1.3482 train_count 154 loss 163115.0\n",
"1.3247 train_count 158 loss 173414.0\n",
"1.4037 train_count 159 loss 180652.0\n"
]
}
],
"source": [
"from collections import deque\n",
"# %matplotlib inline\n",
"# %matplotlib notebook\n",
"# # %load_ext autoreload\n",
"# # %autoreload 2\n",
"\n",
"\n",
"# fig = plt.figure(figsize=(12, 6))\n",
"# ax = fig.add_subplot(111)\n",
"# fig.show()\n",
"# fig.canvas.draw()\n",
"\n",
"\n",
"env = Environment(3, 1, \"moment\")\n",
"\n",
"H = [16, 16]  # hidden neurons\n",
"D = 3  # input size (state of the environment)\n",
"learning_rate = 1e-3\n",
"gamma = 0.99  # discount factor\n",
"epochs = 50000\n",
"max_frames = 50\n",
"action_space = 8\n",
"\n",
"contin = 0\n",
"\n",
"if not contin:\n",
"    tf.reset_default_graph()\n",
"    agent = Agent(D, H, action_space, learning_rate)\n",
"    init = tf.global_variables_initializer()\n",
"    # with tf.Session() as sess:\n",
"    sess = tf.Session()\n",
"    sess.run(init)\n",
"    buffer = deque()\n",
"\n",
"scores = []\n",
"n_done = 0\n",
"\n",
"last_ep = 0\n",
"\n",
"# https://github.com/awjuliani/DeepRL-Agents/blob/master/Q-Network.ipynb\n",
"n_updates = 0\n",
"train_count, loss = 0, np.nan  # so the periodic print below works before the first update\n",
"for ep in range(epochs):\n",
"    if (ep + 1) % 300 == 0:\n",
"        print(np.mean(scores[-500:]), \"train_count\", train_count, \"loss\", loss)\n",
"\n",
"    s = env.reset()\n",
"    s = [s]\n",
"    for c in range(max_frames):\n",
"\n",
"        p, Q = sess.run([agent.p, agent.predict_Q], {agent.input_s: s})\n",
"\n",
"        a = np.random.choice(np.arange(8), p=p[0])  # sample an action index from the softmax over Q\n",
"        s_new, r, done = env.step(a)\n",
"        scores.append(r)\n",
"\n",
"        buffer.append([s, a, r, s_new])\n",
"\n",
"        if len(buffer) > 5000:\n",
"            buffer.popleft()  # evict the oldest transition, not the one just appended\n",
"        s = [s_new]\n",
"\n",
"        if done:\n",
"\n",
"            if len(buffer) > 2000 and c % 5 == 0:\n",
"                batch = np.vstack(buffer)\n",
"                # sample (with replacement) from the whole buffer\n",
"                batch = batch[np.random.randint(0, len(batch), size=1500)]\n",
"\n",
"                s = np.vstack(batch[:, 0])\n",
"                s_new = np.vstack(batch[:, 3])\n",
"                r = batch[:, 2]\n",
"                a = batch[:, 1]\n",
"                Q = sess.run(agent.predict_Q, {agent.input_s: s})\n",
"                Q_new = sess.run(agent.predict_Q, {agent.input_s: s_new})\n",
"                max_Q_new = np.max(Q_new, 1)\n",
"\n",
"                target_Q = r + gamma * max_Q_new\n",
"\n",
"                train_count, Q_, one_hot, loss, _ = sess.run(\n",
"                    [agent.train_count, agent.Q, agent.one_hot, agent.loss, agent.train],\n",
"                    feed_dict={agent.input_s: s,\n",
"                               agent.executed_actions: a,\n",
"                               agent.next_Q_r: target_Q})\n",
"\n",
"#                 print(one_hot.shape)\n",
"#                 print(one_hot[0, :])\n",
"#                 print(\"predict_Q\", Q[0,:])\n",
"#                 print(Q_.shape)\n",
"#                 print(Q_[0])\n",
"\n",
"            break\n"
]
},
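{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: the loop above explores by sampling actions from a softmax over the Q-values. A common alternative, not used in this notebook and added here only as a sketch, is epsilon-greedy exploration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def epsilon_greedy(q_values, epsilon=0.1):\n",
"    \"\"\"Sketch of epsilon-greedy action selection over one row of Q-values.\"\"\"\n",
"    if np.random.rand() < epsilon:\n",
"        return np.random.randint(len(q_values))  # explore: uniform random action\n",
"    return int(np.argmax(q_values))              # exploit: greedy action\n",
"\n",
"epsilon_greedy(np.array([0.1, 0.5, -0.2]))"
]
},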
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.26007631, -0.34215826, 0.26007631, ..., 0.52565402,\n",
" 0.80317271, 0.80317271])"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sum(Q * np.eye(8)[np.array(a, dtype=int)], 1)"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , 0. , 0. , ..., 0. ,\n",
" -0. , 0.26007631],\n",
" [ 0. , 0. , 0. , ..., 0. ,\n",
" -0.34215826, 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. ,\n",
" -0. , 0.26007631],\n",
" ..., \n",
" [ 0. , 0.52565402, 0. , ..., 0. ,\n",
" -0. , 0. ],\n",
" [ 0.80317271, 0. , 0. , ..., 0. ,\n",
" -0. , 0. ],\n",
" [ 0.80317271, 0. , 0. , ..., 0. ,\n",
" -0. , 0. ]])"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = Q * np.eye(8)[np.array(a, dtype=int)]\n",
"x"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt'"
]
},
"execution_count": 161,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.saver.save(sess, \"/home/ritchie46/Downloads/model_anastruct/model_bridge_4_2_moment.ckpt\")\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'G:\\\\bridge_builder\\\\model_bridge_4_2_axial\\\\model.ckpt'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.saver.save(sess, r\"G:\\\bridge_builder\\\model_bridge_4_2_axial\\\model.ckpt\")"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\n",
" [[ 1. 0. 0.]]\n",
"5\n",
"[[-0.10357568 -0.11130837 0.08943851 -0.03797921 -0.03640079 0.09984766\n",
" 0.0517213 -0.13007079]]\n",
"\r",
" -1.4"
]
}
],
"source": [
"s = env.reset()\n",
"\n",
"\n",
"\"\"\" \n",
"    → 0\n",
"    ↗ 1\n",
"    ↑ 2\n",
"    ↖ 3\n",
"    ← 4\n",
"    ↙ 5\n",
"    ↓ 6\n",
"    ↘ 7\n",
"\"\"\"\n",
"\n",
"total_r = 0\n",
"j = 0\n",
"# the list below only fixes the number of steps; a is overwritten by the greedy choice\n",
"for a in [0, 0, 1, 1, 0, 0, 0]:\n",
"    j += 1\n",
"\n",
"    print(\"\\\n\", env.state)\n",
"    a_dst = sess.run(agent.predict_Q, {agent.input_s: [s]})\n",
"    a = np.argmax(a_dst)\n",
"    # a = np.random.choice(np.arange(8), p=a_dst[0])\n",
"\n",
"    s, r, d = env.step(a)\n",
"    print(a)\n",
"    print(a_dst)\n",
"    total_r += r\n",
"\n",
"#     if d == True:\n",
"#         j = 0\n",
"#         print(env.state)\n",
"#         break\n",
"#         #env.reset()\n",
"\n",
"print(\"\\\r\", total_r, end=\"\")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.76904085, 0.296608 , 0.9456555 , 0.73741889, 0.83164985,\n",
" 0.82735085, 0.58143395, 0.64800572])"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.random(8)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1., 1., 1., 0., 0., 0., 0., 0.])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"env = Environment(5, 4)\n",
"s = env.reset()\n",
"actions = s[-8:]\n",
"actions"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 1. 1. 1. 1. 1. 0. 0. 1.]\n",
"[0, 1, 2, 3, 4, 7]\n"
]
},
{
"data": {
"text/plain": [
"array([[ 0., 0., 0., 0., 0.],\n",
" [ 0., 0., 5., 0., 0.],\n",
" [ 0., 4., 3., 0., 0.],\n",
" [ 1., 2., 0., 0., 0.]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = env.step(0)[0]\n",
"s = env.step(1)[0]\n",
"s = env.step(4)[0]\n",
"s = env.step(1)[0]\n",
"actions = s[-8:]\n",
"print(actions)\n",
"print(env.valid_actions)\n",
"env.state"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"noise = np.random.random(8)\n",
"noise /= noise.sum()\n",
"noise.sum()"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{5, 6, 7}"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" \n",
"    → 0\n",
"    ↗ 1\n",
"    ↑ 2\n",
"    ↖ 3\n",
"    ← 4\n",
"    ↙ 5\n",
"    ↓ 6\n",
"    ↘ 7\n",
"\"\"\"\n",
"env.no_action"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
},
"latex_envs": {
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 0
}
},
"nbformat": 4,
"nbformat_minor": 2
}