diff --git a/functions.py b/functions.py
index 68d7389..e6aa749 100644
--- a/functions.py
+++ b/functions.py
@@ -49,18 +49,18 @@ class CrossEntropy:
         return Softmax.activation(z)
 
     @staticmethod
-    def delta(y_hat, y):
+    def delta(y_true, y):
         """
         http://cs231n.github.io/linear-classify/#softmax
         https://stackoverflow.com/questions/27089932/cross-entropy-softmax-and-the-derivative-term-in-backpropagation
 
-        :param y_hat: (array) One hot encoded truth vector.
+        :param y_true: (array) One hot encoded truth vector.
         :param y: (array) Prediction vector.
         :return: (array) Delta vector.
 
         y are softmax probabilitys
-        y_hat is truth vector one hot encoded
+        y_true is truth vector one hot encoded
 
-        y       y_hat
+        y       y_true
         [0.8]   [1]
         [0.1]   [0]
         [0.1]   [0]
@@ -72,22 +72,22 @@ class CrossEntropy:
         [0.1]
         """
-        return y - y_hat
+        return y - y_true
 
     @staticmethod
-    def loss(y_hat, y):
+    def loss(y_true, y):
         """
         https://datascience.stackexchange.com/questions/9302/the-cross-entropy-error-function-in-neural-networks
 
-        :param y_hat: (array) One hot encoded truth vector.
+        :param y_true: (array) One hot encoded truth vector.
         :param y: (array) Prediction vector
         :return: (flt)
         """
-        return -np.dot(y_hat, np.log(y))
+        return -np.dot(y_true, np.log(y))
 
 
 class MSE:
-    def __int__(self, activation_fn=None):
+    def __init__(self, activation_fn=None):
         """
         :param activation_fn: Class object of the activation function.
@@ -101,20 +101,24 @@ class MSE:
         return self.activation_fn.activation(z)
 
     @staticmethod
-    def loss(y_hat, y):
+    def loss(y_true, y_pred):
         """
-        :param y_hat: (array) One hot encoded truth vector.
-        :param y: (array) Prediction vector
+        :param y_true: (array) One hot encoded truth vector.
+        :param y_pred: (array) Prediction vector
         :return: (flt)
         """
-        return np.mean((y - y_hat)**2)
+        return np.mean((y_pred - y_true)**2)
 
     @staticmethod
-    def prime(y_hat, y):
-        return y - y_hat
+    def prime(y_true, y_pred):
+        return y_pred - y_true
 
-    def delta(self, y_hat, y):
-        self.prime(y_hat, y) * self.activation_fn.prime(y)
+    def delta(self, y_true, y_pred):
+        """
+        Back propagation error delta
+        :return: (array)
+        """
+        return self.prime(y_true, y_pred) * self.activation_fn.prime(y_pred)
 
 
 class NoActivation:
@@ -157,6 +161,8 @@ class Network:
             activations = ( Relu, Sigmoid)
         """
         self.n_layers = len(dimensions)
+        self.loss = None
+        self.learning_rate = None
         # Weights and biases are initiated by index. For a one hidden layer net you will have a w[1] and w[2]
         self.w = {}
         self.b = {}
@@ -189,9 +195,76 @@ class Network:
         return z, a
 
+    def back_prop(self, z, a, y_true):
+        """
+        The input dicts keys represent the layers of the net.
+
+        a = { 1: x,
+              2: f(w1(x) + b1)
+              3: f(w2(a2) + b2)
+              }
+
+        :param z: (dict) w(x) + b
+        :param a: (dict) f(z)
+        :param y_true: (array) One hot encoded truth vector.
+        :return:
+        """
+
+        # Determine partial derivative and delta for the output layer.
+        # delta output layer
+        delta = self.loss.delta(a[self.n_layers], y_true)
+        dw = np.dot(a[self.n_layers - 1].T, delta)
+        # update weights and biases
+        self.update_w_b(self.n_layers - 1, dw, delta)
+
+        # In case of three layer net will iterate over i = 2 and i = 1
+        # Determine partial derivative and delta for the rest of the layers.
+        # Each iteration requires the delta from the previous layer, propagating backwards.
+        for i in reversed(range(2, self.n_layers)):
+            delta = np.dot(delta, self.w[i].T) * self.activations[i].prime(z[i])
+            dw = np.dot(a[i - 1].T, delta)
+            self.update_w_b(i - 1, dw, delta)
+
+    def update_w_b(self, index, dw, delta):
+        """
+        Update weights and biases.
+
+        :param index: (int) Number of the layer
+        :param dw: (array) Partial derivatives
+        :param delta: (array) Delta error.
+        """
+        self.w[index] -= self.learning_rate * np.mean(dw, 1)
+        self.b[index] -= self.learning_rate * np.mean(np.mean(delta, 1), 0)
+
+    def fit(self, x, y_true, loss, epochs, batch_size, learning_rate=1e-3):
+        """
+        :param loss: Loss class (MSE, CrossEntropy etc.)
+        """
+        if not x.shape[0] == y_true.shape[0]:
+            raise ValueError("Length of x and y arrays don't match")
+        # Initiate the loss object with the final activation function
+        self.loss = loss(self.activations[self.n_layers])
+        self.learning_rate = learning_rate
+
+        for i in range(epochs):
+            # Shuffle the data
+            seed = np.arange(x.shape[0])
+            np.random.shuffle(seed)
+            x_ = x[seed]
+            y_ = y_true[seed]
+
+            for j in range(x.shape[0] // batch_size):
+                k = j * batch_size
+                l = (j + 1) * batch_size
+                z, a = self.feed_forward(x_[k:l])
+                self.back_prop(z, a, y_[k:l])
+
+            if (i + 1) % epochs // 10 == 0:
+                print("Loss:", self.loss.loss(y_true, z[self.n_layers]))
+
 
 if __name__ == "__main__":
     from sklearn import datasets
-    import sklearn.metrics
+    #import sklearn.metrics
 
     # Load data
     data = datasets.load_iris()
@@ -202,5 +275,6 @@ if __name__ == "__main__":
     # one hot encoding
     y = np.eye(3)[y]
 
-    nn = Network((4, 2, 2, 1), (Relu, Relu, Sigmoid))
-    nn.feed_forward(x[:1])
\ No newline at end of file
+    nn = Network((4, 8, 3), (Relu, Sigmoid))
+
+    nn.fit(x, y, MSE, 1000, batch_size=16)
\ No newline at end of file
diff --git a/simple_mlp.py b/simple_mlp.py
index b7e53fd..a1af449 100644
--- a/simple_mlp.py
+++ b/simple_mlp.py
@@ -163,7 +163,6 @@ class NeuralNetwork:
         """
 
         for i in range(epochs):
-            # Shuffle the data
            seed = np.arange(x.shape[0])
            np.random.shuffle(seed)
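For reference only (not part of the patch above): a minimal, standalone numpy sketch of the forward/backward pass that the new back_prop and update_w_b methods perform, assuming the (4, 8, 3) ReLU/Sigmoid/MSE setup used in the diff's __main__ block. The helper names (sigmoid, sigmoid_prime) and the per-batch gradient averaging are illustrative simplifications, not code from functions.py.

import numpy as np

# Illustrative stand-ins for the Sigmoid activation class; names are hypothetical.
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    s = sigmoid(z)
    return s * (1.0 - s)

rng = np.random.default_rng(0)
x = rng.normal(size=(16, 4))                      # mini-batch: 16 samples, 4 features
y_true = np.eye(3)[rng.integers(0, 3, size=16)]   # one hot encoded targets

# One hidden layer, 4 -> 8 -> 3, matching Network((4, 8, 3), (Relu, Sigmoid)).
w1, b1 = rng.normal(scale=0.1, size=(4, 8)), np.zeros(8)
w2, b2 = rng.normal(scale=0.1, size=(8, 3)), np.zeros(3)
lr = 1e-3

# Forward pass: z = w(x) + b, a = f(z), as in feed_forward.
z2 = x @ w1 + b1
a2 = np.maximum(z2, 0)          # ReLU hidden activation
z3 = a2 @ w2 + b2
a3 = sigmoid(z3)                # sigmoid output activation

# Output-layer delta for MSE: (prediction - truth) * f'(z), as in MSE.delta.
delta3 = (a3 - y_true) * sigmoid_prime(z3)
dw2 = a2.T @ delta3             # partial derivatives w.r.t. w2

# Hidden-layer delta propagated backwards: (delta . w.T) * f'(z), as in back_prop's loop.
delta2 = (delta3 @ w2.T) * (z2 > 0)   # ReLU derivative as a mask
dw1 = x.T @ delta2

# Gradient step averaged over the mini-batch (simplified relative to update_w_b).
n = x.shape[0]
w2 -= lr * dw2 / n
b2 -= lr * delta3.mean(axis=0)
w1 -= lr * dw1 / n
b1 -= lr * delta2.mean(axis=0)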