diff --git a/mlp.py b/mlp.py
old mode 100755
new mode 100644
index 978f1e2..08b173f
--- a/mlp.py
+++ b/mlp.py
@@ -19,6 +19,7 @@ def deriv_sigmoid(x):
 def tanh(x):
     ep = np.exp(x)
     en = np.exp(-x)
+    #print("ep:{}\nen:{}\n".format(ep,en))
     return (ep - en)/(ep + en)
@@ -28,20 +29,13 @@ def deriv_tanh(x):
 def relu(x):
-    ret = 0
-    #fixme should map to compare
-    if x > 0:
-        ret = x
-    elif type(x) is np.ndarray:
-        ret = np.zeros(x.shape)
-    return ret
+    return np.maximum(x, np.zeros(x.shape))
 
 def deriv_relu(x):
-    ret = 0
-    if z < 0:
-        ret = 0.01
-    else:
-        ret = 1
+    ret = x.copy()  # work on a copy so the input array is not modified in place
+    ret[ret > 0] = 1
+    ret[ret < 0] = 0
+    return ret
 
 def leaky_relu(x):
     ret = 0.01 * x
@@ -52,16 +46,56 @@ def leaky_relu(x):
         ret = np.ones(x.shape)*0.01
     return ret
 
+def softmax(x):
+    t = np.exp(x)
+    sum_t = np.sum(t, axis=0)
+    return t / sum_t
+
+def w_rand_tanh(n, l, xavier_init=True):
+    """ Initialize weights of a layer for the tanh activation function
+
+    :param n: vector of number of units per layer
+    :param l: current layer
+    :param xavier_init: if True will use Xavier initialization
+
+    """
+    if xavier_init:
+        print("tanh factor={}".format(np.sqrt(1/n[l-1])))
+        ret = np.random.randn(n[l], n[l-1]) * np.sqrt(1/n[l-1])
+    else:
+        print("tanh factor={}".format(np.sqrt(2/(n[l-1]+n[l]))))
+        ret = np.random.randn(n[l], n[l-1]) * np.sqrt(2/(n[l-1]+n[l]))
+    return ret
+
+def w_rand_relu(n, l):
+    """ Initialize weights of a layer for the ReLU activation function
+
+    :param n: vector of number of units per layer
+    :param l: current layer
+
+    """
+    print("relu factor={}".format(np.sqrt(2/n[l-1])))
+    return np.random.randn(n[l], n[l-1]) * np.sqrt(2/n[l-1])
+
+def w_rand_sigmoid(n, l):
+    print("sigmoid factor={}".format(1/(n[l-1]*n[l])))
+    return np.random.randn(n[l], n[l-1]) * (1/(n[l-1]*n[l]))
+
+def w_rand_softmax(n, l, factor=0.01):
+    print("softmax factor={}".format(factor))
+    return np.random.randn(n[l], n[l-1]) * factor
+
 class MultiLayerPerceptron(object):
 
     functions = {
-        "sigmoid": {"function": sigmoid, "derivative": deriv_sigmoid},
-        "tanh": {"function": tanh, "derivative": deriv_tanh},
-        "relu": {"function": relu, "derivative": deriv_relu},
+        "sigmoid": {"function": sigmoid, "derivative": deriv_sigmoid, "w_rand": w_rand_sigmoid, "name": "sigmoid"},
+        "tanh": {"function": tanh, "derivative": deriv_tanh, "w_rand": w_rand_tanh, "name": "tanh"},
+        "relu": {"function": relu, "derivative": deriv_relu, "w_rand": w_rand_relu, "name": "relu"},
+        "softmax": {"function": softmax, "derivative": None, "w_rand": w_rand_softmax, "name": "softmax"},
     }
 
-    def __init__(self, L=1, n=None, g=None, alpha=0.01, lambd=0):
+    def __init__(self, L=1, n=None, g=None, alpha=0.01, set_random_w=True, use_formula_w=False, w_rand_factor=1):
         """Initializes network geometry and parameters
 
         :param L: number of layers including output and excluding input. Defaut 1.
         :type L: int
@@ -71,8 +105,11 @@ class MultiLayerPerceptron(object):
             Possible names are: "sigmoid", "tanh". Default ["sigmoid"].
         :type g: list
         :param alpha: learning rate. Default 0.01.
+        :param set_random_w: if True, weights will be initialized randomly using either w_rand_factor
+            or a formula depending on the activation functions if use_formula_w is True
         """
-        w_rand_factor = 1
+        #w_rand_factor = 1
+        self._prepared = False
         self._L = L
         if n is None:
             n = [2, 1]
@@ -82,7 +119,11 @@ class MultiLayerPerceptron(object):
         else:
             g = [MultiLayerPerceptron.functions[fct] for fct in g]
         self._g = [None] + g
-        self._W = [None] + [np.random.randn(n[l+1], n[l])*w_rand_factor for l in range(L)]
+        # check if softmax multi-class classification
+        self._softmax = False
+        if g[-1]["name"] == "softmax":
+            self._softmax = True
+        self._W = [None] * (L+1)
         self._b = [None] + [np.zeros((n[l+1], 1)) for l in range(L)]
         assert(len(self._g) == len(self._W))
         assert(len(self._g) == len(self._b))
@@ -93,37 +134,123 @@ class MultiLayerPerceptron(object):
         self._Z = None
         self._m = 0
         self._alpha = alpha
-        self._lambda = lambd
+        # optimization
+        self._lambda = 0
+        self._regularization = False
+        self._momentum = False
+        self._rmsprop = False
+        self._adam = False
+        # initialise weights
+        if set_random_w:
+            self.init_random_weights(use_formula_w, w_rand_factor)
 
-    def set_all_input_examples(self, X, m=1):
+    def init_random_weights(self, use_formula=False, w_rand_factor=1):
+        """Randomly initialize the weights, using a plain factor or a per-activation formula
+
+        :param w_rand_factor: factor applied to the random weights (default 1)
+        :param use_formula: if True will use the formula corresponding to each activation function
+        """
+        if use_formula:
+            for l0 in range(self._L):
+                l = l0 + 1
+                self._W[l] = self._g[l]["w_rand"](self._n, l)
+        else:
+            if type(w_rand_factor) is list:
+                self._W = [None] + [np.random.randn(self._n[l+1], self._n[l])*w_rand_factor[l] for l in range(self._L)]
+            else:
+                self._W = [None] + [np.random.randn(self._n[l+1], self._n[l])*w_rand_factor for l in range(self._L)]
+
+
+    def use_regularization(self, lambd):
+        """Activates regularization for backpropagation
+
+        :param lambd: the lambda parameter value for regularization
+
+        """
+        self._regularization = True
+        self._lambda_regul = lambd
+
+    def use_momentum(self, beta=0.9, v_dw=0., v_db=0.):
+        """Activates momentum optimization for backpropagation
+
+        :param beta: the beta parameter value for momentum (default 0.9)
+        :param v_dw: v_dw initial value for momentum (default 0.0)
+        :param v_db: v_db initial value for momentum (default 0.0)
+
+        """
+        self._momentum = True
+        self._beta_momentum = beta
+        n = self._n
+        self._v_dw_momentum = [None] + [v_dw * np.ones((n[l+1], n[l])) for l in range(self._L)]
+        self._v_db_momentum = [None] + [v_db * np.ones((n[l+1], 1)) for l in range(self._L)]
+
+    def use_rmsprop(self, beta=0.999, s_dw=0., s_db=0., epsilon=1.0e-8):
+        """Activates RMSProp optimization for backpropagation
+
+        :param beta: the beta parameter value for RMSProp (default 0.999)
+        :param s_dw: s_dw initial value for RMSProp (default 0.0)
+        :param s_db: s_db initial value for RMSProp (default 0.0)
+        :param epsilon: epsilon value for RMSProp (default 1.0e-8)
+
+        """
+        self._rmsprop = True
+        self._beta_rmsprop = beta
+        n = self._n
+        self._s_dw_rmsprop = [None] + [s_dw * np.ones((n[l+1], n[l])) for l in range(self._L)]
+        self._s_db_rmsprop = [None] + [s_db * np.ones((n[l+1], 1)) for l in range(self._L)]
+        self._epsilon_rmsprop = epsilon
+
+    def use_adam(self, beta_m=0.9, v_dw=0., v_db=0., beta_r=0.999, s_dw=0., s_db=0., epsilon=1.0e-8):
+        """Activates Adam optimization for backpropagation
+
+        :param beta_m: the beta parameter value for momentum (default 0.9)
+        :param v_dw: v_dw initial value for momentum (default 0.0)
+        :param v_db: v_db initial value for momentum (default 0.0)
+        :param beta_r: the beta parameter value for RMSProp (default 0.999)
+        :param s_dw: s_dw initial value for RMSProp (default 0.0)
+        :param s_db: s_db initial value for RMSProp (default 0.0)
+        :param epsilon: epsilon value for RMSProp (default 1.0e-8)
+
+        """
+        self._adam = True
+        self.use_momentum(beta_m, v_dw, v_db)
+        self.use_rmsprop(beta_r, s_dw, s_db, epsilon)
+
+    def set_all_input_examples(self, X, m=None):
         """Set the input examples.
 
         :param X: matrix of dimensions (n[0], m). Accepts also a list (len m) of lists (len n[0])
         :param m: number of training examples.
         :type m: int
         """
+        if m is None:
+            m = self._m
         if type(X) is list:
             assert(len(X) == m)
             self._X = np.matrix(X).T
         else:
-            assert(X.shape == (self._n[0], self._m))
+            #print(X.shape, self._n[0], m)
+            assert(X.shape == (self._n[0], m))
             self._X = X
             self._m = m
         assert((self._m == m) or (self._m == 0))
         self._m = m
 
-    def set_all_expected_output_examples(self, Y, m=1):
+    def set_all_expected_output_examples(self, Y, m=None):
         """Set the output examples
 
         :param Y: matrix of dimensions (n[L], m). Accepts also a list (len m) of lists (len n[L])
         :param m: number of training examples.
         :type m: int
         """
+        if m is None:
+            m = self._m
         if type(Y) is list:
             assert(len(Y) == m)
             self._Y = np.matrix(Y).T
         else:
-            assert(Y.shape == (self._n[self._L], self._m))
+            #print(Y.shape, self._n[self._L], m)
+            assert(Y.shape == (self._n[self._L], m))
             self._Y = Y
         assert((self._m == m) or (self._m == 0))
         self._m = m
@@ -141,18 +268,21 @@ class MultiLayerPerceptron(object):
         self.set_all_expected_output_examples(Y, m)
 
     def prepare(self):
-        """Prepare network"""
-        assert(self._X is not None)
-        assert(self._m > 0)
-        m = self._m
-        self._A = [self._X]
-        self._A += [np.empty((self._n[l+1], m)) for l in range(self._L)]
-        self._Z = [None] + [np.empty((self._n[l+1], m)) for l in range(self._L)]
+        """Prepare network for propagation"""
+        if self._prepared == False:
+            self._prepared = True
+            assert(self._X is not None)
+            assert(self._m > 0)
+            m = self._m
+            self._A = [self._X]
+            self._A += [np.empty((self._n[l+1], m)) for l in range(self._L)]
+            self._Z = [None] + [np.empty((self._n[l+1], m)) for l in range(self._L)]
 
     def propagate(self):
         """Forward propagation
 
         :return: matrix of computed outputs (n[L], m)
+
         """
         for l0 in range(self._L):
             l = l0 + 1
@@ -160,48 +290,114 @@ class MultiLayerPerceptron(object):
             self._A[l] = self._g[l]["function"](self._Z[l])
         return self._A[self._L]
 
+    def compute_outputs(self, X=None):
+        """Compute outputs with forward propagation.
+        Note: if no input provided, then the input should have been set using
+        either `set_all_input_examples()` or `set_all_training_examples()`.
+
+        :param X: if None will use self._X
+        :return: the computed output
+
+        """
+        if X is not None:
+            if type(X) is list:
+                m = len(X)
+            else:
+                m = X.shape[1]
+            self.set_all_input_examples(X, m)
+        self.prepare()
+        self.propagate()
+        return self._A[self._L]
+
     def get_output(self):
         return self._A[self._L]
 
+    def get_expected_output(self):
+        return self._Y
+
+    def get_input(self):
+        return self._X
+
     def get_weights(self):
         return self._W[1:]
 
+    def get_bias(self):
+        return self._b[1:]
+
     def back_propagation(self, get_cost_function=False):
         """Back propagation
 
         :param get_cost_function: if True the cost function J will be computed and returned.
             J = -1/m((Y(A.T)) + (1-Y)(A.T))
+            if self._regularization is set, it adds:
+                J += lambda/(2*m)*Wnorm
         :return: the cost function if get_cost_function==True else None
+
         """
         J = None
-        l = self._L
+        L = self._L
         m = self._m
         dW = [None] + [None] * self._L
         db = [None] + [None] * self._L
         dA = [None] + [None] * self._L
-        dA[l] = -self._Y/self._A[l] + ((1-self._Y)/(1-self._A[l]))
-        if get_cost_function:
-            wnorms = 0
-            for w in self._W[1:]:
-                wnorms += np.linalg.norm(w)
-            J = -1/m * ( np.dot(self._Y, np.log(self._A[l]).T) + \
-                np.dot((1 - self._Y), np.log(1-self._A[l]).T) ) + \
-                self._lambda/(2*m) * wnorms  # regularization
+        dA[L] = -self._Y/self._A[L] + ((1-self._Y)/(1-self._A[L]))
-        #dZ = self._A[l] - self._Y
-        for l in range(self._L, 0, -1):
-            dZ = dA[l] * self._g[l]["derivative"](self._Z[l])
+        # Compute cost function
+        if get_cost_function:
+            if self._softmax:
+                # case of softmax multi-class
+                loss = -np.sum(self._Y * np.log(self._A[L]), axis=0)
+                J = 1/m * np.sum(loss)
+            else:
+                J = -1/m * np.sum(( np.dot(self._Y, np.log(self._A[L]).T) + \
+                    np.dot((1 - self._Y), np.log(1-self._A[L]).T) ), axis=1)
+            # add regularization
+            if self._regularization:
+                wnorms = 0
+                for w in self._W[1:]:
+                    wnorms += np.linalg.norm(w)
+                J += self._lambda_regul/(2*m) * wnorms
+
+        # Compute weights derivatives
+        for l in range(L, 0, -1):
+            if self._softmax and l == L:
+                # output layer for softmax multi-class
+                dZ = self._A[L] - self._Y
+            else:
+                dZ = dA[l] * self._g[l]["derivative"](self._Z[l])
             dW[l] = 1/self._m * np.dot(dZ, self._A[l-1].T)
            db[l] = 1/m * np.sum(dZ, axis=1, keepdims=True)
             dA[l-1] = np.dot(self._W[l].T, dZ)
-#            dZ = np.dot(self._W[l+1].T, dZ) * self._g[l]["derivative"](self._Z[l])
-#            dW[l] = 1/m * np.dot(dZ, self._A[l-1].T)
-#            db[l] = 1/m * np.sum(dZ, axis=1, keepdims=True)
-        for l in range(self._L, 0, -1):
-            self._W[l] = self._W[l] - self._alpha * dW[l] - \
-                (self._alpha*self._lambda/m * self._W[l])  # regularization
-            self._b[l] = self._b[l] - self._alpha * db[l]
+
+        # Update weights
+        for l in range(L, 0, -1):
+            w_factor = dW[l]
+            b_factor = db[l]
+            # add momentum
+            if self._momentum:
+                self._v_dw_momentum[l] = self._beta_momentum * self._v_dw_momentum[l] + \
+                    (1 - self._beta_momentum) * dW[l]
+                self._v_db_momentum[l] = self._beta_momentum * self._v_db_momentum[l] + \
+                    (1 - self._beta_momentum) * db[l]
+                w_factor = self._v_dw_momentum[l]
+                b_factor = self._v_db_momentum[l]
+            # add RMSProp
+            if self._rmsprop:
+                self._s_dw_rmsprop[l] = self._beta_rmsprop * self._s_dw_rmsprop[l] + \
+                    (1 - self._beta_rmsprop) * (dW[l]**2)
+                self._s_db_rmsprop[l] = self._beta_rmsprop * self._s_db_rmsprop[l] + \
+                    (1 - self._beta_rmsprop) * (db[l]**2)
+                # if Adam optimization is used this also works, as w_factor/b_factor were set in the momentum step
+                w_factor = w_factor / (np.sqrt(self._s_dw_rmsprop[l]) + self._epsilon_rmsprop)
+                b_factor = b_factor / (np.sqrt(self._s_db_rmsprop[l]) + self._epsilon_rmsprop)
+            # add regularization
+            if self._regularization:
+                self._W[l] = self._W[l] - self._alpha * w_factor - \
+                    (self._alpha*self._lambda_regul/m) * self._W[l]
+            else:
+                self._W[l] = self._W[l] - self._alpha * w_factor
+            self._b[l] = self._b[l] - self._alpha * b_factor
 
         return J
 
@@ -223,7 +419,7 @@ class MultiLayerPerceptron(object):
         for i in range(max_iter):
             J = self.back_propagation(True)
             if plot:
-                y.append(J[0][0])
+                y.append(J)
             x.append(nb_iter)
             self.propagate()
             nb_iter = i + 1
@@ -250,30 +446,3 @@ class MultiLayerPerceptron(object):
         res = self.minimize_cost(min_cost, max_iter, alpha, plot)
         return res
-
-if __name__ == "__main__":
-    mlp = MultiLayerPerceptron(L=2, n=[2, 3, 1], g=["tanh", "sigmoid"], alpha=2, lambd=0.005)
-    #mlp = MultiLayerPerceptron(L=1, n=[2, 1], g=["sigmoid"], alpha=0.1)
-
-    X = np.array([[0, 0],
-                  [0, 1],
-                  [1, 0],
-                  [1, 1]])
-
-    Y = np.array([[0],
-                  [1],
-                  [1],
-                  [0]])
-
-    res = mlp.learning(X.T, Y.T, 4, max_iter=5000, plot=True)
-    print(res)
-    print(mlp.get_output())
-    print(mlp.get_weights())
-    #mlp.set_all_training_examples(X.T, Y.T, 4)
-    #mlp.prepare()
-    #print(mlp.propagate())
-    #for i in range(100):
-    #    print(mlp.back_propagation())
-    #    mlp.propagate()
-    #print(mlp.propagate())
-
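For reference, here is a minimal usage sketch of the refactored API (not part of the patch), standing in for the XOR demo removed from the `__main__` block above. It assumes the module is importable as `mlp`; the `learning()` call mirrors the removed demo (its full signature is not shown in this diff), and the hyperparameter values are illustrative only.

import numpy as np
from mlp import MultiLayerPerceptron

# XOR training set: inputs shaped (n[0], m) = (2, 4), labels shaped (n[L], m) = (1, 4).
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
Y = np.array([[0], [1], [1], [0]]).T

# Two-layer network; with use_formula_w=True the weights are drawn by the
# per-activation w_rand_* helpers instead of a flat w_rand_factor.
nn = MultiLayerPerceptron(L=2, n=[2, 3, 1], g=["tanh", "sigmoid"],
                          alpha=0.1, use_formula_w=True)

# Regularization is now opt-in instead of a constructor argument (lambd), and
# use_adam() wires together the momentum and RMSProp updates applied in back_propagation().
nn.use_regularization(lambd=0.005)
nn.use_adam()

res = nn.learning(X, Y, 4, max_iter=5000)  # as in the removed demo: learning(X, Y, m, ...)
print(res)
print(nn.compute_outputs(X))               # forward pass on the training inputs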