add softmax, RMSProp, momentum, Adam. Fix ReLU. Enhance weights init.

This commit is contained in:
dadel 2018-01-15 22:59:36 +01:00
parent 9495c7db09
commit fee1994adb
1 changed files with 246 additions and 77 deletions

323
mlp.py Executable file → Normal file
View File

@ -19,6 +19,7 @@ def deriv_sigmoid(x):
def tanh(x): def tanh(x):
ep = np.exp(x) ep = np.exp(x)
en = np.exp(-x) en = np.exp(-x)
#print("ep:{}\nen:{}\n".format(ep,en))
return (ep - en)/(ep + en) return (ep - en)/(ep + en)
@ -28,20 +29,13 @@ def deriv_tanh(x):
def relu(x): def relu(x):
ret = 0 return np.maximum(x, np.zeros(x.shape))
#fixme should map to compare
if x > 0:
ret = x
elif type(x) is np.ndarray:
ret = np.zeros(x.shape)
return ret
def deriv_relu(x): def deriv_relu(x):
ret = 0 ret = x
if z < 0: ret[ret > 0] = 1
ret = 0.01 ret[ret < 0] = 0
else: return ret
ret = 1
def leaky_relu(x): def leaky_relu(x):
ret = 0.01 * x ret = 0.01 * x
@ -52,16 +46,56 @@ def leaky_relu(x):
ret = np.ones(x.shape)*0.01 ret = np.ones(x.shape)*0.01
return ret return ret
def softmax(x):
t = np.exp(x)
sum_t = np.sum(t, axis=0)
return t / sum_t
def w_rand_tanh(n, l, xavier_init=True):
""" Initialize weights of a layer for tanH activation function
:param n: vector of number of units per layer
:param l: current layer
:param xavier_init: if True will use Xavier initialization
"""
if xavier_init:
print("tanh factor={}".format(np.sqrt(1/n[l-1])))
ret = np.random.randn(n[l], n[l-1]) * np.sqrt(1/n[l-1])
else:
print("tanh factor={}".format(np.sqrt(2/(n[l-1]+n[l]))))
ret = np.random.randn(n[l], n[l-1]) * np.sqrt(2/(n[l-1]+n[l]))
return ret
def w_rand_relu(n, l):
""" Initialize weights of a layer for tanH activation function
:param n: vector of number of units per layer
:param l: current layer
"""
print("relu factor={}".format(np.sqrt(2/n[l-1])))
return np.random.randn(n[l], n[l-1]) * np.sqrt(2/n[l-1])
def w_rand_sigmoid(n, l):
print("sigmoid factor={}".format(1/(n[l-1]*n[l])))
return np.random.randn(n[l], n[l-1]) * (1/(n[l-1]*n[l]))
def w_rand_softmax(n, l, factor=0.01):
print("softmax factor={}".format(factor))
return np.random.randn(n[l], n[l-1]) * factor
class MultiLayerPerceptron(object): class MultiLayerPerceptron(object):
functions = { functions = {
"sigmoid": {"function": sigmoid, "derivative": deriv_sigmoid}, "sigmoid": {"function": sigmoid, "derivative": deriv_sigmoid, "w_rand": w_rand_sigmoid, "name": "sigmoid"},
"tanh": {"function": tanh, "derivative": deriv_tanh}, "tanh": {"function": tanh, "derivative": deriv_tanh, "w_rand": w_rand_tanh, "name": "tanh"},
"relu": {"function": relu, "derivative": deriv_relu}, "relu": {"function": relu, "derivative": deriv_relu, "w_rand": w_rand_relu, "name": "relu"},
"softmax": {"function": softmax, "derivative": None, "w_rand": w_rand_softmax, "name": "softmax"},
} }
def __init__(self, L=1, n=None, g=None, alpha=0.01, lambd=0): def __init__(self, L=1, n=None, g=None, alpha=0.01, set_random_w=True, use_formula_w=False, w_rand_factor=1):
"""Initializes network geometry and parameters """Initializes network geometry and parameters
:param L: number of layers including output and excluding input. Defaut 1. :param L: number of layers including output and excluding input. Defaut 1.
:type L: int :type L: int
@ -71,8 +105,11 @@ class MultiLayerPerceptron(object):
Possible names are: "sigmoid", "tanh". Default ["sigmoid"]. Possible names are: "sigmoid", "tanh". Default ["sigmoid"].
:type g: list :type g: list
:param alpha: learning rate. Default 0.01. :param alpha: learning rate. Default 0.01.
:param set_random_w: if True will initialize randomly weights using either w_rand_factor
or a formula depending on activation functions if use_formula_w is True
""" """
w_rand_factor = 1 #w_rand_factor = 1
self._prepared = False
self._L = L self._L = L
if n is None: if n is None:
n = [2, 1] n = [2, 1]
@ -82,7 +119,11 @@ class MultiLayerPerceptron(object):
else: else:
g = [MultiLayerPerceptron.functions[fct] for fct in g] g = [MultiLayerPerceptron.functions[fct] for fct in g]
self._g = [None] + g self._g = [None] + g
self._W = [None] + [np.random.randn(n[l+1], n[l])*w_rand_factor for l in range(L)] # check if softmax multi-class classification
self._softmax = False
if g[-1]["name"] == "softmax":
self._softmax = True
self._W = [None] * (L+1)
self._b = [None] + [np.zeros((n[l+1], 1)) for l in range(L)] self._b = [None] + [np.zeros((n[l+1], 1)) for l in range(L)]
assert(len(self._g) == len(self._W)) assert(len(self._g) == len(self._W))
assert(len(self._g) == len(self._b)) assert(len(self._g) == len(self._b))
@ -93,37 +134,123 @@ class MultiLayerPerceptron(object):
self._Z = None self._Z = None
self._m = 0 self._m = 0
self._alpha = alpha self._alpha = alpha
self._lambda = lambd # optimization
self._lambda = 0
self._regularization = False
self._momentum = False
self._rmsprop = False
self._adam = False
# initialise weights
if set_random_w:
self.init_random_weights(use_formula_w, w_rand_factor)
def set_all_input_examples(self, X, m=1): def init_random_weights(self, use_formula=False, w_rand_factor=1):
"""Initialize randomly weights using a factor or using some formula
:param w_rand_factor: factorize random weights with this (default 1)
:param use_formula: if True will use formules corresponding to the activation functions
"""
if use_formula:
for l0 in range(self._L):
l = l0 + 1
self._W[l] = self._g[l]["w_rand"](self._n, l)
else:
if type(w_rand_factor) is list:
self._W = [None] + [np.random.randn(self._n[l+1], self._n[l])*w_rand_factor[l] for l in range(self._L)]
else:
self._W = [None] + [np.random.randn(self._n[l+1], self._n[l])*w_rand_factor for l in range(self._L)]
def use_regularization(self, lambd):
"""Activates regularization for backpropagation
:param lambd: the lambda parameter value for regularization
"""
self._regularization = True
self._lambda_regul = lambd
def use_momentum(self, beta=0.9, v_dw=0., v_db=0.):
"""Activates momentum optimization for backpropagation
:param beta: the beta parameter value for momentum (default 0.9)
:param v_dw: v_dw initial value for momentum (default 0.0)
:param v_db: v_db initial value for momentum (default 0.0)
"""
self._momentum = True
self._beta_momentum = beta
n = self._n
self._v_dw_momentum = [None] + [v_dw * np.ones((n[l+1], n[l])) for l in range(self._L)]
self._v_db_momentum = [None] + [v_db * np.ones((n[l+1], 1)) for l in range(self._L)]
def use_rmsprop(self, beta=0.999, s_dw=0., s_db=0., epsilon=1.0e-8):
"""Activates RMSProp optimization for backpropagation
:param beta: the beta parameter value for RMSProp (default 0.999)
:param s_dw: s_dw initial value for RMSProp (default 0.0)
:param s_db: s_db initial value for RMSProp (default 0.0)
:param epsilon: epsilon value for RMSProp (default 1.0e-8)
"""
self._rmsprop = True
self._beta_rmsprop = beta
n = self._n
self._s_dw_rmsprop = [None] + [s_dw * np.ones((n[l+1], n[l])) for l in range(self._L)]
self._s_db_rmsprop = [None] + [s_db * np.ones((n[l+1], 1)) for l in range(self._L)]
self._epsilon_rmsprop = epsilon
def use_adam(self, beta_m=0.9, v_dw=0., v_db=0., beta_r=0.999, s_dw=0., s_db=0., epsilon=1.0e-8):
"""Activates Adam optimization for backpropagation
:param beta_m: the beta parameter value for momentum (default 0.9)
:param v_dw: v_dw initial value for momentum (default 0.0)
:param v_db: v_db initial value for momentum (default 0.0)
:param beta_r: the beta parameter value for RMSProp (default 0.999)
:param s_dw: s_dw initial value for RMSProp (default 0.0)
:param s_db: s_db initial value for RMSProp (default 0.0)
:param epsilon: epsilon value for RMSProp (default 1.0e-8)
"""
self._adam = True
self.use_momentum(beta_m, v_dw, v_db)
self.use_rmsprop(beta_r, s_dw, s_db, epsilon)
def set_all_input_examples(self, X, m=None):
"""Set the input examples. """Set the input examples.
:param X: matrix of dimensions (n[0], m). Accepts also a list (len m) of lists (len n[0]) :param X: matrix of dimensions (n[0], m). Accepts also a list (len m) of lists (len n[0])
:param m: number of training examples. :param m: number of training examples.
:type m: int :type m: int
""" """
if m is None:
m = self._m
if type(X) is list: if type(X) is list:
assert(len(X) == m) assert(len(X) == m)
self._X = np.matrix(X).T self._X = np.matrix(X).T
else: else:
assert(X.shape == (self._n[0], self._m)) #print(X.shape, self._n[0], m)
assert(X.shape == (self._n[0], m))
self._X = X self._X = X
self._m = m self._m = m
assert((self._m == m) or (self._m == 0)) assert((self._m == m) or (self._m == 0))
self._m = m self._m = m
def set_all_expected_output_examples(self, Y, m=1): def set_all_expected_output_examples(self, Y, m=None):
"""Set the output examples """Set the output examples
:param Y: matrix of dimensions (n[L], m). Accepts also a list (len m) of lists (len n[L]) :param Y: matrix of dimensions (n[L], m). Accepts also a list (len m) of lists (len n[L])
:param m: number of training examples. :param m: number of training examples.
:type m: int :type m: int
""" """
if m is None:
m = self._m
if type(Y) is list: if type(Y) is list:
assert(len(Y) == m) assert(len(Y) == m)
self._Y = np.matrix(Y).T self._Y = np.matrix(Y).T
else: else:
assert(Y.shape == (self._n[self._L], self._m)) #print(Y.shape, self._n[self._L], m)
assert(Y.shape == (self._n[self._L], m))
self._Y = Y self._Y = Y
assert((self._m == m) or (self._m == 0)) assert((self._m == m) or (self._m == 0))
self._m = m self._m = m
@ -141,18 +268,21 @@ class MultiLayerPerceptron(object):
self.set_all_expected_output_examples(Y, m) self.set_all_expected_output_examples(Y, m)
def prepare(self): def prepare(self):
"""Prepare network""" """Prepare network for propagation"""
assert(self._X is not None) if self._prepared == False:
assert(self._m > 0) self._prepared = True
m = self._m assert(self._X is not None)
self._A = [self._X] assert(self._m > 0)
self._A += [np.empty((self._n[l+1], m)) for l in range(self._L)] m = self._m
self._Z = [None] + [np.empty((self._n[l+1], m)) for l in range(self._L)] self._A = [self._X]
self._A += [np.empty((self._n[l+1], m)) for l in range(self._L)]
self._Z = [None] + [np.empty((self._n[l+1], m)) for l in range(self._L)]
def propagate(self): def propagate(self):
"""Forward propagation """Forward propagation
:return: matrix of computed outputs (n[L], m) :return: matrix of computed outputs (n[L], m)
""" """
for l0 in range(self._L): for l0 in range(self._L):
l = l0 + 1 l = l0 + 1
@ -160,48 +290,114 @@ class MultiLayerPerceptron(object):
self._A[l] = self._g[l]["function"](self._Z[l]) self._A[l] = self._g[l]["function"](self._Z[l])
return self._A[self._L] return self._A[self._L]
def compute_outputs(self, X=None):
"""Compute outputs with forward propagation.
Note: if no input provided, then the input should have been set using
either `set_all_input_examples()` or `set_all_training_examples()`.
:param X: if None will use self._X
:return: the computed output
"""
if X is not None:
if type(X) is list:
m = len(X)
else:
m = X.shape[1]
self.set_all_input_examples(X, m)
self.prepare()
self.propagate()
return self._A[self._L]
def get_output(self): def get_output(self):
return self._A[self._L] return self._A[self._L]
def get_expected_output(self):
return self._Y
def get_input(self):
return self._X
def get_weights(self): def get_weights(self):
return self._W[1:] return self._W[1:]
def get_bias(self):
return self._b[1:]
def back_propagation(self, get_cost_function=False): def back_propagation(self, get_cost_function=False):
"""Back propagation """Back propagation
:param get_cost_function: if True the cost function J :param get_cost_function: if True the cost function J
will be computed and returned. will be computed and returned.
J = -1/m((Y(A.T)) + (1-Y)(A.T)) J = -1/m((Y(A.T)) + (1-Y)(A.T))
if self._regularization will add:
J += lamda/(2*m)*Wnorm
:return: the cost function if get_cost_function==True else None :return: the cost function if get_cost_function==True else None
""" """
J = None J = None
l = self._L L = self._L
m = self._m m = self._m
dW = [None] + [None] * self._L dW = [None] + [None] * self._L
db = [None] + [None] * self._L db = [None] + [None] * self._L
dA = [None] + [None] * self._L dA = [None] + [None] * self._L
dA[l] = -self._Y/self._A[l] + ((1-self._Y)/(1-self._A[l])) dA[L] = -self._Y/self._A[L] + ((1-self._Y)/(1-self._A[L]))
if get_cost_function:
wnorms = 0
for w in self._W[1:]:
wnorms += np.linalg.norm(w)
J = -1/m * ( np.dot(self._Y, np.log(self._A[l]).T) + \
np.dot((1 - self._Y), np.log(1-self._A[l]).T) ) + \
self._lambda/(2*m) * wnorms # regularization
#dZ = self._A[l] - self._Y # Compute cost function
for l in range(self._L, 0, -1): if get_cost_function:
dZ = dA[l] * self._g[l]["derivative"](self._Z[l]) if self._softmax:
# case of softmax multi-class
loss = -np.sum(self._Y * np.log(self._A[L]), axis=0)
J = 1/m * np.sum(loss)
else:
J = -1/m * np.sum(( np.dot(self._Y, np.log(self._A[L]).T) + \
np.dot((1 - self._Y), np.log(1-self._A[L]).T) ), axis=1)
# add regularization
if self._regularization:
wnorms = 0
for w in self._W[1:]:
wnorms += np.linalg.norm(w)
J += self._lambda_regul/(2*m) * wnorms
# Compute weights derivatives
for l in range(L, 0, -1):
if self._softmax and l == L:
# output layer for softmax multi-class
dZ = self._A[L] - self._Y
else:
dZ = dA[l] * self._g[l]["derivative"](self._Z[l])
dW[l] = 1/self._m * np.dot(dZ, self._A[l-1].T) dW[l] = 1/self._m * np.dot(dZ, self._A[l-1].T)
db[l] = 1/m * np.sum(dZ, axis=1, keepdims=True) db[l] = 1/m * np.sum(dZ, axis=1, keepdims=True)
dA[l-1] = np.dot(self._W[l].T, dZ) dA[l-1] = np.dot(self._W[l].T, dZ)
# dZ = np.dot(self._W[l+1].T, dZ) * self._g[l]["derivative"](self._Z[l])
# dW[l] = 1/m * np.dot(dZ, self._A[l-1].T) # Update weights
# db[l] = 1/m * np.sum(dZ, axis=1, keepdims=True) for l in range(L, 0, -1):
for l in range(self._L, 0, -1): w_factor = dW[l]
self._W[l] = self._W[l] - self._alpha * dW[l] - \ b_factor = db[l]
(self._alpha*self._lambda/m * self._W[l]) # regularization # add momentum
self._b[l] = self._b[l] - self._alpha * db[l] if self._momentum:
self._v_dw_momentum[l] = self._beta_momentum * self._v_dw_momentum[l] + \
(1 - self._beta_momentum) * dW[l]
self._v_db_momentum[l] = self._beta_momentum * self._v_db_momentum[l] + \
(1 - self._beta_momentum) * db[l]
w_factor = self._v_dw_momentum[l]
b_factor = self._v_db_momentum[l]
# add RMSProp
if self._rmsprop:
self._s_dw_rmsprop[l] = self._beta_rmsprop * self._s_dw_rmsprop[l] + \
(1 - self._beta_rmsprop) * (dW[l]**2)
self._s_db_rmsprop[l] = self._beta_rmsprop * self._s_db_rmsprop[l] + \
(1 - self._beta_rmsprop) * (db[l]**2)
# if adam optimization is use the formula will work as w/b_factor are set in momentum
w_factor = w_factor / (np.sqrt(self._s_dw_rmsprop[l]) + self._epsilon_rmsprop)
b_factor = b_factor / (np.sqrt(self._s_db_rmsprop[l]) + self._epsilon_rmsprop)
# add regularization
if self._regularization:
self._W[l] = self._W[l] - self._alpha * w_factor - \
(self._alpha*self._lambda_regul/m) * self._W[l]
else:
self._W[l] = self._W[l] - self._alpha * w_factor
self._b[l] = self._b[l] - self._alpha * b_factor
return J return J
@ -223,7 +419,7 @@ class MultiLayerPerceptron(object):
for i in range(max_iter): for i in range(max_iter):
J = self.back_propagation(True) J = self.back_propagation(True)
if plot: if plot:
y.append(J[0][0]) y.append(J)
x.append(nb_iter) x.append(nb_iter)
self.propagate() self.propagate()
nb_iter = i + 1 nb_iter = i + 1
@ -250,30 +446,3 @@ class MultiLayerPerceptron(object):
res = self.minimize_cost(min_cost, max_iter, alpha, plot) res = self.minimize_cost(min_cost, max_iter, alpha, plot)
return res return res
if __name__ == "__main__":
mlp = MultiLayerPerceptron(L=2, n=[2, 3, 1], g=["tanh", "sigmoid"], alpha=2, lambd=0.005)
#mlp = MultiLayerPerceptron(L=1, n=[2, 1], g=["sigmoid"], alpha=0.1)
X = np.array([[0, 0],
[0, 1],
[1, 0],
[1, 1]])
Y = np.array([[0],
[1],
[1],
[0]])
res = mlp.learning(X.T, Y.T, 4, max_iter=5000, plot=True)
print(res)
print(mlp.get_output())
print(mlp.get_weights())
#mlp.set_all_training_examples(X.T, Y.T, 4)
#mlp.prepare()
#print(mlp.propagate())
#for i in range(100):
# print(mlp.back_propagation())
# mlp.propagate()
#print(mlp.propagate())