
Vanishing Gradients

Here we implement a neural network using only numpy, rather than a framework like PyTorch. Doing it by hand makes it much easier to understand how a neural network works and where vanishing gradients come from.

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2
# generate random data -- not linearly separable
np.random.seed(0)
N = 100 # number of points per class
D = 2   # dimensionality
K = 3   # number of classes
X = np.zeros((N*K, D))
num_train_examples = X.shape[0]
y = np.zeros(N*K, dtype='uint8') # class labels as unsigned integers
for j in range(K):
    ix = range(N*j, N*(j+1))
    r = np.linspace(0.0, 1, N) # radius, evenly spaced numbers
    t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.2 # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)] # stack into a 2-column matrix
    y[ix] = j
fig = plt.figure()
plt.scatter(X[:,0], X[:,1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])

[Figure: scatter plot of the three-class spiral dataset]

The sigmoid function outputs values between 0 and 1. When the absolute value of the input is large, the output saturates toward 0 or 1 and the curve becomes very flat, so the corresponding gradient approaches 0. This is the so-called vanishing gradient phenomenon. Vanishing gradients prevent the network from learning effectively, because at each iteration the update applied to the weights W is almost zero.

ReLU, on the other hand, does not become insensitive in this way when its input grows large.
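To see the saturation numerically, here is a minimal sketch (the helper _sigmoid is introduced here just for this illustration; it is the same standard sigmoid as the sigmoid function defined below):

import numpy as np

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# the derivative of sigmoid, written in terms of its output s: s * (1 - s)
for x in [0.0, 2.0, 5.0, 10.0]:
    s = _sigmoid(x)
    print('x={:5.1f}  sigmoid={:.6f}  gradient={:.6f}'.format(x, s, s * (1 - s)))
# the gradient falls from 0.25 at x=0 to roughly 0.000045 at x=10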

def sigmoid(x):
    x = 1/(1+np.exp(-x))
    return x

def sigmoid_grad(x): # derivative of sigmoid, expressed in terms of its output x = sigmoid(z)
    return (x)*(1-x)

def relu(x):
    return np.maximum(0,x)
# plot the 2 functions
x = np.linspace(-3,3,100)
y_sigmoid = sigmoid(x)
y_relu = relu(x)

plt.plot(x, y_sigmoid, 'b-', x, y_relu, 'r-')

[Figure: sigmoid (blue) and ReLU (red) plotted over x in [-3, 3]]

Next, let's look at how the two different nonlinearities (sigmoid and ReLU) affect the network during training. Below we build a simple 3-layer neural network (2 hidden layers); by training it once with sigmoid and once with ReLU we can compare how the two behave during training.

# function to train a 3-layer neural net with either relu or sigmoid nonlinearity via vanilla gradient descent
def three_layer_net(NONLINEARITY, X, y, model, step_size, reg):
    # param init
    h = model['h']
    h2 = model['h2']
    W1 = model['W1']
    W2 = model['W2']
    W3 = model['W3']
    b1 = model['b1']
    b2 = model['b2']
    b3 = model['b3']

    # some hyperparameters (step_size and reg are passed in as arguments)

    # gradient descent loop
    num_examples = X.shape[0]
    plot_array_1 = []
    plot_array_2 = []
    for i in range(50000):
        # forward prop
        # X has shape [N, M]: N training examples, each with M features
        if NONLINEARITY == 'RELU':
            # from X to hidden layer 1: X=[N*2], W1=[2*50] --> hidden layer=[N*50]
            hidden_layer = relu(np.dot(X, W1) + b1)
            # from hidden layer 1 to hidden layer 2: hidden1=[N*50], W2=[50*50] --> hidden2=[N*50]
            hidden_layer2 = relu(np.dot(hidden_layer, W2) + b2)
            # from hidden layer 2 to the output layer: hidden2=[N*50], W3=[50*3] --> scores=[N*3]
            scores = np.dot(hidden_layer2, W3) + b3

        elif NONLINEARITY == 'SIGM':
            hidden_layer = sigmoid(np.dot(X, W1) + b1)              # from X to hidden layer 1
            hidden_layer2 = sigmoid(np.dot(hidden_layer, W2) + b2)  # from hidden layer 1 to hidden layer 2
            scores = np.dot(hidden_layer2, W3) + b3

        exp_scores = np.exp(scores)  # [N*K], K=3
        # keepdims=True keeps the summed axis as size 1 so the division broadcasts row-wise
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # normalize each row

        # compute the loss: average cross-entropy loss and regularization
        corect_logprobs = -np.log(probs[range(num_examples), y])  # y takes values 0, 1, 2; result has shape [N]
        data_loss = np.sum(corect_logprobs)/num_examples
        # regularization term
        reg_loss = 0.5*reg*np.sum(W1*W1) + 0.5*reg*np.sum(W2*W2) + 0.5*reg*np.sum(W3*W3)
        loss = data_loss + reg_loss
        # if i % 1000 == 0:
        #     print("iteration {0}: loss {1}".format(i, loss))

        # compute the gradient on scores
        dscores = probs  # -- [N*K]
        dscores[range(num_examples), y] -= 1  # error value -- [N*K]
        dscores /= num_examples

        # backprop here
        # from the output layer back to hidden layer 2: this layer has no nonlinearity,
        # i.e. s = hidden2*W3, ds/dW3 = hidden2 (local gradient), so the backpropagated
        # error is simply error * local gradient, i.e. hidden2.
        dW3 = (hidden_layer2.T).dot(dscores)  # [50*N] * [N*3] = [50*3]
        db3 = np.sum(dscores, axis=0, keepdims=True)  # [1*3]

        if NONLINEARITY == 'RELU':
            # backprop the ReLU nonlinearity here
            dhidden2 = np.dot(dscores, W3.T)
            dhidden2[hidden_layer2 <= 0] = 0
            dW2 = np.dot(hidden_layer.T, dhidden2)
            plot_array_2.append(np.sum(np.abs(dW2))/np.sum(np.abs(dW2.shape)))
            db2 = np.sum(dhidden2, axis=0)
            dhidden = np.dot(dhidden2, W2.T)
            dhidden[hidden_layer <= 0] = 0

        elif NONLINEARITY == 'SIGM':
            # backprop the sigmoid nonlinearity here
            # hidden layer 2 is the output of a sigmoid, so multiply by its local gradient
            dhidden2 = dscores.dot(W3.T)*sigmoid_grad(hidden_layer2)
            dW2 = (hidden_layer.T).dot(dhidden2)
            plot_array_2.append(np.sum(np.abs(dW2))/np.sum(np.abs(dW2.shape)))
            db2 = np.sum(dhidden2, axis=0)
            dhidden = dhidden2.dot(W2.T)*sigmoid_grad(hidden_layer)

        dW1 = np.dot(X.T, dhidden)
        plot_array_1.append(np.sum(np.abs(dW1))/np.sum(np.abs(dW1.shape)))
        db1 = np.sum(dhidden, axis=0)

        # add regularization
        dW3 += reg * W3
        dW2 += reg * W2
        dW1 += reg * W1

        # option to return loss, grads
        # grads = {}
        # grads['W1'] = dW1
        # grads['W2'] = dW2
        # grads['W3'] = dW3
        # grads['b1'] = db1
        # grads['b2'] = db2
        # grads['b3'] = db3

        # parameter update (one step of gradient descent)
        W1 += -step_size * dW1
        b1 += -step_size * db1
        W2 += -step_size * dW2
        b2 += -step_size * db2
        W3 += -step_size * dW3
        b3 += -step_size * db3

    # evaluate training set accuracy using the trained W1, W2, W3, b1, b2, b3
    if NONLINEARITY == 'RELU':
        hidden_layer = relu(np.dot(X, W1) + b1)
        hidden_layer2 = relu(np.dot(hidden_layer, W2) + b2)
    elif NONLINEARITY == 'SIGM':
        hidden_layer = sigmoid(np.dot(X, W1) + b1)
        hidden_layer2 = sigmoid(np.dot(hidden_layer, W2) + b2)

    scores = np.dot(hidden_layer2, W3) + b3
    predicted_class = np.argmax(scores, axis=1)  # index of the max score in each row
    print('training accuracy: {0}'.format(np.mean(predicted_class==y)))

    return plot_array_1, plot_array_2, W1, W2, W3, b1, b2, b3

The backpropagation above is really just an application of the chain rule.
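For the sigmoid network, writing the chain rule out explicitly gives the following (a sketch in the notation of the code: h1 = hidden_layer, h2 = hidden_layer2, sigma' = sigmoid_grad, and ⊙ is element-wise multiplication):

$$
\begin{aligned}
\frac{\partial L}{\partial W_3} &= h_2^{\top}\,\delta_3, &\qquad \delta_3 &= \texttt{dscores},\\
\frac{\partial L}{\partial W_2} &= h_1^{\top}\,\delta_2, &\qquad \delta_2 &= (\delta_3 W_3^{\top}) \odot \sigma'(h_2),\\
\frac{\partial L}{\partial W_1} &= X^{\top}\,\delta_1, &\qquad \delta_1 &= (\delta_2 W_2^{\top}) \odot \sigma'(h_1).
\end{aligned}
$$

In the ReLU branch, the sigma'(·) factors are replaced by the 0/1 masks applied via dhidden2[hidden_layer2 <= 0] = 0 and dhidden[hidden_layer <= 0] = 0.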

Train net with sigmoid nonlinearity first

# Initialize toy model, train sigmoid net 

N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
h = 50 # hidden layer 1 size
h2 = 50 # hidden layer 2 size
num_train_examples = X.shape[0]

model = {}
model['h'] = h
model['h2'] = h2
model['W1'] = 0.1 * np.random.randn(D, h)
model['b1'] = np.zeros((1,h))
model['W2'] = 0.1 * np.random.randn(h, h2)
model['b2'] = np.zeros((1,h2))
model['W3'] = 0.1 * np.random.randn(h2, K)
model['b3'] = np.zeros((1,K))

(sigm_array_1, sigm_array_2, s_W1, s_W2, s_W3, s_b1, s_b2, s_b3) = three_layer_net('SIGM', X, y, model, step_size=1e-1, reg=1e-3)

training accuracy: 0.97

Now train net with ReLU nonlinearity

# Re-initialize model, train relu net
model = {}
model['h'] = h
model['h2'] = h2
model['W1'] = 0.1 * np.random.randn(D, h)
model['b1'] = np.zeros((1,h))
model['W2'] = 0.1 * np.random.randn(h, h2)
model['b2'] = np.zeros((1,h2))
model['W3'] = 0.1 * np.random.randn(h2, K)
model['b3'] = np.zeros((1,K))

(relu_array_1, relu_array_2, r_W1, r_W2, r_W3, r_b1, r_b2, r_b3) = three_layer_net('RELU', X, y, model, step_size=1e-1, reg=1e-3)

training accuracy: 0.9933333333333333

The Vanishing Gradient Issue

We can sum the magnitudes of a hidden layer's weight gradient (dW) and use this simple metric to gauge how fast that layer is learning: the larger the sum, the faster the network is updating those weights.
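To be precise, the curves plotted below are the quantity recorded in plot_array_1 and plot_array_2 inside three_layer_net: the summed |dW| of a layer, scaled by the sum of that gradient's dimensions. Expressed as a small helper (grad_magnitude is a name introduced here only for illustration):

def grad_magnitude(dW):
    # summed absolute gradient, scaled by the sum of the array's dimensions
    # e.g. for a [2 x 50] dW this divides the total |dW| by 2 + 50 = 52
    return np.sum(np.abs(dW)) / np.sum(np.abs(dW.shape))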

plt.plot(np.array(sigm_array_1))
plt.plot(np.array(sigm_array_2))
plt.title('Sum of magnitudes of gradients -- SIGM weights')
plt.legend(("sigm first layer", "sigm second layer"))

[Figure: sum of gradient magnitudes per iteration for the sigmoid net, first vs. second layer]

The figure shows that the gradient of the second layer is significantly larger than that of the first layer. In other words, during backprop, the more hidden layers there are, the smaller the gradient (dW) of the earliest layers becomes. Intuitively this is the chain rule at work: every additional hidden layer multiplies in one more local gradient, and because the nonlinearity is sigmoid, whose local gradient is σ·(1-σ) with σ in (0, 1), each of those local gradients lies in (0, 0.25]. So as the number of hidden layers grows, the gradient reaching the earliest layers shrinks multiplicatively.
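A rough back-of-the-envelope sketch of this multiplicative shrinkage, considering only the nonlinearity's contribution (0.25 is the largest value sigmoid_grad can take, reached when the sigmoid output is 0.5; the weight matrices also scale the gradient and are ignored here):

# upper bound on the factor a chain of sigmoid local gradients lets through
for depth in [1, 2, 5, 10]:
    print('{} sigmoid layers: gradient scaled by at most {:.2e}'.format(depth, 0.25**depth))
# with 10 sigmoid layers the bound is already about 9.5e-07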

plt.plot(np.array(relu_array_1))
plt.plot(np.array(relu_array_2))
plt.title('Sum of magnitudes of gradients -- ReLU weights')
plt.legend(("relu first layer", "relu second layer"))

[Figure: sum of gradient magnitudes per iteration for the ReLU net, first vs. second layer]

As the figure shows, the ReLU network converges much faster than the sigmoid one, and stays stable after it converges.

# overlaying the 2 plots to compare
plt.plot(np.array(sigm_array_1))
plt.plot(np.array(sigm_array_2))
plt.plot(np.array(relu_array_1))
plt.plot(np.array(relu_array_2))
plt.title('Sum of magnitudes of gradients -- hidden layer neurons')
plt.legend(("sigm first layer", "sigm second layer", "relu first layer", "relu second layer"))

[Figure: all four gradient-magnitude curves overlaid, sigmoid vs. ReLU]

The overlay makes it even clearer: the ReLU network converges faster, and its gradients are larger from the very start.

Finally, let's look at how the two classifiers perform. Because the ReLU network trains faster, it does better given the same number of epochs.

# plot the classifiers -- SIGMOID
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = np.dot(sigmoid(np.dot(sigmoid(np.dot(np.c_[xx.ravel(), yy.ravel()], s_W1) + s_b1), s_W2) + s_b2), s_W3) + s_b3
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

[Figure: decision boundary learned by the sigmoid net over the spiral data]

# plot the classifiers-- RELU
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = np.dot(relu(np.dot(relu(np.dot(np.c_[xx.ravel(), yy.ravel()], r_W1) + r_b1), r_W2) + r_b2), r_W3) + r_b3
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

[Figure: decision boundary learned by the ReLU net over the spiral data]
