010-lec-001. hidden layer, deep nn, issue of backpropagation with sigmoid, vanishing gradient, sigmoid and relu

# A network with 2 hidden layers (2 inputs -> 5 -> 4 -> 1 output), sigmoid everywhere
# X: input node with 2 features (e.g. a placeholder of shape [None,2]), defined elsewhere
W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0))
W2_variable_node=tf.Variable(tf.random_uniform([5,4],-1.0,1.0))
W3_variable_node=tf.Variable(tf.random_uniform([4,1],-1.0,1.0))
b1_variable_node=tf.Variable(tf.zeros([5]),name="Bias1")
b2_variable_node=tf.Variable(tf.zeros([4]),name="Bias2")
b3_variable_node=tf.Variable(tf.zeros([1]),name="Bias3")
L2_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W2_variable_node)+b2_variable_node)
hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W3_variable_node)+b3_variable_node)
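# @
# The block above assumes that X, a label placeholder, a cost function,
# and a training step are defined elsewhere
# The following is a minimal sketch of that surrounding code,
# assuming an XOR-style dataset with 2 features and 1 binary label
# (the names x_data, y_data, Y, cost_f_node, and train_node are choices made here,
# not names given in the lecture)
import tensorflow as tf
import numpy as np

x_data=np.array([[0,0],[0,1],[1,0],[1,1]],dtype=np.float32)
y_data=np.array([[0],[1],[1],[0]],dtype=np.float32)

X=tf.placeholder(tf.float32,[None,2])
Y=tf.placeholder(tf.float32,[None,1])

# ... build W1..W3, b1..b3, and hypothesis_f_node exactly as in the block above ...

# Cross entropy cost for a sigmoid output
cost_f_node=-tf.reduce_mean(Y*tf.log(hypothesis_f_node)+(1-Y)*tf.log(1-hypothesis_f_node))
train_node=tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost_f_node)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10001):
        sess.run(train_node,feed_dict={X:x_data,Y:y_data})
    print(sess.run(hypothesis_f_node,feed_dict={X:x_data}))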
# @
# Let's build a much deeper network (11 layers of weights, W1 to W11)
# You only need to be careful about the shapes of the first and the last weight
# First layer's weight has shape [2,5]
# W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0),name='Weight1')
# Last layer's weight has shape [5,1]
# W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')
# The shapes of the hidden-layer weights between the first and the last layer
# can be whatever you want, as long as adjacent shapes match

# [n,2][?,?]=[n,5], so [?,?]=[2,5]
W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0),name='Weight1')
# [n,5][?,?]=[n,whatever you want]
W2_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight2')
W3_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight3')
W4_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight4')
W5_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight5')
W6_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight6')
W7_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight7')
W8_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight8')
W9_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight9')
W10_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight10')
# [n,5][?,?]=[n,1], so [?,?]=[5,1]
W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')

b1_variable_node=tf.Variable(tf.zeros([5]),name='Bias1')
b2_variable_node=tf.Variable(tf.zeros([5]),name='Bias2')
b3_variable_node=tf.Variable(tf.zeros([5]),name='Bias3')
b4_variable_node=tf.Variable(tf.zeros([5]),name='Bias4')
b5_variable_node=tf.Variable(tf.zeros([5]),name='Bias5')
b6_variable_node=tf.Variable(tf.zeros([5]),name='Bias6')
b7_variable_node=tf.Variable(tf.zeros([5]),name='Bias7')
b8_variable_node=tf.Variable(tf.zeros([5]),name='Bias8')
b9_variable_node=tf.Variable(tf.zeros([5]),name='Bias9')
b10_variable_node=tf.Variable(tf.zeros([5]),name='Bias10')
# The last bias shape should be [1], taken from the last weight's shape [5,1]
# W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')
b11_variable_node=tf.Variable(tf.zeros([1]),name='Bias11')

# Then you connect them
# Each L*_hypothesis_f_node is the hypothesis function of one layer: H(X)=sigmoid(XW+b)
L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
L2_hypothesis_f_node=tf.sigmoid(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
L4_hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
L5_hypothesis_f_node=tf.sigmoid(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
L6_hypothesis_f_node=tf.sigmoid(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
L7_hypothesis_f_node=tf.sigmoid(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
L8_hypothesis_f_node=tf.sigmoid(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
L9_hypothesis_f_node=tf.sigmoid(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
L10_hypothesis_f_node=tf.sigmoid(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)

# @
# If you want to use TensorBoard,
# you should give each layer a name scope when you build it
# So you can replace the code above with the following
with tf.name_scope('layer1') as scope:
    L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
with tf.name_scope('layer2') as scope:
    L2_hypothesis_f_node=tf.sigmoid(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
with tf.name_scope('layer3') as scope:
    L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
with tf.name_scope('layer4') as scope:
    L4_hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
with tf.name_scope('layer5') as scope:
    L5_hypothesis_f_node=tf.sigmoid(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
with tf.name_scope('layer6') as scope:
    L6_hypothesis_f_node=tf.sigmoid(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
with tf.name_scope('layer7') as scope:
    L7_hypothesis_f_node=tf.sigmoid(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
with tf.name_scope('layer8') as scope:
    L8_hypothesis_f_node=tf.sigmoid(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
with tf.name_scope('layer9') as scope:
    L9_hypothesis_f_node=tf.sigmoid(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
with tf.name_scope('layer10') as scope:
    L10_hypothesis_f_node=tf.sigmoid(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
with tf.name_scope('last') as scope:
    hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)

# @
# When you perform backpropagation,
# each sigmoid contributes its derivative to the chain rule product
# Since the sigmoid function outputs a value between 0 and 1,
# its derivative sigmoid(x)*(1-sigmoid(x)) is at most 0.25,
# so multiplying many of these small factors drives the gradient toward 0
# That is, the more layers you use, the worse this issue becomes
# This issue is called the 'vanishing gradient' problem

# @
# To resolve this issue,
# you use the relu (rectified linear unit) function
# instead of the sigmoid function
# So, in a deep nn, you use relu rather than sigmoid for the hidden layers
# L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
# L1_hypothesis_f_node=tf.nn.relu(tf.matmul(X,W1_variable_node)+b1_variable_node)
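# @
# A rough numerical sketch (not from the lecture) of why the sigmoid gradient vanishes:
# the derivative of sigmoid is sigmoid(x)*(1-sigmoid(x)), which is at most 0.25 (at x=0),
# so even in the best case the chain rule product shrinks by a factor of 4 per sigmoid layer,
# while relu's derivative is exactly 1 for positive inputs
import numpy as np

def sigmoid(x):
    return 1.0/(1.0+np.exp(-x))

def sigmoid_grad(x):
    s=sigmoid(x)
    return s*(1.0-s)

print(sigmoid_grad(0.0))   # 0.25, the largest local gradient a sigmoid can contribute
print(0.25**10)            # ~9.5e-07, best-case gradient factor after 10 sigmoid layers

def relu_grad(x):
    return (np.asarray(x)>0).astype(np.float32)

print(relu_grad(3.0))      # 1.0 for positive inputs
print(1.0**10)             # 1.0, so stacking relu layers does not shrink the gradient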
# @
# Let's apply the relu function to the layers
with tf.name_scope('layer1') as scope:
    L1_hypothesis_f_node=tf.nn.relu(tf.matmul(X,W1_variable_node)+b1_variable_node)
with tf.name_scope('layer2') as scope:
    L2_hypothesis_f_node=tf.nn.relu(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
with tf.name_scope('layer3') as scope:
    L3_hypothesis_f_node=tf.nn.relu(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
with tf.name_scope('layer4') as scope:
    L4_hypothesis_f_node=tf.nn.relu(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
with tf.name_scope('layer5') as scope:
    L5_hypothesis_f_node=tf.nn.relu(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
with tf.name_scope('layer6') as scope:
    L6_hypothesis_f_node=tf.nn.relu(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
with tf.name_scope('layer7') as scope:
    L7_hypothesis_f_node=tf.nn.relu(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
with tf.name_scope('layer8') as scope:
    L8_hypothesis_f_node=tf.nn.relu(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
with tf.name_scope('layer9') as scope:
    L9_hypothesis_f_node=tf.nn.relu(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
with tf.name_scope('layer10') as scope:
    L10_hypothesis_f_node=tf.nn.relu(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
# Note that you should still use sigmoid on the last layer,
# because the output value should be between 0 and 1
with tf.name_scope('last') as scope:
    hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)

# @
# There are other activation functions
# Sigmoid
# tanh
# ReLU
# Leaky ReLU
# Maxout
# ELU
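# In TF 1.x these are available as tf.sigmoid, tf.tanh, tf.nn.relu,
# tf.nn.leaky_relu (TF 1.4+), and tf.nn.elu;
# maxout is not a single built-in activation in core TF 1.x
# A small sketch (the node names and the alpha value are choices made here,
# not from the lecture) of swapping the activation of one layer:
z1_node=tf.matmul(X,W1_variable_node)+b1_variable_node

L1_sigmoid_node=tf.sigmoid(z1_node)                       # output in (0,1)
L1_tanh_node=tf.tanh(z1_node)                             # output in (-1,1), zero-centered
L1_relu_node=tf.nn.relu(z1_node)                          # max(0,x)
L1_leaky_relu_node=tf.nn.leaky_relu(z1_node,alpha=0.2)    # small slope 0.2 for x<0
L1_elu_node=tf.nn.elu(z1_node)                            # smooth exponential curve for x<0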