010-lec-001. Hidden layer, deep NN, issue of backpropagation with sigmoid, vanishing gradient, sigmoid and ReLU
# Three-layer network: 2 inputs -> 5 hidden units -> 4 hidden units -> 1 output
# (assumes tensorflow is imported as tf and that X is a placeholder with 2 input features,
# e.g. X=tf.placeholder(tf.float32,[None,2]), defined earlier)
W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0),name='Weight1')
W2_variable_node=tf.Variable(tf.random_uniform([5,4],-1.0,1.0),name='Weight2')
W3_variable_node=tf.Variable(tf.random_uniform([4,1],-1.0,1.0),name='Weight3')
b1_variable_node=tf.Variable(tf.zeros([5]),name='Bias1')
b2_variable_node=tf.Variable(tf.zeros([4]),name='Bias2')
b3_variable_node=tf.Variable(tf.zeros([1]),name='Bias3')
# Two hidden layers and the output (hypothesis) layer, all using sigmoid
L2_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W2_variable_node)+b2_variable_node)
hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W3_variable_node)+b3_variable_node)
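# A minimal training sketch in the style of the earlier logistic-regression lectures.
# The label placeholder, XOR data, and learning rate below are assumptions made here
# for illustration, not part of the lecture code above
Y=tf.placeholder(tf.float32,[None,1])
x_data=[[0,0],[0,1],[1,0],[1,1]]
y_data=[[0],[1],[1],[0]]
# Binary cross-entropy cost and plain gradient descent
cost_node=-tf.reduce_mean(Y*tf.log(hypothesis_f_node)+(1.0-Y)*tf.log(1.0-hypothesis_f_node))
train_node=tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost_node)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10001):
        sess.run(train_node,feed_dict={X:x_data,Y:y_data})
        if step%1000==0:
            print(step,sess.run(cost_node,feed_dict={X:x_data,Y:y_data}))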
# @
# Let's build a deeper network: 10 hidden layers plus an output layer (weights W1..W11)
# The only shapes we need to be careful about
# are those of the first and last weights
# The first layer's weight has shape [2,5]
# W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0),name='Weight1')
# The last layer's weight has shape [5,1]
# W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')
# The weights of the hidden layers in between,
# located between the first layer and the last layer,
# can have any sizes you want, as long as neighboring shapes match up
# [n,2][?,?]=[n,5]
W1_variable_node=tf.Variable(tf.random_uniform([2,5],-1.0,1.0),name='Weight1')
# [n,5][?,?]=[n,what you want]
W2_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight2')
W3_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight3')
W4_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight4')
W5_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight5')
W6_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight6')
W7_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight7')
W8_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight8')
W9_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight9')
W10_variable_node=tf.Variable(tf.random_uniform([5,5],-1.0,1.0),name='Weight10')
# [n,5][?,?]=[n,1]
# [?,?]=[5,1]
W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')
b1_variable_node=tf.Variable(tf.zeros([5]),name='Bias1')
b2_variable_node=tf.Variable(tf.zeros([5]),name='Bias2')
b3_variable_node=tf.Variable(tf.zeros([5]),name='Bias3')
b4_variable_node=tf.Variable(tf.zeros([5]),name='Bias4')
b5_variable_node=tf.Variable(tf.zeros([5]),name='Bias5')
b6_variable_node=tf.Variable(tf.zeros([5]),name='Bias6')
b7_variable_node=tf.Variable(tf.zeros([5]),name='Bias7')
b8_variable_node=tf.Variable(tf.zeros([5]),name='Bias8')
b9_variable_node=tf.Variable(tf.zeros([5]),name='Bias9')
b10_variable_node=tf.Variable(tf.zeros([5]),name='Bias10')
# The last bias has shape [1], matching the output dimension of the last weight [5,1]
# W11_variable_node=tf.Variable(tf.random_uniform([5,1],-1.0,1.0),name='Weight11')
b11_variable_node=tf.Variable(tf.zeros([1]),name='Bias11')
# Then connect them
# Each L*_hypothesis_f_node is the hypothesis function of one layer,
# H(X)=sigmoid(XW+b), and each layer's output feeds the next layer's input
# L1_hypothesis_f_node through L10_hypothesis_f_node are the 10 hidden layers
L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
L2_hypothesis_f_node=tf.sigmoid(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
L4_hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
L5_hypothesis_f_node=tf.sigmoid(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
L6_hypothesis_f_node=tf.sigmoid(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
L7_hypothesis_f_node=tf.sigmoid(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
L8_hypothesis_f_node=tf.sigmoid(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
L9_hypothesis_f_node=tf.sigmoid(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
L10_hypothesis_f_node=tf.sigmoid(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)
# @
# If you want to use TensorBoard,
# you should give each layer a name scope when you build it
# So, you can replace the code above with the following
with tf.name_scope('layer1') as scope:
    L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
with tf.name_scope('layer2') as scope:
    L2_hypothesis_f_node=tf.sigmoid(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
with tf.name_scope('layer3') as scope:
    L3_hypothesis_f_node=tf.sigmoid(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
with tf.name_scope('layer4') as scope:
    L4_hypothesis_f_node=tf.sigmoid(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
with tf.name_scope('layer5') as scope:
    L5_hypothesis_f_node=tf.sigmoid(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
with tf.name_scope('layer6') as scope:
    L6_hypothesis_f_node=tf.sigmoid(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
with tf.name_scope('layer7') as scope:
    L7_hypothesis_f_node=tf.sigmoid(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
with tf.name_scope('layer8') as scope:
    L8_hypothesis_f_node=tf.sigmoid(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
with tf.name_scope('layer9') as scope:
    L9_hypothesis_f_node=tf.sigmoid(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
with tf.name_scope('layer10') as scope:
    L10_hypothesis_f_node=tf.sigmoid(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
with tf.name_scope('last') as scope:
    hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)
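# @
# A minimal sketch (not from the lecture) of writing this graph so TensorBoard can
# display the layer1...layer10 and last name scopes;
# the log directory './logs/deep_nn' is an arbitrary name chosen here
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Writing the graph definition is enough to see the name scopes in TensorBoard
    tensorboard_writer=tf.summary.FileWriter('./logs/deep_nn',sess.graph)
    tensorboard_writer.close()
# Then run on a terminal: tensorboard --logdir=./logs/deep_nn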
# @
# When you perform backpropagation,
# each sigmoid layer contributes its derivative to the chain rule,
# and sigmoid'(x)=sigmoid(x)*(1-sigmoid(x)) is at most 0.25,
# because the sigmoid function outputs values between 0 and 1
# So multiplying many of these small factors gives a gradient near 0
# That is, the more sigmoid layers you use,
# the more severe this issue becomes
# This issue is called the 'vanishing gradient' problem
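# A quick back-of-the-envelope illustration (plain Python, not part of the TF graph):
# with 10 sigmoid layers, the chain rule multiplies at least 10 factors that are each <= 0.25
max_sigmoid_derivative=0.25
print(max_sigmoid_derivative**10)   # ~9.5e-07, so the early layers receive almost no gradient
# ReLU, by contrast, has derivative 1 for positive inputs, so these factors do not shrink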
# @
# To resolve this issue,
# you can use the ReLU (rectified linear unit) function, relu(x)=max(0,x),
# instead of the sigmoid function
# So, in deep NNs, use ReLU rather than sigmoid for the hidden layers
# L1_hypothesis_f_node=tf.sigmoid(tf.matmul(X,W1_variable_node)+b1_variable_node)
# L1_hypothesis_f_node=tf.nn.relu(tf.matmul(X,W1_variable_node)+b1_variable_node)
# @
# Let's apply the ReLU function to the hidden layers
with tf.name_scope('layer1') as scope:
    L1_hypothesis_f_node=tf.nn.relu(tf.matmul(X,W1_variable_node)+b1_variable_node)
with tf.name_scope('layer2') as scope:
    L2_hypothesis_f_node=tf.nn.relu(tf.matmul(L1_hypothesis_f_node,W2_variable_node)+b2_variable_node)
with tf.name_scope('layer3') as scope:
    L3_hypothesis_f_node=tf.nn.relu(tf.matmul(L2_hypothesis_f_node,W3_variable_node)+b3_variable_node)
with tf.name_scope('layer4') as scope:
    L4_hypothesis_f_node=tf.nn.relu(tf.matmul(L3_hypothesis_f_node,W4_variable_node)+b4_variable_node)
with tf.name_scope('layer5') as scope:
    L5_hypothesis_f_node=tf.nn.relu(tf.matmul(L4_hypothesis_f_node,W5_variable_node)+b5_variable_node)
with tf.name_scope('layer6') as scope:
    L6_hypothesis_f_node=tf.nn.relu(tf.matmul(L5_hypothesis_f_node,W6_variable_node)+b6_variable_node)
with tf.name_scope('layer7') as scope:
    L7_hypothesis_f_node=tf.nn.relu(tf.matmul(L6_hypothesis_f_node,W7_variable_node)+b7_variable_node)
with tf.name_scope('layer8') as scope:
    L8_hypothesis_f_node=tf.nn.relu(tf.matmul(L7_hypothesis_f_node,W8_variable_node)+b8_variable_node)
with tf.name_scope('layer9') as scope:
    L9_hypothesis_f_node=tf.nn.relu(tf.matmul(L8_hypothesis_f_node,W9_variable_node)+b9_variable_node)
with tf.name_scope('layer10') as scope:
    L10_hypothesis_f_node=tf.nn.relu(tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node)
# Note that you should still use sigmoid on the last layer,
# because its output must be a value between 0 and 1 (the predicted probability)
with tf.name_scope('last') as scope:
    hypothesis_f_node=tf.sigmoid(tf.matmul(L10_hypothesis_f_node,W11_variable_node)+b11_variable_node)
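# Since the final sigmoid outputs a value between 0 and 1, it can be thresholded at 0.5
# to get a 0/1 prediction. A minimal sketch (node names chosen here; Y is assumed to be
# a placeholder of shape [None,1] holding the 0/1 labels, as in the training sketch above):
predicted_node=tf.cast(hypothesis_f_node>0.5,dtype=tf.float32)
accuracy_node=tf.reduce_mean(tf.cast(tf.equal(predicted_node,Y),dtype=tf.float32))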
# @
# There are other activation functions as well:
# Sigmoid
# tanh
# ReLU
# Leaky ReLU
# Maxout
# ELU
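# Rough TensorFlow 1.x counterparts for these activations (a sketch; node names below are
# chosen here for illustration, exact availability depends on the TF 1.x version,
# e.g. tf.nn.leaky_relu needs TF >= 1.4, and Maxout has no single core op,
# only tf.contrib.layers.maxout in some versions):
z_node=tf.matmul(L9_hypothesis_f_node,W10_variable_node)+b10_variable_node
a_sigmoid_node=tf.sigmoid(z_node)
a_tanh_node=tf.tanh(z_node)
a_relu_node=tf.nn.relu(z_node)
a_leaky_relu_node=tf.nn.leaky_relu(z_node)
a_elu_node=tf.nn.elu(z_node)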