https://www.youtube.com/watch?v=VYOq-He90bE&list=PLlMkM4tgfjnKsCWav-Z2F-MMFRx-2gMGG&index=7
Key points: example code for exploration by adding noise to the Q values
================================================================================
# ================================================================================
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym.envs.registration import register
import random as pr

# ================================================================================
# Random argmax: pick one index at random among the maximal entries

def rargmax(vector):
  # Maximum value in the vector
  m=np.amax(vector)
  # Indices of all entries that attain that maximum
  indices=np.nonzero(vector==m)[0]
  # Break ties by choosing one of those indices at random
  return pr.choice(indices)
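
# Example (illustrative): rargmax(np.array([0.,1.,1.,0.])) returns 1 or 2, chosen
# uniformly at random among the tied maxima.
# Note: rargmax is defined here for reference; the noise-based action selection
# below does not call it.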

# ================================================================================
# @ Env options

register(
  id='FrozenLake-v3',
  entry_point='gym.envs.toy_text:FrozenLakeEnv',
  kwargs={'map_name':'4x4','is_slippery': False}
)
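
# Note: is_slippery=False makes FrozenLake deterministic, so each action always
# moves the agent in the intended direction.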

# ================================================================================
# @ Create env

env=gym.make('FrozenLake-v3')

# ================================================================================
# c Q: Q-table, a (16,4) 2D array: one row per state, one column per action
Q=np.zeros([env.observation_space.n,env.action_space.n])
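# All Q values start at zero, so every action initially looks equally good;
# the random noise added in the loop below is what drives early exploration.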

# c dis: discount factor for future rewards
dis=0.99
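# e.g. with dis=0.99, a reward of 1 reached k steps in the future contributes 0.99**k
# to the current state's value once it has propagated back through the Q-table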

# c num_episodes: 2000 episodes
num_episodes=2000

# ================================================================================
# c rList: saves the total reward obtained in each episode
rList=[]

# ================================================================================
for i in range(num_episodes):
  # c state: reset the env and get the initial state
  state=env.reset()

  rAll=0

  # done becomes True when the episode (game) ends
  done=False

  # ================================================================================
  while not done:
    # c action: choose the action by adding random noise to the Q values
    action=np.argmax(Q[state,:]+np.random.randn(1,env.action_space.n)/(i+1))
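    # The noise term is scaled by 1/(i+1), so it shrinks as training progresses:
    # early episodes explore more, later episodes mostly exploit the learned Q values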

    # ================================================================================
    # @ Execute action
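    # env.step returns the next state, the reward, whether the episode is done,
    # and an info dict (ignored here)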
    new_state,reward,done,_=env.step(action)

    # ================================================================================
    # Update the Q function, discounting future rewards by dis
    Q[state,action]=reward+dis*np.max(Q[new_state,:])
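    # i.e. Q(s,a) <- r + dis * max_a' Q(s',a'); no learning rate is needed here
    # because the environment is deterministic (is_slippery=False)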

    # ================================================================================
    # @ Accumulate reward
    rAll+=reward
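    # In FrozenLake the reward is 1 only when the goal is reached (0 otherwise),
    # so rAll ends up as 1 for a successful episode and 0 for a failed one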

    # ================================================================================
    # c state: set the current state to the newly observed state
    state=new_state

  # ================================================================================
  # After each episode ends, append the accumulated reward to rList
  rList.append(rAll)

# ================================================================================
print("Success rate: "+str(sum(rList)/num_episodes))
print("Final Q-Table Values")
print("LEFT DOWN RIGHT UP")
print(Q)
plt.bar(range(len(rList)),rList,color="blue")
plt.show()
# https://raw.githubusercontent.com/youngminpark2559/pracrl/master/shkim-rl/pic/2019_04_22_11:29:11.png
# Note that the Q-table contains various values smaller than 1.0 as well as 1.0,
# because future rewards are discounted (each extra step to the goal multiplies the value by dis=0.99)
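================================================================================
A minimal sketch (not part of the original lecture code) of how you could check the
learned policy after training: run one more episode that always takes the greedy action
np.argmax(Q[state,:]) with no exploration noise. It assumes env, Q, and np from the
script above are still in scope.

state=env.reset()
done=False
total_reward=0
while not done:
  # Always take the action with the highest learned Q value (pure exploitation)
  action=np.argmax(Q[state,:])
  state,reward,done,_=env.step(action)
  total_reward+=reward
print("Greedy rollout reward: "+str(total_reward))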
================================================================================