Q-learning w/ e-greedy policy (FrozenLake / Taxi) code
Taxi-v2 Q-learning
import gym
import random
import numpy as np

env = gym.make("Taxi-v2")
env.reset()

q_table = np.zeros([env.observation_space.n, env.action_space.n])

# hyperparameters
alpha = 0.8    # learning rate
gamma = 0.95   # discount factor
epsilon = 1
success = 0

for i in range(10000):
    # decay epsilon as training progresses
    epsilon = 5 / (1 + i)
    state = env.reset()
    epochs = 0
    done = False
    while not done and epochs < 500:
        epochs += 1
        # e-greedy
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        next_state, reward, done, info = env.step(action)
        # update Q
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value
        state = next_state
    # in Taxi, only a successful drop-off ends an episode with a positive reward
    if reward > 0:
        success += 1

print("Success rate : " + str(success / 10000))
env.close()
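The success rate above is measured during training, while e-greedy exploration is still active. Below is a minimal sketch of a purely greedy evaluation of the learned q_table; the 100-episode count and the variable names other than q_table are my own choices, not part of the original code.

# Greedy evaluation: re-open the env and always take argmax over the learned Q-table.
env = gym.make("Taxi-v2")
eval_episodes = 100  # arbitrary number of test episodes
eval_success = 0
for _ in range(eval_episodes):
    state = env.reset()
    done = False
    steps = 0
    reward = 0
    while not done and steps < 500:
        steps += 1
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)
    if reward > 0:  # only a successful drop-off ends with positive reward
        eval_success += 1
print("Greedy eval success rate : " + str(eval_success / eval_episodes))
env.close()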
FrozenLake Q-learning
(V3 : Deterministic, V0 : Stochastic)
import gym
import numpy as np

# register a deterministic (is_slippery=False) 4x4 FrozenLake as FrozenLake-v3
gym.envs.registration.register(
    id='FrozenLake-v3',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4',
            'is_slippery': False}
)

# Stochastic
# env = gym.make("FrozenLake-v0")
# Deterministic
env = gym.make("FrozenLake-v3")
env.reset()

q_table = np.zeros([env.observation_space.n, env.action_space.n])

# hyperparameters
alpha = 0.6   # learning rate
gamma = 0.6   # discount factor
epsilon = 1
success = 0

for i in range(20000):
    # decay epsilon in steps: 1.0 for the first 1000 episodes, then 1/2, 1/3, ...
    epsilon = 1. / ((i // 1000) + 1)
    state = env.reset()
    epochs = 0
    done = False
    episode_reward = 0
    while not done and epochs < 500:
        epochs += 1
        # e-greedy
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        next_state, reward, done, info = env.step(action)
        # update Q
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        q_table[state, action] = old_value + alpha * (reward + gamma * next_max - old_value)
        state = next_state
        episode_reward += reward
        # falling into a hole ends the episode with reward 0;
        # note that state has already been advanced, so this penalizes the hole state itself
        if done and reward == 0:
            q_table[state][action] -= 1
    # reaching the goal gives reward 1
    if reward == 1:
        success += 1

print("Success rate : " + str(success / 20000))
env.close()
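To see what was actually learned, the greedy policy can be printed as a 4x4 grid. This is a small sketch of my own: the action-to-arrow mapping 0=Left, 1=Down, 2=Right, 3=Up follows FrozenLakeEnv, and states whose Q-values are all zero (holes, the goal) will just show the default '<'.

# Print the greedy action for each of the 16 states as a 4x4 grid of arrows.
arrows = ['<', 'v', '>', '^']  # 0 = Left, 1 = Down, 2 = Right, 3 = Up
policy = np.argmax(q_table, axis=1)
for row in range(4):
    print(' '.join(arrows[a] for a in policy[row * 4:(row + 1) * 4]))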
Today's lessons
1. Decay epsilon. (I wasted a lot of time because I hadn't written the decay code.)
2. Results vary wildly depending on how epsilon is decayed. (On FrozenLake, setting epsilon to 1 / (i + 1) gave terrible results; see the sketch after this list.)
3. Time is short and there is a long way to go. I need to study faster.
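To illustrate point 2, here is a quick sketch comparing the two decay schedules used above; it only prints the decay curves at a few episode indices and does not rerun any training.

# Compare the two epsilon-decay schedules at a few episode indices.
for i in [0, 10, 100, 1000, 5000, 19999]:
    fast = 1.0 / (i + 1)               # collapses to near-zero exploration after a handful of episodes
    stepped = 1.0 / ((i // 1000) + 1)  # stays at 1.0 for the first 1000 episodes, then steps down
    print(i, round(fast, 4), round(stepped, 4))

On the stochastic FrozenLake-v0 the first schedule stops exploring almost immediately, which is presumably why it performed so badly.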