cart_pole_tabular.py
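"""Tabular Q-learning on Gymnasium's CartPole-v1.

Continuous observations are discretized into fixed bins so that a Q-table
(mapping a discretized state tuple to per-action values) can be learned
with an epsilon-greedy policy.
"""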
import gymnasium as gym
import numpy as np
import random
from collections import deque, defaultdict


class Agent:
    def __init__(self, environment, learning_rate=0.1, gamma=0.95, epsilon=0.1,
                 n_steps=6):
        # Cache the action and observation space sizes
        self.n_actions = environment.action_space.n
        self.observation_space = environment.observation_space.shape
        # Bin edges per observation dimension. Although observations 1 and 3
        # (cart velocity and pole angular velocity) are unbounded in theory,
        # in practice they rarely exceed roughly 0.62 and 3.1, so finite
        # limits work well here.
        limits = [2.4, 0.8, 0.418, 3.5]
        self.milestones = [list(np.linspace(-l, l, num=n_steps)) for l in limits]
        # Q-table: discretized state tuple -> per-action value estimates
        self.Q = defaultdict(lambda: np.zeros(self.n_actions))
        # Learning hyperparameters
        self.gamma = gamma
        self.lr = learning_rate
        self.eps = epsilon
    def observation_to_discrete(self, state):
        # Map each continuous observation to the index of the first bin edge
        # it does not exceed; values beyond the last edge fall into an
        # overflow bin so every state maps to a full-length tuple.
        discrete = []
        for s, milestones in zip(state, self.milestones):
            for i, m in enumerate(milestones):
                if s <= m:
                    discrete.append(i)
                    break
            else:
                discrete.append(len(milestones))
        return tuple(discrete)
    def predict_action(self, state):
        return np.argmax(self.Q[self.observation_to_discrete(state)])

    def get_training_action(self, state):
        # Take a random action with probability epsilon...
        if random.random() <= self.eps:
            return random.randint(0, self.n_actions - 1)
        # ...otherwise predict the best action
        else:
            return self.predict_action(state)
    def fit(self, state, action, reward, terminated, truncated, new_state):
        state_discrete = self.observation_to_discrete(state)
        new_state_discrete = self.observation_to_discrete(new_state)
        # Standard Q-learning update:
        # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
        # The bootstrap term is dropped when the episode has terminated.
        max_future_q = np.max(self.Q[new_state_discrete]) if not terminated else 0
        q_current = self.Q[state_discrete][action]
        new_q = (1 - self.lr) * q_current + self.lr * (reward + self.gamma * max_future_q)
        self.Q[state_discrete][action] = new_q
# Create the environment and the agent
env = gym.make("CartPole-v1")
agent = Agent(env, learning_rate=0.05, epsilon=0.1, n_steps=24, gamma=0.99)

# Training setup: a large number of episodes is needed to visit enough of the
# discretized state space.
EPISODES, TESTS = 150000, 50
history = deque(maxlen=1000)
print("------------ LEARNING ------------")
for e in range(EPISODES):
state = env.reset()[0]
terminated, truncated = False, False
steps = 0
while not terminated and not truncated:
action = agent.get_training_action(state)
new_state, reward, terminated, truncated, _ = env.step(action)
#if terminated:
# reward = -1
agent.fit(state, action, int(reward), terminated, truncated, new_state)
state = new_state
steps += 1
history.append(steps)
if (e+1) % 1000 == 0:
print(f"Episode {e+1}: {np.mean(history)} steps")
print("-------------- TEST --------------")
history.clear()
found = 0
for t in range(TESTS):
state = env.reset()[0]
steps, terminated, truncated = 0, False, False
while not terminated and not truncated:
action = agent.predict_action(state)
new_state, reward, terminated, truncated, _ = env.step(action)
state = new_state
steps += 1
print(f"Test {t + 1}: {steps} steps")