updates on RL
dnitti-psee committed Nov 13, 2021
1 parent 51d8559 commit 1d57f03
Showing 4 changed files with 36 additions and 33 deletions.
13 changes: 7 additions & 6 deletions RL/agent/agent_utils.py
@@ -14,6 +14,7 @@ def onehot(i, n, dtype=np.float32):

def do_rollout(agent, env, episode, num_steps=None, render=False, useConv=True, discount=1,
learn=True, sleep=0.):
baseline_env = ('baseline_env' in agent.config and agent.config['baseline_env'])
if num_steps == None:
num_steps = env.spec.max_episode_steps
total_rew = 0.
@@ -24,9 +25,8 @@ def do_rollout(agent, env, episode, num_steps=None, render=False, useConv=True,
else:
scaling = 'none'
obs_cur = env.reset()
if not ('baseline_env' in agent.config and agent.config['baseline_env']):
if not baseline_env:
obs_cur = preprocess(obs_cur, agent.observation_space, agent.scaled_obs, type=scaling)

if agent.config['terminal_life']:
last_lives = -1
else:
@@ -57,23 +57,24 @@ def do_rollout(agent, env, episode, num_steps=None, render=False, useConv=True,
start_time = time.time()
(obs_next, rr, done, _info) = env.step(a)
start_time2 = time.time()
if agent.config['terminal_life'] and not ('baseline_env' in agent.config and agent.config['baseline_env']):
if agent.config['terminal_life'] and not baseline_env:
if _info['ale.lives'] < last_lives:
terminal_memory = True
else:
terminal_memory = done
last_lives = _info['ale.lives']
else:
terminal_memory = done
if ('baseline_env' in agent.config and agent.config['baseline_env']):
if baseline_env:
if hasattr(env,'was_real_done'): # when using EpisodicLifeEnv wrapper
done = env.was_real_done

if not ('baseline_env' in agent.config and agent.config['baseline_env']):
if not baseline_env:
obs_next = preprocess(obs_next, agent.observation_space, agent.scaled_obs, type=scaling)
reward = rr*agent.config['scalereward']
else:
# fixme rewards are clipped when baseline_env is enabled!!!!!!
logger.warning('rewards are clipped when baseline_env is enabled')
reward = rr
obs_next = np.moveaxis(obs_next, -1, 0)

@@ -82,7 +83,7 @@ def do_rollout(agent, env, episode, num_steps=None, render=False, useConv=True,
else:
limitreward = reward

if useConv == False:
if not useConv:
obs_next = obs_next.reshape(-1, )

if len(obs_cur.shape) == 3:
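For context on the agent_utils.py changes: do_rollout now reads the baseline_env flag once and branches on it afterwards. Without the baselines wrappers, losing a life can be recorded as a terminal transition for the replay memory (terminal_life) even though the episode continues; with the wrappers, env.was_real_done recovers the true episode end (the EpisodicLifeEnv wrapper raises done on every life loss) and rewards arrive already clipped, which the new logger.warning makes explicit. A minimal sketch of that terminal bookkeeping, assuming a Gym-style Atari env that reports info['ale.lives'] (function and argument names here are illustrative, not from the repo):

def step_with_terminal_bookkeeping(env, action, last_lives, use_life_terminals):
    # Sketch of the terminal handling in do_rollout, not the full rollout loop.
    obs_next, reward, done, info = env.step(action)

    if use_life_terminals and 'ale.lives' in info:
        # A lost life ends the transition stored in the replay memory,
        # even though the episode itself keeps running.
        terminal_memory = done or info['ale.lives'] < last_lives
        last_lives = info['ale.lives']
    else:
        terminal_memory = done

    # With the baselines EpisodicLifeEnv wrapper, done is raised on every
    # life loss; was_real_done marks the true end of the episode.
    if hasattr(env, 'was_real_done'):
        done = env.was_real_done

    return obs_next, reward, done, terminal_memory, last_lives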
4 changes: 2 additions & 2 deletions RL/agent/default_params.py
@@ -28,7 +28,7 @@ def get_default(nameenv):
"eps": 0.5, # Epsilon in epsilon greedy policies
"mineps": 0.01,
"linear_decay": 0.0001,#"decay": 0.995, # Epsilon decay in epsilon greedy policies
"initial_learnrate": 0.0002,
"initial_learnrate": 0.0003,
"eps_optim": 1e-5, # 1.5e-4 before
"decay_learnrate": 1,
"discount": 0.99,
@@ -150,7 +150,7 @@ def get_default(nameenv):
"limitreward": [-1., 1.],
'doubleQ':False,
"copyQ": 10000,
"probupdate": 0.35,
"probupdate": 0.4,
"init_weight": True,
"lambda": 0.,
"entropy": 0.01,
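The default_params.py diff only touches scalar hyperparameters: the initial learning rate moves from 0.0002 to 0.0003 and probupdate from 0.35 to 0.4, while the exploration settings (eps, mineps, linear_decay) stay as they were. The decay schedule itself is not part of this commit, but the parameter names suggest a linear anneal with a floor, roughly as follows (an assumption for illustration, not the repo's actual schedule):

def epsilon_at(step, eps=0.5, mineps=0.01, linear_decay=0.0001):
    # Illustrative linear epsilon schedule implied by the config names;
    # the agent's real schedule lives elsewhere in the repo.
    return max(mineps, eps - linear_decay * step)

With these defaults, epsilon starts at 0.5, reaches the 0.01 floor after 4900 steps, and stays there.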
43 changes: 23 additions & 20 deletions RL/agent/run.py
@@ -1,7 +1,6 @@
'''
"""
@author: Davide Nitti
'''
"""

from . import common
from . import default_params
@@ -15,8 +14,6 @@
import gym.spaces
from multiprocessing import Process
import logging


import argparse
import os
import json
@@ -92,36 +89,40 @@ def upload_res(callback, process_upload=None, upload_checkpoint=False, parallel=
return process_upload


def main(params=[], callback=None, upload_ckp=False, numavg=100, sleep=0.0):
params = getparams(params)
logger.info('Params' + str(params))
if params['plot'] != True:
import matplotlib
matplotlib.use('pdf')
else:
def plt_init(plot):
if plot:
import matplotlib
# matplotlib.use('Agg')
# matplotlib.use("Qt5agg")
import matplotlib.pyplot as plt

plt.rcParams['image.interpolation'] = 'nearest'
plt.ion()
else:
import matplotlib
matplotlib.use('pdf')
return plt

nameenv = params['target']

reward_threshold = gym.envs.registry.spec(nameenv).reward_threshold
def main(params=[], callback=None, upload_ckp=False, numavg=100, sleep=0.0):
params = getparams(params)
logger.info('Params' + str(params))
plt = plt_init(params['plot'])
name_env = params['target']

reward_threshold = gym.envs.registry.spec(name_env).reward_threshold
if 'baseline_env' in params and params['baseline_env']:
if params['path_exp']:
stats_path = os.path.join(params["res_dir"], 'stats')
if not os.path.exists(stats_path):
os.makedirs(stats_path)
else:
stats_path = None
env = env_utils.build_env(nameenv, env_type=None, num_env=1, batch=False,
env = env_utils.build_env(name_env, env_type=None, num_env=1, batch=False,
seed=params["seed"], reward_scale=params['scalereward'], gamestate=None,
logger_dir=stats_path)
reward_range = env.reward_range#env.envs[0].reward_range
reward_range = env.reward_range # env.envs[0].reward_range
else:
env = gym.make(nameenv)
env = gym.make(name_env)
reward_range = env.reward_range
if params['seed'] is not None:
env.seed(params["seed"])
@@ -152,8 +153,6 @@ def main(params=[], callback=None, upload_ckp=False, numavg=100, sleep=0.0):
num_steps = env.spec.max_episode_steps
avg = None
process_upload = None
if params['plot']:
plt.ion()

totrewlist = []
test_rew_epis = [[], []]
@@ -284,3 +283,7 @@ def main(params=[], callback=None, upload_ckp=False, numavg=100, sleep=0.0):
finally:
env.close()
return np.mean(totrewlist[-numavg:]), agent.config, totrewlist, test_rew_smooth, test_rew_epis, reward_threshold


if __name__ == '__main__':
main(None)
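The run.py refactor pulls the matplotlib setup out of main into a plt_init(plot) helper: interactive runs get plt.ion(), headless runs select the non-interactive 'pdf' backend, and main now just calls plt = plt_init(params['plot']). The module docstring quotes were also switched to triple double quotes and a __main__ entry point was added. A condensed sketch of the same pattern (not the exact function from the commit); the key constraint is that matplotlib.use(...) must run before matplotlib.pyplot is imported:

def plt_init(plot):
    # Return a configured pyplot module: interactive when plotting,
    # file-only ('pdf' backend) otherwise.
    import matplotlib
    if not plot:
        # The backend has to be selected before pyplot is imported.
        matplotlib.use('pdf')
    import matplotlib.pyplot as plt
    if plot:
        plt.rcParams['image.interpolation'] = 'nearest'
        plt.ion()  # non-blocking interactive figures
    return plt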
9 changes: 4 additions & 5 deletions RL/agent/torchagent.py
@@ -15,9 +15,6 @@

from .agent_utils import onehot, vis
import json
import pickle

import copy

import matplotlib.pyplot as plt0

@@ -618,6 +615,8 @@ def learnpolicy(self, startind=None, endind=None):
total_reward = torch.from_numpy(total_reward).to(self.device, non_blocking=True)
actions = torch.from_numpy(actions).to(self.device, non_blocking=True)
# Vnext = np.append([[0]],Vallstate[:-1],axis=0)
if (notdonevec[:-1,0]==0).any():
raise NotImplementedError
if self.config['episodic']:
targetV = currew + self.config['discount'] * Vnext * notdonevec
if notdonevec[-1, 0] == 1:
@@ -670,11 +669,11 @@ def learnpolicy(self, startind=None, endind=None):
if self.avg_target is None:
self.avg_target = scale_target
else:
self.avg_target = 0.99 * self.avg_target + 0.01 * scale_target
self.avg_target = 0.995 * self.avg_target + 0.005 * scale_target
scaling = torch.sqrt(self.avg_target) + 0.001
v_loss = self.criterion(Vallstate / scaling, targetV.detach() / scaling)
if np.random.random() < 0.001:
print("avg target", self.avg_target.data.item(), "v loss", v_loss.mean().data.item())
print("avg target", self.avg_target.data.item()**0.5, "v loss", v_loss.mean().data.item())
else:
self.avg_target=1
v_loss = self.criterion(Vallstate, targetV.detach())
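Two behavioural changes sit in the learnpolicy hunks above. First, a guard now raises NotImplementedError if any transition other than the last one in the batch is terminal (notdonevec is 1 for non-terminal steps), before the one-step backup targetV = currew + discount * Vnext * notdonevec is formed. Second, the running normalizer for the value loss decays more slowly (0.995/0.005 instead of 0.99/0.01), and the occasional debug print now reports the square root of avg_target, i.e. the quantity actually used for scaling. A sketch of that normalized loss, assuming scale_target is the batch mean of the squared targets (its computation is outside this diff) and using smooth L1 as a stand-in for self.criterion:

import torch
import torch.nn.functional as F

def scaled_value_loss(Vallstate, targetV, avg_target):
    # Keep a slow EMA of the squared target magnitude and normalize both
    # prediction and target by its square root before applying the loss.
    scale_target = (targetV.detach() ** 2).mean()  # assumption: not shown in the diff
    if avg_target is None:
        avg_target = scale_target
    else:
        avg_target = 0.995 * avg_target + 0.005 * scale_target
    scaling = torch.sqrt(avg_target) + 0.001
    v_loss = F.smooth_l1_loss(Vallstate / scaling, targetV.detach() / scaling)
    return v_loss, avg_target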
