
Commit

add run script and update exploration noise
vitchyr committed Nov 28, 2018
1 parent 0030a2d commit 6938006
Showing 8 changed files with 207 additions and 43 deletions.
88 changes: 88 additions & 0 deletions examples/her/her_td3_gym_fetch_reach.py
@@ -0,0 +1,88 @@
"""
This should results in an average return of ~3000 by the end of training.
Usually hits 3000 around epoch 80-100. Within a see, the performance will be
a bit noisy from one epoch to the next (occasionally dips dow to ~2000).
Note that one epoch = 5k steps, so 200 epochs = 1 million steps.
"""
import gym

import rlkit.torch.pytorch_util as ptu
from rlkit.exploration_strategies.base import (
    PolicyWrappedWithExplorationStrategy
)
from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
    GaussianAndEpislonStrategy
)
from rlkit.launchers.launcher_util import setup_logger
from rlkit.torch.her.her import HerTd3
from rlkit.torch.her.obs_dict_replay_buffer import ObsDictRelabelingBuffer
from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy


def experiment(variant):
    env = gym.make('FetchReach-v1')
    es = GaussianAndEpislonStrategy(
        action_space=env.action_space,
        max_sigma=.2,
        min_sigma=.2,  # constant sigma
        epsilon=.3,
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env=env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()


if __name__ == "__main__":
    variant = dict(
        algo_kwargs=dict(
            num_epochs=100,
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            max_path_length=50,
            batch_size=128,
            discount=0.99,
        ),
        replay_buffer_kwargs=dict(
            max_size=100000,
            fraction_goals_rollout_goals=0.2,  # equal to k = 4 in HER paper
            fraction_goals_env_goals=0.0,
        ),
    )
    setup_logger('her-td3-fetch-experiment', variant=variant)
    experiment(variant)
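A note on the "fraction_goals_rollout_goals=0.2  # equal to k = 4 in HER paper" setting above: under HER's "future" relabeling scheme, storing k relabeled goals for every original goal appears to leave a fraction 1/(1+k) of sampled transitions with their original rollout goal, which for k = 4 gives 0.2. A minimal sketch of that arithmetic (the helper name below is ours, not part of rlkit):

# Hypothetical helper illustrating the "equal to k = 4 in HER paper" comment:
# with k relabeled (future) goals per original goal, the fraction of samples
# that keep the original rollout goal is 1 / (1 + k).
def rollout_goal_fraction(k):
    return 1.0 / (1.0 + k)

assert abs(rollout_goal_fraction(4) - 0.2) < 1e-12  # matches the config above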
@@ -11,21 +11,23 @@
 import rlkit.torch.pytorch_util as ptu
 from rlkit.exploration_strategies.base import \
     PolicyWrappedWithExplorationStrategy
-from rlkit.exploration_strategies.gaussian_strategy import GaussianStrategy
+from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
+    GaussianAndEpislonStrategy
+)
 from rlkit.launchers.launcher_util import setup_logger
 from rlkit.torch.her.her import HerTd3
 from rlkit.torch.her.obs_dict_replay_buffer import ObsDictRelabelingBuffer
 from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
-import multiworld.envs.mujoco
+import multiworld.envs.mujoco  # trigger environment registration


 def experiment(variant):
-    env = gym.make('FetchReach-v1')
+    env = gym.make('SawyerReachXYEnv-v1')
-    es = GaussianStrategy(
+    es = GaussianAndEpislonStrategy(
         action_space=env.action_space,
-        max_sigma=0.1,
-        min_sigma=0.1,  # Constant sigma
+        max_sigma=.2,
+        min_sigma=.2,  # constant sigma
+        epsilon=.3,
     )
     obs_dim = env.observation_space.spaces['observation'].low.size
     goal_dim = env.observation_space.spaces['desired_goal'].low.size
@@ -71,23 +73,18 @@ def experiment(variant):
 if __name__ == "__main__":
     variant = dict(
         algo_kwargs=dict(
-            # num_epochs=200,
-            # num_steps_per_epoch=5000,
-            # num_steps_per_eval=10000,
-            # max_path_length=100,
-            num_epochs=20,
-            num_steps_per_epoch=500,
-            num_steps_per_eval=100,
+            num_epochs=100,
+            num_steps_per_epoch=1000,
+            num_steps_per_eval=1000,
             max_path_length=50,
-            min_num_steps_before_training=1000,
-            batch_size=100,
+            batch_size=128,
             discount=0.99,
         ),
         replay_buffer_kwargs=dict(
             max_size=100000,
-            fraction_goals_rollout_goals=1.0,
+            fraction_goals_rollout_goals=0.2,
             fraction_goals_env_goals=0.0,
         ),
     )
-    setup_logger('name-of-td3-experiment', variant=variant)
+    setup_logger('her-td3-sawyer-experiment', variant=variant)
     experiment(variant)
14 changes: 12 additions & 2 deletions rlkit/core/rl_algorithm.py
@@ -24,6 +24,7 @@ def __init__(
             num_steps_per_epoch=10000,
             num_steps_per_eval=1000,
             num_updates_per_env_step=1,
+            min_num_steps_before_training=None,
             batch_size=1024,
             max_path_length=1000,
             discount=0.99,
@@ -61,12 +62,15 @@ def __init__(
         :param eval_policy: Policy to evaluate with.
         :param replay_buffer:
         """
+        if min_num_steps_before_training is None:
+            min_num_steps_before_training = num_steps_per_epoch
         self.training_env = training_env or pickle.loads(pickle.dumps(env))
         self.exploration_policy = exploration_policy
         self.num_epochs = num_epochs
         self.num_env_steps_per_epoch = num_steps_per_epoch
         self.num_steps_per_eval = num_steps_per_eval
         self.num_updates_per_train_call = num_updates_per_env_step
+        self.min_num_steps_before_training = min_num_steps_before_training
         self.batch_size = batch_size
         self.max_path_length = max_path_length
         self.discount = discount
@@ -246,7 +250,10 @@ def _can_evaluate(self):
         )

     def _can_train(self):
-        return self.replay_buffer.num_steps_can_sample() >= self.batch_size
+        return (
+            self.replay_buffer.num_steps_can_sample() >=
+            self.min_num_steps_before_training
+        )

     def _get_action_and_info(self, observation):
         """
@@ -401,7 +408,7 @@ def evaluate(self, epoch, eval_paths=None):
         if eval_paths:
             test_paths = eval_paths
         else:
-            test_paths = self.eval_sampler.obtain_samples()
+            test_paths = self.get_eval_paths()

         statistics.update(eval_util.get_generic_path_information(
             test_paths, stat_prefix="Test",
@@ -421,6 +428,9 @@
             logger.record_tabular(key, value)
         self.need_to_update_eval_statistics = True

+    def get_eval_paths(self):
+        return self.eval_sampler.obtain_samples()
+
     @abc.abstractmethod
     def _do_training(self):
         """
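For context on the min_num_steps_before_training option introduced above: environment steps are always added to the replay buffer, but gradient updates only begin once the buffer holds at least that many samples. Below is a simplified, self-contained sketch of where the _can_train() gate fires (the class and function names are hypothetical, not the verbatim rlkit loop):

# Simplified sketch (hypothetical names, not the verbatim rlkit loop) of how
# min_num_steps_before_training gates updates: every environment step is
# stored, but training rounds only start once the buffer is warm enough.
class TinyBuffer:
    def __init__(self):
        self._n = 0

    def add(self, transition):
        self._n += 1

    def num_steps_can_sample(self):
        return self._n


def run_steps(buffer, num_steps, min_num_steps_before_training):
    updates = 0
    for _ in range(num_steps):
        buffer.add(None)  # stand-in for storing a real transition
        if buffer.num_steps_can_sample() >= min_num_steps_before_training:
            updates += 1  # stand-in for one _do_training() call
    return updates


# With a 1000-step warm-up, a 5000-step run performs 4001 update rounds.
print(run_steps(TinyBuffer(), num_steps=5000, min_num_steps_before_training=1000))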
3 changes: 0 additions & 3 deletions rlkit/exploration_strategies/base.py
@@ -21,9 +21,6 @@ class RawExplorationStrategy(ExplorationStrategy, metaclass=abc.ABCMeta):
     def get_action_from_raw_action(self, action, **kwargs):
         pass

-    def get_actions_from_raw_actions(self, actions, **kwargs):
-        raise NotImplementedError()
-
     def get_action(self, t, policy, *args, **kwargs):
         action, agent_info = policy.get_action(*args, **kwargs)
         return self.get_action_from_raw_action(action, t=t), agent_info
37 changes: 37 additions & 0 deletions rlkit/exploration_strategies/gaussian_and_epsilon_strategy.py
@@ -0,0 +1,37 @@
import random
from rlkit.exploration_strategies.base import RawExplorationStrategy
from rlkit.core.serializable import Serializable
import numpy as np


class GaussianAndEpislonStrategy(RawExplorationStrategy, Serializable):
    """
    With probability epsilon, take a completely random action.
    With probability 1-epsilon, add Gaussian noise to the action taken by a
    deterministic policy.
    """
    def __init__(self, action_space, epsilon, max_sigma=1.0, min_sigma=None,
                 decay_period=1000000):
        assert len(action_space.shape) == 1
        Serializable.quick_init(self, locals())
        if min_sigma is None:
            min_sigma = max_sigma
        self._max_sigma = max_sigma
        self._epsilon = epsilon
        self._min_sigma = min_sigma
        self._decay_period = decay_period
        self._action_space = action_space

    def get_action_from_raw_action(self, action, t=None, **kwargs):
        if random.random() < self._epsilon:
            return self._action_space.sample()
        else:
            sigma = (
                self._max_sigma - (self._max_sigma - self._min_sigma)
                * min(1.0, t * 1.0 / self._decay_period)
            )
            return np.clip(
                action + np.random.normal(size=len(action)) * sigma,
                self._action_space.low,
                self._action_space.high,
            )
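For reference, the new strategy plugs into a policy exactly as in the FetchReach example earlier in this commit. A minimal sketch follows; the direct es.get_action(...) call at the end uses the RawExplorationStrategy.get_action(t, policy, ...) signature shown in base.py above, and the flat policy input (observation followed by desired goal) is our assumption, not something this commit specifies:

import gym
import numpy as np

from rlkit.exploration_strategies.base import (
    PolicyWrappedWithExplorationStrategy
)
from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
    GaussianAndEpislonStrategy
)
from rlkit.torch.networks import TanhMlpPolicy

env = gym.make('FetchReach-v1')
obs_dim = env.observation_space.spaces['observation'].low.size
goal_dim = env.observation_space.spaces['desired_goal'].low.size
action_dim = env.action_space.low.size

policy = TanhMlpPolicy(
    input_size=obs_dim + goal_dim,
    output_size=action_dim,
    hidden_sizes=[400, 300],
)
es = GaussianAndEpislonStrategy(
    action_space=env.action_space,
    max_sigma=.2,
    min_sigma=.2,  # constant sigma, so decay_period never matters
    epsilon=.3,
)
# Wrapped form, as passed to HerTd3 in the example above:
exploration_policy = PolicyWrappedWithExplorationStrategy(
    exploration_strategy=es,
    policy=policy,
)

# Direct call at t=0, assuming the policy input is [observation, desired_goal]:
o = env.reset()
flat_obs = np.hstack((o['observation'], o['desired_goal']))
action, agent_info = es.get_action(0, policy, flat_obs)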
7 changes: 0 additions & 7 deletions rlkit/exploration_strategies/ou_strategy.py
@@ -60,10 +60,3 @@ def get_action_from_raw_action(self, action, t=0, **kwargs):
             * min(1.0, t * 1.0 / self._decay_period)
         )
         return np.clip(action + ou_state, self.low, self.high)
-
-    def get_actions_from_raw_actions(self, actions, t=0, **kwargs):
-        noise = (
-            self.state + self.theta * (self.mu - self.state)
-            + self.sigma * nr.randn(*actions.shape)
-        )
-        return np.clip(actions + noise, self.low, self.high)
14 changes: 0 additions & 14 deletions rlkit/torch/td3/td3.py
@@ -27,7 +27,6 @@ def __init__(

             target_policy_noise=0.2,
             target_policy_noise_clip=0.5,
-            min_num_steps_before_training=1000,

             policy_learning_rate=1e-3,
             qf_learning_rate=1e-3,
@@ -52,7 +51,6 @@ def __init__(

         self.target_policy_noise = target_policy_noise
         self.target_policy_noise_clip = target_policy_noise_clip
-        self.min_num_steps_before_training = min_num_steps_before_training

         self.policy_and_target_update_period = policy_and_target_update_period
         self.tau = tau
@@ -189,18 +187,6 @@ def get_epoch_snapshot(self, epoch):
         )
         return snapshot

-    def _can_train(self):
-        return (
-            self.replay_buffer.num_steps_can_sample() >=
-            self.min_num_steps_before_training
-        )
-
-    def _can_evaluate(self):
-        return (
-            len(self._exploration_paths) > 0
-            and self.eval_statistics is not None
-        )
-
     @property
     def networks(self):
         return [
56 changes: 56 additions & 0 deletions scripts/sim_multigoal_policy.py
@@ -0,0 +1,56 @@
import argparse
import pickle

from rlkit.core import logger
from rlkit.samplers.rollout_functions import multitask_rollout
from rlkit.torch import pytorch_util as ptu


def simulate_policy(args):
    if args.pause:
        import ipdb; ipdb.set_trace()
    data = pickle.load(open(args.file, "rb"))
    policy = data['policy']
    env = data['env']
    print("Policy and environment loaded")
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.to(ptu.device)
    policy.train(False)
    paths = []
    while True:
        paths.append(multitask_rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=not args.hide,
            observation_key='observation',
            desired_goal_key='desired_goal',
        ))
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics(paths)
        if hasattr(env, "get_diagnostics"):
            for k, v in env.get_diagnostics(paths).items():
                logger.record_tabular(k, v)
        logger.dump_tabular()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the snapshot file')
    parser.add_argument('--H', type=int, default=300,
                        help='Max length of rollout')
    parser.add_argument('--speedup', type=float, default=10,
                        help='Speedup')
    parser.add_argument('--mode', default='video_env', type=str,
                        help='env mode')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--pause', action='store_true')
    parser.add_argument('--enable_render', action='store_true')
    parser.add_argument('--multitaskpause', action='store_true')
    parser.add_argument('--hide', action='store_true')
    args = parser.parse_args()

    simulate_policy(args)
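The script above loops over rollouts indefinitely and renders unless --hide is passed. For a quick one-off check, the rollout function it wraps can be called directly; a minimal sketch (the 'params.pkl' path is hypothetical, and the snapshot keys and call signature mirror simulate_policy above):

# Programmatic equivalent of one iteration of simulate_policy() above.
import pickle

from rlkit.samplers.rollout_functions import multitask_rollout

data = pickle.load(open('params.pkl', 'rb'))  # hypothetical snapshot path
policy = data['policy']
env = data['env']
policy.train(False)  # evaluation mode, as in the script

path = multitask_rollout(
    env,
    policy,
    max_path_length=50,
    animated=False,  # set True to render, like running without --hide
    observation_key='observation',
    desired_goal_key='desired_goal',
)
print(sorted(path.keys()))  # the rollout comes back as a dict of arrays/lists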
