Skip to content

Commit

Permalink
Add in requested changes
Browse files Browse the repository at this point in the history
- Add to sync documentation
- Remove hyperthreading in sync command, and explicitly add a default walltime
- Raise error when sync path is not defined
- Remove sync ssh keys
- Add flag for syncing uncollated files which defaults to True when collation is enabled.
  • Loading branch information
Jo Basevi committed Nov 1, 2023
1 parent 202e45f commit 472b9f9
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 103 deletions.
31 changes: 20 additions & 11 deletions docs/source/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ Postprocessing

``sync``
Sync archive to a remote directory using rsync. Make sure that the
configured path to sync output to, i.e. ``path`` is the correct location
configured path to sync output to, i.e. ``path``, is the correct location
before enabling automatic syncing or before running ``payu sync``.

If postscript is also configured, the latest output and restart files will
Expand All @@ -409,7 +409,7 @@ Postprocessing
``queue`` (*Default:* ``copyq``)
PBS queue used to submit the sync job.

``walltime``
``walltime`` (*Default:* ``10:00:00``)
Time required to run the job.

``mem`` (*Default:* ``2GB``)
Expand All @@ -419,21 +419,30 @@ Postprocessing
Number of ncpus required for the job.

``path``
Destination path to copy archive outputs to. This must be a unique
Destination path to sync archive outputs to. This must be a unique
absolute path for your experiment, otherwise, outputs will be
overwritten.

``restarts`` (*Default:* ``False``)
Sync permanently archived restarts, which are determined by
``restart_freq``.

``rsync_flags``
Additional flags to add to rsync commands used for syncing files. Note
that these will be added to the default flags ``-vrltoD --safe-links``.

``rsync_flags`` (*Default:* ``-vrltoD --safe-links``)
Flags to pass to the rsync commands used for syncing files. If set, these
replace the default flags rather than being added to them.

``exclude``
Patterns to exclude from rsync commands. This can be a single pattern or
a list of patterns. This is equivalent to rsync's ``--exclude PATTERN``.
Patterns to exclude from rsync commands. This is equivalent to rsync's
``--exclude PATTERN``. This can be a single pattern or a list of
patterns. If a pattern includes any special characters,
e.g. ``.*+?|[]{}()``, it will need to be quoted. For example::
exclude:
- 'iceh.????-??-??.nc'
- '*-IN-PROGRESS'

``exclude_uncollated`` (*Default:* ``True`` if collation is enabled)
Flag to exclude uncollated files from being synced. This is equivalent
to adding ``--exclude *.nc.*``.

``extra_paths``
List of ``glob`` patterns which match extra paths to sync to remote
Expand All @@ -443,8 +452,8 @@ Postprocessing
``remove_local_files`` (*Default:* ``False``)
Remove local files once they are successfully synced to the remote
archive. Files in protected paths will not be deleted. Protected paths
include the last output, the last saved restart (determined by
``restart_freq``), and any subsequent restarts.
include the ``extra_paths`` (if defined), the last output, the last saved
restart (determined by ``restart_freq``), and any subsequent restarts.

``remove_local_dirs`` (*Default:* ``False``)
Remove local directories once a directory has been successfully synced.
Expand Down
5 changes: 3 additions & 2 deletions docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ Then run::

payu sync

To sync all restarts including the latest restarts, use the --sync-restarts
flag::
By default ``payu sync`` will not sync the latest restarts that may be pruned
at a later date. To sync all restarts including the latest restarts, use the
``--sync-restarts`` flag::

payu sync --sync-restarts
29 changes: 4 additions & 25 deletions payu/subcommands/sync_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,16 @@ def runcmd(model_type, config_path, lab_path, dir_path, sync_restarts,
default_ncpus = 1
default_queue = 'copyq'
default_mem = '2GB'
default_walltime = '10:00:00'

pbs_config['queue'] = sync_config.get('queue', default_queue)

pbs_config['ncpus'] = sync_config.get('ncpus', default_ncpus)

pbs_config['mem'] = sync_config.get('mem', default_mem)

pbs_config['walltime'] = sync_config.get('walltime', default_walltime)

sync_jobname = sync_config.get('jobname')
if not sync_jobname:
pbs_jobname = pbs_config.get('jobname')
Expand All @@ -53,31 +56,7 @@ def runcmd(model_type, config_path, lab_path, dir_path, sync_restarts,

pbs_config['jobname'] = sync_jobname[:15]

# Replace (or remove) walltime
walltime = sync_config.get('walltime')
if walltime:
pbs_config['walltime'] = walltime
else:
# Remove walltime if set
try:
pbs_config.pop('walltime')
except KeyError:
pass

# Disable hyperthreading
qsub_flags = []
iflags = iter(pbs_config.get('qsub_flags', '').split())
for flag in iflags:
if flag == '-l':
try:
flag += ' ' + next(iflags)
except StopIteration:
break

if 'hyperthread' not in flag:
qsub_flags.append(flag)

pbs_config['qsub_flags'] = ' '.join(qsub_flags)
pbs_config['qsub_flags'] = sync_config.get('qsub_flags', '')

cli.submit_job('payu-sync', pbs_config, pbs_vars)

Expand Down
85 changes: 36 additions & 49 deletions payu/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,59 +121,30 @@ def add_extra_source_paths(self):

def set_destination_path(self):
"set or create destination path to sync archive to"
# Remote archive user
default_user = getpass.getuser()
remote_user = self.config.get('user', default_user)

# Remote path to sync output to
dest_path = self.config.get('path', None)
if dest_path is None:
print("There is no configured path to sync output to. "
"In config.yaml, set:\n"
" sync:\n path: PATH/TO/REMOTE/ARCHIVE\n"
"Replace PATH/TO/REMOTE/ARCHIVE with a unique absolute path "
"to sync outputs to. Ensure path is unique to avoid "
"overwriting existing output!")
raise ValueError("payu: error: Sync path is not defined.")

if not self.remote_syncing:
if dest_path is None:
# Automate destination path to:
# /g/data/{project}/{user}/{model}/{experiment_name}/archive
project = self.expt.config.get('project',
os.environ['PROJECT'])
dest_path = os.path.join('/', 'g', 'data', project,
remote_user, self.expt.model_name,
self.expt.name, 'archive')

# Create destination directory if not already exists
# Create local destination directory if it does not exist
mkdir_p(dest_path)
else:
# Top-level path is implicitly set by the SSH key
# (Usually /projects/[group])

# Remote mkdir is currently not possible, so any new subdirectories
# must be created before auto-archival
if dest_path is None:
os.path.join(self.expt.model_name, self.expt.name, 'archive')
dest_path = f'{remote_user}@{self.remote_url}:{dest_path}'
# Syncing to remote machine
remote_user = self.config.get('user', None)
if remote_user is not None:
dest_path = f'{remote_user}@{self.remote_url}:{dest_path}'
else:
dest_path = f'{self.remote_url}:{dest_path}'

self.destination_path = dest_path

def set_base_rsync_cmd(self):
"""Set base rsync command with default rsync flags, any configured
additional flags, rsync protocol, and ssh-key (if remote syncing)"""
rsync_cmd = f'rsync -vrltoD --safe-links'

# Add any additional rsync flags
additional_rsync_flags = self.config.get('rsync_flags', None)
if additional_rsync_flags:
rsync_cmd += f' {additional_rsync_flags}'

# Add rsync protocol, if defined
rsync_protocol = self.config.get('rsync_protocol', None)
if rsync_protocol:
rsync_cmd += f' --protocol={rsync_protocol}'

# Add remote host rsync options
if self.remote_syncing:
ssh_key_path = os.path.join(os.getenv('HOME'), '.ssh',
'id_rsa_file_transfer')
rsync_cmd += f' -e "ssh -i {ssh_key_path}"'

self.base_rsync_cmd = rsync_cmd

def set_excludes_flags(self):
"""Add lists of patterns of filepaths to exclude from sync commands"""
# Get any excludes
Expand All @@ -182,10 +153,22 @@ def set_excludes_flags(self):
exclude = [exclude]

excludes = ' '.join(['--exclude ' + pattern for pattern in exclude])
if "--exclude *.nc.*" not in excludes:
# TODO: Useful enough to keep??
# Uncollated files are always excluded

# Default to exclude uncollated files if collation is enabled
# This can be overridden using the exclude_uncollated config flag
exclude_uncollated = self.config.get('exclude_uncollated', None)

if exclude_uncollated is None:
collate_config = self.expt.config.get('collate', {})
collating = collate_config.get('enable', True)
if collating:
exclude_uncollated = True

exclude_flag = "--exclude *.nc.*"
if (exclude_uncollated and exclude_flag not in excludes
and exclude_flag not in self.config.get('rsync_flags', [])):
excludes += " --exclude *.nc.*"

self.excludes = excludes

def build_cmd(self, source_path):
Expand Down Expand Up @@ -272,9 +255,13 @@ def run(self):

# Set rsync command components
self.set_destination_path()
self.set_base_rsync_cmd()
self.set_excludes_flags()

# Set base rsync command
default_flags = '-vrltoD --safe-links'
rsync_flags = self.config.get('rsync_flags', default_flags)
self.base_rsync_cmd = f'rsync {rsync_flags}'

# Set remove local files/dirs options
remove_files = self.config.get('remove_local_files', False)
self.remove_files = '--remove-source-files' if remove_files else ''
Expand Down
62 changes: 46 additions & 16 deletions test/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,35 +184,65 @@ def test_restarts_to_sync(add_config, envt_vars,
del os.environ[envt_var]


def test_rsync_components():
def test_set_destination_path():
additional_config = {
"sync": {
"rsync_flags": "--compress",
"rsync_protocol": 29,
"url": "test.domain",
"user": "test-usr",
"path": "remote/path",
"exclude": ["iceh.????-??-??.nc", "*-DEPRECATED"]
}}
sync = setup_sync(additional_config=additional_config)

# Test base_rsync_cmd
sync.set_base_rsync_cmd()

home_path = os.getenv('HOME')
expected_cmd = ('rsync -vrltoD --safe-links --compress --protocol=29'
f' -e "ssh -i {home_path}/.ssh/id_rsa_file_transfer"')

assert sync.base_rsync_cmd == expected_cmd

# Test destination_path
sync.set_destination_path()
assert sync.destination_path == "[email protected]:remote/path"

# Test excludes
# Test value error raised when path is not set
sync = setup_sync(additional_config={})
with pytest.raises(ValueError):
sync.set_destination_path()


@pytest.mark.parametrize(
"add_config, expected_excludes",
[
(
{
"sync": {
"exclude": ["iceh.????-??-??.nc", "*-DEPRECATED"]
},
"collate": {
"enable": True
}
}, ("--exclude iceh.????-??-??.nc --exclude *-DEPRECATED"
" --exclude *.nc.*")
),
(
{
"sync": {
"exclude_uncollated": False
},
"collate": {
"enable": True
}
}, ""
),
(
{
"sync": {
"exclude": "*-DEPRECATED"
},
"collate": {
"enable": False
}
}, "--exclude *-DEPRECATED"
)
])
def test_set_excludes_flags(add_config, expected_excludes):
sync = setup_sync(additional_config=add_config)

# Test setting excludes
sync.set_excludes_flags()
expected_excludes = "--exclude iceh.????-??-??.nc --exclude *-DEPRECATED"
expected_excludes += " --exclude *.nc.*"
assert sync.excludes == expected_excludes


Expand Down

0 comments on commit 472b9f9

Please sign in to comment.