Metrics wrapper part 2 (pytorch#1449)
* first pass at logic to retrieve old metrics and run regression checks

* clean up and document

* remove debug statement

* raise instead of sys.exit and print to stderr

* required args and different empty string check

* raise early if child process has non-0 exit code
zcain117 authored Dec 5, 2019
1 parent 4e23690 commit 5cb06e5
Showing 1 changed file with 115 additions and 25 deletions.
140 changes: 115 additions & 25 deletions torch_xla/test/metrics_test_wrapper.py
@@ -7,35 +7,75 @@
metrics results. In the outermost directory, there should be a base config file
detailed below. Within that directory, there should be a subdirectory for each
test that contains a metrics_history subdirectory and (optionally) a config file
named 'config.json' to override fields of the base config. In other words, it
should look like this:
root/
  config.json
  mnist/
    config.json
    metrics_history/
config.json has 2 relevant fields:
1. `write_metrics_to_disk`: (bool) If false, this wrapper will not write
   any XLA metrics to disk.
2. `regression_test_config`: (dict) The config that will be used to determine
   whether any metrics have regressed in a meaningful way. If absent, this
   wrapper script will not perform any regression checks. For more details,
   see `compare_metrics` in `torch_xla/debug/metrics_compare_utils`. The
   config allows different checks for individual metrics.
The config.json found in root/ is used as the base and any config.json files
found in child directories will overwrite some or all of the base config for
that specific test. Simple recommended starter config.json:
{
  "write_metrics_to_disk": true,
  "regression_test_config": {
    "base_expression": "v <= v_mean + (v_stddev * 3.0)"
  }
}
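For example, a per-test override such as root/mnist/config.json could contain
only:
{
  "write_metrics_to_disk": false
}
to skip writing metrics for that test while inheriting the base config's
regression checks.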
Example usage:
python metrics_test_wrapper.py --root="gs://model_metrics" \
--test_folder_name="mnist" -- python test/test_train_mnist.py --num_epochs=1
"""
from __future__ import print_function

import argparse
import datetime
import glob
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile

import torch_xla.debug.metrics_compare_utils as mcu
import torch_xla.utils.gcsfs as gcsfs

try:
  import google.api_core.exceptions
except:
  msg = """Google Cloud Storage libraries are missing.
Please install them using the following command:
pip install --upgrade google-cloud-storage
Also follow the instructions in the link below to configure authentication:
https://cloud.google.com/storage/docs/reference/libraries
"""
  print(msg, file=sys.stderr)
  raise


_CLOUD_STORAGE_PREFIX = 'gs://'
_METRICS_HISTORY_DIR_NAME = 'metrics_history'
_XLA_METRICS_FILE = 'XLA_METRICS_FILE'
_METRICS_FILE_PATTERN = r'.*\d{4}_\d{2}_\d{2}'
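# Metrics history files are written below with a UTC date name via
# strftime('%Y_%m_%d') (e.g. '2019_12_05'); _METRICS_FILE_PATTERN is meant to
# match those filenames when collecting previous runs.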


def _find_correct_metrics_file(base_metrics_file):
@@ -99,34 +139,84 @@ def _write_to_disk(output_string, output_filename):
'--test_name="mnist" -- python train.py'))
parser.add_argument('--root', type=str, default=None,
                    help='Root dir for metrics test data. See docstring at '
                    'the top of this script.', required=True)
parser.add_argument('--test_folder_name', type=str, default=None,
                    help='Folder within root/ for this test. See docstring '
                    'at the top of this script.', required=True)
parser.add_argument(
    'positional',
    nargs='+',
    type=str,
    help='The python command to run.')
FLAGS = parser.parse_args()
# Run the user-supplied command.
metrics, sp_return_code = _run_subprocess(FLAGS.positional)
if sp_return_code:
  raise AssertionError(
      'Child process had non-zero exit code: {}'.format(sp_return_code))

# Retrieve any config files that affect this test.
# NOTE: these are ordered in increasing specificity. For example, if there
# was a base config that affects all tests and a specific config for a
# particular test, then the base config will be the first element in the
# list and the most specific config will be the last element.
ordered_config_dicts = []

path_to_search = FLAGS.test_folder_name
while True:
  try:
    f = gcsfs.open(os.path.join(FLAGS.root, path_to_search, 'config.json'))
    ordered_config_dicts.insert(0, json.load(f))
  except google.api_core.exceptions.NotFound:
    pass
  if not path_to_search:
    break
  path_to_search = os.path.split(path_to_search)[0]
if not ordered_config_dicts:
  raise ValueError('No config files found in {} or parent directories. '
                   'See example usage at top of metrics_test_wrapper.py'.format(
                       os.path.join(FLAGS.root, FLAGS.test_folder_name)))

# Consolidate configs into 1 dict by overwriting the least-specific configs
# with the increasingly more-specific configs.
config = ordered_config_dicts[0]
for c in ordered_config_dicts:
  config.update(c)
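# NOTE: dict.update is a shallow merge, so a key set in a child config.json
# replaces the base value wholesale (e.g. overriding 'regression_test_config'
# swaps out the whole dict rather than merging individual fields).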

# Collect historical metrics for this test and check for any regressions in
# the current run vs. the averages from previous runs.
metrics_storage_dir = os.path.join(
    FLAGS.root, FLAGS.test_folder_name, _METRICS_HISTORY_DIR_NAME)
metrics_storage_dir += '/'
regression_test_config = config.get('regression_test_config', None)
if regression_test_config:
  metrics_file_pattern = re.compile(_METRICS_FILE_PATTERN)
  prev_metrics_files = [f for f in gcsfs.list(
      metrics_storage_dir) if metrics_file_pattern.match(f.path)]
  prev_metrics_strings = [gcsfs.open(
      os.path.join(FLAGS.root, f.path), mode='rt').read() for f in
      prev_metrics_files]
  data_points = mcu.get_data_points_from_metrics_reports(
      prev_metrics_strings)
  regression_report = mcu.compare_metrics(
      data_points, metrics, config=regression_test_config)
else:
  print('Unable to check for metrics regressions. Config should contain '
        '"regression_test_config" key -- see example at the top of '
        'metrics_test_wrapper.py.', file=sys.stderr)
  regression_report = ''

# Write the metrics from the current run to disk unless disabled by config.
if config.get('write_metrics_to_disk', True):
  # Include the params for this invocation when saving metrics.
  output_string = '{}\n\n{}'.format(FLAGS, metrics)
  output_filename = os.path.join(
      metrics_storage_dir, datetime.datetime.utcnow().strftime('%Y_%m_%d'))
  _write_to_disk(output_string, output_filename)
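
# compare_metrics is expected to return an empty string when no metric
# violated its configured expression; any non-empty report fails the run below.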

if regression_report:
  raise AssertionError(
      'Non-empty XLA metrics regression report:\n{}'.format(
          regression_report))

sys.exit(sp_return_code)
