From 9610391ab319403cef079b438edd016a2443af54 Mon Sep 17 00:00:00 2001 From: Eric Hallahan Date: Thu, 9 Mar 2023 17:11:31 -0500 Subject: [PATCH] Simplify and relax dependencies (Take 2) (#818) * Remove unused einops dependency * Make Weights & Biases monitoring optional - Move `wandb` to `./requirements/requirements-wandb.txt` - Clean up `./deepy.py` - Make GPT-NeoX not explode if `wandb` is not installed. - Tell the user when `wandb` is not importable and explain how to fix. - Remove implicit dependence on `shortuuid`. - Ensure that `wandb` is installed in Dockerfile. * Relax many dependencies * Remove usage of uuid.uuid4() * Update Dockerfile Add flash attention install * Update logging.py to pass when wandb is unimportable --------- Co-authored-by: Quentin Anthony --- Dockerfile | 4 ++++ deepy.py | 29 +++++++++++++---------- megatron/logging.py | 7 +++++- megatron/neox_arguments/arguments.py | 30 ++++++++++++++---------- megatron/utils.py | 11 +++++---- requirements/requirements-dev.txt | 13 +++++----- requirements/requirements-onebitadam.txt | 2 +- requirements/requirements-wandb.txt | 1 + requirements/requirements.txt | 22 ++++++++--------- 9 files changed, 68 insertions(+), 51 deletions(-) create mode 100644 requirements/requirements-wandb.txt diff --git a/Dockerfile b/Dockerfile index 9205f0f1f..277ba2a15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -88,10 +88,14 @@ RUN mkdir -p /home/mchorse/.ssh /job && \ #### Python packages RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge COPY requirements/requirements.txt . +COPY requirements/requirements-wandb.txt . +COPY requirements/requirements-flashattention.txt . COPY requirements/requirements-onebitadam.txt . COPY requirements/requirements-sparseattention.txt . 
RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \ pip install -r requirements-sparseattention.txt && \ + pip install -r requirements-flashattention.txt && \ + pip install -r requirements-wandb.txt && \ pip install protobuf==3.20.* && \ pip cache purge diff --git a/deepy.py b/deepy.py index e776dc5a3..c158c76c5 100755 --- a/deepy.py +++ b/deepy.py @@ -16,23 +16,26 @@ import logging import os -import deepspeed -from deepspeed.launcher.runner import main +import deepspeed.launcher.runner -logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) -from megatron.neox_arguments import NeoXArgs -from megatron.utils import get_wandb_api_key +def main(): + logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO")) + from megatron.neox_arguments import NeoXArgs + from megatron.utils import get_wandb_api_key -neox_args = NeoXArgs.consume_deepy_args() -deepspeed_main_args = neox_args.get_deepspeed_main_args() + neox_args = NeoXArgs.consume_deepy_args() + deepspeed_main_args = neox_args.get_deepspeed_main_args() + + # Extract wandb API key and inject into worker environments + wandb_token = get_wandb_api_key(neox_args=neox_args) + if wandb_token is not None: + deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") + os.environ["WANDB_API_KEY"] = wandb_token + + deepspeed.launcher.runner.main(deepspeed_main_args) -# Extract wandb API key and inject into worker environments -wandb_token = get_wandb_api_key(neox_args=neox_args) -if wandb_token is not None: - deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY") - os.environ["WANDB_API_KEY"] = wandb_token if __name__ == "__main__": - main(deepspeed_main_args) + main() diff --git a/megatron/logging.py b/megatron/logging.py index 8e1f38cc1..2845305f7 100644 --- a/megatron/logging.py +++ b/megatron/logging.py @@ -14,7 +14,12 @@ import sys import torch -import wandb + +try: + import wandb +except ModuleNotFoundError: + pass + from megatron import mpu, print_rank_0 from 
megatron.utils import report_memory diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 1caaf17d9..f3fe9e7d6 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -18,7 +18,6 @@ import yaml import json import logging -import shortuuid import copy import torch import argparse @@ -278,13 +277,13 @@ def consume_deepy_args(cls): "--wandb_group", type=str, default=None, - help='Weights and Biases group name - used to group together "runs".', + help='Weights & Biases group name - used to group together "runs".', ) group.add_argument( "--wandb_team", type=str, default=None, - help="Team name for Weights and Biases.", + help="Weights & Biases team name.", ) group = parser.add_argument_group(title="Eval args") @@ -372,11 +371,22 @@ def consume_deepy_args(cls): paths_to_yml_files=conf_files, overwrite_values=overwrite_values ) - if neox_args.wandb_group is not None: - # concat the wandb group name with a uid to make sure it's unique - import wandb + if neox_args.use_wandb: + try: + import wandb + + # Check if the W&B group name is configured + if neox_args.wandb_group is None: + # Set a randomized string as group name if no group name is provided + neox_args.wandb_group = wandb.sdk.lib.runid.generate_id() + else: + # Concatenate the W&B group name with a randomized string to ensure uniqueness. + neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id() + except ModuleNotFoundError as e: + if e.name == "wandb": + e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to a boolean false to disable Weights & Biases logging." 
+ raise e - neox_args.wandb_group += "_" + wandb.util.generate_id() neox_args.print() return neox_args @@ -736,12 +746,6 @@ def calculate_derived(self): Derives additional configuration values necessary for training from the current config """ - # wandb - # sets a unique wandb group - if self.wandb_group is None: - # if none is defined a uuid is set for the run - self.wandb_group = shortuuid.uuid() - # number of gpus # Get number of GPUs param or hostfile to determine train_batch_size global_num_gpus = getattr(self, "global_num_gpus", None) diff --git a/megatron/utils.py b/megatron/utils.py index 632f92a1c..ebb08f962 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -24,8 +24,11 @@ from typing import Dict, List import requests -import wandb -from wandb import UsageError + +try: + import wandb +except ModuleNotFoundError: + pass import torch @@ -33,7 +36,7 @@ from megatron import print_rank_0 from megatron import mpu -from deepspeed import PipelineEngine, DeepSpeedEngine + from collections import deque @@ -167,7 +170,7 @@ def init_wandb(neox_args): force=False, entity=neox_args.wandb_team, ) - except UsageError as e: + except wandb.UsageError as e: neox_args.update_value("use_wandb", False) print(e) print( diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 2057a7568..6c58478f3 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,8 +1,7 @@ -autopep8==1.5.6 -clang-format==13.0.1 -pre-commit~=2.17.0 -pytest==6.2.3 -pytest-cov==2.11.1 -pytest-forked==1.3.0 +autopep8>=1.5.6 +clang-format>=13.0.1 +pre-commit>=2.17.0 +pytest>=6.2.3 +pytest-cov>=2.11.1 +pytest-forked>=1.3.0 pytest-xdist -transformers~=4.16.2 diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt index a6dd402b3..349e3b39a 100644 --- a/requirements/requirements-onebitadam.txt +++ b/requirements/requirements-onebitadam.txt @@ -1 +1 @@ -cupy-cuda111==8.6.0 +cupy-cuda111>=8.6.0 diff 
--git a/requirements/requirements-wandb.txt b/requirements/requirements-wandb.txt new file mode 100644 index 000000000..1df18b051 --- /dev/null +++ b/requirements/requirements-wandb.txt @@ -0,0 +1 @@ +wandb>=0.10.28 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index e9d9940e8..bd78064d1 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,16 +1,14 @@ -einops==0.3.0 -ftfy==6.0.1 -git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 -huggingface_hub==0.11.0 -lm_eval==0.3.0 -mpi4py==3.0.3 -numpy==1.22.0 -pybind11==2.6.2 deepspeed +ftfy>=6.0.1 +git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +huggingface_hub>=0.11.0 +lm_eval>=0.3.0 +mpi4py>=3.0.3 +numpy>=1.22.0 +pybind11>=2.6.2 regex sentencepiece six -tiktoken==0.1.2 -tokenizers==0.12.1 -transformers~=4.24.0 -wandb==0.10.28 +tiktoken>=0.1.2 +tokenizers>=0.12.1 +transformers>=4.24.0