Fix llama3 generation
satyaog committed Sep 11, 2024
1 parent 51cfb81 commit b03a424
Showing 4 changed files with 23 additions and 8 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/cloud-ci.yml
@@ -115,10 +115,10 @@ jobs:
RUN_ON="azure__a100"
EXCLUDE="$EXCLUDE,$_MULTI_GPUS,$_MULTI_NODES"
;;
# "2g")
# RUN_ON="azure__a100_x2"
# SELECT="$SELECT,$_MULTI_GPUS"
# ;;
"2g")
RUN_ON="azure__a100_x2"
SELECT="$SELECT,$_MULTI_GPUS"
;;
"4g")
RUN_ON="azure__a100_x4"
SELECT="$SELECT,$_MULTI_GPUS"
2 changes: 1 addition & 1 deletion benchmarks/llm/configs/llama3_70B_full.yaml
@@ -36,7 +36,7 @@ checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
checkpoint_files: [
model-00001-of-00030.safetensors,
model-00001-of-00030.safetensors,
model-00002-of-00030.safetensors,
model-00003-of-00030.safetensors,
model-00004-of-00030.safetensors,
14 changes: 11 additions & 3 deletions benchmarks/llm/prepare.py
@@ -23,7 +23,6 @@
class Arguments:
recipe: str
config: str = None
no_pretrained: bool = False


@dataclass
@@ -100,12 +99,19 @@ def load_model(recipe, cfg):


def generate_weights(args, config):
is_done: Path = args.output_dir / "generated"
if is_done.exists():
print(f"{args.output_dir}/['*.safetensors'] or ['*consolidated.*.pth'] already generated")
return

if config.get("safetensors", False):
params_path = args.output_dir / "config.json"
model = LlamaForCausalLM(LlamaConfig(**json.loads(params_path.read_text())))
# Avoid saving this as part of the config.
del model.config._name_or_path
model.config.torch_dtype = torch.float16
# Even if the model is loaded with a config.torch_dtype == bf16, model.dtype
# seems to be f32. Force model.dtype to be bf16
model.to(model.config.torch_dtype)
model.save_pretrained(str(args.output_dir), safe_serialization=True)

else:
Expand Down Expand Up @@ -138,6 +144,8 @@ def generate_weights(args, config):
conn.send(True)
p.join()

is_done.touch()


def main():
parser = ArgumentParser()
@@ -154,7 +162,7 @@ def main():

#
huggingface_format = config.get("safetensors", False)
pretrained = not args.no_pretrained
pretrained = not config.get("no_pretrained", False)

if not pretrained:
# if we will generate the weights, do not download any weights
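For reference, here is a minimal sketch (not the actual prepare.py code) of how the pieces changed above fit together: a "generated" marker file makes weight generation idempotent across benchmark runs, and the no_pretrained switch is now read from the benchmark config rather than from a CLI flag. The function names, the example output directory, and the elided generation bodies below are placeholders.

from pathlib import Path

def generate_weights_sketch(output_dir: Path, config: dict) -> None:
    # Marker file written once generation finishes; if it already exists,
    # repeated prepare runs skip the expensive weight-generation step.
    is_done = output_dir / "generated"
    if is_done.exists():
        print(f"{output_dir}: weights already generated, skipping")
        return

    if config.get("safetensors", False):
        # Hugging Face-style *.safetensors shards would be produced here
        # (the real code builds a LlamaForCausalLM from config.json and
        # forces model.dtype before save_pretrained).
        ...
    else:
        # torchtune-style consolidated.*.pth checkpoints would be produced here.
        ...

    # Mark completion only after the weights were actually written.
    is_done.touch()

def prepare_sketch(config: dict) -> None:
    # The switch now comes from the benchmark config (see config/base.yaml
    # below) instead of a --no_pretrained command-line argument.
    pretrained = not config.get("no_pretrained", False)
    if not pretrained:
        # The weights will be generated locally, so nothing is downloaded.
        generate_weights_sketch(Path("/tmp/weights"), config)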
7 changes: 7 additions & 0 deletions config/base.yaml
@@ -534,6 +534,7 @@ llm-lora-single:
inherits: _llm
plan:
method: per_gpu

argv:
"{milabench_code}/recipes/lora_finetune_single_device.py": true
--config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
@@ -546,6 +547,7 @@ llm-lora-single:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true
no_pretrained=True: true

llm-lora-ddp-gpus:
inherits: _llm
@@ -565,6 +567,7 @@ llm-lora-ddp-gpus:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true
no_pretrained=True: true

llm-lora-ddp-nodes:
tags:
@@ -587,6 +590,7 @@ llm-lora-ddp-nodes:
repo_id="meta-llama/Meta-Llama-3.1-8B": true
batch_size=8: true
gradient_accumulation_steps=8: true
no_pretrained=True: true

num_machines: 2
requires_capabilities:
@@ -611,6 +615,7 @@ llm-lora-mp-gpus:
repo_id="meta-llama/Meta-Llama-3.1-70B": true
batch_size=8: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

llm-full-mp-gpus:
inherits: _llm
@@ -631,6 +636,7 @@ llm-full-mp-gpus:
safetensors=true: true
batch_size=2: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

llm-full-mp-nodes:
tags:
@@ -654,6 +660,7 @@ llm-full-mp-nodes:
safetensors=true: true
batch_size=2: true
gradient_accumulation_steps=1: true
no_pretrained=True: true

num_machines: 2
requires_capabilities:
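The no_pretrained=True entries added above are plain "key=value" argv items, so prepare.py can pick them up with config.get("no_pretrained", False) as shown in the prepare.py diff. The snippet below is a hypothetical illustration only — milabench's real argv handling is not part of this commit — of how such entries could be folded into that config dict.

def argv_to_config(argv_entries: dict) -> dict:
    # Hypothetical helper, not milabench code: turn enabled "key=value"
    # entries (mapped to true) into a config dict, parsing booleans such as
    # no_pretrained=True or safetensors=true.
    config = {}
    for entry, enabled in argv_entries.items():
        if not enabled or "=" not in entry:
            continue  # skip disabled entries and plain flags/paths
        name, raw = entry.split("=", 1)
        value = raw.strip('"')
        if value.lower() in ("true", "false"):
            config[name] = value.lower() == "true"
        else:
            config[name] = value
    return config

# Example: yields {"no_pretrained": True, "safetensors": True}
print(argv_to_config({"no_pretrained=True": True, "safetensors=true": True}))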
