From 37050b84d07f7cfa731fc85ecb9de3faef1d282e Mon Sep 17 00:00:00 2001
From: Conglong Li
Date: Tue, 14 Nov 2023 23:48:46 -0800
Subject: [PATCH] Fix lm_eval_harness for GPT models (#292)

---
 examples_deepspeed/MoE/ds_evalharness.sh      |  3 ++-
 examples_deepspeed/MoE/readme_evalharness.md  |  8 +++----
 .../compression/ds_evalharness.sh             |  5 +++--
 .../gpt/eval/ds_evalharness_1gpu.sh           |  3 ++-
 .../gpt/eval/ds_evalharness_parallel_run.sh   |  1 +
 .../ds_evalharness_parallel_run_10shot.sh     |  1 +
 tasks/eval_harness/evaluate.py                | 22 +++++++++++--------
 7 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/examples_deepspeed/MoE/ds_evalharness.sh b/examples_deepspeed/MoE/ds_evalharness.sh
index f989b1f37..3496ada20 100644
--- a/examples_deepspeed/MoE/ds_evalharness.sh
+++ b/examples_deepspeed/MoE/ds_evalharness.sh
@@ -28,7 +28,7 @@ TASKS="lambada"
 VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json
 MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt
 
-export HF_DATASETS_OFFLINE=1
+# export HF_DATASETS_OFFLINE=1
 
 # Dummy arguments to make megatron happy. No need to configure them.
 # The reason we don't need to configure them and many other arguments is
@@ -53,6 +53,7 @@ CMD="../../tasks/eval_harness/evaluate.py \
        --no-load-rng \
        --inference \
        --disable-moe-token-dropping \
+       --tokenizer-type GPT2BPETokenizer \
        --adaptive_seq_len\
        --eval_fp32\
        --task_list $TASKS\
diff --git a/examples_deepspeed/MoE/readme_evalharness.md b/examples_deepspeed/MoE/readme_evalharness.md
index 426d63735..d30075e2f 100644
--- a/examples_deepspeed/MoE/readme_evalharness.md
+++ b/examples_deepspeed/MoE/readme_evalharness.md
@@ -11,11 +11,10 @@ This particular setup uses the normal deepspeed checkpoint and requires no conve
 On login console with external network
 
 Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7` needed to download some tasks.
+The package versions below are the ones we tested and confirmed to work.
 ```
 (maybe need pip install --upgrade pip)
-pip install best-download==0.0.7
-pip install lm-eval
-(previously we used "pip install git+https://github.com/EleutherAI/lm-evaluation-harness" to install, but later found the command above has less dependency issues)
+pip install best-download==0.0.7 lm-eval==0.2.0 datasets==1.15.1 transformers==4.20.1 huggingface-hub==0.8.1
 ```
 
 2. Pre-download needed datasets
@@ -33,7 +32,8 @@ Then install datasets for the tasks:
 ```
 python ../../tasks/eval_harness/download.py --task_list hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli
 ```
-and make sure that `export HF_DATASETS_OFFLINE=1`
+
+Previously we set `export HF_DATASETS_OFFLINE=1` to force offline mode after the manual download above, but this now triggers an online-verification error for some of the datasets, so we recommend enabling offline mode only when necessary.
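For reference, a minimal end-to-end sketch of the workflow after this patch, run from the MoE example directory. It assumes `ds_evalharness.sh` is already configured with your checkpoint and vocab paths; the version pins come from the readme hunk above, while the single `lambada` task is a placeholder for your own task list:

```
# Sketch under the assumptions above; not part of the patch itself.
pip install best-download==0.0.7 lm-eval==0.2.0 datasets==1.15.1 \
    transformers==4.20.1 huggingface-hub==0.8.1

# Pre-download evaluation data while the machine still has network access.
python ../../tasks/eval_harness/download.py --task_list lambada

# Run the harness; the patched script now passes
# --tokenizer-type GPT2BPETokenizer and no longer exports
# HF_DATASETS_OFFLINE=1 by default.
bash ds_evalharness.sh
```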