forked from intel/llm-on-ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add redpajama data preprocessing code (intel#12)
* add testing scripts * remove temp-dir for worker * remove test files * add redpajama dp code * ignore all notebook files * update streaming code * add write-on-host for streaming * better line alignment * move files * rename folder * rename folder and add group_files * debug * add recovery test scripts * add additional python packages * add test flag * add README and some minor fixes * change the image name * change the directory back * add training stop for the second * fix typo * add data source support * clean up a bit * restructure folders * restructure files * add script headers * reorder and add READMEs * revert back due to file movements * fix typo * fix lib import * enable mounting localdisk * change name of cc * fix dtype * performance optimization for streaming * use the latest ray * change node * add new files * bug fix * add nltk * fix hdfs after re-order folders * set default to false * use variables instead of credentials * change the training config path * update README
- Loading branch information
Showing
29 changed files
with
1,825 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
__pycache__ | ||
**.ipynb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# How to Run the End-to-End Validation of the Recovery Test | |
|
||
## Step 1: Set up Env | ||
Please follow [this guide](../workload_in_containers/README.md) to set up the container environment for this workload. Once the containers are running, you can enter the container on the head node using the following command: | |
```bash | ||
docker exec -it ray-leader bash | ||
``` | ||
|
||
## Step 2: Start the script | ||
You can use the `test_end2end.sh` script to run the end-to-end validation of the Ray recovery mechanism. | |
```bash | ||
cd tools/pretrain_recovery_test | ||
./test_end2end.sh | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import json | ||
import time | ||
import argparse | ||
import os | ||
|
||
class bcolors:
    """ANSI escape sequences for colorizing and styling terminal output.

    Wrap a message as ``f"{bcolors.OKGREEN}text{bcolors.ENDC}"`` so the
    styling is switched off again after the message.
    """

    # Bright foreground colors (SGR codes 91-96).
    HEADER = "\x1b[95m"
    OKBLUE = "\x1b[94m"
    OKCYAN = "\x1b[96m"
    OKGREEN = "\x1b[92m"
    WARNING = "\x1b[93m"
    FAIL = "\x1b[91m"

    # Reset every attribute back to the terminal default (SGR code 0).
    ENDC = "\x1b[0m"

    # Text attributes (SGR codes 1 and 4).
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
|
||
def read_json(json_file):
    """Load a JSON file and return its parsed contents.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification, so decode it explicitly instead of
    # relying on the platform's locale encoding (which may not be UTF-8).
    with open(json_file, encoding="utf-8") as file:
        return json.load(file)
|
||
def get_all_episodes(parsed_json):
    """Return a keys view over the keys of *parsed_json* in ascending order.

    Args:
        parsed_json: Mapping whose keys should be listed.

    Returns:
        A ``dict_keys`` view of a new dict whose insertion order is the
        sorted order of the input's keys.
    """
    # Dicts preserve insertion order, so rebuilding the mapping from its
    # sorted keys yields a view that iterates in sorted order.
    ordered = {name: parsed_json[name] for name in sorted(parsed_json)}
    return ordered.keys()
|
||
def identify_common_episode(first_json, second_json):
    """Find the single key (episode) shared by both training-state mappings.

    Args:
        first_json: Mapping of the first training run, keyed by episode.
        second_json: Mapping of the second training run, keyed by episode.

    Returns:
        The one key present in both mappings. If the mappings share no key,
        or share more than one, a diagnostic is printed and ``-1`` is
        returned; callers must check for that sentinel.
    """
    # Iterating a mapping yields its keys. Ordering is irrelevant for a set
    # intersection, so the keys are intersected directly without sorting.
    common_episodes = set(first_json) & set(second_json)

    if not common_episodes:
        print("the 2 trainings have no episode overlapped. Check your json file!")
        return -1
    if len(common_episodes) > 1:
        print("the 2 trainings have more than 1 overlapped episodes. Check your json files!")
        return -1

    # Exactly one element remains.
    return next(iter(common_episodes))
|
||
def compare_training_states(json1, json2, step):
    """Compare the recorded state of one training step between two runs.

    Args:
        json1: Per-step state of the first run, keyed by ``step_<n>``.
        json2: Per-step state of the second run, keyed by ``step_<n>``.
        step: Step number; the entry ``step_<step>`` is read from both runs.

    Returns:
        A 3-tuple ``(data_result, lr_result, loss_result)`` holding the
        result of comparing the ``data``, ``learning_rate`` and ``loss``
        entries with ``==``.

    Raises:
        KeyError: If the step entry or one of the three fields is missing.
    """
    step_key = f'step_{step}'
    compared_fields = ('data', 'learning_rate', 'loss')

    # Fields are compared in the same order as they are returned.
    return tuple(
        json1[step_key][field] == json2[step_key][field]
        for field in compared_fields
    )
|
||
|
||
def main():
    """Compare the training state of an original run and a recovered run.

    Reads ``stepwise_training_state.json`` and
    ``stepwise_training_state_recovery.json`` from the directory given by
    ``--file_path``, finds the one episode both files contain, and compares
    the ``data``, ``learning_rate`` and ``loss`` recorded for step 0 of that
    episode. The outcome is printed to stdout.

    Raises:
        SystemExit: With a non-zero status when the two files do not share
            exactly one episode, since nothing can be compared in that case.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--file_path",
        type=str,
        default='/home/user/tmp/state',
        help="absolute path of the json files"
    )
    args = parser.parse_args()

    # read the json files
    state1 = read_json(os.path.join(args.file_path, 'stepwise_training_state.json'))
    state2 = read_json(os.path.join(args.file_path, 'stepwise_training_state_recovery.json'))

    # identify the overlapped episode
    common_episode = identify_common_episode(state1, state2)
    if common_episode == -1:
        # identify_common_episode has already printed the reason. Indexing the
        # states with the -1 sentinel would only raise an unrelated KeyError,
        # so stop here with a clear message and a non-zero exit status.
        raise SystemExit(
            f"{bcolors.FAIL}recovery test aborted! "
            f"no unique common episode to compare.{bcolors.ENDC}"
        )
    print(f"the common episode of 2 trainings: {common_episode}\n")

    # compare the different training states
    data_result, lr_result, loss_result = compare_training_states(state1[common_episode], state2[common_episode], 0)

    # print out the detailed comparison results
    print(f"Are the Data the same?\n{data_result}")
    print(f"Are the Learning Rate the same?\n{lr_result}")
    print(f"Are the Training Loss the same?\n{loss_result}")

    # NOTE(review): a failed comparison is only reported on stdout; the exit
    # status stays 0 as before. Confirm whether the calling script needs a
    # non-zero status here.
    if data_result and lr_result and loss_result:
        print(f"{bcolors.OKGREEN}\nrecovery tests all passed!{bcolors.ENDC}")
    else:
        print(f"{bcolors.FAIL}recovery test failed! check the detailed log above.{bcolors.ENDC}")


if __name__ == "__main__":
    main()
Oops, something went wrong.