Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge from CTuning #1038

Merged
merged 6 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions cm-mlops/script/app-mlperf-inference/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def postprocess(i):

if env.get('CM_MLPERF_USER_CONF', '') == '':
return {'return': 0}

output_dir = env['CM_MLPERF_OUTPUT_DIR']
mode = env['CM_MLPERF_LOADGEN_MODE']

Expand Down Expand Up @@ -61,7 +62,8 @@ def postprocess(i):
model = env['CM_MODEL']
model_full_name = env.get('CM_ML_MODEL_FULL_NAME', model)

if model == "resnet50":
if mode == "accuracy":
if model == "resnet50":
accuracy_filename = "accuracy-imagenet.py"
accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \
accuracy_filename)
Expand All @@ -70,15 +72,15 @@ def postprocess(i):
accuracy_log_file_option_name = " --mlperf-accuracy-file "
datatype_option = " --dtype "+env['CM_IMAGENET_ACCURACY_DTYPE']

elif model == "retinanet":
elif model == "retinanet":
accuracy_filename = "accuracy-openimages.py"
accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \
accuracy_filename)
dataset_args = " --openimages-dir " + env['CM_DATASET_PATH']
accuracy_log_file_option_name = " --mlperf-accuracy-file "
datatype_option = ""

elif 'bert' in model:
elif 'bert' in model:
accuracy_filename = "accuracy-squad.py"
accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_BERT_PATH'], accuracy_filename)
dataset_args = " --val_data '" + env['CM_DATASET_SQUAD_VAL_PATH'] + "' --vocab_file '" + env['CM_DATASET_SQUAD_VOCAB_PATH'] + "' --out_file predictions.json "
Expand Down
21 changes: 9 additions & 12 deletions cm-mlops/script/calibrate-model-for.qaic/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,17 @@
"tags": "get,dataset,imagenet,preprocessed,_calibration,_for.resnet50"
},
{
"enable_if_env":
{
"CM_CALIBRATE_SQUAD": [
"on"
]
},
"names": [
"squad-cal",
"preprocessed-dataset"
],
"tags": "get,dataset,calibration,squad,_pickle,_width.384"
"tags": "get,dataset,preprocessed,_calib1,squad,_pickle,_seq-length.384,_packed"
},
{
"names": [
Expand Down Expand Up @@ -172,22 +178,13 @@
},
"adr": {
"model-src": {
"tags": "retinanet,_no-nms"
"tags": "bert-large,_onnx"
}
},
"deps": [
{
"names": [
"squad-preprocessed",
"preprocessed-dataset"
],
"tags": "get,preprocessed,dataset,squad,_packed,_pickle"
}
],
"env": {
"CM_CALIBRATE_SQUAD": "yes",
"CM_QAIC_COMPILER_ARGS": "",
"CM_QAIC_COMPILER_PARAMS": "-onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,<<<CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH>>> -input-list-file=<<<CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE>>> -num-histogram-bins=512 -profiling-threads=4",
"CM_QAIC_COMPILER_PARAMS": "-onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,<<<CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH>>> -input-list-file=<<<CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE>>> -num-histogram-bins=512 -profiling-threads=96",
"CM_QAIC_MODEL_TO_CONVERT": "calibrate_bert_mlperf"
},
"seq.#": {
Expand Down
2 changes: 1 addition & 1 deletion cm-mlops/script/calibrate-model-for.qaic/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def create_batched_inputs(env):

def construct_calibration_cmd(env):
compiler_params = env['CM_QAIC_COMPILER_PARAMS']
batchsize = env['CM_QAIC_MODEL_BATCH_SIZE']
batchsize = env.get('CM_QAIC_MODEL_BATCH_SIZE', "1")
cmd = env['CM_QAIC_EXEC_PATH'] + " "
if env.get('CM_CREATE_INPUT_BATCH', '') == 'yes':
cmd += " -input-list-file=batched_input_files -batchsize="+batchsize + " "
Expand Down
44 changes: 38 additions & 6 deletions cm-mlops/script/compile-model-for.qaic/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@
"variations": {
"bs.1": {
"group": "batch-size",
"default": true,
"env": {
"CM_QAIC_MODEL_BATCH_SIZE": "1"
},
Expand Down Expand Up @@ -148,7 +147,7 @@
},
"resnet50,server,nsp.14": {
"env": {
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=48 -ols=4"
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=4 -ols=4"
},
"default_variations": {
"batch-size": "bs.8"
Expand Down Expand Up @@ -186,6 +185,30 @@
"CM_QAIC_MODEL_COMPILER_ARGS": "-sdp-cluster-sizes=4,4 -mos=1,4"
}
},
"bert-99,offline": {
"env": {
}
},
"bert-99,offline,nsp.14": {
"env": {
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=1 -mos=1 -ols=3"
}
},
"bert-99,server,nsp.14": {
"env": {
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=1 -mos=1 -ols=3"
}
},
"bert-99,multistream,nsp.14": {
"env": {
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=4"
}
},
"bert-99,singlestream,nsp.14": {
"env": {
"CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=8"
}
},
"tf": {
"group": "model-framework"
},
Expand Down Expand Up @@ -217,15 +240,24 @@
"bert-99": {
"adr": {
"model-src": {
"tags": "bert-99,_onnx"
"tags": "bert-large,_onnx"
}
},
"env": {
"CM_COMPILE_BERT": "on",
"CM_QAIC_MODEL_TO_CONVERT": "calibrate_bert_mlperf",
"CM_QAIC_MODEL_COMPILER_ARGS": "-aic-hw -aic-hw-version=2.0 -execute-nodes-in-fp16=Mul,Sqrt,Div,Add,ReduceMean,Softmax,Sub,Gather,Erf,Pow,Concat,Tile,LayerNormalization -quantization-schema=symmetric_with_uint8 -quantization-precision=Int8 -quantization-precision-bias=Int32 -vvv -compile-only -onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,[SEG] -multicast-weights",
"CM_QAIC_MODEL_COMPILER_PARAMS": "-enable-channelwise -profiling-threads=32 -onnx-define-symbol=batch_size,[BATCH_SIZE] -node-precision-info=[NODE_PRECISION_FILE]"
}
"CM_QAIC_MODEL_COMPILER_ARGS": "-aic-hw -aic-hw-version=2.0 -execute-nodes-in-fp16=Mul,Sqrt,Div,Add,ReduceMean,Softmax,Sub,Gather,Erf,Pow,Concat,Tile,LayerNormalization -quantization-schema=symmetric_with_uint8 -quantization-precision=Int8 -quantization-precision-bias=Int32 -vvv -compile-only -onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,384 -multicast-weights",
"CM_QAIC_MODEL_COMPILER_PARAMS_BASE": ""
},
"deps": [
{
"tags": "calibrate,qaic,_bert-99",
"names": [
"bert-profile",
"qaic-profile"
]
}
]
}
}
}
5 changes: 3 additions & 2 deletions cm-mlops/script/compile-model-for.qaic/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ def preprocess(i):
def construct_compilation_cmd(env):
compiler_params_base = env['CM_QAIC_MODEL_COMPILER_PARAMS_BASE']
compiler_args = env['CM_QAIC_MODEL_COMPILER_ARGS'] + ' ' + env.get('CM_QAIC_MODEL_COMPILER_ARGS_SUT', '')
batchsize = env['CM_QAIC_MODEL_BATCH_SIZE']
batchsize = env.get('CM_QAIC_MODEL_BATCH_SIZE')

if env.get('CM_QAIC_MODEL_QUANTIZATION', '') == 'yes':
profile_string = " -load-profile=" + env['CM_QAIC_MODEL_PROFILE_WITH_PATH']
else:
profile_string = ''

compiler_params = compiler_params_base + ' ' + compiler_args
compiler_params += " -batchsize="+batchsize
if batchsize:
compiler_params += " -batchsize="+batchsize

aic_binary_dir = os.path.join(os.getcwd(), "elfs")

Expand Down
12 changes: 7 additions & 5 deletions cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ variations:
calib1:
group: calibration-set
env:
CM_SQUAD_CALIBRATION_SET: one
CM_DATASET_SQUAD_CALIBRATION_SET: one
calib2:
group: calibration-set
env:
CM_SQUAD_CALIBRATION_SET: two
CM_DATASET_SQUAD_CALIBRATION_SET: two
no-calib:
group: calibration-set
default: true
env:
CM_SQUAD_CALIBRATION_SET: ''
CM_DATASET_SQUAD_CALIBRATION_SET: ''
raw:
group: raw
default: true
Expand Down Expand Up @@ -85,8 +85,10 @@ variations:
CM_DATASET_SQUAD_PACKED: 'yes'
deps:
- tags: get,preprocessed,squad,_pickle
inherit_varation_tags: true
skipa_inherit_variation_groups:
env:
CM_DATASET_SQUAD_PACKED: ''
inherit_variation_tags: true
skip_inherit_variation_groups:
- packing

versions: {}
8 changes: 4 additions & 4 deletions cm-mlops/script/get-preprocessed-dataset-squad/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ def preprocess(i):

quiet = (env.get('CM_QUIET', False) == 'yes')

if env.get('CM_SQUAD_CALIBRATION_SET', '') == "one":
if env.get('CM_DATASET_SQUAD_CALIBRATION_SET', '') == "one":
env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_features.txt')
env['DATASET_CALIBRATION_ID'] = 1
elif env.get('CM_SQUAD_CALIBRATION_SET', '') == "two":
elif env.get('CM_DATASET_SQUAD_CALIBRATION_SET', '') == "two":
env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_qas_ids.txt')
env['DATASET_CALIBRATION_ID'] = 2
else:
Expand Down Expand Up @@ -55,8 +55,8 @@ def postprocess(i):
else:
with open("packed_filenames.txt", "w") as f:
for dirname in os.listdir(cur):
if os.path.isdir(dirname):
f.write(os.path.join(cur, "input_ids.raw") + ", " + os.path.join(cur, "segment_ids.raw") + ", " + os.path.join(cur, "input_position_ids.raw")+ "\n")
if os.path.isdir(dirname) and not dirname.startswith("_"):
f.write(os.path.join(cur, dirname, "input_ids.raw") + "," + os.path.join(cur, dirname, "segment_ids.raw") + "," + os.path.join(cur, dirname, "input_position_ids.raw")+ "\n")
env['CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE'] = os.path.join(cur, "packed_filenames.txt")

return {'return':0}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# MLPerf Inference Benchmarking on AWS dl2q.24xlarge instance using 8 QAIC Cloud AI 100

`dl2q.24xlarge` instance is available in `us-west-2d` and it has 96 vCPUs and 768 GB of memory.

[Deep Learning Base Qualcomm AMI (Amazon Linux 2)](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Images:visibility=public-images;imageId=ami-0287712deef96ecc6) is the recommended OS image as it comes with the QAIC SDKs (both Apps and Platform) preinstalled.


## System setup
```
yum install -y python3-devel git
python3 -m pip install cmind
cm pull repo mlcommons@ck
```

## ResNet50

Do a performance run for the Offline scenario

```
cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic --backend=glow \
--scenario=Offline --implementation=kilt --model=resnet50 \
--test_query_count=40000 --precision=fp32 --rerun
```

* `--adr.mlperf-inference-implementation.device_ids=0` can be used to run the inference only on the first QAIC device
* `--precision=uint8` is the best option to be used but unfortunately, it is not working with the default platform SDK. When we use `--precision=fp32` the float32 inputs are on the fly converted by the QAIC driver to uint8 format. This overhead and 4x memory BW usage reduces the Offline scenario performance by nearly 50%. We got `~9000` QPS for a single device run

*WIP*
36 changes: 29 additions & 7 deletions cm-mlops/script/reproduce-mlperf-inference-kilt/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ new_env_keys:
- CM_ML_MODEL_*
- CM_MAX_EXAMPLES
- CM_IMAGENET_ACCURACY_DTYPE
- CM_SQUAD_ACCURACY_DTYPE


# Dependencies on other CM scripts
Expand Down Expand Up @@ -152,7 +153,7 @@ deps:
- bert-99.9
names:
- squad-tokenized
tags: get,dataset,tokenized,squad
tags: get,dataset,tokenized,squad,_raw

########################################################################
# Install OpenImages
Expand Down Expand Up @@ -343,14 +344,19 @@ variations:
CM_BENCHMARK: STANDALONE_BERT
kilt_model_name: bert
kilt_model_seq_length: 384
kilt_model_batch_size: 384
kilt_model_batch_size: 1
kilt_model_bert_variant: BERT_PACKED
kilt_input_format: "INT64,1,384:INT64,1,8:INT64,1,384:INT64,1,384"
kilt_output_format: "FLOAT32,1,384:FLOAT32,1,384"
dataset_squad_tokenized_max_seq_length: 384
loadgen_buffer_size: 10833
loadgen_dataset_size: 10833

bert_,qaic:
env:
kilt_input_format: "UINT32,1,384:UINT32,1,8:UINT32,1,384:UINT32,1,384"
kilt_device_qaic_skip_stage: convert

standalone:
group: run-mode
default: true
Expand Down Expand Up @@ -384,6 +390,7 @@ variations:
- bert_
env:
CM_MODEL: bert-99
CM_SQUAD_ACCURACY_DTYPE: float32
CM_NOT_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://zenodo.org/record/3750364/files/bert_large_v1_1_fake_quant.onnx"

bert-99.9:
Expand Down Expand Up @@ -422,19 +429,31 @@ variations:
env:
CM_MLPERF_LOADGEN_SCENARIO: SingleStream
kilt_model_batch_size: 1
adr:
qaic-model-compiler:
tags: _singlestream

multistream:
group: loadgen-scenario
env:
CM_MLPERF_LOADGEN_SCENARIO: MultiStream
adr:
qaic-model-compiler:
tags: _multistream
offline:
group: loadgen-scenario
env:
CM_MLPERF_LOADGEN_SCENARIO: Offline
adr:
qaic-model-compiler:
tags: _offline
server:
group: loadgen-scenario
env:
CM_MLPERF_LOADGEN_SCENARIO: Server
adr:
qaic-model-compiler:
tags: _server

uint8:
group: precision
Expand All @@ -461,15 +480,18 @@ variations:
base:
- nsp.14
env:
kilt_device_ids: 0,1,2,3,4,5,6,7
kilt_device_ids: "0"
qaic_queue_length: 6

dl2q.24xlarge,singlestream:
env:
kilt_device_ids: 0
base:
- activation-count.1
qaic_activation_count: "1"

dl2q.24xlarge,resnet50,offline:
base:
- activation-count.3
env:
qaic_activation_count: "3"

dl2q.24xlarge,bert-99,offline:
env:
qaic_activation_count: "14"
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def preprocess(i):
elif "bert" in env.get('CM_MODEL'):
env['dataset_squad_tokenized_max_seq_length'] = env['CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH']
env['dataset_squad_tokenized_root'] = env['CM_DATASET_SQUAD_TOKENIZED_ROOT']
env['dataset_squad_tokenized_input_ids'] = env['CM_DATASET_SQUAD_TOKENIZED_INPUT_IDS']
env['dataset_squad_tokenized_input_mask'] = env['CM_DATASET_SQUAD_TOKENIZED_INPUT_MASK']
env['dataset_squad_tokenized_segment_ids'] = env['CM_DATASET_SQUAD_TOKENIZED_SEGMENT_IDS']
env['dataset_squad_tokenized_input_ids'] = os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_INPUT_IDS'])
env['dataset_squad_tokenized_input_mask'] = os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_INPUT_MASK'])
env['dataset_squad_tokenized_segment_ids'] = os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_SEGMENT_IDS'])

if env.get('CM_BENCHMARK', '') == 'NETWORK_BERT_SERVER':
source_files.append(os.path.join(kilt_root, "benchmarks", "network", "bert", "server", "pack.cpp"))
Expand Down Expand Up @@ -96,7 +96,7 @@ def preprocess(i):
env['+ CXXFLAGS'].append("-DKILT_DEVICE_" + env['device'].upper())

# add preprocessor flag like "#define CM_MODEL_RESNET50"
env['+ CXXFLAGS'].append('-DCM_MODEL_' + env['CM_MODEL'].upper())
#env['+ CXXFLAGS'].append('-DCM_MODEL_' + env['CM_MODEL'].upper())
# add preprocessor flag like "#define CM_MLPERF_BACKEND_ONNXRUNTIME"
env['+ CXXFLAGS'].append('-DCM_MLPERF_BACKEND_' + env['CM_MLPERF_BACKEND'].upper())
# add preprocessor flag like "#define CM_MLPERF_DEVICE_CPU"
Expand Down