Merge from CTuning (#1038)

mlcommons · Dec 13, 2023 · ce981c3 · ce981c3
2 parents fc08ed0 + c7a18ff
commit ce981c3
Show file tree

Hide file tree

Showing 10 changed files with 128 additions and 44 deletions.
diff --git a/cm-mlops/script/app-mlperf-inference/customize.py b/cm-mlops/script/app-mlperf-inference/customize.py
@@ -32,6 +32,7 @@ def postprocess(i):
 
     if env.get('CM_MLPERF_USER_CONF', '') == '':
         return {'return': 0}
+
     output_dir = env['CM_MLPERF_OUTPUT_DIR']
     mode = env['CM_MLPERF_LOADGEN_MODE']
 
@@ -61,7 +62,8 @@ def postprocess(i):
     model = env['CM_MODEL']
     model_full_name = env.get('CM_ML_MODEL_FULL_NAME', model)
 
-    if model == "resnet50":
+    if mode == "accuracy":
+      if model == "resnet50":
         accuracy_filename = "accuracy-imagenet.py"
         accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \
                         accuracy_filename)
@@ -70,15 +72,15 @@ def postprocess(i):
         accuracy_log_file_option_name = " --mlperf-accuracy-file "
         datatype_option = " --dtype "+env['CM_IMAGENET_ACCURACY_DTYPE']
 
-    elif model == "retinanet":
+      elif model == "retinanet":
         accuracy_filename = "accuracy-openimages.py"
         accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_CLASSIFICATION_AND_DETECTION_PATH'], "tools", \
                         accuracy_filename)
         dataset_args = " --openimages-dir " + env['CM_DATASET_PATH']
         accuracy_log_file_option_name = " --mlperf-accuracy-file "
         datatype_option = ""
 
-    elif 'bert' in model:
+      elif 'bert' in model:
         accuracy_filename = "accuracy-squad.py"
         accuracy_filepath = os.path.join(env['CM_MLPERF_INFERENCE_BERT_PATH'], accuracy_filename)
         dataset_args = " --val_data '" + env['CM_DATASET_SQUAD_VAL_PATH'] + "' --vocab_file '" + env['CM_DATASET_SQUAD_VOCAB_PATH'] + "' --out_file predictions.json "

diff --git a/cm-mlops/script/calibrate-model-for.qaic/_cm.json b/cm-mlops/script/calibrate-model-for.qaic/_cm.json
@@ -55,11 +55,17 @@
       "tags": "get,dataset,imagenet,preprocessed,_calibration,_for.resnet50"
     },
     {
+      "enable_if_env":
+      {
+        "CM_CALIBRATE_SQUAD": [
+          "on"
+          ]
+      },
       "names": [
         "squad-cal",
         "preprocessed-dataset"
       ],
-      "tags": "get,dataset,calibration,squad,_pickle,_width.384"
+      "tags": "get,dataset,preprocessed,_calib1,squad,_pickle,_seq-length.384,_packed"
     },
     {
       "names": [
@@ -172,22 +178,13 @@
       },
       "adr": {
         "model-src": {
-          "tags": "retinanet,_no-nms"
+          "tags": "bert-large,_onnx"
         }
       },
-      "deps": [
-        {
-          "names": [
-            "squad-preprocessed",
-            "preprocessed-dataset"
-          ],
-          "tags": "get,preprocessed,dataset,squad,_packed,_pickle"
-        }
-      ],
       "env": {
         "CM_CALIBRATE_SQUAD": "yes",
         "CM_QAIC_COMPILER_ARGS": "",
-        "CM_QAIC_COMPILER_PARAMS": "-onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,<<<CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH>>> -input-list-file=<<<CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE>>> -num-histogram-bins=512 -profiling-threads=4",
+        "CM_QAIC_COMPILER_PARAMS": "-onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,<<<CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH>>> -input-list-file=<<<CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE>>> -num-histogram-bins=512 -profiling-threads=96",
         "CM_QAIC_MODEL_TO_CONVERT": "calibrate_bert_mlperf"
       },
       "seq.#": {

diff --git a/cm-mlops/script/calibrate-model-for.qaic/customize.py b/cm-mlops/script/calibrate-model-for.qaic/customize.py
@@ -67,7 +67,7 @@ def create_batched_inputs(env):
 
 def construct_calibration_cmd(env):
     compiler_params = env['CM_QAIC_COMPILER_PARAMS']
-    batchsize = env['CM_QAIC_MODEL_BATCH_SIZE']
+    batchsize = env.get('CM_QAIC_MODEL_BATCH_SIZE', "1")
     cmd = env['CM_QAIC_EXEC_PATH']  + " "
     if env.get('CM_CREATE_INPUT_BATCH', '') == 'yes':
         cmd += " -input-list-file=batched_input_files  -batchsize="+batchsize + " "

diff --git a/cm-mlops/script/compile-model-for.qaic/_cm.json b/cm-mlops/script/compile-model-for.qaic/_cm.json
@@ -59,7 +59,6 @@
   "variations": {
     "bs.1": {
       "group": "batch-size",
-      "default": true,
       "env": {
         "CM_QAIC_MODEL_BATCH_SIZE": "1"
       },
@@ -148,7 +147,7 @@
     },
     "resnet50,server,nsp.14": {
       "env": {
-        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=48 -ols=4"
+        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=4 -ols=4"
       },
       "default_variations": {
         "batch-size": "bs.8"
@@ -186,6 +185,30 @@
         "CM_QAIC_MODEL_COMPILER_ARGS": "-sdp-cluster-sizes=4,4 -mos=1,4"
       }
     },
+    "bert-99,offline": {
+      "env": {
+      }
+    },
+    "bert-99,offline,nsp.14": {
+      "env": {
+        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=1 -mos=1 -ols=3"
+      }
+    },
+    "bert-99,server,nsp.14": {
+      "env": {
+        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=1 -mos=1 -ols=3"
+      }
+    },
+    "bert-99,multistream,nsp.14": {
+      "env": {
+        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=4"
+      }
+    },
+    "bert-99,singlestream,nsp.14": {
+      "env": {
+        "CM_QAIC_MODEL_COMPILER_ARGS_SUT": "-aic-num-cores=8"
+      }
+    },
     "tf": {
       "group": "model-framework"
     },
@@ -217,15 +240,24 @@
     "bert-99": {
       "adr": {
         "model-src": {
-          "tags": "bert-99,_onnx"
+          "tags": "bert-large,_onnx"
         }
       },
       "env": {
         "CM_COMPILE_BERT": "on",
         "CM_QAIC_MODEL_TO_CONVERT": "calibrate_bert_mlperf",
-        "CM_QAIC_MODEL_COMPILER_ARGS": "-aic-hw -aic-hw-version=2.0 -execute-nodes-in-fp16=Mul,Sqrt,Div,Add,ReduceMean,Softmax,Sub,Gather,Erf,Pow,Concat,Tile,LayerNormalization -quantization-schema=symmetric_with_uint8 -quantization-precision=Int8 -quantization-precision-bias=Int32 -vvv -compile-only -onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,[SEG] -multicast-weights",
-        "CM_QAIC_MODEL_COMPILER_PARAMS": "-enable-channelwise -profiling-threads=32 -onnx-define-symbol=batch_size,[BATCH_SIZE] -node-precision-info=[NODE_PRECISION_FILE]"
-      }
+        "CM_QAIC_MODEL_COMPILER_ARGS": "-aic-hw -aic-hw-version=2.0 -execute-nodes-in-fp16=Mul,Sqrt,Div,Add,ReduceMean,Softmax,Sub,Gather,Erf,Pow,Concat,Tile,LayerNormalization -quantization-schema=symmetric_with_uint8 -quantization-precision=Int8 -quantization-precision-bias=Int32 -vvv -compile-only -onnx-define-symbol=batch_size,1 -onnx-define-symbol=seg_length,384 -multicast-weights",
+        "CM_QAIC_MODEL_COMPILER_PARAMS_BASE": ""
+      },
+      "deps": [
+        {
+          "tags": "calibrate,qaic,_bert-99",
+          "names": [
+            "bert-profile",
+            "qaic-profile"
+	  ]
+        }
+      ]
     }
   }
 }
diff --git a/cm-mlops/script/compile-model-for.qaic/customize.py b/cm-mlops/script/compile-model-for.qaic/customize.py
@@ -27,15 +27,16 @@ def preprocess(i):
 def construct_compilation_cmd(env):
     compiler_params_base = env['CM_QAIC_MODEL_COMPILER_PARAMS_BASE']
     compiler_args = env['CM_QAIC_MODEL_COMPILER_ARGS'] + ' ' + env.get('CM_QAIC_MODEL_COMPILER_ARGS_SUT', '')
-    batchsize = env['CM_QAIC_MODEL_BATCH_SIZE']
+    batchsize = env.get('CM_QAIC_MODEL_BATCH_SIZE')
 
     if env.get('CM_QAIC_MODEL_QUANTIZATION', '') == 'yes':
         profile_string = " -load-profile=" + env['CM_QAIC_MODEL_PROFILE_WITH_PATH']
     else:
         profile_string = ''
 
     compiler_params = compiler_params_base + ' ' + compiler_args
-    compiler_params += " -batchsize="+batchsize
+    if batchsize:
+        compiler_params += " -batchsize="+batchsize
 
     aic_binary_dir = os.path.join(os.getcwd(), "elfs")
 

diff --git a/cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml b/cm-mlops/script/get-preprocessed-dataset-squad/_cm.yaml
@@ -42,16 +42,16 @@ variations:
   calib1:
     group: calibration-set
     env:
-      CM_SQUAD_CALIBRATION_SET: one
+      CM_DATASET_SQUAD_CALIBRATION_SET: one
   calib2:
     group: calibration-set
     env:
-      CM_SQUAD_CALIBRATION_SET: two
+      CM_DATASET_SQUAD_CALIBRATION_SET: two
   no-calib:
     group: calibration-set
     default: true
     env:
-      CM_SQUAD_CALIBRATION_SET: ''
+      CM_DATASET_SQUAD_CALIBRATION_SET: ''
   raw:
     group: raw
     default: true
@@ -85,8 +85,10 @@ variations:
       CM_DATASET_SQUAD_PACKED: 'yes'
     deps:
       - tags: get,preprocessed,squad,_pickle
-        inherit_varation_tags: true
-        skipa_inherit_variation_groups:
+        env:
+          CM_DATASET_SQUAD_PACKED: ''
+        inherit_variation_tags: true
+        skip_inherit_variation_groups:
         - packing
 
 versions: {}
diff --git a/cm-mlops/script/get-preprocessed-dataset-squad/customize.py b/cm-mlops/script/get-preprocessed-dataset-squad/customize.py
@@ -13,10 +13,10 @@ def preprocess(i):
 
     quiet = (env.get('CM_QUIET', False) == 'yes')
 
-    if env.get('CM_SQUAD_CALIBRATION_SET', '') == "one":
+    if env.get('CM_DATASET_SQUAD_CALIBRATION_SET', '') == "one":
         env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_features.txt')
         env['DATASET_CALIBRATION_ID'] = 1
-    elif env.get('CM_SQUAD_CALIBRATION_SET', '') == "two":
+    elif env.get('CM_DATASET_SQUAD_CALIBRATION_SET', '') == "two":
         env['DATASET_CALIBRATION_FILE'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], 'calibration', 'SQuAD-v1.1', 'bert_calibration_qas_ids.txt')
         env['DATASET_CALIBRATION_ID'] = 2
     else:
@@ -55,8 +55,8 @@ def postprocess(i):
     else:
         with open("packed_filenames.txt", "w") as f:
             for dirname in os.listdir(cur):
-                if os.path.isdir(dirname):
-                    f.write(os.path.join(cur, "input_ids.raw") + ", " + os.path.join(cur, "segment_ids.raw") + ", " + os.path.join(cur, "input_position_ids.raw")+ "\n")
+                if os.path.isdir(dirname) and not dirname.startswith("_"):
+                    f.write(os.path.join(cur, dirname, "input_ids.raw") + "," + os.path.join(cur, dirname, "segment_ids.raw") + "," + os.path.join(cur, dirname, "input_position_ids.raw")+ "\n")
         env['CM_DATASET_SQUAD_TOKENIZED_PACKED_FILENAMES_FILE'] = os.path.join(cur, "packed_filenames.txt")
 
     return {'return':0}
diff --git a/cm-mlops/script/reproduce-mlperf-inference-kilt/README_aws_dl2q.24xlarge.md b/cm-mlops/script/reproduce-mlperf-inference-kilt/README_aws_dl2q.24xlarge.md
@@ -0,0 +1,28 @@
+# MLPerf Inference Benchmarking on AWS dl2q.24xlarge instance using 8 QAIC Cloud AI 100
+
+The `dl2q.24xlarge` instance is available in `us-west-2d` and has 96 vCPUs and 768 GB of memory.
+
+The [Deep Learning Base Qualcomm AMI (Amazon Linux 2)](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Images:visibility=public-images;imageId=ami-0287712deef96ecc6) image is the recommended OS image, as it comes with the QAIC SDKs (both Apps and Platform) preinstalled.
+
+
+## System setup
+```
+yum install -y python3-devel git
+python3 -m pip install cmind
+cm pull repo mlcommons@ck
+```
+
+## ResNet50
+
+Do a performance run for the Offline scenario 
+
+```
+cm run script --tags=generate-run-cmds,inference,_performance-only --device=qaic --backend=glow \
+--scenario=Offline  --implementation=kilt --model=resnet50 \
+--test_query_count=40000 --precision=fp32 --rerun
+```
+
+* `--adr.mlperf-inference-implementation.device_ids=0` can be used to run the inference only on the first QAIC device
+* `--precision=uint8` is the best option, but unfortunately it does not work with the default platform SDK. When we use `--precision=fp32`, the float32 inputs are converted on the fly to uint8 format by the QAIC driver. This overhead and the 4x memory bandwidth usage reduce the Offline scenario performance by nearly 50%. We got `~9000` QPS for a single-device run.
+
+*WIP*
diff --git a/cm-mlops/script/reproduce-mlperf-inference-kilt/_cm.yaml b/cm-mlops/script/reproduce-mlperf-inference-kilt/_cm.yaml
@@ -74,6 +74,7 @@ new_env_keys:
   - CM_ML_MODEL_*
   - CM_MAX_EXAMPLES
   - CM_IMAGENET_ACCURACY_DTYPE
+  - CM_SQUAD_ACCURACY_DTYPE
 
 
 # Dependencies on other CM scripts
@@ -152,7 +153,7 @@ deps:
       - bert-99.9
     names:
       - squad-tokenized
-    tags: get,dataset,tokenized,squad
+    tags: get,dataset,tokenized,squad,_raw
 
   ########################################################################
   # Install OpenImages
@@ -343,14 +344,19 @@ variations:
       CM_BENCHMARK: STANDALONE_BERT
       kilt_model_name: bert
       kilt_model_seq_length: 384
-      kilt_model_batch_size: 384
+      kilt_model_batch_size: 1
       kilt_model_bert_variant: BERT_PACKED
       kilt_input_format: "INT64,1,384:INT64,1,8:INT64,1,384:INT64,1,384"
       kilt_output_format: "FLOAT32,1,384:FLOAT32,1,384"
       dataset_squad_tokenized_max_seq_length: 384
       loadgen_buffer_size: 10833
       loadgen_dataset_size: 10833
 
+  bert_,qaic:
+    env:
+      kilt_input_format: "UINT32,1,384:UINT32,1,8:UINT32,1,384:UINT32,1,384"
+      kilt_device_qaic_skip_stage: convert
+
   standalone:
     group: run-mode
     default: true
@@ -384,6 +390,7 @@ variations:
     - bert_
     env:
       CM_MODEL: bert-99
+      CM_SQUAD_ACCURACY_DTYPE: float32
       CM_NOT_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://zenodo.org/record/3750364/files/bert_large_v1_1_fake_quant.onnx"
 
   bert-99.9:
@@ -422,19 +429,31 @@ variations:
     env:
       CM_MLPERF_LOADGEN_SCENARIO: SingleStream
       kilt_model_batch_size: 1
+    adr:
+      qaic-model-compiler:
+        tags: _singlestream
 
   multistream:
     group: loadgen-scenario
     env:
       CM_MLPERF_LOADGEN_SCENARIO: MultiStream
+    adr:
+      qaic-model-compiler:
+        tags: _multistream
   offline:
     group: loadgen-scenario
     env:
       CM_MLPERF_LOADGEN_SCENARIO: Offline
+    adr:
+      qaic-model-compiler:
+        tags: _offline
   server:
     group: loadgen-scenario
     env:
       CM_MLPERF_LOADGEN_SCENARIO: Server
+    adr:
+      qaic-model-compiler:
+        tags: _server
 
   uint8:
     group: precision
@@ -461,15 +480,18 @@ variations:
     base:
       - nsp.14
     env:
-      kilt_device_ids: 0,1,2,3,4,5,6,7
+      kilt_device_ids: "0"
       qaic_queue_length: 6
 
   dl2q.24xlarge,singlestream:
     env:
       kilt_device_ids: 0
-    base:
-      - activation-count.1
+      qaic_activation_count: "1"
 
   dl2q.24xlarge,resnet50,offline:
-    base:
-      - activation-count.3
+    env:
+      qaic_activation_count: "3"
+
+  dl2q.24xlarge,bert-99,offline:
+    env:
+      qaic_activation_count: "14"
diff --git a/cm-mlops/script/reproduce-mlperf-inference-kilt/customize.py b/cm-mlops/script/reproduce-mlperf-inference-kilt/customize.py
@@ -42,9 +42,9 @@ def preprocess(i):
     elif "bert" in env.get('CM_MODEL'):
         env['dataset_squad_tokenized_max_seq_length'] = env['CM_DATASET_SQUAD_TOKENIZED_MAX_SEQ_LENGTH']
         env['dataset_squad_tokenized_root'] =  env['CM_DATASET_SQUAD_TOKENIZED_ROOT']
-        env['dataset_squad_tokenized_input_ids'] = env['CM_DATASET_SQUAD_TOKENIZED_INPUT_IDS']
-        env['dataset_squad_tokenized_input_mask'] =  env['CM_DATASET_SQUAD_TOKENIZED_INPUT_MASK']
-        env['dataset_squad_tokenized_segment_ids'] =  env['CM_DATASET_SQUAD_TOKENIZED_SEGMENT_IDS']
+        env['dataset_squad_tokenized_input_ids'] = os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_INPUT_IDS'])
+        env['dataset_squad_tokenized_input_mask'] =  os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_INPUT_MASK'])
+        env['dataset_squad_tokenized_segment_ids'] =  os.path.basename(env['CM_DATASET_SQUAD_TOKENIZED_SEGMENT_IDS'])
 
     if env.get('CM_BENCHMARK', '') == 'NETWORK_BERT_SERVER':
         source_files.append(os.path.join(kilt_root, "benchmarks", "network", "bert", "server", "pack.cpp"))
@@ -96,7 +96,7 @@ def preprocess(i):
     env['+ CXXFLAGS'].append("-DKILT_DEVICE_" + env['device'].upper())
 
     # add preprocessor flag like "#define CM_MODEL_RESNET50"
-    env['+ CXXFLAGS'].append('-DCM_MODEL_' + env['CM_MODEL'].upper())
+    #env['+ CXXFLAGS'].append('-DCM_MODEL_' + env['CM_MODEL'].upper())
     # add preprocessor flag like "#define CM_MLPERF_BACKEND_ONNXRUNTIME"
     env['+ CXXFLAGS'].append('-DCM_MLPERF_BACKEND_' + env['CM_MLPERF_BACKEND'].upper())
     # add preprocessor flag like "#define CM_MLPERF_DEVICE_CPU"