diff --git a/benchmarking/debug.sh b/benchmarking/debug.sh index be2ace94c7..4a5741199e 100755 --- a/benchmarking/debug.sh +++ b/benchmarking/debug.sh @@ -17,7 +17,7 @@ reset make -j install # python ../inference/utils/download_hf_model.py $MODEL_NAME -# python ../inference/utils/download_peft_model.py $PEFT_MODEL_NAME +python ../inference/utils/download_peft_model.py $PEFT_MODEL_NAME export LEGION_BACKTRACE=1 diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index d6aa2b3f03..d48c1125e6 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -257,6 +257,19 @@ def __get_resource_path( else: raise ValueError(f"Invalid resource type {resource_type}") + def __is_empty_dir(self, folder: str) -> bool: + """Check whether a folder only contains the rev_sha.txt file + + Args: + folder (str): Path to the folder to check + + Returns: + bool: True if the folder is missing or contains only rev_sha.txt, False otherwise + """ + if not os.path.isdir(folder) or not os.path.exists(folder): + return True + return len(os.listdir(folder)) == 1 and "rev_sha.txt" in os.listdir(folder) + def __need_cache_refresh( self, model_name: str, resource_type: CachedResourceType ) -> bool: @@ -272,7 +285,8 @@ def __need_cache_refresh( """ resource_path = self.__get_resource_path(model_name, resource_type) ff_revision, latest_revision = self.__get_revision_hashes(self.model_name, resource_path) - if self.refresh_cache or not os.path.exists(resource_path) or ff_revision != latest_revision: + + if self.refresh_cache or not os.path.exists(resource_path) or self.__is_empty_dir(resource_path) or ff_revision != latest_revision: print( f"Refreshing {resource_type} in cache for model {model_name} at path {resource_path} ... 
) @@ -395,7 +409,7 @@ def download_and_convert_peft_model(hf_peft_model_id: str): weights_path = self.__get_resource_path( hf_peft_model_id.lower(), CachedResourceType.WEIGHTS ) - print(f"Opening {adapter_path}...") + adapter_path = os.path.join(adapter_path, "adapter_model.safetensors") with safe_open(adapter_path, framework="pt", device="cpu") as f: for tensor_name in f.keys(): tensor = f.get_tensor(tensor_name) diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu index 301f2255cd..e799ba2076 100644 --- a/src/ops/kernels/lora_linear_kernels.cu +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -208,7 +208,7 @@ void inference_kernel(LoraLinearMeta *m, assert(lora_config.trainable == bc->requestsInfo[i].finetuning_request && "Trainable flag mismatch"); int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; - assert(num_peft_tokens == bc->num_finetuning_tokens()); + // assert(num_peft_tokens == bc->num_finetuning_tokens()); // int max_peft_tokens = bc->requestsInfo[i].max_length; int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; LoraLinearWeight weight = m->peft_memory_manager->get_peft(