From 442a67dd767f34e5a8019dc33cc00162299b256e Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Wed, 1 Nov 2023 11:00:34 +0800
Subject: [PATCH 1/5] add itrex for cpu

Signed-off-by: yiliu30
---
 examples/models/gpt2/gpt2_woq.py     |  9 +++++++
 examples/models/llama2/llama2_woq.py |  9 +++++++
 src/xturing/config/__init__.py       | 11 +++++---
 src/xturing/engines/causal.py        | 40 +++++++++++++++++++---------
 src/xturing/models/causal.py         |  8 +++---
 src/xturing/utils/utils.py           | 29 ++++++++++++++++++++
 6 files changed, 87 insertions(+), 19 deletions(-)
 create mode 100644 examples/models/gpt2/gpt2_woq.py
 create mode 100644 examples/models/llama2/llama2_woq.py

diff --git a/examples/models/gpt2/gpt2_woq.py b/examples/models/gpt2/gpt2_woq.py
new file mode 100644
index 0000000..23d5008
--- /dev/null
+++ b/examples/models/gpt2/gpt2_woq.py
@@ -0,0 +1,9 @@
+# from xturing.datasets.instruction_dataset import InstructionDataset
+from xturing.models import BaseModel
+
+# Initializes the model: Quantize model with weight only algorithms and replace the linear with itrex's qbits_linear kernel
+model = BaseModel.create("gpt2_int8")
+
+# Once the model has been quantized, you can start doing inferences
+output = model.generate(texts=["Why LLM models are becoming so important?"])
+print(output)
\ No newline at end of file
diff --git a/examples/models/llama2/llama2_woq.py b/examples/models/llama2/llama2_woq.py
new file mode 100644
index 0000000..5ebbd74
--- /dev/null
+++ b/examples/models/llama2/llama2_woq.py
@@ -0,0 +1,9 @@
+# from xturing.datasets.instruction_dataset import InstructionDataset
+from xturing.models import BaseModel
+
+# Initializes the model: Quantize model with weight only algorithms and replace the linear with itrex's qbits_linear kernel
+model = BaseModel.create("llama2_int8")
+
+# Once the model has been quantized, you can start doing inferences
+output = model.generate(texts=["Why LLM models are becoming so important?"])
+print(output)
\ No newline at end of file
diff --git a/src/xturing/config/__init__.py b/src/xturing/config/__init__.py
index 56820fb..8331e15 100644
--- a/src/xturing/config/__init__.py
+++ b/src/xturing/config/__init__.py
@@ -1,6 +1,10 @@
 import torch
 
 from xturing.utils.interactive import is_interactive_execution
+from xturing.utils.logging import configure_logger
+from xturing.utils.utils import assert_install_itrex
+
+logger = configure_logger(__name__)
 
 # check if cuda is available, if not use cpu and throw warning
 DEFAULT_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -8,8 +12,9 @@
 IS_INTERACTIVE = is_interactive_execution()
 
 if DEFAULT_DEVICE.type == "cpu":
-    print("WARNING: CUDA is not available, using CPU instead, can be very slow")
+    logger.warning("WARNING: CUDA is not available, using CPU instead, try running the model with Itrex.")
 
 
-def assert_not_cpu_int8():
-    assert DEFAULT_DEVICE.type != "cpu", "Int8 models are not supported on CPU"
+def assert_cpu_int8_on_itrex():
+    if DEFAULT_DEVICE.type == "cpu":
+        assert_install_itrex()
\ No newline at end of file
diff --git a/src/xturing/engines/causal.py b/src/xturing/engines/causal.py
index 64d068e..5e7d40f 100644
--- a/src/xturing/engines/causal.py
+++ b/src/xturing/engines/causal.py
@@ -20,6 +20,7 @@
 from xturing.engines.quant_utils.peft_utils import LoraConfig as peftLoraConfig
 from xturing.engines.quant_utils.peft_utils import prepare_model_for_kbit_training
 from xturing.utils.loss_fns import CrossEntropyLoss
+from xturing.utils.utils import assert_install_itrex
 
 
 class CausalEngine(BaseEngine):
@@ -60,18 +61,33 @@ def __init__(
             self.tokenizer = tokenizer
         elif model_name is not None:
             if load_8bit:
-                device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=DEFAULT_DTYPE,
-                    load_in_8bit=True,
-                    device_map=device_map,
-                    trust_remote_code=trust_remote_code,
-                    **kwargs,
-                )
-                for param in self.model.parameters():
-                    param.data = param.data.contiguous()
-                self.model = prepare_model_for_int8_training(self.model)
+                use_itrex = DEFAULT_DEVICE.type == "cpu"
+                # CUDA is not available, using CPU instead, running the model with itrex
+                if use_itrex:
+                    assert_install_itrex()
+                    # quantize model with weight-only quantization
+                    from intel_extension_for_transformers.transformers import AutoModelForCausalLM as ItrexAutoModelForCausalLM
+                    from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
+                    woq_config = WeightOnlyQuantConfig(weight_dtype='int8')
+                    self.model = ItrexAutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        quantization_config=woq_config,
+                        trust_remote_code=trust_remote_code,
+                        use_llm_runtime=False,  # TODO: disable llm runtime for gpt2; remove it later
+                        **kwargs)
+                else:
+                    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        torch_dtype=DEFAULT_DTYPE,
+                        load_in_8bit=True,
+                        device_map=device_map,
+                        trust_remote_code=trust_remote_code,
+                        **kwargs,
+                    )
+                    for param in self.model.parameters():
+                        param.data = param.data.contiguous()
+                    self.model = prepare_model_for_int8_training(self.model)
             else:
                 self.model = AutoModelForCausalLM.from_pretrained(
                     model_name,
diff --git a/src/xturing/models/causal.py b/src/xturing/models/causal.py
index 62bb274..3683999 100644
--- a/src/xturing/models/causal.py
+++ b/src/xturing/models/causal.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 from transformers import BatchEncoding
 
-from xturing.config import DEFAULT_DEVICE, assert_not_cpu_int8
+from xturing.config import DEFAULT_DEVICE, assert_cpu_int8_on_itrex
 from xturing.config.config_data_classes import FinetuningConfig, GenerationConfig
 from xturing.config.read_config import load_config
 from xturing.datasets.instruction_dataset import InstructionDataset
@@ -320,7 +320,7 @@ def __init__(
         model_name: Optional[str] = None,
         **kwargs,
     ):
-        assert_not_cpu_int8()
+        assert_cpu_int8_on_itrex()
         super().__init__(
             engine,
             weights_path=weights_path,
@@ -376,7 +376,7 @@ def __init__(
         target_modules: Optional[List[str]] = None,
         **kwargs,
     ):
-        assert_not_cpu_int8()
+        assert_cpu_int8_on_itrex()
         super().__init__(
             engine,
             weights_path=weights_path,
@@ -395,7 +395,7 @@ def __init__(
         target_modules: Optional[List[str]] = None,
         **kwargs,
     ):
-        assert_not_cpu_int8()
+        assert_cpu_int8_on_itrex()
         super().__init__(
             engine,
             weights_path=weights_path,
diff --git a/src/xturing/utils/utils.py b/src/xturing/utils/utils.py
index a1bc22a..c8af938 100644
--- a/src/xturing/utils/utils.py
+++ b/src/xturing/utils/utils.py
@@ -150,3 +150,32 @@ def _index_samples(samples: List[Any], logger: logging.Logger):
     logger.info(f"Evaluating {len(indices)} samples")
     work_items = [(samples[i], i) for i in indices]
     return work_items
+
+
+def is_itrex_available():
+    """
+    Check the availability of 'intel_extension_for_transformers' as an optional dependency.
+
+    If the package is not installed, attempt to install it via pip before
+    giving up.
+
+    Returns:
+        bool: True if 'intel_extension_for_transformers' is available, False otherwise.
+    """
+    import importlib.util
+    if importlib.util.find_spec("intel_extension_for_transformers") is not None:
+        return True
+    else:
+        try:
+            import subprocess
+            import sys
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "intel-extension-for-transformers"])
+            return importlib.util.find_spec("intel_extension_for_transformers") is not None
+        except subprocess.CalledProcessError:
+            return False
+
+def assert_install_itrex():
+    assert is_itrex_available(), (
+        "To run int8 or k-bit models on CPU, please install the `intel-extension-for-transformers` package. "
+        "You can install it with `pip install intel-extension-for-transformers`."
+    )
\ No newline at end of file
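
Note: the CPU branch added above reduces to the following standalone sketch.
It is illustrative only: the import paths and keyword arguments are taken
from the diff itself, and "gpt2" stands in for any causal LM id.

    # Weight-only int8 quantization via ITREX, mirroring the patch above.
    from intel_extension_for_transformers.transformers import (
        AutoModelForCausalLM,
        WeightOnlyQuantConfig,
    )

    woq_config = WeightOnlyQuantConfig(weight_dtype="int8")
    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",                          # placeholder model id
        quantization_config=woq_config,
        use_llm_runtime=False,           # same flag the patch passes
    )
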
From c53c79bbdd18f21a8d1b3368cf88b39eb42c4be2 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Wed, 1 Nov 2023 11:14:26 +0800
Subject: [PATCH 2/5] add ut

Signed-off-by: yiliu30
---
 tests/xturing/models/test_gpt2_model.py | 28 ++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/xturing/models/test_gpt2_model.py b/tests/xturing/models/test_gpt2_model.py
index 3ad29d2..5db2702 100644
--- a/tests/xturing/models/test_gpt2_model.py
+++ b/tests/xturing/models/test_gpt2_model.py
@@ -101,3 +101,31 @@ def test_saving_loading_model_lora():
 
     model2 = BaseModel.load(str(saving_path))
     model2.generate(texts=["Why are the LLM so important?"])
+
+
+import os
+
+def disable_cuda(func):
+    def wrapper(*args, **kwargs):
+        # Save the current value of CUDA_VISIBLE_DEVICES
+        original_cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
+        # Set CUDA_VISIBLE_DEVICES to -1 to disable CUDA
+        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+        try:
+            # Call the decorated function
+            return func(*args, **kwargs)
+        finally:
+            # Restore the original value of CUDA_VISIBLE_DEVICES
+            if original_cuda_visible_devices is not None:
+                os.environ['CUDA_VISIBLE_DEVICES'] = original_cuda_visible_devices
+            else:
+                # If CUDA_VISIBLE_DEVICES was not set before, remove it from the environment
+                if 'CUDA_VISIBLE_DEVICES' in os.environ:
+                    del os.environ['CUDA_VISIBLE_DEVICES']
+    return wrapper
+
+
+@disable_cuda
+def test_gpt2_int8_woq_cpu():
+    other_model = BaseModel.create("gpt2_int8")
+    assert other_model.generate(texts="I want to") != ""
\ No newline at end of file
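
Note: pytest's built-in monkeypatch fixture could replace the hand-rolled
decorator above, since it restores CUDA_VISIBLE_DEVICES automatically when
the test finishes. A minimal sketch, assuming pytest is the runner for this
suite, with the same caveat as the decorator: the variable must be set
before CUDA is first initialized.

    def test_gpt2_int8_woq_cpu(monkeypatch):
        # Hide all CUDA devices so DEFAULT_DEVICE resolves to "cpu"
        monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "-1")
        model = BaseModel.create("gpt2_int8")
        assert model.generate(texts="I want to") != ""
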
+ """ + import importlib + if importlib.util.find_spec("intel_extension_for_transformers") is not None: + return True + else: + try: + import subprocess + import sys + subprocess.check_call([sys.executable, "-m", "pip", "install", "intel-extension-for-transformers"]) + return importlib.util.find_spec("intel_extension_for_transformers") is not None + except: + return False + +def assert_install_itrex(): + assert is_itrex_available(), ( + "To run int8 or k-bits model on cpu, please install the `intel-extension-for-transformers` package." + "You can install it with `pip install intel-extension-for-transformers`." + ) \ No newline at end of file From c53c79bbdd18f21a8d1b3368cf88b39eb42c4be2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 1 Nov 2023 11:14:26 +0800 Subject: [PATCH 2/5] add ut Signed-off-by: yiliu30 --- tests/xturing/models/test_gpt2_model.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/xturing/models/test_gpt2_model.py b/tests/xturing/models/test_gpt2_model.py index 3ad29d2..5db2702 100644 --- a/tests/xturing/models/test_gpt2_model.py +++ b/tests/xturing/models/test_gpt2_model.py @@ -101,3 +101,27 @@ def test_saving_loading_model_lora(): model2 = BaseModel.load(str(saving_path)) model2.generate(texts=["Why are the LLM so important?"]) + + +import os + +def disable_cuda(func): + def wrapper(*args, **kwargs): + # Save the current value of CUDA_VISIBLE_DEVICES + original_cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None) + # Set CUDA_VISIBLE_DEVICES to -1 to disable CUDA + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + try: + # Call the decorated function + return func(*args, **kwargs) + finally: + # Restore the original value of CUDA_VISIBLE_DEVICES + if original_cuda_visible_devices is not None: + os.environ['CUDA_VISIBLE_DEVICES'] = original_cuda_visible_devices + else: + + +@disable_cuda +def test_gpt2_int8_woq_cpu(): + other_model = BaseModel.create("gpt2_int8") + assert other_model.generate(texts="I want to") != "" \ No newline at end of file From 6d563f781020b07322bad7c6b39811140d0f7097 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 1 Nov 2023 12:37:02 +0800 Subject: [PATCH 3/5] revert change for qlora Signed-off-by: yiliu30 --- examples/models/gpt2/gpt2_woq.py | 5 +++-- examples/models/llama2/llama2_woq.py | 5 +++-- src/xturing/config/__init__.py | 5 ++++- src/xturing/engines/causal.py | 5 ++++- src/xturing/models/causal.py | 6 +++--- tests/xturing/models/test_gpt2_model.py | 8 ++++++++ 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/models/gpt2/gpt2_woq.py b/examples/models/gpt2/gpt2_woq.py index 23d5008..0539f45 100644 --- a/examples/models/gpt2/gpt2_woq.py +++ b/examples/models/gpt2/gpt2_woq.py @@ -1,9 +1,10 @@ # from xturing.datasets.instruction_dataset import InstructionDataset from xturing.models import BaseModel -# Initializes the model: Quantize model with weight only algorithms and replace the linear with itrex's qbits_linear kernel +# Initializes the model: Quantize model with weight only algorithms and +# replace the linear with itrex's qbits_linear kernel model = BaseModel.create("gpt2_int8") -# Once the model has been quantized, you can start doing inferences +# Once the model has been quantized, you can do inferences directly output = model.generate(texts=["Why LLM models are becoming so important?"]) print(output) \ No newline at end of file diff --git a/examples/models/llama2/llama2_woq.py b/examples/models/llama2/llama2_woq.py index 5ebbd74..f6cda94 100644 --- 
From fbc3558ca1608cb636989109d487ef050e1f2580 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Wed, 1 Nov 2023 12:54:32 +0800
Subject: [PATCH 4/5] add more log

Signed-off-by: yiliu30
---
 src/xturing/engines/causal.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/xturing/engines/causal.py b/src/xturing/engines/causal.py
index 2fed0f1..e8cd001 100644
--- a/src/xturing/engines/causal.py
+++ b/src/xturing/engines/causal.py
@@ -65,8 +65,8 @@ def __init__(
         elif model_name is not None:
             if load_8bit:
                 use_itrex = DEFAULT_DEVICE.type == "cpu"
-                logger.info("CUDA is not available, using CPU instead, running the model with itrex.")
                 if use_itrex:
+                    logger.info("CUDA is not available; using CPU and running the model with ITREX.")
                     assert_install_itrex()
                     # quantize model with weight-only quantization
                     from intel_extension_for_transformers.transformers import AutoModelForCausalLM as ItrexAutoModelForCausalLM
@@ -78,8 +78,9 @@ def __init__(
                         model_name,
                         quantization_config=woq_config,
                         trust_remote_code=trust_remote_code,
                         use_llm_runtime=False,  # TODO: disable llm runtime for gpt2; remove it later
                         **kwargs)
+                    logger.info("Loaded int8 model with ITREX.")
                 else:
                     device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
                     self.model = AutoModelForCausalLM.from_pretrained(

From a9dbb28926df5ca750bc0ed8b5722dc7b05336fa Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Wed, 1 Nov 2023 13:13:00 +0800
Subject: [PATCH 5/5] remove comments

---
 src/xturing/engines/causal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xturing/engines/causal.py b/src/xturing/engines/causal.py
index e8cd001..8f6b7e8 100644
--- a/src/xturing/engines/causal.py
+++ b/src/xturing/engines/causal.py
@@ -76,7 +76,7 @@ def __init__(
                         model_name,
                         quantization_config=woq_config,
                         trust_remote_code=trust_remote_code,
-                        use_llm_runtime=False,  # TODO: disable llm runtime for gpt2; remove it later
+                        use_llm_runtime=False,
                         **kwargs)
                     logger.info("Loaded int8 model with ITREX.")
                 else:
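
Note: end to end, the series behaves as sketched below on a CPU-only
machine. CUDA must be hidden before torch/xturing are imported so that
DEFAULT_DEVICE resolves to "cpu" and the ITREX branch in CausalEngine is
taken; the model key comes from the examples above.

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # before any torch import

    from xturing.models import BaseModel

    model = BaseModel.create("gpt2_int8")  # weight-only int8 load via ITREX
    print(model.generate(texts=["Why LLM models are becoming so important?"]))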