Commit 489cbe0
updated submodule
rskasturi committed Feb 8, 2024
2 parents f5a68e1 + b20d221 commit 489cbe0
Showing 2 changed files with 31 additions and 3 deletions.
load_models.py: 3 changes (2 additions & 1 deletion)

@@ -9,7 +9,8 @@
     LlamaForCausalLM,
     LlamaTokenizer,
 )
-import intel_extension_for_pytorch as ipex
+# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
+#import intel_extension_for_pytorch as ipex
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH


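A guarded import expresses the same intent as the hand-uncommented line above without requiring users to edit the file. This is a minimal sketch, not part of this commit, and it assumes the standard intel_extension_for_pytorch package name:

# Minimal sketch (not part of this commit): import IPEX only when it is
# installed, so CPU-only environments keep working without source edits.
try:
    import intel_extension_for_pytorch as ipex  # noqa: F401
    HAS_IPEX = True
except ImportError:
    HAS_IPEX = False

Code that needs the extension can then branch on HAS_IPEX instead of relying on a commented-out import.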
run_localGPT.py: 31 changes (29 additions & 2 deletions)

@@ -8,7 +8,9 @@
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler  # for streaming response
 from langchain.callbacks.manager import CallbackManager
 from langchain.llms.base import LLM
-import intel_extension_for_pytorch as ipex
+
+# Uncomment the line below if you have an Intel discrete GPU with XPU support.
+#import intel_extension_for_pytorch as ipex
 
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
@@ -21,6 +23,18 @@
     pipeline,
 )
 
+import warnings
+
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="intel_extension_for_pytorch"
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
+)
+warnings.filterwarnings(
+    "ignore", category=UserWarning, module="transformers"
+)
+
 from load_models import (
     load_quantized_model_gguf_ggml,
     load_quantized_model_qptq,
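The filters added above silence UserWarnings globally for the named modules. A narrower alternative, sketched here only for comparison and not part of this commit, is to suppress warnings just around the import that emits them:

import warnings

# Sketch (not part of this commit): scope the suppression to the noisy import
# instead of filtering UserWarning for the whole process.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    import transformers  # noqa: F401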
@@ -66,7 +80,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
         else:
             model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
     else:
-        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
+        if device_type == "xpu":
+            class CustomLLM(LLM):
+                def _call(self, prompt, stop=None, run_manager=None) -> str:
+                    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+                    result = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS)
+                    result = tokenizer.decode(result[0])
+                    return result
+                @property
+                def _llm_type(self) -> str:
+                    return "custom"
+
+            llm = CustomLLM()
+            return llm
 
 class CustomLLM(LLM):
     def _call(self, prompt, stop=None, run_manager=None) -> str:
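The XPU branch added above wraps the freshly loaded model and tokenizer in a small LangChain LLM subclass, so the surrounding pipeline can treat the Intel GPU path like any other backend. The following standalone sketch of the same pattern is not the committed code; the model id is a hypothetical placeholder and it runs on CPU for illustration:

from langchain.llms.base import LLM
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # hypothetical small model, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

class WrappedLLM(LLM):
    """Same wrapper pattern as the commit's CustomLLM: tokenize, generate, decode."""

    def _call(self, prompt, stop=None, run_manager=None) -> str:
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output = model.generate(input_ids=input_ids, max_new_tokens=64)
        return tokenizer.decode(output[0])

    @property
    def _llm_type(self) -> str:
        return "custom"

llm = WrappedLLM()
print(llm("Intel discrete GPUs are"))  # legacy LangChain call style

Defining the subclass inside load_model, as the commit does, lets _call close over the model and tokenizer that were just loaded, at the cost of redefining the class on every call to load_model.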
