diff --git a/docs/web_ui.md b/docs/web_ui.md
index da92b7f66..3e247f06f 100644
--- a/docs/web_ui.md
+++ b/docs/web_ui.md
@@ -8,7 +8,7 @@ Please follow [setup.md](setup.md) to setup the environment first.
 ## Start Web UI
 
 ```bash
-python -u inference/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
+python -u ui/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379"
 # Get urls from the log
 # Running on local URL: http://0.0.0.0:8080
 # Running on public URL: https://180cd5f7c31a1cfd3c.gradio.live
@@ -28,7 +28,7 @@ On the `Deployment` tab, you can choose a model to deploy, configure parameter `
 
 ## Chatbot
 
-On the `Inferenc` tab, you can now test the model by asking questions.
+On the `Inference` tab, you can now test the model by asking questions.
 
 ![webui3](https://github.com/intel/llm-on-ray/assets/9278199/f7b9dc79-92fe-4e75-85fa-2cf7f36bb58d)
diff --git a/finetune/finetune.py b/finetune/finetune.py
index b8e3747ff..90351e3a8 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -20,10 +20,9 @@
 from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
 
 import sys
-sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
-
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
 import common
-from finetune_config import FinetuneConfig
+from finetune.finetune_config import FinetuneConfig
 
 
 def get_accelerate_environment_variable(mode: str, config: Dict[str, Any]) -> dict:
@@ -167,9 +166,10 @@ def get_finetune_config():
 
 
 def main(external_config = None):
-    config = get_finetune_config()
-    if external_config is not None:
-        config.merge(external_config)
+    if not external_config:
+        config = get_finetune_config()
+    else:
+        config = external_config
     config["cwd"] = os.getcwd()
 
     num_training_workers = config["Training"].get("num_training_workers")
diff --git a/inference/api_server_openai.py b/inference/api_server_openai.py
index 34eca9889..45179d6c5 100644
--- a/inference/api_server_openai.py
+++ b/inference/api_server_openai.py
@@ -34,8 +34,8 @@
 import os
 
 from ray import serve
-from api_openai_backend.query_client import RouterQueryClient
-from api_openai_backend.router_app import Router, router_app
+from inference.api_openai_backend.query_client import RouterQueryClient
+from inference.api_openai_backend.router_app import Router, router_app
 
 
 def router_application(deployments):
diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py
index 98fbed0aa..c2bf14835 100644
--- a/inference/deepspeed_predictor.py
+++ b/inference/deepspeed_predictor.py
@@ -15,8 +15,7 @@
 from predictor import Predictor
 from utils import get_torch_dtype
-
-from inference_config import InferenceConfig, DEVICE_CPU, DEVICE_XPU, IPEX_PRECISION_BF16
+from inference.inference_config import InferenceConfig, DEVICE_CPU, DEVICE_XPU, IPEX_PRECISION_BF16
 
 
 class DSPipeline:
     def __init__(
diff --git a/inference/predictor.py b/inference/predictor.py
index fb0cc1ef3..8344ecf86 100644
--- a/inference/predictor.py
+++ b/inference/predictor.py
@@ -1,7 +1,7 @@
 import re
 import torch
 from transformers import AutoTokenizer, StoppingCriteriaList
-from inference_config import InferenceConfig
+from inference.inference_config import InferenceConfig
 from utils import max_input_len, StoppingCriteriaSub
 
 class Predictor:
diff --git a/inference/predictor_deployment.py b/inference/predictor_deployment.py
index bb2e7ad5e..440d071f9 100644
--- a/inference/predictor_deployment.py
+++ b/inference/predictor_deployment.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 #
 
+import os
+import sys
 import asyncio
 import functools
 from ray import serve
@@ -21,10 +23,10 @@
 from queue import Empty
 import torch
 from transformers import TextIteratorStreamer
-from inference_config import InferenceConfig
+from inference.inference_config import InferenceConfig
 from typing import Union
 from starlette.responses import StreamingResponse
-from api_openai_backend.openai_protocol import ModelResponse
+from inference.api_openai_backend.openai_protocol import ModelResponse
 
 
 @serve.deployment
@@ -35,7 +37,11 @@ def __init__(self, infer_conf: InferenceConfig):
         chat_processor_name = infer_conf.model_description.chat_processor
         prompt = infer_conf.model_description.prompt
         if chat_processor_name:
-            module = __import__("chat_process")
+            try:
+                module = __import__("chat_process")
+            except:
+                sys.path.append(os.path.dirname(__file__))
+                module = __import__("chat_process")
             chat_processor = getattr(module, chat_processor_name, None)
             if chat_processor is None:
                 raise ValueError(infer_conf.name + " deployment failed. chat_processor(" + chat_processor_name + ") does not exist.")
diff --git a/inference/serve.py b/inference/serve.py
index a76316d23..cdd4607f9 100644
--- a/inference/serve.py
+++ b/inference/serve.py
@@ -15,13 +15,13 @@
 #
 
 import ray
-from inference_config import ModelDescription, InferenceConfig, all_models
 import sys
 from utils import get_deployment_actor_options
 from pydantic_yaml import parse_yaml_raw_as
 from api_server_simple import serve_run
 from api_server_openai import openai_serve_run
 from predictor_deployment import PredictorDeployment
+from inference.inference_config import ModelDescription, InferenceConfig, all_models
 
 def get_deployed_models(args):
     # serve all pre-defined models, or model from MODEL_TO_SERVE env, if no model argument specified
diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py
index 942d2a26b..406400878 100644
--- a/inference/transformer_predictor.py
+++ b/inference/transformer_predictor.py
@@ -1,7 +1,7 @@
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
 from transformers import TextIteratorStreamer
-from inference_config import InferenceConfig, IPEX_PRECISION_BF16
+from inference.inference_config import InferenceConfig, IPEX_PRECISION_BF16
 from predictor import Predictor
 from utils import get_torch_dtype
diff --git a/inference/utils.py b/inference/utils.py
index 10210094c..3c7e47fe0 100644
--- a/inference/utils.py
+++ b/inference/utils.py
@@ -16,8 +16,7 @@
 from transformers import StoppingCriteria
 import torch
-
-from inference_config import InferenceConfig, DEVICE_CPU
+from inference.inference_config import InferenceConfig, DEVICE_CPU
 
 
 def get_deployment_actor_options(infer_conf: InferenceConfig):
     _ray_env_key = "env_vars"
diff --git a/inference/html_format.py b/ui/html_format.py
similarity index 100%
rename from inference/html_format.py
rename to ui/html_format.py
diff --git a/inference/ui_images/Picture1.png b/ui/images/Picture1.png
similarity index 100%
rename from inference/ui_images/Picture1.png
rename to ui/images/Picture1.png
diff --git a/inference/ui_images/Picture2.png b/ui/images/Picture2.png
similarity index 100%
rename from inference/ui_images/Picture2.png
rename to ui/images/Picture2.png
diff --git a/inference/ui_images/logo.png b/ui/images/logo.png
similarity index 100%
rename from inference/ui_images/logo.png
rename to ui/images/logo.png
diff --git a/inference/start_ui.py b/ui/start_ui.py
similarity index 98%
rename from inference/start_ui.py
rename to ui/start_ui.py
index 6d0d22525..63b3b7f27 100644
--- a/inference/start_ui.py
+++ b/ui/start_ui.py
@@ -15,17 +15,17 @@
 #
 
 import requests
-from inference_config import all_models, ModelDescription, Prompt
-from inference_config import InferenceConfig as FinetunedConfig
 import time
 import os
-from chat_process import ChatModelGptJ, ChatModelLLama
-import torch
-from predictor_deployment import PredictorDeployment
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+from inference.inference_config import all_models, ModelDescription, Prompt
+from inference.inference_config import InferenceConfig as FinetunedConfig
+from inference.chat_process import ChatModelGptJ, ChatModelLLama
+from inference.predictor_deployment import PredictorDeployment
 from ray import serve
 import ray
 import gradio as gr
-import sys
 import argparse
 from ray.tune import Stopper
 from ray.train.base_trainer import TrainingFailedError
@@ -34,7 +34,7 @@
 from ray.util import queue
 import paramiko
 from html_format import cpu_memory_html, ray_status_html, custom_css
-from typing import Any, Dict
+from typing import Dict
 from langchain.vectorstores import FAISS
 from langchain.embeddings import HuggingFaceEmbeddings
 from pyrecdp.LLM import TextPipeline
@@ -172,7 +172,7 @@ def model_generate(self, prompt, request_url, config):
         sample_input = {"text": prompt, "config": config, "stream": True}
         proxies = { "http": None, "https": None}
-        outputs = requests.post(request_url, proxies=proxies, json=[sample_input], stream=True)
+        outputs = requests.post(request_url, proxies=proxies, json=sample_input, stream=True)
         outputs.raise_for_status()
         for output in outputs.iter_content(chunk_size=None, decode_unicode=True):
             # remove context
@@ -479,7 +479,7 @@ def deploy_func(self, model_name: str, replica_num: int, cpus_per_worker_deploy:
 
         finetuned_deploy = finetuned.copy(deep=True)
         finetuned_deploy.device = 'cpu'
-        finetuned_deploy.precision = 'bf16'
+        finetuned_deploy.ipex.precision = 'bf16'
         finetuned_deploy.model_description.prompt.stop_words = stop_words
         finetuned_deploy.cpus_per_worker = cpus_per_worker_deploy
         # transformers 4.35 is needed for neural-chat-7b-v3-1, will be fixed later
@@ -581,7 +581,7 @@ def _init_ui(self):
         with gr.Blocks(css=custom_css,title=title) as gr_chat:
             head_content = """
                 Manage LLM Lifecycle
                 Fine-Tune LLMs using workflow on Ray, Deploy and Inference
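The recurring change in this patch is that intra-repository imports become package-qualified (`from inference.inference_config import ...` rather than `from inference_config import ...`), which only resolves when the repository root is on `sys.path`; `finetune/finetune.py` and `ui/start_ui.py` therefore add the parent directory themselves. A minimal sketch of that pattern, assuming a script that lives one directory below the repository root:

```python
import os
import sys

# Put the repository root (one level above this file) at the front of sys.path so
# that package-qualified imports such as `inference.inference_config` resolve even
# when the script is launched directly rather than as an installed package.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from inference.inference_config import InferenceConfig  # noqa: E402
```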
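`predictor_deployment.py` now wraps `__import__("chat_process")` in a try/except and retries after appending the file's own directory to `sys.path`, so the deployment works whether or not `inference/` is already importable. The same fallback can be factored into a helper; this is a sketch of the idea with a hypothetical function name, not code that ships in the repository:

```python
import importlib
import os
import sys


def import_sibling(module_name: str, anchor_file: str):
    """Import a module located next to `anchor_file`, adding that directory to
    sys.path only when a plain import fails (hypothetical helper)."""
    try:
        return importlib.import_module(module_name)
    except ImportError:
        sys.path.append(os.path.dirname(os.path.abspath(anchor_file)))
        return importlib.import_module(module_name)


# Usage mirroring PredictorDeployment.__init__:
# module = import_sibling("chat_process", __file__)
```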
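The `model_generate` fix in `ui/start_ui.py` changes the request body from a one-element list to a single JSON object and keeps the streamed read. A standalone sketch of that client pattern, with the endpoint URL left as a placeholder taken from the deployment log:

```python
import requests


def stream_generate(prompt: str, request_url: str, config: dict):
    """Yield generated text chunks from a deployed model endpoint as they arrive."""
    sample_input = {"text": prompt, "config": config, "stream": True}
    proxies = {"http": None, "https": None}
    outputs = requests.post(request_url, proxies=proxies, json=sample_input, stream=True)
    outputs.raise_for_status()
    for chunk in outputs.iter_content(chunk_size=None, decode_unicode=True):
        yield chunk
```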