From ddb2cc97ab5d336bf201b83be60187f76db2026d Mon Sep 17 00:00:00 2001
From: ali asaria
Date: Fri, 9 Feb 2024 14:04:35 -0500
Subject: [PATCH] Update MLX integration to use new generate_step function
 signature (#3021)

---
 docs/mlx_integration.md      | 6 +++---
 fastchat/serve/mlx_worker.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/mlx_integration.md b/docs/mlx_integration.md
index b500207a6..21642d948 100644
--- a/docs/mlx_integration.md
+++ b/docs/mlx_integration.md
@@ -13,11 +13,11 @@ Note that for Apple Silicon Macs with less memory, smaller models (or quantized
 1. Install MLX.
 
    ```
-   pip install mlx-lm
+   pip install "mlx-lm>=0.0.6"
    ```
 
-2. When you launch a model worker, replace the normal worker (`fastchat.serve.model_worker`) with the MLX worker (`fastchat.serve.mlx_worker`).
+2. When you launch a model worker, replace the normal worker (`fastchat.serve.model_worker`) with the MLX worker (`fastchat.serve.mlx_worker`). Remember to launch a model worker after you have launched the controller ([instructions](../README.md))
 
    ```
-   python3 -m fastchat.serve.mlx_worker --model-path microsoft/phi-2
+   python3 -m fastchat.serve.mlx_worker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
    ```

diff --git a/fastchat/serve/mlx_worker.py b/fastchat/serve/mlx_worker.py
index 795529104..a7e85f848 100644
--- a/fastchat/serve/mlx_worker.py
+++ b/fastchat/serve/mlx_worker.py
@@ -124,7 +124,7 @@ async def generate_stream(self, params):
         )
 
         for i in range(max_new_tokens):
-            token = await run_in_threadpool(next, iterator)
+            (token, _) = await run_in_threadpool(next, iterator)
             if token == self.mlx_tokenizer.eos_token_id:
                 finish_reason = "stop"
                 break
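
For context, the worker change tracks an upstream API change in mlx-lm: `generate_step` now yields `(token, prob)` tuples instead of bare token ids, which is why the worker unpacks two values and why the docs pin `mlx-lm>=0.0.6`. A minimal sketch of the difference, assuming the mlx-lm generator API of that era; the model path and prompt are illustrative:

```python
# Sketch of the generate_step change this patch adapts to.
# Assumes mlx-lm >= 0.0.6; model path and prompt are illustrative.
import mlx.core as mx
from mlx_lm import load
from mlx_lm.utils import generate_step

model, tokenizer = load("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
prompt = mx.array(tokenizer.encode("Hello"))

# generate_step returns a generator that produces one token per step.
iterator = generate_step(prompt, model)

# Before the change, the generator yielded a bare token:
#   token = next(iterator)
# After the change, each step yields a (token, prob) tuple, so the caller
# unpacks it and discards the probability when only the token is needed:
token, _ = next(iterator)

# The worker compares the yielded token against the tokenizer's EOS id
# to decide when to stop streaming, mirroring the patched loop above.
if token == tokenizer.eos_token_id:
    print("model emitted end-of-sequence")
```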