Skip to content

Commit

Permalink
Merge pull request #5005 from oobabooga/dev
Browse files Browse the repository at this point in the history
Merge dev branch
  • Loading branch information
oobabooga authored Dec 19, 2023
2 parents 71eb744 + e83e6ce commit 5b791ca
Show file tree
Hide file tree
Showing 10 changed files with 73 additions and 25 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ List of command-line flags

| Flag | Description |
|-------------|-------------|
| `--tensorcores` | Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only. |
| `--n_ctx N_CTX` | Size of the prompt context. |
| `--threads` | Number of threads to use. |
| `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. |
Expand Down
6 changes: 3 additions & 3 deletions modules/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,13 +768,13 @@ def delete_character(name, instruct=False):

def jinja_template_from_old_format(params, verbose=False):
MASTER_TEMPLATE = """
{%- set found_item = false -%}
{%- set ns = namespace(found=false) -%}
{%- for message in messages -%}
{%- if message['role'] == 'system' -%}
{%- set found_item = true -%}
{%- set ns.found = true -%}
{%- endif -%}
{%- endfor -%}
{%- if not found_item -%}
{%- if not ns.found -%}
{{- '<|PRE-SYSTEM|>' + '<|SYSTEM-MESSAGE|>' + '<|POST-SYSTEM|>' -}}
{%- endif %}
{%- for message in messages %}
Expand Down
13 changes: 11 additions & 2 deletions modules/llamacpp_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,21 @@
except:
llama_cpp_cuda = None

try:
import llama_cpp_cuda_tensorcores
except:
llama_cpp_cuda_tensorcores = None


def llama_cpp_lib():
if (shared.args.cpu and llama_cpp is not None) or llama_cpp_cuda is None:
if shared.args.cpu and llama_cpp is not None:
return llama_cpp
else:
elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
return llama_cpp_cuda_tensorcores
elif llama_cpp_cuda is not None:
return llama_cpp_cuda
else:
return llama_cpp


class LlamacppHF(PreTrainedModel):
Expand Down
13 changes: 11 additions & 2 deletions modules/llamacpp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,21 @@
except:
llama_cpp_cuda = None

try:
import llama_cpp_cuda_tensorcores
except:
llama_cpp_cuda_tensorcores = None


def llama_cpp_lib():
if (shared.args.cpu and llama_cpp is not None) or llama_cpp_cuda is None:
if shared.args.cpu and llama_cpp is not None:
return llama_cpp
else:
elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
return llama_cpp_cuda_tensorcores
elif llama_cpp_cuda is not None:
return llama_cpp_cuda
else:
return llama_cpp


def ban_eos_logits_processor(eos_token, input_ids, logits):
Expand Down
4 changes: 3 additions & 1 deletion modules/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
'compress_pos_emb',
'cpu',
'numa',
'no_offload_kqv'
'no_offload_kqv',
'tensorcores',
],
'llamacpp_HF': [
'n_ctx',
Expand All @@ -65,6 +66,7 @@
'no_use_fast',
'logits_all',
'no_offload_kqv',
'tensorcores',
'llamacpp_HF_info',
],
'ExLlamav2_HF': [
Expand Down
1 change: 1 addition & 0 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
parser.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')

# llama.cpp
parser.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
Expand Down
1 change: 1 addition & 0 deletions modules/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def list_model_elements():
'numa',
'logits_all',
'no_offload_kqv',
'tensorcores',
'hqq_backend',
]
if is_torch_xpu_available():
Expand Down
3 changes: 2 additions & 1 deletion modules/ui_model_menu.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def create_ui():
shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# only works on Linux.')

with gr.Column():
shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
Expand All @@ -115,7 +117,6 @@ def create_ui():
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu)
shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
Expand Down
28 changes: 20 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.24+cpuavx2-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.24+cpuavx2-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"

# llama-cpp-python (CUDA, no tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"

# llama-cpp-python (CUDA, tensor cores)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.24+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"

# CUDA wheels
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
Expand Down Expand Up @@ -68,14 +88,6 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.24+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
Expand Down
Loading

0 comments on commit 5b791ca

Please sign in to comment.