Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump llama-cpp-python to 0.2.18 (2nd attempt) #4637

Merged
merged 2 commits into from
Nov 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,6 @@ Optionally, you can use the following command-line flags:
| `--mlock` | Force the system to keep the model in RAM. |
| `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. |
| `--tensor_split TENSOR_SPLIT` | Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17. |
| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default is 0 (random). |
| `--numa` | Activate NUMA task allocation for llama.cpp. |
| `--logits_all`| Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower. |
| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
Expand Down
11 changes: 5 additions & 6 deletions modules/llamacpp_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(self, model, path):
'n_tokens': self.model.n_tokens,
'input_ids': self.model.input_ids,
'scores': self.model.scores,
'ctx': self.model.ctx
'ctx': self.model._ctx
}

if shared.args.cfg_cache:
Expand All @@ -65,28 +65,28 @@ def save_cache(self):
'n_tokens': self.model.n_tokens,
'input_ids': self.model.input_ids,
'scores': self.model.scores,
'ctx': self.model.ctx
'ctx': self.model._ctx
})

def save_negative_cache(self):
self.llamacpp_cache_negative.update({
'n_tokens': self.model.n_tokens,
'input_ids': self.model.input_ids,
'scores': self.model.scores,
'ctx': self.model.ctx
'ctx': self.model._ctx
})

def load_cache(self):
self.model.n_tokens = self.llamacpp_cache['n_tokens']
self.model.input_ids = self.llamacpp_cache['input_ids']
self.model.scores = self.llamacpp_cache['scores']
self.model.ctx = self.llamacpp_cache['ctx']
self.model._ctx = self.llamacpp_cache['ctx']

def load_negative_cache(self):
self.model.n_tokens = self.llamacpp_cache_negative['n_tokens']
self.model.input_ids = self.llamacpp_cache_negative['input_ids']
self.model.scores = self.llamacpp_cache_negative['scores']
self.model.ctx = self.llamacpp_cache_negative['ctx']
self.model._ctx = self.llamacpp_cache_negative['ctx']

@property
def device(self) -> torch.device:
Expand Down Expand Up @@ -192,7 +192,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
params = {
'model_path': str(model_file),
'n_ctx': shared.args.n_ctx,
'seed': int(shared.args.llama_cpp_seed),
'n_threads': shared.args.threads or None,
'n_threads_batch': shared.args.threads_batch or None,
'n_batch': shared.args.n_batch,
Expand Down
10 changes: 5 additions & 5 deletions modules/llamacpp_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def from_pretrained(self, path):
params = {
'model_path': str(path),
'n_ctx': shared.args.n_ctx,
'seed': int(shared.args.llama_cpp_seed),
'n_threads': shared.args.threads or None,
'n_threads_batch': shared.args.threads_batch or None,
'n_batch': shared.args.n_batch,
Expand Down Expand Up @@ -144,15 +143,16 @@ def generate(self, prompt, state, callback=None):
max_tokens=state['max_new_tokens'],
temperature=state['temperature'],
top_p=state['top_p'],
top_k=state['top_k'],
repeat_penalty=state['repetition_penalty'],
presence_penalty=state['presence_penalty'],
frequency_penalty=state['frequency_penalty'],
presence_penalty=state['presence_penalty'],
repeat_penalty=state['repetition_penalty'],
top_k=state['top_k'],
stream=True,
seed=int(state['seed']) if state['seed'] != -1 else None,
tfs_z=state['tfs'],
mirostat_mode=int(state['mirostat_mode']),
mirostat_tau=state['mirostat_tau'],
mirostat_eta=state['mirostat_eta'],
stream=True,
logits_processor=logit_processors,
grammar=self.grammar
)
Expand Down
2 changes: 1 addition & 1 deletion modules/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@
'no_mmap',
'mlock',
'no_mul_mat_q',
'llama_cpp_seed',
'alpha_value',
'rope_freq_base',
'compress_pos_emb',
Expand Down Expand Up @@ -366,6 +365,7 @@
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
Expand Down
2 changes: 1 addition & 1 deletion modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@
parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
parser.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default is 0 (random).')
parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
parser.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
Expand Down Expand Up @@ -182,6 +181,7 @@
parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')

args = parser.parse_args()
Expand Down
1 change: 0 additions & 1 deletion modules/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def list_model_elements():
'n_gpu_layers',
'tensor_split',
'n_ctx',
'llama_cpp_seed',
'gpu_split',
'max_seq_len',
'compress_pos_emb',
Expand Down
1 change: 0 additions & 1 deletion modules/ui_model_menu.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def create_ui():
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
shared.gradio['llama_cpp_seed'] = gr.Number(label='Seed (0 for random)', value=shared.args.llama_cpp_seed)
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
Expand Down
32 changes: 16 additions & 16 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ bitsandbytes==0.41.1; platform_system != "Windows"
https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows"

# llama-cpp-python (CPU only, AVX2)
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.11/llama_cpp_python-0.2.11-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-manylinux_2_17_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.18/llama_cpp_python-0.2.18-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"

# CUDA wheels
https://github.com/jllllll/AutoGPTQ/releases/download/v0.5.1/auto_gptq-0.5.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
Expand Down Expand Up @@ -67,14 +67,14 @@ https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.2/flash_attn-2.3.2+cu122torch2.1cxx11abiFALSE-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.11+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp39-cp39-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.18+cu121-cp38-cp38-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
Expand Down
Loading