Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove non-HF ExLlamaV2 loader #5431

Merged
merged 2 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions modules/LoRA.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
def add_lora_to_model(lora_names):
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
add_lora_autogptq(lora_names)
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader == ['ExLlamav2', 'ExLlamav2_HF']:
elif shared.model.__class__.__name__ in ['Exllamav2HF'] or shared.args.loader == ['ExLlamav2_HF']:
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
Expand All @@ -39,11 +39,7 @@ def add_lora_exllamav2(lora_names):
shared.model.loras = []
for lora_name in lora_names:
lora_path = get_lora_path(lora_name)
if shared.model.__class__.__name__ == 'Exllamav2Model':
lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
else:
lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))

lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))
shared.model.loras.append(lora)

shared.lora_names = lora_names
Expand Down
149 changes: 0 additions & 149 deletions modules/exllamav2.py

This file was deleted.

33 changes: 0 additions & 33 deletions modules/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,6 @@
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_attention',
Expand Down Expand Up @@ -204,29 +194,6 @@ def transformers_samplers():
'AutoAWQ': transformers_samplers(),
'QuIP#': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'ban_eos_token',
'add_bos_token',
'custom_token_bans',
'skip_special_tokens',
'auto_max_new_tokens',
},
'ExLlamav2_HF': {
'temperature',
'temperature_last',
Expand Down
12 changes: 3 additions & 9 deletions modules/logits.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded1 Select one in the Model tab.', previous

is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'

if use_samplers:
if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
if is_non_hf_llamacpp:
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
# sampling is all done in c for exllama, so it is really hard to hijack
# it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
Expand All @@ -31,20 +30,15 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return

scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
elif is_non_hf_llamacpp:
if is_non_hf_llamacpp:
tokens = shared.tokenizer.encode(prompt)
scores = shared.model.get_logits(tokens)[-1][-1]
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()

output = shared.model(input_ids=tokens)
scores = output['logits'][-1][-1]

Expand Down
8 changes: 0 additions & 8 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
'AutoAWQ': AutoAWQ_loader,
Expand Down Expand Up @@ -376,13 +375,6 @@ def AutoGPTQ_loader(model_name):
return modules.AutoGPTQ_loader.load_quantized(model_name)


def ExLlamav2_loader(model_name):
from modules.exllamav2 import Exllamav2Model

model, tokenizer = Exllamav2Model.from_pretrained(model_name)
return model, tokenizer


def ExLlamav2_HF_loader(model_name):
from modules.exllamav2_hf import Exllamav2HF

Expand Down
2 changes: 2 additions & 0 deletions modules/models_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def get_model_metadata(model):
if re.match(pat.lower(), model.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
if k == 'loader' and settings[pat][k] == 'ExLlamav2':
model_settings[k] = 'ExLlamav2_HF'

return model_settings

Expand Down
14 changes: 5 additions & 9 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')

# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
Expand Down Expand Up @@ -130,11 +130,11 @@
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')

# ExLlama
group = parser.add_argument_group('ExLlama')
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
Expand Down Expand Up @@ -248,11 +248,7 @@ def fix_loader_name(name):
return 'AutoGPTQ'
elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
return 'GPTQ-for-LLaMa'
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
return 'ExLlama'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2', 'exllama', 'ex-llama', 'ex_llama', 'exlama', 'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
return 'ExLlamav2_HF'
elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
return 'ctransformers'
Expand Down
11 changes: 6 additions & 5 deletions modules/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
Expand Down Expand Up @@ -120,10 +120,11 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')

if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
# The step below is necessary for llama.cpp, but may not be
# necessary for future loaders.
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if not add_bos_token:
Expand All @@ -134,7 +135,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
Expand Down
1 change: 0 additions & 1 deletion modules/ui_model_menu.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ def create_ui():
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')

with gr.Column():
Expand Down