Split by rows instead of layers for llama.cpp multi-gpu #5435

Merged: 33 commits, Feb 5, 2024

Commits (33)
c3e0fcf  Merge pull request #4927 from oobabooga/dev (oobabooga, Dec 15, 2023)
443be39  Merge pull request #4937 from oobabooga/dev (oobabooga, Dec 15, 2023)
7be0983  Merge pull request #4961 from oobabooga/dev (oobabooga, Dec 17, 2023)
b28020a  Merge pull request #4980 from oobabooga/dev (oobabooga, Dec 18, 2023)
781367b  Merge pull request #4988 from oobabooga/dev (oobabooga, Dec 19, 2023)
71eb744  Merge pull request #5002 from oobabooga/dev (oobabooga, Dec 19, 2023)
5b791ca  Merge pull request #5005 from oobabooga/dev (oobabooga, Dec 19, 2023)
c1f78db  Merge pull request #5011 from oobabooga/dev (oobabooga, Dec 20, 2023)
489f4a2  Merge pull request #5012 from oobabooga/dev (oobabooga, Dec 20, 2023)
11288d1  Merge pull request #5022 from oobabooga/dev (oobabooga, Dec 20, 2023)
4b25acf  Merge pull request #5039 from oobabooga/dev (oobabooga, Dec 21, 2023)
af87609  Merge pull request #5073 from oobabooga/dev (oobabooga, Dec 25, 2023)
19d1374  Merge pull request #5078 from oobabooga/dev (oobabooga, Dec 25, 2023)
3fd7073  Merge pull request #5100 from oobabooga/dev (oobabooga, Dec 27, 2023)
3e3a66e  Merge pull request #5132 from oobabooga/dev (oobabooga, Dec 31, 2023)
3f28925  Merge pull request #5152 from oobabooga/dev (oobabooga, Jan 2, 2024)
c54d1da  Merge pull request #5163 from oobabooga/dev (oobabooga, Jan 4, 2024)
8ea3f31  Merge pull request #5181 from oobabooga/dev (oobabooga, Jan 5, 2024)
e169993  Merge pull request #5195 from oobabooga/dev (oobabooga, Jan 7, 2024)
ad1ff53  Merge pull request #5199 from oobabooga/dev (oobabooga, Jan 7, 2024)
2dc8db8  Merge pull request #5220 from oobabooga/dev (oobabooga, Jan 10, 2024)
61e4bfe  Merge pull request #5253 from oobabooga/dev (oobabooga, Jan 14, 2024)
d8c3a5b  Merge pull request #5266 from oobabooga/dev (oobabooga, Jan 14, 2024)
1343aa3  Merge pull request #5347 from oobabooga/dev (oobabooga, Jan 22, 2024)
837bd88  Merge pull request #5348 from oobabooga/dev (oobabooga, Jan 22, 2024)
e7a760e  Merge pull request #5379 from oobabooga/dev (oobabooga, Jan 26, 2024)
4f3fdf1  Merge pull request #5404 from oobabooga/dev (oobabooga, Jan 30, 2024)
fea8315  Change shared.py (Ph0rk0z, Feb 4, 2024)
572ab05  Update llamacpp_hf.py (Ph0rk0z, Feb 4, 2024)
b8c17a5  Update llamacpp_model.py (Ph0rk0z, Feb 4, 2024)
f35b723  Update loaders.py (Ph0rk0z, Feb 4, 2024)
2b21b86  Update ui.py (Ph0rk0z, Feb 4, 2024)
c86f447  Update ui_model_menu.py (Ph0rk0z, Feb 4, 2024)
modules/llamacpp_hf.py (2 additions & 1 deletion)

@@ -216,7 +216,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 'tensor_split': tensor_split_list,
 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
 'logits_all': shared.args.logits_all,
-'offload_kqv': not shared.args.no_offload_kqv
+'offload_kqv': not shared.args.no_offload_kqv,
+'split_mode': 1 if not shared.args.row_split else 2
 }

 Llama = llama_cpp_lib().Llama
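For context, the 'split_mode' value added above is passed straight through to llama-cpp-python's Llama constructor together with the other parameters. A minimal sketch of that call follows; the model path, layer count, and tensor-split proportions are placeholders, not values from this PR.

from llama_cpp import Llama

row_split = True  # what the new --row_split flag toggles

params = {
    'model_path': '/path/to/model.gguf',      # placeholder path
    'n_gpu_layers': -1,                       # offload all layers to the GPUs
    'tensor_split': [0.5, 0.5],               # proportion of the model per GPU
    'split_mode': 1 if not row_split else 2,  # 1 = split by layer, 2 = split by row
}

llm = Llama(**params)

Row splitting distributes the individual weight matrices across the GPUs, while the default layer splitting assigns whole layers to each device; per the new option's help text, which mode is faster depends on the cards in use.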
modules/llamacpp_model.py (2 additions & 1 deletion)

@@ -95,7 +95,8 @@ def from_pretrained(self, path):
 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base),
 'tensor_split': tensor_split_list,
 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
-'offload_kqv': not shared.args.no_offload_kqv
+'offload_kqv': not shared.args.no_offload_kqv,
+'split_mode': 1 if not shared.args.row_split else 2
 }

 result.model = Llama(**params)
modules/loaders.py (2 additions)

@@ -44,6 +44,7 @@
 'cpu',
 'numa',
 'no_offload_kqv',
+'row_split',
 'tensorcores',
 ],
 'llamacpp_HF': [
@@ -66,6 +67,7 @@
 'no_use_fast',
 'logits_all',
 'no_offload_kqv',
+'row_split',
 'tensorcores',
 'llamacpp_HF_info',
 ],
modules/shared.py (1 addition)

@@ -129,6 +129,7 @@
 group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
 group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
+group.add_argument('--row_split', action='store_true', help='Split multi-gpu by row instead of layer. Faster on some cards.')

 # ExLlama
 group = parser.add_argument_group('ExLlama')
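With the flag wired into the argument parser, a multi-GPU launch could look like the line below. This is an illustrative example only: it assumes the webui's existing server.py entry point and its --model, --n-gpu-layers, and --tensor_split options, and the model name and split proportions are placeholders.

python server.py --model mymodel.gguf --n-gpu-layers 999 --tensor_split 50,50 --row_split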
modules/ui.py (1 addition)

@@ -93,6 +93,7 @@ def list_model_elements():
 'numa',
 'logits_all',
 'no_offload_kqv',
+'row_split',
 'tensorcores',
 'hqq_backend',
 ]
modules/ui_model_menu.py (1 addition)

@@ -107,6 +107,7 @@ def create_ui():
 with gr.Column():
 shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
 shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
+shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split model by rows across GPUs. Improves performance on some cards.')
 shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
 shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
 shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')