Add support for numa and selecting main GPU in llama.cpp/hf
mint committed Sep 16, 2023
1 parent c39157f commit bab1491
Showing 6 changed files with 14 additions and 0 deletions.
2 changes: 2 additions & 0 deletions modules/llamacpp_hf.py
@@ -180,6 +180,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
    'seed': int(shared.args.llama_cpp_seed),
    'n_threads': shared.args.threads or None,
    'n_batch': shared.args.n_batch,
+   'numa': shared.args.numa,
+   'main_gpu': shared.args.main_gpu,
    'use_mmap': not shared.args.no_mmap,
    'use_mlock': shared.args.mlock,
    'mul_mat_q': shared.args.mul_mat_q,
2 changes: 2 additions & 0 deletions modules/llamacpp_model.py
@@ -74,6 +74,8 @@ def from_pretrained(self, path):
    'seed': int(shared.args.llama_cpp_seed),
    'n_threads': shared.args.threads or None,
    'n_batch': shared.args.n_batch,
+   'numa': shared.args.numa,
+   'main_gpu': shared.args.main_gpu,
    'use_mmap': not shared.args.no_mmap,
    'use_mlock': shared.args.mlock,
    'mul_mat_q': shared.args.mul_mat_q,
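
For orientation, the dictionaries above are the keyword arguments that the loaders hand to llama-cpp-python's Llama constructor, which accepts numa and main_gpu parameters. A minimal, self-contained sketch of that flow; the model path and values are placeholders, not taken from the commit:

    from llama_cpp import Llama  # llama-cpp-python

    # Placeholder values standing in for shared.args.*; only numa/main_gpu are new in this commit.
    params = {
        'model_path': 'models/example.gguf',  # hypothetical model file
        'n_threads': None,                    # None lets llama.cpp pick a thread count
        'n_batch': 512,
        'numa': True,       # attempt NUMA optimizations (thread pinning, mmap prefetch tweaks)
        'main_gpu': 0,      # which GPU llama.cpp treats as the primary one when offloading
        'use_mmap': True,
        'use_mlock': False,
    }

    llm = Llama(**params)
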
4 changes: 4 additions & 0 deletions modules/loaders.py
@@ -113,6 +113,8 @@
    'n_gpu_layers',
    'tensor_split',
    'n_batch',
+   'numa',
+   'main_gpu',
    'threads',
    'no_mmap',
    'low_vram',
@@ -129,6 +131,8 @@
    'n_gpu_layers',
    'tensor_split',
    'n_batch',
+   'numa',
+   'main_gpu',
    'threads',
    'no_mmap',
    'low_vram',
2 changes: 2 additions & 0 deletions modules/shared.py
@@ -125,6 +125,8 @@ def str2bool(v):
    # llama.cpp
    parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
    parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+   parser.add_argument('--numa', action='store_true', help='Enable NUMA support for systems with multiple processors.')
+   parser.add_argument('--main-gpu', type=int, default=0, help='Main GPU to use for llama.cpp.')
    parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
    parser.add_argument('--low-vram', action='store_true', help='Low VRAM Mode')
    parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
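
As a side note on the flag names: argparse maps --main-gpu to the attribute main_gpu, which is the name the loaders and UI read back. A standalone illustration, not the project's actual shared.py:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--numa', action='store_true')
    parser.add_argument('--main-gpu', type=int, default=0)

    args = parser.parse_args(['--numa', '--main-gpu', '1'])
    print(args.numa)      # True  (store_true flag)
    print(args.main_gpu)  # 1     (dashes become underscores in the attribute name)
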
2 changes: 2 additions & 0 deletions modules/ui.py
@@ -62,6 +62,8 @@ def list_model_elements():
    'threads',
    'n_batch',
    'no_mmap',
+   'numa',
+   'main_gpu',
    'mlock',
    'n_gpu_layers',
    'autograd',
2 changes: 2 additions & 0 deletions modules/ui_model_menu.py
@@ -144,6 +144,8 @@ def create_ui():
    shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=32768, step=256, label="n_ctx", value=shared.args.n_ctx)
    shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=96, value=shared.args.threads)
    shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
+   shared.gradio['numa'] = gr.Checkbox(label="numa support", value=shared.args.numa)

StoyanStAtanasov commented on Sep 24, 2023:

I would only add the long description:
info='Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing \'3\' to \'/proc/sys/vm/drop_caches\' as root.'
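
Applied to the new checkbox in modules/ui_model_menu.py, that suggestion would look roughly like the sketch below. It assumes the installed Gradio version supports the info keyword on gr.Checkbox (recent 3.x releases do), and the description is abridged here rather than repeating the full text quoted above:

    import gradio as gr

    # Sketch of the checkbox with the proposed helper text (abridged).
    shared.gradio['numa'] = gr.Checkbox(
        label="numa support",
        value=shared.args.numa,
        info='Attempt optimizations that help on some systems with non-uniform memory access (thread pinning per NUMA node, no mmap prefetch/readahead).'
    )
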

Ph0rk0z (repository owner) replied on Sep 24, 2023:

I mean, go ahead. I am not as good with docs as ooba is. More important would be adding the parameter from this post: ggerganov/llama.cpp#1437 (comment)

Turning off the balancing finally let it load more evenly across the processors. Check with numactl -H on your system to see how it is being used.
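
For reference, numactl -H prints the NUMA node layout (nodes, their CPUs, and memory). The same information can be read from sysfs on Linux; a small sketch, not part of the commit:

    import glob

    # Each /sys/devices/system/node/nodeN entry is one NUMA node (Linux only).
    nodes = sorted(glob.glob('/sys/devices/system/node/node[0-9]*'))
    print(f"NUMA nodes detected: {len(nodes)}")
    for node in nodes:
        with open(f"{node}/cpulist") as f:
            print(f"{node.rsplit('/', 1)[-1]}: CPUs {f.read().strip()}")
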

+   shared.gradio['main_gpu'] = gr.Number(label='Main GPU', value=shared.args.main_gpu)
    shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
    shared.gradio['low_vram'] = gr.Checkbox(label="low-vram", value=shared.args.low_vram)
    shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
