Converting scripts called by train to use click (NVIDIA#605)
* Converting scripts called by train to use click

Signed-off-by: Dan McPherson <[email protected]>

* Converting scripts called by convert to use click

Signed-off-by: Dan McPherson <[email protected]>

---------

Signed-off-by: Dan McPherson <[email protected]>
danmcp authored Mar 18, 2024
1 parent 7af2491 commit 3e2563d
Showing 6 changed files with 471 additions and 462 deletions.
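For context, the change follows one pattern across all six files: an argparse parser assembled inside main() becomes a click command whose arguments and options are declared with decorators and passed to the function as plain parameters. The sketch below only illustrates that pattern; the "describe" command, its flag, and its behavior are hypothetical and do not appear in this commit.

# Illustrative only - mirrors the argparse -> click shape of this commit,
# but the "describe" command and its --dump flag are made up for the sketch.
from pathlib import Path
import argparse

import click


# Before (argparse): the parser is assembled inside the entry point and
# every value is read back from the parsed namespace.
def main(args_in=None):
    parser = argparse.ArgumentParser(description="Describe a model file")
    parser.add_argument("model", type=Path, help="model file to inspect")
    parser.add_argument("--dump", action="store_true", help="only show contents")
    args = parser.parse_args(args_in)
    print(args.model, args.dump)


# After (click): the interface is declared with decorators and each value
# arrives as an ordinary function parameter.
@click.command()
@click.argument("model", type=click.Path())
@click.option("--dump", is_flag=True, help="Only show contents")
def describe(model, dump):
    """Describe a model file."""
    click.echo(f"{model} {dump}")


if __name__ == "__main__":
    describe()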
213 changes: 111 additions & 102 deletions cli/llamacpp/llamacpp_convert_to_gguf.py
@@ -8,7 +8,6 @@
from dataclasses import dataclass
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
import argparse
import concurrent.futures
import enum
import faulthandler
@@ -28,6 +27,7 @@

# Third Party
from sentencepiece import SentencePieceProcessor
import click
import numpy as np

if "NO_LOCAL_GGUF" not in os.environ:
@@ -1604,166 +1604,175 @@ def do_dump_model(model_plus: ModelPlus) -> None:
)


def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(
description="Convert a LLaMA model to a GGML compatible file"
)
parser.add_argument(
"--awq-path", type=Path, help="Path to scale awq cache file", default=None
)
parser.add_argument(
"--dump",
action="store_true",
help="don't convert, just show what's in the model",
)
parser.add_argument(
"--dump-single",
action="store_true",
help="don't convert, just show what's in a single model file",
)
parser.add_argument(
"--vocab-only", action="store_true", help="extract only the vocab"
)
parser.add_argument(
"--outtype",
choices=output_choices,
help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
parser.add_argument(
"--vocab-dir",
type=Path,
help="directory containing tokenizer.model, if separate from model file",
)
parser.add_argument(
"--vocab-type",
help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)",
default="spm,hfft",
)
parser.add_argument(
"--outfile", type=Path, help="path to write to; default: based on input"
)
parser.add_argument(
"model",
type=Path,
help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)",
)
parser.add_argument(
"--ctx", type=int, help="model training context (default: based on input)"
)
parser.add_argument(
"--concurrency",
type=int,
help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
parser.add_argument(
"--big-endian",
action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"--pad-vocab",
action="store_true",
help="add pad tokens when model vocab expects more than tokenizer metadata provides",
)
parser.add_argument(
"--skip-unknown",
action="store_true",
help="skip unknown tensor names instead of failing",
)

args = parser.parse_args(args_in)
if args.awq_path:
@click.command()
@click.argument(
"model",
nargs=1,
type=click.Path(),
)
@click.option(
"--awq-path", type=click.Path(), help="Path to scale awq cache file", default=None
)
@click.option(
"--dump", is_flag=True, help="Don't convert, just show what's in the model"
)
@click.option(
"--dump-single",
is_flag=True,
help="Don't convert, just show what's in a single model file",
)
@click.option("--vocab-only", is_flag=True, help="Extract only the vocab")
# We currently only support Q8_0 output on little endian systems.
@click.option(
"--outtype",
type=click.Choice(
["f32", "f16", "q8_0"]
if np.uint32(1) == np.uint32(1).newbyteorder("<")
else ["f32", "f16"],
case_sensitive=True,
),
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
@click.option(
"--vocab-dir",
type=click.Path(),
help="Directory containing tokenizer.model, if separate from model file",
)
@click.option(
"--vocab-type",
help="Vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)",
default="spm,hfft",
)
@click.option(
"--outfile", type=click.Path(), help="Path to write to; default: based on input"
)
@click.option(
"--ctx", type=click.INT, help="Model training context (default: based on input)"
)
@click.option(
"--concurrency",
type=click.INT,
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
@click.option(
"--big-endian", is_flag=True, help="Model is executed on big endian machine"
)
@click.option(
"--pad-vocab",
is_flag=True,
help="add pad tokens when model vocab expects more than tokenizer metadata provides",
)
@click.option(
"--skip-unknown", is_flag=True, help="skip unknown tensor names instead of failing"
)
def convert_llama_to_gguf(
model,
awq_path,
dump,
dump_single,
vocab_only,
outtype,
vocab_dir,
vocab_type,
outfile,
ctx,
concurrency,
big_endian,
pad_vocab,
skip_unknown,
):
"""Convert a LLaMA model to a GGML compatible file"""
# click.argument doesn't seem to be returning a Path for model
if isinstance(model, str):
model = Path(model)
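# Note (not part of this diff): click.Path also accepts a path_type
# argument, e.g. type=click.Path(path_type=Path), which makes click hand
# back a pathlib.Path directly and would be an alternative to the manual
# conversion above.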
if awq_path:
sys.path.insert(1, str(Path(__file__).parent / "awq-py"))
# Third Party
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]

tmp_model_path = args.model / "weighted_model"
tmp_model_path = model / "weighted_model"
if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.")
else:
tmp_model_path.mkdir(parents=True, exist_ok=True)
print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
add_scale_weights(str(model), str(awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.")
args.model = tmp_model_path
model = tmp_model_path

if args.dump_single:
model_plus = lazy_load_file(args.model)
if dump_single:
model_plus = lazy_load_file(model)
do_dump_model(model_plus)
return

if not args.vocab_only:
model_plus = load_some_model(args.model)
if not vocab_only:
model_plus = load_some_model(model)
else:
model_plus = ModelPlus(
model={}, paths=[args.model / "dummy"], format="none", vocab=None
model={}, paths=[model / "dummy"], format="none", vocab=None
)

if args.dump:
if dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.big_endian:
if big_endian:
endianess = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
if ctx is None:
raise Exception(
"The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n"
)
params.n_ctx = args.ctx
params.n_ctx = ctx

if args.outtype:
if outtype:
params.ftype = {
"f32": GGMLFileType.AllF32,
"f16": GGMLFileType.MostlyF16,
"q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype]
}[outtype]

print(f"params = {params}")

model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_path = Path(vocab_dir or model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(
args.vocab_type.split(","), model_parent_path
vocab_type.split(","), model_parent_path
)

if args.vocab_only:
if not args.outfile:
if vocab_only:
if not outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
outfile = outfile
OutputFile.write_vocab_only(
outfile,
params,
vocab,
special_vocab,
endianess=endianess,
pad_vocab=args.pad_vocab,
pad_vocab=pad_vocab,
)
print(f"Wrote {outfile}")
return

if model_plus.vocab is not None and args.vocab_dir is None:
if model_plus.vocab is not None and vocab_dir is None:
vocab = model_plus.vocab

print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")

model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_model_names(model, params, skip_unknown)
ftype = pick_output_type(model, outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
outfile = outfile or default_outfile(model_plus.paths, ftype)

params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
@@ -1775,12 +1784,12 @@ def main(args_in: list[str] | None = None) -> None:
model,
vocab,
special_vocab,
concurrency=args.concurrency,
concurrency=concurrency,
endianess=endianess,
pad_vocab=args.pad_vocab,
pad_vocab=pad_vocab,
)
print(f"Wrote {outfile}")


if __name__ == "__main__":
main()
convert_llama_to_gguf()
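One practical consequence of exposing the converter as a click command is that it can be invoked in-process with click's test runner instead of spawning a subprocess. A minimal sketch, assuming the module is importable as cli.llamacpp.llamacpp_convert_to_gguf (import path inferred from the file location) and using an illustrative model path:

from click.testing import CliRunner

from cli.llamacpp.llamacpp_convert_to_gguf import convert_llama_to_gguf

# Run the command in-process; --dump-single is a real flag from the diff,
# the model path below is made up for the example.
runner = CliRunner()
result = runner.invoke(
    convert_llama_to_gguf, ["--dump-single", "models/7B/consolidated.00.pth"]
)
print(result.exit_code, result.output)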