From c4412f8ee856ef78b2c0c0d339583449b16b8ca3 Mon Sep 17 00:00:00 2001
From: JieguangZhou
Date: Sat, 3 Feb 2024 00:47:04 +0800
Subject: [PATCH] Update LLM Finetune documentation

---
 docs/hr/content/docs/ai_integrations/llm.md | 278 +++++++++++++++++++-
 examples/llm_finetune.py                    | 192 ++++----------
 2 files changed, 326 insertions(+), 144 deletions(-)

diff --git a/docs/hr/content/docs/ai_integrations/llm.md b/docs/hr/content/docs/ai_integrations/llm.md
index db0b05540..c53440b49 100644
--- a/docs/hr/content/docs/ai_integrations/llm.md
+++ b/docs/hr/content/docs/ai_integrations/llm.md
@@ -32,7 +32,6 @@ Insert example datas
 ```python
 from superduperdb.backends.mongodb.query import Collection
 from superduperdb.base.document import Document
-from superduperdb.components.listener import Listener
 
 datas = [Document({"question": f"1+{i}=", "id": str(i)}) for i in range(3)]
 db.execute(Collection('docs').insert_many(datas))
@@ -107,6 +106,25 @@ Parameters defined in `model.predict` will override those in `inference_kwargs`,
 ## Support Framework/API
 
+### Transformers
+
+[Transformers](https://huggingface.co/docs/transformers/index) is a popular AI framework, and we have incorporated native support for it to provide essential Large Language Model (LLM) capabilities.
+
+You can quickly use LLM capabilities with the following Python snippet:
+
+```python
+from superduperdb.ext.llm import LLM
+llm = LLM(model_name_or_path="facebook/opt-350m")
+llm.predict("What are we having for dinner?")
+```
+
+The model can be configured with the following parameters:
+
+- bits: quantization bits, ranging from 4 to 8; the default is None.
+- adapter_id: an adapter to add to the base model for inference.
+- model_kwargs: a dictionary; all model_kwargs will be passed to transformers.AutoModelForCausalLM.from_pretrained. You can provide parameters such as trust_remote_code=True.
+- tokenizer_kwargs: a dictionary; all tokenizer_kwargs will be passed to transformers.AutoTokenizer.from_pretrained.
+
 ### vLLM
 
 [vLLM](https://docs.vllm.ai/en/latest/) is a fast and easy-to-use library for LLM inference and serving.
@@ -268,6 +286,260 @@ class CustomModel(BaseLLMAPI):
 
 
-## Training
+## Fine-tuning
+
+SuperDuperDB currently offers convenient support for model fine-tuning.
+
+### Quick Start
+
+We can quickly run a fine-tuning example that uses QLoRA to fine-tune the Mistral-7B model.
+
+**Install Dependencies**
+```bash
+pip install transformers torch datasets peft bitsandbytes
+```
+
+**Training Script**
+
+```python
+from superduperdb import superduper
+from superduperdb.backends.mongodb import Collection
+from superduperdb.base.document import Document
+from superduperdb.ext.llm import LLM
+from superduperdb.ext.llm.model import LLMTrainingConfiguration
+
+from datasets import load_dataset
+
+model = "mistralai/Mistral-7B-v0.1"
+dataset_name = "timdettmers/openassistant-guanaco"
+
+db = superduper("mongomock://test_llm")
+dataset = load_dataset(dataset_name)
+train_dataset = dataset["train"]
+eval_dataset = dataset["test"]
+
+train_documents = [
+    Document({"text": example["text"], "_fold": "train"})
+    for example in train_dataset
+]
+eval_documents = [
+    Document({"text": example["text"], "_fold": "valid"})
+    for example in eval_dataset
+]
+
+db.execute(Collection("datas").insert_many(train_documents))
+db.execute(Collection("datas").insert_many(eval_documents))
+
+llm = LLM(
+    identifier="llm-finetune",
+    bits=4,
+    model_name_or_path=model,
+)
+
+training_configuration = LLMTrainingConfiguration(
+    identifier="llm-finetune-training-config",
+    output_dir="output/llm-finetune",
+    overwrite_output_dir=True,
+    num_train_epochs=1,
+    save_total_limit=5,
+    logging_steps=10,
+    evaluation_strategy="steps",
+    fp16=True,
+    eval_steps=100,
+    save_steps=100,
+    per_device_train_batch_size=1,
+    per_device_eval_batch_size=1,
+    gradient_accumulation_steps=4,
+    max_length=512,
+    use_lora=True,
+)
+
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+)
+
+prompt = "### Human: Who are you? ### Assistant: "
+
+# The LoRA adapter is loaded automatically for prediction; the latest checkpoint is used by default.
+print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
+```
+
+This script can be found in [`llm_finetune.py`](https://github.com/SuperDuperDB/superduperdb/blob/main/examples/llm_finetune.py).
+
+**Running Training**
+You can execute training by running `python examples/llm_finetune.py`.
+
+If you have multiple GPUs, training will automatically use Ray for multi-GPU training.
+
+> If you encounter `ImportError: cannot import name 'ExtensionArrayFormatter' from 'pandas.io.formats.format'` while using multiple GPUs, please downgrade the Pandas version with the following command:
+>
+> ```shell
+> pip install 'pandas<=2.1.4'
+> ```
+
+**Model Usage**
+Apart from loading and using the model directly at the end of the script, you can also use the model in other programs, provided that you are connected to a real database rather than a mock database.
+
+```python
+llm = db.load("model", "llm-finetune")
+prompt = "### Human: Who are you? ### Assistant: "
+print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
+```
+
+### Supported Features
+
+**Training Methods**:
+
+- Full fine-tuning
+- LoRA fine-tuning
+
+**Parallel Training**:
+
+Parallel training is supported using Ray, with data parallelism as the default strategy. You can also pass DeepSpeed parameters to configure parallelism through the [DeepSpeed configuration](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero).
+
+- Multi-GPU fine-tuning
+- Multi-node fine-tuning
+
+**Remote Training**:
+You can perform remote training by providing a `ray_address`. If you have a Ray cluster with GPUs, you can connect to it from your local machine for training.
+
+### Training Configuration
+
+The training process consists of the following steps:
+
+1. Define the model.
+2. Define the training parameter configuration.
+3. Execute the training.
+
+#### Define the Model
+
+```python
+llm = LLM(
+    identifier="llm-finetune",
+    bits=4,
+    model_name_or_path=model,
+)
+```
+
+The `LLM` class and its parameters are described in the Transformers section above.
+
+#### Define the Training Parameter Configuration
+
+```python
+training_configuration = LLMTrainingConfiguration(
+    identifier="llm-finetune-training-config",
+    output_dir="output/llm-finetune",
+    ...
+)
+```
+
+The configuration inherits from Hugging Face's `transformers.TrainingArguments`, so in principle any parameter it supports can be used here.
+
+Additionally, some extra parameters are provided to support LLM fine-tuning scenarios.
+
+```
+use_lora (`bool`, *optional*, defaults to True):
+    Whether to use LoRA training.
+
+lora_r (`int`, *optional*, defaults to 8):
+    LoRA R dimension.
+
+lora_alpha (`int`, *optional*, defaults to 16):
+    LoRA alpha.
+
+lora_dropout (`float`, *optional*, defaults to 0.05):
+    LoRA dropout.
+
+lora_target_modules (`List[str]`, *optional*, defaults to None):
+    LoRA target modules. If None, they will be inferred automatically.
+
+lora_bias (`str`, *optional*, defaults to "none"):
+    LoRA bias.
+
+max_length (`int`, *optional*, defaults to 512):
+    Maximum source sequence length during training.
+
+log_to_db (`bool`, *optional*, defaults to True):
+    Whether to log training to the database.
+    If True, checkpoints will be logged to SuperDuperDB,
+    which requires that the Ray cluster can access the database.
+    If the cluster cannot access the database, set this to False.
+```

+#### Execute Training
+
+```python
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+)
+```
+
+By default, training is executed directly. However, if multiple GPUs are detected, training will be managed and performed in parallel using Ray.
+
+Additionally, you can manually configure Ray for training, either locally or on a remote Ray cluster.
+
+Three Ray-related parameters are available for configuration:
+
+##### on_ray (bool)
+
+Whether to perform training on Ray.
+
+##### ray_address (str)
+
+The address of the Ray cluster to connect to. If not provided, a Ray service will be started locally by default.
+
+##### ray_configs (dict)
+
+All ray_configs will be passed to [TorchTrainer](https://docs.ray.io/en/latest/train/api/doc/ray.train.torch.TorchTrainer.html), except for the following three fields, which are built automatically by SuperDuperDB:
+
+- train_loop_per_worker
+- train_loop_config
+- datasets
+
+For example, you can provide a configuration like this:
+
+```python
+from ray.train import RunConfig, ScalingConfig
+
+scaling_config = ScalingConfig(
+    num_workers=4,  # Number of GPUs you need
+    use_gpu=True,
+)
+
+run_config = RunConfig(
+    storage_path="s3://llm-test/llm-finetune",
+    name="llm-finetune-test100",
+)
+
+ray_configs = {
+    "scaling_config": scaling_config,
+    "run_config": run_config,
+}
+
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+    on_ray=True,
+    ray_address="ray://ray_cluster_ip:10001",
+    ray_configs=ray_configs,
+)
+```
+
+For information on how to configure Ray resources, please refer to the Ray documentation, for example:
+
+- [ScalingConfig](https://docs.ray.io/en/latest/train/api/doc/ray.train.ScalingConfig.html#ray.train.ScalingConfig)
+- [RunConfig](https://docs.ray.io/en/latest/train/api/doc/ray.train.RunConfig.html#ray.train.RunConfig)
 
-Coming soon...
diff --git a/examples/llm_finetune.py b/examples/llm_finetune.py index 4a24dba1c..87715615c 100644 --- a/examples/llm_finetune.py +++ b/examples/llm_finetune.py @@ -1,153 +1,63 @@ -import os - -import torch from datasets import load_dataset from superduperdb import superduper from superduperdb.backends.mongodb import Collection from superduperdb.base.document import Document -from superduperdb.ext.llm.model import LLM, LLMTrainingConfiguration - -prompt_template = ( - "Below is an instruction that describes a task," - "paired with an input that provides further context. " - "Write a response that appropriately completes the request." - "\n\n### Instruction:\n{x}\n\n### Response:\n{y}" +from superduperdb.ext.llm import LLM +from superduperdb.ext.llm.model import LLMTrainingConfiguration + +model = "mistralai/Mistral-7B-v0.1" +dataset_name = "timdettmers/openassistant-guanaco" + +db = superduper("mongomock://test_llm") +dataset = load_dataset(dataset_name) +train_dataset = dataset["train"] +eval_dataset = dataset["test"] + +train_documents = [ + Document({"text": example["text"], "_fold": "train"}) for example in train_dataset +] +eval_documents = [ + Document({"text": example["text"], "_fold": "valid"}) for example in eval_dataset +] + +db.execute(Collection("datas").insert_many(train_documents)) +db.execute(Collection("datas").insert_many(eval_documents)) + +llm = LLM( + identifier="llm-finetune", + bits=4, + model_name_or_path=model, ) -collection_name = "alpaca-gpt4-data-zh" - - -def prepare_datas(db, size): - datas = load_dataset("c-s-ale/alpaca-gpt4-data-zh")["train"].to_list()[:size] - - for data in datas: - if data["input"] is not None: - data["instruction"] = data["instruction"] + "\n" + data["input"] - data["text"] = prompt_template.format(x=data["instruction"], y=data["output"]) - - db.execute(Collection(collection_name).insert_many(list(map(Document, datas)))) - - -deepspeed = { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "zero_optimization": { - "stage": 2, - } -} - - -def train(db, model_identifier, model_name, output_dir): - # training - llm = LLM( - identifier=model_identifier, - # bits=4, - model_name_or_path=model_name, - ) - training_configuration = LLMTrainingConfiguration( - identifier="llm-finetune-training-config", - output_dir=output_dir, - overwrite_output_dir=True, - lora_r=8, - lora_alpha=16, - lora_dropout=0.05, - num_train_epochs=3, - # max_steps=10, - fp16=torch.cuda.is_available(), # mps don't support fp16 - per_device_train_batch_size=2, - per_device_eval_batch_size=1, - gradient_accumulation_steps=2, - evaluation_strategy="steps", - eval_steps=1, - save_strategy="steps", - save_steps=1, - save_total_limit=3, - learning_rate=2e-5, - weight_decay=0.0, - warmup_ratio=0.03, - lr_scheduler_type="cosine", - logging_strategy="steps", - logging_steps=5, - gradient_checkpointing=True, - report_to=[], - # deepspeed=deepspeed, - use_lora=True, - ) - - from ray.train import RunConfig, ScalingConfig - scaling_config = ScalingConfig( - num_workers=1, - # use_gpu=True, - ) - - run_config = RunConfig( - storage_path="s3://llm-test/llm-finetune", - name="llm-finetune-test", - ) - - ray_configs = { - "scaling_config": scaling_config, - "run_config": run_config, - } - - llm.fit( - X="text", - db=db, - select=Collection(collection_name).find(), - configuration=training_configuration, - prefetch_size=1000, - on_ray=True, - # ray_address="ray://ec2-3-90-217-206.compute-1.amazonaws.com:10001", - 
ray_configs=ray_configs, - ) - - -def inference(db, model_identifier, output_dir): - # inference - llm_base = db.load("model", model_identifier) - checkpoints = [ - checkpoint - for checkpoint in os.listdir(output_dir) - if checkpoint.startswith("checkpoint") - ] - db.add(llm_base) - for checkpoint in checkpoints: - llm_checkpoint = LLM( - identifier=checkpoint, - bits=4 if torch.cuda.is_available() else None, - adapter_id=os.path.join(output_dir, checkpoint), - model_name_or_path=llm_base.model_name_or_path, - ) - db.add(llm_checkpoint) - - datas = list(Collection(collection_name).find().execute(db)) - data = datas[3].content - print(data["text"]) - - prompt = prompt_template.format(x=data["instruction"], y="") - print("-" * 20, "\n") - print(prompt) - print("-" * 20, "\n") - - print("Base model:\n") - print(db.predict(llm_base.identifier, prompt, max_new_tokens=100)[0].content) +training_configuration = LLMTrainingConfiguration( + identifier="llm-finetune-training-config", + output_dir="output/llm-finetune", + overwrite_output_dir=True, + num_train_epochs=1, + save_total_limit=5, + logging_steps=10, + evaluation_strategy="steps", + fp16=True, + eval_steps=100, + save_steps=100, + per_device_train_batch_size=1, + per_device_eval_batch_size=1, + gradient_accumulation_steps=4, + max_length=512, + use_lora=True, +) - for checkpoint in checkpoints: - print("-" * 20, "\n") - print(f"Finetuned model-{checkpoint}:\n") - print(db.predict(checkpoint, prompt, max_new_tokens=100)[0].content) +llm.fit( + X="text", + select=Collection("datas").find(), + configuration=training_configuration, + db=db, +) -if __name__ == "__main__": - db = superduper("mongomock://localhost:27017/test-llm") - model = "facebook/opt-125m" - # model = "mistralai/Mistral-7B-Instruct-v0.2" - output_dir = "outputs/llm-finetune" +prompt = "### Human: Who are you? ### Assistant: " - db.drop(force=True) - prepare_datas(db, size=200) - train(db, "llm-finetune", model, output_dir) - inference(db, "llm-finetune", output_dir) +# Automatically load lora model for prediction, default use the latest checkpoint +print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
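+# A minimal follow-up sketch, assuming a connection to a persistent database
+# rather than the "mongomock://" store used above: the fine-tuned model saved
+# under the identifier "llm-finetune" can then be reloaded in another program
+# and used for inference (see the "Model Usage" section of the documentation):
+#
+#     llm = db.load("model", "llm-finetune")
+#     print(llm.predict(prompt, max_new_tokens=100, do_sample=True))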