From c4412f8ee856ef78b2c0c0d339583449b16b8ca3 Mon Sep 17 00:00:00 2001
From: JieguangZhou
Date: Sat, 3 Feb 2024 00:47:04 +0800
Subject: [PATCH] Update LLM Finetune documentation

---
 docs/hr/content/docs/ai_integrations/llm.md | 278 +++++++++++++++++++-
 examples/llm_finetune.py                    | 192 ++++----------
 2 files changed, 326 insertions(+), 144 deletions(-)

diff --git a/docs/hr/content/docs/ai_integrations/llm.md b/docs/hr/content/docs/ai_integrations/llm.md
index db0b05540..c53440b49 100644
--- a/docs/hr/content/docs/ai_integrations/llm.md
+++ b/docs/hr/content/docs/ai_integrations/llm.md
@@ -32,7 +32,6 @@ Insert example datas
 ```python
 from superduperdb.backends.mongodb.query import Collection
 from superduperdb.base.document import Document
-from superduperdb.components.listener import Listener
 
 datas = [Document({"question": f"1+{i}=", "id": str(i)}) for i in range(3)]
 db.execute(Collection('docs').insert_many(datas))
@@ -107,6 +106,25 @@ Parameters defined in `model.predict` will override those in `inference_kwargs`,
 ## Support Framework/API
 
+### Transformers
+
+[Transformers](https://huggingface.co/docs/transformers/index) is a popular AI framework, and we have incorporated native support for it to provide essential Large Language Model (LLM) capabilities.
+
+You can quickly use LLM capabilities with the following Python snippet:
+
+```python
+from superduperdb.ext.llm import LLM
+llm = LLM(model_name_or_path="facebook/opt-350m")
+llm.predict("What are we having for dinner?")
+```
+
+The model can be configured with the following parameters:
+
+- bits: quantization bits, ranging from 4 to 8; the default is None.
+- adapter_id: an adapter to add to the base model for inference.
+- model_kwargs: a dictionary; all model_kwargs will be passed to transformers.AutoModelForCausalLM.from_pretrained. You can provide parameters such as trust_remote_code=True.
+- tokenizer_kwargs: a dictionary; all tokenizer_kwargs will be passed to transformers.AutoTokenizer.from_pretrained.
+
 ### vLLM
 
 [vLLM](https://docs.vllm.ai/en/latest/) is a fast and easy-to-use library for LLM inference and serving.
@@ -268,6 +286,260 @@ class CustomModel(BaseLLMAPI):
 
 
-## Training
+## Fine-tuning
+
+SuperDuperDB currently offers convenient support for model fine-tuning.
+
+### Quick Start
+
+We can quickly run a fine-tuning example that uses QLoRA to fine-tune the Mistral-7B model.
+
+**Install Dependencies**
+```bash
+pip install transformers torch datasets peft bitsandbytes
+```
+
+**Training Script**
+
+```python
+from superduperdb import superduper
+from superduperdb.backends.mongodb import Collection
+from superduperdb.base.document import Document
+from superduperdb.ext.llm import LLM
+from superduperdb.ext.llm.model import LLMTrainingConfiguration
+
+from datasets import load_dataset
+
+model = "mistralai/Mistral-7B-v0.1"
+dataset_name = "timdettmers/openassistant-guanaco"
+
+db = superduper("mongomock://test_llm")
+dataset = load_dataset(dataset_name)
+train_dataset = dataset["train"]
+eval_dataset = dataset["test"]
+
+train_documents = [
+    Document({"text": example["text"], "_fold": "train"})
+    for example in train_dataset
+]
+eval_documents = [
+    Document({"text": example["text"], "_fold": "valid"})
+    for example in eval_dataset
+]
+
+db.execute(Collection("datas").insert_many(train_documents))
+db.execute(Collection("datas").insert_many(eval_documents))
+
+llm = LLM(
+    identifier="llm-finetune",
+    bits=4,
+    model_name_or_path=model,
+)
+
+training_configuration = LLMTrainingConfiguration(
+    identifier="llm-finetune-training-config",
+    output_dir="output/llm-finetune",
+    overwrite_output_dir=True,
+    num_train_epochs=1,
+    save_total_limit=5,
+    logging_steps=10,
+    evaluation_strategy="steps",
+    fp16=True,
+    eval_steps=100,
+    save_steps=100,
+    per_device_train_batch_size=1,
+    per_device_eval_batch_size=1,
+    gradient_accumulation_steps=4,
+    max_length=512,
+    use_lora=True,
+)
+
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+)
+
+prompt = "### Human: Who are you? ### Assistant: "
+
+# The LoRA adapter is loaded automatically for prediction; the latest checkpoint is used by default.
+print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
+```
+
+This script can be found in [`llm_finetune.py`](https://github.com/SuperDuperDB/superduperdb/blob/main/examples/llm_finetune.py).
+
+**Running Training**
+You can execute training by running `python examples/llm_finetune.py`.
+
+If you have multiple GPUs, training will automatically use Ray for multi-GPU training.
+
+> If you encounter `ImportError: cannot import name 'ExtensionArrayFormatter' from 'pandas.io.formats.format'` while using multiple GPUs, please downgrade the Pandas version with the following command:
+>
+> ```shell
+> pip install 'pandas<=2.1.4'
+> ```
+
+**Model Usage**
+Apart from loading and using the model directly at the end of the script, you can also use the model in other programs, provided that you are connected to a real database rather than a mock database.
+
+```python
+llm = db.load("model", "llm-finetune")
+prompt = "### Human: Who are you? ### Assistant: "
+print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
+```
+
+### Supported Features
+
+**Training Methods**:
+
+- Full fine-tuning
+- LoRA fine-tuning
+
+**Parallel Training**:
+
+Parallel training is supported using Ray, with data parallelism as the default strategy. You can also pass DeepSpeed parameters to configure parallelism through the [DeepSpeed configuration](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero).
+
+- Multi-GPU fine-tuning
+- Multi-node fine-tuning
+
+**Remote Training**:
+You can perform remote training by providing a `ray_address`. If you have a Ray cluster with GPUs, you can connect to it from your local machine for training.
+
+### Training Configuration
+
+The training process consists of the following steps:
+
+1. Define the model.
+2. Define the training parameter configuration.
+3. Execute the training.
+
+#### Define the Model
+
+```python
+llm = LLM(
+    identifier="llm-finetune",
+    bits=4,
+    model_name_or_path=model,
+)
+```
+
+The `LLM` class and its parameters are described in the Transformers section above.
+
+#### Define the Training Parameter Configuration
+
+```python
+training_configuration = LLMTrainingConfiguration(
+    identifier="llm-finetune-training-config",
+    output_dir="output/llm-finetune",
+    ...
+)
+```
+
+The configuration inherits from Hugging Face's `transformers.TrainingArguments`, so in principle any parameter it supports can be used here.
+
+Additionally, some extra parameters are provided to support LLM fine-tuning scenarios.
+
+```
+use_lora (`bool`, *optional*, defaults to True):
+    Whether to use LoRA training.
+
+lora_r (`int`, *optional*, defaults to 8):
+    LoRA R dimension.
+
+lora_alpha (`int`, *optional*, defaults to 16):
+    LoRA alpha.
+
+lora_dropout (`float`, *optional*, defaults to 0.05):
+    LoRA dropout.
+
+lora_target_modules (`List[str]`, *optional*, defaults to None):
+    LoRA target modules. If None, they will be inferred automatically.
+
+lora_bias (`str`, *optional*, defaults to "none"):
+    LoRA bias.
+
+max_length (`int`, *optional*, defaults to 512):
+    Maximum source sequence length during training.
+
+log_to_db (`bool`, *optional*, defaults to True):
+    Whether to log training to the database.
+    If True, checkpoints will be logged to SuperDuperDB,
+    which requires that the Ray cluster can access the database.
+    If the cluster cannot access the database, set this to False.
+```

+#### Execute Training
+
+```python
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+)
+```
+
+By default, training is executed directly. However, if multiple GPUs are detected, training will be managed and performed in parallel using Ray.
+
+Additionally, you can manually configure Ray for training, either locally or on a remote Ray cluster.
+
+Three Ray-related parameters are available for configuration:
+
+##### on_ray (bool)
+
+Whether to perform training on Ray.
+
+##### ray_address (str)
+
+The address of the Ray cluster to connect to. If not provided, a Ray service will be started locally by default.
+
+##### ray_configs (dict)
+
+All ray_configs will be passed to [TorchTrainer](https://docs.ray.io/en/latest/train/api/doc/ray.train.torch.TorchTrainer.html), except for the following three fields, which are built automatically by SuperDuperDB:
+
+- train_loop_per_worker
+- train_loop_config
+- datasets
+
+For example, you can provide a configuration like this:
+
+```python
+from ray.train import RunConfig, ScalingConfig
+
+scaling_config = ScalingConfig(
+    num_workers=4,  # Number of GPUs you need
+    use_gpu=True,
+)
+
+run_config = RunConfig(
+    storage_path="s3://llm-test/llm-finetune",
+    name="llm-finetune-test100",
+)
+
+ray_configs = {
+    "scaling_config": scaling_config,
+    "run_config": run_config,
+}
+
+llm.fit(
+    X="text",
+    select=Collection("datas").find(),
+    configuration=training_configuration,
+    db=db,
+    on_ray=True,
+    ray_address="ray://ray_cluster_ip:10001",
+    ray_configs=ray_configs,
+)
+```
+
+For information on how to configure Ray resources, please refer to the Ray documentation, for example:
+
+- [ScalingConfig](https://docs.ray.io/en/latest/train/api/doc/ray.train.ScalingConfig.html#ray.train.ScalingConfig)
+- [RunConfig](https://docs.ray.io/en/latest/train/api/doc/ray.train.RunConfig.html#ray.train.RunConfig)
 
-Coming soon...
diff --git a/examples/llm_finetune.py b/examples/llm_finetune.py index 4a24dba1c..87715615c 100644 --- a/examples/llm_finetune.py +++ b/examples/llm_finetune.py @@ -1,153 +1,63 @@ -import os - -import torch from datasets import load_dataset from superduperdb import superduper from superduperdb.backends.mongodb import Collection from superduperdb.base.document import Document -from superduperdb.ext.llm.model import LLM, LLMTrainingConfiguration - -prompt_template = ( - "Below is an instruction that describes a task," - "paired with an input that provides further context. " - "Write a response that appropriately completes the request." - "\n\n### Instruction:\n{x}\n\n### Response:\n{y}" +from superduperdb.ext.llm import LLM +from superduperdb.ext.llm.model import LLMTrainingConfiguration + +model = "mistralai/Mistral-7B-v0.1" +dataset_name = "timdettmers/openassistant-guanaco" + +db = superduper("mongomock://test_llm") +dataset = load_dataset(dataset_name) +train_dataset = dataset["train"] +eval_dataset = dataset["test"] + +train_documents = [ + Document({"text": example["text"], "_fold": "train"}) for example in train_dataset +] +eval_documents = [ + Document({"text": example["text"], "_fold": "valid"}) for example in eval_dataset +] + +db.execute(Collection("datas").insert_many(train_documents)) +db.execute(Collection("datas").insert_many(eval_documents)) + +llm = LLM( + identifier="llm-finetune", + bits=4, + model_name_or_path=model, ) -collection_name = "alpaca-gpt4-data-zh" - - -def prepare_datas(db, size): - datas = load_dataset("c-s-ale/alpaca-gpt4-data-zh")["train"].to_list()[:size] - - for data in datas: - if data["input"] is not None: - data["instruction"] = data["instruction"] + "\n" + data["input"] - data["text"] = prompt_template.format(x=data["instruction"], y=data["output"]) - - db.execute(Collection(collection_name).insert_many(list(map(Document, datas)))) - - -deepspeed = { - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "zero_optimization": { - "stage": 2, - } -} - - -def train(db, model_identifier, model_name, output_dir): - # training - llm = LLM( - identifier=model_identifier, - # bits=4, - model_name_or_path=model_name, - ) - training_configuration = LLMTrainingConfiguration( - identifier="llm-finetune-training-config", - output_dir=output_dir, - overwrite_output_dir=True, - lora_r=8, - lora_alpha=16, - lora_dropout=0.05, - num_train_epochs=3, - # max_steps=10, - fp16=torch.cuda.is_available(), # mps don't support fp16 - per_device_train_batch_size=2, - per_device_eval_batch_size=1, - gradient_accumulation_steps=2, - evaluation_strategy="steps", - eval_steps=1, - save_strategy="steps", - save_steps=1, - save_total_limit=3, - learning_rate=2e-5, - weight_decay=0.0, - warmup_ratio=0.03, - lr_scheduler_type="cosine", - logging_strategy="steps", - logging_steps=5, - gradient_checkpointing=True, - report_to=[], - # deepspeed=deepspeed, - use_lora=True, - ) - - from ray.train import RunConfig, ScalingConfig - scaling_config = ScalingConfig( - num_workers=1, - # use_gpu=True, - ) - - run_config = RunConfig( - storage_path="s3://llm-test/llm-finetune", - name="llm-finetune-test", - ) - - ray_configs = { - "scaling_config": scaling_config, - "run_config": run_config, - } - - llm.fit( - X="text", - db=db, - select=Collection(collection_name).find(), - configuration=training_configuration, - prefetch_size=1000, - on_ray=True, - # ray_address="ray://ec2-3-90-217-206.compute-1.amazonaws.com:10001", - 
ray_configs=ray_configs, - ) - - -def inference(db, model_identifier, output_dir): - # inference - llm_base = db.load("model", model_identifier) - checkpoints = [ - checkpoint - for checkpoint in os.listdir(output_dir) - if checkpoint.startswith("checkpoint") - ] - db.add(llm_base) - for checkpoint in checkpoints: - llm_checkpoint = LLM( - identifier=checkpoint, - bits=4 if torch.cuda.is_available() else None, - adapter_id=os.path.join(output_dir, checkpoint), - model_name_or_path=llm_base.model_name_or_path, - ) - db.add(llm_checkpoint) - - datas = list(Collection(collection_name).find().execute(db)) - data = datas[3].content - print(data["text"]) - - prompt = prompt_template.format(x=data["instruction"], y="") - print("-" * 20, "\n") - print(prompt) - print("-" * 20, "\n") - - print("Base model:\n") - print(db.predict(llm_base.identifier, prompt, max_new_tokens=100)[0].content) +training_configuration = LLMTrainingConfiguration( + identifier="llm-finetune-training-config", + output_dir="output/llm-finetune", + overwrite_output_dir=True, + num_train_epochs=1, + save_total_limit=5, + logging_steps=10, + evaluation_strategy="steps", + fp16=True, + eval_steps=100, + save_steps=100, + per_device_train_batch_size=1, + per_device_eval_batch_size=1, + gradient_accumulation_steps=4, + max_length=512, + use_lora=True, +) - for checkpoint in checkpoints: - print("-" * 20, "\n") - print(f"Finetuned model-{checkpoint}:\n") - print(db.predict(checkpoint, prompt, max_new_tokens=100)[0].content) +llm.fit( + X="text", + select=Collection("datas").find(), + configuration=training_configuration, + db=db, +) -if __name__ == "__main__": - db = superduper("mongomock://localhost:27017/test-llm") - model = "facebook/opt-125m" - # model = "mistralai/Mistral-7B-Instruct-v0.2" - output_dir = "outputs/llm-finetune" +prompt = "### Human: Who are you? ### Assistant: " - db.drop(force=True) - prepare_datas(db, size=200) - train(db, "llm-finetune", model, output_dir) - inference(db, "llm-finetune", output_dir) +# Automatically load lora model for prediction, default use the latest checkpoint +print(llm.predict(prompt, max_new_tokens=100, do_sample=True))
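+# A minimal follow-up sketch, assuming a connection to a persistent database
+# rather than the "mongomock://" store used above: the fine-tuned model saved
+# under the identifier "llm-finetune" can then be reloaded in another program
+# and used for inference (see the "Model Usage" section of the documentation):
+#
+#     llm = db.load("model", "llm-finetune")
+#     print(llm.predict(prompt, max_new_tokens=100, do_sample=True))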